diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6b1ad030d8dc51c98535bc6be58bc0d0d780757a --- /dev/null +++ b/README.md @@ -0,0 +1,58 @@ +--- +base_model: Qwen/Qwen3-4B-Instruct-2507 +library_name: transformers +model_name: Qwen3-8B_n3000_math +tags: +- generated_from_trainer +- sft +- trl +licence: license +--- + +# Model Card for Qwen3-8B_n3000_math + +This model is a fine-tuned version of [Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + + + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.3 +- Pytorch: 2.8.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..70adff8a08fb31e0636f618564838d4bf3c05286 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,61 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-1000/chat_template.jinja b/checkpoint-1000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..70adff8a08fb31e0636f618564838d4bf3c05286 --- /dev/null +++ b/checkpoint-1000/chat_template.jinja @@ -0,0 +1,61 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-1000/config.json b/checkpoint-1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c351e5fb52f50ea6e07b40981aef81c80f9df7e4 --- /dev/null +++ b/checkpoint-1000/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 262144, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151662, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 5000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.5.3", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-1000/generation_config.json b/checkpoint-1000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2104b83493c2833855e8fe32a7a784805ab5c2ee --- /dev/null +++ b/checkpoint-1000/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151662, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "5.5.3" +} diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e47e52c4e7f0b2bcf2103a878790216f3f6436d --- /dev/null +++ b/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 1010000, + "pad_token": "<|fim_pad|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..87532bf7321a64560106312a2f81138a0e52ebd6 --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,9034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7598784194528876, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007598784194528875, + "grad_norm": 11.767926216125488, + "learning_rate": 0.0, + "loss": 0.7937269806861877, + "mean_token_accuracy": 0.7822731137275696, + "num_tokens": 10507.0, + "step": 1 + }, + { + "epoch": 0.001519756838905775, + "grad_norm": 14.9199800491333, + "learning_rate": 2.5252525252525256e-08, + "loss": 0.7665389776229858, + "mean_token_accuracy": 0.8342233300209045, + "num_tokens": 14806.0, + "step": 2 + }, + { + "epoch": 0.0022796352583586625, + "grad_norm": 11.991217613220215, + "learning_rate": 5.050505050505051e-08, + "loss": 0.9597002267837524, + "mean_token_accuracy": 0.7054992318153381, + "num_tokens": 27170.0, + "step": 3 + }, + { + "epoch": 0.00303951367781155, + "grad_norm": 12.958333015441895, + "learning_rate": 7.575757575757576e-08, + "loss": 0.9971482753753662, + "mean_token_accuracy": 0.7261134386062622, + "num_tokens": 33729.0, + "step": 4 + }, + { + "epoch": 0.003799392097264438, + "grad_norm": 13.5665283203125, + "learning_rate": 1.0101010101010103e-07, + "loss": 0.9504883885383606, + "mean_token_accuracy": 0.745307445526123, + "num_tokens": 41174.0, + "step": 5 + }, + { + "epoch": 0.004559270516717325, + "grad_norm": 10.09444808959961, + "learning_rate": 1.2626262626262626e-07, + "loss": 0.759548008441925, + "mean_token_accuracy": 0.7842121124267578, + "num_tokens": 47943.0, + "step": 6 + }, + { + "epoch": 0.005319148936170213, + "grad_norm": 10.741650581359863, + "learning_rate": 1.5151515151515152e-07, + "loss": 0.8231598138809204, + "mean_token_accuracy": 0.7550969123840332, + "num_tokens": 56665.0, + "step": 7 + }, + { + "epoch": 0.0060790273556231, + "grad_norm": 12.250170707702637, + "learning_rate": 1.767676767676768e-07, + "loss": 0.8576581478118896, + "mean_token_accuracy": 0.7568671703338623, + "num_tokens": 67606.0, + "step": 8 + }, + { + "epoch": 0.006838905775075988, + "grad_norm": 12.828629493713379, + "learning_rate": 2.0202020202020205e-07, + "loss": 0.9886435866355896, + "mean_token_accuracy": 0.733400285243988, + "num_tokens": 74272.0, + "step": 9 + }, + { + "epoch": 0.007598784194528876, + "grad_norm": 15.966923713684082, + "learning_rate": 2.2727272727272729e-07, + "loss": 1.064985990524292, + "mean_token_accuracy": 0.7101132869720459, + "num_tokens": 80524.0, + "step": 10 + }, + { + "epoch": 0.008358662613981762, + "grad_norm": 10.864850044250488, + "learning_rate": 2.525252525252525e-07, + "loss": 0.8311550617218018, + "mean_token_accuracy": 0.7431639432907104, + "num_tokens": 96292.0, + "step": 11 + }, + { + "epoch": 0.00911854103343465, + "grad_norm": 16.438785552978516, + "learning_rate": 2.7777777777777776e-07, + "loss": 1.0579866170883179, + "mean_token_accuracy": 0.7222976684570312, + "num_tokens": 102992.0, + "step": 12 + }, + { + "epoch": 0.009878419452887538, + "grad_norm": 11.179214477539062, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.9816144704818726, + "mean_token_accuracy": 0.7206371426582336, + "num_tokens": 113571.0, + "step": 13 + }, + { + "epoch": 0.010638297872340425, + "grad_norm": 12.780299186706543, + "learning_rate": 3.2828282828282834e-07, + "loss": 0.847449004650116, + "mean_token_accuracy": 0.7826199531555176, + "num_tokens": 119568.0, + "step": 14 + }, + { + "epoch": 0.011398176291793313, + "grad_norm": 14.800421714782715, + "learning_rate": 3.535353535353536e-07, + "loss": 0.9275516271591187, + "mean_token_accuracy": 0.7655045986175537, + "num_tokens": 126258.0, + "step": 15 + }, + { + "epoch": 0.0121580547112462, + "grad_norm": 11.267602920532227, + "learning_rate": 3.787878787878788e-07, + "loss": 0.8464037179946899, + "mean_token_accuracy": 0.7606508731842041, + "num_tokens": 136831.0, + "step": 16 + }, + { + "epoch": 0.012917933130699088, + "grad_norm": 12.891013145446777, + "learning_rate": 4.040404040404041e-07, + "loss": 0.9903074502944946, + "mean_token_accuracy": 0.7247487306594849, + "num_tokens": 150434.0, + "step": 17 + }, + { + "epoch": 0.013677811550151976, + "grad_norm": 11.13957691192627, + "learning_rate": 4.2929292929292934e-07, + "loss": 0.8287211656570435, + "mean_token_accuracy": 0.7621913552284241, + "num_tokens": 158516.0, + "step": 18 + }, + { + "epoch": 0.014437689969604863, + "grad_norm": 18.39569664001465, + "learning_rate": 4.5454545454545457e-07, + "loss": 1.150015115737915, + "mean_token_accuracy": 0.7349498271942139, + "num_tokens": 162214.0, + "step": 19 + }, + { + "epoch": 0.015197568389057751, + "grad_norm": 9.353750228881836, + "learning_rate": 4.797979797979798e-07, + "loss": 0.7228299379348755, + "mean_token_accuracy": 0.7969573736190796, + "num_tokens": 173035.0, + "step": 20 + }, + { + "epoch": 0.015957446808510637, + "grad_norm": 8.267163276672363, + "learning_rate": 5.05050505050505e-07, + "loss": 0.7358136177062988, + "mean_token_accuracy": 0.7903937101364136, + "num_tokens": 183568.0, + "step": 21 + }, + { + "epoch": 0.016717325227963525, + "grad_norm": 11.137128829956055, + "learning_rate": 5.303030303030304e-07, + "loss": 1.0075397491455078, + "mean_token_accuracy": 0.702807605266571, + "num_tokens": 192759.0, + "step": 22 + }, + { + "epoch": 0.017477203647416412, + "grad_norm": 10.734103202819824, + "learning_rate": 5.555555555555555e-07, + "loss": 0.8925919532775879, + "mean_token_accuracy": 0.7475671768188477, + "num_tokens": 201280.0, + "step": 23 + }, + { + "epoch": 0.0182370820668693, + "grad_norm": 11.945566177368164, + "learning_rate": 5.808080808080809e-07, + "loss": 0.7260514497756958, + "mean_token_accuracy": 0.7859152555465698, + "num_tokens": 218053.0, + "step": 24 + }, + { + "epoch": 0.018996960486322188, + "grad_norm": 18.610652923583984, + "learning_rate": 6.060606060606061e-07, + "loss": 0.8995465636253357, + "mean_token_accuracy": 0.7931990623474121, + "num_tokens": 220953.0, + "step": 25 + }, + { + "epoch": 0.019756838905775075, + "grad_norm": 10.51898193359375, + "learning_rate": 6.313131313131314e-07, + "loss": 0.9532671570777893, + "mean_token_accuracy": 0.7257645726203918, + "num_tokens": 231200.0, + "step": 26 + }, + { + "epoch": 0.020516717325227963, + "grad_norm": 9.581812858581543, + "learning_rate": 6.565656565656567e-07, + "loss": 0.9038010239601135, + "mean_token_accuracy": 0.7390379905700684, + "num_tokens": 237711.0, + "step": 27 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 12.297484397888184, + "learning_rate": 6.818181818181818e-07, + "loss": 1.048936367034912, + "mean_token_accuracy": 0.7175670862197876, + "num_tokens": 242503.0, + "step": 28 + }, + { + "epoch": 0.022036474164133738, + "grad_norm": 7.437953472137451, + "learning_rate": 7.070707070707071e-07, + "loss": 0.8308826684951782, + "mean_token_accuracy": 0.7415335774421692, + "num_tokens": 250842.0, + "step": 29 + }, + { + "epoch": 0.022796352583586626, + "grad_norm": 6.134475231170654, + "learning_rate": 7.323232323232324e-07, + "loss": 0.647913932800293, + "mean_token_accuracy": 0.8124054670333862, + "num_tokens": 267453.0, + "step": 30 + }, + { + "epoch": 0.023556231003039513, + "grad_norm": 6.678966045379639, + "learning_rate": 7.575757575757576e-07, + "loss": 0.7052810192108154, + "mean_token_accuracy": 0.7908754348754883, + "num_tokens": 284416.0, + "step": 31 + }, + { + "epoch": 0.0243161094224924, + "grad_norm": 7.42232084274292, + "learning_rate": 7.82828282828283e-07, + "loss": 1.022383213043213, + "mean_token_accuracy": 0.7053230404853821, + "num_tokens": 292073.0, + "step": 32 + }, + { + "epoch": 0.02507598784194529, + "grad_norm": 6.463219165802002, + "learning_rate": 8.080808080808082e-07, + "loss": 0.7603012323379517, + "mean_token_accuracy": 0.7728140354156494, + "num_tokens": 298550.0, + "step": 33 + }, + { + "epoch": 0.025835866261398176, + "grad_norm": 5.668411731719971, + "learning_rate": 8.333333333333333e-07, + "loss": 0.7707852721214294, + "mean_token_accuracy": 0.7827773094177246, + "num_tokens": 306683.0, + "step": 34 + }, + { + "epoch": 0.026595744680851064, + "grad_norm": 4.984964847564697, + "learning_rate": 8.585858585858587e-07, + "loss": 0.6317349672317505, + "mean_token_accuracy": 0.8106861114501953, + "num_tokens": 318842.0, + "step": 35 + }, + { + "epoch": 0.02735562310030395, + "grad_norm": 4.421732425689697, + "learning_rate": 8.838383838383839e-07, + "loss": 0.6228617429733276, + "mean_token_accuracy": 0.8023355603218079, + "num_tokens": 329850.0, + "step": 36 + }, + { + "epoch": 0.02811550151975684, + "grad_norm": 5.970808029174805, + "learning_rate": 9.090909090909091e-07, + "loss": 0.8443238139152527, + "mean_token_accuracy": 0.7462409734725952, + "num_tokens": 335844.0, + "step": 37 + }, + { + "epoch": 0.028875379939209727, + "grad_norm": 4.5389084815979, + "learning_rate": 9.343434343434345e-07, + "loss": 0.6976436376571655, + "mean_token_accuracy": 0.790410041809082, + "num_tokens": 348768.0, + "step": 38 + }, + { + "epoch": 0.029635258358662615, + "grad_norm": 4.116631507873535, + "learning_rate": 9.595959595959596e-07, + "loss": 0.6698519587516785, + "mean_token_accuracy": 0.7818127870559692, + "num_tokens": 355460.0, + "step": 39 + }, + { + "epoch": 0.030395136778115502, + "grad_norm": 3.3714773654937744, + "learning_rate": 9.84848484848485e-07, + "loss": 0.5723201036453247, + "mean_token_accuracy": 0.8100086450576782, + "num_tokens": 368507.0, + "step": 40 + }, + { + "epoch": 0.03115501519756839, + "grad_norm": 4.4438347816467285, + "learning_rate": 1.01010101010101e-06, + "loss": 0.7508786916732788, + "mean_token_accuracy": 0.7711942791938782, + "num_tokens": 376467.0, + "step": 41 + }, + { + "epoch": 0.031914893617021274, + "grad_norm": 5.609974384307861, + "learning_rate": 1.0353535353535354e-06, + "loss": 0.566256046295166, + "mean_token_accuracy": 0.8319284319877625, + "num_tokens": 381399.0, + "step": 42 + }, + { + "epoch": 0.03267477203647416, + "grad_norm": 5.124386787414551, + "learning_rate": 1.0606060606060608e-06, + "loss": 0.8151067495346069, + "mean_token_accuracy": 0.7537785768508911, + "num_tokens": 387389.0, + "step": 43 + }, + { + "epoch": 0.03343465045592705, + "grad_norm": 3.6318116188049316, + "learning_rate": 1.085858585858586e-06, + "loss": 0.5989949107170105, + "mean_token_accuracy": 0.8129256963729858, + "num_tokens": 395302.0, + "step": 44 + }, + { + "epoch": 0.03419452887537994, + "grad_norm": 2.694424629211426, + "learning_rate": 1.111111111111111e-06, + "loss": 0.5831396579742432, + "mean_token_accuracy": 0.8056820631027222, + "num_tokens": 409920.0, + "step": 45 + }, + { + "epoch": 0.034954407294832825, + "grad_norm": 2.2949178218841553, + "learning_rate": 1.1363636363636364e-06, + "loss": 0.472550630569458, + "mean_token_accuracy": 0.8343006372451782, + "num_tokens": 428323.0, + "step": 46 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 3.3930575847625732, + "learning_rate": 1.1616161616161617e-06, + "loss": 0.6246505379676819, + "mean_token_accuracy": 0.783149003982544, + "num_tokens": 435889.0, + "step": 47 + }, + { + "epoch": 0.0364741641337386, + "grad_norm": 3.692598819732666, + "learning_rate": 1.186868686868687e-06, + "loss": 0.46132946014404297, + "mean_token_accuracy": 0.8583089113235474, + "num_tokens": 441192.0, + "step": 48 + }, + { + "epoch": 0.03723404255319149, + "grad_norm": 6.571533203125, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.9351121783256531, + "mean_token_accuracy": 0.7580878734588623, + "num_tokens": 444277.0, + "step": 49 + }, + { + "epoch": 0.037993920972644375, + "grad_norm": 5.029570579528809, + "learning_rate": 1.2373737373737375e-06, + "loss": 0.6921554803848267, + "mean_token_accuracy": 0.8131166100502014, + "num_tokens": 447646.0, + "step": 50 + }, + { + "epoch": 0.03875379939209726, + "grad_norm": 2.9174208641052246, + "learning_rate": 1.2626262626262629e-06, + "loss": 0.591706395149231, + "mean_token_accuracy": 0.8108617067337036, + "num_tokens": 461397.0, + "step": 51 + }, + { + "epoch": 0.03951367781155015, + "grad_norm": 4.315536022186279, + "learning_rate": 1.287878787878788e-06, + "loss": 0.6986310482025146, + "mean_token_accuracy": 0.7710754871368408, + "num_tokens": 472047.0, + "step": 52 + }, + { + "epoch": 0.04027355623100304, + "grad_norm": 2.6216275691986084, + "learning_rate": 1.3131313131313134e-06, + "loss": 0.5553690791130066, + "mean_token_accuracy": 0.8167896866798401, + "num_tokens": 482795.0, + "step": 53 + }, + { + "epoch": 0.041033434650455926, + "grad_norm": 3.0562477111816406, + "learning_rate": 1.3383838383838385e-06, + "loss": 0.6909202337265015, + "mean_token_accuracy": 0.7859863638877869, + "num_tokens": 494818.0, + "step": 54 + }, + { + "epoch": 0.04179331306990881, + "grad_norm": 2.1420412063598633, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.5415265560150146, + "mean_token_accuracy": 0.818886399269104, + "num_tokens": 513695.0, + "step": 55 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 2.9610488414764404, + "learning_rate": 1.3888888888888892e-06, + "loss": 0.6602212190628052, + "mean_token_accuracy": 0.7830734252929688, + "num_tokens": 523784.0, + "step": 56 + }, + { + "epoch": 0.04331306990881459, + "grad_norm": 2.511972665786743, + "learning_rate": 1.4141414141414143e-06, + "loss": 0.5717809796333313, + "mean_token_accuracy": 0.8053616285324097, + "num_tokens": 546308.0, + "step": 57 + }, + { + "epoch": 0.044072948328267476, + "grad_norm": 3.52642822265625, + "learning_rate": 1.4393939393939396e-06, + "loss": 0.6242594718933105, + "mean_token_accuracy": 0.8162082433700562, + "num_tokens": 552019.0, + "step": 58 + }, + { + "epoch": 0.044832826747720364, + "grad_norm": 3.02362322807312, + "learning_rate": 1.4646464646464648e-06, + "loss": 0.6634255647659302, + "mean_token_accuracy": 0.7682032585144043, + "num_tokens": 560009.0, + "step": 59 + }, + { + "epoch": 0.04559270516717325, + "grad_norm": 2.3910107612609863, + "learning_rate": 1.48989898989899e-06, + "loss": 0.5519146919250488, + "mean_token_accuracy": 0.8270269632339478, + "num_tokens": 571005.0, + "step": 60 + }, + { + "epoch": 0.04635258358662614, + "grad_norm": 4.28154993057251, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.7437789440155029, + "mean_token_accuracy": 0.7782418131828308, + "num_tokens": 574950.0, + "step": 61 + }, + { + "epoch": 0.04711246200607903, + "grad_norm": 3.4078686237335205, + "learning_rate": 1.5404040404040404e-06, + "loss": 0.6345915198326111, + "mean_token_accuracy": 0.7903392314910889, + "num_tokens": 581657.0, + "step": 62 + }, + { + "epoch": 0.047872340425531915, + "grad_norm": 2.6834158897399902, + "learning_rate": 1.565656565656566e-06, + "loss": 0.5981127023696899, + "mean_token_accuracy": 0.7911489605903625, + "num_tokens": 591267.0, + "step": 63 + }, + { + "epoch": 0.0486322188449848, + "grad_norm": 2.1054461002349854, + "learning_rate": 1.590909090909091e-06, + "loss": 0.5523523688316345, + "mean_token_accuracy": 0.8194501399993896, + "num_tokens": 606787.0, + "step": 64 + }, + { + "epoch": 0.04939209726443769, + "grad_norm": 3.322596788406372, + "learning_rate": 1.6161616161616164e-06, + "loss": 0.48417025804519653, + "mean_token_accuracy": 0.8293706178665161, + "num_tokens": 611068.0, + "step": 65 + }, + { + "epoch": 0.05015197568389058, + "grad_norm": 2.302450180053711, + "learning_rate": 1.6414141414141415e-06, + "loss": 0.6498389840126038, + "mean_token_accuracy": 0.7728497385978699, + "num_tokens": 624452.0, + "step": 66 + }, + { + "epoch": 0.050911854103343465, + "grad_norm": 2.680191993713379, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.6347037553787231, + "mean_token_accuracy": 0.8108306527137756, + "num_tokens": 638049.0, + "step": 67 + }, + { + "epoch": 0.05167173252279635, + "grad_norm": 3.0297021865844727, + "learning_rate": 1.6919191919191922e-06, + "loss": 0.5344363451004028, + "mean_token_accuracy": 0.8113535046577454, + "num_tokens": 643892.0, + "step": 68 + }, + { + "epoch": 0.05243161094224924, + "grad_norm": 2.9283676147460938, + "learning_rate": 1.7171717171717173e-06, + "loss": 0.6999260187149048, + "mean_token_accuracy": 0.7782022356987, + "num_tokens": 654418.0, + "step": 69 + }, + { + "epoch": 0.05319148936170213, + "grad_norm": 3.4098572731018066, + "learning_rate": 1.7424242424242427e-06, + "loss": 0.6508946418762207, + "mean_token_accuracy": 0.7942900657653809, + "num_tokens": 659837.0, + "step": 70 + }, + { + "epoch": 0.053951367781155016, + "grad_norm": 2.6756019592285156, + "learning_rate": 1.7676767676767678e-06, + "loss": 0.603486180305481, + "mean_token_accuracy": 0.8015457391738892, + "num_tokens": 668361.0, + "step": 71 + }, + { + "epoch": 0.0547112462006079, + "grad_norm": 2.2630293369293213, + "learning_rate": 1.792929292929293e-06, + "loss": 0.6608274579048157, + "mean_token_accuracy": 0.7753809690475464, + "num_tokens": 679025.0, + "step": 72 + }, + { + "epoch": 0.05547112462006079, + "grad_norm": 2.123962879180908, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.4525482654571533, + "mean_token_accuracy": 0.8425612449645996, + "num_tokens": 688574.0, + "step": 73 + }, + { + "epoch": 0.05623100303951368, + "grad_norm": 7.90519905090332, + "learning_rate": 1.8434343434343434e-06, + "loss": 0.6507195830345154, + "mean_token_accuracy": 0.7714964151382446, + "num_tokens": 694534.0, + "step": 74 + }, + { + "epoch": 0.056990881458966566, + "grad_norm": 2.372203826904297, + "learning_rate": 1.868686868686869e-06, + "loss": 0.4458143413066864, + "mean_token_accuracy": 0.7991449236869812, + "num_tokens": 703114.0, + "step": 75 + }, + { + "epoch": 0.057750759878419454, + "grad_norm": 2.918677568435669, + "learning_rate": 1.8939393939393941e-06, + "loss": 0.5614339113235474, + "mean_token_accuracy": 0.8211464881896973, + "num_tokens": 709038.0, + "step": 76 + }, + { + "epoch": 0.05851063829787234, + "grad_norm": 1.6106709241867065, + "learning_rate": 1.9191919191919192e-06, + "loss": 0.5802098512649536, + "mean_token_accuracy": 0.8055065870285034, + "num_tokens": 730482.0, + "step": 77 + }, + { + "epoch": 0.05927051671732523, + "grad_norm": 2.8069989681243896, + "learning_rate": 1.944444444444445e-06, + "loss": 0.5709059238433838, + "mean_token_accuracy": 0.8024872541427612, + "num_tokens": 751817.0, + "step": 78 + }, + { + "epoch": 0.06003039513677812, + "grad_norm": 2.641667127609253, + "learning_rate": 1.96969696969697e-06, + "loss": 0.6480152606964111, + "mean_token_accuracy": 0.7912271618843079, + "num_tokens": 759236.0, + "step": 79 + }, + { + "epoch": 0.060790273556231005, + "grad_norm": 2.6034350395202637, + "learning_rate": 1.994949494949495e-06, + "loss": 0.5535176396369934, + "mean_token_accuracy": 0.7980542778968811, + "num_tokens": 766496.0, + "step": 80 + }, + { + "epoch": 0.06155015197568389, + "grad_norm": 1.7095069885253906, + "learning_rate": 2.02020202020202e-06, + "loss": 0.4545496106147766, + "mean_token_accuracy": 0.8229660391807556, + "num_tokens": 780124.0, + "step": 81 + }, + { + "epoch": 0.06231003039513678, + "grad_norm": 3.788830518722534, + "learning_rate": 2.0454545454545457e-06, + "loss": 0.6679391264915466, + "mean_token_accuracy": 0.7942397594451904, + "num_tokens": 784555.0, + "step": 82 + }, + { + "epoch": 0.06306990881458967, + "grad_norm": 2.009831666946411, + "learning_rate": 2.070707070707071e-06, + "loss": 0.5067101120948792, + "mean_token_accuracy": 0.8276634216308594, + "num_tokens": 797459.0, + "step": 83 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 2.201627731323242, + "learning_rate": 2.095959595959596e-06, + "loss": 0.5012127161026001, + "mean_token_accuracy": 0.8432504534721375, + "num_tokens": 810817.0, + "step": 84 + }, + { + "epoch": 0.06458966565349544, + "grad_norm": 2.492568016052246, + "learning_rate": 2.1212121212121216e-06, + "loss": 0.6142797470092773, + "mean_token_accuracy": 0.8338661193847656, + "num_tokens": 818191.0, + "step": 85 + }, + { + "epoch": 0.06534954407294832, + "grad_norm": 2.8360862731933594, + "learning_rate": 2.1464646464646467e-06, + "loss": 0.5569300651550293, + "mean_token_accuracy": 0.8121030330657959, + "num_tokens": 825325.0, + "step": 86 + }, + { + "epoch": 0.06610942249240122, + "grad_norm": 2.407548427581787, + "learning_rate": 2.171717171717172e-06, + "loss": 0.6442930102348328, + "mean_token_accuracy": 0.792514443397522, + "num_tokens": 834439.0, + "step": 87 + }, + { + "epoch": 0.0668693009118541, + "grad_norm": 2.340728759765625, + "learning_rate": 2.196969696969697e-06, + "loss": 0.6494365930557251, + "mean_token_accuracy": 0.7746615409851074, + "num_tokens": 843078.0, + "step": 88 + }, + { + "epoch": 0.067629179331307, + "grad_norm": 1.7703697681427002, + "learning_rate": 2.222222222222222e-06, + "loss": 0.598991870880127, + "mean_token_accuracy": 0.7992157340049744, + "num_tokens": 860171.0, + "step": 89 + }, + { + "epoch": 0.06838905775075987, + "grad_norm": 2.5779271125793457, + "learning_rate": 2.2474747474747476e-06, + "loss": 0.5693082809448242, + "mean_token_accuracy": 0.8093700408935547, + "num_tokens": 866669.0, + "step": 90 + }, + { + "epoch": 0.06914893617021277, + "grad_norm": 2.014092206954956, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.5346695780754089, + "mean_token_accuracy": 0.8165590763092041, + "num_tokens": 876698.0, + "step": 91 + }, + { + "epoch": 0.06990881458966565, + "grad_norm": 1.7555919885635376, + "learning_rate": 2.2979797979797983e-06, + "loss": 0.5321458578109741, + "mean_token_accuracy": 0.8166656494140625, + "num_tokens": 889488.0, + "step": 92 + }, + { + "epoch": 0.07066869300911854, + "grad_norm": 1.8631824254989624, + "learning_rate": 2.3232323232323234e-06, + "loss": 0.5246532559394836, + "mean_token_accuracy": 0.8088107705116272, + "num_tokens": 901322.0, + "step": 93 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 3.2332139015197754, + "learning_rate": 2.348484848484849e-06, + "loss": 0.5141711235046387, + "mean_token_accuracy": 0.8382217884063721, + "num_tokens": 905792.0, + "step": 94 + }, + { + "epoch": 0.07218844984802432, + "grad_norm": 1.7806555032730103, + "learning_rate": 2.373737373737374e-06, + "loss": 0.5233149528503418, + "mean_token_accuracy": 0.8101529479026794, + "num_tokens": 917320.0, + "step": 95 + }, + { + "epoch": 0.0729483282674772, + "grad_norm": 1.8169859647750854, + "learning_rate": 2.3989898989898993e-06, + "loss": 0.578881561756134, + "mean_token_accuracy": 0.8044873476028442, + "num_tokens": 931062.0, + "step": 96 + }, + { + "epoch": 0.0737082066869301, + "grad_norm": 4.677402496337891, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.7842556238174438, + "mean_token_accuracy": 0.7579764127731323, + "num_tokens": 934712.0, + "step": 97 + }, + { + "epoch": 0.07446808510638298, + "grad_norm": 2.6987264156341553, + "learning_rate": 2.4494949494949495e-06, + "loss": 0.5669287443161011, + "mean_token_accuracy": 0.8186933994293213, + "num_tokens": 941058.0, + "step": 98 + }, + { + "epoch": 0.07522796352583587, + "grad_norm": 1.6906023025512695, + "learning_rate": 2.474747474747475e-06, + "loss": 0.4976363778114319, + "mean_token_accuracy": 0.8198553323745728, + "num_tokens": 956509.0, + "step": 99 + }, + { + "epoch": 0.07598784194528875, + "grad_norm": 2.7256152629852295, + "learning_rate": 2.5e-06, + "loss": 0.7138420343399048, + "mean_token_accuracy": 0.7752805948257446, + "num_tokens": 963920.0, + "step": 100 + }, + { + "epoch": 0.07674772036474165, + "grad_norm": 2.174870491027832, + "learning_rate": 2.5252525252525258e-06, + "loss": 0.6733541488647461, + "mean_token_accuracy": 0.7745175361633301, + "num_tokens": 975268.0, + "step": 101 + }, + { + "epoch": 0.07750759878419453, + "grad_norm": 1.5587213039398193, + "learning_rate": 2.5505050505050505e-06, + "loss": 0.44223445653915405, + "mean_token_accuracy": 0.8278359174728394, + "num_tokens": 991837.0, + "step": 102 + }, + { + "epoch": 0.07826747720364742, + "grad_norm": 2.181840658187866, + "learning_rate": 2.575757575757576e-06, + "loss": 0.625128448009491, + "mean_token_accuracy": 0.7941786050796509, + "num_tokens": 1004325.0, + "step": 103 + }, + { + "epoch": 0.0790273556231003, + "grad_norm": 1.4986687898635864, + "learning_rate": 2.601010101010101e-06, + "loss": 0.39262527227401733, + "mean_token_accuracy": 0.8412648439407349, + "num_tokens": 1018331.0, + "step": 104 + }, + { + "epoch": 0.0797872340425532, + "grad_norm": 2.3416061401367188, + "learning_rate": 2.6262626262626267e-06, + "loss": 0.5495132803916931, + "mean_token_accuracy": 0.8193322420120239, + "num_tokens": 1026090.0, + "step": 105 + }, + { + "epoch": 0.08054711246200608, + "grad_norm": 3.8168859481811523, + "learning_rate": 2.6515151515151514e-06, + "loss": 0.4898706376552582, + "mean_token_accuracy": 0.8467956185340881, + "num_tokens": 1029955.0, + "step": 106 + }, + { + "epoch": 0.08130699088145897, + "grad_norm": 4.113908767700195, + "learning_rate": 2.676767676767677e-06, + "loss": 0.6189584732055664, + "mean_token_accuracy": 0.8019394278526306, + "num_tokens": 1033598.0, + "step": 107 + }, + { + "epoch": 0.08206686930091185, + "grad_norm": 2.50003981590271, + "learning_rate": 2.7020202020202025e-06, + "loss": 0.6479471921920776, + "mean_token_accuracy": 0.7790026664733887, + "num_tokens": 1042533.0, + "step": 108 + }, + { + "epoch": 0.08282674772036475, + "grad_norm": 1.408934473991394, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.3909248113632202, + "mean_token_accuracy": 0.8477586507797241, + "num_tokens": 1061755.0, + "step": 109 + }, + { + "epoch": 0.08358662613981763, + "grad_norm": 3.360633611679077, + "learning_rate": 2.7525252525252528e-06, + "loss": 0.6952459812164307, + "mean_token_accuracy": 0.777535080909729, + "num_tokens": 1067316.0, + "step": 110 + }, + { + "epoch": 0.08434650455927052, + "grad_norm": 1.8631696701049805, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.5420593023300171, + "mean_token_accuracy": 0.8157662749290466, + "num_tokens": 1079930.0, + "step": 111 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.4308314323425293, + "learning_rate": 2.803030303030303e-06, + "loss": 0.5863882303237915, + "mean_token_accuracy": 0.8206346035003662, + "num_tokens": 1088069.0, + "step": 112 + }, + { + "epoch": 0.0858662613981763, + "grad_norm": 2.922808885574341, + "learning_rate": 2.8282828282828286e-06, + "loss": 0.5217319130897522, + "mean_token_accuracy": 0.8253234028816223, + "num_tokens": 1093607.0, + "step": 113 + }, + { + "epoch": 0.08662613981762918, + "grad_norm": 2.3596107959747314, + "learning_rate": 2.8535353535353537e-06, + "loss": 0.5070714950561523, + "mean_token_accuracy": 0.8258323669433594, + "num_tokens": 1100405.0, + "step": 114 + }, + { + "epoch": 0.08738601823708207, + "grad_norm": 3.0853066444396973, + "learning_rate": 2.8787878787878793e-06, + "loss": 0.591964840888977, + "mean_token_accuracy": 0.8047322630882263, + "num_tokens": 1107535.0, + "step": 115 + }, + { + "epoch": 0.08814589665653495, + "grad_norm": 1.9251092672348022, + "learning_rate": 2.904040404040404e-06, + "loss": 0.5226191878318787, + "mean_token_accuracy": 0.8022720217704773, + "num_tokens": 1118716.0, + "step": 116 + }, + { + "epoch": 0.08890577507598785, + "grad_norm": 1.9692988395690918, + "learning_rate": 2.9292929292929295e-06, + "loss": 0.5462069511413574, + "mean_token_accuracy": 0.8157015442848206, + "num_tokens": 1131917.0, + "step": 117 + }, + { + "epoch": 0.08966565349544073, + "grad_norm": 1.4738909006118774, + "learning_rate": 2.954545454545455e-06, + "loss": 0.4564219117164612, + "mean_token_accuracy": 0.849632978439331, + "num_tokens": 1148534.0, + "step": 118 + }, + { + "epoch": 0.09042553191489362, + "grad_norm": 2.72646164894104, + "learning_rate": 2.97979797979798e-06, + "loss": 0.6654808521270752, + "mean_token_accuracy": 0.7752684354782104, + "num_tokens": 1155438.0, + "step": 119 + }, + { + "epoch": 0.0911854103343465, + "grad_norm": 2.7843852043151855, + "learning_rate": 3.0050505050505054e-06, + "loss": 0.5354680418968201, + "mean_token_accuracy": 0.8196378946304321, + "num_tokens": 1161815.0, + "step": 120 + }, + { + "epoch": 0.0919452887537994, + "grad_norm": 2.8052573204040527, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.6366757154464722, + "mean_token_accuracy": 0.7967483997344971, + "num_tokens": 1168295.0, + "step": 121 + }, + { + "epoch": 0.09270516717325228, + "grad_norm": 2.7462735176086426, + "learning_rate": 3.055555555555556e-06, + "loss": 0.59470534324646, + "mean_token_accuracy": 0.8023771047592163, + "num_tokens": 1174502.0, + "step": 122 + }, + { + "epoch": 0.09346504559270517, + "grad_norm": 2.2743821144104004, + "learning_rate": 3.0808080808080807e-06, + "loss": 0.5720560550689697, + "mean_token_accuracy": 0.8162771463394165, + "num_tokens": 1183615.0, + "step": 123 + }, + { + "epoch": 0.09422492401215805, + "grad_norm": 1.8669533729553223, + "learning_rate": 3.1060606060606063e-06, + "loss": 0.4655378758907318, + "mean_token_accuracy": 0.8360732793807983, + "num_tokens": 1193761.0, + "step": 124 + }, + { + "epoch": 0.09498480243161095, + "grad_norm": 1.7666901350021362, + "learning_rate": 3.131313131313132e-06, + "loss": 0.5524153709411621, + "mean_token_accuracy": 0.8252713680267334, + "num_tokens": 1207870.0, + "step": 125 + }, + { + "epoch": 0.09574468085106383, + "grad_norm": 2.4720070362091064, + "learning_rate": 3.1565656565656566e-06, + "loss": 0.5003011226654053, + "mean_token_accuracy": 0.8491042852401733, + "num_tokens": 1214603.0, + "step": 126 + }, + { + "epoch": 0.09650455927051672, + "grad_norm": 1.6500422954559326, + "learning_rate": 3.181818181818182e-06, + "loss": 0.5137069225311279, + "mean_token_accuracy": 0.8273531198501587, + "num_tokens": 1228717.0, + "step": 127 + }, + { + "epoch": 0.0972644376899696, + "grad_norm": 3.402543067932129, + "learning_rate": 3.2070707070707072e-06, + "loss": 0.708167552947998, + "mean_token_accuracy": 0.7705385684967041, + "num_tokens": 1234361.0, + "step": 128 + }, + { + "epoch": 0.0980243161094225, + "grad_norm": 2.547285795211792, + "learning_rate": 3.232323232323233e-06, + "loss": 0.6020137071609497, + "mean_token_accuracy": 0.7981340289115906, + "num_tokens": 1244169.0, + "step": 129 + }, + { + "epoch": 0.09878419452887538, + "grad_norm": 2.0578792095184326, + "learning_rate": 3.257575757575758e-06, + "loss": 0.4425000250339508, + "mean_token_accuracy": 0.8567807674407959, + "num_tokens": 1252709.0, + "step": 130 + }, + { + "epoch": 0.09954407294832827, + "grad_norm": 1.672614336013794, + "learning_rate": 3.282828282828283e-06, + "loss": 0.4860966205596924, + "mean_token_accuracy": 0.8393139243125916, + "num_tokens": 1265766.0, + "step": 131 + }, + { + "epoch": 0.10030395136778116, + "grad_norm": 3.2560198307037354, + "learning_rate": 3.3080808080808086e-06, + "loss": 0.624736487865448, + "mean_token_accuracy": 0.7875322699546814, + "num_tokens": 1270779.0, + "step": 132 + }, + { + "epoch": 0.10106382978723404, + "grad_norm": 2.4468185901641846, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5062227249145508, + "mean_token_accuracy": 0.8217229843139648, + "num_tokens": 1277113.0, + "step": 133 + }, + { + "epoch": 0.10182370820668693, + "grad_norm": 2.6371328830718994, + "learning_rate": 3.358585858585859e-06, + "loss": 0.477113276720047, + "mean_token_accuracy": 0.8605583906173706, + "num_tokens": 1282514.0, + "step": 134 + }, + { + "epoch": 0.10258358662613981, + "grad_norm": 2.48421311378479, + "learning_rate": 3.3838383838383844e-06, + "loss": 0.40855684876441956, + "mean_token_accuracy": 0.864548921585083, + "num_tokens": 1287859.0, + "step": 135 + }, + { + "epoch": 0.1033434650455927, + "grad_norm": 1.993099331855774, + "learning_rate": 3.409090909090909e-06, + "loss": 0.5913145542144775, + "mean_token_accuracy": 0.8248485922813416, + "num_tokens": 1301074.0, + "step": 136 + }, + { + "epoch": 0.10410334346504559, + "grad_norm": 3.5947680473327637, + "learning_rate": 3.4343434343434347e-06, + "loss": 0.5028599500656128, + "mean_token_accuracy": 0.8367215394973755, + "num_tokens": 1305219.0, + "step": 137 + }, + { + "epoch": 0.10486322188449848, + "grad_norm": 2.5778582096099854, + "learning_rate": 3.45959595959596e-06, + "loss": 0.5297672748565674, + "mean_token_accuracy": 0.8232187032699585, + "num_tokens": 1312482.0, + "step": 138 + }, + { + "epoch": 0.10562310030395136, + "grad_norm": 1.8961588144302368, + "learning_rate": 3.4848484848484854e-06, + "loss": 0.39954107999801636, + "mean_token_accuracy": 0.8605833053588867, + "num_tokens": 1323404.0, + "step": 139 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 1.9687960147857666, + "learning_rate": 3.51010101010101e-06, + "loss": 0.48791587352752686, + "mean_token_accuracy": 0.8200347423553467, + "num_tokens": 1333027.0, + "step": 140 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 2.520242691040039, + "learning_rate": 3.5353535353535356e-06, + "loss": 0.6106002330780029, + "mean_token_accuracy": 0.790692150592804, + "num_tokens": 1340999.0, + "step": 141 + }, + { + "epoch": 0.10790273556231003, + "grad_norm": 3.751617431640625, + "learning_rate": 3.560606060606061e-06, + "loss": 0.48141729831695557, + "mean_token_accuracy": 0.8421382904052734, + "num_tokens": 1344687.0, + "step": 142 + }, + { + "epoch": 0.10866261398176291, + "grad_norm": 2.7101709842681885, + "learning_rate": 3.585858585858586e-06, + "loss": 0.5375241637229919, + "mean_token_accuracy": 0.8061438202857971, + "num_tokens": 1350192.0, + "step": 143 + }, + { + "epoch": 0.1094224924012158, + "grad_norm": 2.583484411239624, + "learning_rate": 3.6111111111111115e-06, + "loss": 0.6492470502853394, + "mean_token_accuracy": 0.7863001823425293, + "num_tokens": 1358148.0, + "step": 144 + }, + { + "epoch": 0.11018237082066869, + "grad_norm": 1.792561650276184, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.48480600118637085, + "mean_token_accuracy": 0.8358709812164307, + "num_tokens": 1369519.0, + "step": 145 + }, + { + "epoch": 0.11094224924012158, + "grad_norm": 2.6480472087860107, + "learning_rate": 3.661616161616162e-06, + "loss": 0.5268933176994324, + "mean_token_accuracy": 0.8214013576507568, + "num_tokens": 1375862.0, + "step": 146 + }, + { + "epoch": 0.11170212765957446, + "grad_norm": 2.3174469470977783, + "learning_rate": 3.686868686868687e-06, + "loss": 0.42517897486686707, + "mean_token_accuracy": 0.8523461222648621, + "num_tokens": 1381546.0, + "step": 147 + }, + { + "epoch": 0.11246200607902736, + "grad_norm": 3.0090949535369873, + "learning_rate": 3.7121212121212124e-06, + "loss": 0.4042336940765381, + "mean_token_accuracy": 0.8670448064804077, + "num_tokens": 1385896.0, + "step": 148 + }, + { + "epoch": 0.11322188449848024, + "grad_norm": 2.4928104877471924, + "learning_rate": 3.737373737373738e-06, + "loss": 0.6498878598213196, + "mean_token_accuracy": 0.7967068552970886, + "num_tokens": 1394169.0, + "step": 149 + }, + { + "epoch": 0.11398176291793313, + "grad_norm": 1.5984913110733032, + "learning_rate": 3.7626262626262627e-06, + "loss": 0.546096920967102, + "mean_token_accuracy": 0.8035850524902344, + "num_tokens": 1408785.0, + "step": 150 + }, + { + "epoch": 0.11474164133738601, + "grad_norm": 2.3663532733917236, + "learning_rate": 3.7878787878787882e-06, + "loss": 0.6111721992492676, + "mean_token_accuracy": 0.8015355467796326, + "num_tokens": 1417510.0, + "step": 151 + }, + { + "epoch": 0.11550151975683891, + "grad_norm": 2.518932819366455, + "learning_rate": 3.8131313131313138e-06, + "loss": 0.5274964570999146, + "mean_token_accuracy": 0.8155480623245239, + "num_tokens": 1424186.0, + "step": 152 + }, + { + "epoch": 0.11626139817629179, + "grad_norm": 2.14353609085083, + "learning_rate": 3.8383838383838385e-06, + "loss": 0.5283297896385193, + "mean_token_accuracy": 0.8275758028030396, + "num_tokens": 1432630.0, + "step": 153 + }, + { + "epoch": 0.11702127659574468, + "grad_norm": 1.8243604898452759, + "learning_rate": 3.863636363636364e-06, + "loss": 0.41854870319366455, + "mean_token_accuracy": 0.8222295045852661, + "num_tokens": 1442691.0, + "step": 154 + }, + { + "epoch": 0.11778115501519756, + "grad_norm": 2.088212251663208, + "learning_rate": 3.88888888888889e-06, + "loss": 0.6062943339347839, + "mean_token_accuracy": 0.8009427785873413, + "num_tokens": 1456890.0, + "step": 155 + }, + { + "epoch": 0.11854103343465046, + "grad_norm": 1.3469511270523071, + "learning_rate": 3.914141414141415e-06, + "loss": 0.4390433728694916, + "mean_token_accuracy": 0.8436295986175537, + "num_tokens": 1475349.0, + "step": 156 + }, + { + "epoch": 0.11930091185410334, + "grad_norm": 3.247023105621338, + "learning_rate": 3.93939393939394e-06, + "loss": 0.6490433216094971, + "mean_token_accuracy": 0.8037861585617065, + "num_tokens": 1479952.0, + "step": 157 + }, + { + "epoch": 0.12006079027355623, + "grad_norm": 2.6610445976257324, + "learning_rate": 3.964646464646465e-06, + "loss": 0.6221826076507568, + "mean_token_accuracy": 0.7848749160766602, + "num_tokens": 1487306.0, + "step": 158 + }, + { + "epoch": 0.12082066869300911, + "grad_norm": 2.3060810565948486, + "learning_rate": 3.98989898989899e-06, + "loss": 0.5052388310432434, + "mean_token_accuracy": 0.8281195759773254, + "num_tokens": 1495367.0, + "step": 159 + }, + { + "epoch": 0.12158054711246201, + "grad_norm": 2.504448652267456, + "learning_rate": 4.015151515151515e-06, + "loss": 0.5005477666854858, + "mean_token_accuracy": 0.8408058881759644, + "num_tokens": 1502069.0, + "step": 160 + }, + { + "epoch": 0.12234042553191489, + "grad_norm": 3.993938446044922, + "learning_rate": 4.04040404040404e-06, + "loss": 0.5569638013839722, + "mean_token_accuracy": 0.8095242977142334, + "num_tokens": 1510224.0, + "step": 161 + }, + { + "epoch": 0.12310030395136778, + "grad_norm": 2.2287683486938477, + "learning_rate": 4.065656565656566e-06, + "loss": 0.524042546749115, + "mean_token_accuracy": 0.8102203607559204, + "num_tokens": 1518364.0, + "step": 162 + }, + { + "epoch": 0.12386018237082067, + "grad_norm": 1.9531738758087158, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.45794573426246643, + "mean_token_accuracy": 0.8560376167297363, + "num_tokens": 1528097.0, + "step": 163 + }, + { + "epoch": 0.12462006079027356, + "grad_norm": 1.5841206312179565, + "learning_rate": 4.116161616161617e-06, + "loss": 0.5420972108840942, + "mean_token_accuracy": 0.8092726469039917, + "num_tokens": 1544119.0, + "step": 164 + }, + { + "epoch": 0.12537993920972645, + "grad_norm": 1.7536218166351318, + "learning_rate": 4.141414141414142e-06, + "loss": 0.554668664932251, + "mean_token_accuracy": 0.8193825483322144, + "num_tokens": 1559140.0, + "step": 165 + }, + { + "epoch": 0.12613981762917933, + "grad_norm": 3.545454740524292, + "learning_rate": 4.166666666666667e-06, + "loss": 0.580947995185852, + "mean_token_accuracy": 0.8286383152008057, + "num_tokens": 1563625.0, + "step": 166 + }, + { + "epoch": 0.12689969604863222, + "grad_norm": 1.6608915328979492, + "learning_rate": 4.191919191919192e-06, + "loss": 0.5523324012756348, + "mean_token_accuracy": 0.8155215978622437, + "num_tokens": 1574945.0, + "step": 167 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 1.4832708835601807, + "learning_rate": 4.217171717171717e-06, + "loss": 0.5133191347122192, + "mean_token_accuracy": 0.8367571830749512, + "num_tokens": 1595865.0, + "step": 168 + }, + { + "epoch": 0.128419452887538, + "grad_norm": 1.7807520627975464, + "learning_rate": 4.242424242424243e-06, + "loss": 0.5131410360336304, + "mean_token_accuracy": 0.8129367232322693, + "num_tokens": 1608723.0, + "step": 169 + }, + { + "epoch": 0.12917933130699089, + "grad_norm": 2.707569122314453, + "learning_rate": 4.267676767676767e-06, + "loss": 0.6129013299942017, + "mean_token_accuracy": 0.7926048040390015, + "num_tokens": 1616136.0, + "step": 170 + }, + { + "epoch": 0.12993920972644377, + "grad_norm": 2.5831644535064697, + "learning_rate": 4.292929292929293e-06, + "loss": 0.6264227628707886, + "mean_token_accuracy": 0.8074911236763, + "num_tokens": 1624228.0, + "step": 171 + }, + { + "epoch": 0.13069908814589665, + "grad_norm": 3.1124250888824463, + "learning_rate": 4.3181818181818185e-06, + "loss": 0.41763827204704285, + "mean_token_accuracy": 0.8565453290939331, + "num_tokens": 1628098.0, + "step": 172 + }, + { + "epoch": 0.13145896656534956, + "grad_norm": 2.3214211463928223, + "learning_rate": 4.343434343434344e-06, + "loss": 0.421974778175354, + "mean_token_accuracy": 0.8391546010971069, + "num_tokens": 1634950.0, + "step": 173 + }, + { + "epoch": 0.13221884498480244, + "grad_norm": 2.1010327339172363, + "learning_rate": 4.368686868686869e-06, + "loss": 0.5307331681251526, + "mean_token_accuracy": 0.8139588236808777, + "num_tokens": 1644132.0, + "step": 174 + }, + { + "epoch": 0.13297872340425532, + "grad_norm": 2.533612012863159, + "learning_rate": 4.393939393939394e-06, + "loss": 0.5626664161682129, + "mean_token_accuracy": 0.8029808402061462, + "num_tokens": 1651637.0, + "step": 175 + }, + { + "epoch": 0.1337386018237082, + "grad_norm": 1.669508457183838, + "learning_rate": 4.41919191919192e-06, + "loss": 0.5351508259773254, + "mean_token_accuracy": 0.8281655311584473, + "num_tokens": 1666776.0, + "step": 176 + }, + { + "epoch": 0.1344984802431611, + "grad_norm": 1.7579659223556519, + "learning_rate": 4.444444444444444e-06, + "loss": 0.5235031247138977, + "mean_token_accuracy": 0.8143284320831299, + "num_tokens": 1679241.0, + "step": 177 + }, + { + "epoch": 0.135258358662614, + "grad_norm": 3.123563528060913, + "learning_rate": 4.46969696969697e-06, + "loss": 0.43051332235336304, + "mean_token_accuracy": 0.8518186211585999, + "num_tokens": 1683317.0, + "step": 178 + }, + { + "epoch": 0.13601823708206687, + "grad_norm": 2.2411575317382812, + "learning_rate": 4.494949494949495e-06, + "loss": 0.5471380949020386, + "mean_token_accuracy": 0.8267596960067749, + "num_tokens": 1691366.0, + "step": 179 + }, + { + "epoch": 0.13677811550151975, + "grad_norm": 2.621973991394043, + "learning_rate": 4.520202020202021e-06, + "loss": 0.5685839653015137, + "mean_token_accuracy": 0.8260642290115356, + "num_tokens": 1698148.0, + "step": 180 + }, + { + "epoch": 0.13753799392097266, + "grad_norm": 2.1553852558135986, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.5703883171081543, + "mean_token_accuracy": 0.8219090700149536, + "num_tokens": 1707225.0, + "step": 181 + }, + { + "epoch": 0.13829787234042554, + "grad_norm": 5.1767897605896, + "learning_rate": 4.5707070707070715e-06, + "loss": 0.32704639434814453, + "mean_token_accuracy": 0.8754568099975586, + "num_tokens": 1712748.0, + "step": 182 + }, + { + "epoch": 0.13905775075987842, + "grad_norm": 2.609168291091919, + "learning_rate": 4.595959595959597e-06, + "loss": 0.5939987301826477, + "mean_token_accuracy": 0.8034975528717041, + "num_tokens": 1719932.0, + "step": 183 + }, + { + "epoch": 0.1398176291793313, + "grad_norm": 2.2059099674224854, + "learning_rate": 4.621212121212122e-06, + "loss": 0.5310720205307007, + "mean_token_accuracy": 0.8177368640899658, + "num_tokens": 1727640.0, + "step": 184 + }, + { + "epoch": 0.1405775075987842, + "grad_norm": 2.6367759704589844, + "learning_rate": 4.646464646464647e-06, + "loss": 0.522086501121521, + "mean_token_accuracy": 0.826233983039856, + "num_tokens": 1733609.0, + "step": 185 + }, + { + "epoch": 0.1413373860182371, + "grad_norm": 3.326732873916626, + "learning_rate": 4.671717171717172e-06, + "loss": 0.4127829074859619, + "mean_token_accuracy": 0.8551101684570312, + "num_tokens": 1737256.0, + "step": 186 + }, + { + "epoch": 0.14209726443768997, + "grad_norm": 1.828412413597107, + "learning_rate": 4.696969696969698e-06, + "loss": 0.5444269180297852, + "mean_token_accuracy": 0.8350818157196045, + "num_tokens": 1750196.0, + "step": 187 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 3.209203004837036, + "learning_rate": 4.722222222222222e-06, + "loss": 0.5087994933128357, + "mean_token_accuracy": 0.8349015712738037, + "num_tokens": 1754836.0, + "step": 188 + }, + { + "epoch": 0.14361702127659576, + "grad_norm": 1.7339166402816772, + "learning_rate": 4.747474747474748e-06, + "loss": 0.5151352286338806, + "mean_token_accuracy": 0.8321266174316406, + "num_tokens": 1766015.0, + "step": 189 + }, + { + "epoch": 0.14437689969604864, + "grad_norm": 2.699068069458008, + "learning_rate": 4.772727272727273e-06, + "loss": 0.4406203031539917, + "mean_token_accuracy": 0.8425000905990601, + "num_tokens": 1771684.0, + "step": 190 + }, + { + "epoch": 0.14513677811550152, + "grad_norm": 2.8117282390594482, + "learning_rate": 4.7979797979797985e-06, + "loss": 0.40428489446640015, + "mean_token_accuracy": 0.8654326796531677, + "num_tokens": 1776301.0, + "step": 191 + }, + { + "epoch": 0.1458966565349544, + "grad_norm": 2.9204647541046143, + "learning_rate": 4.823232323232324e-06, + "loss": 0.4191770553588867, + "mean_token_accuracy": 0.8574687242507935, + "num_tokens": 1781678.0, + "step": 192 + }, + { + "epoch": 0.1466565349544073, + "grad_norm": 2.1648988723754883, + "learning_rate": 4.848484848484849e-06, + "loss": 0.5839012861251831, + "mean_token_accuracy": 0.8053664565086365, + "num_tokens": 1792516.0, + "step": 193 + }, + { + "epoch": 0.1474164133738602, + "grad_norm": 2.3221631050109863, + "learning_rate": 4.873737373737374e-06, + "loss": 0.5037894248962402, + "mean_token_accuracy": 0.8427227139472961, + "num_tokens": 1800192.0, + "step": 194 + }, + { + "epoch": 0.14817629179331307, + "grad_norm": 2.4536430835723877, + "learning_rate": 4.898989898989899e-06, + "loss": 0.42326074838638306, + "mean_token_accuracy": 0.8510633111000061, + "num_tokens": 1806159.0, + "step": 195 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 2.4875805377960205, + "learning_rate": 4.924242424242425e-06, + "loss": 0.539531409740448, + "mean_token_accuracy": 0.8060250282287598, + "num_tokens": 1813392.0, + "step": 196 + }, + { + "epoch": 0.14969604863221886, + "grad_norm": 2.1664798259735107, + "learning_rate": 4.94949494949495e-06, + "loss": 0.42502015829086304, + "mean_token_accuracy": 0.8503251075744629, + "num_tokens": 1821424.0, + "step": 197 + }, + { + "epoch": 0.15045592705167174, + "grad_norm": 2.568808078765869, + "learning_rate": 4.974747474747475e-06, + "loss": 0.5025098323822021, + "mean_token_accuracy": 0.8182311058044434, + "num_tokens": 1827225.0, + "step": 198 + }, + { + "epoch": 0.15121580547112462, + "grad_norm": 1.9116802215576172, + "learning_rate": 5e-06, + "loss": 0.4907258450984955, + "mean_token_accuracy": 0.8310189843177795, + "num_tokens": 1836297.0, + "step": 199 + }, + { + "epoch": 0.1519756838905775, + "grad_norm": 3.150765895843506, + "learning_rate": 4.999999122701883e-06, + "loss": 0.390616774559021, + "mean_token_accuracy": 0.8626647591590881, + "num_tokens": 1839984.0, + "step": 200 + }, + { + "epoch": 0.15273556231003038, + "grad_norm": 3.2229044437408447, + "learning_rate": 4.999996490808146e-06, + "loss": 0.48009657859802246, + "mean_token_accuracy": 0.825214147567749, + "num_tokens": 1844610.0, + "step": 201 + }, + { + "epoch": 0.1534954407294833, + "grad_norm": 1.4473289251327515, + "learning_rate": 4.9999921043206356e-06, + "loss": 0.40135183930397034, + "mean_token_accuracy": 0.8537827730178833, + "num_tokens": 1859573.0, + "step": 202 + }, + { + "epoch": 0.15425531914893617, + "grad_norm": 4.072319507598877, + "learning_rate": 4.999985963242432e-06, + "loss": 0.6158689260482788, + "mean_token_accuracy": 0.8075432777404785, + "num_tokens": 1863147.0, + "step": 203 + }, + { + "epoch": 0.15501519756838905, + "grad_norm": 3.15741229057312, + "learning_rate": 4.999978067577844e-06, + "loss": 0.4603108763694763, + "mean_token_accuracy": 0.8418779373168945, + "num_tokens": 1867201.0, + "step": 204 + }, + { + "epoch": 0.15577507598784193, + "grad_norm": 2.1925418376922607, + "learning_rate": 4.999968417332415e-06, + "loss": 0.5552488565444946, + "mean_token_accuracy": 0.8216016292572021, + "num_tokens": 1874837.0, + "step": 205 + }, + { + "epoch": 0.15653495440729484, + "grad_norm": 2.2518117427825928, + "learning_rate": 4.999957012512916e-06, + "loss": 0.4912569522857666, + "mean_token_accuracy": 0.8284667730331421, + "num_tokens": 1881842.0, + "step": 206 + }, + { + "epoch": 0.15729483282674772, + "grad_norm": 1.8223762512207031, + "learning_rate": 4.999943853127351e-06, + "loss": 0.47709137201309204, + "mean_token_accuracy": 0.8311659097671509, + "num_tokens": 1890805.0, + "step": 207 + }, + { + "epoch": 0.1580547112462006, + "grad_norm": 2.066499948501587, + "learning_rate": 4.999928939184958e-06, + "loss": 0.44794657826423645, + "mean_token_accuracy": 0.8513424396514893, + "num_tokens": 1898264.0, + "step": 208 + }, + { + "epoch": 0.15881458966565348, + "grad_norm": 3.53865909576416, + "learning_rate": 4.999912270696202e-06, + "loss": 0.5978270769119263, + "mean_token_accuracy": 0.8080137968063354, + "num_tokens": 1902435.0, + "step": 209 + }, + { + "epoch": 0.1595744680851064, + "grad_norm": 2.0760679244995117, + "learning_rate": 4.999893847672783e-06, + "loss": 0.5930601358413696, + "mean_token_accuracy": 0.8028650283813477, + "num_tokens": 1912252.0, + "step": 210 + }, + { + "epoch": 0.16033434650455927, + "grad_norm": 2.21551513671875, + "learning_rate": 4.99987367012763e-06, + "loss": 0.6336753964424133, + "mean_token_accuracy": 0.7902286648750305, + "num_tokens": 1922095.0, + "step": 211 + }, + { + "epoch": 0.16109422492401215, + "grad_norm": 1.7654480934143066, + "learning_rate": 4.999851738074904e-06, + "loss": 0.6373403668403625, + "mean_token_accuracy": 0.7802424430847168, + "num_tokens": 1938962.0, + "step": 212 + }, + { + "epoch": 0.16185410334346503, + "grad_norm": 2.852834701538086, + "learning_rate": 4.9998280515300006e-06, + "loss": 0.6418683528900146, + "mean_token_accuracy": 0.7895716428756714, + "num_tokens": 1944668.0, + "step": 213 + }, + { + "epoch": 0.16261398176291794, + "grad_norm": 3.4737212657928467, + "learning_rate": 4.999802610509541e-06, + "loss": 0.6323273181915283, + "mean_token_accuracy": 0.7982614636421204, + "num_tokens": 1949142.0, + "step": 214 + }, + { + "epoch": 0.16337386018237082, + "grad_norm": 3.0802664756774902, + "learning_rate": 4.999775415031381e-06, + "loss": 0.5929068326950073, + "mean_token_accuracy": 0.8112219572067261, + "num_tokens": 1954141.0, + "step": 215 + }, + { + "epoch": 0.1641337386018237, + "grad_norm": 2.9808855056762695, + "learning_rate": 4.999746465114609e-06, + "loss": 0.5556406378746033, + "mean_token_accuracy": 0.8117628693580627, + "num_tokens": 1959406.0, + "step": 216 + }, + { + "epoch": 0.16489361702127658, + "grad_norm": 1.7346166372299194, + "learning_rate": 4.999715760779541e-06, + "loss": 0.5122925043106079, + "mean_token_accuracy": 0.8040724992752075, + "num_tokens": 1971921.0, + "step": 217 + }, + { + "epoch": 0.1656534954407295, + "grad_norm": 1.4183907508850098, + "learning_rate": 4.999683302047729e-06, + "loss": 0.46471893787384033, + "mean_token_accuracy": 0.8381330966949463, + "num_tokens": 1988863.0, + "step": 218 + }, + { + "epoch": 0.16641337386018237, + "grad_norm": 1.6797802448272705, + "learning_rate": 4.999649088941951e-06, + "loss": 0.38348832726478577, + "mean_token_accuracy": 0.8344278931617737, + "num_tokens": 2000003.0, + "step": 219 + }, + { + "epoch": 0.16717325227963525, + "grad_norm": 3.036963939666748, + "learning_rate": 4.999613121486222e-06, + "loss": 0.6062780618667603, + "mean_token_accuracy": 0.8217900991439819, + "num_tokens": 2004813.0, + "step": 220 + }, + { + "epoch": 0.16793313069908813, + "grad_norm": 2.0343217849731445, + "learning_rate": 4.999575399705782e-06, + "loss": 0.5052450895309448, + "mean_token_accuracy": 0.8368623852729797, + "num_tokens": 2013565.0, + "step": 221 + }, + { + "epoch": 0.16869300911854104, + "grad_norm": 2.1162009239196777, + "learning_rate": 4.9995359236271094e-06, + "loss": 0.5169756412506104, + "mean_token_accuracy": 0.8339958190917969, + "num_tokens": 2025763.0, + "step": 222 + }, + { + "epoch": 0.16945288753799392, + "grad_norm": 2.055333375930786, + "learning_rate": 4.9994946932779076e-06, + "loss": 0.6327048540115356, + "mean_token_accuracy": 0.8078711032867432, + "num_tokens": 2037005.0, + "step": 223 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 3.334620475769043, + "learning_rate": 4.999451708687114e-06, + "loss": 0.5688358545303345, + "mean_token_accuracy": 0.8015589714050293, + "num_tokens": 2041473.0, + "step": 224 + }, + { + "epoch": 0.17097264437689969, + "grad_norm": 2.3734676837921143, + "learning_rate": 4.999406969884897e-06, + "loss": 0.5673821568489075, + "mean_token_accuracy": 0.8054057359695435, + "num_tokens": 2049397.0, + "step": 225 + }, + { + "epoch": 0.1717325227963526, + "grad_norm": 1.807358980178833, + "learning_rate": 4.999360476902656e-06, + "loss": 0.4376158118247986, + "mean_token_accuracy": 0.8456039428710938, + "num_tokens": 2058721.0, + "step": 226 + }, + { + "epoch": 0.17249240121580547, + "grad_norm": 3.231638193130493, + "learning_rate": 4.999312229773022e-06, + "loss": 0.5592809915542603, + "mean_token_accuracy": 0.8170154094696045, + "num_tokens": 2063455.0, + "step": 227 + }, + { + "epoch": 0.17325227963525835, + "grad_norm": 2.2717151641845703, + "learning_rate": 4.999262228529855e-06, + "loss": 0.6144396066665649, + "mean_token_accuracy": 0.7948470115661621, + "num_tokens": 2071686.0, + "step": 228 + }, + { + "epoch": 0.17401215805471124, + "grad_norm": 1.4171342849731445, + "learning_rate": 4.99921047320825e-06, + "loss": 0.43680912256240845, + "mean_token_accuracy": 0.84850013256073, + "num_tokens": 2086999.0, + "step": 229 + }, + { + "epoch": 0.17477203647416414, + "grad_norm": 3.162736654281616, + "learning_rate": 4.99915696384453e-06, + "loss": 0.6025407910346985, + "mean_token_accuracy": 0.8042335510253906, + "num_tokens": 2092001.0, + "step": 230 + }, + { + "epoch": 0.17553191489361702, + "grad_norm": 1.8672804832458496, + "learning_rate": 4.99910170047625e-06, + "loss": 0.5843087434768677, + "mean_token_accuracy": 0.8016980886459351, + "num_tokens": 2103372.0, + "step": 231 + }, + { + "epoch": 0.1762917933130699, + "grad_norm": 2.967587471008301, + "learning_rate": 4.999044683142196e-06, + "loss": 0.5123642086982727, + "mean_token_accuracy": 0.8216149806976318, + "num_tokens": 2108008.0, + "step": 232 + }, + { + "epoch": 0.1770516717325228, + "grad_norm": 1.9651981592178345, + "learning_rate": 4.998985911882383e-06, + "loss": 0.5868178606033325, + "mean_token_accuracy": 0.7904198169708252, + "num_tokens": 2119009.0, + "step": 233 + }, + { + "epoch": 0.1778115501519757, + "grad_norm": 2.7785449028015137, + "learning_rate": 4.998925386738063e-06, + "loss": 0.5075510144233704, + "mean_token_accuracy": 0.8280210494995117, + "num_tokens": 2124915.0, + "step": 234 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 2.957470417022705, + "learning_rate": 4.998863107751711e-06, + "loss": 0.5351958274841309, + "mean_token_accuracy": 0.846825122833252, + "num_tokens": 2129905.0, + "step": 235 + }, + { + "epoch": 0.17933130699088146, + "grad_norm": 3.207671880722046, + "learning_rate": 4.99879907496704e-06, + "loss": 0.6209091544151306, + "mean_token_accuracy": 0.789960503578186, + "num_tokens": 2135027.0, + "step": 236 + }, + { + "epoch": 0.18009118541033434, + "grad_norm": 2.018953800201416, + "learning_rate": 4.998733288428987e-06, + "loss": 0.601510763168335, + "mean_token_accuracy": 0.8136930465698242, + "num_tokens": 2147016.0, + "step": 237 + }, + { + "epoch": 0.18085106382978725, + "grad_norm": 2.437281847000122, + "learning_rate": 4.998665748183727e-06, + "loss": 0.5813639163970947, + "mean_token_accuracy": 0.8116716146469116, + "num_tokens": 2155386.0, + "step": 238 + }, + { + "epoch": 0.18161094224924013, + "grad_norm": 1.5708180665969849, + "learning_rate": 4.998596454278661e-06, + "loss": 0.5252395272254944, + "mean_token_accuracy": 0.8193864822387695, + "num_tokens": 2170295.0, + "step": 239 + }, + { + "epoch": 0.182370820668693, + "grad_norm": 1.9921495914459229, + "learning_rate": 4.998525406762422e-06, + "loss": 0.5335029363632202, + "mean_token_accuracy": 0.8120872974395752, + "num_tokens": 2180012.0, + "step": 240 + }, + { + "epoch": 0.1831306990881459, + "grad_norm": 2.6562681198120117, + "learning_rate": 4.998452605684874e-06, + "loss": 0.48021435737609863, + "mean_token_accuracy": 0.8388714790344238, + "num_tokens": 2185607.0, + "step": 241 + }, + { + "epoch": 0.1838905775075988, + "grad_norm": 2.2535853385925293, + "learning_rate": 4.998378051097111e-06, + "loss": 0.5747300386428833, + "mean_token_accuracy": 0.8004639148712158, + "num_tokens": 2194105.0, + "step": 242 + }, + { + "epoch": 0.18465045592705168, + "grad_norm": 1.6151788234710693, + "learning_rate": 4.998301743051459e-06, + "loss": 0.6190565824508667, + "mean_token_accuracy": 0.7816627621650696, + "num_tokens": 2210629.0, + "step": 243 + }, + { + "epoch": 0.18541033434650456, + "grad_norm": 2.1088173389434814, + "learning_rate": 4.9982236816014735e-06, + "loss": 0.4715560972690582, + "mean_token_accuracy": 0.8485721349716187, + "num_tokens": 2218958.0, + "step": 244 + }, + { + "epoch": 0.18617021276595744, + "grad_norm": 2.6168735027313232, + "learning_rate": 4.998143866801941e-06, + "loss": 0.6077103018760681, + "mean_token_accuracy": 0.8057924509048462, + "num_tokens": 2226368.0, + "step": 245 + }, + { + "epoch": 0.18693009118541035, + "grad_norm": 2.5988616943359375, + "learning_rate": 4.99806229870888e-06, + "loss": 0.5021637678146362, + "mean_token_accuracy": 0.8361666202545166, + "num_tokens": 2232485.0, + "step": 246 + }, + { + "epoch": 0.18768996960486323, + "grad_norm": 2.015887498855591, + "learning_rate": 4.9979789773795365e-06, + "loss": 0.4309737980365753, + "mean_token_accuracy": 0.8508044481277466, + "num_tokens": 2240819.0, + "step": 247 + }, + { + "epoch": 0.1884498480243161, + "grad_norm": 2.3115265369415283, + "learning_rate": 4.997893902872389e-06, + "loss": 0.5776500701904297, + "mean_token_accuracy": 0.8079549074172974, + "num_tokens": 2249460.0, + "step": 248 + }, + { + "epoch": 0.189209726443769, + "grad_norm": 1.7387021780014038, + "learning_rate": 4.997807075247147e-06, + "loss": 0.430944561958313, + "mean_token_accuracy": 0.8483544588088989, + "num_tokens": 2259124.0, + "step": 249 + }, + { + "epoch": 0.1899696048632219, + "grad_norm": 1.6378381252288818, + "learning_rate": 4.997718494564747e-06, + "loss": 0.4123363792896271, + "mean_token_accuracy": 0.8557409644126892, + "num_tokens": 2269899.0, + "step": 250 + }, + { + "epoch": 0.19072948328267478, + "grad_norm": 1.336282730102539, + "learning_rate": 4.997628160887361e-06, + "loss": 0.502329409122467, + "mean_token_accuracy": 0.8186938166618347, + "num_tokens": 2292821.0, + "step": 251 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 3.3335583209991455, + "learning_rate": 4.997536074278388e-06, + "loss": 0.584446907043457, + "mean_token_accuracy": 0.8062717318534851, + "num_tokens": 2297175.0, + "step": 252 + }, + { + "epoch": 0.19224924012158054, + "grad_norm": 2.246727228164673, + "learning_rate": 4.9974422348024565e-06, + "loss": 0.5683060884475708, + "mean_token_accuracy": 0.8193703293800354, + "num_tokens": 2305456.0, + "step": 253 + }, + { + "epoch": 0.19300911854103345, + "grad_norm": 2.3520865440368652, + "learning_rate": 4.997346642525429e-06, + "loss": 0.4724946618080139, + "mean_token_accuracy": 0.8426719307899475, + "num_tokens": 2312241.0, + "step": 254 + }, + { + "epoch": 0.19376899696048633, + "grad_norm": 2.7115702629089355, + "learning_rate": 4.9972492975143936e-06, + "loss": 0.5019032955169678, + "mean_token_accuracy": 0.8253573179244995, + "num_tokens": 2318094.0, + "step": 255 + }, + { + "epoch": 0.1945288753799392, + "grad_norm": 1.705528974533081, + "learning_rate": 4.997150199837671e-06, + "loss": 0.45588475465774536, + "mean_token_accuracy": 0.836666464805603, + "num_tokens": 2329025.0, + "step": 256 + }, + { + "epoch": 0.1952887537993921, + "grad_norm": 2.161400318145752, + "learning_rate": 4.997049349564814e-06, + "loss": 0.5170183777809143, + "mean_token_accuracy": 0.8287534117698669, + "num_tokens": 2337448.0, + "step": 257 + }, + { + "epoch": 0.196048632218845, + "grad_norm": 2.629669189453125, + "learning_rate": 4.996946746766602e-06, + "loss": 0.44650501012802124, + "mean_token_accuracy": 0.850114107131958, + "num_tokens": 2343207.0, + "step": 258 + }, + { + "epoch": 0.19680851063829788, + "grad_norm": 1.6735503673553467, + "learning_rate": 4.996842391515045e-06, + "loss": 0.5247820019721985, + "mean_token_accuracy": 0.8285071849822998, + "num_tokens": 2356801.0, + "step": 259 + }, + { + "epoch": 0.19756838905775076, + "grad_norm": 1.2753115892410278, + "learning_rate": 4.996736283883382e-06, + "loss": 0.41870927810668945, + "mean_token_accuracy": 0.8448047637939453, + "num_tokens": 2377306.0, + "step": 260 + }, + { + "epoch": 0.19832826747720364, + "grad_norm": 2.6947314739227295, + "learning_rate": 4.9966284239460875e-06, + "loss": 0.5059205889701843, + "mean_token_accuracy": 0.8430814743041992, + "num_tokens": 2383352.0, + "step": 261 + }, + { + "epoch": 0.19908814589665655, + "grad_norm": 2.0509963035583496, + "learning_rate": 4.996518811778858e-06, + "loss": 0.4565388560295105, + "mean_token_accuracy": 0.8453130722045898, + "num_tokens": 2391149.0, + "step": 262 + }, + { + "epoch": 0.19984802431610943, + "grad_norm": 2.1856348514556885, + "learning_rate": 4.996407447458626e-06, + "loss": 0.531380832195282, + "mean_token_accuracy": 0.8387004137039185, + "num_tokens": 2399875.0, + "step": 263 + }, + { + "epoch": 0.2006079027355623, + "grad_norm": 2.7348573207855225, + "learning_rate": 4.99629433106355e-06, + "loss": 0.5242817401885986, + "mean_token_accuracy": 0.8177423477172852, + "num_tokens": 2406586.0, + "step": 264 + }, + { + "epoch": 0.2013677811550152, + "grad_norm": 1.76587975025177, + "learning_rate": 4.99617946267302e-06, + "loss": 0.49298471212387085, + "mean_token_accuracy": 0.8271149396896362, + "num_tokens": 2418683.0, + "step": 265 + }, + { + "epoch": 0.20212765957446807, + "grad_norm": 2.8129730224609375, + "learning_rate": 4.996062842367655e-06, + "loss": 0.46420302987098694, + "mean_token_accuracy": 0.8453244566917419, + "num_tokens": 2422929.0, + "step": 266 + }, + { + "epoch": 0.20288753799392098, + "grad_norm": 2.575744152069092, + "learning_rate": 4.9959444702293025e-06, + "loss": 0.43208545446395874, + "mean_token_accuracy": 0.8494843244552612, + "num_tokens": 2429567.0, + "step": 267 + }, + { + "epoch": 0.20364741641337386, + "grad_norm": 2.7586750984191895, + "learning_rate": 4.995824346341041e-06, + "loss": 0.4390473961830139, + "mean_token_accuracy": 0.8348895311355591, + "num_tokens": 2434700.0, + "step": 268 + }, + { + "epoch": 0.20440729483282674, + "grad_norm": 1.972145438194275, + "learning_rate": 4.99570247078718e-06, + "loss": 0.6219544410705566, + "mean_token_accuracy": 0.7939999103546143, + "num_tokens": 2447007.0, + "step": 269 + }, + { + "epoch": 0.20516717325227962, + "grad_norm": 2.2963485717773438, + "learning_rate": 4.995578843653255e-06, + "loss": 0.5008970499038696, + "mean_token_accuracy": 0.8255308866500854, + "num_tokens": 2453936.0, + "step": 270 + }, + { + "epoch": 0.20592705167173253, + "grad_norm": 1.8897721767425537, + "learning_rate": 4.995453465026033e-06, + "loss": 0.5436089038848877, + "mean_token_accuracy": 0.819086492061615, + "num_tokens": 2464494.0, + "step": 271 + }, + { + "epoch": 0.2066869300911854, + "grad_norm": 2.319728374481201, + "learning_rate": 4.995326334993508e-06, + "loss": 0.5136368870735168, + "mean_token_accuracy": 0.820817232131958, + "num_tokens": 2470938.0, + "step": 272 + }, + { + "epoch": 0.2074468085106383, + "grad_norm": 2.230414390563965, + "learning_rate": 4.9951974536449055e-06, + "loss": 0.5272846817970276, + "mean_token_accuracy": 0.8203279972076416, + "num_tokens": 2478629.0, + "step": 273 + }, + { + "epoch": 0.20820668693009117, + "grad_norm": 3.401937484741211, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.4389592111110687, + "mean_token_accuracy": 0.8647899031639099, + "num_tokens": 2482193.0, + "step": 274 + }, + { + "epoch": 0.20896656534954408, + "grad_norm": 2.1278507709503174, + "learning_rate": 4.994934437362513e-06, + "loss": 0.598863422870636, + "mean_token_accuracy": 0.7945119738578796, + "num_tokens": 2492465.0, + "step": 275 + }, + { + "epoch": 0.20972644376899696, + "grad_norm": 1.9259960651397705, + "learning_rate": 4.994800302613318e-06, + "loss": 0.49520939588546753, + "mean_token_accuracy": 0.8371536135673523, + "num_tokens": 2500825.0, + "step": 276 + }, + { + "epoch": 0.21048632218844984, + "grad_norm": 2.346418857574463, + "learning_rate": 4.994664416917236e-06, + "loss": 0.5412614345550537, + "mean_token_accuracy": 0.810661792755127, + "num_tokens": 2509513.0, + "step": 277 + }, + { + "epoch": 0.21124620060790272, + "grad_norm": 1.3092039823532104, + "learning_rate": 4.994526780369636e-06, + "loss": 0.46305379271507263, + "mean_token_accuracy": 0.8358527421951294, + "num_tokens": 2531405.0, + "step": 278 + }, + { + "epoch": 0.21200607902735563, + "grad_norm": 2.924611806869507, + "learning_rate": 4.9943873930671175e-06, + "loss": 0.6134544610977173, + "mean_token_accuracy": 0.7947378754615784, + "num_tokens": 2536744.0, + "step": 279 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 2.8290598392486572, + "learning_rate": 4.994246255107506e-06, + "loss": 0.465520441532135, + "mean_token_accuracy": 0.8440108299255371, + "num_tokens": 2541184.0, + "step": 280 + }, + { + "epoch": 0.2135258358662614, + "grad_norm": 3.8081259727478027, + "learning_rate": 4.994103366589859e-06, + "loss": 0.43394139409065247, + "mean_token_accuracy": 0.8579148054122925, + "num_tokens": 2545395.0, + "step": 281 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 1.7994529008865356, + "learning_rate": 4.993958727614462e-06, + "loss": 0.5076484680175781, + "mean_token_accuracy": 0.8270803093910217, + "num_tokens": 2556541.0, + "step": 282 + }, + { + "epoch": 0.21504559270516718, + "grad_norm": 2.5582659244537354, + "learning_rate": 4.993812338282826e-06, + "loss": 0.4453684389591217, + "mean_token_accuracy": 0.8488293886184692, + "num_tokens": 2562949.0, + "step": 283 + }, + { + "epoch": 0.21580547112462006, + "grad_norm": 1.6448938846588135, + "learning_rate": 4.993664198697694e-06, + "loss": 0.461971640586853, + "mean_token_accuracy": 0.824763298034668, + "num_tokens": 2576407.0, + "step": 284 + }, + { + "epoch": 0.21656534954407294, + "grad_norm": 2.1264469623565674, + "learning_rate": 4.993514308963037e-06, + "loss": 0.6241602897644043, + "mean_token_accuracy": 0.7916014790534973, + "num_tokens": 2585695.0, + "step": 285 + }, + { + "epoch": 0.21732522796352582, + "grad_norm": 3.629991292953491, + "learning_rate": 4.993362669184051e-06, + "loss": 0.610355019569397, + "mean_token_accuracy": 0.7847568988800049, + "num_tokens": 2589778.0, + "step": 286 + }, + { + "epoch": 0.21808510638297873, + "grad_norm": 1.9070756435394287, + "learning_rate": 4.993209279467164e-06, + "loss": 0.5513623952865601, + "mean_token_accuracy": 0.7911607027053833, + "num_tokens": 2600920.0, + "step": 287 + }, + { + "epoch": 0.2188449848024316, + "grad_norm": 1.761062741279602, + "learning_rate": 4.993054139920031e-06, + "loss": 0.4579957127571106, + "mean_token_accuracy": 0.8189530372619629, + "num_tokens": 2611856.0, + "step": 288 + }, + { + "epoch": 0.2196048632218845, + "grad_norm": 1.7264713048934937, + "learning_rate": 4.992897250651535e-06, + "loss": 0.5871305465698242, + "mean_token_accuracy": 0.7918527126312256, + "num_tokens": 2624730.0, + "step": 289 + }, + { + "epoch": 0.22036474164133737, + "grad_norm": 1.7455977201461792, + "learning_rate": 4.992738611771787e-06, + "loss": 0.5475119948387146, + "mean_token_accuracy": 0.8226917386054993, + "num_tokens": 2635705.0, + "step": 290 + }, + { + "epoch": 0.22112462006079028, + "grad_norm": 2.095095157623291, + "learning_rate": 4.992578223392124e-06, + "loss": 0.5952225923538208, + "mean_token_accuracy": 0.8078469038009644, + "num_tokens": 2643954.0, + "step": 291 + }, + { + "epoch": 0.22188449848024316, + "grad_norm": 2.994664192199707, + "learning_rate": 4.992416085625115e-06, + "loss": 0.5432442426681519, + "mean_token_accuracy": 0.8329008221626282, + "num_tokens": 2648800.0, + "step": 292 + }, + { + "epoch": 0.22264437689969604, + "grad_norm": 2.796790361404419, + "learning_rate": 4.992252198584554e-06, + "loss": 0.5168961882591248, + "mean_token_accuracy": 0.8393474817276001, + "num_tokens": 2653546.0, + "step": 293 + }, + { + "epoch": 0.22340425531914893, + "grad_norm": 1.8610522747039795, + "learning_rate": 4.992086562385462e-06, + "loss": 0.5728024244308472, + "mean_token_accuracy": 0.797406792640686, + "num_tokens": 2667483.0, + "step": 294 + }, + { + "epoch": 0.22416413373860183, + "grad_norm": 1.695472002029419, + "learning_rate": 4.9919191771440905e-06, + "loss": 0.5460028648376465, + "mean_token_accuracy": 0.8123016357421875, + "num_tokens": 2683574.0, + "step": 295 + }, + { + "epoch": 0.22492401215805471, + "grad_norm": 2.8627376556396484, + "learning_rate": 4.9917500429779165e-06, + "loss": 0.5566985011100769, + "mean_token_accuracy": 0.815531313419342, + "num_tokens": 2688985.0, + "step": 296 + }, + { + "epoch": 0.2256838905775076, + "grad_norm": 2.73323655128479, + "learning_rate": 4.991579160005644e-06, + "loss": 0.48197102546691895, + "mean_token_accuracy": 0.8471829295158386, + "num_tokens": 2694799.0, + "step": 297 + }, + { + "epoch": 0.22644376899696048, + "grad_norm": 1.8436161279678345, + "learning_rate": 4.991406528347206e-06, + "loss": 0.4528339207172394, + "mean_token_accuracy": 0.8603188395500183, + "num_tokens": 2707321.0, + "step": 298 + }, + { + "epoch": 0.22720364741641338, + "grad_norm": 2.6231515407562256, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.5916541814804077, + "mean_token_accuracy": 0.8050242066383362, + "num_tokens": 2714233.0, + "step": 299 + }, + { + "epoch": 0.22796352583586627, + "grad_norm": 3.08776593208313, + "learning_rate": 4.991056019457697e-06, + "loss": 0.4860580563545227, + "mean_token_accuracy": 0.8464088439941406, + "num_tokens": 2718443.0, + "step": 300 + }, + { + "epoch": 0.22872340425531915, + "grad_norm": 2.2537803649902344, + "learning_rate": 4.990878142472628e-06, + "loss": 0.5158311128616333, + "mean_token_accuracy": 0.824694812297821, + "num_tokens": 2726158.0, + "step": 301 + }, + { + "epoch": 0.22948328267477203, + "grad_norm": 2.1122705936431885, + "learning_rate": 4.990698517293394e-06, + "loss": 0.495265394449234, + "mean_token_accuracy": 0.8343238830566406, + "num_tokens": 2735022.0, + "step": 302 + }, + { + "epoch": 0.23024316109422494, + "grad_norm": 3.5503528118133545, + "learning_rate": 4.9905171440460645e-06, + "loss": 0.46063232421875, + "mean_token_accuracy": 0.8420047760009766, + "num_tokens": 2738550.0, + "step": 303 + }, + { + "epoch": 0.23100303951367782, + "grad_norm": 3.9858486652374268, + "learning_rate": 4.990334022857932e-06, + "loss": 0.5832710266113281, + "mean_token_accuracy": 0.8144199848175049, + "num_tokens": 2741720.0, + "step": 304 + }, + { + "epoch": 0.2317629179331307, + "grad_norm": 2.407231330871582, + "learning_rate": 4.990149153857519e-06, + "loss": 0.4692630171775818, + "mean_token_accuracy": 0.8429223299026489, + "num_tokens": 2748693.0, + "step": 305 + }, + { + "epoch": 0.23252279635258358, + "grad_norm": 1.6996397972106934, + "learning_rate": 4.989962537174573e-06, + "loss": 0.49143946170806885, + "mean_token_accuracy": 0.8340128064155579, + "num_tokens": 2761254.0, + "step": 306 + }, + { + "epoch": 0.23328267477203649, + "grad_norm": 3.746432065963745, + "learning_rate": 4.989774172940071e-06, + "loss": 0.6282026767730713, + "mean_token_accuracy": 0.775698184967041, + "num_tokens": 2765115.0, + "step": 307 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 2.212872266769409, + "learning_rate": 4.989584061286211e-06, + "loss": 0.5193763971328735, + "mean_token_accuracy": 0.8168246746063232, + "num_tokens": 2772345.0, + "step": 308 + }, + { + "epoch": 0.23480243161094225, + "grad_norm": 1.752297282218933, + "learning_rate": 4.989392202346423e-06, + "loss": 0.4437984824180603, + "mean_token_accuracy": 0.8451256155967712, + "num_tokens": 2783072.0, + "step": 309 + }, + { + "epoch": 0.23556231003039513, + "grad_norm": 2.386019706726074, + "learning_rate": 4.989198596255361e-06, + "loss": 0.4090752899646759, + "mean_token_accuracy": 0.8480085134506226, + "num_tokens": 2788757.0, + "step": 310 + }, + { + "epoch": 0.23632218844984804, + "grad_norm": 3.9981489181518555, + "learning_rate": 4.989003243148904e-06, + "loss": 0.5149132013320923, + "mean_token_accuracy": 0.8179056644439697, + "num_tokens": 2792096.0, + "step": 311 + }, + { + "epoch": 0.23708206686930092, + "grad_norm": 1.8723100423812866, + "learning_rate": 4.988806143164159e-06, + "loss": 0.4531487822532654, + "mean_token_accuracy": 0.8400167226791382, + "num_tokens": 2802210.0, + "step": 312 + }, + { + "epoch": 0.2378419452887538, + "grad_norm": 2.3415136337280273, + "learning_rate": 4.988607296439459e-06, + "loss": 0.5974439978599548, + "mean_token_accuracy": 0.8035976886749268, + "num_tokens": 2810088.0, + "step": 313 + }, + { + "epoch": 0.23860182370820668, + "grad_norm": 1.5317577123641968, + "learning_rate": 4.98840670311436e-06, + "loss": 0.49247145652770996, + "mean_token_accuracy": 0.8292540311813354, + "num_tokens": 2824005.0, + "step": 314 + }, + { + "epoch": 0.2393617021276596, + "grad_norm": 2.170772075653076, + "learning_rate": 4.988204363329648e-06, + "loss": 0.6359974145889282, + "mean_token_accuracy": 0.7785564661026001, + "num_tokens": 2834680.0, + "step": 315 + }, + { + "epoch": 0.24012158054711247, + "grad_norm": 3.2655932903289795, + "learning_rate": 4.988000277227334e-06, + "loss": 0.5080196857452393, + "mean_token_accuracy": 0.8295877575874329, + "num_tokens": 2838735.0, + "step": 316 + }, + { + "epoch": 0.24088145896656535, + "grad_norm": 3.406589984893799, + "learning_rate": 4.987794444950651e-06, + "loss": 0.3939085006713867, + "mean_token_accuracy": 0.8700719475746155, + "num_tokens": 2842127.0, + "step": 317 + }, + { + "epoch": 0.24164133738601823, + "grad_norm": 1.8211106061935425, + "learning_rate": 4.987586866644061e-06, + "loss": 0.5270540118217468, + "mean_token_accuracy": 0.826683521270752, + "num_tokens": 2853656.0, + "step": 318 + }, + { + "epoch": 0.24240121580547114, + "grad_norm": 1.8429969549179077, + "learning_rate": 4.9873775424532515e-06, + "loss": 0.4705049991607666, + "mean_token_accuracy": 0.8355701565742493, + "num_tokens": 2863513.0, + "step": 319 + }, + { + "epoch": 0.24316109422492402, + "grad_norm": 2.2425320148468018, + "learning_rate": 4.9871664725251314e-06, + "loss": 0.485736608505249, + "mean_token_accuracy": 0.835182785987854, + "num_tokens": 2871556.0, + "step": 320 + }, + { + "epoch": 0.2439209726443769, + "grad_norm": 1.6202056407928467, + "learning_rate": 4.986953657007841e-06, + "loss": 0.4437887370586395, + "mean_token_accuracy": 0.8282591700553894, + "num_tokens": 2884335.0, + "step": 321 + }, + { + "epoch": 0.24468085106382978, + "grad_norm": 1.1027268171310425, + "learning_rate": 4.98673909605074e-06, + "loss": 0.3770800828933716, + "mean_token_accuracy": 0.8325437307357788, + "num_tokens": 2904286.0, + "step": 322 + }, + { + "epoch": 0.2454407294832827, + "grad_norm": 2.3239076137542725, + "learning_rate": 4.986522789804417e-06, + "loss": 0.5387254953384399, + "mean_token_accuracy": 0.806242823600769, + "num_tokens": 2910975.0, + "step": 323 + }, + { + "epoch": 0.24620060790273557, + "grad_norm": 2.243482828140259, + "learning_rate": 4.986304738420684e-06, + "loss": 0.4396553039550781, + "mean_token_accuracy": 0.8561904430389404, + "num_tokens": 2917087.0, + "step": 324 + }, + { + "epoch": 0.24696048632218845, + "grad_norm": 2.537264347076416, + "learning_rate": 4.986084942052577e-06, + "loss": 0.395110160112381, + "mean_token_accuracy": 0.8636915683746338, + "num_tokens": 2921887.0, + "step": 325 + }, + { + "epoch": 0.24772036474164133, + "grad_norm": 2.319399118423462, + "learning_rate": 4.9858634008543574e-06, + "loss": 0.581517219543457, + "mean_token_accuracy": 0.8157487511634827, + "num_tokens": 2928996.0, + "step": 326 + }, + { + "epoch": 0.24848024316109424, + "grad_norm": 1.9787474870681763, + "learning_rate": 4.985640114981513e-06, + "loss": 0.5084106922149658, + "mean_token_accuracy": 0.835221529006958, + "num_tokens": 2940302.0, + "step": 327 + }, + { + "epoch": 0.24924012158054712, + "grad_norm": 2.4783265590667725, + "learning_rate": 4.985415084590752e-06, + "loss": 0.6062222719192505, + "mean_token_accuracy": 0.7885516285896301, + "num_tokens": 2946386.0, + "step": 328 + }, + { + "epoch": 0.25, + "grad_norm": 2.4081411361694336, + "learning_rate": 4.985188309840012e-06, + "loss": 0.5079880356788635, + "mean_token_accuracy": 0.8313904404640198, + "num_tokens": 2952323.0, + "step": 329 + }, + { + "epoch": 0.2507598784194529, + "grad_norm": 2.64993953704834, + "learning_rate": 4.984959790888451e-06, + "loss": 0.5461447834968567, + "mean_token_accuracy": 0.8125468492507935, + "num_tokens": 2958119.0, + "step": 330 + }, + { + "epoch": 0.25151975683890576, + "grad_norm": 2.549734115600586, + "learning_rate": 4.984729527896451e-06, + "loss": 0.5998573303222656, + "mean_token_accuracy": 0.8076666593551636, + "num_tokens": 2964947.0, + "step": 331 + }, + { + "epoch": 0.25227963525835867, + "grad_norm": 3.2185161113739014, + "learning_rate": 4.984497521025622e-06, + "loss": 0.4232945442199707, + "mean_token_accuracy": 0.8543803095817566, + "num_tokens": 2968598.0, + "step": 332 + }, + { + "epoch": 0.2530395136778115, + "grad_norm": 2.588994264602661, + "learning_rate": 4.984263770438793e-06, + "loss": 0.460967481136322, + "mean_token_accuracy": 0.8416207432746887, + "num_tokens": 2974510.0, + "step": 333 + }, + { + "epoch": 0.25379939209726443, + "grad_norm": 2.1373162269592285, + "learning_rate": 4.984028276300021e-06, + "loss": 0.49382102489471436, + "mean_token_accuracy": 0.8388048410415649, + "num_tokens": 2981632.0, + "step": 334 + }, + { + "epoch": 0.25455927051671734, + "grad_norm": 2.2524826526641846, + "learning_rate": 4.983791038774585e-06, + "loss": 0.4947671890258789, + "mean_token_accuracy": 0.8066365122795105, + "num_tokens": 2988736.0, + "step": 335 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 1.7244199514389038, + "learning_rate": 4.983552058028985e-06, + "loss": 0.48096776008605957, + "mean_token_accuracy": 0.830735445022583, + "num_tokens": 3003576.0, + "step": 336 + }, + { + "epoch": 0.2560790273556231, + "grad_norm": 3.0628933906555176, + "learning_rate": 4.9833113342309495e-06, + "loss": 0.6027032136917114, + "mean_token_accuracy": 0.8008694648742676, + "num_tokens": 3009549.0, + "step": 337 + }, + { + "epoch": 0.256838905775076, + "grad_norm": 2.438674211502075, + "learning_rate": 4.983068867549427e-06, + "loss": 0.517090916633606, + "mean_token_accuracy": 0.827893853187561, + "num_tokens": 3015236.0, + "step": 338 + }, + { + "epoch": 0.25759878419452886, + "grad_norm": 2.131535053253174, + "learning_rate": 4.982824658154589e-06, + "loss": 0.6656812429428101, + "mean_token_accuracy": 0.7772425413131714, + "num_tokens": 3028142.0, + "step": 339 + }, + { + "epoch": 0.25835866261398177, + "grad_norm": 2.3206584453582764, + "learning_rate": 4.9825787062178315e-06, + "loss": 0.5757625699043274, + "mean_token_accuracy": 0.8073873519897461, + "num_tokens": 3040996.0, + "step": 340 + }, + { + "epoch": 0.2591185410334346, + "grad_norm": 1.3905521631240845, + "learning_rate": 4.982331011911774e-06, + "loss": 0.4193805456161499, + "mean_token_accuracy": 0.8399466872215271, + "num_tokens": 3061931.0, + "step": 341 + }, + { + "epoch": 0.25987841945288753, + "grad_norm": 2.184173345565796, + "learning_rate": 4.982081575410256e-06, + "loss": 0.4751223921775818, + "mean_token_accuracy": 0.8409271240234375, + "num_tokens": 3069081.0, + "step": 342 + }, + { + "epoch": 0.26063829787234044, + "grad_norm": 3.538764238357544, + "learning_rate": 4.9818303968883445e-06, + "loss": 0.8119601011276245, + "mean_token_accuracy": 0.7442739009857178, + "num_tokens": 3073628.0, + "step": 343 + }, + { + "epoch": 0.2613981762917933, + "grad_norm": 1.8063762187957764, + "learning_rate": 4.981577476522323e-06, + "loss": 0.5615730881690979, + "mean_token_accuracy": 0.8207751512527466, + "num_tokens": 3086596.0, + "step": 344 + }, + { + "epoch": 0.2621580547112462, + "grad_norm": 2.4346961975097656, + "learning_rate": 4.981322814489703e-06, + "loss": 0.5266709327697754, + "mean_token_accuracy": 0.8211277723312378, + "num_tokens": 3092631.0, + "step": 345 + }, + { + "epoch": 0.2629179331306991, + "grad_norm": 1.91289484500885, + "learning_rate": 4.981066410969215e-06, + "loss": 0.5047177672386169, + "mean_token_accuracy": 0.8356877565383911, + "num_tokens": 3101102.0, + "step": 346 + }, + { + "epoch": 0.26367781155015196, + "grad_norm": 2.1495707035064697, + "learning_rate": 4.980808266140813e-06, + "loss": 0.47876280546188354, + "mean_token_accuracy": 0.8364313244819641, + "num_tokens": 3107998.0, + "step": 347 + }, + { + "epoch": 0.26443768996960487, + "grad_norm": 2.5961992740631104, + "learning_rate": 4.9805483801856744e-06, + "loss": 0.5512958765029907, + "mean_token_accuracy": 0.8181467652320862, + "num_tokens": 3113848.0, + "step": 348 + }, + { + "epoch": 0.2651975683890577, + "grad_norm": 3.2828900814056396, + "learning_rate": 4.980286753286196e-06, + "loss": 0.4217945635318756, + "mean_token_accuracy": 0.8617103099822998, + "num_tokens": 3117652.0, + "step": 349 + }, + { + "epoch": 0.26595744680851063, + "grad_norm": 1.425554871559143, + "learning_rate": 4.980023385625996e-06, + "loss": 0.4042487144470215, + "mean_token_accuracy": 0.8492785692214966, + "num_tokens": 3132336.0, + "step": 350 + }, + { + "epoch": 0.26671732522796354, + "grad_norm": 2.933504104614258, + "learning_rate": 4.979758277389919e-06, + "loss": 0.5406704545021057, + "mean_token_accuracy": 0.8035423755645752, + "num_tokens": 3137544.0, + "step": 351 + }, + { + "epoch": 0.2674772036474164, + "grad_norm": 1.9958966970443726, + "learning_rate": 4.9794914287640264e-06, + "loss": 0.5857555270195007, + "mean_token_accuracy": 0.7965140342712402, + "num_tokens": 3149705.0, + "step": 352 + }, + { + "epoch": 0.2682370820668693, + "grad_norm": 2.467694044113159, + "learning_rate": 4.979222839935602e-06, + "loss": 0.6404043436050415, + "mean_token_accuracy": 0.7823755741119385, + "num_tokens": 3158353.0, + "step": 353 + }, + { + "epoch": 0.2689969604863222, + "grad_norm": 2.0102720260620117, + "learning_rate": 4.9789525110931545e-06, + "loss": 0.5681496858596802, + "mean_token_accuracy": 0.8108169436454773, + "num_tokens": 3167121.0, + "step": 354 + }, + { + "epoch": 0.26975683890577506, + "grad_norm": 2.6017866134643555, + "learning_rate": 4.978680442426409e-06, + "loss": 0.6309828162193298, + "mean_token_accuracy": 0.7742617130279541, + "num_tokens": 3175012.0, + "step": 355 + }, + { + "epoch": 0.270516717325228, + "grad_norm": 1.8799268007278442, + "learning_rate": 4.978406634126315e-06, + "loss": 0.524029016494751, + "mean_token_accuracy": 0.8317689895629883, + "num_tokens": 3185331.0, + "step": 356 + }, + { + "epoch": 0.2712765957446808, + "grad_norm": 1.508332371711731, + "learning_rate": 4.978131086385041e-06, + "loss": 0.46656402945518494, + "mean_token_accuracy": 0.8339117765426636, + "num_tokens": 3198813.0, + "step": 357 + }, + { + "epoch": 0.27203647416413373, + "grad_norm": 3.595707654953003, + "learning_rate": 4.977853799395976e-06, + "loss": 0.5101234912872314, + "mean_token_accuracy": 0.8251723051071167, + "num_tokens": 3206557.0, + "step": 358 + }, + { + "epoch": 0.27279635258358664, + "grad_norm": 3.5317916870117188, + "learning_rate": 4.977574773353732e-06, + "loss": 0.5684665441513062, + "mean_token_accuracy": 0.8124493360519409, + "num_tokens": 3210912.0, + "step": 359 + }, + { + "epoch": 0.2735562310030395, + "grad_norm": 2.8606204986572266, + "learning_rate": 4.97729400845414e-06, + "loss": 0.4746384620666504, + "mean_token_accuracy": 0.8195606470108032, + "num_tokens": 3215365.0, + "step": 360 + }, + { + "epoch": 0.2743161094224924, + "grad_norm": 1.8214033842086792, + "learning_rate": 4.977011504894253e-06, + "loss": 0.4842769503593445, + "mean_token_accuracy": 0.82928866147995, + "num_tokens": 3224037.0, + "step": 361 + }, + { + "epoch": 0.2750759878419453, + "grad_norm": 1.628746509552002, + "learning_rate": 4.97672726287234e-06, + "loss": 0.4397493302822113, + "mean_token_accuracy": 0.8606528043746948, + "num_tokens": 3235589.0, + "step": 362 + }, + { + "epoch": 0.27583586626139817, + "grad_norm": 3.557973861694336, + "learning_rate": 4.976441282587894e-06, + "loss": 0.5732032060623169, + "mean_token_accuracy": 0.8041545748710632, + "num_tokens": 3239958.0, + "step": 363 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 1.3467901945114136, + "learning_rate": 4.9761535642416284e-06, + "loss": 0.4525323510169983, + "mean_token_accuracy": 0.8281061053276062, + "num_tokens": 3257703.0, + "step": 364 + }, + { + "epoch": 0.2773556231003039, + "grad_norm": 2.2649986743927, + "learning_rate": 4.9758641080354745e-06, + "loss": 0.5074734687805176, + "mean_token_accuracy": 0.8447474241256714, + "num_tokens": 3264334.0, + "step": 365 + }, + { + "epoch": 0.27811550151975684, + "grad_norm": 2.8667566776275635, + "learning_rate": 4.975572914172581e-06, + "loss": 0.5759559869766235, + "mean_token_accuracy": 0.7976793050765991, + "num_tokens": 3269314.0, + "step": 366 + }, + { + "epoch": 0.27887537993920974, + "grad_norm": 2.2514986991882324, + "learning_rate": 4.975279982857324e-06, + "loss": 0.5786465406417847, + "mean_token_accuracy": 0.8058781623840332, + "num_tokens": 3277324.0, + "step": 367 + }, + { + "epoch": 0.2796352583586626, + "grad_norm": 1.3826723098754883, + "learning_rate": 4.97498531429529e-06, + "loss": 0.40801727771759033, + "mean_token_accuracy": 0.8601310849189758, + "num_tokens": 3290530.0, + "step": 368 + }, + { + "epoch": 0.2803951367781155, + "grad_norm": 2.084092617034912, + "learning_rate": 4.97468890869329e-06, + "loss": 0.47076648473739624, + "mean_token_accuracy": 0.8310186862945557, + "num_tokens": 3298325.0, + "step": 369 + }, + { + "epoch": 0.2811550151975684, + "grad_norm": 1.3467998504638672, + "learning_rate": 4.974390766259353e-06, + "loss": 0.44668465852737427, + "mean_token_accuracy": 0.8275353908538818, + "num_tokens": 3314302.0, + "step": 370 + }, + { + "epoch": 0.28191489361702127, + "grad_norm": 2.5921075344085693, + "learning_rate": 4.974090887202726e-06, + "loss": 0.5343953967094421, + "mean_token_accuracy": 0.8110706806182861, + "num_tokens": 3320963.0, + "step": 371 + }, + { + "epoch": 0.2826747720364742, + "grad_norm": 2.042781352996826, + "learning_rate": 4.973789271733877e-06, + "loss": 0.6293343305587769, + "mean_token_accuracy": 0.7800243496894836, + "num_tokens": 3332742.0, + "step": 372 + }, + { + "epoch": 0.28343465045592703, + "grad_norm": 4.822193145751953, + "learning_rate": 4.973485920064491e-06, + "loss": 0.6256728768348694, + "mean_token_accuracy": 0.7962433099746704, + "num_tokens": 3335872.0, + "step": 373 + }, + { + "epoch": 0.28419452887537994, + "grad_norm": 1.260988473892212, + "learning_rate": 4.973180832407471e-06, + "loss": 0.38731223344802856, + "mean_token_accuracy": 0.8385066986083984, + "num_tokens": 3351884.0, + "step": 374 + }, + { + "epoch": 0.28495440729483285, + "grad_norm": 2.669966697692871, + "learning_rate": 4.97287400897694e-06, + "loss": 0.5594710111618042, + "mean_token_accuracy": 0.8097212314605713, + "num_tokens": 3358197.0, + "step": 375 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 3.0344486236572266, + "learning_rate": 4.972565449988238e-06, + "loss": 0.34449583292007446, + "mean_token_accuracy": 0.8813316822052002, + "num_tokens": 3362133.0, + "step": 376 + }, + { + "epoch": 0.2864741641337386, + "grad_norm": 2.562251091003418, + "learning_rate": 4.972255155657925e-06, + "loss": 0.5331522822380066, + "mean_token_accuracy": 0.8212941288948059, + "num_tokens": 3370346.0, + "step": 377 + }, + { + "epoch": 0.2872340425531915, + "grad_norm": 2.7083740234375, + "learning_rate": 4.9719431262037755e-06, + "loss": 0.5403046011924744, + "mean_token_accuracy": 0.8108335733413696, + "num_tokens": 3375588.0, + "step": 378 + }, + { + "epoch": 0.28799392097264437, + "grad_norm": 1.396430492401123, + "learning_rate": 4.971629361844785e-06, + "loss": 0.4041529893875122, + "mean_token_accuracy": 0.8588063716888428, + "num_tokens": 3390749.0, + "step": 379 + }, + { + "epoch": 0.2887537993920973, + "grad_norm": 1.9872784614562988, + "learning_rate": 4.971313862801166e-06, + "loss": 0.4336993098258972, + "mean_token_accuracy": 0.8511303663253784, + "num_tokens": 3399064.0, + "step": 380 + }, + { + "epoch": 0.28951367781155013, + "grad_norm": 1.9652575254440308, + "learning_rate": 4.9709966292943455e-06, + "loss": 0.4578358232975006, + "mean_token_accuracy": 0.8229440450668335, + "num_tokens": 3407229.0, + "step": 381 + }, + { + "epoch": 0.29027355623100304, + "grad_norm": 1.6626898050308228, + "learning_rate": 4.970677661546972e-06, + "loss": 0.5427594184875488, + "mean_token_accuracy": 0.815427303314209, + "num_tokens": 3422321.0, + "step": 382 + }, + { + "epoch": 0.29103343465045595, + "grad_norm": 3.5265562534332275, + "learning_rate": 4.970356959782909e-06, + "loss": 0.6661460995674133, + "mean_token_accuracy": 0.7856965065002441, + "num_tokens": 3427442.0, + "step": 383 + }, + { + "epoch": 0.2917933130699088, + "grad_norm": 1.667205572128296, + "learning_rate": 4.970034524227239e-06, + "loss": 0.36256325244903564, + "mean_token_accuracy": 0.8711205720901489, + "num_tokens": 3436662.0, + "step": 384 + }, + { + "epoch": 0.2925531914893617, + "grad_norm": 1.3389486074447632, + "learning_rate": 4.969710355106256e-06, + "loss": 0.4282698631286621, + "mean_token_accuracy": 0.838951587677002, + "num_tokens": 3450060.0, + "step": 385 + }, + { + "epoch": 0.2933130699088146, + "grad_norm": 2.5163397789001465, + "learning_rate": 4.969384452647477e-06, + "loss": 0.5176984071731567, + "mean_token_accuracy": 0.8235267996788025, + "num_tokens": 3456990.0, + "step": 386 + }, + { + "epoch": 0.29407294832826747, + "grad_norm": 1.7588495016098022, + "learning_rate": 4.969056817079633e-06, + "loss": 0.49710947275161743, + "mean_token_accuracy": 0.818520724773407, + "num_tokens": 3468098.0, + "step": 387 + }, + { + "epoch": 0.2948328267477204, + "grad_norm": 2.6381046772003174, + "learning_rate": 4.968727448632669e-06, + "loss": 0.4425308108329773, + "mean_token_accuracy": 0.8451643586158752, + "num_tokens": 3472899.0, + "step": 388 + }, + { + "epoch": 0.29559270516717323, + "grad_norm": 1.6345038414001465, + "learning_rate": 4.968396347537751e-06, + "loss": 0.4177059829235077, + "mean_token_accuracy": 0.8498886227607727, + "num_tokens": 3484826.0, + "step": 389 + }, + { + "epoch": 0.29635258358662614, + "grad_norm": 3.0466468334198, + "learning_rate": 4.968063514027258e-06, + "loss": 0.4274463951587677, + "mean_token_accuracy": 0.8387278318405151, + "num_tokens": 3488610.0, + "step": 390 + }, + { + "epoch": 0.29711246200607905, + "grad_norm": 2.6509406566619873, + "learning_rate": 4.967728948334784e-06, + "loss": 0.5401753783226013, + "mean_token_accuracy": 0.8252490162849426, + "num_tokens": 3493657.0, + "step": 391 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 1.6372219324111938, + "learning_rate": 4.967392650695141e-06, + "loss": 0.3862472176551819, + "mean_token_accuracy": 0.8555525541305542, + "num_tokens": 3505588.0, + "step": 392 + }, + { + "epoch": 0.2986322188449848, + "grad_norm": 2.1615452766418457, + "learning_rate": 4.967054621344356e-06, + "loss": 0.57850581407547, + "mean_token_accuracy": 0.8222678899765015, + "num_tokens": 3514396.0, + "step": 393 + }, + { + "epoch": 0.2993920972644377, + "grad_norm": 1.8610916137695312, + "learning_rate": 4.96671486051967e-06, + "loss": 0.5440595149993896, + "mean_token_accuracy": 0.8196715116500854, + "num_tokens": 3523604.0, + "step": 394 + }, + { + "epoch": 0.30015197568389057, + "grad_norm": 2.9585862159729004, + "learning_rate": 4.966373368459542e-06, + "loss": 0.6921588182449341, + "mean_token_accuracy": 0.7816659808158875, + "num_tokens": 3529849.0, + "step": 395 + }, + { + "epoch": 0.3009118541033435, + "grad_norm": 1.9374035596847534, + "learning_rate": 4.966030145403642e-06, + "loss": 0.5494055151939392, + "mean_token_accuracy": 0.8126792907714844, + "num_tokens": 3539529.0, + "step": 396 + }, + { + "epoch": 0.30167173252279633, + "grad_norm": 1.730530023574829, + "learning_rate": 4.965685191592859e-06, + "loss": 0.4271572232246399, + "mean_token_accuracy": 0.8383668661117554, + "num_tokens": 3550972.0, + "step": 397 + }, + { + "epoch": 0.30243161094224924, + "grad_norm": 3.9635560512542725, + "learning_rate": 4.9653385072692935e-06, + "loss": 0.5576210021972656, + "mean_token_accuracy": 0.799404501914978, + "num_tokens": 3554147.0, + "step": 398 + }, + { + "epoch": 0.30319148936170215, + "grad_norm": 2.5731968879699707, + "learning_rate": 4.964990092676263e-06, + "loss": 0.5478942394256592, + "mean_token_accuracy": 0.8220961093902588, + "num_tokens": 3559972.0, + "step": 399 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 2.2096588611602783, + "learning_rate": 4.964639948058297e-06, + "loss": 0.35461270809173584, + "mean_token_accuracy": 0.8640927076339722, + "num_tokens": 3565770.0, + "step": 400 + }, + { + "epoch": 0.3047112462006079, + "grad_norm": 1.7874189615249634, + "learning_rate": 4.964288073661142e-06, + "loss": 0.38849619030952454, + "mean_token_accuracy": 0.8443037271499634, + "num_tokens": 3574514.0, + "step": 401 + }, + { + "epoch": 0.30547112462006076, + "grad_norm": 1.5583146810531616, + "learning_rate": 4.963934469731756e-06, + "loss": 0.48909449577331543, + "mean_token_accuracy": 0.8429768681526184, + "num_tokens": 3585877.0, + "step": 402 + }, + { + "epoch": 0.30623100303951367, + "grad_norm": 3.026599645614624, + "learning_rate": 4.963579136518312e-06, + "loss": 0.5138992071151733, + "mean_token_accuracy": 0.8283728361129761, + "num_tokens": 3590412.0, + "step": 403 + }, + { + "epoch": 0.3069908814589666, + "grad_norm": 2.777505874633789, + "learning_rate": 4.963222074270197e-06, + "loss": 0.6241534948348999, + "mean_token_accuracy": 0.8130464553833008, + "num_tokens": 3596246.0, + "step": 404 + }, + { + "epoch": 0.30775075987841943, + "grad_norm": 2.4772839546203613, + "learning_rate": 4.962863283238011e-06, + "loss": 0.5930814146995544, + "mean_token_accuracy": 0.8036394715309143, + "num_tokens": 3602878.0, + "step": 405 + }, + { + "epoch": 0.30851063829787234, + "grad_norm": 1.5049982070922852, + "learning_rate": 4.962502763673566e-06, + "loss": 0.4903082549571991, + "mean_token_accuracy": 0.8184912204742432, + "num_tokens": 3617018.0, + "step": 406 + }, + { + "epoch": 0.30927051671732525, + "grad_norm": 2.453155040740967, + "learning_rate": 4.96214051582989e-06, + "loss": 0.5138067603111267, + "mean_token_accuracy": 0.8336835503578186, + "num_tokens": 3624188.0, + "step": 407 + }, + { + "epoch": 0.3100303951367781, + "grad_norm": 2.4038336277008057, + "learning_rate": 4.961776539961222e-06, + "loss": 0.5752760171890259, + "mean_token_accuracy": 0.8054730892181396, + "num_tokens": 3634152.0, + "step": 408 + }, + { + "epoch": 0.310790273556231, + "grad_norm": 2.629068374633789, + "learning_rate": 4.961410836323014e-06, + "loss": 0.5580606460571289, + "mean_token_accuracy": 0.8121089935302734, + "num_tokens": 3639528.0, + "step": 409 + }, + { + "epoch": 0.31155015197568386, + "grad_norm": 1.4245928525924683, + "learning_rate": 4.961043405171931e-06, + "loss": 0.5399882793426514, + "mean_token_accuracy": 0.812280535697937, + "num_tokens": 3655744.0, + "step": 410 + }, + { + "epoch": 0.3123100303951368, + "grad_norm": 1.5236459970474243, + "learning_rate": 4.9606742467658505e-06, + "loss": 0.5234690308570862, + "mean_token_accuracy": 0.8188928365707397, + "num_tokens": 3675010.0, + "step": 411 + }, + { + "epoch": 0.3130699088145897, + "grad_norm": 2.27961802482605, + "learning_rate": 4.960303361363863e-06, + "loss": 0.5502505898475647, + "mean_token_accuracy": 0.8161963224411011, + "num_tokens": 3682328.0, + "step": 412 + }, + { + "epoch": 0.31382978723404253, + "grad_norm": 1.554518222808838, + "learning_rate": 4.959930749226269e-06, + "loss": 0.420867919921875, + "mean_token_accuracy": 0.8499157428741455, + "num_tokens": 3694980.0, + "step": 413 + }, + { + "epoch": 0.31458966565349544, + "grad_norm": 2.609218120574951, + "learning_rate": 4.9595564106145825e-06, + "loss": 0.4706704318523407, + "mean_token_accuracy": 0.8412490487098694, + "num_tokens": 3700033.0, + "step": 414 + }, + { + "epoch": 0.31534954407294835, + "grad_norm": 1.5303231477737427, + "learning_rate": 4.959180345791528e-06, + "loss": 0.4668654799461365, + "mean_token_accuracy": 0.8125015497207642, + "num_tokens": 3715012.0, + "step": 415 + }, + { + "epoch": 0.3161094224924012, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.958802555021042e-06, + "loss": 0.4339369237422943, + "mean_token_accuracy": 0.8442851901054382, + "num_tokens": 3733928.0, + "step": 416 + }, + { + "epoch": 0.3168693009118541, + "grad_norm": 2.1240181922912598, + "learning_rate": 4.958423038568274e-06, + "loss": 0.4029104709625244, + "mean_token_accuracy": 0.8627674579620361, + "num_tokens": 3740202.0, + "step": 417 + }, + { + "epoch": 0.31762917933130697, + "grad_norm": 2.00538969039917, + "learning_rate": 4.958041796699583e-06, + "loss": 0.5229607820510864, + "mean_token_accuracy": 0.8282366394996643, + "num_tokens": 3749308.0, + "step": 418 + }, + { + "epoch": 0.3183890577507599, + "grad_norm": 2.6555092334747314, + "learning_rate": 4.957658829682539e-06, + "loss": 0.5344101190567017, + "mean_token_accuracy": 0.8183202743530273, + "num_tokens": 3754595.0, + "step": 419 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 1.7468839883804321, + "learning_rate": 4.9572741377859225e-06, + "loss": 0.5667245984077454, + "mean_token_accuracy": 0.8080123662948608, + "num_tokens": 3765761.0, + "step": 420 + }, + { + "epoch": 0.31990881458966564, + "grad_norm": 2.9612457752227783, + "learning_rate": 4.956887721279726e-06, + "loss": 0.5389559864997864, + "mean_token_accuracy": 0.8019476532936096, + "num_tokens": 3770844.0, + "step": 421 + }, + { + "epoch": 0.32066869300911854, + "grad_norm": 1.842403769493103, + "learning_rate": 4.95649958043515e-06, + "loss": 0.38279837369918823, + "mean_token_accuracy": 0.858866810798645, + "num_tokens": 3778094.0, + "step": 422 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 2.3108131885528564, + "learning_rate": 4.956109715524609e-06, + "loss": 0.5453893542289734, + "mean_token_accuracy": 0.8085013031959534, + "num_tokens": 3785015.0, + "step": 423 + }, + { + "epoch": 0.3221884498480243, + "grad_norm": 3.0326945781707764, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.5550523400306702, + "mean_token_accuracy": 0.8125876188278198, + "num_tokens": 3789830.0, + "step": 424 + }, + { + "epoch": 0.3229483282674772, + "grad_norm": 1.8851977586746216, + "learning_rate": 4.955324814601324e-06, + "loss": 0.4902324974536896, + "mean_token_accuracy": 0.8205406665802002, + "num_tokens": 3799862.0, + "step": 425 + }, + { + "epoch": 0.32370820668693007, + "grad_norm": 2.6018171310424805, + "learning_rate": 4.954929779139455e-06, + "loss": 0.5920133590698242, + "mean_token_accuracy": 0.8340690732002258, + "num_tokens": 3806617.0, + "step": 426 + }, + { + "epoch": 0.324468085106383, + "grad_norm": 2.4283878803253174, + "learning_rate": 4.954533020713367e-06, + "loss": 0.5305854082107544, + "mean_token_accuracy": 0.8137468099594116, + "num_tokens": 3813843.0, + "step": 427 + }, + { + "epoch": 0.3252279635258359, + "grad_norm": 2.667978525161743, + "learning_rate": 4.954134539601519e-06, + "loss": 0.5333638787269592, + "mean_token_accuracy": 0.8402629494667053, + "num_tokens": 3819450.0, + "step": 428 + }, + { + "epoch": 0.32598784194528874, + "grad_norm": 1.7302523851394653, + "learning_rate": 4.953734336083582e-06, + "loss": 0.422895610332489, + "mean_token_accuracy": 0.8709704875946045, + "num_tokens": 3831027.0, + "step": 429 + }, + { + "epoch": 0.32674772036474165, + "grad_norm": 2.427192211151123, + "learning_rate": 4.953332410440434e-06, + "loss": 0.6334598064422607, + "mean_token_accuracy": 0.7817479968070984, + "num_tokens": 3841776.0, + "step": 430 + }, + { + "epoch": 0.32750759878419455, + "grad_norm": 1.460949182510376, + "learning_rate": 4.952928762954161e-06, + "loss": 0.3654777705669403, + "mean_token_accuracy": 0.8780122995376587, + "num_tokens": 3852213.0, + "step": 431 + }, + { + "epoch": 0.3282674772036474, + "grad_norm": 1.9855005741119385, + "learning_rate": 4.952523393908059e-06, + "loss": 0.5117089748382568, + "mean_token_accuracy": 0.811911404132843, + "num_tokens": 3861176.0, + "step": 432 + }, + { + "epoch": 0.3290273556231003, + "grad_norm": 2.2653207778930664, + "learning_rate": 4.952116303586631e-06, + "loss": 0.42514950037002563, + "mean_token_accuracy": 0.8448518514633179, + "num_tokens": 3867164.0, + "step": 433 + }, + { + "epoch": 0.32978723404255317, + "grad_norm": 1.9780964851379395, + "learning_rate": 4.951707492275589e-06, + "loss": 0.5095293521881104, + "mean_token_accuracy": 0.8262748718261719, + "num_tokens": 3876406.0, + "step": 434 + }, + { + "epoch": 0.3305471124620061, + "grad_norm": 2.9480233192443848, + "learning_rate": 4.951296960261853e-06, + "loss": 0.3494448959827423, + "mean_token_accuracy": 0.8781307935714722, + "num_tokens": 3880298.0, + "step": 435 + }, + { + "epoch": 0.331306990881459, + "grad_norm": 2.335571527481079, + "learning_rate": 4.95088470783355e-06, + "loss": 0.5456914901733398, + "mean_token_accuracy": 0.816297173500061, + "num_tokens": 3886487.0, + "step": 436 + }, + { + "epoch": 0.33206686930091184, + "grad_norm": 2.3046419620513916, + "learning_rate": 4.950470735280013e-06, + "loss": 0.4835948944091797, + "mean_token_accuracy": 0.8539175391197205, + "num_tokens": 3892706.0, + "step": 437 + }, + { + "epoch": 0.33282674772036475, + "grad_norm": 2.44047474861145, + "learning_rate": 4.950055042891786e-06, + "loss": 0.5154092907905579, + "mean_token_accuracy": 0.8579919338226318, + "num_tokens": 3899532.0, + "step": 438 + }, + { + "epoch": 0.33358662613981765, + "grad_norm": 4.826764106750488, + "learning_rate": 4.949637630960618e-06, + "loss": 0.5270259976387024, + "mean_token_accuracy": 0.8172192573547363, + "num_tokens": 3902260.0, + "step": 439 + }, + { + "epoch": 0.3343465045592705, + "grad_norm": 2.001574754714966, + "learning_rate": 4.949218499779462e-06, + "loss": 0.5413002967834473, + "mean_token_accuracy": 0.8162837028503418, + "num_tokens": 3911706.0, + "step": 440 + }, + { + "epoch": 0.3351063829787234, + "grad_norm": 1.7998944520950317, + "learning_rate": 4.948797649642484e-06, + "loss": 0.5131614208221436, + "mean_token_accuracy": 0.8367440700531006, + "num_tokens": 3923490.0, + "step": 441 + }, + { + "epoch": 0.33586626139817627, + "grad_norm": 3.4566173553466797, + "learning_rate": 4.94837508084505e-06, + "loss": 0.7258909940719604, + "mean_token_accuracy": 0.771377444267273, + "num_tokens": 3928099.0, + "step": 442 + }, + { + "epoch": 0.3366261398176292, + "grad_norm": 2.0040442943573, + "learning_rate": 4.9479507936837364e-06, + "loss": 0.482135534286499, + "mean_token_accuracy": 0.8339327573776245, + "num_tokens": 3937328.0, + "step": 443 + }, + { + "epoch": 0.3373860182370821, + "grad_norm": 2.949502944946289, + "learning_rate": 4.947524788456325e-06, + "loss": 0.6474795341491699, + "mean_token_accuracy": 0.7951677441596985, + "num_tokens": 3942529.0, + "step": 444 + }, + { + "epoch": 0.33814589665653494, + "grad_norm": 1.5528364181518555, + "learning_rate": 4.947097065461801e-06, + "loss": 0.48791584372520447, + "mean_token_accuracy": 0.8425545692443848, + "num_tokens": 3955200.0, + "step": 445 + }, + { + "epoch": 0.33890577507598785, + "grad_norm": 1.8813284635543823, + "learning_rate": 4.946667625000358e-06, + "loss": 0.45922309160232544, + "mean_token_accuracy": 0.8206527233123779, + "num_tokens": 3962975.0, + "step": 446 + }, + { + "epoch": 0.33966565349544076, + "grad_norm": 1.7157847881317139, + "learning_rate": 4.946236467373392e-06, + "loss": 0.5454182028770447, + "mean_token_accuracy": 0.8049604892730713, + "num_tokens": 3973956.0, + "step": 447 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 2.008857250213623, + "learning_rate": 4.945803592883509e-06, + "loss": 0.5151860117912292, + "mean_token_accuracy": 0.8262045383453369, + "num_tokens": 3982853.0, + "step": 448 + }, + { + "epoch": 0.3411854103343465, + "grad_norm": 1.6632496118545532, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.42710691690444946, + "mean_token_accuracy": 0.8521314859390259, + "num_tokens": 3993838.0, + "step": 449 + }, + { + "epoch": 0.34194528875379937, + "grad_norm": 1.365234375, + "learning_rate": 4.944932694531423e-06, + "loss": 0.5172526836395264, + "mean_token_accuracy": 0.8277045488357544, + "num_tokens": 4014179.0, + "step": 450 + }, + { + "epoch": 0.3427051671732523, + "grad_norm": 1.7610243558883667, + "learning_rate": 4.94449467128045e-06, + "loss": 0.42104798555374146, + "mean_token_accuracy": 0.8552065491676331, + "num_tokens": 4023663.0, + "step": 451 + }, + { + "epoch": 0.3434650455927052, + "grad_norm": 2.3732354640960693, + "learning_rate": 4.944054932389018e-06, + "loss": 0.5471175909042358, + "mean_token_accuracy": 0.8487317562103271, + "num_tokens": 4030100.0, + "step": 452 + }, + { + "epoch": 0.34422492401215804, + "grad_norm": 1.5973623991012573, + "learning_rate": 4.943613478165753e-06, + "loss": 0.419813871383667, + "mean_token_accuracy": 0.8484025001525879, + "num_tokens": 4041124.0, + "step": 453 + }, + { + "epoch": 0.34498480243161095, + "grad_norm": 2.966381549835205, + "learning_rate": 4.943170308920484e-06, + "loss": 0.5370652675628662, + "mean_token_accuracy": 0.8439491987228394, + "num_tokens": 4045675.0, + "step": 454 + }, + { + "epoch": 0.34574468085106386, + "grad_norm": 2.5097248554229736, + "learning_rate": 4.9427254249642445e-06, + "loss": 0.5776349306106567, + "mean_token_accuracy": 0.8060523867607117, + "num_tokens": 4053250.0, + "step": 455 + }, + { + "epoch": 0.3465045592705167, + "grad_norm": 1.6779125928878784, + "learning_rate": 4.942278826609272e-06, + "loss": 0.5245476961135864, + "mean_token_accuracy": 0.8168526887893677, + "num_tokens": 4064106.0, + "step": 456 + }, + { + "epoch": 0.3472644376899696, + "grad_norm": 1.5945546627044678, + "learning_rate": 4.9418305141690045e-06, + "loss": 0.4972047209739685, + "mean_token_accuracy": 0.8257735967636108, + "num_tokens": 4077687.0, + "step": 457 + }, + { + "epoch": 0.34802431610942247, + "grad_norm": 2.864778757095337, + "learning_rate": 4.9413804879580865e-06, + "loss": 0.5372499823570251, + "mean_token_accuracy": 0.8423776626586914, + "num_tokens": 4082632.0, + "step": 458 + }, + { + "epoch": 0.3487841945288754, + "grad_norm": 1.4797078371047974, + "learning_rate": 4.940928748292363e-06, + "loss": 0.5903409719467163, + "mean_token_accuracy": 0.8061295747756958, + "num_tokens": 4104218.0, + "step": 459 + }, + { + "epoch": 0.3495440729483283, + "grad_norm": 2.4376983642578125, + "learning_rate": 4.940475295488882e-06, + "loss": 0.4534894824028015, + "mean_token_accuracy": 0.8395825028419495, + "num_tokens": 4110530.0, + "step": 460 + }, + { + "epoch": 0.35030395136778114, + "grad_norm": 1.2955626249313354, + "learning_rate": 4.940020129865895e-06, + "loss": 0.47155818343162537, + "mean_token_accuracy": 0.8253582715988159, + "num_tokens": 4128398.0, + "step": 461 + }, + { + "epoch": 0.35106382978723405, + "grad_norm": 2.066575527191162, + "learning_rate": 4.9395632517428546e-06, + "loss": 0.5555641651153564, + "mean_token_accuracy": 0.814624547958374, + "num_tokens": 4137623.0, + "step": 462 + }, + { + "epoch": 0.3518237082066869, + "grad_norm": 1.6407525539398193, + "learning_rate": 4.939104661440415e-06, + "loss": 0.4361790418624878, + "mean_token_accuracy": 0.8544459342956543, + "num_tokens": 4152803.0, + "step": 463 + }, + { + "epoch": 0.3525835866261398, + "grad_norm": 2.1685116291046143, + "learning_rate": 4.938644359280433e-06, + "loss": 0.5347012877464294, + "mean_token_accuracy": 0.853853702545166, + "num_tokens": 4160778.0, + "step": 464 + }, + { + "epoch": 0.3533434650455927, + "grad_norm": 1.8824869394302368, + "learning_rate": 4.938182345585967e-06, + "loss": 0.5512481927871704, + "mean_token_accuracy": 0.7985891699790955, + "num_tokens": 4170380.0, + "step": 465 + }, + { + "epoch": 0.3541033434650456, + "grad_norm": 2.2229504585266113, + "learning_rate": 4.937718620681273e-06, + "loss": 0.516828179359436, + "mean_token_accuracy": 0.8265621066093445, + "num_tokens": 4178179.0, + "step": 466 + }, + { + "epoch": 0.3548632218844985, + "grad_norm": 1.955990195274353, + "learning_rate": 4.9372531848918145e-06, + "loss": 0.5586158037185669, + "mean_token_accuracy": 0.8367916345596313, + "num_tokens": 4188626.0, + "step": 467 + }, + { + "epoch": 0.3556231003039514, + "grad_norm": 1.9687023162841797, + "learning_rate": 4.936786038544251e-06, + "loss": 0.5517531633377075, + "mean_token_accuracy": 0.8134098052978516, + "num_tokens": 4198144.0, + "step": 468 + }, + { + "epoch": 0.35638297872340424, + "grad_norm": 1.405516505241394, + "learning_rate": 4.9363171819664434e-06, + "loss": 0.5305492877960205, + "mean_token_accuracy": 0.8014427423477173, + "num_tokens": 4222818.0, + "step": 469 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 2.6355695724487305, + "learning_rate": 4.9358466154874535e-06, + "loss": 0.5303391218185425, + "mean_token_accuracy": 0.8028861284255981, + "num_tokens": 4228318.0, + "step": 470 + }, + { + "epoch": 0.35790273556231, + "grad_norm": 1.5133824348449707, + "learning_rate": 4.935374339437543e-06, + "loss": 0.5329189300537109, + "mean_token_accuracy": 0.8479441404342651, + "num_tokens": 4244527.0, + "step": 471 + }, + { + "epoch": 0.3586626139817629, + "grad_norm": 3.4356725215911865, + "learning_rate": 4.934900354148173e-06, + "loss": 0.5431582927703857, + "mean_token_accuracy": 0.8328983783721924, + "num_tokens": 4248034.0, + "step": 472 + }, + { + "epoch": 0.3594224924012158, + "grad_norm": 2.5789499282836914, + "learning_rate": 4.934424659952006e-06, + "loss": 0.4141455292701721, + "mean_token_accuracy": 0.8658635020256042, + "num_tokens": 4252953.0, + "step": 473 + }, + { + "epoch": 0.3601823708206687, + "grad_norm": 1.145262598991394, + "learning_rate": 4.933947257182901e-06, + "loss": 0.40294092893600464, + "mean_token_accuracy": 0.8565847277641296, + "num_tokens": 4277813.0, + "step": 474 + }, + { + "epoch": 0.3609422492401216, + "grad_norm": 1.7242133617401123, + "learning_rate": 4.933468146175918e-06, + "loss": 0.6036738753318787, + "mean_token_accuracy": 0.8072597980499268, + "num_tokens": 4291088.0, + "step": 475 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 2.3490941524505615, + "learning_rate": 4.932987327267317e-06, + "loss": 0.49456146359443665, + "mean_token_accuracy": 0.8372673988342285, + "num_tokens": 4297376.0, + "step": 476 + }, + { + "epoch": 0.36246200607902734, + "grad_norm": 1.3605526685714722, + "learning_rate": 4.932504800794553e-06, + "loss": 0.43595948815345764, + "mean_token_accuracy": 0.8415953516960144, + "num_tokens": 4312054.0, + "step": 477 + }, + { + "epoch": 0.36322188449848025, + "grad_norm": 1.4525885581970215, + "learning_rate": 4.9320205670962815e-06, + "loss": 0.5390371680259705, + "mean_token_accuracy": 0.8101649284362793, + "num_tokens": 4328701.0, + "step": 478 + }, + { + "epoch": 0.3639817629179331, + "grad_norm": 1.9862419366836548, + "learning_rate": 4.931534626512359e-06, + "loss": 0.45436930656433105, + "mean_token_accuracy": 0.8352861404418945, + "num_tokens": 4338372.0, + "step": 479 + }, + { + "epoch": 0.364741641337386, + "grad_norm": 1.7804961204528809, + "learning_rate": 4.931046979383836e-06, + "loss": 0.4677754044532776, + "mean_token_accuracy": 0.840467095375061, + "num_tokens": 4347897.0, + "step": 480 + }, + { + "epoch": 0.3655015197568389, + "grad_norm": 2.066632032394409, + "learning_rate": 4.930557626052961e-06, + "loss": 0.42418140172958374, + "mean_token_accuracy": 0.8528275489807129, + "num_tokens": 4354061.0, + "step": 481 + }, + { + "epoch": 0.3662613981762918, + "grad_norm": 1.6155282258987427, + "learning_rate": 4.930066566863182e-06, + "loss": 0.5424284934997559, + "mean_token_accuracy": 0.825040876865387, + "num_tokens": 4370400.0, + "step": 482 + }, + { + "epoch": 0.3670212765957447, + "grad_norm": 2.1452953815460205, + "learning_rate": 4.929573802159143e-06, + "loss": 0.5105804204940796, + "mean_token_accuracy": 0.8284053802490234, + "num_tokens": 4377579.0, + "step": 483 + }, + { + "epoch": 0.3677811550151976, + "grad_norm": 1.8940945863723755, + "learning_rate": 4.929079332286685e-06, + "loss": 0.43478304147720337, + "mean_token_accuracy": 0.8505665063858032, + "num_tokens": 4385686.0, + "step": 484 + }, + { + "epoch": 0.36854103343465044, + "grad_norm": 1.6785860061645508, + "learning_rate": 4.928583157592846e-06, + "loss": 0.40227848291397095, + "mean_token_accuracy": 0.8623573780059814, + "num_tokens": 4396128.0, + "step": 485 + }, + { + "epoch": 0.36930091185410335, + "grad_norm": 1.6416733264923096, + "learning_rate": 4.928085278425862e-06, + "loss": 0.526267409324646, + "mean_token_accuracy": 0.8284667730331421, + "num_tokens": 4407963.0, + "step": 486 + }, + { + "epoch": 0.3700607902735562, + "grad_norm": 1.8882389068603516, + "learning_rate": 4.927585695135162e-06, + "loss": 0.5555213093757629, + "mean_token_accuracy": 0.8115293979644775, + "num_tokens": 4418057.0, + "step": 487 + }, + { + "epoch": 0.3708206686930091, + "grad_norm": 2.300248384475708, + "learning_rate": 4.9270844080713735e-06, + "loss": 0.5812339186668396, + "mean_token_accuracy": 0.800270676612854, + "num_tokens": 4425358.0, + "step": 488 + }, + { + "epoch": 0.371580547112462, + "grad_norm": 1.6802922487258911, + "learning_rate": 4.926581417586319e-06, + "loss": 0.5134941935539246, + "mean_token_accuracy": 0.8247408866882324, + "num_tokens": 4437702.0, + "step": 489 + }, + { + "epoch": 0.3723404255319149, + "grad_norm": 1.7620291709899902, + "learning_rate": 4.926076724033016e-06, + "loss": 0.5233973264694214, + "mean_token_accuracy": 0.8102161884307861, + "num_tokens": 4448584.0, + "step": 490 + }, + { + "epoch": 0.3731003039513678, + "grad_norm": 1.6911998987197876, + "learning_rate": 4.925570327765678e-06, + "loss": 0.5337274074554443, + "mean_token_accuracy": 0.845306396484375, + "num_tokens": 4462651.0, + "step": 491 + }, + { + "epoch": 0.3738601823708207, + "grad_norm": 1.7991242408752441, + "learning_rate": 4.9250622291397144e-06, + "loss": 0.31018948554992676, + "mean_token_accuracy": 0.8857606053352356, + "num_tokens": 4469971.0, + "step": 492 + }, + { + "epoch": 0.37462006079027355, + "grad_norm": 4.9776835441589355, + "learning_rate": 4.924552428511727e-06, + "loss": 0.44114983081817627, + "mean_token_accuracy": 0.8429906368255615, + "num_tokens": 4478275.0, + "step": 493 + }, + { + "epoch": 0.37537993920972645, + "grad_norm": 1.8007272481918335, + "learning_rate": 4.924040926239515e-06, + "loss": 0.574328601360321, + "mean_token_accuracy": 0.7669196128845215, + "num_tokens": 4491551.0, + "step": 494 + }, + { + "epoch": 0.3761398176291793, + "grad_norm": 2.021300792694092, + "learning_rate": 4.92352772268207e-06, + "loss": 0.45636120438575745, + "mean_token_accuracy": 0.840438723564148, + "num_tokens": 4498658.0, + "step": 495 + }, + { + "epoch": 0.3768996960486322, + "grad_norm": 2.369748592376709, + "learning_rate": 4.923012818199576e-06, + "loss": 0.5206376910209656, + "mean_token_accuracy": 0.8521823287010193, + "num_tokens": 4504648.0, + "step": 496 + }, + { + "epoch": 0.3776595744680851, + "grad_norm": 2.733485221862793, + "learning_rate": 4.922496213153416e-06, + "loss": 0.5067723989486694, + "mean_token_accuracy": 0.8168281316757202, + "num_tokens": 4509990.0, + "step": 497 + }, + { + "epoch": 0.378419452887538, + "grad_norm": 2.3751676082611084, + "learning_rate": 4.921977907906161e-06, + "loss": 0.49757206439971924, + "mean_token_accuracy": 0.8325017690658569, + "num_tokens": 4518373.0, + "step": 498 + }, + { + "epoch": 0.3791793313069909, + "grad_norm": 2.1672775745391846, + "learning_rate": 4.921457902821578e-06, + "loss": 0.4237566590309143, + "mean_token_accuracy": 0.8404698371887207, + "num_tokens": 4524338.0, + "step": 499 + }, + { + "epoch": 0.3799392097264438, + "grad_norm": 1.8374360799789429, + "learning_rate": 4.9209361982646275e-06, + "loss": 0.4995468854904175, + "mean_token_accuracy": 0.8299649953842163, + "num_tokens": 4533396.0, + "step": 500 + }, + { + "epoch": 0.38069908814589665, + "grad_norm": 2.083967924118042, + "learning_rate": 4.920412794601461e-06, + "loss": 0.489935040473938, + "mean_token_accuracy": 0.8315291404724121, + "num_tokens": 4540941.0, + "step": 501 + }, + { + "epoch": 0.38145896656534956, + "grad_norm": 2.2075610160827637, + "learning_rate": 4.919887692199423e-06, + "loss": 0.5233147740364075, + "mean_token_accuracy": 0.804171085357666, + "num_tokens": 4548215.0, + "step": 502 + }, + { + "epoch": 0.3822188449848024, + "grad_norm": 2.076775312423706, + "learning_rate": 4.9193608914270515e-06, + "loss": 0.5785550475120544, + "mean_token_accuracy": 0.7993186116218567, + "num_tokens": 4558204.0, + "step": 503 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.238546133041382, + "learning_rate": 4.918832392654075e-06, + "loss": 0.5287384390830994, + "mean_token_accuracy": 0.8214945793151855, + "num_tokens": 4565407.0, + "step": 504 + }, + { + "epoch": 0.3837386018237082, + "grad_norm": 1.6783074140548706, + "learning_rate": 4.9183021962514145e-06, + "loss": 0.6063359379768372, + "mean_token_accuracy": 0.7914625406265259, + "num_tokens": 4580991.0, + "step": 505 + }, + { + "epoch": 0.3844984802431611, + "grad_norm": 1.6287449598312378, + "learning_rate": 4.917770302591183e-06, + "loss": 0.3598247766494751, + "mean_token_accuracy": 0.8706809878349304, + "num_tokens": 4590579.0, + "step": 506 + }, + { + "epoch": 0.385258358662614, + "grad_norm": 1.5432041883468628, + "learning_rate": 4.917236712046682e-06, + "loss": 0.5267890095710754, + "mean_token_accuracy": 0.8032117486000061, + "num_tokens": 4608380.0, + "step": 507 + }, + { + "epoch": 0.3860182370820669, + "grad_norm": 1.7664037942886353, + "learning_rate": 4.9167014249924075e-06, + "loss": 0.3552354574203491, + "mean_token_accuracy": 0.8569793701171875, + "num_tokens": 4616426.0, + "step": 508 + }, + { + "epoch": 0.38677811550151975, + "grad_norm": 2.1147472858428955, + "learning_rate": 4.916164441804044e-06, + "loss": 0.5212404727935791, + "mean_token_accuracy": 0.8196578025817871, + "num_tokens": 4623908.0, + "step": 509 + }, + { + "epoch": 0.38753799392097266, + "grad_norm": 2.1092333793640137, + "learning_rate": 4.915625762858467e-06, + "loss": 0.5197038650512695, + "mean_token_accuracy": 0.8245604634284973, + "num_tokens": 4630956.0, + "step": 510 + }, + { + "epoch": 0.3882978723404255, + "grad_norm": 1.23331880569458, + "learning_rate": 4.915085388533743e-06, + "loss": 0.4759839177131653, + "mean_token_accuracy": 0.8192248344421387, + "num_tokens": 4651269.0, + "step": 511 + }, + { + "epoch": 0.3890577507598784, + "grad_norm": 2.424199104309082, + "learning_rate": 4.914543319209126e-06, + "loss": 0.5576270818710327, + "mean_token_accuracy": 0.8203302621841431, + "num_tokens": 4657296.0, + "step": 512 + }, + { + "epoch": 0.3898176291793313, + "grad_norm": 2.725156307220459, + "learning_rate": 4.913999555265062e-06, + "loss": 0.4337949752807617, + "mean_token_accuracy": 0.8382406234741211, + "num_tokens": 4661850.0, + "step": 513 + }, + { + "epoch": 0.3905775075987842, + "grad_norm": 2.3120534420013428, + "learning_rate": 4.913454097083185e-06, + "loss": 0.4941597580909729, + "mean_token_accuracy": 0.8302834033966064, + "num_tokens": 4667769.0, + "step": 514 + }, + { + "epoch": 0.3913373860182371, + "grad_norm": 2.3111207485198975, + "learning_rate": 4.912906945046319e-06, + "loss": 0.5253715515136719, + "mean_token_accuracy": 0.84515380859375, + "num_tokens": 4674537.0, + "step": 515 + }, + { + "epoch": 0.39209726443769, + "grad_norm": 1.4117841720581055, + "learning_rate": 4.912358099538476e-06, + "loss": 0.4521017074584961, + "mean_token_accuracy": 0.8208256959915161, + "num_tokens": 4690605.0, + "step": 516 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 2.3742799758911133, + "learning_rate": 4.911807560944858e-06, + "loss": 0.41572901606559753, + "mean_token_accuracy": 0.8550551533699036, + "num_tokens": 4706437.0, + "step": 517 + }, + { + "epoch": 0.39361702127659576, + "grad_norm": 2.4052202701568604, + "learning_rate": 4.911255329651852e-06, + "loss": 0.6003736257553101, + "mean_token_accuracy": 0.8247885704040527, + "num_tokens": 4712746.0, + "step": 518 + }, + { + "epoch": 0.3943768996960486, + "grad_norm": 1.9335490465164185, + "learning_rate": 4.910701406047037e-06, + "loss": 0.5457713603973389, + "mean_token_accuracy": 0.787429690361023, + "num_tokens": 4731937.0, + "step": 519 + }, + { + "epoch": 0.3951367781155015, + "grad_norm": 2.257706880569458, + "learning_rate": 4.910145790519177e-06, + "loss": 0.5300652980804443, + "mean_token_accuracy": 0.8192912936210632, + "num_tokens": 4739422.0, + "step": 520 + }, + { + "epoch": 0.3958966565349544, + "grad_norm": 1.2099462747573853, + "learning_rate": 4.9095884834582256e-06, + "loss": 0.45872747898101807, + "mean_token_accuracy": 0.8362667560577393, + "num_tokens": 4757113.0, + "step": 521 + }, + { + "epoch": 0.3966565349544073, + "grad_norm": 2.7991135120391846, + "learning_rate": 4.909029485255321e-06, + "loss": 0.49039560556411743, + "mean_token_accuracy": 0.8260016441345215, + "num_tokens": 4761709.0, + "step": 522 + }, + { + "epoch": 0.3974164133738602, + "grad_norm": 2.2360129356384277, + "learning_rate": 4.90846879630279e-06, + "loss": 0.49556830525398254, + "mean_token_accuracy": 0.827864408493042, + "num_tokens": 4769048.0, + "step": 523 + }, + { + "epoch": 0.3981762917933131, + "grad_norm": 2.5953688621520996, + "learning_rate": 4.907906416994146e-06, + "loss": 0.387208491563797, + "mean_token_accuracy": 0.8467001914978027, + "num_tokens": 4774637.0, + "step": 524 + }, + { + "epoch": 0.39893617021276595, + "grad_norm": 2.1046814918518066, + "learning_rate": 4.907342347724088e-06, + "loss": 0.5477259755134583, + "mean_token_accuracy": 0.8060322999954224, + "num_tokens": 4782774.0, + "step": 525 + }, + { + "epoch": 0.39969604863221886, + "grad_norm": 2.5622646808624268, + "learning_rate": 4.906776588888502e-06, + "loss": 0.5684159398078918, + "mean_token_accuracy": 0.8095303177833557, + "num_tokens": 4788766.0, + "step": 526 + }, + { + "epoch": 0.4004559270516717, + "grad_norm": 1.9027913808822632, + "learning_rate": 4.906209140884459e-06, + "loss": 0.535524845123291, + "mean_token_accuracy": 0.815237820148468, + "num_tokens": 4798492.0, + "step": 527 + }, + { + "epoch": 0.4012158054711246, + "grad_norm": 2.1447622776031494, + "learning_rate": 4.905640004110216e-06, + "loss": 0.5628632307052612, + "mean_token_accuracy": 0.8085395097732544, + "num_tokens": 4805737.0, + "step": 528 + }, + { + "epoch": 0.40197568389057753, + "grad_norm": 1.6754741668701172, + "learning_rate": 4.905069178965215e-06, + "loss": 0.5046736598014832, + "mean_token_accuracy": 0.8247535228729248, + "num_tokens": 4816912.0, + "step": 529 + }, + { + "epoch": 0.4027355623100304, + "grad_norm": 2.271230459213257, + "learning_rate": 4.904496665850083e-06, + "loss": 0.6086187958717346, + "mean_token_accuracy": 0.7935276627540588, + "num_tokens": 4824577.0, + "step": 530 + }, + { + "epoch": 0.4034954407294833, + "grad_norm": 2.107595205307007, + "learning_rate": 4.903922465166633e-06, + "loss": 0.5431341528892517, + "mean_token_accuracy": 0.8129537105560303, + "num_tokens": 4831772.0, + "step": 531 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 1.3860732316970825, + "learning_rate": 4.903346577317859e-06, + "loss": 0.45816320180892944, + "mean_token_accuracy": 0.8328287601470947, + "num_tokens": 4850302.0, + "step": 532 + }, + { + "epoch": 0.40501519756838905, + "grad_norm": 1.9186837673187256, + "learning_rate": 4.902769002707942e-06, + "loss": 0.3294633626937866, + "mean_token_accuracy": 0.8853933811187744, + "num_tokens": 4856624.0, + "step": 533 + }, + { + "epoch": 0.40577507598784196, + "grad_norm": 1.516194462776184, + "learning_rate": 4.902189741742247e-06, + "loss": 0.45482105016708374, + "mean_token_accuracy": 0.8370342254638672, + "num_tokens": 4870395.0, + "step": 534 + }, + { + "epoch": 0.4065349544072948, + "grad_norm": 2.3235628604888916, + "learning_rate": 4.901608794827321e-06, + "loss": 0.40688639879226685, + "mean_token_accuracy": 0.8643521666526794, + "num_tokens": 4875645.0, + "step": 535 + }, + { + "epoch": 0.4072948328267477, + "grad_norm": 2.29286527633667, + "learning_rate": 4.9010261623708945e-06, + "loss": 0.45482826232910156, + "mean_token_accuracy": 0.8429383039474487, + "num_tokens": 4881772.0, + "step": 536 + }, + { + "epoch": 0.40805471124620063, + "grad_norm": 1.5907070636749268, + "learning_rate": 4.900441844781882e-06, + "loss": 0.5266948342323303, + "mean_token_accuracy": 0.8348641395568848, + "num_tokens": 4894289.0, + "step": 537 + }, + { + "epoch": 0.4088145896656535, + "grad_norm": 2.1816294193267822, + "learning_rate": 4.89985584247038e-06, + "loss": 0.4797617793083191, + "mean_token_accuracy": 0.8549500703811646, + "num_tokens": 4901106.0, + "step": 538 + }, + { + "epoch": 0.4095744680851064, + "grad_norm": 1.7347146272659302, + "learning_rate": 4.899268155847667e-06, + "loss": 0.4754739999771118, + "mean_token_accuracy": 0.8278418183326721, + "num_tokens": 4912131.0, + "step": 539 + }, + { + "epoch": 0.41033434650455924, + "grad_norm": 2.0694527626037598, + "learning_rate": 4.898678785326205e-06, + "loss": 0.5071008801460266, + "mean_token_accuracy": 0.8157946467399597, + "num_tokens": 4921141.0, + "step": 540 + }, + { + "epoch": 0.41109422492401215, + "grad_norm": 2.570047616958618, + "learning_rate": 4.898087731319637e-06, + "loss": 0.43639278411865234, + "mean_token_accuracy": 0.8682913780212402, + "num_tokens": 4926182.0, + "step": 541 + }, + { + "epoch": 0.41185410334346506, + "grad_norm": 4.064006805419922, + "learning_rate": 4.8974949942427854e-06, + "loss": 0.539260745048523, + "mean_token_accuracy": 0.8225528001785278, + "num_tokens": 4929449.0, + "step": 542 + }, + { + "epoch": 0.4126139817629179, + "grad_norm": 1.7644332647323608, + "learning_rate": 4.896900574511657e-06, + "loss": 0.472618043422699, + "mean_token_accuracy": 0.8332902193069458, + "num_tokens": 4939443.0, + "step": 543 + }, + { + "epoch": 0.4133738601823708, + "grad_norm": 2.879918336868286, + "learning_rate": 4.89630447254344e-06, + "loss": 0.6360667943954468, + "mean_token_accuracy": 0.8215296268463135, + "num_tokens": 4950838.0, + "step": 544 + }, + { + "epoch": 0.41413373860182373, + "grad_norm": 1.4575570821762085, + "learning_rate": 4.8957066887565005e-06, + "loss": 0.45617997646331787, + "mean_token_accuracy": 0.8373187184333801, + "num_tokens": 4965222.0, + "step": 545 + }, + { + "epoch": 0.4148936170212766, + "grad_norm": 2.4829535484313965, + "learning_rate": 4.895107223570386e-06, + "loss": 0.42285341024398804, + "mean_token_accuracy": 0.8686380386352539, + "num_tokens": 4970724.0, + "step": 546 + }, + { + "epoch": 0.4156534954407295, + "grad_norm": 2.639474630355835, + "learning_rate": 4.894506077405824e-06, + "loss": 0.5906289219856262, + "mean_token_accuracy": 0.8174435496330261, + "num_tokens": 4976766.0, + "step": 547 + }, + { + "epoch": 0.41641337386018235, + "grad_norm": 2.7960562705993652, + "learning_rate": 4.893903250684723e-06, + "loss": 0.4518949091434479, + "mean_token_accuracy": 0.8387585282325745, + "num_tokens": 4980991.0, + "step": 548 + }, + { + "epoch": 0.41717325227963525, + "grad_norm": 2.184176206588745, + "learning_rate": 4.893298743830168e-06, + "loss": 0.5223842859268188, + "mean_token_accuracy": 0.8170937299728394, + "num_tokens": 4987781.0, + "step": 549 + }, + { + "epoch": 0.41793313069908816, + "grad_norm": 2.2393438816070557, + "learning_rate": 4.892692557266429e-06, + "loss": 0.5238431692123413, + "mean_token_accuracy": 0.8217905759811401, + "num_tokens": 4994321.0, + "step": 550 + }, + { + "epoch": 0.418693009118541, + "grad_norm": 3.579047441482544, + "learning_rate": 4.8920846914189465e-06, + "loss": 0.5367584228515625, + "mean_token_accuracy": 0.8312011361122131, + "num_tokens": 4997951.0, + "step": 551 + }, + { + "epoch": 0.4194528875379939, + "grad_norm": 1.6330240964889526, + "learning_rate": 4.891475146714348e-06, + "loss": 0.6054705381393433, + "mean_token_accuracy": 0.7938206791877747, + "num_tokens": 5012726.0, + "step": 552 + }, + { + "epoch": 0.42021276595744683, + "grad_norm": 1.5775716304779053, + "learning_rate": 4.8908639235804324e-06, + "loss": 0.4774656891822815, + "mean_token_accuracy": 0.828762948513031, + "num_tokens": 5026751.0, + "step": 553 + }, + { + "epoch": 0.4209726443768997, + "grad_norm": 1.5719101428985596, + "learning_rate": 4.890251022446181e-06, + "loss": 0.549429178237915, + "mean_token_accuracy": 0.8110791444778442, + "num_tokens": 5041861.0, + "step": 554 + }, + { + "epoch": 0.4217325227963526, + "grad_norm": 1.8585275411605835, + "learning_rate": 4.889636443741752e-06, + "loss": 0.4448118805885315, + "mean_token_accuracy": 0.8462690711021423, + "num_tokens": 5052690.0, + "step": 555 + }, + { + "epoch": 0.42249240121580545, + "grad_norm": 2.189202070236206, + "learning_rate": 4.88902018789848e-06, + "loss": 0.4296762943267822, + "mean_token_accuracy": 0.8488791584968567, + "num_tokens": 5058964.0, + "step": 556 + }, + { + "epoch": 0.42325227963525835, + "grad_norm": 1.9328460693359375, + "learning_rate": 4.888402255348877e-06, + "loss": 0.5369474291801453, + "mean_token_accuracy": 0.8184729814529419, + "num_tokens": 5068465.0, + "step": 557 + }, + { + "epoch": 0.42401215805471126, + "grad_norm": 1.6233323812484741, + "learning_rate": 4.887782646526631e-06, + "loss": 0.5284391641616821, + "mean_token_accuracy": 0.8276044726371765, + "num_tokens": 5081052.0, + "step": 558 + }, + { + "epoch": 0.4247720364741641, + "grad_norm": 2.222813844680786, + "learning_rate": 4.887161361866608e-06, + "loss": 0.5679137706756592, + "mean_token_accuracy": 0.8012375831604004, + "num_tokens": 5090001.0, + "step": 559 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.1062207221984863, + "learning_rate": 4.8865384018048494e-06, + "loss": 0.5554201602935791, + "mean_token_accuracy": 0.8128066062927246, + "num_tokens": 5097644.0, + "step": 560 + }, + { + "epoch": 0.42629179331306993, + "grad_norm": 1.5380984544754028, + "learning_rate": 4.8859137667785735e-06, + "loss": 0.4948265850543976, + "mean_token_accuracy": 0.8258291482925415, + "num_tokens": 5110069.0, + "step": 561 + }, + { + "epoch": 0.4270516717325228, + "grad_norm": 2.0290257930755615, + "learning_rate": 4.8852874572261715e-06, + "loss": 0.4969530403614044, + "mean_token_accuracy": 0.8297134637832642, + "num_tokens": 5117452.0, + "step": 562 + }, + { + "epoch": 0.4278115501519757, + "grad_norm": 1.5651452541351318, + "learning_rate": 4.884659473587213e-06, + "loss": 0.5353102087974548, + "mean_token_accuracy": 0.8161719441413879, + "num_tokens": 5133756.0, + "step": 563 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 2.2470998764038086, + "learning_rate": 4.884029816302441e-06, + "loss": 0.5104288458824158, + "mean_token_accuracy": 0.8081635236740112, + "num_tokens": 5140278.0, + "step": 564 + }, + { + "epoch": 0.42933130699088146, + "grad_norm": 1.726891279220581, + "learning_rate": 4.883398485813772e-06, + "loss": 0.4508771002292633, + "mean_token_accuracy": 0.8548800349235535, + "num_tokens": 5150115.0, + "step": 565 + }, + { + "epoch": 0.43009118541033436, + "grad_norm": 1.4779289960861206, + "learning_rate": 4.8827654825642984e-06, + "loss": 0.46861088275909424, + "mean_token_accuracy": 0.8209476470947266, + "num_tokens": 5163225.0, + "step": 566 + }, + { + "epoch": 0.4308510638297872, + "grad_norm": 1.2361034154891968, + "learning_rate": 4.882130806998287e-06, + "loss": 0.4591076672077179, + "mean_token_accuracy": 0.803041934967041, + "num_tokens": 5180342.0, + "step": 567 + }, + { + "epoch": 0.4316109422492401, + "grad_norm": 1.882467269897461, + "learning_rate": 4.881494459561177e-06, + "loss": 0.579258143901825, + "mean_token_accuracy": 0.8007112741470337, + "num_tokens": 5189595.0, + "step": 568 + }, + { + "epoch": 0.43237082066869303, + "grad_norm": 1.095462441444397, + "learning_rate": 4.880856440699582e-06, + "loss": 0.3806574046611786, + "mean_token_accuracy": 0.8650111556053162, + "num_tokens": 5211642.0, + "step": 569 + }, + { + "epoch": 0.4331306990881459, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.880216750861288e-06, + "loss": 0.544589638710022, + "mean_token_accuracy": 0.8060122728347778, + "num_tokens": 5224137.0, + "step": 570 + }, + { + "epoch": 0.4338905775075988, + "grad_norm": 1.8561251163482666, + "learning_rate": 4.879575390495254e-06, + "loss": 0.4094924330711365, + "mean_token_accuracy": 0.8591406345367432, + "num_tokens": 5231588.0, + "step": 571 + }, + { + "epoch": 0.43465045592705165, + "grad_norm": 3.01326847076416, + "learning_rate": 4.878932360051611e-06, + "loss": 0.6139192581176758, + "mean_token_accuracy": 0.8108739852905273, + "num_tokens": 5236853.0, + "step": 572 + }, + { + "epoch": 0.43541033434650456, + "grad_norm": 2.1753034591674805, + "learning_rate": 4.878287659981663e-06, + "loss": 0.49082931876182556, + "mean_token_accuracy": 0.862828254699707, + "num_tokens": 5243264.0, + "step": 573 + }, + { + "epoch": 0.43617021276595747, + "grad_norm": 1.4437755346298218, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.5608728528022766, + "mean_token_accuracy": 0.8271626234054565, + "num_tokens": 5261757.0, + "step": 574 + }, + { + "epoch": 0.4369300911854103, + "grad_norm": 1.786683440208435, + "learning_rate": 4.876993252773923e-06, + "loss": 0.4377627968788147, + "mean_token_accuracy": 0.844936192035675, + "num_tokens": 5271038.0, + "step": 575 + }, + { + "epoch": 0.4376899696048632, + "grad_norm": 1.3425915241241455, + "learning_rate": 4.876343546544596e-06, + "loss": 0.44762521982192993, + "mean_token_accuracy": 0.8397793769836426, + "num_tokens": 5285555.0, + "step": 576 + }, + { + "epoch": 0.43844984802431614, + "grad_norm": 2.1549675464630127, + "learning_rate": 4.8756921725058935e-06, + "loss": 0.5332942008972168, + "mean_token_accuracy": 0.820149302482605, + "num_tokens": 5294595.0, + "step": 577 + }, + { + "epoch": 0.439209726443769, + "grad_norm": 1.5254042148590088, + "learning_rate": 4.875039131114975e-06, + "loss": 0.3646543622016907, + "mean_token_accuracy": 0.8442583084106445, + "num_tokens": 5304955.0, + "step": 578 + }, + { + "epoch": 0.4399696048632219, + "grad_norm": 1.5751557350158691, + "learning_rate": 4.8743844228301676e-06, + "loss": 0.4854734539985657, + "mean_token_accuracy": 0.8317523002624512, + "num_tokens": 5317351.0, + "step": 579 + }, + { + "epoch": 0.44072948328267475, + "grad_norm": 1.6950466632843018, + "learning_rate": 4.873728048110973e-06, + "loss": 0.5907570719718933, + "mean_token_accuracy": 0.7946986556053162, + "num_tokens": 5332542.0, + "step": 580 + }, + { + "epoch": 0.44148936170212766, + "grad_norm": 2.1180708408355713, + "learning_rate": 4.873070007418059e-06, + "loss": 0.5220296382904053, + "mean_token_accuracy": 0.8037363290786743, + "num_tokens": 5341722.0, + "step": 581 + }, + { + "epoch": 0.44224924012158057, + "grad_norm": 1.3643816709518433, + "learning_rate": 4.872410301213265e-06, + "loss": 0.4865502417087555, + "mean_token_accuracy": 0.8377852439880371, + "num_tokens": 5359359.0, + "step": 582 + }, + { + "epoch": 0.4430091185410334, + "grad_norm": 1.483280897140503, + "learning_rate": 4.871748929959598e-06, + "loss": 0.36856764554977417, + "mean_token_accuracy": 0.8709549903869629, + "num_tokens": 5369749.0, + "step": 583 + }, + { + "epoch": 0.44376899696048633, + "grad_norm": 1.6891541481018066, + "learning_rate": 4.871085894121234e-06, + "loss": 0.5768930912017822, + "mean_token_accuracy": 0.8030461668968201, + "num_tokens": 5383912.0, + "step": 584 + }, + { + "epoch": 0.44452887537993924, + "grad_norm": 2.1318740844726562, + "learning_rate": 4.870421194163515e-06, + "loss": 0.4337100386619568, + "mean_token_accuracy": 0.8562518358230591, + "num_tokens": 5389412.0, + "step": 585 + }, + { + "epoch": 0.4452887537993921, + "grad_norm": 2.540255546569824, + "learning_rate": 4.869754830552956e-06, + "loss": 0.4708256125450134, + "mean_token_accuracy": 0.8446552753448486, + "num_tokens": 5394762.0, + "step": 586 + }, + { + "epoch": 0.446048632218845, + "grad_norm": 2.048015594482422, + "learning_rate": 4.869086803757235e-06, + "loss": 0.5265800952911377, + "mean_token_accuracy": 0.8181137442588806, + "num_tokens": 5402379.0, + "step": 587 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 2.9821012020111084, + "learning_rate": 4.868417114245199e-06, + "loss": 0.6299797296524048, + "mean_token_accuracy": 0.8237329125404358, + "num_tokens": 5408229.0, + "step": 588 + }, + { + "epoch": 0.44756838905775076, + "grad_norm": 1.7807202339172363, + "learning_rate": 4.867745762486862e-06, + "loss": 0.5176759958267212, + "mean_token_accuracy": 0.8184244632720947, + "num_tokens": 5418383.0, + "step": 589 + }, + { + "epoch": 0.44832826747720367, + "grad_norm": 1.5466399192810059, + "learning_rate": 4.8670727489534035e-06, + "loss": 0.5137228965759277, + "mean_token_accuracy": 0.8365053534507751, + "num_tokens": 5432127.0, + "step": 590 + }, + { + "epoch": 0.4490881458966565, + "grad_norm": 2.9521141052246094, + "learning_rate": 4.866398074117173e-06, + "loss": 0.4056887924671173, + "mean_token_accuracy": 0.8561501502990723, + "num_tokens": 5436062.0, + "step": 591 + }, + { + "epoch": 0.44984802431610943, + "grad_norm": 2.058743953704834, + "learning_rate": 4.86572173845168e-06, + "loss": 0.6124799251556396, + "mean_token_accuracy": 0.8007957339286804, + "num_tokens": 5444989.0, + "step": 592 + }, + { + "epoch": 0.4506079027355623, + "grad_norm": 2.1243767738342285, + "learning_rate": 4.865043742431605e-06, + "loss": 0.5659694671630859, + "mean_token_accuracy": 0.8084750175476074, + "num_tokens": 5453865.0, + "step": 593 + }, + { + "epoch": 0.4513677811550152, + "grad_norm": 1.6732314825057983, + "learning_rate": 4.864364086532792e-06, + "loss": 0.47879064083099365, + "mean_token_accuracy": 0.8346436023712158, + "num_tokens": 5466398.0, + "step": 594 + }, + { + "epoch": 0.4521276595744681, + "grad_norm": 1.3793858289718628, + "learning_rate": 4.863682771232249e-06, + "loss": 0.45989373326301575, + "mean_token_accuracy": 0.8254791498184204, + "num_tokens": 5482121.0, + "step": 595 + }, + { + "epoch": 0.45288753799392095, + "grad_norm": 1.9812315702438354, + "learning_rate": 4.862999797008149e-06, + "loss": 0.5778874754905701, + "mean_token_accuracy": 0.8041508197784424, + "num_tokens": 5493000.0, + "step": 596 + }, + { + "epoch": 0.45364741641337386, + "grad_norm": 3.3065083026885986, + "learning_rate": 4.862315164339829e-06, + "loss": 0.4623975157737732, + "mean_token_accuracy": 0.8426318168640137, + "num_tokens": 5496723.0, + "step": 597 + }, + { + "epoch": 0.45440729483282677, + "grad_norm": 3.167119026184082, + "learning_rate": 4.861628873707792e-06, + "loss": 0.6984533667564392, + "mean_token_accuracy": 0.772136926651001, + "num_tokens": 5501161.0, + "step": 598 + }, + { + "epoch": 0.4551671732522796, + "grad_norm": 2.2130985260009766, + "learning_rate": 4.860940925593703e-06, + "loss": 0.4823192059993744, + "mean_token_accuracy": 0.8462972640991211, + "num_tokens": 5509544.0, + "step": 599 + }, + { + "epoch": 0.45592705167173253, + "grad_norm": 3.029191732406616, + "learning_rate": 4.86025132048039e-06, + "loss": 0.523664116859436, + "mean_token_accuracy": 0.8229140043258667, + "num_tokens": 5514586.0, + "step": 600 + }, + { + "epoch": 0.4566869300911854, + "grad_norm": 1.6983962059020996, + "learning_rate": 4.859560058851844e-06, + "loss": 0.4832698106765747, + "mean_token_accuracy": 0.8403248190879822, + "num_tokens": 5525773.0, + "step": 601 + }, + { + "epoch": 0.4574468085106383, + "grad_norm": 3.0504038333892822, + "learning_rate": 4.8588671411932195e-06, + "loss": 0.5158926248550415, + "mean_token_accuracy": 0.8098392486572266, + "num_tokens": 5529739.0, + "step": 602 + }, + { + "epoch": 0.4582066869300912, + "grad_norm": 2.584836483001709, + "learning_rate": 4.858172567990832e-06, + "loss": 0.5724587440490723, + "mean_token_accuracy": 0.8128519058227539, + "num_tokens": 5535763.0, + "step": 603 + }, + { + "epoch": 0.45896656534954405, + "grad_norm": 2.0514042377471924, + "learning_rate": 4.857476339732162e-06, + "loss": 0.4337679445743561, + "mean_token_accuracy": 0.8405929207801819, + "num_tokens": 5543075.0, + "step": 604 + }, + { + "epoch": 0.45972644376899696, + "grad_norm": 2.2949347496032715, + "learning_rate": 4.856778456905846e-06, + "loss": 0.46532145142555237, + "mean_token_accuracy": 0.8345137238502502, + "num_tokens": 5549035.0, + "step": 605 + }, + { + "epoch": 0.46048632218844987, + "grad_norm": 2.2067551612854004, + "learning_rate": 4.856078920001689e-06, + "loss": 0.5855136513710022, + "mean_token_accuracy": 0.8043795228004456, + "num_tokens": 5555545.0, + "step": 606 + }, + { + "epoch": 0.4612462006079027, + "grad_norm": 2.101945161819458, + "learning_rate": 4.855377729510648e-06, + "loss": 0.6071814298629761, + "mean_token_accuracy": 0.7973253130912781, + "num_tokens": 5563615.0, + "step": 607 + }, + { + "epoch": 0.46200607902735563, + "grad_norm": 2.5958821773529053, + "learning_rate": 4.8546748859248504e-06, + "loss": 0.6278061866760254, + "mean_token_accuracy": 0.7864972352981567, + "num_tokens": 5570078.0, + "step": 608 + }, + { + "epoch": 0.4627659574468085, + "grad_norm": 2.778101921081543, + "learning_rate": 4.853970389737576e-06, + "loss": 0.35521194338798523, + "mean_token_accuracy": 0.8752605319023132, + "num_tokens": 5573995.0, + "step": 609 + }, + { + "epoch": 0.4635258358662614, + "grad_norm": 2.600534677505493, + "learning_rate": 4.8532642414432675e-06, + "loss": 0.6541563868522644, + "mean_token_accuracy": 0.7843613028526306, + "num_tokens": 5580333.0, + "step": 610 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 1.778337836265564, + "learning_rate": 4.852556441537528e-06, + "loss": 0.3561405837535858, + "mean_token_accuracy": 0.8579353094100952, + "num_tokens": 5588430.0, + "step": 611 + }, + { + "epoch": 0.46504559270516715, + "grad_norm": 1.5653862953186035, + "learning_rate": 4.851846990517118e-06, + "loss": 0.6067906618118286, + "mean_token_accuracy": 0.7919317483901978, + "num_tokens": 5601700.0, + "step": 612 + }, + { + "epoch": 0.46580547112462006, + "grad_norm": 1.6097723245620728, + "learning_rate": 4.851135888879958e-06, + "loss": 0.446664422750473, + "mean_token_accuracy": 0.8441969156265259, + "num_tokens": 5612063.0, + "step": 613 + }, + { + "epoch": 0.46656534954407297, + "grad_norm": 1.961207389831543, + "learning_rate": 4.850423137125126e-06, + "loss": 0.5508605241775513, + "mean_token_accuracy": 0.8240450024604797, + "num_tokens": 5620245.0, + "step": 614 + }, + { + "epoch": 0.4673252279635258, + "grad_norm": 2.2189085483551025, + "learning_rate": 4.8497087357528585e-06, + "loss": 0.6805076599121094, + "mean_token_accuracy": 0.771978497505188, + "num_tokens": 5629590.0, + "step": 615 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 2.5176279544830322, + "learning_rate": 4.8489926852645505e-06, + "loss": 0.4512156844139099, + "mean_token_accuracy": 0.836459755897522, + "num_tokens": 5635259.0, + "step": 616 + }, + { + "epoch": 0.4688449848024316, + "grad_norm": 1.5327287912368774, + "learning_rate": 4.848274986162754e-06, + "loss": 0.4884302616119385, + "mean_token_accuracy": 0.8194037079811096, + "num_tokens": 5649993.0, + "step": 617 + }, + { + "epoch": 0.4696048632218845, + "grad_norm": 2.184554100036621, + "learning_rate": 4.847555638951177e-06, + "loss": 0.5141451358795166, + "mean_token_accuracy": 0.8245922327041626, + "num_tokens": 5657375.0, + "step": 618 + }, + { + "epoch": 0.4703647416413374, + "grad_norm": 1.6143407821655273, + "learning_rate": 4.846834644134686e-06, + "loss": 0.4276641607284546, + "mean_token_accuracy": 0.8481845855712891, + "num_tokens": 5667941.0, + "step": 619 + }, + { + "epoch": 0.47112462006079026, + "grad_norm": 2.3747270107269287, + "learning_rate": 4.846112002219301e-06, + "loss": 0.5608246922492981, + "mean_token_accuracy": 0.8073011040687561, + "num_tokens": 5675042.0, + "step": 620 + }, + { + "epoch": 0.47188449848024316, + "grad_norm": 2.390404224395752, + "learning_rate": 4.845387713712203e-06, + "loss": 0.46616724133491516, + "mean_token_accuracy": 0.8468319177627563, + "num_tokens": 5680207.0, + "step": 621 + }, + { + "epoch": 0.4726443768996961, + "grad_norm": 1.7245099544525146, + "learning_rate": 4.844661779121723e-06, + "loss": 0.5652435421943665, + "mean_token_accuracy": 0.8010749816894531, + "num_tokens": 5693759.0, + "step": 622 + }, + { + "epoch": 0.4734042553191489, + "grad_norm": 2.6923108100891113, + "learning_rate": 4.843934198957351e-06, + "loss": 0.6254661679267883, + "mean_token_accuracy": 0.8236024975776672, + "num_tokens": 5699916.0, + "step": 623 + }, + { + "epoch": 0.47416413373860183, + "grad_norm": 2.516901969909668, + "learning_rate": 4.84320497372973e-06, + "loss": 0.6334252953529358, + "mean_token_accuracy": 0.7803834676742554, + "num_tokens": 5706554.0, + "step": 624 + }, + { + "epoch": 0.4749240121580547, + "grad_norm": 2.3744447231292725, + "learning_rate": 4.842474103950658e-06, + "loss": 0.4221811890602112, + "mean_token_accuracy": 0.8639545440673828, + "num_tokens": 5711756.0, + "step": 625 + }, + { + "epoch": 0.4756838905775076, + "grad_norm": 3.2373476028442383, + "learning_rate": 4.841741590133089e-06, + "loss": 0.6637828946113586, + "mean_token_accuracy": 0.7968347072601318, + "num_tokens": 5716458.0, + "step": 626 + }, + { + "epoch": 0.4764437689969605, + "grad_norm": 2.153888463973999, + "learning_rate": 4.841007432791129e-06, + "loss": 0.4877486228942871, + "mean_token_accuracy": 0.8345249891281128, + "num_tokens": 5723155.0, + "step": 627 + }, + { + "epoch": 0.47720364741641336, + "grad_norm": 2.120497703552246, + "learning_rate": 4.8402716324400375e-06, + "loss": 0.37323033809661865, + "mean_token_accuracy": 0.8734050393104553, + "num_tokens": 5729171.0, + "step": 628 + }, + { + "epoch": 0.47796352583586627, + "grad_norm": 1.5294172763824463, + "learning_rate": 4.839534189596228e-06, + "loss": 0.4057067334651947, + "mean_token_accuracy": 0.8523319959640503, + "num_tokens": 5740112.0, + "step": 629 + }, + { + "epoch": 0.4787234042553192, + "grad_norm": 2.1913886070251465, + "learning_rate": 4.8387951047772656e-06, + "loss": 0.4835960865020752, + "mean_token_accuracy": 0.8438145518302917, + "num_tokens": 5746838.0, + "step": 630 + }, + { + "epoch": 0.479483282674772, + "grad_norm": 1.482897162437439, + "learning_rate": 4.838054378501868e-06, + "loss": 0.46967992186546326, + "mean_token_accuracy": 0.8315759897232056, + "num_tokens": 5760428.0, + "step": 631 + }, + { + "epoch": 0.48024316109422494, + "grad_norm": 1.38850998878479, + "learning_rate": 4.837312011289907e-06, + "loss": 0.41845446825027466, + "mean_token_accuracy": 0.8557186126708984, + "num_tokens": 5773437.0, + "step": 632 + }, + { + "epoch": 0.4810030395136778, + "grad_norm": 3.8337457180023193, + "learning_rate": 4.836568003662403e-06, + "loss": 0.5102912187576294, + "mean_token_accuracy": 0.830644965171814, + "num_tokens": 5776367.0, + "step": 633 + }, + { + "epoch": 0.4817629179331307, + "grad_norm": 1.2084007263183594, + "learning_rate": 4.8358223561415304e-06, + "loss": 0.3835333585739136, + "mean_token_accuracy": 0.8639016151428223, + "num_tokens": 5792246.0, + "step": 634 + }, + { + "epoch": 0.4825227963525836, + "grad_norm": 1.939408540725708, + "learning_rate": 4.835075069250613e-06, + "loss": 0.4044850468635559, + "mean_token_accuracy": 0.8488376140594482, + "num_tokens": 5799853.0, + "step": 635 + }, + { + "epoch": 0.48328267477203646, + "grad_norm": 1.345870852470398, + "learning_rate": 4.8343261435141245e-06, + "loss": 0.46660199761390686, + "mean_token_accuracy": 0.8371681571006775, + "num_tokens": 5817478.0, + "step": 636 + }, + { + "epoch": 0.48404255319148937, + "grad_norm": 1.6531339883804321, + "learning_rate": 4.833575579457691e-06, + "loss": 0.3886989951133728, + "mean_token_accuracy": 0.8763507008552551, + "num_tokens": 5825739.0, + "step": 637 + }, + { + "epoch": 0.4848024316109423, + "grad_norm": 1.6443969011306763, + "learning_rate": 4.832823377608088e-06, + "loss": 0.4070289731025696, + "mean_token_accuracy": 0.8586630821228027, + "num_tokens": 5837917.0, + "step": 638 + }, + { + "epoch": 0.48556231003039513, + "grad_norm": 2.005136013031006, + "learning_rate": 4.832069538493237e-06, + "loss": 0.40616685152053833, + "mean_token_accuracy": 0.8571510314941406, + "num_tokens": 5845250.0, + "step": 639 + }, + { + "epoch": 0.48632218844984804, + "grad_norm": 1.5244266986846924, + "learning_rate": 4.831314062642213e-06, + "loss": 0.49530288577079773, + "mean_token_accuracy": 0.8328841924667358, + "num_tokens": 5857407.0, + "step": 640 + }, + { + "epoch": 0.4870820668693009, + "grad_norm": 1.9876971244812012, + "learning_rate": 4.830556950585239e-06, + "loss": 0.4583776593208313, + "mean_token_accuracy": 0.8427221179008484, + "num_tokens": 5865391.0, + "step": 641 + }, + { + "epoch": 0.4878419452887538, + "grad_norm": 3.023336172103882, + "learning_rate": 4.829798202853683e-06, + "loss": 0.6134771108627319, + "mean_token_accuracy": 0.7981935739517212, + "num_tokens": 5870729.0, + "step": 642 + }, + { + "epoch": 0.4886018237082067, + "grad_norm": 1.8889515399932861, + "learning_rate": 4.829037819980065e-06, + "loss": 0.4420135021209717, + "mean_token_accuracy": 0.8480775356292725, + "num_tokens": 5878982.0, + "step": 643 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.2408435344696045, + "learning_rate": 4.828275802498051e-06, + "loss": 0.525706946849823, + "mean_token_accuracy": 0.8271557092666626, + "num_tokens": 5885097.0, + "step": 644 + }, + { + "epoch": 0.49012158054711247, + "grad_norm": 1.9734224081039429, + "learning_rate": 4.827512150942454e-06, + "loss": 0.44246578216552734, + "mean_token_accuracy": 0.8456668257713318, + "num_tokens": 5893941.0, + "step": 645 + }, + { + "epoch": 0.4908814589665654, + "grad_norm": 1.9618173837661743, + "learning_rate": 4.8267468658492335e-06, + "loss": 0.5119768381118774, + "mean_token_accuracy": 0.8355510830879211, + "num_tokens": 5902829.0, + "step": 646 + }, + { + "epoch": 0.49164133738601823, + "grad_norm": 1.7181587219238281, + "learning_rate": 4.825979947755496e-06, + "loss": 0.5666520595550537, + "mean_token_accuracy": 0.7951971888542175, + "num_tokens": 5915212.0, + "step": 647 + }, + { + "epoch": 0.49240121580547114, + "grad_norm": 3.0121164321899414, + "learning_rate": 4.8252113971994955e-06, + "loss": 0.628632128238678, + "mean_token_accuracy": 0.8041050434112549, + "num_tokens": 5921410.0, + "step": 648 + }, + { + "epoch": 0.493161094224924, + "grad_norm": 2.9980475902557373, + "learning_rate": 4.824441214720629e-06, + "loss": 0.4507424831390381, + "mean_token_accuracy": 0.8636263608932495, + "num_tokens": 5925179.0, + "step": 649 + }, + { + "epoch": 0.4939209726443769, + "grad_norm": 2.0096445083618164, + "learning_rate": 4.823669400859441e-06, + "loss": 0.602759838104248, + "mean_token_accuracy": 0.8104915618896484, + "num_tokens": 5934160.0, + "step": 650 + }, + { + "epoch": 0.4946808510638298, + "grad_norm": 1.1186442375183105, + "learning_rate": 4.8228959561576195e-06, + "loss": 0.41168469190597534, + "mean_token_accuracy": 0.8461419939994812, + "num_tokens": 5954163.0, + "step": 651 + }, + { + "epoch": 0.49544072948328266, + "grad_norm": 1.855465054512024, + "learning_rate": 4.822120881157998e-06, + "loss": 0.5049735307693481, + "mean_token_accuracy": 0.8225747346878052, + "num_tokens": 5963840.0, + "step": 652 + }, + { + "epoch": 0.49620060790273557, + "grad_norm": 3.550563335418701, + "learning_rate": 4.821344176404554e-06, + "loss": 0.49025264382362366, + "mean_token_accuracy": 0.8265978693962097, + "num_tokens": 5967358.0, + "step": 653 + }, + { + "epoch": 0.4969604863221885, + "grad_norm": 3.063910484313965, + "learning_rate": 4.820565842442408e-06, + "loss": 0.5652767419815063, + "mean_token_accuracy": 0.811700701713562, + "num_tokens": 5971858.0, + "step": 654 + }, + { + "epoch": 0.49772036474164133, + "grad_norm": 2.4613308906555176, + "learning_rate": 4.819785879817827e-06, + "loss": 0.5296125411987305, + "mean_token_accuracy": 0.8336488008499146, + "num_tokens": 5977442.0, + "step": 655 + }, + { + "epoch": 0.49848024316109424, + "grad_norm": 2.342519760131836, + "learning_rate": 4.819004289078217e-06, + "loss": 0.5753380060195923, + "mean_token_accuracy": 0.7922406792640686, + "num_tokens": 5984531.0, + "step": 656 + }, + { + "epoch": 0.4992401215805471, + "grad_norm": 2.0410680770874023, + "learning_rate": 4.818221070772129e-06, + "loss": 0.5433275699615479, + "mean_token_accuracy": 0.8043830990791321, + "num_tokens": 5992642.0, + "step": 657 + }, + { + "epoch": 0.5, + "grad_norm": 1.4999698400497437, + "learning_rate": 4.8174362254492555e-06, + "loss": 0.5248899459838867, + "mean_token_accuracy": 0.8107168674468994, + "num_tokens": 6005543.0, + "step": 658 + }, + { + "epoch": 0.5007598784194529, + "grad_norm": 1.9494401216506958, + "learning_rate": 4.816649753660431e-06, + "loss": 0.41291385889053345, + "mean_token_accuracy": 0.8650569915771484, + "num_tokens": 6012185.0, + "step": 659 + }, + { + "epoch": 0.5015197568389058, + "grad_norm": 2.7514095306396484, + "learning_rate": 4.815861655957632e-06, + "loss": 0.4244142770767212, + "mean_token_accuracy": 0.8485112190246582, + "num_tokens": 6016809.0, + "step": 660 + }, + { + "epoch": 0.5022796352583586, + "grad_norm": 1.4354928731918335, + "learning_rate": 4.815071932893976e-06, + "loss": 0.4332060217857361, + "mean_token_accuracy": 0.8386815786361694, + "num_tokens": 6034795.0, + "step": 661 + }, + { + "epoch": 0.5030395136778115, + "grad_norm": 1.3113417625427246, + "learning_rate": 4.81428058502372e-06, + "loss": 0.5415540933609009, + "mean_token_accuracy": 0.8115285038948059, + "num_tokens": 6053624.0, + "step": 662 + }, + { + "epoch": 0.5037993920972644, + "grad_norm": 1.820868730545044, + "learning_rate": 4.813487612902265e-06, + "loss": 0.5360245108604431, + "mean_token_accuracy": 0.8313555717468262, + "num_tokens": 6063399.0, + "step": 663 + }, + { + "epoch": 0.5045592705167173, + "grad_norm": 2.347001552581787, + "learning_rate": 4.812693017086145e-06, + "loss": 0.4926982820034027, + "mean_token_accuracy": 0.8137006759643555, + "num_tokens": 6070111.0, + "step": 664 + }, + { + "epoch": 0.5053191489361702, + "grad_norm": 1.8830888271331787, + "learning_rate": 4.811896798133042e-06, + "loss": 0.5419014692306519, + "mean_token_accuracy": 0.8027454614639282, + "num_tokens": 6081090.0, + "step": 665 + }, + { + "epoch": 0.506079027355623, + "grad_norm": 2.3258056640625, + "learning_rate": 4.811098956601772e-06, + "loss": 0.4629337787628174, + "mean_token_accuracy": 0.8416580557823181, + "num_tokens": 6087921.0, + "step": 666 + }, + { + "epoch": 0.506838905775076, + "grad_norm": 1.9578291177749634, + "learning_rate": 4.810299493052289e-06, + "loss": 0.40305402874946594, + "mean_token_accuracy": 0.8529061079025269, + "num_tokens": 6100034.0, + "step": 667 + }, + { + "epoch": 0.5075987841945289, + "grad_norm": 2.800635576248169, + "learning_rate": 4.809498408045691e-06, + "loss": 0.5087342262268066, + "mean_token_accuracy": 0.8214689493179321, + "num_tokens": 6104742.0, + "step": 668 + }, + { + "epoch": 0.5083586626139818, + "grad_norm": 1.5318149328231812, + "learning_rate": 4.808695702144206e-06, + "loss": 0.4733222723007202, + "mean_token_accuracy": 0.837577223777771, + "num_tokens": 6117242.0, + "step": 669 + }, + { + "epoch": 0.5091185410334347, + "grad_norm": 1.2368661165237427, + "learning_rate": 4.807891375911207e-06, + "loss": 0.3929097056388855, + "mean_token_accuracy": 0.8331400752067566, + "num_tokens": 6133509.0, + "step": 670 + }, + { + "epoch": 0.5098784194528876, + "grad_norm": 2.4711415767669678, + "learning_rate": 4.8070854299112e-06, + "loss": 0.6294851303100586, + "mean_token_accuracy": 0.7956781983375549, + "num_tokens": 6140294.0, + "step": 671 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.590961217880249, + "learning_rate": 4.806277864709828e-06, + "loss": 0.580160915851593, + "mean_token_accuracy": 0.809589684009552, + "num_tokens": 6145803.0, + "step": 672 + }, + { + "epoch": 0.5113981762917933, + "grad_norm": 2.4653842449188232, + "learning_rate": 4.805468680873874e-06, + "loss": 0.5262120366096497, + "mean_token_accuracy": 0.822458803653717, + "num_tokens": 6151236.0, + "step": 673 + }, + { + "epoch": 0.5121580547112462, + "grad_norm": 2.860720157623291, + "learning_rate": 4.804657878971252e-06, + "loss": 0.4007391035556793, + "mean_token_accuracy": 0.8637382984161377, + "num_tokens": 6155310.0, + "step": 674 + }, + { + "epoch": 0.5129179331306991, + "grad_norm": 2.520282030105591, + "learning_rate": 4.803845459571014e-06, + "loss": 0.45798182487487793, + "mean_token_accuracy": 0.8270114660263062, + "num_tokens": 6160326.0, + "step": 675 + }, + { + "epoch": 0.513677811550152, + "grad_norm": 2.7290921211242676, + "learning_rate": 4.803031423243349e-06, + "loss": 0.5745848417282104, + "mean_token_accuracy": 0.8401234745979309, + "num_tokens": 6165709.0, + "step": 676 + }, + { + "epoch": 0.5144376899696048, + "grad_norm": 1.6678650379180908, + "learning_rate": 4.802215770559578e-06, + "loss": 0.5257721543312073, + "mean_token_accuracy": 0.8241991996765137, + "num_tokens": 6177875.0, + "step": 677 + }, + { + "epoch": 0.5151975683890577, + "grad_norm": 2.1720468997955322, + "learning_rate": 4.801398502092156e-06, + "loss": 0.45342206954956055, + "mean_token_accuracy": 0.8463799953460693, + "num_tokens": 6185415.0, + "step": 678 + }, + { + "epoch": 0.5159574468085106, + "grad_norm": 2.282259702682495, + "learning_rate": 4.800579618414677e-06, + "loss": 0.4864169955253601, + "mean_token_accuracy": 0.8300632238388062, + "num_tokens": 6191832.0, + "step": 679 + }, + { + "epoch": 0.5167173252279635, + "grad_norm": 2.0092248916625977, + "learning_rate": 4.799759120101861e-06, + "loss": 0.5781463980674744, + "mean_token_accuracy": 0.8267031908035278, + "num_tokens": 6199440.0, + "step": 680 + }, + { + "epoch": 0.5174772036474165, + "grad_norm": 1.396580696105957, + "learning_rate": 4.798937007729568e-06, + "loss": 0.49689239263534546, + "mean_token_accuracy": 0.8257499933242798, + "num_tokens": 6213840.0, + "step": 681 + }, + { + "epoch": 0.5182370820668692, + "grad_norm": 1.9060769081115723, + "learning_rate": 4.798113281874788e-06, + "loss": 0.48969539999961853, + "mean_token_accuracy": 0.8171790838241577, + "num_tokens": 6223006.0, + "step": 682 + }, + { + "epoch": 0.5189969604863222, + "grad_norm": 1.6255282163619995, + "learning_rate": 4.797287943115642e-06, + "loss": 0.5532330870628357, + "mean_token_accuracy": 0.8173393607139587, + "num_tokens": 6234857.0, + "step": 683 + }, + { + "epoch": 0.5197568389057751, + "grad_norm": 1.6923905611038208, + "learning_rate": 4.796460992031386e-06, + "loss": 0.4880887269973755, + "mean_token_accuracy": 0.834983229637146, + "num_tokens": 6245252.0, + "step": 684 + }, + { + "epoch": 0.520516717325228, + "grad_norm": 2.13161301612854, + "learning_rate": 4.7956324292024045e-06, + "loss": 0.5687593817710876, + "mean_token_accuracy": 0.7996571063995361, + "num_tokens": 6253726.0, + "step": 685 + }, + { + "epoch": 0.5212765957446809, + "grad_norm": 2.509375810623169, + "learning_rate": 4.794802255210217e-06, + "loss": 0.5396929979324341, + "mean_token_accuracy": 0.8007107973098755, + "num_tokens": 6259238.0, + "step": 686 + }, + { + "epoch": 0.5220364741641338, + "grad_norm": 2.393710136413574, + "learning_rate": 4.793970470637469e-06, + "loss": 0.6165191531181335, + "mean_token_accuracy": 0.7891418933868408, + "num_tokens": 6266325.0, + "step": 687 + }, + { + "epoch": 0.5227963525835866, + "grad_norm": 1.511647343635559, + "learning_rate": 4.7931370760679415e-06, + "loss": 0.4773876965045929, + "mean_token_accuracy": 0.8381044864654541, + "num_tokens": 6277447.0, + "step": 688 + }, + { + "epoch": 0.5235562310030395, + "grad_norm": 2.206587314605713, + "learning_rate": 4.792302072086542e-06, + "loss": 0.5482058525085449, + "mean_token_accuracy": 0.8239108920097351, + "num_tokens": 6285163.0, + "step": 689 + }, + { + "epoch": 0.5243161094224924, + "grad_norm": 3.018146514892578, + "learning_rate": 4.7914654592793065e-06, + "loss": 0.4880615472793579, + "mean_token_accuracy": 0.8361308574676514, + "num_tokens": 6289386.0, + "step": 690 + }, + { + "epoch": 0.5250759878419453, + "grad_norm": 1.6469231843948364, + "learning_rate": 4.790627238233405e-06, + "loss": 0.4164774715900421, + "mean_token_accuracy": 0.8496290445327759, + "num_tokens": 6298915.0, + "step": 691 + }, + { + "epoch": 0.5258358662613982, + "grad_norm": 2.352505922317505, + "learning_rate": 4.789787409537131e-06, + "loss": 0.5366303324699402, + "mean_token_accuracy": 0.8350417613983154, + "num_tokens": 6306130.0, + "step": 692 + }, + { + "epoch": 0.526595744680851, + "grad_norm": 1.7463021278381348, + "learning_rate": 4.7889459737799105e-06, + "loss": 0.4389137923717499, + "mean_token_accuracy": 0.8463300466537476, + "num_tokens": 6315503.0, + "step": 693 + }, + { + "epoch": 0.5273556231003039, + "grad_norm": 2.257706642150879, + "learning_rate": 4.788102931552294e-06, + "loss": 0.5309344530105591, + "mean_token_accuracy": 0.8164352178573608, + "num_tokens": 6321852.0, + "step": 694 + }, + { + "epoch": 0.5281155015197568, + "grad_norm": 2.392732620239258, + "learning_rate": 4.787258283445962e-06, + "loss": 0.3956204056739807, + "mean_token_accuracy": 0.8671456575393677, + "num_tokens": 6327380.0, + "step": 695 + }, + { + "epoch": 0.5288753799392097, + "grad_norm": 2.210514545440674, + "learning_rate": 4.786412030053721e-06, + "loss": 0.4842875003814697, + "mean_token_accuracy": 0.8508446216583252, + "num_tokens": 6334898.0, + "step": 696 + }, + { + "epoch": 0.5296352583586627, + "grad_norm": 1.8678946495056152, + "learning_rate": 4.785564171969503e-06, + "loss": 0.47399595379829407, + "mean_token_accuracy": 0.8514996767044067, + "num_tokens": 6346374.0, + "step": 697 + }, + { + "epoch": 0.5303951367781155, + "grad_norm": 2.604079484939575, + "learning_rate": 4.784714709788368e-06, + "loss": 0.5950228571891785, + "mean_token_accuracy": 0.7983481884002686, + "num_tokens": 6351648.0, + "step": 698 + }, + { + "epoch": 0.5311550151975684, + "grad_norm": 1.662381649017334, + "learning_rate": 4.783863644106502e-06, + "loss": 0.41616758704185486, + "mean_token_accuracy": 0.8554803133010864, + "num_tokens": 6360506.0, + "step": 699 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 1.6300342082977295, + "learning_rate": 4.783010975521216e-06, + "loss": 0.43029269576072693, + "mean_token_accuracy": 0.8443028926849365, + "num_tokens": 6370675.0, + "step": 700 + }, + { + "epoch": 0.5326747720364742, + "grad_norm": 1.731873869895935, + "learning_rate": 4.782156704630944e-06, + "loss": 0.4383814334869385, + "mean_token_accuracy": 0.8443183898925781, + "num_tokens": 6381803.0, + "step": 701 + }, + { + "epoch": 0.5334346504559271, + "grad_norm": 3.1788413524627686, + "learning_rate": 4.7813008320352475e-06, + "loss": 0.32194480299949646, + "mean_token_accuracy": 0.8870962858200073, + "num_tokens": 6389263.0, + "step": 702 + }, + { + "epoch": 0.53419452887538, + "grad_norm": 2.099513530731201, + "learning_rate": 4.78044335833481e-06, + "loss": 0.36962923407554626, + "mean_token_accuracy": 0.8661133646965027, + "num_tokens": 6395589.0, + "step": 703 + }, + { + "epoch": 0.5349544072948328, + "grad_norm": 1.4859435558319092, + "learning_rate": 4.77958428413144e-06, + "loss": 0.4619954824447632, + "mean_token_accuracy": 0.8438555002212524, + "num_tokens": 6407470.0, + "step": 704 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 1.2561073303222656, + "learning_rate": 4.7787236100280685e-06, + "loss": 0.3770977258682251, + "mean_token_accuracy": 0.8515733480453491, + "num_tokens": 6422888.0, + "step": 705 + }, + { + "epoch": 0.5364741641337386, + "grad_norm": 1.4455817937850952, + "learning_rate": 4.777861336628751e-06, + "loss": 0.46481069922447205, + "mean_token_accuracy": 0.8502002954483032, + "num_tokens": 6441266.0, + "step": 706 + }, + { + "epoch": 0.5372340425531915, + "grad_norm": 1.1387295722961426, + "learning_rate": 4.7769974645386616e-06, + "loss": 0.36964765191078186, + "mean_token_accuracy": 0.8719524145126343, + "num_tokens": 6463686.0, + "step": 707 + }, + { + "epoch": 0.5379939209726444, + "grad_norm": 1.7179663181304932, + "learning_rate": 4.776131994364102e-06, + "loss": 0.4231719970703125, + "mean_token_accuracy": 0.8416585922241211, + "num_tokens": 6472956.0, + "step": 708 + }, + { + "epoch": 0.5387537993920972, + "grad_norm": 1.6328502893447876, + "learning_rate": 4.775264926712489e-06, + "loss": 0.5836569666862488, + "mean_token_accuracy": 0.8039724230766296, + "num_tokens": 6485773.0, + "step": 709 + }, + { + "epoch": 0.5395136778115501, + "grad_norm": 1.8515360355377197, + "learning_rate": 4.774396262192368e-06, + "loss": 0.5477553009986877, + "mean_token_accuracy": 0.8136521577835083, + "num_tokens": 6496379.0, + "step": 710 + }, + { + "epoch": 0.540273556231003, + "grad_norm": 1.741858959197998, + "learning_rate": 4.7735260014133986e-06, + "loss": 0.4663267731666565, + "mean_token_accuracy": 0.8473691940307617, + "num_tokens": 6507652.0, + "step": 711 + }, + { + "epoch": 0.541033434650456, + "grad_norm": 1.7516659498214722, + "learning_rate": 4.772654144986364e-06, + "loss": 0.374914288520813, + "mean_token_accuracy": 0.8600220680236816, + "num_tokens": 6519030.0, + "step": 712 + }, + { + "epoch": 0.5417933130699089, + "grad_norm": 2.662343978881836, + "learning_rate": 4.7717806935231665e-06, + "loss": 0.4206875264644623, + "mean_token_accuracy": 0.8544126749038696, + "num_tokens": 6523669.0, + "step": 713 + }, + { + "epoch": 0.5425531914893617, + "grad_norm": 1.4088834524154663, + "learning_rate": 4.770905647636828e-06, + "loss": 0.5824331045150757, + "mean_token_accuracy": 0.7857901453971863, + "num_tokens": 6540560.0, + "step": 714 + }, + { + "epoch": 0.5433130699088146, + "grad_norm": 2.173656940460205, + "learning_rate": 4.77002900794149e-06, + "loss": 0.555023729801178, + "mean_token_accuracy": 0.8067290782928467, + "num_tokens": 6548946.0, + "step": 715 + }, + { + "epoch": 0.5440729483282675, + "grad_norm": 2.121018648147583, + "learning_rate": 4.769150775052411e-06, + "loss": 0.559730052947998, + "mean_token_accuracy": 0.8166372776031494, + "num_tokens": 6556065.0, + "step": 716 + }, + { + "epoch": 0.5448328267477204, + "grad_norm": 3.335866928100586, + "learning_rate": 4.768270949585968e-06, + "loss": 0.6442267894744873, + "mean_token_accuracy": 0.7858607769012451, + "num_tokens": 6560615.0, + "step": 717 + }, + { + "epoch": 0.5455927051671733, + "grad_norm": 2.3813695907592773, + "learning_rate": 4.767389532159659e-06, + "loss": 0.4027421474456787, + "mean_token_accuracy": 0.8635619282722473, + "num_tokens": 6565841.0, + "step": 718 + }, + { + "epoch": 0.5463525835866262, + "grad_norm": 2.0657708644866943, + "learning_rate": 4.766506523392095e-06, + "loss": 0.38899827003479004, + "mean_token_accuracy": 0.8660480380058289, + "num_tokens": 6572362.0, + "step": 719 + }, + { + "epoch": 0.547112462006079, + "grad_norm": 1.093705415725708, + "learning_rate": 4.765621923903005e-06, + "loss": 0.45967352390289307, + "mean_token_accuracy": 0.8338102102279663, + "num_tokens": 6595998.0, + "step": 720 + }, + { + "epoch": 0.5478723404255319, + "grad_norm": 2.942065954208374, + "learning_rate": 4.764735734313236e-06, + "loss": 0.42910510301589966, + "mean_token_accuracy": 0.8406122922897339, + "num_tokens": 6601075.0, + "step": 721 + }, + { + "epoch": 0.5486322188449848, + "grad_norm": 2.049011707305908, + "learning_rate": 4.763847955244749e-06, + "loss": 0.5584231615066528, + "mean_token_accuracy": 0.8171684741973877, + "num_tokens": 6609310.0, + "step": 722 + }, + { + "epoch": 0.5493920972644377, + "grad_norm": 2.485543966293335, + "learning_rate": 4.762958587320623e-06, + "loss": 0.5396170020103455, + "mean_token_accuracy": 0.8158525824546814, + "num_tokens": 6616185.0, + "step": 723 + }, + { + "epoch": 0.5501519756838906, + "grad_norm": 1.87015962600708, + "learning_rate": 4.762067631165049e-06, + "loss": 0.49739527702331543, + "mean_token_accuracy": 0.8303765654563904, + "num_tokens": 6625629.0, + "step": 724 + }, + { + "epoch": 0.5509118541033434, + "grad_norm": 4.239654541015625, + "learning_rate": 4.761175087403336e-06, + "loss": 0.6029239296913147, + "mean_token_accuracy": 0.8123486042022705, + "num_tokens": 6629194.0, + "step": 725 + }, + { + "epoch": 0.5516717325227963, + "grad_norm": 2.0134730339050293, + "learning_rate": 4.760280956661904e-06, + "loss": 0.4777873754501343, + "mean_token_accuracy": 0.8283513784408569, + "num_tokens": 6636929.0, + "step": 726 + }, + { + "epoch": 0.5524316109422492, + "grad_norm": 1.991780400276184, + "learning_rate": 4.75938523956829e-06, + "loss": 0.4631248116493225, + "mean_token_accuracy": 0.8275107741355896, + "num_tokens": 6645135.0, + "step": 727 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 1.423792839050293, + "learning_rate": 4.75848793675114e-06, + "loss": 0.49630722403526306, + "mean_token_accuracy": 0.8388000130653381, + "num_tokens": 6662690.0, + "step": 728 + }, + { + "epoch": 0.5539513677811551, + "grad_norm": 2.345294952392578, + "learning_rate": 4.757589048840219e-06, + "loss": 0.37830638885498047, + "mean_token_accuracy": 0.8782080411911011, + "num_tokens": 6667285.0, + "step": 729 + }, + { + "epoch": 0.5547112462006079, + "grad_norm": 2.7452144622802734, + "learning_rate": 4.756688576466398e-06, + "loss": 0.51595538854599, + "mean_token_accuracy": 0.8441770672798157, + "num_tokens": 6672324.0, + "step": 730 + }, + { + "epoch": 0.5554711246200608, + "grad_norm": 1.5247859954833984, + "learning_rate": 4.755786520261666e-06, + "loss": 0.48365193605422974, + "mean_token_accuracy": 0.8276445269584656, + "num_tokens": 6685296.0, + "step": 731 + }, + { + "epoch": 0.5562310030395137, + "grad_norm": 1.4018276929855347, + "learning_rate": 4.75488288085912e-06, + "loss": 0.3876481354236603, + "mean_token_accuracy": 0.8612343072891235, + "num_tokens": 6697515.0, + "step": 732 + }, + { + "epoch": 0.5569908814589666, + "grad_norm": 2.9570324420928955, + "learning_rate": 4.753977658892967e-06, + "loss": 0.5468149185180664, + "mean_token_accuracy": 0.8054271340370178, + "num_tokens": 6702194.0, + "step": 733 + }, + { + "epoch": 0.5577507598784195, + "grad_norm": 1.9282715320587158, + "learning_rate": 4.753070854998529e-06, + "loss": 0.4758574962615967, + "mean_token_accuracy": 0.8379775285720825, + "num_tokens": 6709938.0, + "step": 734 + }, + { + "epoch": 0.5585106382978723, + "grad_norm": 1.981264591217041, + "learning_rate": 4.752162469812234e-06, + "loss": 0.48461222648620605, + "mean_token_accuracy": 0.833509087562561, + "num_tokens": 6718125.0, + "step": 735 + }, + { + "epoch": 0.5592705167173252, + "grad_norm": 1.1643427610397339, + "learning_rate": 4.751252503971624e-06, + "loss": 0.410121887922287, + "mean_token_accuracy": 0.8221402764320374, + "num_tokens": 6735125.0, + "step": 736 + }, + { + "epoch": 0.5600303951367781, + "grad_norm": 1.786566972732544, + "learning_rate": 4.750340958115346e-06, + "loss": 0.5964341163635254, + "mean_token_accuracy": 0.8038164377212524, + "num_tokens": 6747369.0, + "step": 737 + }, + { + "epoch": 0.560790273556231, + "grad_norm": 1.7256991863250732, + "learning_rate": 4.749427832883158e-06, + "loss": 0.48737066984176636, + "mean_token_accuracy": 0.830894947052002, + "num_tokens": 6758115.0, + "step": 738 + }, + { + "epoch": 0.5615501519756839, + "grad_norm": 1.997747540473938, + "learning_rate": 4.748513128915928e-06, + "loss": 0.5238886475563049, + "mean_token_accuracy": 0.8066858053207397, + "num_tokens": 6766111.0, + "step": 739 + }, + { + "epoch": 0.5623100303951368, + "grad_norm": 2.127016305923462, + "learning_rate": 4.747596846855629e-06, + "loss": 0.5045586228370667, + "mean_token_accuracy": 0.821424126625061, + "num_tokens": 6772893.0, + "step": 740 + }, + { + "epoch": 0.5630699088145896, + "grad_norm": 1.7664796113967896, + "learning_rate": 4.7466789873453446e-06, + "loss": 0.42954835295677185, + "mean_token_accuracy": 0.8533384799957275, + "num_tokens": 6785133.0, + "step": 741 + }, + { + "epoch": 0.5638297872340425, + "grad_norm": 1.4987404346466064, + "learning_rate": 4.7457595510292615e-06, + "loss": 0.5378558039665222, + "mean_token_accuracy": 0.8184819221496582, + "num_tokens": 6799563.0, + "step": 742 + }, + { + "epoch": 0.5645896656534954, + "grad_norm": 1.4444655179977417, + "learning_rate": 4.744838538552678e-06, + "loss": 0.42193782329559326, + "mean_token_accuracy": 0.837514340877533, + "num_tokens": 6812470.0, + "step": 743 + }, + { + "epoch": 0.5653495440729484, + "grad_norm": 3.867751121520996, + "learning_rate": 4.7439159505619946e-06, + "loss": 0.4457814693450928, + "mean_token_accuracy": 0.8630104660987854, + "num_tokens": 6815652.0, + "step": 744 + }, + { + "epoch": 0.5661094224924013, + "grad_norm": 2.1250710487365723, + "learning_rate": 4.74299178770472e-06, + "loss": 0.5638922452926636, + "mean_token_accuracy": 0.7969781160354614, + "num_tokens": 6824566.0, + "step": 745 + }, + { + "epoch": 0.5668693009118541, + "grad_norm": 2.547072410583496, + "learning_rate": 4.742066050629465e-06, + "loss": 0.5516207814216614, + "mean_token_accuracy": 0.8160669803619385, + "num_tokens": 6830589.0, + "step": 746 + }, + { + "epoch": 0.567629179331307, + "grad_norm": 1.2975233793258667, + "learning_rate": 4.741138739985951e-06, + "loss": 0.3823344111442566, + "mean_token_accuracy": 0.8668368458747864, + "num_tokens": 6842707.0, + "step": 747 + }, + { + "epoch": 0.5683890577507599, + "grad_norm": 1.3410450220108032, + "learning_rate": 4.740209856424998e-06, + "loss": 0.5148671269416809, + "mean_token_accuracy": 0.8188045024871826, + "num_tokens": 6857624.0, + "step": 748 + }, + { + "epoch": 0.5691489361702128, + "grad_norm": 1.219467282295227, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.3998957872390747, + "mean_token_accuracy": 0.855175256729126, + "num_tokens": 6875064.0, + "step": 749 + }, + { + "epoch": 0.5699088145896657, + "grad_norm": 1.3530343770980835, + "learning_rate": 4.738347373159585e-06, + "loss": 0.5359633564949036, + "mean_token_accuracy": 0.8178457021713257, + "num_tokens": 6890911.0, + "step": 750 + }, + { + "epoch": 0.5706686930091185, + "grad_norm": 2.146988868713379, + "learning_rate": 4.737413774762287e-06, + "loss": 0.4460008144378662, + "mean_token_accuracy": 0.8172903060913086, + "num_tokens": 6896959.0, + "step": 751 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.456023097038269, + "learning_rate": 4.736478606061876e-06, + "loss": 0.43616920709609985, + "mean_token_accuracy": 0.8465108871459961, + "num_tokens": 6908904.0, + "step": 752 + }, + { + "epoch": 0.5721884498480243, + "grad_norm": 2.9696967601776123, + "learning_rate": 4.735541867714687e-06, + "loss": 0.43464532494544983, + "mean_token_accuracy": 0.8608652353286743, + "num_tokens": 6913026.0, + "step": 753 + }, + { + "epoch": 0.5729483282674772, + "grad_norm": 2.2990667819976807, + "learning_rate": 4.73460356037816e-06, + "loss": 0.6619116067886353, + "mean_token_accuracy": 0.7821142673492432, + "num_tokens": 6920588.0, + "step": 754 + }, + { + "epoch": 0.5737082066869301, + "grad_norm": 2.054746389389038, + "learning_rate": 4.733663684710835e-06, + "loss": 0.5304250717163086, + "mean_token_accuracy": 0.8265531063079834, + "num_tokens": 6928910.0, + "step": 755 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.0050594806671143, + "learning_rate": 4.732722241372354e-06, + "loss": 0.6393026113510132, + "mean_token_accuracy": 0.796819806098938, + "num_tokens": 6940217.0, + "step": 756 + }, + { + "epoch": 0.5752279635258358, + "grad_norm": 1.4285320043563843, + "learning_rate": 4.731779231023456e-06, + "loss": 0.5432837009429932, + "mean_token_accuracy": 0.8104778528213501, + "num_tokens": 6959101.0, + "step": 757 + }, + { + "epoch": 0.5759878419452887, + "grad_norm": 2.3941943645477295, + "learning_rate": 4.730834654325984e-06, + "loss": 0.46550673246383667, + "mean_token_accuracy": 0.8444503545761108, + "num_tokens": 6965036.0, + "step": 758 + }, + { + "epoch": 0.5767477203647416, + "grad_norm": 2.3850574493408203, + "learning_rate": 4.729888511942877e-06, + "loss": 0.4916389584541321, + "mean_token_accuracy": 0.8228527307510376, + "num_tokens": 6971184.0, + "step": 759 + }, + { + "epoch": 0.5775075987841946, + "grad_norm": 1.627480149269104, + "learning_rate": 4.728940804538176e-06, + "loss": 0.5863215923309326, + "mean_token_accuracy": 0.7995302677154541, + "num_tokens": 6982569.0, + "step": 760 + }, + { + "epoch": 0.5782674772036475, + "grad_norm": 1.1723195314407349, + "learning_rate": 4.727991532777016e-06, + "loss": 0.36908864974975586, + "mean_token_accuracy": 0.8355655670166016, + "num_tokens": 6998659.0, + "step": 761 + }, + { + "epoch": 0.5790273556231003, + "grad_norm": 1.5324925184249878, + "learning_rate": 4.727040697325634e-06, + "loss": 0.557658851146698, + "mean_token_accuracy": 0.8141458034515381, + "num_tokens": 7012969.0, + "step": 762 + }, + { + "epoch": 0.5797872340425532, + "grad_norm": 2.4106390476226807, + "learning_rate": 4.726088298851362e-06, + "loss": 0.5004243850708008, + "mean_token_accuracy": 0.8376860618591309, + "num_tokens": 7018301.0, + "step": 763 + }, + { + "epoch": 0.5805471124620061, + "grad_norm": 2.2594921588897705, + "learning_rate": 4.725134338022631e-06, + "loss": 0.6067016124725342, + "mean_token_accuracy": 0.8100241422653198, + "num_tokens": 7025201.0, + "step": 764 + }, + { + "epoch": 0.581306990881459, + "grad_norm": 1.4649826288223267, + "learning_rate": 4.724178815508967e-06, + "loss": 0.36200693249702454, + "mean_token_accuracy": 0.8621826171875, + "num_tokens": 7035112.0, + "step": 765 + }, + { + "epoch": 0.5820668693009119, + "grad_norm": 2.3634560108184814, + "learning_rate": 4.723221731980993e-06, + "loss": 0.41862213611602783, + "mean_token_accuracy": 0.8541463613510132, + "num_tokens": 7040339.0, + "step": 766 + }, + { + "epoch": 0.5828267477203647, + "grad_norm": 2.7798104286193848, + "learning_rate": 4.722263088110426e-06, + "loss": 0.4647108018398285, + "mean_token_accuracy": 0.8505672216415405, + "num_tokens": 7044880.0, + "step": 767 + }, + { + "epoch": 0.5835866261398176, + "grad_norm": 2.070528507232666, + "learning_rate": 4.721302884570079e-06, + "loss": 0.5147565007209778, + "mean_token_accuracy": 0.8113877773284912, + "num_tokens": 7052433.0, + "step": 768 + }, + { + "epoch": 0.5843465045592705, + "grad_norm": 2.1953284740448, + "learning_rate": 4.720341122033862e-06, + "loss": 0.5075466632843018, + "mean_token_accuracy": 0.8474211096763611, + "num_tokens": 7058686.0, + "step": 769 + }, + { + "epoch": 0.5851063829787234, + "grad_norm": 1.9287755489349365, + "learning_rate": 4.719377801176774e-06, + "loss": 0.5382202863693237, + "mean_token_accuracy": 0.8148090243339539, + "num_tokens": 7067538.0, + "step": 770 + }, + { + "epoch": 0.5858662613981763, + "grad_norm": 1.5574456453323364, + "learning_rate": 4.718412922674913e-06, + "loss": 0.43406790494918823, + "mean_token_accuracy": 0.8477081060409546, + "num_tokens": 7077853.0, + "step": 771 + }, + { + "epoch": 0.5866261398176292, + "grad_norm": 1.5490336418151855, + "learning_rate": 4.717446487205466e-06, + "loss": 0.43164271116256714, + "mean_token_accuracy": 0.8504570126533508, + "num_tokens": 7091728.0, + "step": 772 + }, + { + "epoch": 0.587386018237082, + "grad_norm": 1.6945984363555908, + "learning_rate": 4.716478495446717e-06, + "loss": 0.5153743624687195, + "mean_token_accuracy": 0.8213579058647156, + "num_tokens": 7108680.0, + "step": 773 + }, + { + "epoch": 0.5881458966565349, + "grad_norm": 2.2633883953094482, + "learning_rate": 4.715508948078037e-06, + "loss": 0.45254790782928467, + "mean_token_accuracy": 0.8392219543457031, + "num_tokens": 7115546.0, + "step": 774 + }, + { + "epoch": 0.5889057750759878, + "grad_norm": 1.5731090307235718, + "learning_rate": 4.714537845779894e-06, + "loss": 0.38678881525993347, + "mean_token_accuracy": 0.8800252676010132, + "num_tokens": 7126360.0, + "step": 775 + }, + { + "epoch": 0.5896656534954408, + "grad_norm": 2.4873392581939697, + "learning_rate": 4.7135651892338445e-06, + "loss": 0.5190927386283875, + "mean_token_accuracy": 0.8145407438278198, + "num_tokens": 7135705.0, + "step": 776 + }, + { + "epoch": 0.5904255319148937, + "grad_norm": 1.2931004762649536, + "learning_rate": 4.712590979122534e-06, + "loss": 0.3686544895172119, + "mean_token_accuracy": 0.8720537424087524, + "num_tokens": 7150688.0, + "step": 777 + }, + { + "epoch": 0.5911854103343465, + "grad_norm": 1.6353671550750732, + "learning_rate": 4.7116152161297045e-06, + "loss": 0.49065062403678894, + "mean_token_accuracy": 0.8203760385513306, + "num_tokens": 7161040.0, + "step": 778 + }, + { + "epoch": 0.5919452887537994, + "grad_norm": 1.2345483303070068, + "learning_rate": 4.710637900940181e-06, + "loss": 0.4004976451396942, + "mean_token_accuracy": 0.8302007913589478, + "num_tokens": 7178074.0, + "step": 779 + }, + { + "epoch": 0.5927051671732523, + "grad_norm": 2.2506837844848633, + "learning_rate": 4.7096590342398825e-06, + "loss": 0.45142874121665955, + "mean_token_accuracy": 0.8481036424636841, + "num_tokens": 7184153.0, + "step": 780 + }, + { + "epoch": 0.5934650455927052, + "grad_norm": 1.420479416847229, + "learning_rate": 4.708678616715815e-06, + "loss": 0.4802100360393524, + "mean_token_accuracy": 0.8586992025375366, + "num_tokens": 7202810.0, + "step": 781 + }, + { + "epoch": 0.5942249240121581, + "grad_norm": 3.457632303237915, + "learning_rate": 4.707696649056073e-06, + "loss": 0.5265094041824341, + "mean_token_accuracy": 0.8260114192962646, + "num_tokens": 7206396.0, + "step": 782 + }, + { + "epoch": 0.5949848024316109, + "grad_norm": 1.1592093706130981, + "learning_rate": 4.706713131949839e-06, + "loss": 0.3708173632621765, + "mean_token_accuracy": 0.8476542234420776, + "num_tokens": 7225034.0, + "step": 783 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 1.6761400699615479, + "learning_rate": 4.705728066087384e-06, + "loss": 0.4137252867221832, + "mean_token_accuracy": 0.8462049961090088, + "num_tokens": 7237101.0, + "step": 784 + }, + { + "epoch": 0.5965045592705167, + "grad_norm": 2.320185422897339, + "learning_rate": 4.704741452160064e-06, + "loss": 0.5157154202461243, + "mean_token_accuracy": 0.8391785621643066, + "num_tokens": 7243826.0, + "step": 785 + }, + { + "epoch": 0.5972644376899696, + "grad_norm": 2.079423427581787, + "learning_rate": 4.703753290860323e-06, + "loss": 0.4734993278980255, + "mean_token_accuracy": 0.8353281021118164, + "num_tokens": 7250175.0, + "step": 786 + }, + { + "epoch": 0.5980243161094225, + "grad_norm": 1.8215159177780151, + "learning_rate": 4.702763582881692e-06, + "loss": 0.520193338394165, + "mean_token_accuracy": 0.844062864780426, + "num_tokens": 7258868.0, + "step": 787 + }, + { + "epoch": 0.5987841945288754, + "grad_norm": 1.3823071718215942, + "learning_rate": 4.701772328918784e-06, + "loss": 0.4177844822406769, + "mean_token_accuracy": 0.8363165259361267, + "num_tokens": 7271744.0, + "step": 788 + }, + { + "epoch": 0.5995440729483282, + "grad_norm": 2.4749298095703125, + "learning_rate": 4.700779529667301e-06, + "loss": 0.5115069150924683, + "mean_token_accuracy": 0.8473520278930664, + "num_tokens": 7277040.0, + "step": 789 + }, + { + "epoch": 0.6003039513677811, + "grad_norm": 1.7072296142578125, + "learning_rate": 4.699785185824026e-06, + "loss": 0.5265800952911377, + "mean_token_accuracy": 0.8161447048187256, + "num_tokens": 7288288.0, + "step": 790 + }, + { + "epoch": 0.601063829787234, + "grad_norm": 1.6479384899139404, + "learning_rate": 4.69878929808683e-06, + "loss": 0.4445168972015381, + "mean_token_accuracy": 0.8381255865097046, + "num_tokens": 7298640.0, + "step": 791 + }, + { + "epoch": 0.601823708206687, + "grad_norm": 1.9095896482467651, + "learning_rate": 4.6977918671546635e-06, + "loss": 0.5841238498687744, + "mean_token_accuracy": 0.7971454858779907, + "num_tokens": 7307220.0, + "step": 792 + }, + { + "epoch": 0.6025835866261399, + "grad_norm": 1.9614146947860718, + "learning_rate": 4.696792893727562e-06, + "loss": 0.34684082865715027, + "mean_token_accuracy": 0.8739526271820068, + "num_tokens": 7313875.0, + "step": 793 + }, + { + "epoch": 0.6033434650455927, + "grad_norm": 2.015570640563965, + "learning_rate": 4.695792378506645e-06, + "loss": 0.42779117822647095, + "mean_token_accuracy": 0.8625012636184692, + "num_tokens": 7321439.0, + "step": 794 + }, + { + "epoch": 0.6041033434650456, + "grad_norm": 2.8581228256225586, + "learning_rate": 4.694790322194111e-06, + "loss": 0.6519991159439087, + "mean_token_accuracy": 0.7629562616348267, + "num_tokens": 7326916.0, + "step": 795 + }, + { + "epoch": 0.6048632218844985, + "grad_norm": 2.482715368270874, + "learning_rate": 4.693786725493242e-06, + "loss": 0.532963216304779, + "mean_token_accuracy": 0.832184910774231, + "num_tokens": 7333311.0, + "step": 796 + }, + { + "epoch": 0.6056231003039514, + "grad_norm": 1.6076741218566895, + "learning_rate": 4.692781589108402e-06, + "loss": 0.43381205201148987, + "mean_token_accuracy": 0.8402494192123413, + "num_tokens": 7343731.0, + "step": 797 + }, + { + "epoch": 0.6063829787234043, + "grad_norm": 2.2133216857910156, + "learning_rate": 4.691774913745033e-06, + "loss": 0.4380851089954376, + "mean_token_accuracy": 0.8600908517837524, + "num_tokens": 7350224.0, + "step": 798 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 2.046280860900879, + "learning_rate": 4.690766700109659e-06, + "loss": 0.3821919560432434, + "mean_token_accuracy": 0.8691814541816711, + "num_tokens": 7356717.0, + "step": 799 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 1.8482693433761597, + "learning_rate": 4.689756948909884e-06, + "loss": 0.5217651128768921, + "mean_token_accuracy": 0.803473711013794, + "num_tokens": 7365806.0, + "step": 800 + }, + { + "epoch": 0.6086626139817629, + "grad_norm": 2.192134141921997, + "learning_rate": 4.688745660854388e-06, + "loss": 0.573980987071991, + "mean_token_accuracy": 0.8198676109313965, + "num_tokens": 7380281.0, + "step": 801 + }, + { + "epoch": 0.6094224924012158, + "grad_norm": 2.363626718521118, + "learning_rate": 4.687732836652935e-06, + "loss": 0.5204599499702454, + "mean_token_accuracy": 0.8373252153396606, + "num_tokens": 7386938.0, + "step": 802 + }, + { + "epoch": 0.6101823708206687, + "grad_norm": 1.9320523738861084, + "learning_rate": 4.686718477016361e-06, + "loss": 0.47316622734069824, + "mean_token_accuracy": 0.830596923828125, + "num_tokens": 7395069.0, + "step": 803 + }, + { + "epoch": 0.6109422492401215, + "grad_norm": 2.6573057174682617, + "learning_rate": 4.6857025826565845e-06, + "loss": 0.5495861768722534, + "mean_token_accuracy": 0.8187421560287476, + "num_tokens": 7400563.0, + "step": 804 + }, + { + "epoch": 0.6117021276595744, + "grad_norm": 2.0893123149871826, + "learning_rate": 4.684685154286599e-06, + "loss": 0.5362675786018372, + "mean_token_accuracy": 0.8394701480865479, + "num_tokens": 7406973.0, + "step": 805 + }, + { + "epoch": 0.6124620060790273, + "grad_norm": 2.455130100250244, + "learning_rate": 4.683666192620474e-06, + "loss": 0.5405995845794678, + "mean_token_accuracy": 0.8079100847244263, + "num_tokens": 7412931.0, + "step": 806 + }, + { + "epoch": 0.6132218844984803, + "grad_norm": 2.311915636062622, + "learning_rate": 4.682645698373357e-06, + "loss": 0.5395106077194214, + "mean_token_accuracy": 0.8156260251998901, + "num_tokens": 7419699.0, + "step": 807 + }, + { + "epoch": 0.6139817629179332, + "grad_norm": 1.686838984489441, + "learning_rate": 4.6816236722614694e-06, + "loss": 0.6034521460533142, + "mean_token_accuracy": 0.7855954170227051, + "num_tokens": 7431899.0, + "step": 808 + }, + { + "epoch": 0.6147416413373861, + "grad_norm": 1.682759165763855, + "learning_rate": 4.680600115002109e-06, + "loss": 0.48593831062316895, + "mean_token_accuracy": 0.8229435682296753, + "num_tokens": 7443187.0, + "step": 809 + }, + { + "epoch": 0.6155015197568389, + "grad_norm": 2.064589738845825, + "learning_rate": 4.679575027313649e-06, + "loss": 0.5098468661308289, + "mean_token_accuracy": 0.8234638571739197, + "num_tokens": 7450868.0, + "step": 810 + }, + { + "epoch": 0.6162613981762918, + "grad_norm": 2.2063486576080322, + "learning_rate": 4.6785484099155324e-06, + "loss": 0.5138497352600098, + "mean_token_accuracy": 0.8152111172676086, + "num_tokens": 7457176.0, + "step": 811 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 1.6258726119995117, + "learning_rate": 4.67752026352828e-06, + "loss": 0.4064181447029114, + "mean_token_accuracy": 0.8720619678497314, + "num_tokens": 7466557.0, + "step": 812 + }, + { + "epoch": 0.6177811550151976, + "grad_norm": 2.3309383392333984, + "learning_rate": 4.676490588873486e-06, + "loss": 0.5180112719535828, + "mean_token_accuracy": 0.8233879804611206, + "num_tokens": 7472650.0, + "step": 813 + }, + { + "epoch": 0.6185410334346505, + "grad_norm": 1.4545246362686157, + "learning_rate": 4.675459386673815e-06, + "loss": 0.37917959690093994, + "mean_token_accuracy": 0.8598103523254395, + "num_tokens": 7485171.0, + "step": 814 + }, + { + "epoch": 0.6193009118541033, + "grad_norm": 2.654231071472168, + "learning_rate": 4.674426657653003e-06, + "loss": 0.554074227809906, + "mean_token_accuracy": 0.8026446104049683, + "num_tokens": 7490787.0, + "step": 815 + }, + { + "epoch": 0.6200607902735562, + "grad_norm": 1.5543994903564453, + "learning_rate": 4.67339240253586e-06, + "loss": 0.6335440278053284, + "mean_token_accuracy": 0.783241868019104, + "num_tokens": 7505975.0, + "step": 816 + }, + { + "epoch": 0.6208206686930091, + "grad_norm": 2.079998016357422, + "learning_rate": 4.672356622048266e-06, + "loss": 0.5169394016265869, + "mean_token_accuracy": 0.8088761568069458, + "num_tokens": 7513470.0, + "step": 817 + }, + { + "epoch": 0.621580547112462, + "grad_norm": 1.5971896648406982, + "learning_rate": 4.671319316917172e-06, + "loss": 0.44588586688041687, + "mean_token_accuracy": 0.8518649339675903, + "num_tokens": 7524352.0, + "step": 818 + }, + { + "epoch": 0.6223404255319149, + "grad_norm": 2.477579116821289, + "learning_rate": 4.670280487870599e-06, + "loss": 0.5713893175125122, + "mean_token_accuracy": 0.8116940259933472, + "num_tokens": 7530359.0, + "step": 819 + }, + { + "epoch": 0.6231003039513677, + "grad_norm": 2.066211700439453, + "learning_rate": 4.669240135637635e-06, + "loss": 0.5295331478118896, + "mean_token_accuracy": 0.819536566734314, + "num_tokens": 7536963.0, + "step": 820 + }, + { + "epoch": 0.6238601823708206, + "grad_norm": 2.1217997074127197, + "learning_rate": 4.668198260948442e-06, + "loss": 0.6146406531333923, + "mean_token_accuracy": 0.7932635545730591, + "num_tokens": 7545800.0, + "step": 821 + }, + { + "epoch": 0.6246200607902735, + "grad_norm": 2.0173542499542236, + "learning_rate": 4.667154864534245e-06, + "loss": 0.6240535974502563, + "mean_token_accuracy": 0.7883644104003906, + "num_tokens": 7556165.0, + "step": 822 + }, + { + "epoch": 0.6253799392097265, + "grad_norm": 2.014526128768921, + "learning_rate": 4.666109947127343e-06, + "loss": 0.40367332100868225, + "mean_token_accuracy": 0.8653522729873657, + "num_tokens": 7562665.0, + "step": 823 + }, + { + "epoch": 0.6261398176291794, + "grad_norm": 2.5078861713409424, + "learning_rate": 4.665063509461098e-06, + "loss": 0.5903617739677429, + "mean_token_accuracy": 0.7902897596359253, + "num_tokens": 7568922.0, + "step": 824 + }, + { + "epoch": 0.6268996960486323, + "grad_norm": 2.454622745513916, + "learning_rate": 4.664015552269938e-06, + "loss": 0.5238361358642578, + "mean_token_accuracy": 0.838546872138977, + "num_tokens": 7575965.0, + "step": 825 + }, + { + "epoch": 0.6276595744680851, + "grad_norm": 2.920919418334961, + "learning_rate": 4.662966076289363e-06, + "loss": 0.5028782486915588, + "mean_token_accuracy": 0.8311152458190918, + "num_tokens": 7580193.0, + "step": 826 + }, + { + "epoch": 0.628419452887538, + "grad_norm": 1.545382022857666, + "learning_rate": 4.661915082255932e-06, + "loss": 0.4817378520965576, + "mean_token_accuracy": 0.8373227119445801, + "num_tokens": 7593024.0, + "step": 827 + }, + { + "epoch": 0.6291793313069909, + "grad_norm": 1.5152469873428345, + "learning_rate": 4.6608625709072766e-06, + "loss": 0.4693033695220947, + "mean_token_accuracy": 0.8150848150253296, + "num_tokens": 7606459.0, + "step": 828 + }, + { + "epoch": 0.6299392097264438, + "grad_norm": 2.1310224533081055, + "learning_rate": 4.659808542982089e-06, + "loss": 0.4653395414352417, + "mean_token_accuracy": 0.8286294341087341, + "num_tokens": 7613036.0, + "step": 829 + }, + { + "epoch": 0.6306990881458967, + "grad_norm": 2.1949679851531982, + "learning_rate": 4.658752999220125e-06, + "loss": 0.3698633909225464, + "mean_token_accuracy": 0.871590793132782, + "num_tokens": 7618527.0, + "step": 830 + }, + { + "epoch": 0.6314589665653495, + "grad_norm": 2.2770416736602783, + "learning_rate": 4.657695940362207e-06, + "loss": 0.5202419757843018, + "mean_token_accuracy": 0.817577600479126, + "num_tokens": 7624459.0, + "step": 831 + }, + { + "epoch": 0.6322188449848024, + "grad_norm": 1.402042269706726, + "learning_rate": 4.65663736715022e-06, + "loss": 0.51531583070755, + "mean_token_accuracy": 0.8228116631507874, + "num_tokens": 7639371.0, + "step": 832 + }, + { + "epoch": 0.6329787234042553, + "grad_norm": 3.3554883003234863, + "learning_rate": 4.65557728032711e-06, + "loss": 0.6771188378334045, + "mean_token_accuracy": 0.7880028486251831, + "num_tokens": 7643924.0, + "step": 833 + }, + { + "epoch": 0.6337386018237082, + "grad_norm": 2.081040143966675, + "learning_rate": 4.654515680636888e-06, + "loss": 0.5712796449661255, + "mean_token_accuracy": 0.8177868127822876, + "num_tokens": 7651881.0, + "step": 834 + }, + { + "epoch": 0.6344984802431611, + "grad_norm": 0.9128716588020325, + "learning_rate": 4.653452568824625e-06, + "loss": 0.3423936069011688, + "mean_token_accuracy": 0.8782886266708374, + "num_tokens": 7677829.0, + "step": 835 + }, + { + "epoch": 0.6352583586626139, + "grad_norm": 3.49015736579895, + "learning_rate": 4.652387945636454e-06, + "loss": 0.34657734632492065, + "mean_token_accuracy": 0.8770567178726196, + "num_tokens": 7680796.0, + "step": 836 + }, + { + "epoch": 0.6360182370820668, + "grad_norm": 2.026247501373291, + "learning_rate": 4.651321811819568e-06, + "loss": 0.5098431706428528, + "mean_token_accuracy": 0.8216961622238159, + "num_tokens": 7688746.0, + "step": 837 + }, + { + "epoch": 0.6367781155015197, + "grad_norm": 2.444343090057373, + "learning_rate": 4.650254168122222e-06, + "loss": 0.5490090250968933, + "mean_token_accuracy": 0.8092857599258423, + "num_tokens": 7695220.0, + "step": 838 + }, + { + "epoch": 0.6375379939209727, + "grad_norm": 2.0171122550964355, + "learning_rate": 4.649185015293728e-06, + "loss": 0.47221142053604126, + "mean_token_accuracy": 0.8514408469200134, + "num_tokens": 7702759.0, + "step": 839 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 1.9800984859466553, + "learning_rate": 4.64811435408446e-06, + "loss": 0.5238803625106812, + "mean_token_accuracy": 0.8479194641113281, + "num_tokens": 7714017.0, + "step": 840 + }, + { + "epoch": 0.6390577507598785, + "grad_norm": 3.0674357414245605, + "learning_rate": 4.647042185245848e-06, + "loss": 0.4668245315551758, + "mean_token_accuracy": 0.8381714820861816, + "num_tokens": 7717801.0, + "step": 841 + }, + { + "epoch": 0.6398176291793313, + "grad_norm": 1.5672820806503296, + "learning_rate": 4.645968509530381e-06, + "loss": 0.4428741931915283, + "mean_token_accuracy": 0.8416479825973511, + "num_tokens": 7728342.0, + "step": 842 + }, + { + "epoch": 0.6405775075987842, + "grad_norm": 2.3042354583740234, + "learning_rate": 4.644893327691608e-06, + "loss": 0.49937760829925537, + "mean_token_accuracy": 0.827070951461792, + "num_tokens": 7734576.0, + "step": 843 + }, + { + "epoch": 0.6413373860182371, + "grad_norm": 2.057772159576416, + "learning_rate": 4.6438166404841316e-06, + "loss": 0.5912986993789673, + "mean_token_accuracy": 0.805509090423584, + "num_tokens": 7742481.0, + "step": 844 + }, + { + "epoch": 0.64209726443769, + "grad_norm": 1.9688186645507812, + "learning_rate": 4.6427384486636115e-06, + "loss": 0.482401967048645, + "mean_token_accuracy": 0.8358086347579956, + "num_tokens": 7750002.0, + "step": 845 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 2.6852948665618896, + "learning_rate": 4.6416587529867665e-06, + "loss": 0.5479315519332886, + "mean_token_accuracy": 0.8091106414794922, + "num_tokens": 7755578.0, + "step": 846 + }, + { + "epoch": 0.6436170212765957, + "grad_norm": 2.0547337532043457, + "learning_rate": 4.640577554211366e-06, + "loss": 0.5327274203300476, + "mean_token_accuracy": 0.8280376195907593, + "num_tokens": 7763513.0, + "step": 847 + }, + { + "epoch": 0.6443768996960486, + "grad_norm": 2.0328633785247803, + "learning_rate": 4.63949485309624e-06, + "loss": 0.4814409613609314, + "mean_token_accuracy": 0.8527672290802002, + "num_tokens": 7771131.0, + "step": 848 + }, + { + "epoch": 0.6451367781155015, + "grad_norm": 1.5892863273620605, + "learning_rate": 4.638410650401267e-06, + "loss": 0.4492785334587097, + "mean_token_accuracy": 0.846997857093811, + "num_tokens": 7781572.0, + "step": 849 + }, + { + "epoch": 0.6458966565349544, + "grad_norm": 1.8295910358428955, + "learning_rate": 4.637324946887384e-06, + "loss": 0.37088239192962646, + "mean_token_accuracy": 0.8616628646850586, + "num_tokens": 7788604.0, + "step": 850 + }, + { + "epoch": 0.6466565349544073, + "grad_norm": 3.380040168762207, + "learning_rate": 4.636237743316578e-06, + "loss": 0.4737280607223511, + "mean_token_accuracy": 0.855940580368042, + "num_tokens": 7792504.0, + "step": 851 + }, + { + "epoch": 0.6474164133738601, + "grad_norm": 2.8790009021759033, + "learning_rate": 4.635149040451891e-06, + "loss": 0.39790448546409607, + "mean_token_accuracy": 0.8710698485374451, + "num_tokens": 7796333.0, + "step": 852 + }, + { + "epoch": 0.648176291793313, + "grad_norm": 1.914914608001709, + "learning_rate": 4.634058839057417e-06, + "loss": 0.2954312562942505, + "mean_token_accuracy": 0.8880234956741333, + "num_tokens": 7802456.0, + "step": 853 + }, + { + "epoch": 0.648936170212766, + "grad_norm": 1.3709120750427246, + "learning_rate": 4.632967139898301e-06, + "loss": 0.43224576115608215, + "mean_token_accuracy": 0.8446190357208252, + "num_tokens": 7816770.0, + "step": 854 + }, + { + "epoch": 0.6496960486322189, + "grad_norm": 1.6579312086105347, + "learning_rate": 4.63187394374074e-06, + "loss": 0.3535553514957428, + "mean_token_accuracy": 0.8738704919815063, + "num_tokens": 7824963.0, + "step": 855 + }, + { + "epoch": 0.6504559270516718, + "grad_norm": 2.4055678844451904, + "learning_rate": 4.63077925135198e-06, + "loss": 0.5078744292259216, + "mean_token_accuracy": 0.8430874347686768, + "num_tokens": 7830962.0, + "step": 856 + }, + { + "epoch": 0.6512158054711246, + "grad_norm": 2.5171499252319336, + "learning_rate": 4.629683063500319e-06, + "loss": 0.5172419548034668, + "mean_token_accuracy": 0.8087141513824463, + "num_tokens": 7836638.0, + "step": 857 + }, + { + "epoch": 0.6519756838905775, + "grad_norm": 1.7588486671447754, + "learning_rate": 4.628585380955104e-06, + "loss": 0.5759496092796326, + "mean_token_accuracy": 0.8043236136436462, + "num_tokens": 7844654.0, + "step": 858 + }, + { + "epoch": 0.6527355623100304, + "grad_norm": 1.5887070894241333, + "learning_rate": 4.62748620448673e-06, + "loss": 0.41849038004875183, + "mean_token_accuracy": 0.8556643724441528, + "num_tokens": 7855642.0, + "step": 859 + }, + { + "epoch": 0.6534954407294833, + "grad_norm": 3.227942705154419, + "learning_rate": 4.626385534866642e-06, + "loss": 0.5279449224472046, + "mean_token_accuracy": 0.8250958323478699, + "num_tokens": 7859890.0, + "step": 860 + }, + { + "epoch": 0.6542553191489362, + "grad_norm": 2.440467119216919, + "learning_rate": 4.625283372867333e-06, + "loss": 0.5294933319091797, + "mean_token_accuracy": 0.8235013484954834, + "num_tokens": 7866766.0, + "step": 861 + }, + { + "epoch": 0.6550151975683891, + "grad_norm": 2.4106903076171875, + "learning_rate": 4.624179719262342e-06, + "loss": 0.5662813186645508, + "mean_token_accuracy": 0.8061668872833252, + "num_tokens": 7872809.0, + "step": 862 + }, + { + "epoch": 0.6557750759878419, + "grad_norm": 3.5151145458221436, + "learning_rate": 4.623074574826254e-06, + "loss": 0.5471097230911255, + "mean_token_accuracy": 0.8220691084861755, + "num_tokens": 7876136.0, + "step": 863 + }, + { + "epoch": 0.6565349544072948, + "grad_norm": 1.5319840908050537, + "learning_rate": 4.621967940334705e-06, + "loss": 0.4178982377052307, + "mean_token_accuracy": 0.8517135977745056, + "num_tokens": 7886113.0, + "step": 864 + }, + { + "epoch": 0.6572948328267477, + "grad_norm": 1.63701331615448, + "learning_rate": 4.620859816564371e-06, + "loss": 0.4666512608528137, + "mean_token_accuracy": 0.8223508596420288, + "num_tokens": 7897982.0, + "step": 865 + }, + { + "epoch": 0.6580547112462006, + "grad_norm": 2.1515414714813232, + "learning_rate": 4.619750204292978e-06, + "loss": 0.5359305143356323, + "mean_token_accuracy": 0.8192868232727051, + "num_tokens": 7904947.0, + "step": 866 + }, + { + "epoch": 0.6588145896656535, + "grad_norm": 2.2140955924987793, + "learning_rate": 4.618639104299294e-06, + "loss": 0.5275633931159973, + "mean_token_accuracy": 0.8120715618133545, + "num_tokens": 7913913.0, + "step": 867 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 1.3956893682479858, + "learning_rate": 4.6175265173631304e-06, + "loss": 0.4378768503665924, + "mean_token_accuracy": 0.8479125499725342, + "num_tokens": 7927979.0, + "step": 868 + }, + { + "epoch": 0.6603343465045592, + "grad_norm": 2.98103928565979, + "learning_rate": 4.616412444265344e-06, + "loss": 0.42614591121673584, + "mean_token_accuracy": 0.8595094680786133, + "num_tokens": 7934293.0, + "step": 869 + }, + { + "epoch": 0.6610942249240122, + "grad_norm": 2.554845094680786, + "learning_rate": 4.6152968857878365e-06, + "loss": 0.3698030412197113, + "mean_token_accuracy": 0.8717041015625, + "num_tokens": 7938547.0, + "step": 870 + }, + { + "epoch": 0.6618541033434651, + "grad_norm": 3.0901825428009033, + "learning_rate": 4.6141798427135475e-06, + "loss": 0.5037497282028198, + "mean_token_accuracy": 0.8354041576385498, + "num_tokens": 7942829.0, + "step": 871 + }, + { + "epoch": 0.662613981762918, + "grad_norm": 2.8692073822021484, + "learning_rate": 4.6130613158264605e-06, + "loss": 0.5418164134025574, + "mean_token_accuracy": 0.8298909664154053, + "num_tokens": 7949303.0, + "step": 872 + }, + { + "epoch": 0.6633738601823708, + "grad_norm": 3.960404396057129, + "learning_rate": 4.611941305911602e-06, + "loss": 0.6284480094909668, + "mean_token_accuracy": 0.837495744228363, + "num_tokens": 7952486.0, + "step": 873 + }, + { + "epoch": 0.6641337386018237, + "grad_norm": 2.6690115928649902, + "learning_rate": 4.610819813755038e-06, + "loss": 0.5214360952377319, + "mean_token_accuracy": 0.8213508129119873, + "num_tokens": 7957559.0, + "step": 874 + }, + { + "epoch": 0.6648936170212766, + "grad_norm": 2.3376171588897705, + "learning_rate": 4.609696840143875e-06, + "loss": 0.46887528896331787, + "mean_token_accuracy": 0.8438819646835327, + "num_tokens": 7962826.0, + "step": 875 + }, + { + "epoch": 0.6656534954407295, + "grad_norm": 2.2222683429718018, + "learning_rate": 4.6085723858662575e-06, + "loss": 0.5607719421386719, + "mean_token_accuracy": 0.8128405809402466, + "num_tokens": 7970131.0, + "step": 876 + }, + { + "epoch": 0.6664133738601824, + "grad_norm": 2.069091558456421, + "learning_rate": 4.607446451711372e-06, + "loss": 0.506301760673523, + "mean_token_accuracy": 0.8256827592849731, + "num_tokens": 7977524.0, + "step": 877 + }, + { + "epoch": 0.6671732522796353, + "grad_norm": 1.3724967241287231, + "learning_rate": 4.606319038469443e-06, + "loss": 0.43285101652145386, + "mean_token_accuracy": 0.8525032997131348, + "num_tokens": 7989174.0, + "step": 878 + }, + { + "epoch": 0.6679331306990881, + "grad_norm": 2.278205156326294, + "learning_rate": 4.605190146931731e-06, + "loss": 0.4845905303955078, + "mean_token_accuracy": 0.8284652829170227, + "num_tokens": 7998524.0, + "step": 879 + }, + { + "epoch": 0.668693009118541, + "grad_norm": 1.3871766328811646, + "learning_rate": 4.604059777890537e-06, + "loss": 0.5736679434776306, + "mean_token_accuracy": 0.8223285675048828, + "num_tokens": 8015776.0, + "step": 880 + }, + { + "epoch": 0.6694528875379939, + "grad_norm": 1.926164984703064, + "learning_rate": 4.602927932139197e-06, + "loss": 0.4133230447769165, + "mean_token_accuracy": 0.8653768301010132, + "num_tokens": 8022979.0, + "step": 881 + }, + { + "epoch": 0.6702127659574468, + "grad_norm": 2.109272003173828, + "learning_rate": 4.601794610472083e-06, + "loss": 0.7005600929260254, + "mean_token_accuracy": 0.7777010202407837, + "num_tokens": 8032618.0, + "step": 882 + }, + { + "epoch": 0.6709726443768997, + "grad_norm": 2.077977418899536, + "learning_rate": 4.6006598136846056e-06, + "loss": 0.5278208255767822, + "mean_token_accuracy": 0.8230358958244324, + "num_tokens": 8040534.0, + "step": 883 + }, + { + "epoch": 0.6717325227963525, + "grad_norm": 1.678581714630127, + "learning_rate": 4.599523542573207e-06, + "loss": 0.4955351650714874, + "mean_token_accuracy": 0.8270003795623779, + "num_tokens": 8052249.0, + "step": 884 + }, + { + "epoch": 0.6724924012158054, + "grad_norm": 2.0751662254333496, + "learning_rate": 4.598385797935368e-06, + "loss": 0.5266247987747192, + "mean_token_accuracy": 0.8263581991195679, + "num_tokens": 8060600.0, + "step": 885 + }, + { + "epoch": 0.6732522796352584, + "grad_norm": 2.418405771255493, + "learning_rate": 4.5972465805696e-06, + "loss": 0.4481425881385803, + "mean_token_accuracy": 0.846164345741272, + "num_tokens": 8066025.0, + "step": 886 + }, + { + "epoch": 0.6740121580547113, + "grad_norm": 2.3936474323272705, + "learning_rate": 4.596105891275449e-06, + "loss": 0.4553404450416565, + "mean_token_accuracy": 0.8412896394729614, + "num_tokens": 8071544.0, + "step": 887 + }, + { + "epoch": 0.6747720364741642, + "grad_norm": 2.2024407386779785, + "learning_rate": 4.594963730853497e-06, + "loss": 0.6218541860580444, + "mean_token_accuracy": 0.7890232801437378, + "num_tokens": 8079061.0, + "step": 888 + }, + { + "epoch": 0.675531914893617, + "grad_norm": 2.51015567779541, + "learning_rate": 4.593820100105355e-06, + "loss": 0.5149124264717102, + "mean_token_accuracy": 0.8241918087005615, + "num_tokens": 8084293.0, + "step": 889 + }, + { + "epoch": 0.6762917933130699, + "grad_norm": 1.8748939037322998, + "learning_rate": 4.5926749998336665e-06, + "loss": 0.50836181640625, + "mean_token_accuracy": 0.8067223429679871, + "num_tokens": 8092511.0, + "step": 890 + }, + { + "epoch": 0.6770516717325228, + "grad_norm": 1.801193118095398, + "learning_rate": 4.5915284308421075e-06, + "loss": 0.4372861683368683, + "mean_token_accuracy": 0.8510604500770569, + "num_tokens": 8101174.0, + "step": 891 + }, + { + "epoch": 0.6778115501519757, + "grad_norm": 2.6476457118988037, + "learning_rate": 4.590380393935383e-06, + "loss": 0.38700711727142334, + "mean_token_accuracy": 0.8659796714782715, + "num_tokens": 8105398.0, + "step": 892 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 1.1147183179855347, + "learning_rate": 4.589230889919232e-06, + "loss": 0.38546115159988403, + "mean_token_accuracy": 0.8570581674575806, + "num_tokens": 8127394.0, + "step": 893 + }, + { + "epoch": 0.6793313069908815, + "grad_norm": 2.908905506134033, + "learning_rate": 4.588079919600419e-06, + "loss": 0.5108504295349121, + "mean_token_accuracy": 0.8121406435966492, + "num_tokens": 8131801.0, + "step": 894 + }, + { + "epoch": 0.6800911854103343, + "grad_norm": 3.1522326469421387, + "learning_rate": 4.586927483786739e-06, + "loss": 0.44059112668037415, + "mean_token_accuracy": 0.8448011875152588, + "num_tokens": 8154416.0, + "step": 895 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 1.5142440795898438, + "learning_rate": 4.585773583287017e-06, + "loss": 0.513217568397522, + "mean_token_accuracy": 0.8386049270629883, + "num_tokens": 8171156.0, + "step": 896 + }, + { + "epoch": 0.6816109422492401, + "grad_norm": 2.597881317138672, + "learning_rate": 4.584618218911104e-06, + "loss": 0.4937712550163269, + "mean_token_accuracy": 0.8223681449890137, + "num_tokens": 8176124.0, + "step": 897 + }, + { + "epoch": 0.682370820668693, + "grad_norm": 1.8185619115829468, + "learning_rate": 4.583461391469879e-06, + "loss": 0.519811749458313, + "mean_token_accuracy": 0.8169777393341064, + "num_tokens": 8185136.0, + "step": 898 + }, + { + "epoch": 0.6831306990881459, + "grad_norm": 3.2061994075775146, + "learning_rate": 4.582303101775249e-06, + "loss": 0.4655115008354187, + "mean_token_accuracy": 0.8425977230072021, + "num_tokens": 8188864.0, + "step": 899 + }, + { + "epoch": 0.6838905775075987, + "grad_norm": 1.3485229015350342, + "learning_rate": 4.581143350640146e-06, + "loss": 0.5014470815658569, + "mean_token_accuracy": 0.8273109197616577, + "num_tokens": 8203460.0, + "step": 900 + }, + { + "epoch": 0.6846504559270516, + "grad_norm": 1.3264713287353516, + "learning_rate": 4.579982138878527e-06, + "loss": 0.5073703527450562, + "mean_token_accuracy": 0.8259357213973999, + "num_tokens": 8219348.0, + "step": 901 + }, + { + "epoch": 0.6854103343465046, + "grad_norm": 2.4436347484588623, + "learning_rate": 4.578819467305375e-06, + "loss": 0.47020310163497925, + "mean_token_accuracy": 0.8567265272140503, + "num_tokens": 8224427.0, + "step": 902 + }, + { + "epoch": 0.6861702127659575, + "grad_norm": 1.921749234199524, + "learning_rate": 4.5776553367367e-06, + "loss": 0.622514009475708, + "mean_token_accuracy": 0.7863982319831848, + "num_tokens": 8233151.0, + "step": 903 + }, + { + "epoch": 0.6869300911854104, + "grad_norm": 1.8815616369247437, + "learning_rate": 4.576489747989532e-06, + "loss": 0.4910545349121094, + "mean_token_accuracy": 0.8147122859954834, + "num_tokens": 8240762.0, + "step": 904 + }, + { + "epoch": 0.6876899696048632, + "grad_norm": 1.2366989850997925, + "learning_rate": 4.575322701881926e-06, + "loss": 0.3947566747665405, + "mean_token_accuracy": 0.873993992805481, + "num_tokens": 8259381.0, + "step": 905 + }, + { + "epoch": 0.6884498480243161, + "grad_norm": 1.5767735242843628, + "learning_rate": 4.57415419923296e-06, + "loss": 0.57136070728302, + "mean_token_accuracy": 0.8028088808059692, + "num_tokens": 8273296.0, + "step": 906 + }, + { + "epoch": 0.689209726443769, + "grad_norm": 2.378675699234009, + "learning_rate": 4.572984240862733e-06, + "loss": 0.5894849896430969, + "mean_token_accuracy": 0.7977708578109741, + "num_tokens": 8280083.0, + "step": 907 + }, + { + "epoch": 0.6899696048632219, + "grad_norm": 2.0401132106781006, + "learning_rate": 4.57181282759237e-06, + "loss": 0.5524613261222839, + "mean_token_accuracy": 0.8138598203659058, + "num_tokens": 8288236.0, + "step": 908 + }, + { + "epoch": 0.6907294832826748, + "grad_norm": 2.293701648712158, + "learning_rate": 4.570639960244011e-06, + "loss": 0.5154546499252319, + "mean_token_accuracy": 0.8234660625457764, + "num_tokens": 8294493.0, + "step": 909 + }, + { + "epoch": 0.6914893617021277, + "grad_norm": 1.9286527633666992, + "learning_rate": 4.56946563964082e-06, + "loss": 0.5364264845848083, + "mean_token_accuracy": 0.8147368431091309, + "num_tokens": 8303441.0, + "step": 910 + }, + { + "epoch": 0.6922492401215805, + "grad_norm": 1.2571251392364502, + "learning_rate": 4.5682898666069815e-06, + "loss": 0.43535223603248596, + "mean_token_accuracy": 0.859239935874939, + "num_tokens": 8321548.0, + "step": 911 + }, + { + "epoch": 0.6930091185410334, + "grad_norm": 1.2224860191345215, + "learning_rate": 4.567112641967697e-06, + "loss": 0.40205076336860657, + "mean_token_accuracy": 0.8724711537361145, + "num_tokens": 8335205.0, + "step": 912 + }, + { + "epoch": 0.6937689969604863, + "grad_norm": 1.2064491510391235, + "learning_rate": 4.5659339665491894e-06, + "loss": 0.37790587544441223, + "mean_token_accuracy": 0.8464339971542358, + "num_tokens": 8350926.0, + "step": 913 + }, + { + "epoch": 0.6945288753799392, + "grad_norm": 2.1755270957946777, + "learning_rate": 4.5647538411786965e-06, + "loss": 0.42034298181533813, + "mean_token_accuracy": 0.84148108959198, + "num_tokens": 8356739.0, + "step": 914 + }, + { + "epoch": 0.6952887537993921, + "grad_norm": 1.234864592552185, + "learning_rate": 4.563572266684478e-06, + "loss": 0.5062938332557678, + "mean_token_accuracy": 0.8132052421569824, + "num_tokens": 8373660.0, + "step": 915 + }, + { + "epoch": 0.6960486322188449, + "grad_norm": 2.4250621795654297, + "learning_rate": 4.562389243895807e-06, + "loss": 0.4907791018486023, + "mean_token_accuracy": 0.8337979912757874, + "num_tokens": 8378661.0, + "step": 916 + }, + { + "epoch": 0.6968085106382979, + "grad_norm": 1.5018314123153687, + "learning_rate": 4.561204773642974e-06, + "loss": 0.41041281819343567, + "mean_token_accuracy": 0.8569784164428711, + "num_tokens": 8390322.0, + "step": 917 + }, + { + "epoch": 0.6975683890577508, + "grad_norm": 2.797269344329834, + "learning_rate": 4.5600188567572874e-06, + "loss": 0.3146931529045105, + "mean_token_accuracy": 0.8913302421569824, + "num_tokens": 8393567.0, + "step": 918 + }, + { + "epoch": 0.6983282674772037, + "grad_norm": 1.4002827405929565, + "learning_rate": 4.558831494071069e-06, + "loss": 0.4275597333908081, + "mean_token_accuracy": 0.8504893779754639, + "num_tokens": 8407119.0, + "step": 919 + }, + { + "epoch": 0.6990881458966566, + "grad_norm": 1.7045831680297852, + "learning_rate": 4.557642686417654e-06, + "loss": 0.49593430757522583, + "mean_token_accuracy": 0.8185091018676758, + "num_tokens": 8417408.0, + "step": 920 + }, + { + "epoch": 0.6998480243161094, + "grad_norm": 2.8818066120147705, + "learning_rate": 4.556452434631396e-06, + "loss": 0.637908935546875, + "mean_token_accuracy": 0.7883946895599365, + "num_tokens": 8422319.0, + "step": 921 + }, + { + "epoch": 0.7006079027355623, + "grad_norm": 2.3587265014648438, + "learning_rate": 4.555260739547657e-06, + "loss": 0.38749319314956665, + "mean_token_accuracy": 0.8774704933166504, + "num_tokens": 8427315.0, + "step": 922 + }, + { + "epoch": 0.7013677811550152, + "grad_norm": 1.6648749113082886, + "learning_rate": 4.554067602002815e-06, + "loss": 0.4044865369796753, + "mean_token_accuracy": 0.8524141311645508, + "num_tokens": 8438662.0, + "step": 923 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 3.467787742614746, + "learning_rate": 4.55287302283426e-06, + "loss": 0.591016411781311, + "mean_token_accuracy": 0.81184983253479, + "num_tokens": 8442237.0, + "step": 924 + }, + { + "epoch": 0.702887537993921, + "grad_norm": 2.1458635330200195, + "learning_rate": 4.551677002880395e-06, + "loss": 0.5017476677894592, + "mean_token_accuracy": 0.822914183139801, + "num_tokens": 8449494.0, + "step": 925 + }, + { + "epoch": 0.7036474164133738, + "grad_norm": 2.521714448928833, + "learning_rate": 4.550479542980632e-06, + "loss": 0.531912088394165, + "mean_token_accuracy": 0.8225687742233276, + "num_tokens": 8454983.0, + "step": 926 + }, + { + "epoch": 0.7044072948328267, + "grad_norm": 3.5248100757598877, + "learning_rate": 4.549280643975394e-06, + "loss": 0.4631815254688263, + "mean_token_accuracy": 0.8443771600723267, + "num_tokens": 8458504.0, + "step": 927 + }, + { + "epoch": 0.7051671732522796, + "grad_norm": 2.5105819702148438, + "learning_rate": 4.548080306706114e-06, + "loss": 0.30487123131752014, + "mean_token_accuracy": 0.9018767476081848, + "num_tokens": 8462589.0, + "step": 928 + }, + { + "epoch": 0.7059270516717325, + "grad_norm": 1.3367713689804077, + "learning_rate": 4.5468785320152365e-06, + "loss": 0.4355026185512543, + "mean_token_accuracy": 0.8323584794998169, + "num_tokens": 8478450.0, + "step": 929 + }, + { + "epoch": 0.7066869300911854, + "grad_norm": 2.2506282329559326, + "learning_rate": 4.545675320746212e-06, + "loss": 0.5082957744598389, + "mean_token_accuracy": 0.823430597782135, + "num_tokens": 8485991.0, + "step": 930 + }, + { + "epoch": 0.7074468085106383, + "grad_norm": 1.7164632081985474, + "learning_rate": 4.544470673743502e-06, + "loss": 0.3960164785385132, + "mean_token_accuracy": 0.8592486381530762, + "num_tokens": 8495217.0, + "step": 931 + }, + { + "epoch": 0.7082066869300911, + "grad_norm": 1.5864969491958618, + "learning_rate": 4.543264591852572e-06, + "loss": 0.49114471673965454, + "mean_token_accuracy": 0.8330780267715454, + "num_tokens": 8508904.0, + "step": 932 + }, + { + "epoch": 0.708966565349544, + "grad_norm": 2.1707003116607666, + "learning_rate": 4.542057075919898e-06, + "loss": 0.49895772337913513, + "mean_token_accuracy": 0.8327431082725525, + "num_tokens": 8515792.0, + "step": 933 + }, + { + "epoch": 0.709726443768997, + "grad_norm": 1.9002083539962769, + "learning_rate": 4.54084812679296e-06, + "loss": 0.4548531472682953, + "mean_token_accuracy": 0.834532618522644, + "num_tokens": 8524006.0, + "step": 934 + }, + { + "epoch": 0.7104863221884499, + "grad_norm": 1.8505141735076904, + "learning_rate": 4.539637745320247e-06, + "loss": 0.35716521739959717, + "mean_token_accuracy": 0.872222900390625, + "num_tokens": 8533647.0, + "step": 935 + }, + { + "epoch": 0.7112462006079028, + "grad_norm": 2.092620849609375, + "learning_rate": 4.53842593235125e-06, + "loss": 0.4673694372177124, + "mean_token_accuracy": 0.8460999131202698, + "num_tokens": 8540734.0, + "step": 936 + }, + { + "epoch": 0.7120060790273556, + "grad_norm": 2.689514636993408, + "learning_rate": 4.537212688736466e-06, + "loss": 0.45461273193359375, + "mean_token_accuracy": 0.8450704216957092, + "num_tokens": 8544948.0, + "step": 937 + }, + { + "epoch": 0.7127659574468085, + "grad_norm": 2.4507734775543213, + "learning_rate": 4.535998015327396e-06, + "loss": 0.4571906626224518, + "mean_token_accuracy": 0.8429360389709473, + "num_tokens": 8550445.0, + "step": 938 + }, + { + "epoch": 0.7135258358662614, + "grad_norm": 1.8960013389587402, + "learning_rate": 4.534781912976546e-06, + "loss": 0.4461391568183899, + "mean_token_accuracy": 0.8487973213195801, + "num_tokens": 8557630.0, + "step": 939 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.602611780166626, + "learning_rate": 4.533564382537421e-06, + "loss": 0.5277102589607239, + "mean_token_accuracy": 0.8330916166305542, + "num_tokens": 8570397.0, + "step": 940 + }, + { + "epoch": 0.7150455927051672, + "grad_norm": 1.8936395645141602, + "learning_rate": 4.532345424864533e-06, + "loss": 0.38619571924209595, + "mean_token_accuracy": 0.8514572381973267, + "num_tokens": 8582673.0, + "step": 941 + }, + { + "epoch": 0.71580547112462, + "grad_norm": 1.3898619413375854, + "learning_rate": 4.531125040813392e-06, + "loss": 0.4825032949447632, + "mean_token_accuracy": 0.833012580871582, + "num_tokens": 8597239.0, + "step": 942 + }, + { + "epoch": 0.7165653495440729, + "grad_norm": 2.128230571746826, + "learning_rate": 4.529903231240511e-06, + "loss": 0.4862118065357208, + "mean_token_accuracy": 0.8210917711257935, + "num_tokens": 8605877.0, + "step": 943 + }, + { + "epoch": 0.7173252279635258, + "grad_norm": 1.6552259922027588, + "learning_rate": 4.528679997003403e-06, + "loss": 0.5092059373855591, + "mean_token_accuracy": 0.8247389793395996, + "num_tokens": 8617060.0, + "step": 944 + }, + { + "epoch": 0.7180851063829787, + "grad_norm": 2.1174771785736084, + "learning_rate": 4.52745533896058e-06, + "loss": 0.39110174775123596, + "mean_token_accuracy": 0.8672944903373718, + "num_tokens": 8623306.0, + "step": 945 + }, + { + "epoch": 0.7188449848024316, + "grad_norm": 2.8648383617401123, + "learning_rate": 4.526229257971556e-06, + "loss": 0.49864327907562256, + "mean_token_accuracy": 0.8305130004882812, + "num_tokens": 8627466.0, + "step": 946 + }, + { + "epoch": 0.7196048632218845, + "grad_norm": 2.155514717102051, + "learning_rate": 4.52500175489684e-06, + "loss": 0.5070191025733948, + "mean_token_accuracy": 0.8311188817024231, + "num_tokens": 8634759.0, + "step": 947 + }, + { + "epoch": 0.7203647416413373, + "grad_norm": 1.8432683944702148, + "learning_rate": 4.523772830597942e-06, + "loss": 0.5569252371788025, + "mean_token_accuracy": 0.8070821762084961, + "num_tokens": 8644160.0, + "step": 948 + }, + { + "epoch": 0.7211246200607903, + "grad_norm": 2.8912241458892822, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4799427390098572, + "mean_token_accuracy": 0.8443552851676941, + "num_tokens": 8648377.0, + "step": 949 + }, + { + "epoch": 0.7218844984802432, + "grad_norm": 3.3449625968933105, + "learning_rate": 4.521310721778622e-06, + "loss": 0.44043463468551636, + "mean_token_accuracy": 0.8521315455436707, + "num_tokens": 8651846.0, + "step": 950 + }, + { + "epoch": 0.7226443768996961, + "grad_norm": 1.4127917289733887, + "learning_rate": 4.520077538986203e-06, + "loss": 0.4700999855995178, + "mean_token_accuracy": 0.8377952575683594, + "num_tokens": 8665199.0, + "step": 951 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.1607301235198975, + "learning_rate": 4.518842938425606e-06, + "loss": 0.4374256730079651, + "mean_token_accuracy": 0.8448896408081055, + "num_tokens": 8672158.0, + "step": 952 + }, + { + "epoch": 0.7241641337386018, + "grad_norm": 1.3442779779434204, + "learning_rate": 4.51760692096332e-06, + "loss": 0.38948923349380493, + "mean_token_accuracy": 0.8598923683166504, + "num_tokens": 8684532.0, + "step": 953 + }, + { + "epoch": 0.7249240121580547, + "grad_norm": 2.0003178119659424, + "learning_rate": 4.516369487466832e-06, + "loss": 0.3797217011451721, + "mean_token_accuracy": 0.8652102947235107, + "num_tokens": 8691460.0, + "step": 954 + }, + { + "epoch": 0.7256838905775076, + "grad_norm": 1.8196535110473633, + "learning_rate": 4.5151306388046175e-06, + "loss": 0.5676811933517456, + "mean_token_accuracy": 0.818500816822052, + "num_tokens": 8701624.0, + "step": 955 + }, + { + "epoch": 0.7264437689969605, + "grad_norm": 2.1962296962738037, + "learning_rate": 4.513890375846152e-06, + "loss": 0.45399484038352966, + "mean_token_accuracy": 0.8463879227638245, + "num_tokens": 8707410.0, + "step": 956 + }, + { + "epoch": 0.7272036474164134, + "grad_norm": 1.8798872232437134, + "learning_rate": 4.512648699461897e-06, + "loss": 0.5679811239242554, + "mean_token_accuracy": 0.8089900016784668, + "num_tokens": 8715630.0, + "step": 957 + }, + { + "epoch": 0.7279635258358662, + "grad_norm": 2.3540258407592773, + "learning_rate": 4.511405610523309e-06, + "loss": 0.5282865762710571, + "mean_token_accuracy": 0.8196114301681519, + "num_tokens": 8721934.0, + "step": 958 + }, + { + "epoch": 0.7287234042553191, + "grad_norm": 2.5630908012390137, + "learning_rate": 4.510161109902837e-06, + "loss": 0.39442378282546997, + "mean_token_accuracy": 0.8400980830192566, + "num_tokens": 8726511.0, + "step": 959 + }, + { + "epoch": 0.729483282674772, + "grad_norm": 1.9829226732254028, + "learning_rate": 4.508915198473919e-06, + "loss": 0.4611976742744446, + "mean_token_accuracy": 0.8439624309539795, + "num_tokens": 8733460.0, + "step": 960 + }, + { + "epoch": 0.7302431610942249, + "grad_norm": 3.0291950702667236, + "learning_rate": 4.507667877110982e-06, + "loss": 0.5158340930938721, + "mean_token_accuracy": 0.8300060033798218, + "num_tokens": 8737629.0, + "step": 961 + }, + { + "epoch": 0.7310030395136778, + "grad_norm": 1.9208252429962158, + "learning_rate": 4.506419146689445e-06, + "loss": 0.3807099163532257, + "mean_token_accuracy": 0.871469259262085, + "num_tokens": 8744615.0, + "step": 962 + }, + { + "epoch": 0.7317629179331308, + "grad_norm": 3.051565408706665, + "learning_rate": 4.505169008085717e-06, + "loss": 0.38461726903915405, + "mean_token_accuracy": 0.874465823173523, + "num_tokens": 8748154.0, + "step": 963 + }, + { + "epoch": 0.7325227963525835, + "grad_norm": 1.375466227531433, + "learning_rate": 4.503917462177192e-06, + "loss": 0.42490679025650024, + "mean_token_accuracy": 0.8457326889038086, + "num_tokens": 8760965.0, + "step": 964 + }, + { + "epoch": 0.7332826747720365, + "grad_norm": 2.216681957244873, + "learning_rate": 4.5026645098422515e-06, + "loss": 0.43149900436401367, + "mean_token_accuracy": 0.8527278900146484, + "num_tokens": 8766996.0, + "step": 965 + }, + { + "epoch": 0.7340425531914894, + "grad_norm": 1.9422595500946045, + "learning_rate": 4.5014101519602684e-06, + "loss": 0.4964504539966583, + "mean_token_accuracy": 0.8137556314468384, + "num_tokens": 8774411.0, + "step": 966 + }, + { + "epoch": 0.7348024316109423, + "grad_norm": 2.058887004852295, + "learning_rate": 4.500154389411598e-06, + "loss": 0.4977570176124573, + "mean_token_accuracy": 0.8254626989364624, + "num_tokens": 8782220.0, + "step": 967 + }, + { + "epoch": 0.7355623100303952, + "grad_norm": 2.9977786540985107, + "learning_rate": 4.498897223077582e-06, + "loss": 0.4061415195465088, + "mean_token_accuracy": 0.8752427101135254, + "num_tokens": 8786120.0, + "step": 968 + }, + { + "epoch": 0.736322188449848, + "grad_norm": 2.2636303901672363, + "learning_rate": 4.49763865384055e-06, + "loss": 0.5062161087989807, + "mean_token_accuracy": 0.8171653747558594, + "num_tokens": 8792459.0, + "step": 969 + }, + { + "epoch": 0.7370820668693009, + "grad_norm": 1.8850842714309692, + "learning_rate": 4.496378682583813e-06, + "loss": 0.5014280676841736, + "mean_token_accuracy": 0.8547511100769043, + "num_tokens": 8800675.0, + "step": 970 + }, + { + "epoch": 0.7378419452887538, + "grad_norm": 1.191985011100769, + "learning_rate": 4.495117310191667e-06, + "loss": 0.4713883101940155, + "mean_token_accuracy": 0.8213596343994141, + "num_tokens": 8820740.0, + "step": 971 + }, + { + "epoch": 0.7386018237082067, + "grad_norm": 1.823000192642212, + "learning_rate": 4.493854537549393e-06, + "loss": 0.46332645416259766, + "mean_token_accuracy": 0.8359860777854919, + "num_tokens": 8828884.0, + "step": 972 + }, + { + "epoch": 0.7393617021276596, + "grad_norm": 2.590446949005127, + "learning_rate": 4.492590365543253e-06, + "loss": 0.49074703454971313, + "mean_token_accuracy": 0.8433758020401001, + "num_tokens": 8833859.0, + "step": 973 + }, + { + "epoch": 0.7401215805471124, + "grad_norm": 2.2762670516967773, + "learning_rate": 4.491324795060491e-06, + "loss": 0.39465656876564026, + "mean_token_accuracy": 0.8734766244888306, + "num_tokens": 8839350.0, + "step": 974 + }, + { + "epoch": 0.7408814589665653, + "grad_norm": 2.698725461959839, + "learning_rate": 4.490057826989333e-06, + "loss": 0.5552085041999817, + "mean_token_accuracy": 0.8132266998291016, + "num_tokens": 8844373.0, + "step": 975 + }, + { + "epoch": 0.7416413373860182, + "grad_norm": 2.704606294631958, + "learning_rate": 4.488789462218988e-06, + "loss": 0.3447791635990143, + "mean_token_accuracy": 0.8736170530319214, + "num_tokens": 8848236.0, + "step": 976 + }, + { + "epoch": 0.7424012158054711, + "grad_norm": 3.1260716915130615, + "learning_rate": 4.487519701639641e-06, + "loss": 0.5945233702659607, + "mean_token_accuracy": 0.7997599840164185, + "num_tokens": 8852935.0, + "step": 977 + }, + { + "epoch": 0.743161094224924, + "grad_norm": 1.6895452737808228, + "learning_rate": 4.486248546142459e-06, + "loss": 0.4823892116546631, + "mean_token_accuracy": 0.8279662132263184, + "num_tokens": 8861743.0, + "step": 978 + }, + { + "epoch": 0.743920972644377, + "grad_norm": 1.9161452054977417, + "learning_rate": 4.4849759966195885e-06, + "loss": 0.5266581773757935, + "mean_token_accuracy": 0.8218623399734497, + "num_tokens": 8870601.0, + "step": 979 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 1.6894301176071167, + "learning_rate": 4.483702053964154e-06, + "loss": 0.4186219573020935, + "mean_token_accuracy": 0.8471781015396118, + "num_tokens": 8885617.0, + "step": 980 + }, + { + "epoch": 0.7454407294832827, + "grad_norm": 1.6319992542266846, + "learning_rate": 4.482426719070258e-06, + "loss": 0.541317880153656, + "mean_token_accuracy": 0.8216162323951721, + "num_tokens": 8897595.0, + "step": 981 + }, + { + "epoch": 0.7462006079027356, + "grad_norm": 5.102413177490234, + "learning_rate": 4.4811499928329775e-06, + "loss": 0.3928517699241638, + "mean_token_accuracy": 0.858033299446106, + "num_tokens": 8901682.0, + "step": 982 + }, + { + "epoch": 0.7469604863221885, + "grad_norm": 2.213860273361206, + "learning_rate": 4.479871876148368e-06, + "loss": 0.4276347756385803, + "mean_token_accuracy": 0.8529798984527588, + "num_tokens": 8908088.0, + "step": 983 + }, + { + "epoch": 0.7477203647416414, + "grad_norm": 1.2180038690567017, + "learning_rate": 4.478592369913464e-06, + "loss": 0.3941590189933777, + "mean_token_accuracy": 0.8608149290084839, + "num_tokens": 8925876.0, + "step": 984 + }, + { + "epoch": 0.7484802431610942, + "grad_norm": 2.849802255630493, + "learning_rate": 4.477311475026271e-06, + "loss": 0.42190325260162354, + "mean_token_accuracy": 0.860505223274231, + "num_tokens": 8930190.0, + "step": 985 + }, + { + "epoch": 0.7492401215805471, + "grad_norm": 1.704128384590149, + "learning_rate": 4.476029192385769e-06, + "loss": 0.4786282777786255, + "mean_token_accuracy": 0.8302322626113892, + "num_tokens": 8938340.0, + "step": 986 + }, + { + "epoch": 0.75, + "grad_norm": 2.06322979927063, + "learning_rate": 4.474745522891915e-06, + "loss": 0.4648786187171936, + "mean_token_accuracy": 0.8366481065750122, + "num_tokens": 8944633.0, + "step": 987 + }, + { + "epoch": 0.7507598784194529, + "grad_norm": 2.0745396614074707, + "learning_rate": 4.473460467445637e-06, + "loss": 0.5744885206222534, + "mean_token_accuracy": 0.8357284069061279, + "num_tokens": 8954457.0, + "step": 988 + }, + { + "epoch": 0.7515197568389058, + "grad_norm": 1.9281407594680786, + "learning_rate": 4.472174026948836e-06, + "loss": 0.528974175453186, + "mean_token_accuracy": 0.8083580732345581, + "num_tokens": 8962701.0, + "step": 989 + }, + { + "epoch": 0.7522796352583586, + "grad_norm": 3.012381076812744, + "learning_rate": 4.470886202304385e-06, + "loss": 0.48754751682281494, + "mean_token_accuracy": 0.8368391990661621, + "num_tokens": 8967272.0, + "step": 990 + }, + { + "epoch": 0.7530395136778115, + "grad_norm": 1.691826581954956, + "learning_rate": 4.469596994416131e-06, + "loss": 0.484740674495697, + "mean_token_accuracy": 0.8500643968582153, + "num_tokens": 8976615.0, + "step": 991 + }, + { + "epoch": 0.7537993920972644, + "grad_norm": 2.4961965084075928, + "learning_rate": 4.468306404188887e-06, + "loss": 0.50777268409729, + "mean_token_accuracy": 0.8168395757675171, + "num_tokens": 8983235.0, + "step": 992 + }, + { + "epoch": 0.7545592705167173, + "grad_norm": 1.512007713317871, + "learning_rate": 4.467014432528441e-06, + "loss": 0.4583340287208557, + "mean_token_accuracy": 0.8465162515640259, + "num_tokens": 8993815.0, + "step": 993 + }, + { + "epoch": 0.7553191489361702, + "grad_norm": 1.9362257719039917, + "learning_rate": 4.465721080341547e-06, + "loss": 0.6027892827987671, + "mean_token_accuracy": 0.8052380084991455, + "num_tokens": 9002697.0, + "step": 994 + }, + { + "epoch": 0.756079027355623, + "grad_norm": 2.473632335662842, + "learning_rate": 4.4644263485359316e-06, + "loss": 0.5394320487976074, + "mean_token_accuracy": 0.834665834903717, + "num_tokens": 9007428.0, + "step": 995 + }, + { + "epoch": 0.756838905775076, + "grad_norm": 2.2527434825897217, + "learning_rate": 4.463130238020284e-06, + "loss": 0.5485198497772217, + "mean_token_accuracy": 0.8090173006057739, + "num_tokens": 9013570.0, + "step": 996 + }, + { + "epoch": 0.7575987841945289, + "grad_norm": 1.4130940437316895, + "learning_rate": 4.4618327497042676e-06, + "loss": 0.37994423508644104, + "mean_token_accuracy": 0.8625167012214661, + "num_tokens": 9025485.0, + "step": 997 + }, + { + "epoch": 0.7583586626139818, + "grad_norm": 2.685115098953247, + "learning_rate": 4.460533884498509e-06, + "loss": 0.447973370552063, + "mean_token_accuracy": 0.8564165234565735, + "num_tokens": 9030355.0, + "step": 998 + }, + { + "epoch": 0.7591185410334347, + "grad_norm": 3.2743139266967773, + "learning_rate": 4.4592336433146e-06, + "loss": 0.45275989174842834, + "mean_token_accuracy": 0.8462578058242798, + "num_tokens": 9034406.0, + "step": 999 + }, + { + "epoch": 0.7598784194528876, + "grad_norm": 1.9383049011230469, + "learning_rate": 4.457932027065102e-06, + "loss": 0.5387729406356812, + "mean_token_accuracy": 0.8357330560684204, + "num_tokens": 9041502.0, + "step": 1000 + } + ], + "logging_steps": 1.0, + "max_steps": 3948, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.855721706985882e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/chat_template.jinja b/checkpoint-2000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..70adff8a08fb31e0636f618564838d4bf3c05286 --- /dev/null +++ b/checkpoint-2000/chat_template.jinja @@ -0,0 +1,61 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-2000/config.json b/checkpoint-2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c351e5fb52f50ea6e07b40981aef81c80f9df7e4 --- /dev/null +++ b/checkpoint-2000/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 262144, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151662, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 5000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.5.3", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-2000/generation_config.json b/checkpoint-2000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2104b83493c2833855e8fe32a7a784805ab5c2ee --- /dev/null +++ b/checkpoint-2000/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151662, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "5.5.3" +} diff --git a/checkpoint-2000/model.safetensors b/checkpoint-2000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7d26c4ee9aa6752602ff6db19d02edfff6e062f7 --- /dev/null +++ b/checkpoint-2000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1ce241be74f81ade1793d7d1184e1cf7ce2e9afe46f5dd9418012bd1861b43 +size 17645743048 diff --git a/checkpoint-2000/optimizer.bin b/checkpoint-2000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..9bf26616282816435a39edb78ec22ebe2461696f --- /dev/null +++ b/checkpoint-2000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07e07657f743306d7736d8218c799dfc731283d7dedfca7eb48d4dcc64c64623 +size 32180124005 diff --git a/checkpoint-2000/pytorch_model_fsdp.bin b/checkpoint-2000/pytorch_model_fsdp.bin new file mode 100644 index 0000000000000000000000000000000000000000..675400f377bfee7718a7693c8e10f410f7ec7242 --- /dev/null +++ b/checkpoint-2000/pytorch_model_fsdp.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27df8f98b77baf9afbd9bdac0a9ff6cc9e53f4d44310a5d8c665d45656911b2e +size 17645897996 diff --git a/checkpoint-2000/rng_state_0.pth b/checkpoint-2000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..870021e3fa5ac35c2f711adf0c93a556ab4842da --- /dev/null +++ b/checkpoint-2000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e5fc2074c0df31522a514f862c86cb00d71c946a7f15cc9ec0e53a69fb28a7 +size 14917 diff --git a/checkpoint-2000/rng_state_1.pth b/checkpoint-2000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..21f20da5eb1da017f08aaa88bd19cf24d40e3fbf --- /dev/null +++ b/checkpoint-2000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e7153eae67b6c9232a41bc996a2bf5b83229b8c7230d61911ac0fd40e64154e +size 14917 diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e47e52c4e7f0b2bcf2103a878790216f3f6436d --- /dev/null +++ b/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 1010000, + "pad_token": "<|fim_pad|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..18d493bffa53c88bc213582a98da8699d575acdc --- /dev/null +++ b/checkpoint-2000/trainer_state.json @@ -0,0 +1,18034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5197568389057752, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007598784194528875, + "grad_norm": 11.767926216125488, + "learning_rate": 0.0, + "loss": 0.7937269806861877, + "mean_token_accuracy": 0.7822731137275696, + "num_tokens": 10507.0, + "step": 1 + }, + { + "epoch": 0.001519756838905775, + "grad_norm": 14.9199800491333, + "learning_rate": 2.5252525252525256e-08, + "loss": 0.7665389776229858, + "mean_token_accuracy": 0.8342233300209045, + "num_tokens": 14806.0, + "step": 2 + }, + { + "epoch": 0.0022796352583586625, + "grad_norm": 11.991217613220215, + "learning_rate": 5.050505050505051e-08, + "loss": 0.9597002267837524, + "mean_token_accuracy": 0.7054992318153381, + "num_tokens": 27170.0, + "step": 3 + }, + { + "epoch": 0.00303951367781155, + "grad_norm": 12.958333015441895, + "learning_rate": 7.575757575757576e-08, + "loss": 0.9971482753753662, + "mean_token_accuracy": 0.7261134386062622, + "num_tokens": 33729.0, + "step": 4 + }, + { + "epoch": 0.003799392097264438, + "grad_norm": 13.5665283203125, + "learning_rate": 1.0101010101010103e-07, + "loss": 0.9504883885383606, + "mean_token_accuracy": 0.745307445526123, + "num_tokens": 41174.0, + "step": 5 + }, + { + "epoch": 0.004559270516717325, + "grad_norm": 10.09444808959961, + "learning_rate": 1.2626262626262626e-07, + "loss": 0.759548008441925, + "mean_token_accuracy": 0.7842121124267578, + "num_tokens": 47943.0, + "step": 6 + }, + { + "epoch": 0.005319148936170213, + "grad_norm": 10.741650581359863, + "learning_rate": 1.5151515151515152e-07, + "loss": 0.8231598138809204, + "mean_token_accuracy": 0.7550969123840332, + "num_tokens": 56665.0, + "step": 7 + }, + { + "epoch": 0.0060790273556231, + "grad_norm": 12.250170707702637, + "learning_rate": 1.767676767676768e-07, + "loss": 0.8576581478118896, + "mean_token_accuracy": 0.7568671703338623, + "num_tokens": 67606.0, + "step": 8 + }, + { + "epoch": 0.006838905775075988, + "grad_norm": 12.828629493713379, + "learning_rate": 2.0202020202020205e-07, + "loss": 0.9886435866355896, + "mean_token_accuracy": 0.733400285243988, + "num_tokens": 74272.0, + "step": 9 + }, + { + "epoch": 0.007598784194528876, + "grad_norm": 15.966923713684082, + "learning_rate": 2.2727272727272729e-07, + "loss": 1.064985990524292, + "mean_token_accuracy": 0.7101132869720459, + "num_tokens": 80524.0, + "step": 10 + }, + { + "epoch": 0.008358662613981762, + "grad_norm": 10.864850044250488, + "learning_rate": 2.525252525252525e-07, + "loss": 0.8311550617218018, + "mean_token_accuracy": 0.7431639432907104, + "num_tokens": 96292.0, + "step": 11 + }, + { + "epoch": 0.00911854103343465, + "grad_norm": 16.438785552978516, + "learning_rate": 2.7777777777777776e-07, + "loss": 1.0579866170883179, + "mean_token_accuracy": 0.7222976684570312, + "num_tokens": 102992.0, + "step": 12 + }, + { + "epoch": 0.009878419452887538, + "grad_norm": 11.179214477539062, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.9816144704818726, + "mean_token_accuracy": 0.7206371426582336, + "num_tokens": 113571.0, + "step": 13 + }, + { + "epoch": 0.010638297872340425, + "grad_norm": 12.780299186706543, + "learning_rate": 3.2828282828282834e-07, + "loss": 0.847449004650116, + "mean_token_accuracy": 0.7826199531555176, + "num_tokens": 119568.0, + "step": 14 + }, + { + "epoch": 0.011398176291793313, + "grad_norm": 14.800421714782715, + "learning_rate": 3.535353535353536e-07, + "loss": 0.9275516271591187, + "mean_token_accuracy": 0.7655045986175537, + "num_tokens": 126258.0, + "step": 15 + }, + { + "epoch": 0.0121580547112462, + "grad_norm": 11.267602920532227, + "learning_rate": 3.787878787878788e-07, + "loss": 0.8464037179946899, + "mean_token_accuracy": 0.7606508731842041, + "num_tokens": 136831.0, + "step": 16 + }, + { + "epoch": 0.012917933130699088, + "grad_norm": 12.891013145446777, + "learning_rate": 4.040404040404041e-07, + "loss": 0.9903074502944946, + "mean_token_accuracy": 0.7247487306594849, + "num_tokens": 150434.0, + "step": 17 + }, + { + "epoch": 0.013677811550151976, + "grad_norm": 11.13957691192627, + "learning_rate": 4.2929292929292934e-07, + "loss": 0.8287211656570435, + "mean_token_accuracy": 0.7621913552284241, + "num_tokens": 158516.0, + "step": 18 + }, + { + "epoch": 0.014437689969604863, + "grad_norm": 18.39569664001465, + "learning_rate": 4.5454545454545457e-07, + "loss": 1.150015115737915, + "mean_token_accuracy": 0.7349498271942139, + "num_tokens": 162214.0, + "step": 19 + }, + { + "epoch": 0.015197568389057751, + "grad_norm": 9.353750228881836, + "learning_rate": 4.797979797979798e-07, + "loss": 0.7228299379348755, + "mean_token_accuracy": 0.7969573736190796, + "num_tokens": 173035.0, + "step": 20 + }, + { + "epoch": 0.015957446808510637, + "grad_norm": 8.267163276672363, + "learning_rate": 5.05050505050505e-07, + "loss": 0.7358136177062988, + "mean_token_accuracy": 0.7903937101364136, + "num_tokens": 183568.0, + "step": 21 + }, + { + "epoch": 0.016717325227963525, + "grad_norm": 11.137128829956055, + "learning_rate": 5.303030303030304e-07, + "loss": 1.0075397491455078, + "mean_token_accuracy": 0.702807605266571, + "num_tokens": 192759.0, + "step": 22 + }, + { + "epoch": 0.017477203647416412, + "grad_norm": 10.734103202819824, + "learning_rate": 5.555555555555555e-07, + "loss": 0.8925919532775879, + "mean_token_accuracy": 0.7475671768188477, + "num_tokens": 201280.0, + "step": 23 + }, + { + "epoch": 0.0182370820668693, + "grad_norm": 11.945566177368164, + "learning_rate": 5.808080808080809e-07, + "loss": 0.7260514497756958, + "mean_token_accuracy": 0.7859152555465698, + "num_tokens": 218053.0, + "step": 24 + }, + { + "epoch": 0.018996960486322188, + "grad_norm": 18.610652923583984, + "learning_rate": 6.060606060606061e-07, + "loss": 0.8995465636253357, + "mean_token_accuracy": 0.7931990623474121, + "num_tokens": 220953.0, + "step": 25 + }, + { + "epoch": 0.019756838905775075, + "grad_norm": 10.51898193359375, + "learning_rate": 6.313131313131314e-07, + "loss": 0.9532671570777893, + "mean_token_accuracy": 0.7257645726203918, + "num_tokens": 231200.0, + "step": 26 + }, + { + "epoch": 0.020516717325227963, + "grad_norm": 9.581812858581543, + "learning_rate": 6.565656565656567e-07, + "loss": 0.9038010239601135, + "mean_token_accuracy": 0.7390379905700684, + "num_tokens": 237711.0, + "step": 27 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 12.297484397888184, + "learning_rate": 6.818181818181818e-07, + "loss": 1.048936367034912, + "mean_token_accuracy": 0.7175670862197876, + "num_tokens": 242503.0, + "step": 28 + }, + { + "epoch": 0.022036474164133738, + "grad_norm": 7.437953472137451, + "learning_rate": 7.070707070707071e-07, + "loss": 0.8308826684951782, + "mean_token_accuracy": 0.7415335774421692, + "num_tokens": 250842.0, + "step": 29 + }, + { + "epoch": 0.022796352583586626, + "grad_norm": 6.134475231170654, + "learning_rate": 7.323232323232324e-07, + "loss": 0.647913932800293, + "mean_token_accuracy": 0.8124054670333862, + "num_tokens": 267453.0, + "step": 30 + }, + { + "epoch": 0.023556231003039513, + "grad_norm": 6.678966045379639, + "learning_rate": 7.575757575757576e-07, + "loss": 0.7052810192108154, + "mean_token_accuracy": 0.7908754348754883, + "num_tokens": 284416.0, + "step": 31 + }, + { + "epoch": 0.0243161094224924, + "grad_norm": 7.42232084274292, + "learning_rate": 7.82828282828283e-07, + "loss": 1.022383213043213, + "mean_token_accuracy": 0.7053230404853821, + "num_tokens": 292073.0, + "step": 32 + }, + { + "epoch": 0.02507598784194529, + "grad_norm": 6.463219165802002, + "learning_rate": 8.080808080808082e-07, + "loss": 0.7603012323379517, + "mean_token_accuracy": 0.7728140354156494, + "num_tokens": 298550.0, + "step": 33 + }, + { + "epoch": 0.025835866261398176, + "grad_norm": 5.668411731719971, + "learning_rate": 8.333333333333333e-07, + "loss": 0.7707852721214294, + "mean_token_accuracy": 0.7827773094177246, + "num_tokens": 306683.0, + "step": 34 + }, + { + "epoch": 0.026595744680851064, + "grad_norm": 4.984964847564697, + "learning_rate": 8.585858585858587e-07, + "loss": 0.6317349672317505, + "mean_token_accuracy": 0.8106861114501953, + "num_tokens": 318842.0, + "step": 35 + }, + { + "epoch": 0.02735562310030395, + "grad_norm": 4.421732425689697, + "learning_rate": 8.838383838383839e-07, + "loss": 0.6228617429733276, + "mean_token_accuracy": 0.8023355603218079, + "num_tokens": 329850.0, + "step": 36 + }, + { + "epoch": 0.02811550151975684, + "grad_norm": 5.970808029174805, + "learning_rate": 9.090909090909091e-07, + "loss": 0.8443238139152527, + "mean_token_accuracy": 0.7462409734725952, + "num_tokens": 335844.0, + "step": 37 + }, + { + "epoch": 0.028875379939209727, + "grad_norm": 4.5389084815979, + "learning_rate": 9.343434343434345e-07, + "loss": 0.6976436376571655, + "mean_token_accuracy": 0.790410041809082, + "num_tokens": 348768.0, + "step": 38 + }, + { + "epoch": 0.029635258358662615, + "grad_norm": 4.116631507873535, + "learning_rate": 9.595959595959596e-07, + "loss": 0.6698519587516785, + "mean_token_accuracy": 0.7818127870559692, + "num_tokens": 355460.0, + "step": 39 + }, + { + "epoch": 0.030395136778115502, + "grad_norm": 3.3714773654937744, + "learning_rate": 9.84848484848485e-07, + "loss": 0.5723201036453247, + "mean_token_accuracy": 0.8100086450576782, + "num_tokens": 368507.0, + "step": 40 + }, + { + "epoch": 0.03115501519756839, + "grad_norm": 4.4438347816467285, + "learning_rate": 1.01010101010101e-06, + "loss": 0.7508786916732788, + "mean_token_accuracy": 0.7711942791938782, + "num_tokens": 376467.0, + "step": 41 + }, + { + "epoch": 0.031914893617021274, + "grad_norm": 5.609974384307861, + "learning_rate": 1.0353535353535354e-06, + "loss": 0.566256046295166, + "mean_token_accuracy": 0.8319284319877625, + "num_tokens": 381399.0, + "step": 42 + }, + { + "epoch": 0.03267477203647416, + "grad_norm": 5.124386787414551, + "learning_rate": 1.0606060606060608e-06, + "loss": 0.8151067495346069, + "mean_token_accuracy": 0.7537785768508911, + "num_tokens": 387389.0, + "step": 43 + }, + { + "epoch": 0.03343465045592705, + "grad_norm": 3.6318116188049316, + "learning_rate": 1.085858585858586e-06, + "loss": 0.5989949107170105, + "mean_token_accuracy": 0.8129256963729858, + "num_tokens": 395302.0, + "step": 44 + }, + { + "epoch": 0.03419452887537994, + "grad_norm": 2.694424629211426, + "learning_rate": 1.111111111111111e-06, + "loss": 0.5831396579742432, + "mean_token_accuracy": 0.8056820631027222, + "num_tokens": 409920.0, + "step": 45 + }, + { + "epoch": 0.034954407294832825, + "grad_norm": 2.2949178218841553, + "learning_rate": 1.1363636363636364e-06, + "loss": 0.472550630569458, + "mean_token_accuracy": 0.8343006372451782, + "num_tokens": 428323.0, + "step": 46 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 3.3930575847625732, + "learning_rate": 1.1616161616161617e-06, + "loss": 0.6246505379676819, + "mean_token_accuracy": 0.783149003982544, + "num_tokens": 435889.0, + "step": 47 + }, + { + "epoch": 0.0364741641337386, + "grad_norm": 3.692598819732666, + "learning_rate": 1.186868686868687e-06, + "loss": 0.46132946014404297, + "mean_token_accuracy": 0.8583089113235474, + "num_tokens": 441192.0, + "step": 48 + }, + { + "epoch": 0.03723404255319149, + "grad_norm": 6.571533203125, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.9351121783256531, + "mean_token_accuracy": 0.7580878734588623, + "num_tokens": 444277.0, + "step": 49 + }, + { + "epoch": 0.037993920972644375, + "grad_norm": 5.029570579528809, + "learning_rate": 1.2373737373737375e-06, + "loss": 0.6921554803848267, + "mean_token_accuracy": 0.8131166100502014, + "num_tokens": 447646.0, + "step": 50 + }, + { + "epoch": 0.03875379939209726, + "grad_norm": 2.9174208641052246, + "learning_rate": 1.2626262626262629e-06, + "loss": 0.591706395149231, + "mean_token_accuracy": 0.8108617067337036, + "num_tokens": 461397.0, + "step": 51 + }, + { + "epoch": 0.03951367781155015, + "grad_norm": 4.315536022186279, + "learning_rate": 1.287878787878788e-06, + "loss": 0.6986310482025146, + "mean_token_accuracy": 0.7710754871368408, + "num_tokens": 472047.0, + "step": 52 + }, + { + "epoch": 0.04027355623100304, + "grad_norm": 2.6216275691986084, + "learning_rate": 1.3131313131313134e-06, + "loss": 0.5553690791130066, + "mean_token_accuracy": 0.8167896866798401, + "num_tokens": 482795.0, + "step": 53 + }, + { + "epoch": 0.041033434650455926, + "grad_norm": 3.0562477111816406, + "learning_rate": 1.3383838383838385e-06, + "loss": 0.6909202337265015, + "mean_token_accuracy": 0.7859863638877869, + "num_tokens": 494818.0, + "step": 54 + }, + { + "epoch": 0.04179331306990881, + "grad_norm": 2.1420412063598633, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.5415265560150146, + "mean_token_accuracy": 0.818886399269104, + "num_tokens": 513695.0, + "step": 55 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 2.9610488414764404, + "learning_rate": 1.3888888888888892e-06, + "loss": 0.6602212190628052, + "mean_token_accuracy": 0.7830734252929688, + "num_tokens": 523784.0, + "step": 56 + }, + { + "epoch": 0.04331306990881459, + "grad_norm": 2.511972665786743, + "learning_rate": 1.4141414141414143e-06, + "loss": 0.5717809796333313, + "mean_token_accuracy": 0.8053616285324097, + "num_tokens": 546308.0, + "step": 57 + }, + { + "epoch": 0.044072948328267476, + "grad_norm": 3.52642822265625, + "learning_rate": 1.4393939393939396e-06, + "loss": 0.6242594718933105, + "mean_token_accuracy": 0.8162082433700562, + "num_tokens": 552019.0, + "step": 58 + }, + { + "epoch": 0.044832826747720364, + "grad_norm": 3.02362322807312, + "learning_rate": 1.4646464646464648e-06, + "loss": 0.6634255647659302, + "mean_token_accuracy": 0.7682032585144043, + "num_tokens": 560009.0, + "step": 59 + }, + { + "epoch": 0.04559270516717325, + "grad_norm": 2.3910107612609863, + "learning_rate": 1.48989898989899e-06, + "loss": 0.5519146919250488, + "mean_token_accuracy": 0.8270269632339478, + "num_tokens": 571005.0, + "step": 60 + }, + { + "epoch": 0.04635258358662614, + "grad_norm": 4.28154993057251, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.7437789440155029, + "mean_token_accuracy": 0.7782418131828308, + "num_tokens": 574950.0, + "step": 61 + }, + { + "epoch": 0.04711246200607903, + "grad_norm": 3.4078686237335205, + "learning_rate": 1.5404040404040404e-06, + "loss": 0.6345915198326111, + "mean_token_accuracy": 0.7903392314910889, + "num_tokens": 581657.0, + "step": 62 + }, + { + "epoch": 0.047872340425531915, + "grad_norm": 2.6834158897399902, + "learning_rate": 1.565656565656566e-06, + "loss": 0.5981127023696899, + "mean_token_accuracy": 0.7911489605903625, + "num_tokens": 591267.0, + "step": 63 + }, + { + "epoch": 0.0486322188449848, + "grad_norm": 2.1054461002349854, + "learning_rate": 1.590909090909091e-06, + "loss": 0.5523523688316345, + "mean_token_accuracy": 0.8194501399993896, + "num_tokens": 606787.0, + "step": 64 + }, + { + "epoch": 0.04939209726443769, + "grad_norm": 3.322596788406372, + "learning_rate": 1.6161616161616164e-06, + "loss": 0.48417025804519653, + "mean_token_accuracy": 0.8293706178665161, + "num_tokens": 611068.0, + "step": 65 + }, + { + "epoch": 0.05015197568389058, + "grad_norm": 2.302450180053711, + "learning_rate": 1.6414141414141415e-06, + "loss": 0.6498389840126038, + "mean_token_accuracy": 0.7728497385978699, + "num_tokens": 624452.0, + "step": 66 + }, + { + "epoch": 0.050911854103343465, + "grad_norm": 2.680191993713379, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.6347037553787231, + "mean_token_accuracy": 0.8108306527137756, + "num_tokens": 638049.0, + "step": 67 + }, + { + "epoch": 0.05167173252279635, + "grad_norm": 3.0297021865844727, + "learning_rate": 1.6919191919191922e-06, + "loss": 0.5344363451004028, + "mean_token_accuracy": 0.8113535046577454, + "num_tokens": 643892.0, + "step": 68 + }, + { + "epoch": 0.05243161094224924, + "grad_norm": 2.9283676147460938, + "learning_rate": 1.7171717171717173e-06, + "loss": 0.6999260187149048, + "mean_token_accuracy": 0.7782022356987, + "num_tokens": 654418.0, + "step": 69 + }, + { + "epoch": 0.05319148936170213, + "grad_norm": 3.4098572731018066, + "learning_rate": 1.7424242424242427e-06, + "loss": 0.6508946418762207, + "mean_token_accuracy": 0.7942900657653809, + "num_tokens": 659837.0, + "step": 70 + }, + { + "epoch": 0.053951367781155016, + "grad_norm": 2.6756019592285156, + "learning_rate": 1.7676767676767678e-06, + "loss": 0.603486180305481, + "mean_token_accuracy": 0.8015457391738892, + "num_tokens": 668361.0, + "step": 71 + }, + { + "epoch": 0.0547112462006079, + "grad_norm": 2.2630293369293213, + "learning_rate": 1.792929292929293e-06, + "loss": 0.6608274579048157, + "mean_token_accuracy": 0.7753809690475464, + "num_tokens": 679025.0, + "step": 72 + }, + { + "epoch": 0.05547112462006079, + "grad_norm": 2.123962879180908, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.4525482654571533, + "mean_token_accuracy": 0.8425612449645996, + "num_tokens": 688574.0, + "step": 73 + }, + { + "epoch": 0.05623100303951368, + "grad_norm": 7.90519905090332, + "learning_rate": 1.8434343434343434e-06, + "loss": 0.6507195830345154, + "mean_token_accuracy": 0.7714964151382446, + "num_tokens": 694534.0, + "step": 74 + }, + { + "epoch": 0.056990881458966566, + "grad_norm": 2.372203826904297, + "learning_rate": 1.868686868686869e-06, + "loss": 0.4458143413066864, + "mean_token_accuracy": 0.7991449236869812, + "num_tokens": 703114.0, + "step": 75 + }, + { + "epoch": 0.057750759878419454, + "grad_norm": 2.918677568435669, + "learning_rate": 1.8939393939393941e-06, + "loss": 0.5614339113235474, + "mean_token_accuracy": 0.8211464881896973, + "num_tokens": 709038.0, + "step": 76 + }, + { + "epoch": 0.05851063829787234, + "grad_norm": 1.6106709241867065, + "learning_rate": 1.9191919191919192e-06, + "loss": 0.5802098512649536, + "mean_token_accuracy": 0.8055065870285034, + "num_tokens": 730482.0, + "step": 77 + }, + { + "epoch": 0.05927051671732523, + "grad_norm": 2.8069989681243896, + "learning_rate": 1.944444444444445e-06, + "loss": 0.5709059238433838, + "mean_token_accuracy": 0.8024872541427612, + "num_tokens": 751817.0, + "step": 78 + }, + { + "epoch": 0.06003039513677812, + "grad_norm": 2.641667127609253, + "learning_rate": 1.96969696969697e-06, + "loss": 0.6480152606964111, + "mean_token_accuracy": 0.7912271618843079, + "num_tokens": 759236.0, + "step": 79 + }, + { + "epoch": 0.060790273556231005, + "grad_norm": 2.6034350395202637, + "learning_rate": 1.994949494949495e-06, + "loss": 0.5535176396369934, + "mean_token_accuracy": 0.7980542778968811, + "num_tokens": 766496.0, + "step": 80 + }, + { + "epoch": 0.06155015197568389, + "grad_norm": 1.7095069885253906, + "learning_rate": 2.02020202020202e-06, + "loss": 0.4545496106147766, + "mean_token_accuracy": 0.8229660391807556, + "num_tokens": 780124.0, + "step": 81 + }, + { + "epoch": 0.06231003039513678, + "grad_norm": 3.788830518722534, + "learning_rate": 2.0454545454545457e-06, + "loss": 0.6679391264915466, + "mean_token_accuracy": 0.7942397594451904, + "num_tokens": 784555.0, + "step": 82 + }, + { + "epoch": 0.06306990881458967, + "grad_norm": 2.009831666946411, + "learning_rate": 2.070707070707071e-06, + "loss": 0.5067101120948792, + "mean_token_accuracy": 0.8276634216308594, + "num_tokens": 797459.0, + "step": 83 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 2.201627731323242, + "learning_rate": 2.095959595959596e-06, + "loss": 0.5012127161026001, + "mean_token_accuracy": 0.8432504534721375, + "num_tokens": 810817.0, + "step": 84 + }, + { + "epoch": 0.06458966565349544, + "grad_norm": 2.492568016052246, + "learning_rate": 2.1212121212121216e-06, + "loss": 0.6142797470092773, + "mean_token_accuracy": 0.8338661193847656, + "num_tokens": 818191.0, + "step": 85 + }, + { + "epoch": 0.06534954407294832, + "grad_norm": 2.8360862731933594, + "learning_rate": 2.1464646464646467e-06, + "loss": 0.5569300651550293, + "mean_token_accuracy": 0.8121030330657959, + "num_tokens": 825325.0, + "step": 86 + }, + { + "epoch": 0.06610942249240122, + "grad_norm": 2.407548427581787, + "learning_rate": 2.171717171717172e-06, + "loss": 0.6442930102348328, + "mean_token_accuracy": 0.792514443397522, + "num_tokens": 834439.0, + "step": 87 + }, + { + "epoch": 0.0668693009118541, + "grad_norm": 2.340728759765625, + "learning_rate": 2.196969696969697e-06, + "loss": 0.6494365930557251, + "mean_token_accuracy": 0.7746615409851074, + "num_tokens": 843078.0, + "step": 88 + }, + { + "epoch": 0.067629179331307, + "grad_norm": 1.7703697681427002, + "learning_rate": 2.222222222222222e-06, + "loss": 0.598991870880127, + "mean_token_accuracy": 0.7992157340049744, + "num_tokens": 860171.0, + "step": 89 + }, + { + "epoch": 0.06838905775075987, + "grad_norm": 2.5779271125793457, + "learning_rate": 2.2474747474747476e-06, + "loss": 0.5693082809448242, + "mean_token_accuracy": 0.8093700408935547, + "num_tokens": 866669.0, + "step": 90 + }, + { + "epoch": 0.06914893617021277, + "grad_norm": 2.014092206954956, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.5346695780754089, + "mean_token_accuracy": 0.8165590763092041, + "num_tokens": 876698.0, + "step": 91 + }, + { + "epoch": 0.06990881458966565, + "grad_norm": 1.7555919885635376, + "learning_rate": 2.2979797979797983e-06, + "loss": 0.5321458578109741, + "mean_token_accuracy": 0.8166656494140625, + "num_tokens": 889488.0, + "step": 92 + }, + { + "epoch": 0.07066869300911854, + "grad_norm": 1.8631824254989624, + "learning_rate": 2.3232323232323234e-06, + "loss": 0.5246532559394836, + "mean_token_accuracy": 0.8088107705116272, + "num_tokens": 901322.0, + "step": 93 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 3.2332139015197754, + "learning_rate": 2.348484848484849e-06, + "loss": 0.5141711235046387, + "mean_token_accuracy": 0.8382217884063721, + "num_tokens": 905792.0, + "step": 94 + }, + { + "epoch": 0.07218844984802432, + "grad_norm": 1.7806555032730103, + "learning_rate": 2.373737373737374e-06, + "loss": 0.5233149528503418, + "mean_token_accuracy": 0.8101529479026794, + "num_tokens": 917320.0, + "step": 95 + }, + { + "epoch": 0.0729483282674772, + "grad_norm": 1.8169859647750854, + "learning_rate": 2.3989898989898993e-06, + "loss": 0.578881561756134, + "mean_token_accuracy": 0.8044873476028442, + "num_tokens": 931062.0, + "step": 96 + }, + { + "epoch": 0.0737082066869301, + "grad_norm": 4.677402496337891, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.7842556238174438, + "mean_token_accuracy": 0.7579764127731323, + "num_tokens": 934712.0, + "step": 97 + }, + { + "epoch": 0.07446808510638298, + "grad_norm": 2.6987264156341553, + "learning_rate": 2.4494949494949495e-06, + "loss": 0.5669287443161011, + "mean_token_accuracy": 0.8186933994293213, + "num_tokens": 941058.0, + "step": 98 + }, + { + "epoch": 0.07522796352583587, + "grad_norm": 1.6906023025512695, + "learning_rate": 2.474747474747475e-06, + "loss": 0.4976363778114319, + "mean_token_accuracy": 0.8198553323745728, + "num_tokens": 956509.0, + "step": 99 + }, + { + "epoch": 0.07598784194528875, + "grad_norm": 2.7256152629852295, + "learning_rate": 2.5e-06, + "loss": 0.7138420343399048, + "mean_token_accuracy": 0.7752805948257446, + "num_tokens": 963920.0, + "step": 100 + }, + { + "epoch": 0.07674772036474165, + "grad_norm": 2.174870491027832, + "learning_rate": 2.5252525252525258e-06, + "loss": 0.6733541488647461, + "mean_token_accuracy": 0.7745175361633301, + "num_tokens": 975268.0, + "step": 101 + }, + { + "epoch": 0.07750759878419453, + "grad_norm": 1.5587213039398193, + "learning_rate": 2.5505050505050505e-06, + "loss": 0.44223445653915405, + "mean_token_accuracy": 0.8278359174728394, + "num_tokens": 991837.0, + "step": 102 + }, + { + "epoch": 0.07826747720364742, + "grad_norm": 2.181840658187866, + "learning_rate": 2.575757575757576e-06, + "loss": 0.625128448009491, + "mean_token_accuracy": 0.7941786050796509, + "num_tokens": 1004325.0, + "step": 103 + }, + { + "epoch": 0.0790273556231003, + "grad_norm": 1.4986687898635864, + "learning_rate": 2.601010101010101e-06, + "loss": 0.39262527227401733, + "mean_token_accuracy": 0.8412648439407349, + "num_tokens": 1018331.0, + "step": 104 + }, + { + "epoch": 0.0797872340425532, + "grad_norm": 2.3416061401367188, + "learning_rate": 2.6262626262626267e-06, + "loss": 0.5495132803916931, + "mean_token_accuracy": 0.8193322420120239, + "num_tokens": 1026090.0, + "step": 105 + }, + { + "epoch": 0.08054711246200608, + "grad_norm": 3.8168859481811523, + "learning_rate": 2.6515151515151514e-06, + "loss": 0.4898706376552582, + "mean_token_accuracy": 0.8467956185340881, + "num_tokens": 1029955.0, + "step": 106 + }, + { + "epoch": 0.08130699088145897, + "grad_norm": 4.113908767700195, + "learning_rate": 2.676767676767677e-06, + "loss": 0.6189584732055664, + "mean_token_accuracy": 0.8019394278526306, + "num_tokens": 1033598.0, + "step": 107 + }, + { + "epoch": 0.08206686930091185, + "grad_norm": 2.50003981590271, + "learning_rate": 2.7020202020202025e-06, + "loss": 0.6479471921920776, + "mean_token_accuracy": 0.7790026664733887, + "num_tokens": 1042533.0, + "step": 108 + }, + { + "epoch": 0.08282674772036475, + "grad_norm": 1.408934473991394, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.3909248113632202, + "mean_token_accuracy": 0.8477586507797241, + "num_tokens": 1061755.0, + "step": 109 + }, + { + "epoch": 0.08358662613981763, + "grad_norm": 3.360633611679077, + "learning_rate": 2.7525252525252528e-06, + "loss": 0.6952459812164307, + "mean_token_accuracy": 0.777535080909729, + "num_tokens": 1067316.0, + "step": 110 + }, + { + "epoch": 0.08434650455927052, + "grad_norm": 1.8631696701049805, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.5420593023300171, + "mean_token_accuracy": 0.8157662749290466, + "num_tokens": 1079930.0, + "step": 111 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.4308314323425293, + "learning_rate": 2.803030303030303e-06, + "loss": 0.5863882303237915, + "mean_token_accuracy": 0.8206346035003662, + "num_tokens": 1088069.0, + "step": 112 + }, + { + "epoch": 0.0858662613981763, + "grad_norm": 2.922808885574341, + "learning_rate": 2.8282828282828286e-06, + "loss": 0.5217319130897522, + "mean_token_accuracy": 0.8253234028816223, + "num_tokens": 1093607.0, + "step": 113 + }, + { + "epoch": 0.08662613981762918, + "grad_norm": 2.3596107959747314, + "learning_rate": 2.8535353535353537e-06, + "loss": 0.5070714950561523, + "mean_token_accuracy": 0.8258323669433594, + "num_tokens": 1100405.0, + "step": 114 + }, + { + "epoch": 0.08738601823708207, + "grad_norm": 3.0853066444396973, + "learning_rate": 2.8787878787878793e-06, + "loss": 0.591964840888977, + "mean_token_accuracy": 0.8047322630882263, + "num_tokens": 1107535.0, + "step": 115 + }, + { + "epoch": 0.08814589665653495, + "grad_norm": 1.9251092672348022, + "learning_rate": 2.904040404040404e-06, + "loss": 0.5226191878318787, + "mean_token_accuracy": 0.8022720217704773, + "num_tokens": 1118716.0, + "step": 116 + }, + { + "epoch": 0.08890577507598785, + "grad_norm": 1.9692988395690918, + "learning_rate": 2.9292929292929295e-06, + "loss": 0.5462069511413574, + "mean_token_accuracy": 0.8157015442848206, + "num_tokens": 1131917.0, + "step": 117 + }, + { + "epoch": 0.08966565349544073, + "grad_norm": 1.4738909006118774, + "learning_rate": 2.954545454545455e-06, + "loss": 0.4564219117164612, + "mean_token_accuracy": 0.849632978439331, + "num_tokens": 1148534.0, + "step": 118 + }, + { + "epoch": 0.09042553191489362, + "grad_norm": 2.72646164894104, + "learning_rate": 2.97979797979798e-06, + "loss": 0.6654808521270752, + "mean_token_accuracy": 0.7752684354782104, + "num_tokens": 1155438.0, + "step": 119 + }, + { + "epoch": 0.0911854103343465, + "grad_norm": 2.7843852043151855, + "learning_rate": 3.0050505050505054e-06, + "loss": 0.5354680418968201, + "mean_token_accuracy": 0.8196378946304321, + "num_tokens": 1161815.0, + "step": 120 + }, + { + "epoch": 0.0919452887537994, + "grad_norm": 2.8052573204040527, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.6366757154464722, + "mean_token_accuracy": 0.7967483997344971, + "num_tokens": 1168295.0, + "step": 121 + }, + { + "epoch": 0.09270516717325228, + "grad_norm": 2.7462735176086426, + "learning_rate": 3.055555555555556e-06, + "loss": 0.59470534324646, + "mean_token_accuracy": 0.8023771047592163, + "num_tokens": 1174502.0, + "step": 122 + }, + { + "epoch": 0.09346504559270517, + "grad_norm": 2.2743821144104004, + "learning_rate": 3.0808080808080807e-06, + "loss": 0.5720560550689697, + "mean_token_accuracy": 0.8162771463394165, + "num_tokens": 1183615.0, + "step": 123 + }, + { + "epoch": 0.09422492401215805, + "grad_norm": 1.8669533729553223, + "learning_rate": 3.1060606060606063e-06, + "loss": 0.4655378758907318, + "mean_token_accuracy": 0.8360732793807983, + "num_tokens": 1193761.0, + "step": 124 + }, + { + "epoch": 0.09498480243161095, + "grad_norm": 1.7666901350021362, + "learning_rate": 3.131313131313132e-06, + "loss": 0.5524153709411621, + "mean_token_accuracy": 0.8252713680267334, + "num_tokens": 1207870.0, + "step": 125 + }, + { + "epoch": 0.09574468085106383, + "grad_norm": 2.4720070362091064, + "learning_rate": 3.1565656565656566e-06, + "loss": 0.5003011226654053, + "mean_token_accuracy": 0.8491042852401733, + "num_tokens": 1214603.0, + "step": 126 + }, + { + "epoch": 0.09650455927051672, + "grad_norm": 1.6500422954559326, + "learning_rate": 3.181818181818182e-06, + "loss": 0.5137069225311279, + "mean_token_accuracy": 0.8273531198501587, + "num_tokens": 1228717.0, + "step": 127 + }, + { + "epoch": 0.0972644376899696, + "grad_norm": 3.402543067932129, + "learning_rate": 3.2070707070707072e-06, + "loss": 0.708167552947998, + "mean_token_accuracy": 0.7705385684967041, + "num_tokens": 1234361.0, + "step": 128 + }, + { + "epoch": 0.0980243161094225, + "grad_norm": 2.547285795211792, + "learning_rate": 3.232323232323233e-06, + "loss": 0.6020137071609497, + "mean_token_accuracy": 0.7981340289115906, + "num_tokens": 1244169.0, + "step": 129 + }, + { + "epoch": 0.09878419452887538, + "grad_norm": 2.0578792095184326, + "learning_rate": 3.257575757575758e-06, + "loss": 0.4425000250339508, + "mean_token_accuracy": 0.8567807674407959, + "num_tokens": 1252709.0, + "step": 130 + }, + { + "epoch": 0.09954407294832827, + "grad_norm": 1.672614336013794, + "learning_rate": 3.282828282828283e-06, + "loss": 0.4860966205596924, + "mean_token_accuracy": 0.8393139243125916, + "num_tokens": 1265766.0, + "step": 131 + }, + { + "epoch": 0.10030395136778116, + "grad_norm": 3.2560198307037354, + "learning_rate": 3.3080808080808086e-06, + "loss": 0.624736487865448, + "mean_token_accuracy": 0.7875322699546814, + "num_tokens": 1270779.0, + "step": 132 + }, + { + "epoch": 0.10106382978723404, + "grad_norm": 2.4468185901641846, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5062227249145508, + "mean_token_accuracy": 0.8217229843139648, + "num_tokens": 1277113.0, + "step": 133 + }, + { + "epoch": 0.10182370820668693, + "grad_norm": 2.6371328830718994, + "learning_rate": 3.358585858585859e-06, + "loss": 0.477113276720047, + "mean_token_accuracy": 0.8605583906173706, + "num_tokens": 1282514.0, + "step": 134 + }, + { + "epoch": 0.10258358662613981, + "grad_norm": 2.48421311378479, + "learning_rate": 3.3838383838383844e-06, + "loss": 0.40855684876441956, + "mean_token_accuracy": 0.864548921585083, + "num_tokens": 1287859.0, + "step": 135 + }, + { + "epoch": 0.1033434650455927, + "grad_norm": 1.993099331855774, + "learning_rate": 3.409090909090909e-06, + "loss": 0.5913145542144775, + "mean_token_accuracy": 0.8248485922813416, + "num_tokens": 1301074.0, + "step": 136 + }, + { + "epoch": 0.10410334346504559, + "grad_norm": 3.5947680473327637, + "learning_rate": 3.4343434343434347e-06, + "loss": 0.5028599500656128, + "mean_token_accuracy": 0.8367215394973755, + "num_tokens": 1305219.0, + "step": 137 + }, + { + "epoch": 0.10486322188449848, + "grad_norm": 2.5778582096099854, + "learning_rate": 3.45959595959596e-06, + "loss": 0.5297672748565674, + "mean_token_accuracy": 0.8232187032699585, + "num_tokens": 1312482.0, + "step": 138 + }, + { + "epoch": 0.10562310030395136, + "grad_norm": 1.8961588144302368, + "learning_rate": 3.4848484848484854e-06, + "loss": 0.39954107999801636, + "mean_token_accuracy": 0.8605833053588867, + "num_tokens": 1323404.0, + "step": 139 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 1.9687960147857666, + "learning_rate": 3.51010101010101e-06, + "loss": 0.48791587352752686, + "mean_token_accuracy": 0.8200347423553467, + "num_tokens": 1333027.0, + "step": 140 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 2.520242691040039, + "learning_rate": 3.5353535353535356e-06, + "loss": 0.6106002330780029, + "mean_token_accuracy": 0.790692150592804, + "num_tokens": 1340999.0, + "step": 141 + }, + { + "epoch": 0.10790273556231003, + "grad_norm": 3.751617431640625, + "learning_rate": 3.560606060606061e-06, + "loss": 0.48141729831695557, + "mean_token_accuracy": 0.8421382904052734, + "num_tokens": 1344687.0, + "step": 142 + }, + { + "epoch": 0.10866261398176291, + "grad_norm": 2.7101709842681885, + "learning_rate": 3.585858585858586e-06, + "loss": 0.5375241637229919, + "mean_token_accuracy": 0.8061438202857971, + "num_tokens": 1350192.0, + "step": 143 + }, + { + "epoch": 0.1094224924012158, + "grad_norm": 2.583484411239624, + "learning_rate": 3.6111111111111115e-06, + "loss": 0.6492470502853394, + "mean_token_accuracy": 0.7863001823425293, + "num_tokens": 1358148.0, + "step": 144 + }, + { + "epoch": 0.11018237082066869, + "grad_norm": 1.792561650276184, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.48480600118637085, + "mean_token_accuracy": 0.8358709812164307, + "num_tokens": 1369519.0, + "step": 145 + }, + { + "epoch": 0.11094224924012158, + "grad_norm": 2.6480472087860107, + "learning_rate": 3.661616161616162e-06, + "loss": 0.5268933176994324, + "mean_token_accuracy": 0.8214013576507568, + "num_tokens": 1375862.0, + "step": 146 + }, + { + "epoch": 0.11170212765957446, + "grad_norm": 2.3174469470977783, + "learning_rate": 3.686868686868687e-06, + "loss": 0.42517897486686707, + "mean_token_accuracy": 0.8523461222648621, + "num_tokens": 1381546.0, + "step": 147 + }, + { + "epoch": 0.11246200607902736, + "grad_norm": 3.0090949535369873, + "learning_rate": 3.7121212121212124e-06, + "loss": 0.4042336940765381, + "mean_token_accuracy": 0.8670448064804077, + "num_tokens": 1385896.0, + "step": 148 + }, + { + "epoch": 0.11322188449848024, + "grad_norm": 2.4928104877471924, + "learning_rate": 3.737373737373738e-06, + "loss": 0.6498878598213196, + "mean_token_accuracy": 0.7967068552970886, + "num_tokens": 1394169.0, + "step": 149 + }, + { + "epoch": 0.11398176291793313, + "grad_norm": 1.5984913110733032, + "learning_rate": 3.7626262626262627e-06, + "loss": 0.546096920967102, + "mean_token_accuracy": 0.8035850524902344, + "num_tokens": 1408785.0, + "step": 150 + }, + { + "epoch": 0.11474164133738601, + "grad_norm": 2.3663532733917236, + "learning_rate": 3.7878787878787882e-06, + "loss": 0.6111721992492676, + "mean_token_accuracy": 0.8015355467796326, + "num_tokens": 1417510.0, + "step": 151 + }, + { + "epoch": 0.11550151975683891, + "grad_norm": 2.518932819366455, + "learning_rate": 3.8131313131313138e-06, + "loss": 0.5274964570999146, + "mean_token_accuracy": 0.8155480623245239, + "num_tokens": 1424186.0, + "step": 152 + }, + { + "epoch": 0.11626139817629179, + "grad_norm": 2.14353609085083, + "learning_rate": 3.8383838383838385e-06, + "loss": 0.5283297896385193, + "mean_token_accuracy": 0.8275758028030396, + "num_tokens": 1432630.0, + "step": 153 + }, + { + "epoch": 0.11702127659574468, + "grad_norm": 1.8243604898452759, + "learning_rate": 3.863636363636364e-06, + "loss": 0.41854870319366455, + "mean_token_accuracy": 0.8222295045852661, + "num_tokens": 1442691.0, + "step": 154 + }, + { + "epoch": 0.11778115501519756, + "grad_norm": 2.088212251663208, + "learning_rate": 3.88888888888889e-06, + "loss": 0.6062943339347839, + "mean_token_accuracy": 0.8009427785873413, + "num_tokens": 1456890.0, + "step": 155 + }, + { + "epoch": 0.11854103343465046, + "grad_norm": 1.3469511270523071, + "learning_rate": 3.914141414141415e-06, + "loss": 0.4390433728694916, + "mean_token_accuracy": 0.8436295986175537, + "num_tokens": 1475349.0, + "step": 156 + }, + { + "epoch": 0.11930091185410334, + "grad_norm": 3.247023105621338, + "learning_rate": 3.93939393939394e-06, + "loss": 0.6490433216094971, + "mean_token_accuracy": 0.8037861585617065, + "num_tokens": 1479952.0, + "step": 157 + }, + { + "epoch": 0.12006079027355623, + "grad_norm": 2.6610445976257324, + "learning_rate": 3.964646464646465e-06, + "loss": 0.6221826076507568, + "mean_token_accuracy": 0.7848749160766602, + "num_tokens": 1487306.0, + "step": 158 + }, + { + "epoch": 0.12082066869300911, + "grad_norm": 2.3060810565948486, + "learning_rate": 3.98989898989899e-06, + "loss": 0.5052388310432434, + "mean_token_accuracy": 0.8281195759773254, + "num_tokens": 1495367.0, + "step": 159 + }, + { + "epoch": 0.12158054711246201, + "grad_norm": 2.504448652267456, + "learning_rate": 4.015151515151515e-06, + "loss": 0.5005477666854858, + "mean_token_accuracy": 0.8408058881759644, + "num_tokens": 1502069.0, + "step": 160 + }, + { + "epoch": 0.12234042553191489, + "grad_norm": 3.993938446044922, + "learning_rate": 4.04040404040404e-06, + "loss": 0.5569638013839722, + "mean_token_accuracy": 0.8095242977142334, + "num_tokens": 1510224.0, + "step": 161 + }, + { + "epoch": 0.12310030395136778, + "grad_norm": 2.2287683486938477, + "learning_rate": 4.065656565656566e-06, + "loss": 0.524042546749115, + "mean_token_accuracy": 0.8102203607559204, + "num_tokens": 1518364.0, + "step": 162 + }, + { + "epoch": 0.12386018237082067, + "grad_norm": 1.9531738758087158, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.45794573426246643, + "mean_token_accuracy": 0.8560376167297363, + "num_tokens": 1528097.0, + "step": 163 + }, + { + "epoch": 0.12462006079027356, + "grad_norm": 1.5841206312179565, + "learning_rate": 4.116161616161617e-06, + "loss": 0.5420972108840942, + "mean_token_accuracy": 0.8092726469039917, + "num_tokens": 1544119.0, + "step": 164 + }, + { + "epoch": 0.12537993920972645, + "grad_norm": 1.7536218166351318, + "learning_rate": 4.141414141414142e-06, + "loss": 0.554668664932251, + "mean_token_accuracy": 0.8193825483322144, + "num_tokens": 1559140.0, + "step": 165 + }, + { + "epoch": 0.12613981762917933, + "grad_norm": 3.545454740524292, + "learning_rate": 4.166666666666667e-06, + "loss": 0.580947995185852, + "mean_token_accuracy": 0.8286383152008057, + "num_tokens": 1563625.0, + "step": 166 + }, + { + "epoch": 0.12689969604863222, + "grad_norm": 1.6608915328979492, + "learning_rate": 4.191919191919192e-06, + "loss": 0.5523324012756348, + "mean_token_accuracy": 0.8155215978622437, + "num_tokens": 1574945.0, + "step": 167 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 1.4832708835601807, + "learning_rate": 4.217171717171717e-06, + "loss": 0.5133191347122192, + "mean_token_accuracy": 0.8367571830749512, + "num_tokens": 1595865.0, + "step": 168 + }, + { + "epoch": 0.128419452887538, + "grad_norm": 1.7807520627975464, + "learning_rate": 4.242424242424243e-06, + "loss": 0.5131410360336304, + "mean_token_accuracy": 0.8129367232322693, + "num_tokens": 1608723.0, + "step": 169 + }, + { + "epoch": 0.12917933130699089, + "grad_norm": 2.707569122314453, + "learning_rate": 4.267676767676767e-06, + "loss": 0.6129013299942017, + "mean_token_accuracy": 0.7926048040390015, + "num_tokens": 1616136.0, + "step": 170 + }, + { + "epoch": 0.12993920972644377, + "grad_norm": 2.5831644535064697, + "learning_rate": 4.292929292929293e-06, + "loss": 0.6264227628707886, + "mean_token_accuracy": 0.8074911236763, + "num_tokens": 1624228.0, + "step": 171 + }, + { + "epoch": 0.13069908814589665, + "grad_norm": 3.1124250888824463, + "learning_rate": 4.3181818181818185e-06, + "loss": 0.41763827204704285, + "mean_token_accuracy": 0.8565453290939331, + "num_tokens": 1628098.0, + "step": 172 + }, + { + "epoch": 0.13145896656534956, + "grad_norm": 2.3214211463928223, + "learning_rate": 4.343434343434344e-06, + "loss": 0.421974778175354, + "mean_token_accuracy": 0.8391546010971069, + "num_tokens": 1634950.0, + "step": 173 + }, + { + "epoch": 0.13221884498480244, + "grad_norm": 2.1010327339172363, + "learning_rate": 4.368686868686869e-06, + "loss": 0.5307331681251526, + "mean_token_accuracy": 0.8139588236808777, + "num_tokens": 1644132.0, + "step": 174 + }, + { + "epoch": 0.13297872340425532, + "grad_norm": 2.533612012863159, + "learning_rate": 4.393939393939394e-06, + "loss": 0.5626664161682129, + "mean_token_accuracy": 0.8029808402061462, + "num_tokens": 1651637.0, + "step": 175 + }, + { + "epoch": 0.1337386018237082, + "grad_norm": 1.669508457183838, + "learning_rate": 4.41919191919192e-06, + "loss": 0.5351508259773254, + "mean_token_accuracy": 0.8281655311584473, + "num_tokens": 1666776.0, + "step": 176 + }, + { + "epoch": 0.1344984802431611, + "grad_norm": 1.7579659223556519, + "learning_rate": 4.444444444444444e-06, + "loss": 0.5235031247138977, + "mean_token_accuracy": 0.8143284320831299, + "num_tokens": 1679241.0, + "step": 177 + }, + { + "epoch": 0.135258358662614, + "grad_norm": 3.123563528060913, + "learning_rate": 4.46969696969697e-06, + "loss": 0.43051332235336304, + "mean_token_accuracy": 0.8518186211585999, + "num_tokens": 1683317.0, + "step": 178 + }, + { + "epoch": 0.13601823708206687, + "grad_norm": 2.2411575317382812, + "learning_rate": 4.494949494949495e-06, + "loss": 0.5471380949020386, + "mean_token_accuracy": 0.8267596960067749, + "num_tokens": 1691366.0, + "step": 179 + }, + { + "epoch": 0.13677811550151975, + "grad_norm": 2.621973991394043, + "learning_rate": 4.520202020202021e-06, + "loss": 0.5685839653015137, + "mean_token_accuracy": 0.8260642290115356, + "num_tokens": 1698148.0, + "step": 180 + }, + { + "epoch": 0.13753799392097266, + "grad_norm": 2.1553852558135986, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.5703883171081543, + "mean_token_accuracy": 0.8219090700149536, + "num_tokens": 1707225.0, + "step": 181 + }, + { + "epoch": 0.13829787234042554, + "grad_norm": 5.1767897605896, + "learning_rate": 4.5707070707070715e-06, + "loss": 0.32704639434814453, + "mean_token_accuracy": 0.8754568099975586, + "num_tokens": 1712748.0, + "step": 182 + }, + { + "epoch": 0.13905775075987842, + "grad_norm": 2.609168291091919, + "learning_rate": 4.595959595959597e-06, + "loss": 0.5939987301826477, + "mean_token_accuracy": 0.8034975528717041, + "num_tokens": 1719932.0, + "step": 183 + }, + { + "epoch": 0.1398176291793313, + "grad_norm": 2.2059099674224854, + "learning_rate": 4.621212121212122e-06, + "loss": 0.5310720205307007, + "mean_token_accuracy": 0.8177368640899658, + "num_tokens": 1727640.0, + "step": 184 + }, + { + "epoch": 0.1405775075987842, + "grad_norm": 2.6367759704589844, + "learning_rate": 4.646464646464647e-06, + "loss": 0.522086501121521, + "mean_token_accuracy": 0.826233983039856, + "num_tokens": 1733609.0, + "step": 185 + }, + { + "epoch": 0.1413373860182371, + "grad_norm": 3.326732873916626, + "learning_rate": 4.671717171717172e-06, + "loss": 0.4127829074859619, + "mean_token_accuracy": 0.8551101684570312, + "num_tokens": 1737256.0, + "step": 186 + }, + { + "epoch": 0.14209726443768997, + "grad_norm": 1.828412413597107, + "learning_rate": 4.696969696969698e-06, + "loss": 0.5444269180297852, + "mean_token_accuracy": 0.8350818157196045, + "num_tokens": 1750196.0, + "step": 187 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 3.209203004837036, + "learning_rate": 4.722222222222222e-06, + "loss": 0.5087994933128357, + "mean_token_accuracy": 0.8349015712738037, + "num_tokens": 1754836.0, + "step": 188 + }, + { + "epoch": 0.14361702127659576, + "grad_norm": 1.7339166402816772, + "learning_rate": 4.747474747474748e-06, + "loss": 0.5151352286338806, + "mean_token_accuracy": 0.8321266174316406, + "num_tokens": 1766015.0, + "step": 189 + }, + { + "epoch": 0.14437689969604864, + "grad_norm": 2.699068069458008, + "learning_rate": 4.772727272727273e-06, + "loss": 0.4406203031539917, + "mean_token_accuracy": 0.8425000905990601, + "num_tokens": 1771684.0, + "step": 190 + }, + { + "epoch": 0.14513677811550152, + "grad_norm": 2.8117282390594482, + "learning_rate": 4.7979797979797985e-06, + "loss": 0.40428489446640015, + "mean_token_accuracy": 0.8654326796531677, + "num_tokens": 1776301.0, + "step": 191 + }, + { + "epoch": 0.1458966565349544, + "grad_norm": 2.9204647541046143, + "learning_rate": 4.823232323232324e-06, + "loss": 0.4191770553588867, + "mean_token_accuracy": 0.8574687242507935, + "num_tokens": 1781678.0, + "step": 192 + }, + { + "epoch": 0.1466565349544073, + "grad_norm": 2.1648988723754883, + "learning_rate": 4.848484848484849e-06, + "loss": 0.5839012861251831, + "mean_token_accuracy": 0.8053664565086365, + "num_tokens": 1792516.0, + "step": 193 + }, + { + "epoch": 0.1474164133738602, + "grad_norm": 2.3221631050109863, + "learning_rate": 4.873737373737374e-06, + "loss": 0.5037894248962402, + "mean_token_accuracy": 0.8427227139472961, + "num_tokens": 1800192.0, + "step": 194 + }, + { + "epoch": 0.14817629179331307, + "grad_norm": 2.4536430835723877, + "learning_rate": 4.898989898989899e-06, + "loss": 0.42326074838638306, + "mean_token_accuracy": 0.8510633111000061, + "num_tokens": 1806159.0, + "step": 195 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 2.4875805377960205, + "learning_rate": 4.924242424242425e-06, + "loss": 0.539531409740448, + "mean_token_accuracy": 0.8060250282287598, + "num_tokens": 1813392.0, + "step": 196 + }, + { + "epoch": 0.14969604863221886, + "grad_norm": 2.1664798259735107, + "learning_rate": 4.94949494949495e-06, + "loss": 0.42502015829086304, + "mean_token_accuracy": 0.8503251075744629, + "num_tokens": 1821424.0, + "step": 197 + }, + { + "epoch": 0.15045592705167174, + "grad_norm": 2.568808078765869, + "learning_rate": 4.974747474747475e-06, + "loss": 0.5025098323822021, + "mean_token_accuracy": 0.8182311058044434, + "num_tokens": 1827225.0, + "step": 198 + }, + { + "epoch": 0.15121580547112462, + "grad_norm": 1.9116802215576172, + "learning_rate": 5e-06, + "loss": 0.4907258450984955, + "mean_token_accuracy": 0.8310189843177795, + "num_tokens": 1836297.0, + "step": 199 + }, + { + "epoch": 0.1519756838905775, + "grad_norm": 3.150765895843506, + "learning_rate": 4.999999122701883e-06, + "loss": 0.390616774559021, + "mean_token_accuracy": 0.8626647591590881, + "num_tokens": 1839984.0, + "step": 200 + }, + { + "epoch": 0.15273556231003038, + "grad_norm": 3.2229044437408447, + "learning_rate": 4.999996490808146e-06, + "loss": 0.48009657859802246, + "mean_token_accuracy": 0.825214147567749, + "num_tokens": 1844610.0, + "step": 201 + }, + { + "epoch": 0.1534954407294833, + "grad_norm": 1.4473289251327515, + "learning_rate": 4.9999921043206356e-06, + "loss": 0.40135183930397034, + "mean_token_accuracy": 0.8537827730178833, + "num_tokens": 1859573.0, + "step": 202 + }, + { + "epoch": 0.15425531914893617, + "grad_norm": 4.072319507598877, + "learning_rate": 4.999985963242432e-06, + "loss": 0.6158689260482788, + "mean_token_accuracy": 0.8075432777404785, + "num_tokens": 1863147.0, + "step": 203 + }, + { + "epoch": 0.15501519756838905, + "grad_norm": 3.15741229057312, + "learning_rate": 4.999978067577844e-06, + "loss": 0.4603108763694763, + "mean_token_accuracy": 0.8418779373168945, + "num_tokens": 1867201.0, + "step": 204 + }, + { + "epoch": 0.15577507598784193, + "grad_norm": 2.1925418376922607, + "learning_rate": 4.999968417332415e-06, + "loss": 0.5552488565444946, + "mean_token_accuracy": 0.8216016292572021, + "num_tokens": 1874837.0, + "step": 205 + }, + { + "epoch": 0.15653495440729484, + "grad_norm": 2.2518117427825928, + "learning_rate": 4.999957012512916e-06, + "loss": 0.4912569522857666, + "mean_token_accuracy": 0.8284667730331421, + "num_tokens": 1881842.0, + "step": 206 + }, + { + "epoch": 0.15729483282674772, + "grad_norm": 1.8223762512207031, + "learning_rate": 4.999943853127351e-06, + "loss": 0.47709137201309204, + "mean_token_accuracy": 0.8311659097671509, + "num_tokens": 1890805.0, + "step": 207 + }, + { + "epoch": 0.1580547112462006, + "grad_norm": 2.066499948501587, + "learning_rate": 4.999928939184958e-06, + "loss": 0.44794657826423645, + "mean_token_accuracy": 0.8513424396514893, + "num_tokens": 1898264.0, + "step": 208 + }, + { + "epoch": 0.15881458966565348, + "grad_norm": 3.53865909576416, + "learning_rate": 4.999912270696202e-06, + "loss": 0.5978270769119263, + "mean_token_accuracy": 0.8080137968063354, + "num_tokens": 1902435.0, + "step": 209 + }, + { + "epoch": 0.1595744680851064, + "grad_norm": 2.0760679244995117, + "learning_rate": 4.999893847672783e-06, + "loss": 0.5930601358413696, + "mean_token_accuracy": 0.8028650283813477, + "num_tokens": 1912252.0, + "step": 210 + }, + { + "epoch": 0.16033434650455927, + "grad_norm": 2.21551513671875, + "learning_rate": 4.99987367012763e-06, + "loss": 0.6336753964424133, + "mean_token_accuracy": 0.7902286648750305, + "num_tokens": 1922095.0, + "step": 211 + }, + { + "epoch": 0.16109422492401215, + "grad_norm": 1.7654480934143066, + "learning_rate": 4.999851738074904e-06, + "loss": 0.6373403668403625, + "mean_token_accuracy": 0.7802424430847168, + "num_tokens": 1938962.0, + "step": 212 + }, + { + "epoch": 0.16185410334346503, + "grad_norm": 2.852834701538086, + "learning_rate": 4.9998280515300006e-06, + "loss": 0.6418683528900146, + "mean_token_accuracy": 0.7895716428756714, + "num_tokens": 1944668.0, + "step": 213 + }, + { + "epoch": 0.16261398176291794, + "grad_norm": 3.4737212657928467, + "learning_rate": 4.999802610509541e-06, + "loss": 0.6323273181915283, + "mean_token_accuracy": 0.7982614636421204, + "num_tokens": 1949142.0, + "step": 214 + }, + { + "epoch": 0.16337386018237082, + "grad_norm": 3.0802664756774902, + "learning_rate": 4.999775415031381e-06, + "loss": 0.5929068326950073, + "mean_token_accuracy": 0.8112219572067261, + "num_tokens": 1954141.0, + "step": 215 + }, + { + "epoch": 0.1641337386018237, + "grad_norm": 2.9808855056762695, + "learning_rate": 4.999746465114609e-06, + "loss": 0.5556406378746033, + "mean_token_accuracy": 0.8117628693580627, + "num_tokens": 1959406.0, + "step": 216 + }, + { + "epoch": 0.16489361702127658, + "grad_norm": 1.7346166372299194, + "learning_rate": 4.999715760779541e-06, + "loss": 0.5122925043106079, + "mean_token_accuracy": 0.8040724992752075, + "num_tokens": 1971921.0, + "step": 217 + }, + { + "epoch": 0.1656534954407295, + "grad_norm": 1.4183907508850098, + "learning_rate": 4.999683302047729e-06, + "loss": 0.46471893787384033, + "mean_token_accuracy": 0.8381330966949463, + "num_tokens": 1988863.0, + "step": 218 + }, + { + "epoch": 0.16641337386018237, + "grad_norm": 1.6797802448272705, + "learning_rate": 4.999649088941951e-06, + "loss": 0.38348832726478577, + "mean_token_accuracy": 0.8344278931617737, + "num_tokens": 2000003.0, + "step": 219 + }, + { + "epoch": 0.16717325227963525, + "grad_norm": 3.036963939666748, + "learning_rate": 4.999613121486222e-06, + "loss": 0.6062780618667603, + "mean_token_accuracy": 0.8217900991439819, + "num_tokens": 2004813.0, + "step": 220 + }, + { + "epoch": 0.16793313069908813, + "grad_norm": 2.0343217849731445, + "learning_rate": 4.999575399705782e-06, + "loss": 0.5052450895309448, + "mean_token_accuracy": 0.8368623852729797, + "num_tokens": 2013565.0, + "step": 221 + }, + { + "epoch": 0.16869300911854104, + "grad_norm": 2.1162009239196777, + "learning_rate": 4.9995359236271094e-06, + "loss": 0.5169756412506104, + "mean_token_accuracy": 0.8339958190917969, + "num_tokens": 2025763.0, + "step": 222 + }, + { + "epoch": 0.16945288753799392, + "grad_norm": 2.055333375930786, + "learning_rate": 4.9994946932779076e-06, + "loss": 0.6327048540115356, + "mean_token_accuracy": 0.8078711032867432, + "num_tokens": 2037005.0, + "step": 223 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 3.334620475769043, + "learning_rate": 4.999451708687114e-06, + "loss": 0.5688358545303345, + "mean_token_accuracy": 0.8015589714050293, + "num_tokens": 2041473.0, + "step": 224 + }, + { + "epoch": 0.17097264437689969, + "grad_norm": 2.3734676837921143, + "learning_rate": 4.999406969884897e-06, + "loss": 0.5673821568489075, + "mean_token_accuracy": 0.8054057359695435, + "num_tokens": 2049397.0, + "step": 225 + }, + { + "epoch": 0.1717325227963526, + "grad_norm": 1.807358980178833, + "learning_rate": 4.999360476902656e-06, + "loss": 0.4376158118247986, + "mean_token_accuracy": 0.8456039428710938, + "num_tokens": 2058721.0, + "step": 226 + }, + { + "epoch": 0.17249240121580547, + "grad_norm": 3.231638193130493, + "learning_rate": 4.999312229773022e-06, + "loss": 0.5592809915542603, + "mean_token_accuracy": 0.8170154094696045, + "num_tokens": 2063455.0, + "step": 227 + }, + { + "epoch": 0.17325227963525835, + "grad_norm": 2.2717151641845703, + "learning_rate": 4.999262228529855e-06, + "loss": 0.6144396066665649, + "mean_token_accuracy": 0.7948470115661621, + "num_tokens": 2071686.0, + "step": 228 + }, + { + "epoch": 0.17401215805471124, + "grad_norm": 1.4171342849731445, + "learning_rate": 4.99921047320825e-06, + "loss": 0.43680912256240845, + "mean_token_accuracy": 0.84850013256073, + "num_tokens": 2086999.0, + "step": 229 + }, + { + "epoch": 0.17477203647416414, + "grad_norm": 3.162736654281616, + "learning_rate": 4.99915696384453e-06, + "loss": 0.6025407910346985, + "mean_token_accuracy": 0.8042335510253906, + "num_tokens": 2092001.0, + "step": 230 + }, + { + "epoch": 0.17553191489361702, + "grad_norm": 1.8672804832458496, + "learning_rate": 4.99910170047625e-06, + "loss": 0.5843087434768677, + "mean_token_accuracy": 0.8016980886459351, + "num_tokens": 2103372.0, + "step": 231 + }, + { + "epoch": 0.1762917933130699, + "grad_norm": 2.967587471008301, + "learning_rate": 4.999044683142196e-06, + "loss": 0.5123642086982727, + "mean_token_accuracy": 0.8216149806976318, + "num_tokens": 2108008.0, + "step": 232 + }, + { + "epoch": 0.1770516717325228, + "grad_norm": 1.9651981592178345, + "learning_rate": 4.998985911882383e-06, + "loss": 0.5868178606033325, + "mean_token_accuracy": 0.7904198169708252, + "num_tokens": 2119009.0, + "step": 233 + }, + { + "epoch": 0.1778115501519757, + "grad_norm": 2.7785449028015137, + "learning_rate": 4.998925386738063e-06, + "loss": 0.5075510144233704, + "mean_token_accuracy": 0.8280210494995117, + "num_tokens": 2124915.0, + "step": 234 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 2.957470417022705, + "learning_rate": 4.998863107751711e-06, + "loss": 0.5351958274841309, + "mean_token_accuracy": 0.846825122833252, + "num_tokens": 2129905.0, + "step": 235 + }, + { + "epoch": 0.17933130699088146, + "grad_norm": 3.207671880722046, + "learning_rate": 4.99879907496704e-06, + "loss": 0.6209091544151306, + "mean_token_accuracy": 0.789960503578186, + "num_tokens": 2135027.0, + "step": 236 + }, + { + "epoch": 0.18009118541033434, + "grad_norm": 2.018953800201416, + "learning_rate": 4.998733288428987e-06, + "loss": 0.601510763168335, + "mean_token_accuracy": 0.8136930465698242, + "num_tokens": 2147016.0, + "step": 237 + }, + { + "epoch": 0.18085106382978725, + "grad_norm": 2.437281847000122, + "learning_rate": 4.998665748183727e-06, + "loss": 0.5813639163970947, + "mean_token_accuracy": 0.8116716146469116, + "num_tokens": 2155386.0, + "step": 238 + }, + { + "epoch": 0.18161094224924013, + "grad_norm": 1.5708180665969849, + "learning_rate": 4.998596454278661e-06, + "loss": 0.5252395272254944, + "mean_token_accuracy": 0.8193864822387695, + "num_tokens": 2170295.0, + "step": 239 + }, + { + "epoch": 0.182370820668693, + "grad_norm": 1.9921495914459229, + "learning_rate": 4.998525406762422e-06, + "loss": 0.5335029363632202, + "mean_token_accuracy": 0.8120872974395752, + "num_tokens": 2180012.0, + "step": 240 + }, + { + "epoch": 0.1831306990881459, + "grad_norm": 2.6562681198120117, + "learning_rate": 4.998452605684874e-06, + "loss": 0.48021435737609863, + "mean_token_accuracy": 0.8388714790344238, + "num_tokens": 2185607.0, + "step": 241 + }, + { + "epoch": 0.1838905775075988, + "grad_norm": 2.2535853385925293, + "learning_rate": 4.998378051097111e-06, + "loss": 0.5747300386428833, + "mean_token_accuracy": 0.8004639148712158, + "num_tokens": 2194105.0, + "step": 242 + }, + { + "epoch": 0.18465045592705168, + "grad_norm": 1.6151788234710693, + "learning_rate": 4.998301743051459e-06, + "loss": 0.6190565824508667, + "mean_token_accuracy": 0.7816627621650696, + "num_tokens": 2210629.0, + "step": 243 + }, + { + "epoch": 0.18541033434650456, + "grad_norm": 2.1088173389434814, + "learning_rate": 4.9982236816014735e-06, + "loss": 0.4715560972690582, + "mean_token_accuracy": 0.8485721349716187, + "num_tokens": 2218958.0, + "step": 244 + }, + { + "epoch": 0.18617021276595744, + "grad_norm": 2.6168735027313232, + "learning_rate": 4.998143866801941e-06, + "loss": 0.6077103018760681, + "mean_token_accuracy": 0.8057924509048462, + "num_tokens": 2226368.0, + "step": 245 + }, + { + "epoch": 0.18693009118541035, + "grad_norm": 2.5988616943359375, + "learning_rate": 4.99806229870888e-06, + "loss": 0.5021637678146362, + "mean_token_accuracy": 0.8361666202545166, + "num_tokens": 2232485.0, + "step": 246 + }, + { + "epoch": 0.18768996960486323, + "grad_norm": 2.015887498855591, + "learning_rate": 4.9979789773795365e-06, + "loss": 0.4309737980365753, + "mean_token_accuracy": 0.8508044481277466, + "num_tokens": 2240819.0, + "step": 247 + }, + { + "epoch": 0.1884498480243161, + "grad_norm": 2.3115265369415283, + "learning_rate": 4.997893902872389e-06, + "loss": 0.5776500701904297, + "mean_token_accuracy": 0.8079549074172974, + "num_tokens": 2249460.0, + "step": 248 + }, + { + "epoch": 0.189209726443769, + "grad_norm": 1.7387021780014038, + "learning_rate": 4.997807075247147e-06, + "loss": 0.430944561958313, + "mean_token_accuracy": 0.8483544588088989, + "num_tokens": 2259124.0, + "step": 249 + }, + { + "epoch": 0.1899696048632219, + "grad_norm": 1.6378381252288818, + "learning_rate": 4.997718494564747e-06, + "loss": 0.4123363792896271, + "mean_token_accuracy": 0.8557409644126892, + "num_tokens": 2269899.0, + "step": 250 + }, + { + "epoch": 0.19072948328267478, + "grad_norm": 1.336282730102539, + "learning_rate": 4.997628160887361e-06, + "loss": 0.502329409122467, + "mean_token_accuracy": 0.8186938166618347, + "num_tokens": 2292821.0, + "step": 251 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 3.3335583209991455, + "learning_rate": 4.997536074278388e-06, + "loss": 0.584446907043457, + "mean_token_accuracy": 0.8062717318534851, + "num_tokens": 2297175.0, + "step": 252 + }, + { + "epoch": 0.19224924012158054, + "grad_norm": 2.246727228164673, + "learning_rate": 4.9974422348024565e-06, + "loss": 0.5683060884475708, + "mean_token_accuracy": 0.8193703293800354, + "num_tokens": 2305456.0, + "step": 253 + }, + { + "epoch": 0.19300911854103345, + "grad_norm": 2.3520865440368652, + "learning_rate": 4.997346642525429e-06, + "loss": 0.4724946618080139, + "mean_token_accuracy": 0.8426719307899475, + "num_tokens": 2312241.0, + "step": 254 + }, + { + "epoch": 0.19376899696048633, + "grad_norm": 2.7115702629089355, + "learning_rate": 4.9972492975143936e-06, + "loss": 0.5019032955169678, + "mean_token_accuracy": 0.8253573179244995, + "num_tokens": 2318094.0, + "step": 255 + }, + { + "epoch": 0.1945288753799392, + "grad_norm": 1.705528974533081, + "learning_rate": 4.997150199837671e-06, + "loss": 0.45588475465774536, + "mean_token_accuracy": 0.836666464805603, + "num_tokens": 2329025.0, + "step": 256 + }, + { + "epoch": 0.1952887537993921, + "grad_norm": 2.161400318145752, + "learning_rate": 4.997049349564814e-06, + "loss": 0.5170183777809143, + "mean_token_accuracy": 0.8287534117698669, + "num_tokens": 2337448.0, + "step": 257 + }, + { + "epoch": 0.196048632218845, + "grad_norm": 2.629669189453125, + "learning_rate": 4.996946746766602e-06, + "loss": 0.44650501012802124, + "mean_token_accuracy": 0.850114107131958, + "num_tokens": 2343207.0, + "step": 258 + }, + { + "epoch": 0.19680851063829788, + "grad_norm": 1.6735503673553467, + "learning_rate": 4.996842391515045e-06, + "loss": 0.5247820019721985, + "mean_token_accuracy": 0.8285071849822998, + "num_tokens": 2356801.0, + "step": 259 + }, + { + "epoch": 0.19756838905775076, + "grad_norm": 1.2753115892410278, + "learning_rate": 4.996736283883382e-06, + "loss": 0.41870927810668945, + "mean_token_accuracy": 0.8448047637939453, + "num_tokens": 2377306.0, + "step": 260 + }, + { + "epoch": 0.19832826747720364, + "grad_norm": 2.6947314739227295, + "learning_rate": 4.9966284239460875e-06, + "loss": 0.5059205889701843, + "mean_token_accuracy": 0.8430814743041992, + "num_tokens": 2383352.0, + "step": 261 + }, + { + "epoch": 0.19908814589665655, + "grad_norm": 2.0509963035583496, + "learning_rate": 4.996518811778858e-06, + "loss": 0.4565388560295105, + "mean_token_accuracy": 0.8453130722045898, + "num_tokens": 2391149.0, + "step": 262 + }, + { + "epoch": 0.19984802431610943, + "grad_norm": 2.1856348514556885, + "learning_rate": 4.996407447458626e-06, + "loss": 0.531380832195282, + "mean_token_accuracy": 0.8387004137039185, + "num_tokens": 2399875.0, + "step": 263 + }, + { + "epoch": 0.2006079027355623, + "grad_norm": 2.7348573207855225, + "learning_rate": 4.99629433106355e-06, + "loss": 0.5242817401885986, + "mean_token_accuracy": 0.8177423477172852, + "num_tokens": 2406586.0, + "step": 264 + }, + { + "epoch": 0.2013677811550152, + "grad_norm": 1.76587975025177, + "learning_rate": 4.99617946267302e-06, + "loss": 0.49298471212387085, + "mean_token_accuracy": 0.8271149396896362, + "num_tokens": 2418683.0, + "step": 265 + }, + { + "epoch": 0.20212765957446807, + "grad_norm": 2.8129730224609375, + "learning_rate": 4.996062842367655e-06, + "loss": 0.46420302987098694, + "mean_token_accuracy": 0.8453244566917419, + "num_tokens": 2422929.0, + "step": 266 + }, + { + "epoch": 0.20288753799392098, + "grad_norm": 2.575744152069092, + "learning_rate": 4.9959444702293025e-06, + "loss": 0.43208545446395874, + "mean_token_accuracy": 0.8494843244552612, + "num_tokens": 2429567.0, + "step": 267 + }, + { + "epoch": 0.20364741641337386, + "grad_norm": 2.7586750984191895, + "learning_rate": 4.995824346341041e-06, + "loss": 0.4390473961830139, + "mean_token_accuracy": 0.8348895311355591, + "num_tokens": 2434700.0, + "step": 268 + }, + { + "epoch": 0.20440729483282674, + "grad_norm": 1.972145438194275, + "learning_rate": 4.99570247078718e-06, + "loss": 0.6219544410705566, + "mean_token_accuracy": 0.7939999103546143, + "num_tokens": 2447007.0, + "step": 269 + }, + { + "epoch": 0.20516717325227962, + "grad_norm": 2.2963485717773438, + "learning_rate": 4.995578843653255e-06, + "loss": 0.5008970499038696, + "mean_token_accuracy": 0.8255308866500854, + "num_tokens": 2453936.0, + "step": 270 + }, + { + "epoch": 0.20592705167173253, + "grad_norm": 1.8897721767425537, + "learning_rate": 4.995453465026033e-06, + "loss": 0.5436089038848877, + "mean_token_accuracy": 0.819086492061615, + "num_tokens": 2464494.0, + "step": 271 + }, + { + "epoch": 0.2066869300911854, + "grad_norm": 2.319728374481201, + "learning_rate": 4.995326334993508e-06, + "loss": 0.5136368870735168, + "mean_token_accuracy": 0.820817232131958, + "num_tokens": 2470938.0, + "step": 272 + }, + { + "epoch": 0.2074468085106383, + "grad_norm": 2.230414390563965, + "learning_rate": 4.9951974536449055e-06, + "loss": 0.5272846817970276, + "mean_token_accuracy": 0.8203279972076416, + "num_tokens": 2478629.0, + "step": 273 + }, + { + "epoch": 0.20820668693009117, + "grad_norm": 3.401937484741211, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.4389592111110687, + "mean_token_accuracy": 0.8647899031639099, + "num_tokens": 2482193.0, + "step": 274 + }, + { + "epoch": 0.20896656534954408, + "grad_norm": 2.1278507709503174, + "learning_rate": 4.994934437362513e-06, + "loss": 0.598863422870636, + "mean_token_accuracy": 0.7945119738578796, + "num_tokens": 2492465.0, + "step": 275 + }, + { + "epoch": 0.20972644376899696, + "grad_norm": 1.9259960651397705, + "learning_rate": 4.994800302613318e-06, + "loss": 0.49520939588546753, + "mean_token_accuracy": 0.8371536135673523, + "num_tokens": 2500825.0, + "step": 276 + }, + { + "epoch": 0.21048632218844984, + "grad_norm": 2.346418857574463, + "learning_rate": 4.994664416917236e-06, + "loss": 0.5412614345550537, + "mean_token_accuracy": 0.810661792755127, + "num_tokens": 2509513.0, + "step": 277 + }, + { + "epoch": 0.21124620060790272, + "grad_norm": 1.3092039823532104, + "learning_rate": 4.994526780369636e-06, + "loss": 0.46305379271507263, + "mean_token_accuracy": 0.8358527421951294, + "num_tokens": 2531405.0, + "step": 278 + }, + { + "epoch": 0.21200607902735563, + "grad_norm": 2.924611806869507, + "learning_rate": 4.9943873930671175e-06, + "loss": 0.6134544610977173, + "mean_token_accuracy": 0.7947378754615784, + "num_tokens": 2536744.0, + "step": 279 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 2.8290598392486572, + "learning_rate": 4.994246255107506e-06, + "loss": 0.465520441532135, + "mean_token_accuracy": 0.8440108299255371, + "num_tokens": 2541184.0, + "step": 280 + }, + { + "epoch": 0.2135258358662614, + "grad_norm": 3.8081259727478027, + "learning_rate": 4.994103366589859e-06, + "loss": 0.43394139409065247, + "mean_token_accuracy": 0.8579148054122925, + "num_tokens": 2545395.0, + "step": 281 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 1.7994529008865356, + "learning_rate": 4.993958727614462e-06, + "loss": 0.5076484680175781, + "mean_token_accuracy": 0.8270803093910217, + "num_tokens": 2556541.0, + "step": 282 + }, + { + "epoch": 0.21504559270516718, + "grad_norm": 2.5582659244537354, + "learning_rate": 4.993812338282826e-06, + "loss": 0.4453684389591217, + "mean_token_accuracy": 0.8488293886184692, + "num_tokens": 2562949.0, + "step": 283 + }, + { + "epoch": 0.21580547112462006, + "grad_norm": 1.6448938846588135, + "learning_rate": 4.993664198697694e-06, + "loss": 0.461971640586853, + "mean_token_accuracy": 0.824763298034668, + "num_tokens": 2576407.0, + "step": 284 + }, + { + "epoch": 0.21656534954407294, + "grad_norm": 2.1264469623565674, + "learning_rate": 4.993514308963037e-06, + "loss": 0.6241602897644043, + "mean_token_accuracy": 0.7916014790534973, + "num_tokens": 2585695.0, + "step": 285 + }, + { + "epoch": 0.21732522796352582, + "grad_norm": 3.629991292953491, + "learning_rate": 4.993362669184051e-06, + "loss": 0.610355019569397, + "mean_token_accuracy": 0.7847568988800049, + "num_tokens": 2589778.0, + "step": 286 + }, + { + "epoch": 0.21808510638297873, + "grad_norm": 1.9070756435394287, + "learning_rate": 4.993209279467164e-06, + "loss": 0.5513623952865601, + "mean_token_accuracy": 0.7911607027053833, + "num_tokens": 2600920.0, + "step": 287 + }, + { + "epoch": 0.2188449848024316, + "grad_norm": 1.761062741279602, + "learning_rate": 4.993054139920031e-06, + "loss": 0.4579957127571106, + "mean_token_accuracy": 0.8189530372619629, + "num_tokens": 2611856.0, + "step": 288 + }, + { + "epoch": 0.2196048632218845, + "grad_norm": 1.7264713048934937, + "learning_rate": 4.992897250651535e-06, + "loss": 0.5871305465698242, + "mean_token_accuracy": 0.7918527126312256, + "num_tokens": 2624730.0, + "step": 289 + }, + { + "epoch": 0.22036474164133737, + "grad_norm": 1.7455977201461792, + "learning_rate": 4.992738611771787e-06, + "loss": 0.5475119948387146, + "mean_token_accuracy": 0.8226917386054993, + "num_tokens": 2635705.0, + "step": 290 + }, + { + "epoch": 0.22112462006079028, + "grad_norm": 2.095095157623291, + "learning_rate": 4.992578223392124e-06, + "loss": 0.5952225923538208, + "mean_token_accuracy": 0.8078469038009644, + "num_tokens": 2643954.0, + "step": 291 + }, + { + "epoch": 0.22188449848024316, + "grad_norm": 2.994664192199707, + "learning_rate": 4.992416085625115e-06, + "loss": 0.5432442426681519, + "mean_token_accuracy": 0.8329008221626282, + "num_tokens": 2648800.0, + "step": 292 + }, + { + "epoch": 0.22264437689969604, + "grad_norm": 2.796790361404419, + "learning_rate": 4.992252198584554e-06, + "loss": 0.5168961882591248, + "mean_token_accuracy": 0.8393474817276001, + "num_tokens": 2653546.0, + "step": 293 + }, + { + "epoch": 0.22340425531914893, + "grad_norm": 1.8610522747039795, + "learning_rate": 4.992086562385462e-06, + "loss": 0.5728024244308472, + "mean_token_accuracy": 0.797406792640686, + "num_tokens": 2667483.0, + "step": 294 + }, + { + "epoch": 0.22416413373860183, + "grad_norm": 1.695472002029419, + "learning_rate": 4.9919191771440905e-06, + "loss": 0.5460028648376465, + "mean_token_accuracy": 0.8123016357421875, + "num_tokens": 2683574.0, + "step": 295 + }, + { + "epoch": 0.22492401215805471, + "grad_norm": 2.8627376556396484, + "learning_rate": 4.9917500429779165e-06, + "loss": 0.5566985011100769, + "mean_token_accuracy": 0.815531313419342, + "num_tokens": 2688985.0, + "step": 296 + }, + { + "epoch": 0.2256838905775076, + "grad_norm": 2.73323655128479, + "learning_rate": 4.991579160005644e-06, + "loss": 0.48197102546691895, + "mean_token_accuracy": 0.8471829295158386, + "num_tokens": 2694799.0, + "step": 297 + }, + { + "epoch": 0.22644376899696048, + "grad_norm": 1.8436161279678345, + "learning_rate": 4.991406528347206e-06, + "loss": 0.4528339207172394, + "mean_token_accuracy": 0.8603188395500183, + "num_tokens": 2707321.0, + "step": 298 + }, + { + "epoch": 0.22720364741641338, + "grad_norm": 2.6231515407562256, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.5916541814804077, + "mean_token_accuracy": 0.8050242066383362, + "num_tokens": 2714233.0, + "step": 299 + }, + { + "epoch": 0.22796352583586627, + "grad_norm": 3.08776593208313, + "learning_rate": 4.991056019457697e-06, + "loss": 0.4860580563545227, + "mean_token_accuracy": 0.8464088439941406, + "num_tokens": 2718443.0, + "step": 300 + }, + { + "epoch": 0.22872340425531915, + "grad_norm": 2.2537803649902344, + "learning_rate": 4.990878142472628e-06, + "loss": 0.5158311128616333, + "mean_token_accuracy": 0.824694812297821, + "num_tokens": 2726158.0, + "step": 301 + }, + { + "epoch": 0.22948328267477203, + "grad_norm": 2.1122705936431885, + "learning_rate": 4.990698517293394e-06, + "loss": 0.495265394449234, + "mean_token_accuracy": 0.8343238830566406, + "num_tokens": 2735022.0, + "step": 302 + }, + { + "epoch": 0.23024316109422494, + "grad_norm": 3.5503528118133545, + "learning_rate": 4.9905171440460645e-06, + "loss": 0.46063232421875, + "mean_token_accuracy": 0.8420047760009766, + "num_tokens": 2738550.0, + "step": 303 + }, + { + "epoch": 0.23100303951367782, + "grad_norm": 3.9858486652374268, + "learning_rate": 4.990334022857932e-06, + "loss": 0.5832710266113281, + "mean_token_accuracy": 0.8144199848175049, + "num_tokens": 2741720.0, + "step": 304 + }, + { + "epoch": 0.2317629179331307, + "grad_norm": 2.407231330871582, + "learning_rate": 4.990149153857519e-06, + "loss": 0.4692630171775818, + "mean_token_accuracy": 0.8429223299026489, + "num_tokens": 2748693.0, + "step": 305 + }, + { + "epoch": 0.23252279635258358, + "grad_norm": 1.6996397972106934, + "learning_rate": 4.989962537174573e-06, + "loss": 0.49143946170806885, + "mean_token_accuracy": 0.8340128064155579, + "num_tokens": 2761254.0, + "step": 306 + }, + { + "epoch": 0.23328267477203649, + "grad_norm": 3.746432065963745, + "learning_rate": 4.989774172940071e-06, + "loss": 0.6282026767730713, + "mean_token_accuracy": 0.775698184967041, + "num_tokens": 2765115.0, + "step": 307 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 2.212872266769409, + "learning_rate": 4.989584061286211e-06, + "loss": 0.5193763971328735, + "mean_token_accuracy": 0.8168246746063232, + "num_tokens": 2772345.0, + "step": 308 + }, + { + "epoch": 0.23480243161094225, + "grad_norm": 1.752297282218933, + "learning_rate": 4.989392202346423e-06, + "loss": 0.4437984824180603, + "mean_token_accuracy": 0.8451256155967712, + "num_tokens": 2783072.0, + "step": 309 + }, + { + "epoch": 0.23556231003039513, + "grad_norm": 2.386019706726074, + "learning_rate": 4.989198596255361e-06, + "loss": 0.4090752899646759, + "mean_token_accuracy": 0.8480085134506226, + "num_tokens": 2788757.0, + "step": 310 + }, + { + "epoch": 0.23632218844984804, + "grad_norm": 3.9981489181518555, + "learning_rate": 4.989003243148904e-06, + "loss": 0.5149132013320923, + "mean_token_accuracy": 0.8179056644439697, + "num_tokens": 2792096.0, + "step": 311 + }, + { + "epoch": 0.23708206686930092, + "grad_norm": 1.8723100423812866, + "learning_rate": 4.988806143164159e-06, + "loss": 0.4531487822532654, + "mean_token_accuracy": 0.8400167226791382, + "num_tokens": 2802210.0, + "step": 312 + }, + { + "epoch": 0.2378419452887538, + "grad_norm": 2.3415136337280273, + "learning_rate": 4.988607296439459e-06, + "loss": 0.5974439978599548, + "mean_token_accuracy": 0.8035976886749268, + "num_tokens": 2810088.0, + "step": 313 + }, + { + "epoch": 0.23860182370820668, + "grad_norm": 1.5317577123641968, + "learning_rate": 4.98840670311436e-06, + "loss": 0.49247145652770996, + "mean_token_accuracy": 0.8292540311813354, + "num_tokens": 2824005.0, + "step": 314 + }, + { + "epoch": 0.2393617021276596, + "grad_norm": 2.170772075653076, + "learning_rate": 4.988204363329648e-06, + "loss": 0.6359974145889282, + "mean_token_accuracy": 0.7785564661026001, + "num_tokens": 2834680.0, + "step": 315 + }, + { + "epoch": 0.24012158054711247, + "grad_norm": 3.2655932903289795, + "learning_rate": 4.988000277227334e-06, + "loss": 0.5080196857452393, + "mean_token_accuracy": 0.8295877575874329, + "num_tokens": 2838735.0, + "step": 316 + }, + { + "epoch": 0.24088145896656535, + "grad_norm": 3.406589984893799, + "learning_rate": 4.987794444950651e-06, + "loss": 0.3939085006713867, + "mean_token_accuracy": 0.8700719475746155, + "num_tokens": 2842127.0, + "step": 317 + }, + { + "epoch": 0.24164133738601823, + "grad_norm": 1.8211106061935425, + "learning_rate": 4.987586866644061e-06, + "loss": 0.5270540118217468, + "mean_token_accuracy": 0.826683521270752, + "num_tokens": 2853656.0, + "step": 318 + }, + { + "epoch": 0.24240121580547114, + "grad_norm": 1.8429969549179077, + "learning_rate": 4.9873775424532515e-06, + "loss": 0.4705049991607666, + "mean_token_accuracy": 0.8355701565742493, + "num_tokens": 2863513.0, + "step": 319 + }, + { + "epoch": 0.24316109422492402, + "grad_norm": 2.2425320148468018, + "learning_rate": 4.9871664725251314e-06, + "loss": 0.485736608505249, + "mean_token_accuracy": 0.835182785987854, + "num_tokens": 2871556.0, + "step": 320 + }, + { + "epoch": 0.2439209726443769, + "grad_norm": 1.6202056407928467, + "learning_rate": 4.986953657007841e-06, + "loss": 0.4437887370586395, + "mean_token_accuracy": 0.8282591700553894, + "num_tokens": 2884335.0, + "step": 321 + }, + { + "epoch": 0.24468085106382978, + "grad_norm": 1.1027268171310425, + "learning_rate": 4.98673909605074e-06, + "loss": 0.3770800828933716, + "mean_token_accuracy": 0.8325437307357788, + "num_tokens": 2904286.0, + "step": 322 + }, + { + "epoch": 0.2454407294832827, + "grad_norm": 2.3239076137542725, + "learning_rate": 4.986522789804417e-06, + "loss": 0.5387254953384399, + "mean_token_accuracy": 0.806242823600769, + "num_tokens": 2910975.0, + "step": 323 + }, + { + "epoch": 0.24620060790273557, + "grad_norm": 2.243482828140259, + "learning_rate": 4.986304738420684e-06, + "loss": 0.4396553039550781, + "mean_token_accuracy": 0.8561904430389404, + "num_tokens": 2917087.0, + "step": 324 + }, + { + "epoch": 0.24696048632218845, + "grad_norm": 2.537264347076416, + "learning_rate": 4.986084942052577e-06, + "loss": 0.395110160112381, + "mean_token_accuracy": 0.8636915683746338, + "num_tokens": 2921887.0, + "step": 325 + }, + { + "epoch": 0.24772036474164133, + "grad_norm": 2.319399118423462, + "learning_rate": 4.9858634008543574e-06, + "loss": 0.581517219543457, + "mean_token_accuracy": 0.8157487511634827, + "num_tokens": 2928996.0, + "step": 326 + }, + { + "epoch": 0.24848024316109424, + "grad_norm": 1.9787474870681763, + "learning_rate": 4.985640114981513e-06, + "loss": 0.5084106922149658, + "mean_token_accuracy": 0.835221529006958, + "num_tokens": 2940302.0, + "step": 327 + }, + { + "epoch": 0.24924012158054712, + "grad_norm": 2.4783265590667725, + "learning_rate": 4.985415084590752e-06, + "loss": 0.6062222719192505, + "mean_token_accuracy": 0.7885516285896301, + "num_tokens": 2946386.0, + "step": 328 + }, + { + "epoch": 0.25, + "grad_norm": 2.4081411361694336, + "learning_rate": 4.985188309840012e-06, + "loss": 0.5079880356788635, + "mean_token_accuracy": 0.8313904404640198, + "num_tokens": 2952323.0, + "step": 329 + }, + { + "epoch": 0.2507598784194529, + "grad_norm": 2.64993953704834, + "learning_rate": 4.984959790888451e-06, + "loss": 0.5461447834968567, + "mean_token_accuracy": 0.8125468492507935, + "num_tokens": 2958119.0, + "step": 330 + }, + { + "epoch": 0.25151975683890576, + "grad_norm": 2.549734115600586, + "learning_rate": 4.984729527896451e-06, + "loss": 0.5998573303222656, + "mean_token_accuracy": 0.8076666593551636, + "num_tokens": 2964947.0, + "step": 331 + }, + { + "epoch": 0.25227963525835867, + "grad_norm": 3.2185161113739014, + "learning_rate": 4.984497521025622e-06, + "loss": 0.4232945442199707, + "mean_token_accuracy": 0.8543803095817566, + "num_tokens": 2968598.0, + "step": 332 + }, + { + "epoch": 0.2530395136778115, + "grad_norm": 2.588994264602661, + "learning_rate": 4.984263770438793e-06, + "loss": 0.460967481136322, + "mean_token_accuracy": 0.8416207432746887, + "num_tokens": 2974510.0, + "step": 333 + }, + { + "epoch": 0.25379939209726443, + "grad_norm": 2.1373162269592285, + "learning_rate": 4.984028276300021e-06, + "loss": 0.49382102489471436, + "mean_token_accuracy": 0.8388048410415649, + "num_tokens": 2981632.0, + "step": 334 + }, + { + "epoch": 0.25455927051671734, + "grad_norm": 2.2524826526641846, + "learning_rate": 4.983791038774585e-06, + "loss": 0.4947671890258789, + "mean_token_accuracy": 0.8066365122795105, + "num_tokens": 2988736.0, + "step": 335 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 1.7244199514389038, + "learning_rate": 4.983552058028985e-06, + "loss": 0.48096776008605957, + "mean_token_accuracy": 0.830735445022583, + "num_tokens": 3003576.0, + "step": 336 + }, + { + "epoch": 0.2560790273556231, + "grad_norm": 3.0628933906555176, + "learning_rate": 4.9833113342309495e-06, + "loss": 0.6027032136917114, + "mean_token_accuracy": 0.8008694648742676, + "num_tokens": 3009549.0, + "step": 337 + }, + { + "epoch": 0.256838905775076, + "grad_norm": 2.438674211502075, + "learning_rate": 4.983068867549427e-06, + "loss": 0.517090916633606, + "mean_token_accuracy": 0.827893853187561, + "num_tokens": 3015236.0, + "step": 338 + }, + { + "epoch": 0.25759878419452886, + "grad_norm": 2.131535053253174, + "learning_rate": 4.982824658154589e-06, + "loss": 0.6656812429428101, + "mean_token_accuracy": 0.7772425413131714, + "num_tokens": 3028142.0, + "step": 339 + }, + { + "epoch": 0.25835866261398177, + "grad_norm": 2.3206584453582764, + "learning_rate": 4.9825787062178315e-06, + "loss": 0.5757625699043274, + "mean_token_accuracy": 0.8073873519897461, + "num_tokens": 3040996.0, + "step": 340 + }, + { + "epoch": 0.2591185410334346, + "grad_norm": 1.3905521631240845, + "learning_rate": 4.982331011911774e-06, + "loss": 0.4193805456161499, + "mean_token_accuracy": 0.8399466872215271, + "num_tokens": 3061931.0, + "step": 341 + }, + { + "epoch": 0.25987841945288753, + "grad_norm": 2.184173345565796, + "learning_rate": 4.982081575410256e-06, + "loss": 0.4751223921775818, + "mean_token_accuracy": 0.8409271240234375, + "num_tokens": 3069081.0, + "step": 342 + }, + { + "epoch": 0.26063829787234044, + "grad_norm": 3.538764238357544, + "learning_rate": 4.9818303968883445e-06, + "loss": 0.8119601011276245, + "mean_token_accuracy": 0.7442739009857178, + "num_tokens": 3073628.0, + "step": 343 + }, + { + "epoch": 0.2613981762917933, + "grad_norm": 1.8063762187957764, + "learning_rate": 4.981577476522323e-06, + "loss": 0.5615730881690979, + "mean_token_accuracy": 0.8207751512527466, + "num_tokens": 3086596.0, + "step": 344 + }, + { + "epoch": 0.2621580547112462, + "grad_norm": 2.4346961975097656, + "learning_rate": 4.981322814489703e-06, + "loss": 0.5266709327697754, + "mean_token_accuracy": 0.8211277723312378, + "num_tokens": 3092631.0, + "step": 345 + }, + { + "epoch": 0.2629179331306991, + "grad_norm": 1.91289484500885, + "learning_rate": 4.981066410969215e-06, + "loss": 0.5047177672386169, + "mean_token_accuracy": 0.8356877565383911, + "num_tokens": 3101102.0, + "step": 346 + }, + { + "epoch": 0.26367781155015196, + "grad_norm": 2.1495707035064697, + "learning_rate": 4.980808266140813e-06, + "loss": 0.47876280546188354, + "mean_token_accuracy": 0.8364313244819641, + "num_tokens": 3107998.0, + "step": 347 + }, + { + "epoch": 0.26443768996960487, + "grad_norm": 2.5961992740631104, + "learning_rate": 4.9805483801856744e-06, + "loss": 0.5512958765029907, + "mean_token_accuracy": 0.8181467652320862, + "num_tokens": 3113848.0, + "step": 348 + }, + { + "epoch": 0.2651975683890577, + "grad_norm": 3.2828900814056396, + "learning_rate": 4.980286753286196e-06, + "loss": 0.4217945635318756, + "mean_token_accuracy": 0.8617103099822998, + "num_tokens": 3117652.0, + "step": 349 + }, + { + "epoch": 0.26595744680851063, + "grad_norm": 1.425554871559143, + "learning_rate": 4.980023385625996e-06, + "loss": 0.4042487144470215, + "mean_token_accuracy": 0.8492785692214966, + "num_tokens": 3132336.0, + "step": 350 + }, + { + "epoch": 0.26671732522796354, + "grad_norm": 2.933504104614258, + "learning_rate": 4.979758277389919e-06, + "loss": 0.5406704545021057, + "mean_token_accuracy": 0.8035423755645752, + "num_tokens": 3137544.0, + "step": 351 + }, + { + "epoch": 0.2674772036474164, + "grad_norm": 1.9958966970443726, + "learning_rate": 4.9794914287640264e-06, + "loss": 0.5857555270195007, + "mean_token_accuracy": 0.7965140342712402, + "num_tokens": 3149705.0, + "step": 352 + }, + { + "epoch": 0.2682370820668693, + "grad_norm": 2.467694044113159, + "learning_rate": 4.979222839935602e-06, + "loss": 0.6404043436050415, + "mean_token_accuracy": 0.7823755741119385, + "num_tokens": 3158353.0, + "step": 353 + }, + { + "epoch": 0.2689969604863222, + "grad_norm": 2.0102720260620117, + "learning_rate": 4.9789525110931545e-06, + "loss": 0.5681496858596802, + "mean_token_accuracy": 0.8108169436454773, + "num_tokens": 3167121.0, + "step": 354 + }, + { + "epoch": 0.26975683890577506, + "grad_norm": 2.6017866134643555, + "learning_rate": 4.978680442426409e-06, + "loss": 0.6309828162193298, + "mean_token_accuracy": 0.7742617130279541, + "num_tokens": 3175012.0, + "step": 355 + }, + { + "epoch": 0.270516717325228, + "grad_norm": 1.8799268007278442, + "learning_rate": 4.978406634126315e-06, + "loss": 0.524029016494751, + "mean_token_accuracy": 0.8317689895629883, + "num_tokens": 3185331.0, + "step": 356 + }, + { + "epoch": 0.2712765957446808, + "grad_norm": 1.508332371711731, + "learning_rate": 4.978131086385041e-06, + "loss": 0.46656402945518494, + "mean_token_accuracy": 0.8339117765426636, + "num_tokens": 3198813.0, + "step": 357 + }, + { + "epoch": 0.27203647416413373, + "grad_norm": 3.595707654953003, + "learning_rate": 4.977853799395976e-06, + "loss": 0.5101234912872314, + "mean_token_accuracy": 0.8251723051071167, + "num_tokens": 3206557.0, + "step": 358 + }, + { + "epoch": 0.27279635258358664, + "grad_norm": 3.5317916870117188, + "learning_rate": 4.977574773353732e-06, + "loss": 0.5684665441513062, + "mean_token_accuracy": 0.8124493360519409, + "num_tokens": 3210912.0, + "step": 359 + }, + { + "epoch": 0.2735562310030395, + "grad_norm": 2.8606204986572266, + "learning_rate": 4.97729400845414e-06, + "loss": 0.4746384620666504, + "mean_token_accuracy": 0.8195606470108032, + "num_tokens": 3215365.0, + "step": 360 + }, + { + "epoch": 0.2743161094224924, + "grad_norm": 1.8214033842086792, + "learning_rate": 4.977011504894253e-06, + "loss": 0.4842769503593445, + "mean_token_accuracy": 0.82928866147995, + "num_tokens": 3224037.0, + "step": 361 + }, + { + "epoch": 0.2750759878419453, + "grad_norm": 1.628746509552002, + "learning_rate": 4.97672726287234e-06, + "loss": 0.4397493302822113, + "mean_token_accuracy": 0.8606528043746948, + "num_tokens": 3235589.0, + "step": 362 + }, + { + "epoch": 0.27583586626139817, + "grad_norm": 3.557973861694336, + "learning_rate": 4.976441282587894e-06, + "loss": 0.5732032060623169, + "mean_token_accuracy": 0.8041545748710632, + "num_tokens": 3239958.0, + "step": 363 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 1.3467901945114136, + "learning_rate": 4.9761535642416284e-06, + "loss": 0.4525323510169983, + "mean_token_accuracy": 0.8281061053276062, + "num_tokens": 3257703.0, + "step": 364 + }, + { + "epoch": 0.2773556231003039, + "grad_norm": 2.2649986743927, + "learning_rate": 4.9758641080354745e-06, + "loss": 0.5074734687805176, + "mean_token_accuracy": 0.8447474241256714, + "num_tokens": 3264334.0, + "step": 365 + }, + { + "epoch": 0.27811550151975684, + "grad_norm": 2.8667566776275635, + "learning_rate": 4.975572914172581e-06, + "loss": 0.5759559869766235, + "mean_token_accuracy": 0.7976793050765991, + "num_tokens": 3269314.0, + "step": 366 + }, + { + "epoch": 0.27887537993920974, + "grad_norm": 2.2514986991882324, + "learning_rate": 4.975279982857324e-06, + "loss": 0.5786465406417847, + "mean_token_accuracy": 0.8058781623840332, + "num_tokens": 3277324.0, + "step": 367 + }, + { + "epoch": 0.2796352583586626, + "grad_norm": 1.3826723098754883, + "learning_rate": 4.97498531429529e-06, + "loss": 0.40801727771759033, + "mean_token_accuracy": 0.8601310849189758, + "num_tokens": 3290530.0, + "step": 368 + }, + { + "epoch": 0.2803951367781155, + "grad_norm": 2.084092617034912, + "learning_rate": 4.97468890869329e-06, + "loss": 0.47076648473739624, + "mean_token_accuracy": 0.8310186862945557, + "num_tokens": 3298325.0, + "step": 369 + }, + { + "epoch": 0.2811550151975684, + "grad_norm": 1.3467998504638672, + "learning_rate": 4.974390766259353e-06, + "loss": 0.44668465852737427, + "mean_token_accuracy": 0.8275353908538818, + "num_tokens": 3314302.0, + "step": 370 + }, + { + "epoch": 0.28191489361702127, + "grad_norm": 2.5921075344085693, + "learning_rate": 4.974090887202726e-06, + "loss": 0.5343953967094421, + "mean_token_accuracy": 0.8110706806182861, + "num_tokens": 3320963.0, + "step": 371 + }, + { + "epoch": 0.2826747720364742, + "grad_norm": 2.042781352996826, + "learning_rate": 4.973789271733877e-06, + "loss": 0.6293343305587769, + "mean_token_accuracy": 0.7800243496894836, + "num_tokens": 3332742.0, + "step": 372 + }, + { + "epoch": 0.28343465045592703, + "grad_norm": 4.822193145751953, + "learning_rate": 4.973485920064491e-06, + "loss": 0.6256728768348694, + "mean_token_accuracy": 0.7962433099746704, + "num_tokens": 3335872.0, + "step": 373 + }, + { + "epoch": 0.28419452887537994, + "grad_norm": 1.260988473892212, + "learning_rate": 4.973180832407471e-06, + "loss": 0.38731223344802856, + "mean_token_accuracy": 0.8385066986083984, + "num_tokens": 3351884.0, + "step": 374 + }, + { + "epoch": 0.28495440729483285, + "grad_norm": 2.669966697692871, + "learning_rate": 4.97287400897694e-06, + "loss": 0.5594710111618042, + "mean_token_accuracy": 0.8097212314605713, + "num_tokens": 3358197.0, + "step": 375 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 3.0344486236572266, + "learning_rate": 4.972565449988238e-06, + "loss": 0.34449583292007446, + "mean_token_accuracy": 0.8813316822052002, + "num_tokens": 3362133.0, + "step": 376 + }, + { + "epoch": 0.2864741641337386, + "grad_norm": 2.562251091003418, + "learning_rate": 4.972255155657925e-06, + "loss": 0.5331522822380066, + "mean_token_accuracy": 0.8212941288948059, + "num_tokens": 3370346.0, + "step": 377 + }, + { + "epoch": 0.2872340425531915, + "grad_norm": 2.7083740234375, + "learning_rate": 4.9719431262037755e-06, + "loss": 0.5403046011924744, + "mean_token_accuracy": 0.8108335733413696, + "num_tokens": 3375588.0, + "step": 378 + }, + { + "epoch": 0.28799392097264437, + "grad_norm": 1.396430492401123, + "learning_rate": 4.971629361844785e-06, + "loss": 0.4041529893875122, + "mean_token_accuracy": 0.8588063716888428, + "num_tokens": 3390749.0, + "step": 379 + }, + { + "epoch": 0.2887537993920973, + "grad_norm": 1.9872784614562988, + "learning_rate": 4.971313862801166e-06, + "loss": 0.4336993098258972, + "mean_token_accuracy": 0.8511303663253784, + "num_tokens": 3399064.0, + "step": 380 + }, + { + "epoch": 0.28951367781155013, + "grad_norm": 1.9652575254440308, + "learning_rate": 4.9709966292943455e-06, + "loss": 0.4578358232975006, + "mean_token_accuracy": 0.8229440450668335, + "num_tokens": 3407229.0, + "step": 381 + }, + { + "epoch": 0.29027355623100304, + "grad_norm": 1.6626898050308228, + "learning_rate": 4.970677661546972e-06, + "loss": 0.5427594184875488, + "mean_token_accuracy": 0.815427303314209, + "num_tokens": 3422321.0, + "step": 382 + }, + { + "epoch": 0.29103343465045595, + "grad_norm": 3.5265562534332275, + "learning_rate": 4.970356959782909e-06, + "loss": 0.6661460995674133, + "mean_token_accuracy": 0.7856965065002441, + "num_tokens": 3427442.0, + "step": 383 + }, + { + "epoch": 0.2917933130699088, + "grad_norm": 1.667205572128296, + "learning_rate": 4.970034524227239e-06, + "loss": 0.36256325244903564, + "mean_token_accuracy": 0.8711205720901489, + "num_tokens": 3436662.0, + "step": 384 + }, + { + "epoch": 0.2925531914893617, + "grad_norm": 1.3389486074447632, + "learning_rate": 4.969710355106256e-06, + "loss": 0.4282698631286621, + "mean_token_accuracy": 0.838951587677002, + "num_tokens": 3450060.0, + "step": 385 + }, + { + "epoch": 0.2933130699088146, + "grad_norm": 2.5163397789001465, + "learning_rate": 4.969384452647477e-06, + "loss": 0.5176984071731567, + "mean_token_accuracy": 0.8235267996788025, + "num_tokens": 3456990.0, + "step": 386 + }, + { + "epoch": 0.29407294832826747, + "grad_norm": 1.7588495016098022, + "learning_rate": 4.969056817079633e-06, + "loss": 0.49710947275161743, + "mean_token_accuracy": 0.818520724773407, + "num_tokens": 3468098.0, + "step": 387 + }, + { + "epoch": 0.2948328267477204, + "grad_norm": 2.6381046772003174, + "learning_rate": 4.968727448632669e-06, + "loss": 0.4425308108329773, + "mean_token_accuracy": 0.8451643586158752, + "num_tokens": 3472899.0, + "step": 388 + }, + { + "epoch": 0.29559270516717323, + "grad_norm": 1.6345038414001465, + "learning_rate": 4.968396347537751e-06, + "loss": 0.4177059829235077, + "mean_token_accuracy": 0.8498886227607727, + "num_tokens": 3484826.0, + "step": 389 + }, + { + "epoch": 0.29635258358662614, + "grad_norm": 3.0466468334198, + "learning_rate": 4.968063514027258e-06, + "loss": 0.4274463951587677, + "mean_token_accuracy": 0.8387278318405151, + "num_tokens": 3488610.0, + "step": 390 + }, + { + "epoch": 0.29711246200607905, + "grad_norm": 2.6509406566619873, + "learning_rate": 4.967728948334784e-06, + "loss": 0.5401753783226013, + "mean_token_accuracy": 0.8252490162849426, + "num_tokens": 3493657.0, + "step": 391 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 1.6372219324111938, + "learning_rate": 4.967392650695141e-06, + "loss": 0.3862472176551819, + "mean_token_accuracy": 0.8555525541305542, + "num_tokens": 3505588.0, + "step": 392 + }, + { + "epoch": 0.2986322188449848, + "grad_norm": 2.1615452766418457, + "learning_rate": 4.967054621344356e-06, + "loss": 0.57850581407547, + "mean_token_accuracy": 0.8222678899765015, + "num_tokens": 3514396.0, + "step": 393 + }, + { + "epoch": 0.2993920972644377, + "grad_norm": 1.8610916137695312, + "learning_rate": 4.96671486051967e-06, + "loss": 0.5440595149993896, + "mean_token_accuracy": 0.8196715116500854, + "num_tokens": 3523604.0, + "step": 394 + }, + { + "epoch": 0.30015197568389057, + "grad_norm": 2.9585862159729004, + "learning_rate": 4.966373368459542e-06, + "loss": 0.6921588182449341, + "mean_token_accuracy": 0.7816659808158875, + "num_tokens": 3529849.0, + "step": 395 + }, + { + "epoch": 0.3009118541033435, + "grad_norm": 1.9374035596847534, + "learning_rate": 4.966030145403642e-06, + "loss": 0.5494055151939392, + "mean_token_accuracy": 0.8126792907714844, + "num_tokens": 3539529.0, + "step": 396 + }, + { + "epoch": 0.30167173252279633, + "grad_norm": 1.730530023574829, + "learning_rate": 4.965685191592859e-06, + "loss": 0.4271572232246399, + "mean_token_accuracy": 0.8383668661117554, + "num_tokens": 3550972.0, + "step": 397 + }, + { + "epoch": 0.30243161094224924, + "grad_norm": 3.9635560512542725, + "learning_rate": 4.9653385072692935e-06, + "loss": 0.5576210021972656, + "mean_token_accuracy": 0.799404501914978, + "num_tokens": 3554147.0, + "step": 398 + }, + { + "epoch": 0.30319148936170215, + "grad_norm": 2.5731968879699707, + "learning_rate": 4.964990092676263e-06, + "loss": 0.5478942394256592, + "mean_token_accuracy": 0.8220961093902588, + "num_tokens": 3559972.0, + "step": 399 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 2.2096588611602783, + "learning_rate": 4.964639948058297e-06, + "loss": 0.35461270809173584, + "mean_token_accuracy": 0.8640927076339722, + "num_tokens": 3565770.0, + "step": 400 + }, + { + "epoch": 0.3047112462006079, + "grad_norm": 1.7874189615249634, + "learning_rate": 4.964288073661142e-06, + "loss": 0.38849619030952454, + "mean_token_accuracy": 0.8443037271499634, + "num_tokens": 3574514.0, + "step": 401 + }, + { + "epoch": 0.30547112462006076, + "grad_norm": 1.5583146810531616, + "learning_rate": 4.963934469731756e-06, + "loss": 0.48909449577331543, + "mean_token_accuracy": 0.8429768681526184, + "num_tokens": 3585877.0, + "step": 402 + }, + { + "epoch": 0.30623100303951367, + "grad_norm": 3.026599645614624, + "learning_rate": 4.963579136518312e-06, + "loss": 0.5138992071151733, + "mean_token_accuracy": 0.8283728361129761, + "num_tokens": 3590412.0, + "step": 403 + }, + { + "epoch": 0.3069908814589666, + "grad_norm": 2.777505874633789, + "learning_rate": 4.963222074270197e-06, + "loss": 0.6241534948348999, + "mean_token_accuracy": 0.8130464553833008, + "num_tokens": 3596246.0, + "step": 404 + }, + { + "epoch": 0.30775075987841943, + "grad_norm": 2.4772839546203613, + "learning_rate": 4.962863283238011e-06, + "loss": 0.5930814146995544, + "mean_token_accuracy": 0.8036394715309143, + "num_tokens": 3602878.0, + "step": 405 + }, + { + "epoch": 0.30851063829787234, + "grad_norm": 1.5049982070922852, + "learning_rate": 4.962502763673566e-06, + "loss": 0.4903082549571991, + "mean_token_accuracy": 0.8184912204742432, + "num_tokens": 3617018.0, + "step": 406 + }, + { + "epoch": 0.30927051671732525, + "grad_norm": 2.453155040740967, + "learning_rate": 4.96214051582989e-06, + "loss": 0.5138067603111267, + "mean_token_accuracy": 0.8336835503578186, + "num_tokens": 3624188.0, + "step": 407 + }, + { + "epoch": 0.3100303951367781, + "grad_norm": 2.4038336277008057, + "learning_rate": 4.961776539961222e-06, + "loss": 0.5752760171890259, + "mean_token_accuracy": 0.8054730892181396, + "num_tokens": 3634152.0, + "step": 408 + }, + { + "epoch": 0.310790273556231, + "grad_norm": 2.629068374633789, + "learning_rate": 4.961410836323014e-06, + "loss": 0.5580606460571289, + "mean_token_accuracy": 0.8121089935302734, + "num_tokens": 3639528.0, + "step": 409 + }, + { + "epoch": 0.31155015197568386, + "grad_norm": 1.4245928525924683, + "learning_rate": 4.961043405171931e-06, + "loss": 0.5399882793426514, + "mean_token_accuracy": 0.812280535697937, + "num_tokens": 3655744.0, + "step": 410 + }, + { + "epoch": 0.3123100303951368, + "grad_norm": 1.5236459970474243, + "learning_rate": 4.9606742467658505e-06, + "loss": 0.5234690308570862, + "mean_token_accuracy": 0.8188928365707397, + "num_tokens": 3675010.0, + "step": 411 + }, + { + "epoch": 0.3130699088145897, + "grad_norm": 2.27961802482605, + "learning_rate": 4.960303361363863e-06, + "loss": 0.5502505898475647, + "mean_token_accuracy": 0.8161963224411011, + "num_tokens": 3682328.0, + "step": 412 + }, + { + "epoch": 0.31382978723404253, + "grad_norm": 1.554518222808838, + "learning_rate": 4.959930749226269e-06, + "loss": 0.420867919921875, + "mean_token_accuracy": 0.8499157428741455, + "num_tokens": 3694980.0, + "step": 413 + }, + { + "epoch": 0.31458966565349544, + "grad_norm": 2.609218120574951, + "learning_rate": 4.9595564106145825e-06, + "loss": 0.4706704318523407, + "mean_token_accuracy": 0.8412490487098694, + "num_tokens": 3700033.0, + "step": 414 + }, + { + "epoch": 0.31534954407294835, + "grad_norm": 1.5303231477737427, + "learning_rate": 4.959180345791528e-06, + "loss": 0.4668654799461365, + "mean_token_accuracy": 0.8125015497207642, + "num_tokens": 3715012.0, + "step": 415 + }, + { + "epoch": 0.3161094224924012, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.958802555021042e-06, + "loss": 0.4339369237422943, + "mean_token_accuracy": 0.8442851901054382, + "num_tokens": 3733928.0, + "step": 416 + }, + { + "epoch": 0.3168693009118541, + "grad_norm": 2.1240181922912598, + "learning_rate": 4.958423038568274e-06, + "loss": 0.4029104709625244, + "mean_token_accuracy": 0.8627674579620361, + "num_tokens": 3740202.0, + "step": 417 + }, + { + "epoch": 0.31762917933130697, + "grad_norm": 2.00538969039917, + "learning_rate": 4.958041796699583e-06, + "loss": 0.5229607820510864, + "mean_token_accuracy": 0.8282366394996643, + "num_tokens": 3749308.0, + "step": 418 + }, + { + "epoch": 0.3183890577507599, + "grad_norm": 2.6555092334747314, + "learning_rate": 4.957658829682539e-06, + "loss": 0.5344101190567017, + "mean_token_accuracy": 0.8183202743530273, + "num_tokens": 3754595.0, + "step": 419 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 1.7468839883804321, + "learning_rate": 4.9572741377859225e-06, + "loss": 0.5667245984077454, + "mean_token_accuracy": 0.8080123662948608, + "num_tokens": 3765761.0, + "step": 420 + }, + { + "epoch": 0.31990881458966564, + "grad_norm": 2.9612457752227783, + "learning_rate": 4.956887721279726e-06, + "loss": 0.5389559864997864, + "mean_token_accuracy": 0.8019476532936096, + "num_tokens": 3770844.0, + "step": 421 + }, + { + "epoch": 0.32066869300911854, + "grad_norm": 1.842403769493103, + "learning_rate": 4.95649958043515e-06, + "loss": 0.38279837369918823, + "mean_token_accuracy": 0.858866810798645, + "num_tokens": 3778094.0, + "step": 422 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 2.3108131885528564, + "learning_rate": 4.956109715524609e-06, + "loss": 0.5453893542289734, + "mean_token_accuracy": 0.8085013031959534, + "num_tokens": 3785015.0, + "step": 423 + }, + { + "epoch": 0.3221884498480243, + "grad_norm": 3.0326945781707764, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.5550523400306702, + "mean_token_accuracy": 0.8125876188278198, + "num_tokens": 3789830.0, + "step": 424 + }, + { + "epoch": 0.3229483282674772, + "grad_norm": 1.8851977586746216, + "learning_rate": 4.955324814601324e-06, + "loss": 0.4902324974536896, + "mean_token_accuracy": 0.8205406665802002, + "num_tokens": 3799862.0, + "step": 425 + }, + { + "epoch": 0.32370820668693007, + "grad_norm": 2.6018171310424805, + "learning_rate": 4.954929779139455e-06, + "loss": 0.5920133590698242, + "mean_token_accuracy": 0.8340690732002258, + "num_tokens": 3806617.0, + "step": 426 + }, + { + "epoch": 0.324468085106383, + "grad_norm": 2.4283878803253174, + "learning_rate": 4.954533020713367e-06, + "loss": 0.5305854082107544, + "mean_token_accuracy": 0.8137468099594116, + "num_tokens": 3813843.0, + "step": 427 + }, + { + "epoch": 0.3252279635258359, + "grad_norm": 2.667978525161743, + "learning_rate": 4.954134539601519e-06, + "loss": 0.5333638787269592, + "mean_token_accuracy": 0.8402629494667053, + "num_tokens": 3819450.0, + "step": 428 + }, + { + "epoch": 0.32598784194528874, + "grad_norm": 1.7302523851394653, + "learning_rate": 4.953734336083582e-06, + "loss": 0.422895610332489, + "mean_token_accuracy": 0.8709704875946045, + "num_tokens": 3831027.0, + "step": 429 + }, + { + "epoch": 0.32674772036474165, + "grad_norm": 2.427192211151123, + "learning_rate": 4.953332410440434e-06, + "loss": 0.6334598064422607, + "mean_token_accuracy": 0.7817479968070984, + "num_tokens": 3841776.0, + "step": 430 + }, + { + "epoch": 0.32750759878419455, + "grad_norm": 1.460949182510376, + "learning_rate": 4.952928762954161e-06, + "loss": 0.3654777705669403, + "mean_token_accuracy": 0.8780122995376587, + "num_tokens": 3852213.0, + "step": 431 + }, + { + "epoch": 0.3282674772036474, + "grad_norm": 1.9855005741119385, + "learning_rate": 4.952523393908059e-06, + "loss": 0.5117089748382568, + "mean_token_accuracy": 0.811911404132843, + "num_tokens": 3861176.0, + "step": 432 + }, + { + "epoch": 0.3290273556231003, + "grad_norm": 2.2653207778930664, + "learning_rate": 4.952116303586631e-06, + "loss": 0.42514950037002563, + "mean_token_accuracy": 0.8448518514633179, + "num_tokens": 3867164.0, + "step": 433 + }, + { + "epoch": 0.32978723404255317, + "grad_norm": 1.9780964851379395, + "learning_rate": 4.951707492275589e-06, + "loss": 0.5095293521881104, + "mean_token_accuracy": 0.8262748718261719, + "num_tokens": 3876406.0, + "step": 434 + }, + { + "epoch": 0.3305471124620061, + "grad_norm": 2.9480233192443848, + "learning_rate": 4.951296960261853e-06, + "loss": 0.3494448959827423, + "mean_token_accuracy": 0.8781307935714722, + "num_tokens": 3880298.0, + "step": 435 + }, + { + "epoch": 0.331306990881459, + "grad_norm": 2.335571527481079, + "learning_rate": 4.95088470783355e-06, + "loss": 0.5456914901733398, + "mean_token_accuracy": 0.816297173500061, + "num_tokens": 3886487.0, + "step": 436 + }, + { + "epoch": 0.33206686930091184, + "grad_norm": 2.3046419620513916, + "learning_rate": 4.950470735280013e-06, + "loss": 0.4835948944091797, + "mean_token_accuracy": 0.8539175391197205, + "num_tokens": 3892706.0, + "step": 437 + }, + { + "epoch": 0.33282674772036475, + "grad_norm": 2.44047474861145, + "learning_rate": 4.950055042891786e-06, + "loss": 0.5154092907905579, + "mean_token_accuracy": 0.8579919338226318, + "num_tokens": 3899532.0, + "step": 438 + }, + { + "epoch": 0.33358662613981765, + "grad_norm": 4.826764106750488, + "learning_rate": 4.949637630960618e-06, + "loss": 0.5270259976387024, + "mean_token_accuracy": 0.8172192573547363, + "num_tokens": 3902260.0, + "step": 439 + }, + { + "epoch": 0.3343465045592705, + "grad_norm": 2.001574754714966, + "learning_rate": 4.949218499779462e-06, + "loss": 0.5413002967834473, + "mean_token_accuracy": 0.8162837028503418, + "num_tokens": 3911706.0, + "step": 440 + }, + { + "epoch": 0.3351063829787234, + "grad_norm": 1.7998944520950317, + "learning_rate": 4.948797649642484e-06, + "loss": 0.5131614208221436, + "mean_token_accuracy": 0.8367440700531006, + "num_tokens": 3923490.0, + "step": 441 + }, + { + "epoch": 0.33586626139817627, + "grad_norm": 3.4566173553466797, + "learning_rate": 4.94837508084505e-06, + "loss": 0.7258909940719604, + "mean_token_accuracy": 0.771377444267273, + "num_tokens": 3928099.0, + "step": 442 + }, + { + "epoch": 0.3366261398176292, + "grad_norm": 2.0040442943573, + "learning_rate": 4.9479507936837364e-06, + "loss": 0.482135534286499, + "mean_token_accuracy": 0.8339327573776245, + "num_tokens": 3937328.0, + "step": 443 + }, + { + "epoch": 0.3373860182370821, + "grad_norm": 2.949502944946289, + "learning_rate": 4.947524788456325e-06, + "loss": 0.6474795341491699, + "mean_token_accuracy": 0.7951677441596985, + "num_tokens": 3942529.0, + "step": 444 + }, + { + "epoch": 0.33814589665653494, + "grad_norm": 1.5528364181518555, + "learning_rate": 4.947097065461801e-06, + "loss": 0.48791584372520447, + "mean_token_accuracy": 0.8425545692443848, + "num_tokens": 3955200.0, + "step": 445 + }, + { + "epoch": 0.33890577507598785, + "grad_norm": 1.8813284635543823, + "learning_rate": 4.946667625000358e-06, + "loss": 0.45922309160232544, + "mean_token_accuracy": 0.8206527233123779, + "num_tokens": 3962975.0, + "step": 446 + }, + { + "epoch": 0.33966565349544076, + "grad_norm": 1.7157847881317139, + "learning_rate": 4.946236467373392e-06, + "loss": 0.5454182028770447, + "mean_token_accuracy": 0.8049604892730713, + "num_tokens": 3973956.0, + "step": 447 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 2.008857250213623, + "learning_rate": 4.945803592883509e-06, + "loss": 0.5151860117912292, + "mean_token_accuracy": 0.8262045383453369, + "num_tokens": 3982853.0, + "step": 448 + }, + { + "epoch": 0.3411854103343465, + "grad_norm": 1.6632496118545532, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.42710691690444946, + "mean_token_accuracy": 0.8521314859390259, + "num_tokens": 3993838.0, + "step": 449 + }, + { + "epoch": 0.34194528875379937, + "grad_norm": 1.365234375, + "learning_rate": 4.944932694531423e-06, + "loss": 0.5172526836395264, + "mean_token_accuracy": 0.8277045488357544, + "num_tokens": 4014179.0, + "step": 450 + }, + { + "epoch": 0.3427051671732523, + "grad_norm": 1.7610243558883667, + "learning_rate": 4.94449467128045e-06, + "loss": 0.42104798555374146, + "mean_token_accuracy": 0.8552065491676331, + "num_tokens": 4023663.0, + "step": 451 + }, + { + "epoch": 0.3434650455927052, + "grad_norm": 2.3732354640960693, + "learning_rate": 4.944054932389018e-06, + "loss": 0.5471175909042358, + "mean_token_accuracy": 0.8487317562103271, + "num_tokens": 4030100.0, + "step": 452 + }, + { + "epoch": 0.34422492401215804, + "grad_norm": 1.5973623991012573, + "learning_rate": 4.943613478165753e-06, + "loss": 0.419813871383667, + "mean_token_accuracy": 0.8484025001525879, + "num_tokens": 4041124.0, + "step": 453 + }, + { + "epoch": 0.34498480243161095, + "grad_norm": 2.966381549835205, + "learning_rate": 4.943170308920484e-06, + "loss": 0.5370652675628662, + "mean_token_accuracy": 0.8439491987228394, + "num_tokens": 4045675.0, + "step": 454 + }, + { + "epoch": 0.34574468085106386, + "grad_norm": 2.5097248554229736, + "learning_rate": 4.9427254249642445e-06, + "loss": 0.5776349306106567, + "mean_token_accuracy": 0.8060523867607117, + "num_tokens": 4053250.0, + "step": 455 + }, + { + "epoch": 0.3465045592705167, + "grad_norm": 1.6779125928878784, + "learning_rate": 4.942278826609272e-06, + "loss": 0.5245476961135864, + "mean_token_accuracy": 0.8168526887893677, + "num_tokens": 4064106.0, + "step": 456 + }, + { + "epoch": 0.3472644376899696, + "grad_norm": 1.5945546627044678, + "learning_rate": 4.9418305141690045e-06, + "loss": 0.4972047209739685, + "mean_token_accuracy": 0.8257735967636108, + "num_tokens": 4077687.0, + "step": 457 + }, + { + "epoch": 0.34802431610942247, + "grad_norm": 2.864778757095337, + "learning_rate": 4.9413804879580865e-06, + "loss": 0.5372499823570251, + "mean_token_accuracy": 0.8423776626586914, + "num_tokens": 4082632.0, + "step": 458 + }, + { + "epoch": 0.3487841945288754, + "grad_norm": 1.4797078371047974, + "learning_rate": 4.940928748292363e-06, + "loss": 0.5903409719467163, + "mean_token_accuracy": 0.8061295747756958, + "num_tokens": 4104218.0, + "step": 459 + }, + { + "epoch": 0.3495440729483283, + "grad_norm": 2.4376983642578125, + "learning_rate": 4.940475295488882e-06, + "loss": 0.4534894824028015, + "mean_token_accuracy": 0.8395825028419495, + "num_tokens": 4110530.0, + "step": 460 + }, + { + "epoch": 0.35030395136778114, + "grad_norm": 1.2955626249313354, + "learning_rate": 4.940020129865895e-06, + "loss": 0.47155818343162537, + "mean_token_accuracy": 0.8253582715988159, + "num_tokens": 4128398.0, + "step": 461 + }, + { + "epoch": 0.35106382978723405, + "grad_norm": 2.066575527191162, + "learning_rate": 4.9395632517428546e-06, + "loss": 0.5555641651153564, + "mean_token_accuracy": 0.814624547958374, + "num_tokens": 4137623.0, + "step": 462 + }, + { + "epoch": 0.3518237082066869, + "grad_norm": 1.6407525539398193, + "learning_rate": 4.939104661440415e-06, + "loss": 0.4361790418624878, + "mean_token_accuracy": 0.8544459342956543, + "num_tokens": 4152803.0, + "step": 463 + }, + { + "epoch": 0.3525835866261398, + "grad_norm": 2.1685116291046143, + "learning_rate": 4.938644359280433e-06, + "loss": 0.5347012877464294, + "mean_token_accuracy": 0.853853702545166, + "num_tokens": 4160778.0, + "step": 464 + }, + { + "epoch": 0.3533434650455927, + "grad_norm": 1.8824869394302368, + "learning_rate": 4.938182345585967e-06, + "loss": 0.5512481927871704, + "mean_token_accuracy": 0.7985891699790955, + "num_tokens": 4170380.0, + "step": 465 + }, + { + "epoch": 0.3541033434650456, + "grad_norm": 2.2229504585266113, + "learning_rate": 4.937718620681273e-06, + "loss": 0.516828179359436, + "mean_token_accuracy": 0.8265621066093445, + "num_tokens": 4178179.0, + "step": 466 + }, + { + "epoch": 0.3548632218844985, + "grad_norm": 1.955990195274353, + "learning_rate": 4.9372531848918145e-06, + "loss": 0.5586158037185669, + "mean_token_accuracy": 0.8367916345596313, + "num_tokens": 4188626.0, + "step": 467 + }, + { + "epoch": 0.3556231003039514, + "grad_norm": 1.9687023162841797, + "learning_rate": 4.936786038544251e-06, + "loss": 0.5517531633377075, + "mean_token_accuracy": 0.8134098052978516, + "num_tokens": 4198144.0, + "step": 468 + }, + { + "epoch": 0.35638297872340424, + "grad_norm": 1.405516505241394, + "learning_rate": 4.9363171819664434e-06, + "loss": 0.5305492877960205, + "mean_token_accuracy": 0.8014427423477173, + "num_tokens": 4222818.0, + "step": 469 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 2.6355695724487305, + "learning_rate": 4.9358466154874535e-06, + "loss": 0.5303391218185425, + "mean_token_accuracy": 0.8028861284255981, + "num_tokens": 4228318.0, + "step": 470 + }, + { + "epoch": 0.35790273556231, + "grad_norm": 1.5133824348449707, + "learning_rate": 4.935374339437543e-06, + "loss": 0.5329189300537109, + "mean_token_accuracy": 0.8479441404342651, + "num_tokens": 4244527.0, + "step": 471 + }, + { + "epoch": 0.3586626139817629, + "grad_norm": 3.4356725215911865, + "learning_rate": 4.934900354148173e-06, + "loss": 0.5431582927703857, + "mean_token_accuracy": 0.8328983783721924, + "num_tokens": 4248034.0, + "step": 472 + }, + { + "epoch": 0.3594224924012158, + "grad_norm": 2.5789499282836914, + "learning_rate": 4.934424659952006e-06, + "loss": 0.4141455292701721, + "mean_token_accuracy": 0.8658635020256042, + "num_tokens": 4252953.0, + "step": 473 + }, + { + "epoch": 0.3601823708206687, + "grad_norm": 1.145262598991394, + "learning_rate": 4.933947257182901e-06, + "loss": 0.40294092893600464, + "mean_token_accuracy": 0.8565847277641296, + "num_tokens": 4277813.0, + "step": 474 + }, + { + "epoch": 0.3609422492401216, + "grad_norm": 1.7242133617401123, + "learning_rate": 4.933468146175918e-06, + "loss": 0.6036738753318787, + "mean_token_accuracy": 0.8072597980499268, + "num_tokens": 4291088.0, + "step": 475 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 2.3490941524505615, + "learning_rate": 4.932987327267317e-06, + "loss": 0.49456146359443665, + "mean_token_accuracy": 0.8372673988342285, + "num_tokens": 4297376.0, + "step": 476 + }, + { + "epoch": 0.36246200607902734, + "grad_norm": 1.3605526685714722, + "learning_rate": 4.932504800794553e-06, + "loss": 0.43595948815345764, + "mean_token_accuracy": 0.8415953516960144, + "num_tokens": 4312054.0, + "step": 477 + }, + { + "epoch": 0.36322188449848025, + "grad_norm": 1.4525885581970215, + "learning_rate": 4.9320205670962815e-06, + "loss": 0.5390371680259705, + "mean_token_accuracy": 0.8101649284362793, + "num_tokens": 4328701.0, + "step": 478 + }, + { + "epoch": 0.3639817629179331, + "grad_norm": 1.9862419366836548, + "learning_rate": 4.931534626512359e-06, + "loss": 0.45436930656433105, + "mean_token_accuracy": 0.8352861404418945, + "num_tokens": 4338372.0, + "step": 479 + }, + { + "epoch": 0.364741641337386, + "grad_norm": 1.7804961204528809, + "learning_rate": 4.931046979383836e-06, + "loss": 0.4677754044532776, + "mean_token_accuracy": 0.840467095375061, + "num_tokens": 4347897.0, + "step": 480 + }, + { + "epoch": 0.3655015197568389, + "grad_norm": 2.066632032394409, + "learning_rate": 4.930557626052961e-06, + "loss": 0.42418140172958374, + "mean_token_accuracy": 0.8528275489807129, + "num_tokens": 4354061.0, + "step": 481 + }, + { + "epoch": 0.3662613981762918, + "grad_norm": 1.6155282258987427, + "learning_rate": 4.930066566863182e-06, + "loss": 0.5424284934997559, + "mean_token_accuracy": 0.825040876865387, + "num_tokens": 4370400.0, + "step": 482 + }, + { + "epoch": 0.3670212765957447, + "grad_norm": 2.1452953815460205, + "learning_rate": 4.929573802159143e-06, + "loss": 0.5105804204940796, + "mean_token_accuracy": 0.8284053802490234, + "num_tokens": 4377579.0, + "step": 483 + }, + { + "epoch": 0.3677811550151976, + "grad_norm": 1.8940945863723755, + "learning_rate": 4.929079332286685e-06, + "loss": 0.43478304147720337, + "mean_token_accuracy": 0.8505665063858032, + "num_tokens": 4385686.0, + "step": 484 + }, + { + "epoch": 0.36854103343465044, + "grad_norm": 1.6785860061645508, + "learning_rate": 4.928583157592846e-06, + "loss": 0.40227848291397095, + "mean_token_accuracy": 0.8623573780059814, + "num_tokens": 4396128.0, + "step": 485 + }, + { + "epoch": 0.36930091185410335, + "grad_norm": 1.6416733264923096, + "learning_rate": 4.928085278425862e-06, + "loss": 0.526267409324646, + "mean_token_accuracy": 0.8284667730331421, + "num_tokens": 4407963.0, + "step": 486 + }, + { + "epoch": 0.3700607902735562, + "grad_norm": 1.8882389068603516, + "learning_rate": 4.927585695135162e-06, + "loss": 0.5555213093757629, + "mean_token_accuracy": 0.8115293979644775, + "num_tokens": 4418057.0, + "step": 487 + }, + { + "epoch": 0.3708206686930091, + "grad_norm": 2.300248384475708, + "learning_rate": 4.9270844080713735e-06, + "loss": 0.5812339186668396, + "mean_token_accuracy": 0.800270676612854, + "num_tokens": 4425358.0, + "step": 488 + }, + { + "epoch": 0.371580547112462, + "grad_norm": 1.6802922487258911, + "learning_rate": 4.926581417586319e-06, + "loss": 0.5134941935539246, + "mean_token_accuracy": 0.8247408866882324, + "num_tokens": 4437702.0, + "step": 489 + }, + { + "epoch": 0.3723404255319149, + "grad_norm": 1.7620291709899902, + "learning_rate": 4.926076724033016e-06, + "loss": 0.5233973264694214, + "mean_token_accuracy": 0.8102161884307861, + "num_tokens": 4448584.0, + "step": 490 + }, + { + "epoch": 0.3731003039513678, + "grad_norm": 1.6911998987197876, + "learning_rate": 4.925570327765678e-06, + "loss": 0.5337274074554443, + "mean_token_accuracy": 0.845306396484375, + "num_tokens": 4462651.0, + "step": 491 + }, + { + "epoch": 0.3738601823708207, + "grad_norm": 1.7991242408752441, + "learning_rate": 4.9250622291397144e-06, + "loss": 0.31018948554992676, + "mean_token_accuracy": 0.8857606053352356, + "num_tokens": 4469971.0, + "step": 492 + }, + { + "epoch": 0.37462006079027355, + "grad_norm": 4.9776835441589355, + "learning_rate": 4.924552428511727e-06, + "loss": 0.44114983081817627, + "mean_token_accuracy": 0.8429906368255615, + "num_tokens": 4478275.0, + "step": 493 + }, + { + "epoch": 0.37537993920972645, + "grad_norm": 1.8007272481918335, + "learning_rate": 4.924040926239515e-06, + "loss": 0.574328601360321, + "mean_token_accuracy": 0.7669196128845215, + "num_tokens": 4491551.0, + "step": 494 + }, + { + "epoch": 0.3761398176291793, + "grad_norm": 2.021300792694092, + "learning_rate": 4.92352772268207e-06, + "loss": 0.45636120438575745, + "mean_token_accuracy": 0.840438723564148, + "num_tokens": 4498658.0, + "step": 495 + }, + { + "epoch": 0.3768996960486322, + "grad_norm": 2.369748592376709, + "learning_rate": 4.923012818199576e-06, + "loss": 0.5206376910209656, + "mean_token_accuracy": 0.8521823287010193, + "num_tokens": 4504648.0, + "step": 496 + }, + { + "epoch": 0.3776595744680851, + "grad_norm": 2.733485221862793, + "learning_rate": 4.922496213153416e-06, + "loss": 0.5067723989486694, + "mean_token_accuracy": 0.8168281316757202, + "num_tokens": 4509990.0, + "step": 497 + }, + { + "epoch": 0.378419452887538, + "grad_norm": 2.3751676082611084, + "learning_rate": 4.921977907906161e-06, + "loss": 0.49757206439971924, + "mean_token_accuracy": 0.8325017690658569, + "num_tokens": 4518373.0, + "step": 498 + }, + { + "epoch": 0.3791793313069909, + "grad_norm": 2.1672775745391846, + "learning_rate": 4.921457902821578e-06, + "loss": 0.4237566590309143, + "mean_token_accuracy": 0.8404698371887207, + "num_tokens": 4524338.0, + "step": 499 + }, + { + "epoch": 0.3799392097264438, + "grad_norm": 1.8374360799789429, + "learning_rate": 4.9209361982646275e-06, + "loss": 0.4995468854904175, + "mean_token_accuracy": 0.8299649953842163, + "num_tokens": 4533396.0, + "step": 500 + }, + { + "epoch": 0.38069908814589665, + "grad_norm": 2.083967924118042, + "learning_rate": 4.920412794601461e-06, + "loss": 0.489935040473938, + "mean_token_accuracy": 0.8315291404724121, + "num_tokens": 4540941.0, + "step": 501 + }, + { + "epoch": 0.38145896656534956, + "grad_norm": 2.2075610160827637, + "learning_rate": 4.919887692199423e-06, + "loss": 0.5233147740364075, + "mean_token_accuracy": 0.804171085357666, + "num_tokens": 4548215.0, + "step": 502 + }, + { + "epoch": 0.3822188449848024, + "grad_norm": 2.076775312423706, + "learning_rate": 4.9193608914270515e-06, + "loss": 0.5785550475120544, + "mean_token_accuracy": 0.7993186116218567, + "num_tokens": 4558204.0, + "step": 503 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.238546133041382, + "learning_rate": 4.918832392654075e-06, + "loss": 0.5287384390830994, + "mean_token_accuracy": 0.8214945793151855, + "num_tokens": 4565407.0, + "step": 504 + }, + { + "epoch": 0.3837386018237082, + "grad_norm": 1.6783074140548706, + "learning_rate": 4.9183021962514145e-06, + "loss": 0.6063359379768372, + "mean_token_accuracy": 0.7914625406265259, + "num_tokens": 4580991.0, + "step": 505 + }, + { + "epoch": 0.3844984802431611, + "grad_norm": 1.6287449598312378, + "learning_rate": 4.917770302591183e-06, + "loss": 0.3598247766494751, + "mean_token_accuracy": 0.8706809878349304, + "num_tokens": 4590579.0, + "step": 506 + }, + { + "epoch": 0.385258358662614, + "grad_norm": 1.5432041883468628, + "learning_rate": 4.917236712046682e-06, + "loss": 0.5267890095710754, + "mean_token_accuracy": 0.8032117486000061, + "num_tokens": 4608380.0, + "step": 507 + }, + { + "epoch": 0.3860182370820669, + "grad_norm": 1.7664037942886353, + "learning_rate": 4.9167014249924075e-06, + "loss": 0.3552354574203491, + "mean_token_accuracy": 0.8569793701171875, + "num_tokens": 4616426.0, + "step": 508 + }, + { + "epoch": 0.38677811550151975, + "grad_norm": 2.1147472858428955, + "learning_rate": 4.916164441804044e-06, + "loss": 0.5212404727935791, + "mean_token_accuracy": 0.8196578025817871, + "num_tokens": 4623908.0, + "step": 509 + }, + { + "epoch": 0.38753799392097266, + "grad_norm": 2.1092333793640137, + "learning_rate": 4.915625762858467e-06, + "loss": 0.5197038650512695, + "mean_token_accuracy": 0.8245604634284973, + "num_tokens": 4630956.0, + "step": 510 + }, + { + "epoch": 0.3882978723404255, + "grad_norm": 1.23331880569458, + "learning_rate": 4.915085388533743e-06, + "loss": 0.4759839177131653, + "mean_token_accuracy": 0.8192248344421387, + "num_tokens": 4651269.0, + "step": 511 + }, + { + "epoch": 0.3890577507598784, + "grad_norm": 2.424199104309082, + "learning_rate": 4.914543319209126e-06, + "loss": 0.5576270818710327, + "mean_token_accuracy": 0.8203302621841431, + "num_tokens": 4657296.0, + "step": 512 + }, + { + "epoch": 0.3898176291793313, + "grad_norm": 2.725156307220459, + "learning_rate": 4.913999555265062e-06, + "loss": 0.4337949752807617, + "mean_token_accuracy": 0.8382406234741211, + "num_tokens": 4661850.0, + "step": 513 + }, + { + "epoch": 0.3905775075987842, + "grad_norm": 2.3120534420013428, + "learning_rate": 4.913454097083185e-06, + "loss": 0.4941597580909729, + "mean_token_accuracy": 0.8302834033966064, + "num_tokens": 4667769.0, + "step": 514 + }, + { + "epoch": 0.3913373860182371, + "grad_norm": 2.3111207485198975, + "learning_rate": 4.912906945046319e-06, + "loss": 0.5253715515136719, + "mean_token_accuracy": 0.84515380859375, + "num_tokens": 4674537.0, + "step": 515 + }, + { + "epoch": 0.39209726443769, + "grad_norm": 1.4117841720581055, + "learning_rate": 4.912358099538476e-06, + "loss": 0.4521017074584961, + "mean_token_accuracy": 0.8208256959915161, + "num_tokens": 4690605.0, + "step": 516 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 2.3742799758911133, + "learning_rate": 4.911807560944858e-06, + "loss": 0.41572901606559753, + "mean_token_accuracy": 0.8550551533699036, + "num_tokens": 4706437.0, + "step": 517 + }, + { + "epoch": 0.39361702127659576, + "grad_norm": 2.4052202701568604, + "learning_rate": 4.911255329651852e-06, + "loss": 0.6003736257553101, + "mean_token_accuracy": 0.8247885704040527, + "num_tokens": 4712746.0, + "step": 518 + }, + { + "epoch": 0.3943768996960486, + "grad_norm": 1.9335490465164185, + "learning_rate": 4.910701406047037e-06, + "loss": 0.5457713603973389, + "mean_token_accuracy": 0.787429690361023, + "num_tokens": 4731937.0, + "step": 519 + }, + { + "epoch": 0.3951367781155015, + "grad_norm": 2.257706880569458, + "learning_rate": 4.910145790519177e-06, + "loss": 0.5300652980804443, + "mean_token_accuracy": 0.8192912936210632, + "num_tokens": 4739422.0, + "step": 520 + }, + { + "epoch": 0.3958966565349544, + "grad_norm": 1.2099462747573853, + "learning_rate": 4.9095884834582256e-06, + "loss": 0.45872747898101807, + "mean_token_accuracy": 0.8362667560577393, + "num_tokens": 4757113.0, + "step": 521 + }, + { + "epoch": 0.3966565349544073, + "grad_norm": 2.7991135120391846, + "learning_rate": 4.909029485255321e-06, + "loss": 0.49039560556411743, + "mean_token_accuracy": 0.8260016441345215, + "num_tokens": 4761709.0, + "step": 522 + }, + { + "epoch": 0.3974164133738602, + "grad_norm": 2.2360129356384277, + "learning_rate": 4.90846879630279e-06, + "loss": 0.49556830525398254, + "mean_token_accuracy": 0.827864408493042, + "num_tokens": 4769048.0, + "step": 523 + }, + { + "epoch": 0.3981762917933131, + "grad_norm": 2.5953688621520996, + "learning_rate": 4.907906416994146e-06, + "loss": 0.387208491563797, + "mean_token_accuracy": 0.8467001914978027, + "num_tokens": 4774637.0, + "step": 524 + }, + { + "epoch": 0.39893617021276595, + "grad_norm": 2.1046814918518066, + "learning_rate": 4.907342347724088e-06, + "loss": 0.5477259755134583, + "mean_token_accuracy": 0.8060322999954224, + "num_tokens": 4782774.0, + "step": 525 + }, + { + "epoch": 0.39969604863221886, + "grad_norm": 2.5622646808624268, + "learning_rate": 4.906776588888502e-06, + "loss": 0.5684159398078918, + "mean_token_accuracy": 0.8095303177833557, + "num_tokens": 4788766.0, + "step": 526 + }, + { + "epoch": 0.4004559270516717, + "grad_norm": 1.9027913808822632, + "learning_rate": 4.906209140884459e-06, + "loss": 0.535524845123291, + "mean_token_accuracy": 0.815237820148468, + "num_tokens": 4798492.0, + "step": 527 + }, + { + "epoch": 0.4012158054711246, + "grad_norm": 2.1447622776031494, + "learning_rate": 4.905640004110216e-06, + "loss": 0.5628632307052612, + "mean_token_accuracy": 0.8085395097732544, + "num_tokens": 4805737.0, + "step": 528 + }, + { + "epoch": 0.40197568389057753, + "grad_norm": 1.6754741668701172, + "learning_rate": 4.905069178965215e-06, + "loss": 0.5046736598014832, + "mean_token_accuracy": 0.8247535228729248, + "num_tokens": 4816912.0, + "step": 529 + }, + { + "epoch": 0.4027355623100304, + "grad_norm": 2.271230459213257, + "learning_rate": 4.904496665850083e-06, + "loss": 0.6086187958717346, + "mean_token_accuracy": 0.7935276627540588, + "num_tokens": 4824577.0, + "step": 530 + }, + { + "epoch": 0.4034954407294833, + "grad_norm": 2.107595205307007, + "learning_rate": 4.903922465166633e-06, + "loss": 0.5431341528892517, + "mean_token_accuracy": 0.8129537105560303, + "num_tokens": 4831772.0, + "step": 531 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 1.3860732316970825, + "learning_rate": 4.903346577317859e-06, + "loss": 0.45816320180892944, + "mean_token_accuracy": 0.8328287601470947, + "num_tokens": 4850302.0, + "step": 532 + }, + { + "epoch": 0.40501519756838905, + "grad_norm": 1.9186837673187256, + "learning_rate": 4.902769002707942e-06, + "loss": 0.3294633626937866, + "mean_token_accuracy": 0.8853933811187744, + "num_tokens": 4856624.0, + "step": 533 + }, + { + "epoch": 0.40577507598784196, + "grad_norm": 1.516194462776184, + "learning_rate": 4.902189741742247e-06, + "loss": 0.45482105016708374, + "mean_token_accuracy": 0.8370342254638672, + "num_tokens": 4870395.0, + "step": 534 + }, + { + "epoch": 0.4065349544072948, + "grad_norm": 2.3235628604888916, + "learning_rate": 4.901608794827321e-06, + "loss": 0.40688639879226685, + "mean_token_accuracy": 0.8643521666526794, + "num_tokens": 4875645.0, + "step": 535 + }, + { + "epoch": 0.4072948328267477, + "grad_norm": 2.29286527633667, + "learning_rate": 4.9010261623708945e-06, + "loss": 0.45482826232910156, + "mean_token_accuracy": 0.8429383039474487, + "num_tokens": 4881772.0, + "step": 536 + }, + { + "epoch": 0.40805471124620063, + "grad_norm": 1.5907070636749268, + "learning_rate": 4.900441844781882e-06, + "loss": 0.5266948342323303, + "mean_token_accuracy": 0.8348641395568848, + "num_tokens": 4894289.0, + "step": 537 + }, + { + "epoch": 0.4088145896656535, + "grad_norm": 2.1816294193267822, + "learning_rate": 4.89985584247038e-06, + "loss": 0.4797617793083191, + "mean_token_accuracy": 0.8549500703811646, + "num_tokens": 4901106.0, + "step": 538 + }, + { + "epoch": 0.4095744680851064, + "grad_norm": 1.7347146272659302, + "learning_rate": 4.899268155847667e-06, + "loss": 0.4754739999771118, + "mean_token_accuracy": 0.8278418183326721, + "num_tokens": 4912131.0, + "step": 539 + }, + { + "epoch": 0.41033434650455924, + "grad_norm": 2.0694527626037598, + "learning_rate": 4.898678785326205e-06, + "loss": 0.5071008801460266, + "mean_token_accuracy": 0.8157946467399597, + "num_tokens": 4921141.0, + "step": 540 + }, + { + "epoch": 0.41109422492401215, + "grad_norm": 2.570047616958618, + "learning_rate": 4.898087731319637e-06, + "loss": 0.43639278411865234, + "mean_token_accuracy": 0.8682913780212402, + "num_tokens": 4926182.0, + "step": 541 + }, + { + "epoch": 0.41185410334346506, + "grad_norm": 4.064006805419922, + "learning_rate": 4.8974949942427854e-06, + "loss": 0.539260745048523, + "mean_token_accuracy": 0.8225528001785278, + "num_tokens": 4929449.0, + "step": 542 + }, + { + "epoch": 0.4126139817629179, + "grad_norm": 1.7644332647323608, + "learning_rate": 4.896900574511657e-06, + "loss": 0.472618043422699, + "mean_token_accuracy": 0.8332902193069458, + "num_tokens": 4939443.0, + "step": 543 + }, + { + "epoch": 0.4133738601823708, + "grad_norm": 2.879918336868286, + "learning_rate": 4.89630447254344e-06, + "loss": 0.6360667943954468, + "mean_token_accuracy": 0.8215296268463135, + "num_tokens": 4950838.0, + "step": 544 + }, + { + "epoch": 0.41413373860182373, + "grad_norm": 1.4575570821762085, + "learning_rate": 4.8957066887565005e-06, + "loss": 0.45617997646331787, + "mean_token_accuracy": 0.8373187184333801, + "num_tokens": 4965222.0, + "step": 545 + }, + { + "epoch": 0.4148936170212766, + "grad_norm": 2.4829535484313965, + "learning_rate": 4.895107223570386e-06, + "loss": 0.42285341024398804, + "mean_token_accuracy": 0.8686380386352539, + "num_tokens": 4970724.0, + "step": 546 + }, + { + "epoch": 0.4156534954407295, + "grad_norm": 2.639474630355835, + "learning_rate": 4.894506077405824e-06, + "loss": 0.5906289219856262, + "mean_token_accuracy": 0.8174435496330261, + "num_tokens": 4976766.0, + "step": 547 + }, + { + "epoch": 0.41641337386018235, + "grad_norm": 2.7960562705993652, + "learning_rate": 4.893903250684723e-06, + "loss": 0.4518949091434479, + "mean_token_accuracy": 0.8387585282325745, + "num_tokens": 4980991.0, + "step": 548 + }, + { + "epoch": 0.41717325227963525, + "grad_norm": 2.184176206588745, + "learning_rate": 4.893298743830168e-06, + "loss": 0.5223842859268188, + "mean_token_accuracy": 0.8170937299728394, + "num_tokens": 4987781.0, + "step": 549 + }, + { + "epoch": 0.41793313069908816, + "grad_norm": 2.2393438816070557, + "learning_rate": 4.892692557266429e-06, + "loss": 0.5238431692123413, + "mean_token_accuracy": 0.8217905759811401, + "num_tokens": 4994321.0, + "step": 550 + }, + { + "epoch": 0.418693009118541, + "grad_norm": 3.579047441482544, + "learning_rate": 4.8920846914189465e-06, + "loss": 0.5367584228515625, + "mean_token_accuracy": 0.8312011361122131, + "num_tokens": 4997951.0, + "step": 551 + }, + { + "epoch": 0.4194528875379939, + "grad_norm": 1.6330240964889526, + "learning_rate": 4.891475146714348e-06, + "loss": 0.6054705381393433, + "mean_token_accuracy": 0.7938206791877747, + "num_tokens": 5012726.0, + "step": 552 + }, + { + "epoch": 0.42021276595744683, + "grad_norm": 1.5775716304779053, + "learning_rate": 4.8908639235804324e-06, + "loss": 0.4774656891822815, + "mean_token_accuracy": 0.828762948513031, + "num_tokens": 5026751.0, + "step": 553 + }, + { + "epoch": 0.4209726443768997, + "grad_norm": 1.5719101428985596, + "learning_rate": 4.890251022446181e-06, + "loss": 0.549429178237915, + "mean_token_accuracy": 0.8110791444778442, + "num_tokens": 5041861.0, + "step": 554 + }, + { + "epoch": 0.4217325227963526, + "grad_norm": 1.8585275411605835, + "learning_rate": 4.889636443741752e-06, + "loss": 0.4448118805885315, + "mean_token_accuracy": 0.8462690711021423, + "num_tokens": 5052690.0, + "step": 555 + }, + { + "epoch": 0.42249240121580545, + "grad_norm": 2.189202070236206, + "learning_rate": 4.88902018789848e-06, + "loss": 0.4296762943267822, + "mean_token_accuracy": 0.8488791584968567, + "num_tokens": 5058964.0, + "step": 556 + }, + { + "epoch": 0.42325227963525835, + "grad_norm": 1.9328460693359375, + "learning_rate": 4.888402255348877e-06, + "loss": 0.5369474291801453, + "mean_token_accuracy": 0.8184729814529419, + "num_tokens": 5068465.0, + "step": 557 + }, + { + "epoch": 0.42401215805471126, + "grad_norm": 1.6233323812484741, + "learning_rate": 4.887782646526631e-06, + "loss": 0.5284391641616821, + "mean_token_accuracy": 0.8276044726371765, + "num_tokens": 5081052.0, + "step": 558 + }, + { + "epoch": 0.4247720364741641, + "grad_norm": 2.222813844680786, + "learning_rate": 4.887161361866608e-06, + "loss": 0.5679137706756592, + "mean_token_accuracy": 0.8012375831604004, + "num_tokens": 5090001.0, + "step": 559 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.1062207221984863, + "learning_rate": 4.8865384018048494e-06, + "loss": 0.5554201602935791, + "mean_token_accuracy": 0.8128066062927246, + "num_tokens": 5097644.0, + "step": 560 + }, + { + "epoch": 0.42629179331306993, + "grad_norm": 1.5380984544754028, + "learning_rate": 4.8859137667785735e-06, + "loss": 0.4948265850543976, + "mean_token_accuracy": 0.8258291482925415, + "num_tokens": 5110069.0, + "step": 561 + }, + { + "epoch": 0.4270516717325228, + "grad_norm": 2.0290257930755615, + "learning_rate": 4.8852874572261715e-06, + "loss": 0.4969530403614044, + "mean_token_accuracy": 0.8297134637832642, + "num_tokens": 5117452.0, + "step": 562 + }, + { + "epoch": 0.4278115501519757, + "grad_norm": 1.5651452541351318, + "learning_rate": 4.884659473587213e-06, + "loss": 0.5353102087974548, + "mean_token_accuracy": 0.8161719441413879, + "num_tokens": 5133756.0, + "step": 563 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 2.2470998764038086, + "learning_rate": 4.884029816302441e-06, + "loss": 0.5104288458824158, + "mean_token_accuracy": 0.8081635236740112, + "num_tokens": 5140278.0, + "step": 564 + }, + { + "epoch": 0.42933130699088146, + "grad_norm": 1.726891279220581, + "learning_rate": 4.883398485813772e-06, + "loss": 0.4508771002292633, + "mean_token_accuracy": 0.8548800349235535, + "num_tokens": 5150115.0, + "step": 565 + }, + { + "epoch": 0.43009118541033436, + "grad_norm": 1.4779289960861206, + "learning_rate": 4.8827654825642984e-06, + "loss": 0.46861088275909424, + "mean_token_accuracy": 0.8209476470947266, + "num_tokens": 5163225.0, + "step": 566 + }, + { + "epoch": 0.4308510638297872, + "grad_norm": 1.2361034154891968, + "learning_rate": 4.882130806998287e-06, + "loss": 0.4591076672077179, + "mean_token_accuracy": 0.803041934967041, + "num_tokens": 5180342.0, + "step": 567 + }, + { + "epoch": 0.4316109422492401, + "grad_norm": 1.882467269897461, + "learning_rate": 4.881494459561177e-06, + "loss": 0.579258143901825, + "mean_token_accuracy": 0.8007112741470337, + "num_tokens": 5189595.0, + "step": 568 + }, + { + "epoch": 0.43237082066869303, + "grad_norm": 1.095462441444397, + "learning_rate": 4.880856440699582e-06, + "loss": 0.3806574046611786, + "mean_token_accuracy": 0.8650111556053162, + "num_tokens": 5211642.0, + "step": 569 + }, + { + "epoch": 0.4331306990881459, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.880216750861288e-06, + "loss": 0.544589638710022, + "mean_token_accuracy": 0.8060122728347778, + "num_tokens": 5224137.0, + "step": 570 + }, + { + "epoch": 0.4338905775075988, + "grad_norm": 1.8561251163482666, + "learning_rate": 4.879575390495254e-06, + "loss": 0.4094924330711365, + "mean_token_accuracy": 0.8591406345367432, + "num_tokens": 5231588.0, + "step": 571 + }, + { + "epoch": 0.43465045592705165, + "grad_norm": 3.01326847076416, + "learning_rate": 4.878932360051611e-06, + "loss": 0.6139192581176758, + "mean_token_accuracy": 0.8108739852905273, + "num_tokens": 5236853.0, + "step": 572 + }, + { + "epoch": 0.43541033434650456, + "grad_norm": 2.1753034591674805, + "learning_rate": 4.878287659981663e-06, + "loss": 0.49082931876182556, + "mean_token_accuracy": 0.862828254699707, + "num_tokens": 5243264.0, + "step": 573 + }, + { + "epoch": 0.43617021276595747, + "grad_norm": 1.4437755346298218, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.5608728528022766, + "mean_token_accuracy": 0.8271626234054565, + "num_tokens": 5261757.0, + "step": 574 + }, + { + "epoch": 0.4369300911854103, + "grad_norm": 1.786683440208435, + "learning_rate": 4.876993252773923e-06, + "loss": 0.4377627968788147, + "mean_token_accuracy": 0.844936192035675, + "num_tokens": 5271038.0, + "step": 575 + }, + { + "epoch": 0.4376899696048632, + "grad_norm": 1.3425915241241455, + "learning_rate": 4.876343546544596e-06, + "loss": 0.44762521982192993, + "mean_token_accuracy": 0.8397793769836426, + "num_tokens": 5285555.0, + "step": 576 + }, + { + "epoch": 0.43844984802431614, + "grad_norm": 2.1549675464630127, + "learning_rate": 4.8756921725058935e-06, + "loss": 0.5332942008972168, + "mean_token_accuracy": 0.820149302482605, + "num_tokens": 5294595.0, + "step": 577 + }, + { + "epoch": 0.439209726443769, + "grad_norm": 1.5254042148590088, + "learning_rate": 4.875039131114975e-06, + "loss": 0.3646543622016907, + "mean_token_accuracy": 0.8442583084106445, + "num_tokens": 5304955.0, + "step": 578 + }, + { + "epoch": 0.4399696048632219, + "grad_norm": 1.5751557350158691, + "learning_rate": 4.8743844228301676e-06, + "loss": 0.4854734539985657, + "mean_token_accuracy": 0.8317523002624512, + "num_tokens": 5317351.0, + "step": 579 + }, + { + "epoch": 0.44072948328267475, + "grad_norm": 1.6950466632843018, + "learning_rate": 4.873728048110973e-06, + "loss": 0.5907570719718933, + "mean_token_accuracy": 0.7946986556053162, + "num_tokens": 5332542.0, + "step": 580 + }, + { + "epoch": 0.44148936170212766, + "grad_norm": 2.1180708408355713, + "learning_rate": 4.873070007418059e-06, + "loss": 0.5220296382904053, + "mean_token_accuracy": 0.8037363290786743, + "num_tokens": 5341722.0, + "step": 581 + }, + { + "epoch": 0.44224924012158057, + "grad_norm": 1.3643816709518433, + "learning_rate": 4.872410301213265e-06, + "loss": 0.4865502417087555, + "mean_token_accuracy": 0.8377852439880371, + "num_tokens": 5359359.0, + "step": 582 + }, + { + "epoch": 0.4430091185410334, + "grad_norm": 1.483280897140503, + "learning_rate": 4.871748929959598e-06, + "loss": 0.36856764554977417, + "mean_token_accuracy": 0.8709549903869629, + "num_tokens": 5369749.0, + "step": 583 + }, + { + "epoch": 0.44376899696048633, + "grad_norm": 1.6891541481018066, + "learning_rate": 4.871085894121234e-06, + "loss": 0.5768930912017822, + "mean_token_accuracy": 0.8030461668968201, + "num_tokens": 5383912.0, + "step": 584 + }, + { + "epoch": 0.44452887537993924, + "grad_norm": 2.1318740844726562, + "learning_rate": 4.870421194163515e-06, + "loss": 0.4337100386619568, + "mean_token_accuracy": 0.8562518358230591, + "num_tokens": 5389412.0, + "step": 585 + }, + { + "epoch": 0.4452887537993921, + "grad_norm": 2.540255546569824, + "learning_rate": 4.869754830552956e-06, + "loss": 0.4708256125450134, + "mean_token_accuracy": 0.8446552753448486, + "num_tokens": 5394762.0, + "step": 586 + }, + { + "epoch": 0.446048632218845, + "grad_norm": 2.048015594482422, + "learning_rate": 4.869086803757235e-06, + "loss": 0.5265800952911377, + "mean_token_accuracy": 0.8181137442588806, + "num_tokens": 5402379.0, + "step": 587 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 2.9821012020111084, + "learning_rate": 4.868417114245199e-06, + "loss": 0.6299797296524048, + "mean_token_accuracy": 0.8237329125404358, + "num_tokens": 5408229.0, + "step": 588 + }, + { + "epoch": 0.44756838905775076, + "grad_norm": 1.7807202339172363, + "learning_rate": 4.867745762486862e-06, + "loss": 0.5176759958267212, + "mean_token_accuracy": 0.8184244632720947, + "num_tokens": 5418383.0, + "step": 589 + }, + { + "epoch": 0.44832826747720367, + "grad_norm": 1.5466399192810059, + "learning_rate": 4.8670727489534035e-06, + "loss": 0.5137228965759277, + "mean_token_accuracy": 0.8365053534507751, + "num_tokens": 5432127.0, + "step": 590 + }, + { + "epoch": 0.4490881458966565, + "grad_norm": 2.9521141052246094, + "learning_rate": 4.866398074117173e-06, + "loss": 0.4056887924671173, + "mean_token_accuracy": 0.8561501502990723, + "num_tokens": 5436062.0, + "step": 591 + }, + { + "epoch": 0.44984802431610943, + "grad_norm": 2.058743953704834, + "learning_rate": 4.86572173845168e-06, + "loss": 0.6124799251556396, + "mean_token_accuracy": 0.8007957339286804, + "num_tokens": 5444989.0, + "step": 592 + }, + { + "epoch": 0.4506079027355623, + "grad_norm": 2.1243767738342285, + "learning_rate": 4.865043742431605e-06, + "loss": 0.5659694671630859, + "mean_token_accuracy": 0.8084750175476074, + "num_tokens": 5453865.0, + "step": 593 + }, + { + "epoch": 0.4513677811550152, + "grad_norm": 1.6732314825057983, + "learning_rate": 4.864364086532792e-06, + "loss": 0.47879064083099365, + "mean_token_accuracy": 0.8346436023712158, + "num_tokens": 5466398.0, + "step": 594 + }, + { + "epoch": 0.4521276595744681, + "grad_norm": 1.3793858289718628, + "learning_rate": 4.863682771232249e-06, + "loss": 0.45989373326301575, + "mean_token_accuracy": 0.8254791498184204, + "num_tokens": 5482121.0, + "step": 595 + }, + { + "epoch": 0.45288753799392095, + "grad_norm": 1.9812315702438354, + "learning_rate": 4.862999797008149e-06, + "loss": 0.5778874754905701, + "mean_token_accuracy": 0.8041508197784424, + "num_tokens": 5493000.0, + "step": 596 + }, + { + "epoch": 0.45364741641337386, + "grad_norm": 3.3065083026885986, + "learning_rate": 4.862315164339829e-06, + "loss": 0.4623975157737732, + "mean_token_accuracy": 0.8426318168640137, + "num_tokens": 5496723.0, + "step": 597 + }, + { + "epoch": 0.45440729483282677, + "grad_norm": 3.167119026184082, + "learning_rate": 4.861628873707792e-06, + "loss": 0.6984533667564392, + "mean_token_accuracy": 0.772136926651001, + "num_tokens": 5501161.0, + "step": 598 + }, + { + "epoch": 0.4551671732522796, + "grad_norm": 2.2130985260009766, + "learning_rate": 4.860940925593703e-06, + "loss": 0.4823192059993744, + "mean_token_accuracy": 0.8462972640991211, + "num_tokens": 5509544.0, + "step": 599 + }, + { + "epoch": 0.45592705167173253, + "grad_norm": 3.029191732406616, + "learning_rate": 4.86025132048039e-06, + "loss": 0.523664116859436, + "mean_token_accuracy": 0.8229140043258667, + "num_tokens": 5514586.0, + "step": 600 + }, + { + "epoch": 0.4566869300911854, + "grad_norm": 1.6983962059020996, + "learning_rate": 4.859560058851844e-06, + "loss": 0.4832698106765747, + "mean_token_accuracy": 0.8403248190879822, + "num_tokens": 5525773.0, + "step": 601 + }, + { + "epoch": 0.4574468085106383, + "grad_norm": 3.0504038333892822, + "learning_rate": 4.8588671411932195e-06, + "loss": 0.5158926248550415, + "mean_token_accuracy": 0.8098392486572266, + "num_tokens": 5529739.0, + "step": 602 + }, + { + "epoch": 0.4582066869300912, + "grad_norm": 2.584836483001709, + "learning_rate": 4.858172567990832e-06, + "loss": 0.5724587440490723, + "mean_token_accuracy": 0.8128519058227539, + "num_tokens": 5535763.0, + "step": 603 + }, + { + "epoch": 0.45896656534954405, + "grad_norm": 2.0514042377471924, + "learning_rate": 4.857476339732162e-06, + "loss": 0.4337679445743561, + "mean_token_accuracy": 0.8405929207801819, + "num_tokens": 5543075.0, + "step": 604 + }, + { + "epoch": 0.45972644376899696, + "grad_norm": 2.2949347496032715, + "learning_rate": 4.856778456905846e-06, + "loss": 0.46532145142555237, + "mean_token_accuracy": 0.8345137238502502, + "num_tokens": 5549035.0, + "step": 605 + }, + { + "epoch": 0.46048632218844987, + "grad_norm": 2.2067551612854004, + "learning_rate": 4.856078920001689e-06, + "loss": 0.5855136513710022, + "mean_token_accuracy": 0.8043795228004456, + "num_tokens": 5555545.0, + "step": 606 + }, + { + "epoch": 0.4612462006079027, + "grad_norm": 2.101945161819458, + "learning_rate": 4.855377729510648e-06, + "loss": 0.6071814298629761, + "mean_token_accuracy": 0.7973253130912781, + "num_tokens": 5563615.0, + "step": 607 + }, + { + "epoch": 0.46200607902735563, + "grad_norm": 2.5958821773529053, + "learning_rate": 4.8546748859248504e-06, + "loss": 0.6278061866760254, + "mean_token_accuracy": 0.7864972352981567, + "num_tokens": 5570078.0, + "step": 608 + }, + { + "epoch": 0.4627659574468085, + "grad_norm": 2.778101921081543, + "learning_rate": 4.853970389737576e-06, + "loss": 0.35521194338798523, + "mean_token_accuracy": 0.8752605319023132, + "num_tokens": 5573995.0, + "step": 609 + }, + { + "epoch": 0.4635258358662614, + "grad_norm": 2.600534677505493, + "learning_rate": 4.8532642414432675e-06, + "loss": 0.6541563868522644, + "mean_token_accuracy": 0.7843613028526306, + "num_tokens": 5580333.0, + "step": 610 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 1.778337836265564, + "learning_rate": 4.852556441537528e-06, + "loss": 0.3561405837535858, + "mean_token_accuracy": 0.8579353094100952, + "num_tokens": 5588430.0, + "step": 611 + }, + { + "epoch": 0.46504559270516715, + "grad_norm": 1.5653862953186035, + "learning_rate": 4.851846990517118e-06, + "loss": 0.6067906618118286, + "mean_token_accuracy": 0.7919317483901978, + "num_tokens": 5601700.0, + "step": 612 + }, + { + "epoch": 0.46580547112462006, + "grad_norm": 1.6097723245620728, + "learning_rate": 4.851135888879958e-06, + "loss": 0.446664422750473, + "mean_token_accuracy": 0.8441969156265259, + "num_tokens": 5612063.0, + "step": 613 + }, + { + "epoch": 0.46656534954407297, + "grad_norm": 1.961207389831543, + "learning_rate": 4.850423137125126e-06, + "loss": 0.5508605241775513, + "mean_token_accuracy": 0.8240450024604797, + "num_tokens": 5620245.0, + "step": 614 + }, + { + "epoch": 0.4673252279635258, + "grad_norm": 2.2189085483551025, + "learning_rate": 4.8497087357528585e-06, + "loss": 0.6805076599121094, + "mean_token_accuracy": 0.771978497505188, + "num_tokens": 5629590.0, + "step": 615 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 2.5176279544830322, + "learning_rate": 4.8489926852645505e-06, + "loss": 0.4512156844139099, + "mean_token_accuracy": 0.836459755897522, + "num_tokens": 5635259.0, + "step": 616 + }, + { + "epoch": 0.4688449848024316, + "grad_norm": 1.5327287912368774, + "learning_rate": 4.848274986162754e-06, + "loss": 0.4884302616119385, + "mean_token_accuracy": 0.8194037079811096, + "num_tokens": 5649993.0, + "step": 617 + }, + { + "epoch": 0.4696048632218845, + "grad_norm": 2.184554100036621, + "learning_rate": 4.847555638951177e-06, + "loss": 0.5141451358795166, + "mean_token_accuracy": 0.8245922327041626, + "num_tokens": 5657375.0, + "step": 618 + }, + { + "epoch": 0.4703647416413374, + "grad_norm": 1.6143407821655273, + "learning_rate": 4.846834644134686e-06, + "loss": 0.4276641607284546, + "mean_token_accuracy": 0.8481845855712891, + "num_tokens": 5667941.0, + "step": 619 + }, + { + "epoch": 0.47112462006079026, + "grad_norm": 2.3747270107269287, + "learning_rate": 4.846112002219301e-06, + "loss": 0.5608246922492981, + "mean_token_accuracy": 0.8073011040687561, + "num_tokens": 5675042.0, + "step": 620 + }, + { + "epoch": 0.47188449848024316, + "grad_norm": 2.390404224395752, + "learning_rate": 4.845387713712203e-06, + "loss": 0.46616724133491516, + "mean_token_accuracy": 0.8468319177627563, + "num_tokens": 5680207.0, + "step": 621 + }, + { + "epoch": 0.4726443768996961, + "grad_norm": 1.7245099544525146, + "learning_rate": 4.844661779121723e-06, + "loss": 0.5652435421943665, + "mean_token_accuracy": 0.8010749816894531, + "num_tokens": 5693759.0, + "step": 622 + }, + { + "epoch": 0.4734042553191489, + "grad_norm": 2.6923108100891113, + "learning_rate": 4.843934198957351e-06, + "loss": 0.6254661679267883, + "mean_token_accuracy": 0.8236024975776672, + "num_tokens": 5699916.0, + "step": 623 + }, + { + "epoch": 0.47416413373860183, + "grad_norm": 2.516901969909668, + "learning_rate": 4.84320497372973e-06, + "loss": 0.6334252953529358, + "mean_token_accuracy": 0.7803834676742554, + "num_tokens": 5706554.0, + "step": 624 + }, + { + "epoch": 0.4749240121580547, + "grad_norm": 2.3744447231292725, + "learning_rate": 4.842474103950658e-06, + "loss": 0.4221811890602112, + "mean_token_accuracy": 0.8639545440673828, + "num_tokens": 5711756.0, + "step": 625 + }, + { + "epoch": 0.4756838905775076, + "grad_norm": 3.2373476028442383, + "learning_rate": 4.841741590133089e-06, + "loss": 0.6637828946113586, + "mean_token_accuracy": 0.7968347072601318, + "num_tokens": 5716458.0, + "step": 626 + }, + { + "epoch": 0.4764437689969605, + "grad_norm": 2.153888463973999, + "learning_rate": 4.841007432791129e-06, + "loss": 0.4877486228942871, + "mean_token_accuracy": 0.8345249891281128, + "num_tokens": 5723155.0, + "step": 627 + }, + { + "epoch": 0.47720364741641336, + "grad_norm": 2.120497703552246, + "learning_rate": 4.8402716324400375e-06, + "loss": 0.37323033809661865, + "mean_token_accuracy": 0.8734050393104553, + "num_tokens": 5729171.0, + "step": 628 + }, + { + "epoch": 0.47796352583586627, + "grad_norm": 1.5294172763824463, + "learning_rate": 4.839534189596228e-06, + "loss": 0.4057067334651947, + "mean_token_accuracy": 0.8523319959640503, + "num_tokens": 5740112.0, + "step": 629 + }, + { + "epoch": 0.4787234042553192, + "grad_norm": 2.1913886070251465, + "learning_rate": 4.8387951047772656e-06, + "loss": 0.4835960865020752, + "mean_token_accuracy": 0.8438145518302917, + "num_tokens": 5746838.0, + "step": 630 + }, + { + "epoch": 0.479483282674772, + "grad_norm": 1.482897162437439, + "learning_rate": 4.838054378501868e-06, + "loss": 0.46967992186546326, + "mean_token_accuracy": 0.8315759897232056, + "num_tokens": 5760428.0, + "step": 631 + }, + { + "epoch": 0.48024316109422494, + "grad_norm": 1.38850998878479, + "learning_rate": 4.837312011289907e-06, + "loss": 0.41845446825027466, + "mean_token_accuracy": 0.8557186126708984, + "num_tokens": 5773437.0, + "step": 632 + }, + { + "epoch": 0.4810030395136778, + "grad_norm": 3.8337457180023193, + "learning_rate": 4.836568003662403e-06, + "loss": 0.5102912187576294, + "mean_token_accuracy": 0.830644965171814, + "num_tokens": 5776367.0, + "step": 633 + }, + { + "epoch": 0.4817629179331307, + "grad_norm": 1.2084007263183594, + "learning_rate": 4.8358223561415304e-06, + "loss": 0.3835333585739136, + "mean_token_accuracy": 0.8639016151428223, + "num_tokens": 5792246.0, + "step": 634 + }, + { + "epoch": 0.4825227963525836, + "grad_norm": 1.939408540725708, + "learning_rate": 4.835075069250613e-06, + "loss": 0.4044850468635559, + "mean_token_accuracy": 0.8488376140594482, + "num_tokens": 5799853.0, + "step": 635 + }, + { + "epoch": 0.48328267477203646, + "grad_norm": 1.345870852470398, + "learning_rate": 4.8343261435141245e-06, + "loss": 0.46660199761390686, + "mean_token_accuracy": 0.8371681571006775, + "num_tokens": 5817478.0, + "step": 636 + }, + { + "epoch": 0.48404255319148937, + "grad_norm": 1.6531339883804321, + "learning_rate": 4.833575579457691e-06, + "loss": 0.3886989951133728, + "mean_token_accuracy": 0.8763507008552551, + "num_tokens": 5825739.0, + "step": 637 + }, + { + "epoch": 0.4848024316109423, + "grad_norm": 1.6443969011306763, + "learning_rate": 4.832823377608088e-06, + "loss": 0.4070289731025696, + "mean_token_accuracy": 0.8586630821228027, + "num_tokens": 5837917.0, + "step": 638 + }, + { + "epoch": 0.48556231003039513, + "grad_norm": 2.005136013031006, + "learning_rate": 4.832069538493237e-06, + "loss": 0.40616685152053833, + "mean_token_accuracy": 0.8571510314941406, + "num_tokens": 5845250.0, + "step": 639 + }, + { + "epoch": 0.48632218844984804, + "grad_norm": 1.5244266986846924, + "learning_rate": 4.831314062642213e-06, + "loss": 0.49530288577079773, + "mean_token_accuracy": 0.8328841924667358, + "num_tokens": 5857407.0, + "step": 640 + }, + { + "epoch": 0.4870820668693009, + "grad_norm": 1.9876971244812012, + "learning_rate": 4.830556950585239e-06, + "loss": 0.4583776593208313, + "mean_token_accuracy": 0.8427221179008484, + "num_tokens": 5865391.0, + "step": 641 + }, + { + "epoch": 0.4878419452887538, + "grad_norm": 3.023336172103882, + "learning_rate": 4.829798202853683e-06, + "loss": 0.6134771108627319, + "mean_token_accuracy": 0.7981935739517212, + "num_tokens": 5870729.0, + "step": 642 + }, + { + "epoch": 0.4886018237082067, + "grad_norm": 1.8889515399932861, + "learning_rate": 4.829037819980065e-06, + "loss": 0.4420135021209717, + "mean_token_accuracy": 0.8480775356292725, + "num_tokens": 5878982.0, + "step": 643 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.2408435344696045, + "learning_rate": 4.828275802498051e-06, + "loss": 0.525706946849823, + "mean_token_accuracy": 0.8271557092666626, + "num_tokens": 5885097.0, + "step": 644 + }, + { + "epoch": 0.49012158054711247, + "grad_norm": 1.9734224081039429, + "learning_rate": 4.827512150942454e-06, + "loss": 0.44246578216552734, + "mean_token_accuracy": 0.8456668257713318, + "num_tokens": 5893941.0, + "step": 645 + }, + { + "epoch": 0.4908814589665654, + "grad_norm": 1.9618173837661743, + "learning_rate": 4.8267468658492335e-06, + "loss": 0.5119768381118774, + "mean_token_accuracy": 0.8355510830879211, + "num_tokens": 5902829.0, + "step": 646 + }, + { + "epoch": 0.49164133738601823, + "grad_norm": 1.7181587219238281, + "learning_rate": 4.825979947755496e-06, + "loss": 0.5666520595550537, + "mean_token_accuracy": 0.7951971888542175, + "num_tokens": 5915212.0, + "step": 647 + }, + { + "epoch": 0.49240121580547114, + "grad_norm": 3.0121164321899414, + "learning_rate": 4.8252113971994955e-06, + "loss": 0.628632128238678, + "mean_token_accuracy": 0.8041050434112549, + "num_tokens": 5921410.0, + "step": 648 + }, + { + "epoch": 0.493161094224924, + "grad_norm": 2.9980475902557373, + "learning_rate": 4.824441214720629e-06, + "loss": 0.4507424831390381, + "mean_token_accuracy": 0.8636263608932495, + "num_tokens": 5925179.0, + "step": 649 + }, + { + "epoch": 0.4939209726443769, + "grad_norm": 2.0096445083618164, + "learning_rate": 4.823669400859441e-06, + "loss": 0.602759838104248, + "mean_token_accuracy": 0.8104915618896484, + "num_tokens": 5934160.0, + "step": 650 + }, + { + "epoch": 0.4946808510638298, + "grad_norm": 1.1186442375183105, + "learning_rate": 4.8228959561576195e-06, + "loss": 0.41168469190597534, + "mean_token_accuracy": 0.8461419939994812, + "num_tokens": 5954163.0, + "step": 651 + }, + { + "epoch": 0.49544072948328266, + "grad_norm": 1.855465054512024, + "learning_rate": 4.822120881157998e-06, + "loss": 0.5049735307693481, + "mean_token_accuracy": 0.8225747346878052, + "num_tokens": 5963840.0, + "step": 652 + }, + { + "epoch": 0.49620060790273557, + "grad_norm": 3.550563335418701, + "learning_rate": 4.821344176404554e-06, + "loss": 0.49025264382362366, + "mean_token_accuracy": 0.8265978693962097, + "num_tokens": 5967358.0, + "step": 653 + }, + { + "epoch": 0.4969604863221885, + "grad_norm": 3.063910484313965, + "learning_rate": 4.820565842442408e-06, + "loss": 0.5652767419815063, + "mean_token_accuracy": 0.811700701713562, + "num_tokens": 5971858.0, + "step": 654 + }, + { + "epoch": 0.49772036474164133, + "grad_norm": 2.4613308906555176, + "learning_rate": 4.819785879817827e-06, + "loss": 0.5296125411987305, + "mean_token_accuracy": 0.8336488008499146, + "num_tokens": 5977442.0, + "step": 655 + }, + { + "epoch": 0.49848024316109424, + "grad_norm": 2.342519760131836, + "learning_rate": 4.819004289078217e-06, + "loss": 0.5753380060195923, + "mean_token_accuracy": 0.7922406792640686, + "num_tokens": 5984531.0, + "step": 656 + }, + { + "epoch": 0.4992401215805471, + "grad_norm": 2.0410680770874023, + "learning_rate": 4.818221070772129e-06, + "loss": 0.5433275699615479, + "mean_token_accuracy": 0.8043830990791321, + "num_tokens": 5992642.0, + "step": 657 + }, + { + "epoch": 0.5, + "grad_norm": 1.4999698400497437, + "learning_rate": 4.8174362254492555e-06, + "loss": 0.5248899459838867, + "mean_token_accuracy": 0.8107168674468994, + "num_tokens": 6005543.0, + "step": 658 + }, + { + "epoch": 0.5007598784194529, + "grad_norm": 1.9494401216506958, + "learning_rate": 4.816649753660431e-06, + "loss": 0.41291385889053345, + "mean_token_accuracy": 0.8650569915771484, + "num_tokens": 6012185.0, + "step": 659 + }, + { + "epoch": 0.5015197568389058, + "grad_norm": 2.7514095306396484, + "learning_rate": 4.815861655957632e-06, + "loss": 0.4244142770767212, + "mean_token_accuracy": 0.8485112190246582, + "num_tokens": 6016809.0, + "step": 660 + }, + { + "epoch": 0.5022796352583586, + "grad_norm": 1.4354928731918335, + "learning_rate": 4.815071932893976e-06, + "loss": 0.4332060217857361, + "mean_token_accuracy": 0.8386815786361694, + "num_tokens": 6034795.0, + "step": 661 + }, + { + "epoch": 0.5030395136778115, + "grad_norm": 1.3113417625427246, + "learning_rate": 4.81428058502372e-06, + "loss": 0.5415540933609009, + "mean_token_accuracy": 0.8115285038948059, + "num_tokens": 6053624.0, + "step": 662 + }, + { + "epoch": 0.5037993920972644, + "grad_norm": 1.820868730545044, + "learning_rate": 4.813487612902265e-06, + "loss": 0.5360245108604431, + "mean_token_accuracy": 0.8313555717468262, + "num_tokens": 6063399.0, + "step": 663 + }, + { + "epoch": 0.5045592705167173, + "grad_norm": 2.347001552581787, + "learning_rate": 4.812693017086145e-06, + "loss": 0.4926982820034027, + "mean_token_accuracy": 0.8137006759643555, + "num_tokens": 6070111.0, + "step": 664 + }, + { + "epoch": 0.5053191489361702, + "grad_norm": 1.8830888271331787, + "learning_rate": 4.811896798133042e-06, + "loss": 0.5419014692306519, + "mean_token_accuracy": 0.8027454614639282, + "num_tokens": 6081090.0, + "step": 665 + }, + { + "epoch": 0.506079027355623, + "grad_norm": 2.3258056640625, + "learning_rate": 4.811098956601772e-06, + "loss": 0.4629337787628174, + "mean_token_accuracy": 0.8416580557823181, + "num_tokens": 6087921.0, + "step": 666 + }, + { + "epoch": 0.506838905775076, + "grad_norm": 1.9578291177749634, + "learning_rate": 4.810299493052289e-06, + "loss": 0.40305402874946594, + "mean_token_accuracy": 0.8529061079025269, + "num_tokens": 6100034.0, + "step": 667 + }, + { + "epoch": 0.5075987841945289, + "grad_norm": 2.800635576248169, + "learning_rate": 4.809498408045691e-06, + "loss": 0.5087342262268066, + "mean_token_accuracy": 0.8214689493179321, + "num_tokens": 6104742.0, + "step": 668 + }, + { + "epoch": 0.5083586626139818, + "grad_norm": 1.5318149328231812, + "learning_rate": 4.808695702144206e-06, + "loss": 0.4733222723007202, + "mean_token_accuracy": 0.837577223777771, + "num_tokens": 6117242.0, + "step": 669 + }, + { + "epoch": 0.5091185410334347, + "grad_norm": 1.2368661165237427, + "learning_rate": 4.807891375911207e-06, + "loss": 0.3929097056388855, + "mean_token_accuracy": 0.8331400752067566, + "num_tokens": 6133509.0, + "step": 670 + }, + { + "epoch": 0.5098784194528876, + "grad_norm": 2.4711415767669678, + "learning_rate": 4.8070854299112e-06, + "loss": 0.6294851303100586, + "mean_token_accuracy": 0.7956781983375549, + "num_tokens": 6140294.0, + "step": 671 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.590961217880249, + "learning_rate": 4.806277864709828e-06, + "loss": 0.580160915851593, + "mean_token_accuracy": 0.809589684009552, + "num_tokens": 6145803.0, + "step": 672 + }, + { + "epoch": 0.5113981762917933, + "grad_norm": 2.4653842449188232, + "learning_rate": 4.805468680873874e-06, + "loss": 0.5262120366096497, + "mean_token_accuracy": 0.822458803653717, + "num_tokens": 6151236.0, + "step": 673 + }, + { + "epoch": 0.5121580547112462, + "grad_norm": 2.860720157623291, + "learning_rate": 4.804657878971252e-06, + "loss": 0.4007391035556793, + "mean_token_accuracy": 0.8637382984161377, + "num_tokens": 6155310.0, + "step": 674 + }, + { + "epoch": 0.5129179331306991, + "grad_norm": 2.520282030105591, + "learning_rate": 4.803845459571014e-06, + "loss": 0.45798182487487793, + "mean_token_accuracy": 0.8270114660263062, + "num_tokens": 6160326.0, + "step": 675 + }, + { + "epoch": 0.513677811550152, + "grad_norm": 2.7290921211242676, + "learning_rate": 4.803031423243349e-06, + "loss": 0.5745848417282104, + "mean_token_accuracy": 0.8401234745979309, + "num_tokens": 6165709.0, + "step": 676 + }, + { + "epoch": 0.5144376899696048, + "grad_norm": 1.6678650379180908, + "learning_rate": 4.802215770559578e-06, + "loss": 0.5257721543312073, + "mean_token_accuracy": 0.8241991996765137, + "num_tokens": 6177875.0, + "step": 677 + }, + { + "epoch": 0.5151975683890577, + "grad_norm": 2.1720468997955322, + "learning_rate": 4.801398502092156e-06, + "loss": 0.45342206954956055, + "mean_token_accuracy": 0.8463799953460693, + "num_tokens": 6185415.0, + "step": 678 + }, + { + "epoch": 0.5159574468085106, + "grad_norm": 2.282259702682495, + "learning_rate": 4.800579618414677e-06, + "loss": 0.4864169955253601, + "mean_token_accuracy": 0.8300632238388062, + "num_tokens": 6191832.0, + "step": 679 + }, + { + "epoch": 0.5167173252279635, + "grad_norm": 2.0092248916625977, + "learning_rate": 4.799759120101861e-06, + "loss": 0.5781463980674744, + "mean_token_accuracy": 0.8267031908035278, + "num_tokens": 6199440.0, + "step": 680 + }, + { + "epoch": 0.5174772036474165, + "grad_norm": 1.396580696105957, + "learning_rate": 4.798937007729568e-06, + "loss": 0.49689239263534546, + "mean_token_accuracy": 0.8257499933242798, + "num_tokens": 6213840.0, + "step": 681 + }, + { + "epoch": 0.5182370820668692, + "grad_norm": 1.9060769081115723, + "learning_rate": 4.798113281874788e-06, + "loss": 0.48969539999961853, + "mean_token_accuracy": 0.8171790838241577, + "num_tokens": 6223006.0, + "step": 682 + }, + { + "epoch": 0.5189969604863222, + "grad_norm": 1.6255282163619995, + "learning_rate": 4.797287943115642e-06, + "loss": 0.5532330870628357, + "mean_token_accuracy": 0.8173393607139587, + "num_tokens": 6234857.0, + "step": 683 + }, + { + "epoch": 0.5197568389057751, + "grad_norm": 1.6923905611038208, + "learning_rate": 4.796460992031386e-06, + "loss": 0.4880887269973755, + "mean_token_accuracy": 0.834983229637146, + "num_tokens": 6245252.0, + "step": 684 + }, + { + "epoch": 0.520516717325228, + "grad_norm": 2.13161301612854, + "learning_rate": 4.7956324292024045e-06, + "loss": 0.5687593817710876, + "mean_token_accuracy": 0.7996571063995361, + "num_tokens": 6253726.0, + "step": 685 + }, + { + "epoch": 0.5212765957446809, + "grad_norm": 2.509375810623169, + "learning_rate": 4.794802255210217e-06, + "loss": 0.5396929979324341, + "mean_token_accuracy": 0.8007107973098755, + "num_tokens": 6259238.0, + "step": 686 + }, + { + "epoch": 0.5220364741641338, + "grad_norm": 2.393710136413574, + "learning_rate": 4.793970470637469e-06, + "loss": 0.6165191531181335, + "mean_token_accuracy": 0.7891418933868408, + "num_tokens": 6266325.0, + "step": 687 + }, + { + "epoch": 0.5227963525835866, + "grad_norm": 1.511647343635559, + "learning_rate": 4.7931370760679415e-06, + "loss": 0.4773876965045929, + "mean_token_accuracy": 0.8381044864654541, + "num_tokens": 6277447.0, + "step": 688 + }, + { + "epoch": 0.5235562310030395, + "grad_norm": 2.206587314605713, + "learning_rate": 4.792302072086542e-06, + "loss": 0.5482058525085449, + "mean_token_accuracy": 0.8239108920097351, + "num_tokens": 6285163.0, + "step": 689 + }, + { + "epoch": 0.5243161094224924, + "grad_norm": 3.018146514892578, + "learning_rate": 4.7914654592793065e-06, + "loss": 0.4880615472793579, + "mean_token_accuracy": 0.8361308574676514, + "num_tokens": 6289386.0, + "step": 690 + }, + { + "epoch": 0.5250759878419453, + "grad_norm": 1.6469231843948364, + "learning_rate": 4.790627238233405e-06, + "loss": 0.4164774715900421, + "mean_token_accuracy": 0.8496290445327759, + "num_tokens": 6298915.0, + "step": 691 + }, + { + "epoch": 0.5258358662613982, + "grad_norm": 2.352505922317505, + "learning_rate": 4.789787409537131e-06, + "loss": 0.5366303324699402, + "mean_token_accuracy": 0.8350417613983154, + "num_tokens": 6306130.0, + "step": 692 + }, + { + "epoch": 0.526595744680851, + "grad_norm": 1.7463021278381348, + "learning_rate": 4.7889459737799105e-06, + "loss": 0.4389137923717499, + "mean_token_accuracy": 0.8463300466537476, + "num_tokens": 6315503.0, + "step": 693 + }, + { + "epoch": 0.5273556231003039, + "grad_norm": 2.257706642150879, + "learning_rate": 4.788102931552294e-06, + "loss": 0.5309344530105591, + "mean_token_accuracy": 0.8164352178573608, + "num_tokens": 6321852.0, + "step": 694 + }, + { + "epoch": 0.5281155015197568, + "grad_norm": 2.392732620239258, + "learning_rate": 4.787258283445962e-06, + "loss": 0.3956204056739807, + "mean_token_accuracy": 0.8671456575393677, + "num_tokens": 6327380.0, + "step": 695 + }, + { + "epoch": 0.5288753799392097, + "grad_norm": 2.210514545440674, + "learning_rate": 4.786412030053721e-06, + "loss": 0.4842875003814697, + "mean_token_accuracy": 0.8508446216583252, + "num_tokens": 6334898.0, + "step": 696 + }, + { + "epoch": 0.5296352583586627, + "grad_norm": 1.8678946495056152, + "learning_rate": 4.785564171969503e-06, + "loss": 0.47399595379829407, + "mean_token_accuracy": 0.8514996767044067, + "num_tokens": 6346374.0, + "step": 697 + }, + { + "epoch": 0.5303951367781155, + "grad_norm": 2.604079484939575, + "learning_rate": 4.784714709788368e-06, + "loss": 0.5950228571891785, + "mean_token_accuracy": 0.7983481884002686, + "num_tokens": 6351648.0, + "step": 698 + }, + { + "epoch": 0.5311550151975684, + "grad_norm": 1.662381649017334, + "learning_rate": 4.783863644106502e-06, + "loss": 0.41616758704185486, + "mean_token_accuracy": 0.8554803133010864, + "num_tokens": 6360506.0, + "step": 699 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 1.6300342082977295, + "learning_rate": 4.783010975521216e-06, + "loss": 0.43029269576072693, + "mean_token_accuracy": 0.8443028926849365, + "num_tokens": 6370675.0, + "step": 700 + }, + { + "epoch": 0.5326747720364742, + "grad_norm": 1.731873869895935, + "learning_rate": 4.782156704630944e-06, + "loss": 0.4383814334869385, + "mean_token_accuracy": 0.8443183898925781, + "num_tokens": 6381803.0, + "step": 701 + }, + { + "epoch": 0.5334346504559271, + "grad_norm": 3.1788413524627686, + "learning_rate": 4.7813008320352475e-06, + "loss": 0.32194480299949646, + "mean_token_accuracy": 0.8870962858200073, + "num_tokens": 6389263.0, + "step": 702 + }, + { + "epoch": 0.53419452887538, + "grad_norm": 2.099513530731201, + "learning_rate": 4.78044335833481e-06, + "loss": 0.36962923407554626, + "mean_token_accuracy": 0.8661133646965027, + "num_tokens": 6395589.0, + "step": 703 + }, + { + "epoch": 0.5349544072948328, + "grad_norm": 1.4859435558319092, + "learning_rate": 4.77958428413144e-06, + "loss": 0.4619954824447632, + "mean_token_accuracy": 0.8438555002212524, + "num_tokens": 6407470.0, + "step": 704 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 1.2561073303222656, + "learning_rate": 4.7787236100280685e-06, + "loss": 0.3770977258682251, + "mean_token_accuracy": 0.8515733480453491, + "num_tokens": 6422888.0, + "step": 705 + }, + { + "epoch": 0.5364741641337386, + "grad_norm": 1.4455817937850952, + "learning_rate": 4.777861336628751e-06, + "loss": 0.46481069922447205, + "mean_token_accuracy": 0.8502002954483032, + "num_tokens": 6441266.0, + "step": 706 + }, + { + "epoch": 0.5372340425531915, + "grad_norm": 1.1387295722961426, + "learning_rate": 4.7769974645386616e-06, + "loss": 0.36964765191078186, + "mean_token_accuracy": 0.8719524145126343, + "num_tokens": 6463686.0, + "step": 707 + }, + { + "epoch": 0.5379939209726444, + "grad_norm": 1.7179663181304932, + "learning_rate": 4.776131994364102e-06, + "loss": 0.4231719970703125, + "mean_token_accuracy": 0.8416585922241211, + "num_tokens": 6472956.0, + "step": 708 + }, + { + "epoch": 0.5387537993920972, + "grad_norm": 1.6328502893447876, + "learning_rate": 4.775264926712489e-06, + "loss": 0.5836569666862488, + "mean_token_accuracy": 0.8039724230766296, + "num_tokens": 6485773.0, + "step": 709 + }, + { + "epoch": 0.5395136778115501, + "grad_norm": 1.8515360355377197, + "learning_rate": 4.774396262192368e-06, + "loss": 0.5477553009986877, + "mean_token_accuracy": 0.8136521577835083, + "num_tokens": 6496379.0, + "step": 710 + }, + { + "epoch": 0.540273556231003, + "grad_norm": 1.741858959197998, + "learning_rate": 4.7735260014133986e-06, + "loss": 0.4663267731666565, + "mean_token_accuracy": 0.8473691940307617, + "num_tokens": 6507652.0, + "step": 711 + }, + { + "epoch": 0.541033434650456, + "grad_norm": 1.7516659498214722, + "learning_rate": 4.772654144986364e-06, + "loss": 0.374914288520813, + "mean_token_accuracy": 0.8600220680236816, + "num_tokens": 6519030.0, + "step": 712 + }, + { + "epoch": 0.5417933130699089, + "grad_norm": 2.662343978881836, + "learning_rate": 4.7717806935231665e-06, + "loss": 0.4206875264644623, + "mean_token_accuracy": 0.8544126749038696, + "num_tokens": 6523669.0, + "step": 713 + }, + { + "epoch": 0.5425531914893617, + "grad_norm": 1.4088834524154663, + "learning_rate": 4.770905647636828e-06, + "loss": 0.5824331045150757, + "mean_token_accuracy": 0.7857901453971863, + "num_tokens": 6540560.0, + "step": 714 + }, + { + "epoch": 0.5433130699088146, + "grad_norm": 2.173656940460205, + "learning_rate": 4.77002900794149e-06, + "loss": 0.555023729801178, + "mean_token_accuracy": 0.8067290782928467, + "num_tokens": 6548946.0, + "step": 715 + }, + { + "epoch": 0.5440729483282675, + "grad_norm": 2.121018648147583, + "learning_rate": 4.769150775052411e-06, + "loss": 0.559730052947998, + "mean_token_accuracy": 0.8166372776031494, + "num_tokens": 6556065.0, + "step": 716 + }, + { + "epoch": 0.5448328267477204, + "grad_norm": 3.335866928100586, + "learning_rate": 4.768270949585968e-06, + "loss": 0.6442267894744873, + "mean_token_accuracy": 0.7858607769012451, + "num_tokens": 6560615.0, + "step": 717 + }, + { + "epoch": 0.5455927051671733, + "grad_norm": 2.3813695907592773, + "learning_rate": 4.767389532159659e-06, + "loss": 0.4027421474456787, + "mean_token_accuracy": 0.8635619282722473, + "num_tokens": 6565841.0, + "step": 718 + }, + { + "epoch": 0.5463525835866262, + "grad_norm": 2.0657708644866943, + "learning_rate": 4.766506523392095e-06, + "loss": 0.38899827003479004, + "mean_token_accuracy": 0.8660480380058289, + "num_tokens": 6572362.0, + "step": 719 + }, + { + "epoch": 0.547112462006079, + "grad_norm": 1.093705415725708, + "learning_rate": 4.765621923903005e-06, + "loss": 0.45967352390289307, + "mean_token_accuracy": 0.8338102102279663, + "num_tokens": 6595998.0, + "step": 720 + }, + { + "epoch": 0.5478723404255319, + "grad_norm": 2.942065954208374, + "learning_rate": 4.764735734313236e-06, + "loss": 0.42910510301589966, + "mean_token_accuracy": 0.8406122922897339, + "num_tokens": 6601075.0, + "step": 721 + }, + { + "epoch": 0.5486322188449848, + "grad_norm": 2.049011707305908, + "learning_rate": 4.763847955244749e-06, + "loss": 0.5584231615066528, + "mean_token_accuracy": 0.8171684741973877, + "num_tokens": 6609310.0, + "step": 722 + }, + { + "epoch": 0.5493920972644377, + "grad_norm": 2.485543966293335, + "learning_rate": 4.762958587320623e-06, + "loss": 0.5396170020103455, + "mean_token_accuracy": 0.8158525824546814, + "num_tokens": 6616185.0, + "step": 723 + }, + { + "epoch": 0.5501519756838906, + "grad_norm": 1.87015962600708, + "learning_rate": 4.762067631165049e-06, + "loss": 0.49739527702331543, + "mean_token_accuracy": 0.8303765654563904, + "num_tokens": 6625629.0, + "step": 724 + }, + { + "epoch": 0.5509118541033434, + "grad_norm": 4.239654541015625, + "learning_rate": 4.761175087403336e-06, + "loss": 0.6029239296913147, + "mean_token_accuracy": 0.8123486042022705, + "num_tokens": 6629194.0, + "step": 725 + }, + { + "epoch": 0.5516717325227963, + "grad_norm": 2.0134730339050293, + "learning_rate": 4.760280956661904e-06, + "loss": 0.4777873754501343, + "mean_token_accuracy": 0.8283513784408569, + "num_tokens": 6636929.0, + "step": 726 + }, + { + "epoch": 0.5524316109422492, + "grad_norm": 1.991780400276184, + "learning_rate": 4.75938523956829e-06, + "loss": 0.4631248116493225, + "mean_token_accuracy": 0.8275107741355896, + "num_tokens": 6645135.0, + "step": 727 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 1.423792839050293, + "learning_rate": 4.75848793675114e-06, + "loss": 0.49630722403526306, + "mean_token_accuracy": 0.8388000130653381, + "num_tokens": 6662690.0, + "step": 728 + }, + { + "epoch": 0.5539513677811551, + "grad_norm": 2.345294952392578, + "learning_rate": 4.757589048840219e-06, + "loss": 0.37830638885498047, + "mean_token_accuracy": 0.8782080411911011, + "num_tokens": 6667285.0, + "step": 729 + }, + { + "epoch": 0.5547112462006079, + "grad_norm": 2.7452144622802734, + "learning_rate": 4.756688576466398e-06, + "loss": 0.51595538854599, + "mean_token_accuracy": 0.8441770672798157, + "num_tokens": 6672324.0, + "step": 730 + }, + { + "epoch": 0.5554711246200608, + "grad_norm": 1.5247859954833984, + "learning_rate": 4.755786520261666e-06, + "loss": 0.48365193605422974, + "mean_token_accuracy": 0.8276445269584656, + "num_tokens": 6685296.0, + "step": 731 + }, + { + "epoch": 0.5562310030395137, + "grad_norm": 1.4018276929855347, + "learning_rate": 4.75488288085912e-06, + "loss": 0.3876481354236603, + "mean_token_accuracy": 0.8612343072891235, + "num_tokens": 6697515.0, + "step": 732 + }, + { + "epoch": 0.5569908814589666, + "grad_norm": 2.9570324420928955, + "learning_rate": 4.753977658892967e-06, + "loss": 0.5468149185180664, + "mean_token_accuracy": 0.8054271340370178, + "num_tokens": 6702194.0, + "step": 733 + }, + { + "epoch": 0.5577507598784195, + "grad_norm": 1.9282715320587158, + "learning_rate": 4.753070854998529e-06, + "loss": 0.4758574962615967, + "mean_token_accuracy": 0.8379775285720825, + "num_tokens": 6709938.0, + "step": 734 + }, + { + "epoch": 0.5585106382978723, + "grad_norm": 1.981264591217041, + "learning_rate": 4.752162469812234e-06, + "loss": 0.48461222648620605, + "mean_token_accuracy": 0.833509087562561, + "num_tokens": 6718125.0, + "step": 735 + }, + { + "epoch": 0.5592705167173252, + "grad_norm": 1.1643427610397339, + "learning_rate": 4.751252503971624e-06, + "loss": 0.410121887922287, + "mean_token_accuracy": 0.8221402764320374, + "num_tokens": 6735125.0, + "step": 736 + }, + { + "epoch": 0.5600303951367781, + "grad_norm": 1.786566972732544, + "learning_rate": 4.750340958115346e-06, + "loss": 0.5964341163635254, + "mean_token_accuracy": 0.8038164377212524, + "num_tokens": 6747369.0, + "step": 737 + }, + { + "epoch": 0.560790273556231, + "grad_norm": 1.7256991863250732, + "learning_rate": 4.749427832883158e-06, + "loss": 0.48737066984176636, + "mean_token_accuracy": 0.830894947052002, + "num_tokens": 6758115.0, + "step": 738 + }, + { + "epoch": 0.5615501519756839, + "grad_norm": 1.997747540473938, + "learning_rate": 4.748513128915928e-06, + "loss": 0.5238886475563049, + "mean_token_accuracy": 0.8066858053207397, + "num_tokens": 6766111.0, + "step": 739 + }, + { + "epoch": 0.5623100303951368, + "grad_norm": 2.127016305923462, + "learning_rate": 4.747596846855629e-06, + "loss": 0.5045586228370667, + "mean_token_accuracy": 0.821424126625061, + "num_tokens": 6772893.0, + "step": 740 + }, + { + "epoch": 0.5630699088145896, + "grad_norm": 1.7664796113967896, + "learning_rate": 4.7466789873453446e-06, + "loss": 0.42954835295677185, + "mean_token_accuracy": 0.8533384799957275, + "num_tokens": 6785133.0, + "step": 741 + }, + { + "epoch": 0.5638297872340425, + "grad_norm": 1.4987404346466064, + "learning_rate": 4.7457595510292615e-06, + "loss": 0.5378558039665222, + "mean_token_accuracy": 0.8184819221496582, + "num_tokens": 6799563.0, + "step": 742 + }, + { + "epoch": 0.5645896656534954, + "grad_norm": 1.4444655179977417, + "learning_rate": 4.744838538552678e-06, + "loss": 0.42193782329559326, + "mean_token_accuracy": 0.837514340877533, + "num_tokens": 6812470.0, + "step": 743 + }, + { + "epoch": 0.5653495440729484, + "grad_norm": 3.867751121520996, + "learning_rate": 4.7439159505619946e-06, + "loss": 0.4457814693450928, + "mean_token_accuracy": 0.8630104660987854, + "num_tokens": 6815652.0, + "step": 744 + }, + { + "epoch": 0.5661094224924013, + "grad_norm": 2.1250710487365723, + "learning_rate": 4.74299178770472e-06, + "loss": 0.5638922452926636, + "mean_token_accuracy": 0.7969781160354614, + "num_tokens": 6824566.0, + "step": 745 + }, + { + "epoch": 0.5668693009118541, + "grad_norm": 2.547072410583496, + "learning_rate": 4.742066050629465e-06, + "loss": 0.5516207814216614, + "mean_token_accuracy": 0.8160669803619385, + "num_tokens": 6830589.0, + "step": 746 + }, + { + "epoch": 0.567629179331307, + "grad_norm": 1.2975233793258667, + "learning_rate": 4.741138739985951e-06, + "loss": 0.3823344111442566, + "mean_token_accuracy": 0.8668368458747864, + "num_tokens": 6842707.0, + "step": 747 + }, + { + "epoch": 0.5683890577507599, + "grad_norm": 1.3410450220108032, + "learning_rate": 4.740209856424998e-06, + "loss": 0.5148671269416809, + "mean_token_accuracy": 0.8188045024871826, + "num_tokens": 6857624.0, + "step": 748 + }, + { + "epoch": 0.5691489361702128, + "grad_norm": 1.219467282295227, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.3998957872390747, + "mean_token_accuracy": 0.855175256729126, + "num_tokens": 6875064.0, + "step": 749 + }, + { + "epoch": 0.5699088145896657, + "grad_norm": 1.3530343770980835, + "learning_rate": 4.738347373159585e-06, + "loss": 0.5359633564949036, + "mean_token_accuracy": 0.8178457021713257, + "num_tokens": 6890911.0, + "step": 750 + }, + { + "epoch": 0.5706686930091185, + "grad_norm": 2.146988868713379, + "learning_rate": 4.737413774762287e-06, + "loss": 0.4460008144378662, + "mean_token_accuracy": 0.8172903060913086, + "num_tokens": 6896959.0, + "step": 751 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.456023097038269, + "learning_rate": 4.736478606061876e-06, + "loss": 0.43616920709609985, + "mean_token_accuracy": 0.8465108871459961, + "num_tokens": 6908904.0, + "step": 752 + }, + { + "epoch": 0.5721884498480243, + "grad_norm": 2.9696967601776123, + "learning_rate": 4.735541867714687e-06, + "loss": 0.43464532494544983, + "mean_token_accuracy": 0.8608652353286743, + "num_tokens": 6913026.0, + "step": 753 + }, + { + "epoch": 0.5729483282674772, + "grad_norm": 2.2990667819976807, + "learning_rate": 4.73460356037816e-06, + "loss": 0.6619116067886353, + "mean_token_accuracy": 0.7821142673492432, + "num_tokens": 6920588.0, + "step": 754 + }, + { + "epoch": 0.5737082066869301, + "grad_norm": 2.054746389389038, + "learning_rate": 4.733663684710835e-06, + "loss": 0.5304250717163086, + "mean_token_accuracy": 0.8265531063079834, + "num_tokens": 6928910.0, + "step": 755 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.0050594806671143, + "learning_rate": 4.732722241372354e-06, + "loss": 0.6393026113510132, + "mean_token_accuracy": 0.796819806098938, + "num_tokens": 6940217.0, + "step": 756 + }, + { + "epoch": 0.5752279635258358, + "grad_norm": 1.4285320043563843, + "learning_rate": 4.731779231023456e-06, + "loss": 0.5432837009429932, + "mean_token_accuracy": 0.8104778528213501, + "num_tokens": 6959101.0, + "step": 757 + }, + { + "epoch": 0.5759878419452887, + "grad_norm": 2.3941943645477295, + "learning_rate": 4.730834654325984e-06, + "loss": 0.46550673246383667, + "mean_token_accuracy": 0.8444503545761108, + "num_tokens": 6965036.0, + "step": 758 + }, + { + "epoch": 0.5767477203647416, + "grad_norm": 2.3850574493408203, + "learning_rate": 4.729888511942877e-06, + "loss": 0.4916389584541321, + "mean_token_accuracy": 0.8228527307510376, + "num_tokens": 6971184.0, + "step": 759 + }, + { + "epoch": 0.5775075987841946, + "grad_norm": 1.627480149269104, + "learning_rate": 4.728940804538176e-06, + "loss": 0.5863215923309326, + "mean_token_accuracy": 0.7995302677154541, + "num_tokens": 6982569.0, + "step": 760 + }, + { + "epoch": 0.5782674772036475, + "grad_norm": 1.1723195314407349, + "learning_rate": 4.727991532777016e-06, + "loss": 0.36908864974975586, + "mean_token_accuracy": 0.8355655670166016, + "num_tokens": 6998659.0, + "step": 761 + }, + { + "epoch": 0.5790273556231003, + "grad_norm": 1.5324925184249878, + "learning_rate": 4.727040697325634e-06, + "loss": 0.557658851146698, + "mean_token_accuracy": 0.8141458034515381, + "num_tokens": 7012969.0, + "step": 762 + }, + { + "epoch": 0.5797872340425532, + "grad_norm": 2.4106390476226807, + "learning_rate": 4.726088298851362e-06, + "loss": 0.5004243850708008, + "mean_token_accuracy": 0.8376860618591309, + "num_tokens": 7018301.0, + "step": 763 + }, + { + "epoch": 0.5805471124620061, + "grad_norm": 2.2594921588897705, + "learning_rate": 4.725134338022631e-06, + "loss": 0.6067016124725342, + "mean_token_accuracy": 0.8100241422653198, + "num_tokens": 7025201.0, + "step": 764 + }, + { + "epoch": 0.581306990881459, + "grad_norm": 1.4649826288223267, + "learning_rate": 4.724178815508967e-06, + "loss": 0.36200693249702454, + "mean_token_accuracy": 0.8621826171875, + "num_tokens": 7035112.0, + "step": 765 + }, + { + "epoch": 0.5820668693009119, + "grad_norm": 2.3634560108184814, + "learning_rate": 4.723221731980993e-06, + "loss": 0.41862213611602783, + "mean_token_accuracy": 0.8541463613510132, + "num_tokens": 7040339.0, + "step": 766 + }, + { + "epoch": 0.5828267477203647, + "grad_norm": 2.7798104286193848, + "learning_rate": 4.722263088110426e-06, + "loss": 0.4647108018398285, + "mean_token_accuracy": 0.8505672216415405, + "num_tokens": 7044880.0, + "step": 767 + }, + { + "epoch": 0.5835866261398176, + "grad_norm": 2.070528507232666, + "learning_rate": 4.721302884570079e-06, + "loss": 0.5147565007209778, + "mean_token_accuracy": 0.8113877773284912, + "num_tokens": 7052433.0, + "step": 768 + }, + { + "epoch": 0.5843465045592705, + "grad_norm": 2.1953284740448, + "learning_rate": 4.720341122033862e-06, + "loss": 0.5075466632843018, + "mean_token_accuracy": 0.8474211096763611, + "num_tokens": 7058686.0, + "step": 769 + }, + { + "epoch": 0.5851063829787234, + "grad_norm": 1.9287755489349365, + "learning_rate": 4.719377801176774e-06, + "loss": 0.5382202863693237, + "mean_token_accuracy": 0.8148090243339539, + "num_tokens": 7067538.0, + "step": 770 + }, + { + "epoch": 0.5858662613981763, + "grad_norm": 1.5574456453323364, + "learning_rate": 4.718412922674913e-06, + "loss": 0.43406790494918823, + "mean_token_accuracy": 0.8477081060409546, + "num_tokens": 7077853.0, + "step": 771 + }, + { + "epoch": 0.5866261398176292, + "grad_norm": 1.5490336418151855, + "learning_rate": 4.717446487205466e-06, + "loss": 0.43164271116256714, + "mean_token_accuracy": 0.8504570126533508, + "num_tokens": 7091728.0, + "step": 772 + }, + { + "epoch": 0.587386018237082, + "grad_norm": 1.6945984363555908, + "learning_rate": 4.716478495446717e-06, + "loss": 0.5153743624687195, + "mean_token_accuracy": 0.8213579058647156, + "num_tokens": 7108680.0, + "step": 773 + }, + { + "epoch": 0.5881458966565349, + "grad_norm": 2.2633883953094482, + "learning_rate": 4.715508948078037e-06, + "loss": 0.45254790782928467, + "mean_token_accuracy": 0.8392219543457031, + "num_tokens": 7115546.0, + "step": 774 + }, + { + "epoch": 0.5889057750759878, + "grad_norm": 1.5731090307235718, + "learning_rate": 4.714537845779894e-06, + "loss": 0.38678881525993347, + "mean_token_accuracy": 0.8800252676010132, + "num_tokens": 7126360.0, + "step": 775 + }, + { + "epoch": 0.5896656534954408, + "grad_norm": 2.4873392581939697, + "learning_rate": 4.7135651892338445e-06, + "loss": 0.5190927386283875, + "mean_token_accuracy": 0.8145407438278198, + "num_tokens": 7135705.0, + "step": 776 + }, + { + "epoch": 0.5904255319148937, + "grad_norm": 1.2931004762649536, + "learning_rate": 4.712590979122534e-06, + "loss": 0.3686544895172119, + "mean_token_accuracy": 0.8720537424087524, + "num_tokens": 7150688.0, + "step": 777 + }, + { + "epoch": 0.5911854103343465, + "grad_norm": 1.6353671550750732, + "learning_rate": 4.7116152161297045e-06, + "loss": 0.49065062403678894, + "mean_token_accuracy": 0.8203760385513306, + "num_tokens": 7161040.0, + "step": 778 + }, + { + "epoch": 0.5919452887537994, + "grad_norm": 1.2345483303070068, + "learning_rate": 4.710637900940181e-06, + "loss": 0.4004976451396942, + "mean_token_accuracy": 0.8302007913589478, + "num_tokens": 7178074.0, + "step": 779 + }, + { + "epoch": 0.5927051671732523, + "grad_norm": 2.2506837844848633, + "learning_rate": 4.7096590342398825e-06, + "loss": 0.45142874121665955, + "mean_token_accuracy": 0.8481036424636841, + "num_tokens": 7184153.0, + "step": 780 + }, + { + "epoch": 0.5934650455927052, + "grad_norm": 1.420479416847229, + "learning_rate": 4.708678616715815e-06, + "loss": 0.4802100360393524, + "mean_token_accuracy": 0.8586992025375366, + "num_tokens": 7202810.0, + "step": 781 + }, + { + "epoch": 0.5942249240121581, + "grad_norm": 3.457632303237915, + "learning_rate": 4.707696649056073e-06, + "loss": 0.5265094041824341, + "mean_token_accuracy": 0.8260114192962646, + "num_tokens": 7206396.0, + "step": 782 + }, + { + "epoch": 0.5949848024316109, + "grad_norm": 1.1592093706130981, + "learning_rate": 4.706713131949839e-06, + "loss": 0.3708173632621765, + "mean_token_accuracy": 0.8476542234420776, + "num_tokens": 7225034.0, + "step": 783 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 1.6761400699615479, + "learning_rate": 4.705728066087384e-06, + "loss": 0.4137252867221832, + "mean_token_accuracy": 0.8462049961090088, + "num_tokens": 7237101.0, + "step": 784 + }, + { + "epoch": 0.5965045592705167, + "grad_norm": 2.320185422897339, + "learning_rate": 4.704741452160064e-06, + "loss": 0.5157154202461243, + "mean_token_accuracy": 0.8391785621643066, + "num_tokens": 7243826.0, + "step": 785 + }, + { + "epoch": 0.5972644376899696, + "grad_norm": 2.079423427581787, + "learning_rate": 4.703753290860323e-06, + "loss": 0.4734993278980255, + "mean_token_accuracy": 0.8353281021118164, + "num_tokens": 7250175.0, + "step": 786 + }, + { + "epoch": 0.5980243161094225, + "grad_norm": 1.8215159177780151, + "learning_rate": 4.702763582881692e-06, + "loss": 0.520193338394165, + "mean_token_accuracy": 0.844062864780426, + "num_tokens": 7258868.0, + "step": 787 + }, + { + "epoch": 0.5987841945288754, + "grad_norm": 1.3823071718215942, + "learning_rate": 4.701772328918784e-06, + "loss": 0.4177844822406769, + "mean_token_accuracy": 0.8363165259361267, + "num_tokens": 7271744.0, + "step": 788 + }, + { + "epoch": 0.5995440729483282, + "grad_norm": 2.4749298095703125, + "learning_rate": 4.700779529667301e-06, + "loss": 0.5115069150924683, + "mean_token_accuracy": 0.8473520278930664, + "num_tokens": 7277040.0, + "step": 789 + }, + { + "epoch": 0.6003039513677811, + "grad_norm": 1.7072296142578125, + "learning_rate": 4.699785185824026e-06, + "loss": 0.5265800952911377, + "mean_token_accuracy": 0.8161447048187256, + "num_tokens": 7288288.0, + "step": 790 + }, + { + "epoch": 0.601063829787234, + "grad_norm": 1.6479384899139404, + "learning_rate": 4.69878929808683e-06, + "loss": 0.4445168972015381, + "mean_token_accuracy": 0.8381255865097046, + "num_tokens": 7298640.0, + "step": 791 + }, + { + "epoch": 0.601823708206687, + "grad_norm": 1.9095896482467651, + "learning_rate": 4.6977918671546635e-06, + "loss": 0.5841238498687744, + "mean_token_accuracy": 0.7971454858779907, + "num_tokens": 7307220.0, + "step": 792 + }, + { + "epoch": 0.6025835866261399, + "grad_norm": 1.9614146947860718, + "learning_rate": 4.696792893727562e-06, + "loss": 0.34684082865715027, + "mean_token_accuracy": 0.8739526271820068, + "num_tokens": 7313875.0, + "step": 793 + }, + { + "epoch": 0.6033434650455927, + "grad_norm": 2.015570640563965, + "learning_rate": 4.695792378506645e-06, + "loss": 0.42779117822647095, + "mean_token_accuracy": 0.8625012636184692, + "num_tokens": 7321439.0, + "step": 794 + }, + { + "epoch": 0.6041033434650456, + "grad_norm": 2.8581228256225586, + "learning_rate": 4.694790322194111e-06, + "loss": 0.6519991159439087, + "mean_token_accuracy": 0.7629562616348267, + "num_tokens": 7326916.0, + "step": 795 + }, + { + "epoch": 0.6048632218844985, + "grad_norm": 2.482715368270874, + "learning_rate": 4.693786725493242e-06, + "loss": 0.532963216304779, + "mean_token_accuracy": 0.832184910774231, + "num_tokens": 7333311.0, + "step": 796 + }, + { + "epoch": 0.6056231003039514, + "grad_norm": 1.6076741218566895, + "learning_rate": 4.692781589108402e-06, + "loss": 0.43381205201148987, + "mean_token_accuracy": 0.8402494192123413, + "num_tokens": 7343731.0, + "step": 797 + }, + { + "epoch": 0.6063829787234043, + "grad_norm": 2.2133216857910156, + "learning_rate": 4.691774913745033e-06, + "loss": 0.4380851089954376, + "mean_token_accuracy": 0.8600908517837524, + "num_tokens": 7350224.0, + "step": 798 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 2.046280860900879, + "learning_rate": 4.690766700109659e-06, + "loss": 0.3821919560432434, + "mean_token_accuracy": 0.8691814541816711, + "num_tokens": 7356717.0, + "step": 799 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 1.8482693433761597, + "learning_rate": 4.689756948909884e-06, + "loss": 0.5217651128768921, + "mean_token_accuracy": 0.803473711013794, + "num_tokens": 7365806.0, + "step": 800 + }, + { + "epoch": 0.6086626139817629, + "grad_norm": 2.192134141921997, + "learning_rate": 4.688745660854388e-06, + "loss": 0.573980987071991, + "mean_token_accuracy": 0.8198676109313965, + "num_tokens": 7380281.0, + "step": 801 + }, + { + "epoch": 0.6094224924012158, + "grad_norm": 2.363626718521118, + "learning_rate": 4.687732836652935e-06, + "loss": 0.5204599499702454, + "mean_token_accuracy": 0.8373252153396606, + "num_tokens": 7386938.0, + "step": 802 + }, + { + "epoch": 0.6101823708206687, + "grad_norm": 1.9320523738861084, + "learning_rate": 4.686718477016361e-06, + "loss": 0.47316622734069824, + "mean_token_accuracy": 0.830596923828125, + "num_tokens": 7395069.0, + "step": 803 + }, + { + "epoch": 0.6109422492401215, + "grad_norm": 2.6573057174682617, + "learning_rate": 4.6857025826565845e-06, + "loss": 0.5495861768722534, + "mean_token_accuracy": 0.8187421560287476, + "num_tokens": 7400563.0, + "step": 804 + }, + { + "epoch": 0.6117021276595744, + "grad_norm": 2.0893123149871826, + "learning_rate": 4.684685154286599e-06, + "loss": 0.5362675786018372, + "mean_token_accuracy": 0.8394701480865479, + "num_tokens": 7406973.0, + "step": 805 + }, + { + "epoch": 0.6124620060790273, + "grad_norm": 2.455130100250244, + "learning_rate": 4.683666192620474e-06, + "loss": 0.5405995845794678, + "mean_token_accuracy": 0.8079100847244263, + "num_tokens": 7412931.0, + "step": 806 + }, + { + "epoch": 0.6132218844984803, + "grad_norm": 2.311915636062622, + "learning_rate": 4.682645698373357e-06, + "loss": 0.5395106077194214, + "mean_token_accuracy": 0.8156260251998901, + "num_tokens": 7419699.0, + "step": 807 + }, + { + "epoch": 0.6139817629179332, + "grad_norm": 1.686838984489441, + "learning_rate": 4.6816236722614694e-06, + "loss": 0.6034521460533142, + "mean_token_accuracy": 0.7855954170227051, + "num_tokens": 7431899.0, + "step": 808 + }, + { + "epoch": 0.6147416413373861, + "grad_norm": 1.682759165763855, + "learning_rate": 4.680600115002109e-06, + "loss": 0.48593831062316895, + "mean_token_accuracy": 0.8229435682296753, + "num_tokens": 7443187.0, + "step": 809 + }, + { + "epoch": 0.6155015197568389, + "grad_norm": 2.064589738845825, + "learning_rate": 4.679575027313649e-06, + "loss": 0.5098468661308289, + "mean_token_accuracy": 0.8234638571739197, + "num_tokens": 7450868.0, + "step": 810 + }, + { + "epoch": 0.6162613981762918, + "grad_norm": 2.2063486576080322, + "learning_rate": 4.6785484099155324e-06, + "loss": 0.5138497352600098, + "mean_token_accuracy": 0.8152111172676086, + "num_tokens": 7457176.0, + "step": 811 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 1.6258726119995117, + "learning_rate": 4.67752026352828e-06, + "loss": 0.4064181447029114, + "mean_token_accuracy": 0.8720619678497314, + "num_tokens": 7466557.0, + "step": 812 + }, + { + "epoch": 0.6177811550151976, + "grad_norm": 2.3309383392333984, + "learning_rate": 4.676490588873486e-06, + "loss": 0.5180112719535828, + "mean_token_accuracy": 0.8233879804611206, + "num_tokens": 7472650.0, + "step": 813 + }, + { + "epoch": 0.6185410334346505, + "grad_norm": 1.4545246362686157, + "learning_rate": 4.675459386673815e-06, + "loss": 0.37917959690093994, + "mean_token_accuracy": 0.8598103523254395, + "num_tokens": 7485171.0, + "step": 814 + }, + { + "epoch": 0.6193009118541033, + "grad_norm": 2.654231071472168, + "learning_rate": 4.674426657653003e-06, + "loss": 0.554074227809906, + "mean_token_accuracy": 0.8026446104049683, + "num_tokens": 7490787.0, + "step": 815 + }, + { + "epoch": 0.6200607902735562, + "grad_norm": 1.5543994903564453, + "learning_rate": 4.67339240253586e-06, + "loss": 0.6335440278053284, + "mean_token_accuracy": 0.783241868019104, + "num_tokens": 7505975.0, + "step": 816 + }, + { + "epoch": 0.6208206686930091, + "grad_norm": 2.079998016357422, + "learning_rate": 4.672356622048266e-06, + "loss": 0.5169394016265869, + "mean_token_accuracy": 0.8088761568069458, + "num_tokens": 7513470.0, + "step": 817 + }, + { + "epoch": 0.621580547112462, + "grad_norm": 1.5971896648406982, + "learning_rate": 4.671319316917172e-06, + "loss": 0.44588586688041687, + "mean_token_accuracy": 0.8518649339675903, + "num_tokens": 7524352.0, + "step": 818 + }, + { + "epoch": 0.6223404255319149, + "grad_norm": 2.477579116821289, + "learning_rate": 4.670280487870599e-06, + "loss": 0.5713893175125122, + "mean_token_accuracy": 0.8116940259933472, + "num_tokens": 7530359.0, + "step": 819 + }, + { + "epoch": 0.6231003039513677, + "grad_norm": 2.066211700439453, + "learning_rate": 4.669240135637635e-06, + "loss": 0.5295331478118896, + "mean_token_accuracy": 0.819536566734314, + "num_tokens": 7536963.0, + "step": 820 + }, + { + "epoch": 0.6238601823708206, + "grad_norm": 2.1217997074127197, + "learning_rate": 4.668198260948442e-06, + "loss": 0.6146406531333923, + "mean_token_accuracy": 0.7932635545730591, + "num_tokens": 7545800.0, + "step": 821 + }, + { + "epoch": 0.6246200607902735, + "grad_norm": 2.0173542499542236, + "learning_rate": 4.667154864534245e-06, + "loss": 0.6240535974502563, + "mean_token_accuracy": 0.7883644104003906, + "num_tokens": 7556165.0, + "step": 822 + }, + { + "epoch": 0.6253799392097265, + "grad_norm": 2.014526128768921, + "learning_rate": 4.666109947127343e-06, + "loss": 0.40367332100868225, + "mean_token_accuracy": 0.8653522729873657, + "num_tokens": 7562665.0, + "step": 823 + }, + { + "epoch": 0.6261398176291794, + "grad_norm": 2.5078861713409424, + "learning_rate": 4.665063509461098e-06, + "loss": 0.5903617739677429, + "mean_token_accuracy": 0.7902897596359253, + "num_tokens": 7568922.0, + "step": 824 + }, + { + "epoch": 0.6268996960486323, + "grad_norm": 2.454622745513916, + "learning_rate": 4.664015552269938e-06, + "loss": 0.5238361358642578, + "mean_token_accuracy": 0.838546872138977, + "num_tokens": 7575965.0, + "step": 825 + }, + { + "epoch": 0.6276595744680851, + "grad_norm": 2.920919418334961, + "learning_rate": 4.662966076289363e-06, + "loss": 0.5028782486915588, + "mean_token_accuracy": 0.8311152458190918, + "num_tokens": 7580193.0, + "step": 826 + }, + { + "epoch": 0.628419452887538, + "grad_norm": 1.545382022857666, + "learning_rate": 4.661915082255932e-06, + "loss": 0.4817378520965576, + "mean_token_accuracy": 0.8373227119445801, + "num_tokens": 7593024.0, + "step": 827 + }, + { + "epoch": 0.6291793313069909, + "grad_norm": 1.5152469873428345, + "learning_rate": 4.6608625709072766e-06, + "loss": 0.4693033695220947, + "mean_token_accuracy": 0.8150848150253296, + "num_tokens": 7606459.0, + "step": 828 + }, + { + "epoch": 0.6299392097264438, + "grad_norm": 2.1310224533081055, + "learning_rate": 4.659808542982089e-06, + "loss": 0.4653395414352417, + "mean_token_accuracy": 0.8286294341087341, + "num_tokens": 7613036.0, + "step": 829 + }, + { + "epoch": 0.6306990881458967, + "grad_norm": 2.1949679851531982, + "learning_rate": 4.658752999220125e-06, + "loss": 0.3698633909225464, + "mean_token_accuracy": 0.871590793132782, + "num_tokens": 7618527.0, + "step": 830 + }, + { + "epoch": 0.6314589665653495, + "grad_norm": 2.2770416736602783, + "learning_rate": 4.657695940362207e-06, + "loss": 0.5202419757843018, + "mean_token_accuracy": 0.817577600479126, + "num_tokens": 7624459.0, + "step": 831 + }, + { + "epoch": 0.6322188449848024, + "grad_norm": 1.402042269706726, + "learning_rate": 4.65663736715022e-06, + "loss": 0.51531583070755, + "mean_token_accuracy": 0.8228116631507874, + "num_tokens": 7639371.0, + "step": 832 + }, + { + "epoch": 0.6329787234042553, + "grad_norm": 3.3554883003234863, + "learning_rate": 4.65557728032711e-06, + "loss": 0.6771188378334045, + "mean_token_accuracy": 0.7880028486251831, + "num_tokens": 7643924.0, + "step": 833 + }, + { + "epoch": 0.6337386018237082, + "grad_norm": 2.081040143966675, + "learning_rate": 4.654515680636888e-06, + "loss": 0.5712796449661255, + "mean_token_accuracy": 0.8177868127822876, + "num_tokens": 7651881.0, + "step": 834 + }, + { + "epoch": 0.6344984802431611, + "grad_norm": 0.9128716588020325, + "learning_rate": 4.653452568824625e-06, + "loss": 0.3423936069011688, + "mean_token_accuracy": 0.8782886266708374, + "num_tokens": 7677829.0, + "step": 835 + }, + { + "epoch": 0.6352583586626139, + "grad_norm": 3.49015736579895, + "learning_rate": 4.652387945636454e-06, + "loss": 0.34657734632492065, + "mean_token_accuracy": 0.8770567178726196, + "num_tokens": 7680796.0, + "step": 836 + }, + { + "epoch": 0.6360182370820668, + "grad_norm": 2.026247501373291, + "learning_rate": 4.651321811819568e-06, + "loss": 0.5098431706428528, + "mean_token_accuracy": 0.8216961622238159, + "num_tokens": 7688746.0, + "step": 837 + }, + { + "epoch": 0.6367781155015197, + "grad_norm": 2.444343090057373, + "learning_rate": 4.650254168122222e-06, + "loss": 0.5490090250968933, + "mean_token_accuracy": 0.8092857599258423, + "num_tokens": 7695220.0, + "step": 838 + }, + { + "epoch": 0.6375379939209727, + "grad_norm": 2.0171122550964355, + "learning_rate": 4.649185015293728e-06, + "loss": 0.47221142053604126, + "mean_token_accuracy": 0.8514408469200134, + "num_tokens": 7702759.0, + "step": 839 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 1.9800984859466553, + "learning_rate": 4.64811435408446e-06, + "loss": 0.5238803625106812, + "mean_token_accuracy": 0.8479194641113281, + "num_tokens": 7714017.0, + "step": 840 + }, + { + "epoch": 0.6390577507598785, + "grad_norm": 3.0674357414245605, + "learning_rate": 4.647042185245848e-06, + "loss": 0.4668245315551758, + "mean_token_accuracy": 0.8381714820861816, + "num_tokens": 7717801.0, + "step": 841 + }, + { + "epoch": 0.6398176291793313, + "grad_norm": 1.5672820806503296, + "learning_rate": 4.645968509530381e-06, + "loss": 0.4428741931915283, + "mean_token_accuracy": 0.8416479825973511, + "num_tokens": 7728342.0, + "step": 842 + }, + { + "epoch": 0.6405775075987842, + "grad_norm": 2.3042354583740234, + "learning_rate": 4.644893327691608e-06, + "loss": 0.49937760829925537, + "mean_token_accuracy": 0.827070951461792, + "num_tokens": 7734576.0, + "step": 843 + }, + { + "epoch": 0.6413373860182371, + "grad_norm": 2.057772159576416, + "learning_rate": 4.6438166404841316e-06, + "loss": 0.5912986993789673, + "mean_token_accuracy": 0.805509090423584, + "num_tokens": 7742481.0, + "step": 844 + }, + { + "epoch": 0.64209726443769, + "grad_norm": 1.9688186645507812, + "learning_rate": 4.6427384486636115e-06, + "loss": 0.482401967048645, + "mean_token_accuracy": 0.8358086347579956, + "num_tokens": 7750002.0, + "step": 845 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 2.6852948665618896, + "learning_rate": 4.6416587529867665e-06, + "loss": 0.5479315519332886, + "mean_token_accuracy": 0.8091106414794922, + "num_tokens": 7755578.0, + "step": 846 + }, + { + "epoch": 0.6436170212765957, + "grad_norm": 2.0547337532043457, + "learning_rate": 4.640577554211366e-06, + "loss": 0.5327274203300476, + "mean_token_accuracy": 0.8280376195907593, + "num_tokens": 7763513.0, + "step": 847 + }, + { + "epoch": 0.6443768996960486, + "grad_norm": 2.0328633785247803, + "learning_rate": 4.63949485309624e-06, + "loss": 0.4814409613609314, + "mean_token_accuracy": 0.8527672290802002, + "num_tokens": 7771131.0, + "step": 848 + }, + { + "epoch": 0.6451367781155015, + "grad_norm": 1.5892863273620605, + "learning_rate": 4.638410650401267e-06, + "loss": 0.4492785334587097, + "mean_token_accuracy": 0.846997857093811, + "num_tokens": 7781572.0, + "step": 849 + }, + { + "epoch": 0.6458966565349544, + "grad_norm": 1.8295910358428955, + "learning_rate": 4.637324946887384e-06, + "loss": 0.37088239192962646, + "mean_token_accuracy": 0.8616628646850586, + "num_tokens": 7788604.0, + "step": 850 + }, + { + "epoch": 0.6466565349544073, + "grad_norm": 3.380040168762207, + "learning_rate": 4.636237743316578e-06, + "loss": 0.4737280607223511, + "mean_token_accuracy": 0.855940580368042, + "num_tokens": 7792504.0, + "step": 851 + }, + { + "epoch": 0.6474164133738601, + "grad_norm": 2.8790009021759033, + "learning_rate": 4.635149040451891e-06, + "loss": 0.39790448546409607, + "mean_token_accuracy": 0.8710698485374451, + "num_tokens": 7796333.0, + "step": 852 + }, + { + "epoch": 0.648176291793313, + "grad_norm": 1.914914608001709, + "learning_rate": 4.634058839057417e-06, + "loss": 0.2954312562942505, + "mean_token_accuracy": 0.8880234956741333, + "num_tokens": 7802456.0, + "step": 853 + }, + { + "epoch": 0.648936170212766, + "grad_norm": 1.3709120750427246, + "learning_rate": 4.632967139898301e-06, + "loss": 0.43224576115608215, + "mean_token_accuracy": 0.8446190357208252, + "num_tokens": 7816770.0, + "step": 854 + }, + { + "epoch": 0.6496960486322189, + "grad_norm": 1.6579312086105347, + "learning_rate": 4.63187394374074e-06, + "loss": 0.3535553514957428, + "mean_token_accuracy": 0.8738704919815063, + "num_tokens": 7824963.0, + "step": 855 + }, + { + "epoch": 0.6504559270516718, + "grad_norm": 2.4055678844451904, + "learning_rate": 4.63077925135198e-06, + "loss": 0.5078744292259216, + "mean_token_accuracy": 0.8430874347686768, + "num_tokens": 7830962.0, + "step": 856 + }, + { + "epoch": 0.6512158054711246, + "grad_norm": 2.5171499252319336, + "learning_rate": 4.629683063500319e-06, + "loss": 0.5172419548034668, + "mean_token_accuracy": 0.8087141513824463, + "num_tokens": 7836638.0, + "step": 857 + }, + { + "epoch": 0.6519756838905775, + "grad_norm": 1.7588486671447754, + "learning_rate": 4.628585380955104e-06, + "loss": 0.5759496092796326, + "mean_token_accuracy": 0.8043236136436462, + "num_tokens": 7844654.0, + "step": 858 + }, + { + "epoch": 0.6527355623100304, + "grad_norm": 1.5887070894241333, + "learning_rate": 4.62748620448673e-06, + "loss": 0.41849038004875183, + "mean_token_accuracy": 0.8556643724441528, + "num_tokens": 7855642.0, + "step": 859 + }, + { + "epoch": 0.6534954407294833, + "grad_norm": 3.227942705154419, + "learning_rate": 4.626385534866642e-06, + "loss": 0.5279449224472046, + "mean_token_accuracy": 0.8250958323478699, + "num_tokens": 7859890.0, + "step": 860 + }, + { + "epoch": 0.6542553191489362, + "grad_norm": 2.440467119216919, + "learning_rate": 4.625283372867333e-06, + "loss": 0.5294933319091797, + "mean_token_accuracy": 0.8235013484954834, + "num_tokens": 7866766.0, + "step": 861 + }, + { + "epoch": 0.6550151975683891, + "grad_norm": 2.4106903076171875, + "learning_rate": 4.624179719262342e-06, + "loss": 0.5662813186645508, + "mean_token_accuracy": 0.8061668872833252, + "num_tokens": 7872809.0, + "step": 862 + }, + { + "epoch": 0.6557750759878419, + "grad_norm": 3.5151145458221436, + "learning_rate": 4.623074574826254e-06, + "loss": 0.5471097230911255, + "mean_token_accuracy": 0.8220691084861755, + "num_tokens": 7876136.0, + "step": 863 + }, + { + "epoch": 0.6565349544072948, + "grad_norm": 1.5319840908050537, + "learning_rate": 4.621967940334705e-06, + "loss": 0.4178982377052307, + "mean_token_accuracy": 0.8517135977745056, + "num_tokens": 7886113.0, + "step": 864 + }, + { + "epoch": 0.6572948328267477, + "grad_norm": 1.63701331615448, + "learning_rate": 4.620859816564371e-06, + "loss": 0.4666512608528137, + "mean_token_accuracy": 0.8223508596420288, + "num_tokens": 7897982.0, + "step": 865 + }, + { + "epoch": 0.6580547112462006, + "grad_norm": 2.1515414714813232, + "learning_rate": 4.619750204292978e-06, + "loss": 0.5359305143356323, + "mean_token_accuracy": 0.8192868232727051, + "num_tokens": 7904947.0, + "step": 866 + }, + { + "epoch": 0.6588145896656535, + "grad_norm": 2.2140955924987793, + "learning_rate": 4.618639104299294e-06, + "loss": 0.5275633931159973, + "mean_token_accuracy": 0.8120715618133545, + "num_tokens": 7913913.0, + "step": 867 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 1.3956893682479858, + "learning_rate": 4.6175265173631304e-06, + "loss": 0.4378768503665924, + "mean_token_accuracy": 0.8479125499725342, + "num_tokens": 7927979.0, + "step": 868 + }, + { + "epoch": 0.6603343465045592, + "grad_norm": 2.98103928565979, + "learning_rate": 4.616412444265344e-06, + "loss": 0.42614591121673584, + "mean_token_accuracy": 0.8595094680786133, + "num_tokens": 7934293.0, + "step": 869 + }, + { + "epoch": 0.6610942249240122, + "grad_norm": 2.554845094680786, + "learning_rate": 4.6152968857878365e-06, + "loss": 0.3698030412197113, + "mean_token_accuracy": 0.8717041015625, + "num_tokens": 7938547.0, + "step": 870 + }, + { + "epoch": 0.6618541033434651, + "grad_norm": 3.0901825428009033, + "learning_rate": 4.6141798427135475e-06, + "loss": 0.5037497282028198, + "mean_token_accuracy": 0.8354041576385498, + "num_tokens": 7942829.0, + "step": 871 + }, + { + "epoch": 0.662613981762918, + "grad_norm": 2.8692073822021484, + "learning_rate": 4.6130613158264605e-06, + "loss": 0.5418164134025574, + "mean_token_accuracy": 0.8298909664154053, + "num_tokens": 7949303.0, + "step": 872 + }, + { + "epoch": 0.6633738601823708, + "grad_norm": 3.960404396057129, + "learning_rate": 4.611941305911602e-06, + "loss": 0.6284480094909668, + "mean_token_accuracy": 0.837495744228363, + "num_tokens": 7952486.0, + "step": 873 + }, + { + "epoch": 0.6641337386018237, + "grad_norm": 2.6690115928649902, + "learning_rate": 4.610819813755038e-06, + "loss": 0.5214360952377319, + "mean_token_accuracy": 0.8213508129119873, + "num_tokens": 7957559.0, + "step": 874 + }, + { + "epoch": 0.6648936170212766, + "grad_norm": 2.3376171588897705, + "learning_rate": 4.609696840143875e-06, + "loss": 0.46887528896331787, + "mean_token_accuracy": 0.8438819646835327, + "num_tokens": 7962826.0, + "step": 875 + }, + { + "epoch": 0.6656534954407295, + "grad_norm": 2.2222683429718018, + "learning_rate": 4.6085723858662575e-06, + "loss": 0.5607719421386719, + "mean_token_accuracy": 0.8128405809402466, + "num_tokens": 7970131.0, + "step": 876 + }, + { + "epoch": 0.6664133738601824, + "grad_norm": 2.069091558456421, + "learning_rate": 4.607446451711372e-06, + "loss": 0.506301760673523, + "mean_token_accuracy": 0.8256827592849731, + "num_tokens": 7977524.0, + "step": 877 + }, + { + "epoch": 0.6671732522796353, + "grad_norm": 1.3724967241287231, + "learning_rate": 4.606319038469443e-06, + "loss": 0.43285101652145386, + "mean_token_accuracy": 0.8525032997131348, + "num_tokens": 7989174.0, + "step": 878 + }, + { + "epoch": 0.6679331306990881, + "grad_norm": 2.278205156326294, + "learning_rate": 4.605190146931731e-06, + "loss": 0.4845905303955078, + "mean_token_accuracy": 0.8284652829170227, + "num_tokens": 7998524.0, + "step": 879 + }, + { + "epoch": 0.668693009118541, + "grad_norm": 1.3871766328811646, + "learning_rate": 4.604059777890537e-06, + "loss": 0.5736679434776306, + "mean_token_accuracy": 0.8223285675048828, + "num_tokens": 8015776.0, + "step": 880 + }, + { + "epoch": 0.6694528875379939, + "grad_norm": 1.926164984703064, + "learning_rate": 4.602927932139197e-06, + "loss": 0.4133230447769165, + "mean_token_accuracy": 0.8653768301010132, + "num_tokens": 8022979.0, + "step": 881 + }, + { + "epoch": 0.6702127659574468, + "grad_norm": 2.109272003173828, + "learning_rate": 4.601794610472083e-06, + "loss": 0.7005600929260254, + "mean_token_accuracy": 0.7777010202407837, + "num_tokens": 8032618.0, + "step": 882 + }, + { + "epoch": 0.6709726443768997, + "grad_norm": 2.077977418899536, + "learning_rate": 4.6006598136846056e-06, + "loss": 0.5278208255767822, + "mean_token_accuracy": 0.8230358958244324, + "num_tokens": 8040534.0, + "step": 883 + }, + { + "epoch": 0.6717325227963525, + "grad_norm": 1.678581714630127, + "learning_rate": 4.599523542573207e-06, + "loss": 0.4955351650714874, + "mean_token_accuracy": 0.8270003795623779, + "num_tokens": 8052249.0, + "step": 884 + }, + { + "epoch": 0.6724924012158054, + "grad_norm": 2.0751662254333496, + "learning_rate": 4.598385797935368e-06, + "loss": 0.5266247987747192, + "mean_token_accuracy": 0.8263581991195679, + "num_tokens": 8060600.0, + "step": 885 + }, + { + "epoch": 0.6732522796352584, + "grad_norm": 2.418405771255493, + "learning_rate": 4.5972465805696e-06, + "loss": 0.4481425881385803, + "mean_token_accuracy": 0.846164345741272, + "num_tokens": 8066025.0, + "step": 886 + }, + { + "epoch": 0.6740121580547113, + "grad_norm": 2.3936474323272705, + "learning_rate": 4.596105891275449e-06, + "loss": 0.4553404450416565, + "mean_token_accuracy": 0.8412896394729614, + "num_tokens": 8071544.0, + "step": 887 + }, + { + "epoch": 0.6747720364741642, + "grad_norm": 2.2024407386779785, + "learning_rate": 4.594963730853497e-06, + "loss": 0.6218541860580444, + "mean_token_accuracy": 0.7890232801437378, + "num_tokens": 8079061.0, + "step": 888 + }, + { + "epoch": 0.675531914893617, + "grad_norm": 2.51015567779541, + "learning_rate": 4.593820100105355e-06, + "loss": 0.5149124264717102, + "mean_token_accuracy": 0.8241918087005615, + "num_tokens": 8084293.0, + "step": 889 + }, + { + "epoch": 0.6762917933130699, + "grad_norm": 1.8748939037322998, + "learning_rate": 4.5926749998336665e-06, + "loss": 0.50836181640625, + "mean_token_accuracy": 0.8067223429679871, + "num_tokens": 8092511.0, + "step": 890 + }, + { + "epoch": 0.6770516717325228, + "grad_norm": 1.801193118095398, + "learning_rate": 4.5915284308421075e-06, + "loss": 0.4372861683368683, + "mean_token_accuracy": 0.8510604500770569, + "num_tokens": 8101174.0, + "step": 891 + }, + { + "epoch": 0.6778115501519757, + "grad_norm": 2.6476457118988037, + "learning_rate": 4.590380393935383e-06, + "loss": 0.38700711727142334, + "mean_token_accuracy": 0.8659796714782715, + "num_tokens": 8105398.0, + "step": 892 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 1.1147183179855347, + "learning_rate": 4.589230889919232e-06, + "loss": 0.38546115159988403, + "mean_token_accuracy": 0.8570581674575806, + "num_tokens": 8127394.0, + "step": 893 + }, + { + "epoch": 0.6793313069908815, + "grad_norm": 2.908905506134033, + "learning_rate": 4.588079919600419e-06, + "loss": 0.5108504295349121, + "mean_token_accuracy": 0.8121406435966492, + "num_tokens": 8131801.0, + "step": 894 + }, + { + "epoch": 0.6800911854103343, + "grad_norm": 3.1522326469421387, + "learning_rate": 4.586927483786739e-06, + "loss": 0.44059112668037415, + "mean_token_accuracy": 0.8448011875152588, + "num_tokens": 8154416.0, + "step": 895 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 1.5142440795898438, + "learning_rate": 4.585773583287017e-06, + "loss": 0.513217568397522, + "mean_token_accuracy": 0.8386049270629883, + "num_tokens": 8171156.0, + "step": 896 + }, + { + "epoch": 0.6816109422492401, + "grad_norm": 2.597881317138672, + "learning_rate": 4.584618218911104e-06, + "loss": 0.4937712550163269, + "mean_token_accuracy": 0.8223681449890137, + "num_tokens": 8176124.0, + "step": 897 + }, + { + "epoch": 0.682370820668693, + "grad_norm": 1.8185619115829468, + "learning_rate": 4.583461391469879e-06, + "loss": 0.519811749458313, + "mean_token_accuracy": 0.8169777393341064, + "num_tokens": 8185136.0, + "step": 898 + }, + { + "epoch": 0.6831306990881459, + "grad_norm": 3.2061994075775146, + "learning_rate": 4.582303101775249e-06, + "loss": 0.4655115008354187, + "mean_token_accuracy": 0.8425977230072021, + "num_tokens": 8188864.0, + "step": 899 + }, + { + "epoch": 0.6838905775075987, + "grad_norm": 1.3485229015350342, + "learning_rate": 4.581143350640146e-06, + "loss": 0.5014470815658569, + "mean_token_accuracy": 0.8273109197616577, + "num_tokens": 8203460.0, + "step": 900 + }, + { + "epoch": 0.6846504559270516, + "grad_norm": 1.3264713287353516, + "learning_rate": 4.579982138878527e-06, + "loss": 0.5073703527450562, + "mean_token_accuracy": 0.8259357213973999, + "num_tokens": 8219348.0, + "step": 901 + }, + { + "epoch": 0.6854103343465046, + "grad_norm": 2.4436347484588623, + "learning_rate": 4.578819467305375e-06, + "loss": 0.47020310163497925, + "mean_token_accuracy": 0.8567265272140503, + "num_tokens": 8224427.0, + "step": 902 + }, + { + "epoch": 0.6861702127659575, + "grad_norm": 1.921749234199524, + "learning_rate": 4.5776553367367e-06, + "loss": 0.622514009475708, + "mean_token_accuracy": 0.7863982319831848, + "num_tokens": 8233151.0, + "step": 903 + }, + { + "epoch": 0.6869300911854104, + "grad_norm": 1.8815616369247437, + "learning_rate": 4.576489747989532e-06, + "loss": 0.4910545349121094, + "mean_token_accuracy": 0.8147122859954834, + "num_tokens": 8240762.0, + "step": 904 + }, + { + "epoch": 0.6876899696048632, + "grad_norm": 1.2366989850997925, + "learning_rate": 4.575322701881926e-06, + "loss": 0.3947566747665405, + "mean_token_accuracy": 0.873993992805481, + "num_tokens": 8259381.0, + "step": 905 + }, + { + "epoch": 0.6884498480243161, + "grad_norm": 1.5767735242843628, + "learning_rate": 4.57415419923296e-06, + "loss": 0.57136070728302, + "mean_token_accuracy": 0.8028088808059692, + "num_tokens": 8273296.0, + "step": 906 + }, + { + "epoch": 0.689209726443769, + "grad_norm": 2.378675699234009, + "learning_rate": 4.572984240862733e-06, + "loss": 0.5894849896430969, + "mean_token_accuracy": 0.7977708578109741, + "num_tokens": 8280083.0, + "step": 907 + }, + { + "epoch": 0.6899696048632219, + "grad_norm": 2.0401132106781006, + "learning_rate": 4.57181282759237e-06, + "loss": 0.5524613261222839, + "mean_token_accuracy": 0.8138598203659058, + "num_tokens": 8288236.0, + "step": 908 + }, + { + "epoch": 0.6907294832826748, + "grad_norm": 2.293701648712158, + "learning_rate": 4.570639960244011e-06, + "loss": 0.5154546499252319, + "mean_token_accuracy": 0.8234660625457764, + "num_tokens": 8294493.0, + "step": 909 + }, + { + "epoch": 0.6914893617021277, + "grad_norm": 1.9286527633666992, + "learning_rate": 4.56946563964082e-06, + "loss": 0.5364264845848083, + "mean_token_accuracy": 0.8147368431091309, + "num_tokens": 8303441.0, + "step": 910 + }, + { + "epoch": 0.6922492401215805, + "grad_norm": 1.2571251392364502, + "learning_rate": 4.5682898666069815e-06, + "loss": 0.43535223603248596, + "mean_token_accuracy": 0.859239935874939, + "num_tokens": 8321548.0, + "step": 911 + }, + { + "epoch": 0.6930091185410334, + "grad_norm": 1.2224860191345215, + "learning_rate": 4.567112641967697e-06, + "loss": 0.40205076336860657, + "mean_token_accuracy": 0.8724711537361145, + "num_tokens": 8335205.0, + "step": 912 + }, + { + "epoch": 0.6937689969604863, + "grad_norm": 1.2064491510391235, + "learning_rate": 4.5659339665491894e-06, + "loss": 0.37790587544441223, + "mean_token_accuracy": 0.8464339971542358, + "num_tokens": 8350926.0, + "step": 913 + }, + { + "epoch": 0.6945288753799392, + "grad_norm": 2.1755270957946777, + "learning_rate": 4.5647538411786965e-06, + "loss": 0.42034298181533813, + "mean_token_accuracy": 0.84148108959198, + "num_tokens": 8356739.0, + "step": 914 + }, + { + "epoch": 0.6952887537993921, + "grad_norm": 1.234864592552185, + "learning_rate": 4.563572266684478e-06, + "loss": 0.5062938332557678, + "mean_token_accuracy": 0.8132052421569824, + "num_tokens": 8373660.0, + "step": 915 + }, + { + "epoch": 0.6960486322188449, + "grad_norm": 2.4250621795654297, + "learning_rate": 4.562389243895807e-06, + "loss": 0.4907791018486023, + "mean_token_accuracy": 0.8337979912757874, + "num_tokens": 8378661.0, + "step": 916 + }, + { + "epoch": 0.6968085106382979, + "grad_norm": 1.5018314123153687, + "learning_rate": 4.561204773642974e-06, + "loss": 0.41041281819343567, + "mean_token_accuracy": 0.8569784164428711, + "num_tokens": 8390322.0, + "step": 917 + }, + { + "epoch": 0.6975683890577508, + "grad_norm": 2.797269344329834, + "learning_rate": 4.5600188567572874e-06, + "loss": 0.3146931529045105, + "mean_token_accuracy": 0.8913302421569824, + "num_tokens": 8393567.0, + "step": 918 + }, + { + "epoch": 0.6983282674772037, + "grad_norm": 1.4002827405929565, + "learning_rate": 4.558831494071069e-06, + "loss": 0.4275597333908081, + "mean_token_accuracy": 0.8504893779754639, + "num_tokens": 8407119.0, + "step": 919 + }, + { + "epoch": 0.6990881458966566, + "grad_norm": 1.7045831680297852, + "learning_rate": 4.557642686417654e-06, + "loss": 0.49593430757522583, + "mean_token_accuracy": 0.8185091018676758, + "num_tokens": 8417408.0, + "step": 920 + }, + { + "epoch": 0.6998480243161094, + "grad_norm": 2.8818066120147705, + "learning_rate": 4.556452434631396e-06, + "loss": 0.637908935546875, + "mean_token_accuracy": 0.7883946895599365, + "num_tokens": 8422319.0, + "step": 921 + }, + { + "epoch": 0.7006079027355623, + "grad_norm": 2.3587265014648438, + "learning_rate": 4.555260739547657e-06, + "loss": 0.38749319314956665, + "mean_token_accuracy": 0.8774704933166504, + "num_tokens": 8427315.0, + "step": 922 + }, + { + "epoch": 0.7013677811550152, + "grad_norm": 1.6648749113082886, + "learning_rate": 4.554067602002815e-06, + "loss": 0.4044865369796753, + "mean_token_accuracy": 0.8524141311645508, + "num_tokens": 8438662.0, + "step": 923 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 3.467787742614746, + "learning_rate": 4.55287302283426e-06, + "loss": 0.591016411781311, + "mean_token_accuracy": 0.81184983253479, + "num_tokens": 8442237.0, + "step": 924 + }, + { + "epoch": 0.702887537993921, + "grad_norm": 2.1458635330200195, + "learning_rate": 4.551677002880395e-06, + "loss": 0.5017476677894592, + "mean_token_accuracy": 0.822914183139801, + "num_tokens": 8449494.0, + "step": 925 + }, + { + "epoch": 0.7036474164133738, + "grad_norm": 2.521714448928833, + "learning_rate": 4.550479542980632e-06, + "loss": 0.531912088394165, + "mean_token_accuracy": 0.8225687742233276, + "num_tokens": 8454983.0, + "step": 926 + }, + { + "epoch": 0.7044072948328267, + "grad_norm": 3.5248100757598877, + "learning_rate": 4.549280643975394e-06, + "loss": 0.4631815254688263, + "mean_token_accuracy": 0.8443771600723267, + "num_tokens": 8458504.0, + "step": 927 + }, + { + "epoch": 0.7051671732522796, + "grad_norm": 2.5105819702148438, + "learning_rate": 4.548080306706114e-06, + "loss": 0.30487123131752014, + "mean_token_accuracy": 0.9018767476081848, + "num_tokens": 8462589.0, + "step": 928 + }, + { + "epoch": 0.7059270516717325, + "grad_norm": 1.3367713689804077, + "learning_rate": 4.5468785320152365e-06, + "loss": 0.4355026185512543, + "mean_token_accuracy": 0.8323584794998169, + "num_tokens": 8478450.0, + "step": 929 + }, + { + "epoch": 0.7066869300911854, + "grad_norm": 2.2506282329559326, + "learning_rate": 4.545675320746212e-06, + "loss": 0.5082957744598389, + "mean_token_accuracy": 0.823430597782135, + "num_tokens": 8485991.0, + "step": 930 + }, + { + "epoch": 0.7074468085106383, + "grad_norm": 1.7164632081985474, + "learning_rate": 4.544470673743502e-06, + "loss": 0.3960164785385132, + "mean_token_accuracy": 0.8592486381530762, + "num_tokens": 8495217.0, + "step": 931 + }, + { + "epoch": 0.7082066869300911, + "grad_norm": 1.5864969491958618, + "learning_rate": 4.543264591852572e-06, + "loss": 0.49114471673965454, + "mean_token_accuracy": 0.8330780267715454, + "num_tokens": 8508904.0, + "step": 932 + }, + { + "epoch": 0.708966565349544, + "grad_norm": 2.1707003116607666, + "learning_rate": 4.542057075919898e-06, + "loss": 0.49895772337913513, + "mean_token_accuracy": 0.8327431082725525, + "num_tokens": 8515792.0, + "step": 933 + }, + { + "epoch": 0.709726443768997, + "grad_norm": 1.9002083539962769, + "learning_rate": 4.54084812679296e-06, + "loss": 0.4548531472682953, + "mean_token_accuracy": 0.834532618522644, + "num_tokens": 8524006.0, + "step": 934 + }, + { + "epoch": 0.7104863221884499, + "grad_norm": 1.8505141735076904, + "learning_rate": 4.539637745320247e-06, + "loss": 0.35716521739959717, + "mean_token_accuracy": 0.872222900390625, + "num_tokens": 8533647.0, + "step": 935 + }, + { + "epoch": 0.7112462006079028, + "grad_norm": 2.092620849609375, + "learning_rate": 4.53842593235125e-06, + "loss": 0.4673694372177124, + "mean_token_accuracy": 0.8460999131202698, + "num_tokens": 8540734.0, + "step": 936 + }, + { + "epoch": 0.7120060790273556, + "grad_norm": 2.689514636993408, + "learning_rate": 4.537212688736466e-06, + "loss": 0.45461273193359375, + "mean_token_accuracy": 0.8450704216957092, + "num_tokens": 8544948.0, + "step": 937 + }, + { + "epoch": 0.7127659574468085, + "grad_norm": 2.4507734775543213, + "learning_rate": 4.535998015327396e-06, + "loss": 0.4571906626224518, + "mean_token_accuracy": 0.8429360389709473, + "num_tokens": 8550445.0, + "step": 938 + }, + { + "epoch": 0.7135258358662614, + "grad_norm": 1.8960013389587402, + "learning_rate": 4.534781912976546e-06, + "loss": 0.4461391568183899, + "mean_token_accuracy": 0.8487973213195801, + "num_tokens": 8557630.0, + "step": 939 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.602611780166626, + "learning_rate": 4.533564382537421e-06, + "loss": 0.5277102589607239, + "mean_token_accuracy": 0.8330916166305542, + "num_tokens": 8570397.0, + "step": 940 + }, + { + "epoch": 0.7150455927051672, + "grad_norm": 1.8936395645141602, + "learning_rate": 4.532345424864533e-06, + "loss": 0.38619571924209595, + "mean_token_accuracy": 0.8514572381973267, + "num_tokens": 8582673.0, + "step": 941 + }, + { + "epoch": 0.71580547112462, + "grad_norm": 1.3898619413375854, + "learning_rate": 4.531125040813392e-06, + "loss": 0.4825032949447632, + "mean_token_accuracy": 0.833012580871582, + "num_tokens": 8597239.0, + "step": 942 + }, + { + "epoch": 0.7165653495440729, + "grad_norm": 2.128230571746826, + "learning_rate": 4.529903231240511e-06, + "loss": 0.4862118065357208, + "mean_token_accuracy": 0.8210917711257935, + "num_tokens": 8605877.0, + "step": 943 + }, + { + "epoch": 0.7173252279635258, + "grad_norm": 1.6552259922027588, + "learning_rate": 4.528679997003403e-06, + "loss": 0.5092059373855591, + "mean_token_accuracy": 0.8247389793395996, + "num_tokens": 8617060.0, + "step": 944 + }, + { + "epoch": 0.7180851063829787, + "grad_norm": 2.1174771785736084, + "learning_rate": 4.52745533896058e-06, + "loss": 0.39110174775123596, + "mean_token_accuracy": 0.8672944903373718, + "num_tokens": 8623306.0, + "step": 945 + }, + { + "epoch": 0.7188449848024316, + "grad_norm": 2.8648383617401123, + "learning_rate": 4.526229257971556e-06, + "loss": 0.49864327907562256, + "mean_token_accuracy": 0.8305130004882812, + "num_tokens": 8627466.0, + "step": 946 + }, + { + "epoch": 0.7196048632218845, + "grad_norm": 2.155514717102051, + "learning_rate": 4.52500175489684e-06, + "loss": 0.5070191025733948, + "mean_token_accuracy": 0.8311188817024231, + "num_tokens": 8634759.0, + "step": 947 + }, + { + "epoch": 0.7203647416413373, + "grad_norm": 1.8432683944702148, + "learning_rate": 4.523772830597942e-06, + "loss": 0.5569252371788025, + "mean_token_accuracy": 0.8070821762084961, + "num_tokens": 8644160.0, + "step": 948 + }, + { + "epoch": 0.7211246200607903, + "grad_norm": 2.8912241458892822, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4799427390098572, + "mean_token_accuracy": 0.8443552851676941, + "num_tokens": 8648377.0, + "step": 949 + }, + { + "epoch": 0.7218844984802432, + "grad_norm": 3.3449625968933105, + "learning_rate": 4.521310721778622e-06, + "loss": 0.44043463468551636, + "mean_token_accuracy": 0.8521315455436707, + "num_tokens": 8651846.0, + "step": 950 + }, + { + "epoch": 0.7226443768996961, + "grad_norm": 1.4127917289733887, + "learning_rate": 4.520077538986203e-06, + "loss": 0.4700999855995178, + "mean_token_accuracy": 0.8377952575683594, + "num_tokens": 8665199.0, + "step": 951 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.1607301235198975, + "learning_rate": 4.518842938425606e-06, + "loss": 0.4374256730079651, + "mean_token_accuracy": 0.8448896408081055, + "num_tokens": 8672158.0, + "step": 952 + }, + { + "epoch": 0.7241641337386018, + "grad_norm": 1.3442779779434204, + "learning_rate": 4.51760692096332e-06, + "loss": 0.38948923349380493, + "mean_token_accuracy": 0.8598923683166504, + "num_tokens": 8684532.0, + "step": 953 + }, + { + "epoch": 0.7249240121580547, + "grad_norm": 2.0003178119659424, + "learning_rate": 4.516369487466832e-06, + "loss": 0.3797217011451721, + "mean_token_accuracy": 0.8652102947235107, + "num_tokens": 8691460.0, + "step": 954 + }, + { + "epoch": 0.7256838905775076, + "grad_norm": 1.8196535110473633, + "learning_rate": 4.5151306388046175e-06, + "loss": 0.5676811933517456, + "mean_token_accuracy": 0.818500816822052, + "num_tokens": 8701624.0, + "step": 955 + }, + { + "epoch": 0.7264437689969605, + "grad_norm": 2.1962296962738037, + "learning_rate": 4.513890375846152e-06, + "loss": 0.45399484038352966, + "mean_token_accuracy": 0.8463879227638245, + "num_tokens": 8707410.0, + "step": 956 + }, + { + "epoch": 0.7272036474164134, + "grad_norm": 1.8798872232437134, + "learning_rate": 4.512648699461897e-06, + "loss": 0.5679811239242554, + "mean_token_accuracy": 0.8089900016784668, + "num_tokens": 8715630.0, + "step": 957 + }, + { + "epoch": 0.7279635258358662, + "grad_norm": 2.3540258407592773, + "learning_rate": 4.511405610523309e-06, + "loss": 0.5282865762710571, + "mean_token_accuracy": 0.8196114301681519, + "num_tokens": 8721934.0, + "step": 958 + }, + { + "epoch": 0.7287234042553191, + "grad_norm": 2.5630908012390137, + "learning_rate": 4.510161109902837e-06, + "loss": 0.39442378282546997, + "mean_token_accuracy": 0.8400980830192566, + "num_tokens": 8726511.0, + "step": 959 + }, + { + "epoch": 0.729483282674772, + "grad_norm": 1.9829226732254028, + "learning_rate": 4.508915198473919e-06, + "loss": 0.4611976742744446, + "mean_token_accuracy": 0.8439624309539795, + "num_tokens": 8733460.0, + "step": 960 + }, + { + "epoch": 0.7302431610942249, + "grad_norm": 3.0291950702667236, + "learning_rate": 4.507667877110982e-06, + "loss": 0.5158340930938721, + "mean_token_accuracy": 0.8300060033798218, + "num_tokens": 8737629.0, + "step": 961 + }, + { + "epoch": 0.7310030395136778, + "grad_norm": 1.9208252429962158, + "learning_rate": 4.506419146689445e-06, + "loss": 0.3807099163532257, + "mean_token_accuracy": 0.871469259262085, + "num_tokens": 8744615.0, + "step": 962 + }, + { + "epoch": 0.7317629179331308, + "grad_norm": 3.051565408706665, + "learning_rate": 4.505169008085717e-06, + "loss": 0.38461726903915405, + "mean_token_accuracy": 0.874465823173523, + "num_tokens": 8748154.0, + "step": 963 + }, + { + "epoch": 0.7325227963525835, + "grad_norm": 1.375466227531433, + "learning_rate": 4.503917462177192e-06, + "loss": 0.42490679025650024, + "mean_token_accuracy": 0.8457326889038086, + "num_tokens": 8760965.0, + "step": 964 + }, + { + "epoch": 0.7332826747720365, + "grad_norm": 2.216681957244873, + "learning_rate": 4.5026645098422515e-06, + "loss": 0.43149900436401367, + "mean_token_accuracy": 0.8527278900146484, + "num_tokens": 8766996.0, + "step": 965 + }, + { + "epoch": 0.7340425531914894, + "grad_norm": 1.9422595500946045, + "learning_rate": 4.5014101519602684e-06, + "loss": 0.4964504539966583, + "mean_token_accuracy": 0.8137556314468384, + "num_tokens": 8774411.0, + "step": 966 + }, + { + "epoch": 0.7348024316109423, + "grad_norm": 2.058887004852295, + "learning_rate": 4.500154389411598e-06, + "loss": 0.4977570176124573, + "mean_token_accuracy": 0.8254626989364624, + "num_tokens": 8782220.0, + "step": 967 + }, + { + "epoch": 0.7355623100303952, + "grad_norm": 2.9977786540985107, + "learning_rate": 4.498897223077582e-06, + "loss": 0.4061415195465088, + "mean_token_accuracy": 0.8752427101135254, + "num_tokens": 8786120.0, + "step": 968 + }, + { + "epoch": 0.736322188449848, + "grad_norm": 2.2636303901672363, + "learning_rate": 4.49763865384055e-06, + "loss": 0.5062161087989807, + "mean_token_accuracy": 0.8171653747558594, + "num_tokens": 8792459.0, + "step": 969 + }, + { + "epoch": 0.7370820668693009, + "grad_norm": 1.8850842714309692, + "learning_rate": 4.496378682583813e-06, + "loss": 0.5014280676841736, + "mean_token_accuracy": 0.8547511100769043, + "num_tokens": 8800675.0, + "step": 970 + }, + { + "epoch": 0.7378419452887538, + "grad_norm": 1.191985011100769, + "learning_rate": 4.495117310191667e-06, + "loss": 0.4713883101940155, + "mean_token_accuracy": 0.8213596343994141, + "num_tokens": 8820740.0, + "step": 971 + }, + { + "epoch": 0.7386018237082067, + "grad_norm": 1.823000192642212, + "learning_rate": 4.493854537549393e-06, + "loss": 0.46332645416259766, + "mean_token_accuracy": 0.8359860777854919, + "num_tokens": 8828884.0, + "step": 972 + }, + { + "epoch": 0.7393617021276596, + "grad_norm": 2.590446949005127, + "learning_rate": 4.492590365543253e-06, + "loss": 0.49074703454971313, + "mean_token_accuracy": 0.8433758020401001, + "num_tokens": 8833859.0, + "step": 973 + }, + { + "epoch": 0.7401215805471124, + "grad_norm": 2.2762670516967773, + "learning_rate": 4.491324795060491e-06, + "loss": 0.39465656876564026, + "mean_token_accuracy": 0.8734766244888306, + "num_tokens": 8839350.0, + "step": 974 + }, + { + "epoch": 0.7408814589665653, + "grad_norm": 2.698725461959839, + "learning_rate": 4.490057826989333e-06, + "loss": 0.5552085041999817, + "mean_token_accuracy": 0.8132266998291016, + "num_tokens": 8844373.0, + "step": 975 + }, + { + "epoch": 0.7416413373860182, + "grad_norm": 2.704606294631958, + "learning_rate": 4.488789462218988e-06, + "loss": 0.3447791635990143, + "mean_token_accuracy": 0.8736170530319214, + "num_tokens": 8848236.0, + "step": 976 + }, + { + "epoch": 0.7424012158054711, + "grad_norm": 3.1260716915130615, + "learning_rate": 4.487519701639641e-06, + "loss": 0.5945233702659607, + "mean_token_accuracy": 0.7997599840164185, + "num_tokens": 8852935.0, + "step": 977 + }, + { + "epoch": 0.743161094224924, + "grad_norm": 1.6895452737808228, + "learning_rate": 4.486248546142459e-06, + "loss": 0.4823892116546631, + "mean_token_accuracy": 0.8279662132263184, + "num_tokens": 8861743.0, + "step": 978 + }, + { + "epoch": 0.743920972644377, + "grad_norm": 1.9161452054977417, + "learning_rate": 4.4849759966195885e-06, + "loss": 0.5266581773757935, + "mean_token_accuracy": 0.8218623399734497, + "num_tokens": 8870601.0, + "step": 979 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 1.6894301176071167, + "learning_rate": 4.483702053964154e-06, + "loss": 0.4186219573020935, + "mean_token_accuracy": 0.8471781015396118, + "num_tokens": 8885617.0, + "step": 980 + }, + { + "epoch": 0.7454407294832827, + "grad_norm": 1.6319992542266846, + "learning_rate": 4.482426719070258e-06, + "loss": 0.541317880153656, + "mean_token_accuracy": 0.8216162323951721, + "num_tokens": 8897595.0, + "step": 981 + }, + { + "epoch": 0.7462006079027356, + "grad_norm": 5.102413177490234, + "learning_rate": 4.4811499928329775e-06, + "loss": 0.3928517699241638, + "mean_token_accuracy": 0.858033299446106, + "num_tokens": 8901682.0, + "step": 982 + }, + { + "epoch": 0.7469604863221885, + "grad_norm": 2.213860273361206, + "learning_rate": 4.479871876148368e-06, + "loss": 0.4276347756385803, + "mean_token_accuracy": 0.8529798984527588, + "num_tokens": 8908088.0, + "step": 983 + }, + { + "epoch": 0.7477203647416414, + "grad_norm": 1.2180038690567017, + "learning_rate": 4.478592369913464e-06, + "loss": 0.3941590189933777, + "mean_token_accuracy": 0.8608149290084839, + "num_tokens": 8925876.0, + "step": 984 + }, + { + "epoch": 0.7484802431610942, + "grad_norm": 2.849802255630493, + "learning_rate": 4.477311475026271e-06, + "loss": 0.42190325260162354, + "mean_token_accuracy": 0.860505223274231, + "num_tokens": 8930190.0, + "step": 985 + }, + { + "epoch": 0.7492401215805471, + "grad_norm": 1.704128384590149, + "learning_rate": 4.476029192385769e-06, + "loss": 0.4786282777786255, + "mean_token_accuracy": 0.8302322626113892, + "num_tokens": 8938340.0, + "step": 986 + }, + { + "epoch": 0.75, + "grad_norm": 2.06322979927063, + "learning_rate": 4.474745522891915e-06, + "loss": 0.4648786187171936, + "mean_token_accuracy": 0.8366481065750122, + "num_tokens": 8944633.0, + "step": 987 + }, + { + "epoch": 0.7507598784194529, + "grad_norm": 2.0745396614074707, + "learning_rate": 4.473460467445637e-06, + "loss": 0.5744885206222534, + "mean_token_accuracy": 0.8357284069061279, + "num_tokens": 8954457.0, + "step": 988 + }, + { + "epoch": 0.7515197568389058, + "grad_norm": 1.9281407594680786, + "learning_rate": 4.472174026948836e-06, + "loss": 0.528974175453186, + "mean_token_accuracy": 0.8083580732345581, + "num_tokens": 8962701.0, + "step": 989 + }, + { + "epoch": 0.7522796352583586, + "grad_norm": 3.012381076812744, + "learning_rate": 4.470886202304385e-06, + "loss": 0.48754751682281494, + "mean_token_accuracy": 0.8368391990661621, + "num_tokens": 8967272.0, + "step": 990 + }, + { + "epoch": 0.7530395136778115, + "grad_norm": 1.691826581954956, + "learning_rate": 4.469596994416131e-06, + "loss": 0.484740674495697, + "mean_token_accuracy": 0.8500643968582153, + "num_tokens": 8976615.0, + "step": 991 + }, + { + "epoch": 0.7537993920972644, + "grad_norm": 2.4961965084075928, + "learning_rate": 4.468306404188887e-06, + "loss": 0.50777268409729, + "mean_token_accuracy": 0.8168395757675171, + "num_tokens": 8983235.0, + "step": 992 + }, + { + "epoch": 0.7545592705167173, + "grad_norm": 1.512007713317871, + "learning_rate": 4.467014432528441e-06, + "loss": 0.4583340287208557, + "mean_token_accuracy": 0.8465162515640259, + "num_tokens": 8993815.0, + "step": 993 + }, + { + "epoch": 0.7553191489361702, + "grad_norm": 1.9362257719039917, + "learning_rate": 4.465721080341547e-06, + "loss": 0.6027892827987671, + "mean_token_accuracy": 0.8052380084991455, + "num_tokens": 9002697.0, + "step": 994 + }, + { + "epoch": 0.756079027355623, + "grad_norm": 2.473632335662842, + "learning_rate": 4.4644263485359316e-06, + "loss": 0.5394320487976074, + "mean_token_accuracy": 0.834665834903717, + "num_tokens": 9007428.0, + "step": 995 + }, + { + "epoch": 0.756838905775076, + "grad_norm": 2.2527434825897217, + "learning_rate": 4.463130238020284e-06, + "loss": 0.5485198497772217, + "mean_token_accuracy": 0.8090173006057739, + "num_tokens": 9013570.0, + "step": 996 + }, + { + "epoch": 0.7575987841945289, + "grad_norm": 1.4130940437316895, + "learning_rate": 4.4618327497042676e-06, + "loss": 0.37994423508644104, + "mean_token_accuracy": 0.8625167012214661, + "num_tokens": 9025485.0, + "step": 997 + }, + { + "epoch": 0.7583586626139818, + "grad_norm": 2.685115098953247, + "learning_rate": 4.460533884498509e-06, + "loss": 0.447973370552063, + "mean_token_accuracy": 0.8564165234565735, + "num_tokens": 9030355.0, + "step": 998 + }, + { + "epoch": 0.7591185410334347, + "grad_norm": 3.2743139266967773, + "learning_rate": 4.4592336433146e-06, + "loss": 0.45275989174842834, + "mean_token_accuracy": 0.8462578058242798, + "num_tokens": 9034406.0, + "step": 999 + }, + { + "epoch": 0.7598784194528876, + "grad_norm": 1.9383049011230469, + "learning_rate": 4.457932027065102e-06, + "loss": 0.5387729406356812, + "mean_token_accuracy": 0.8357330560684204, + "num_tokens": 9041502.0, + "step": 1000 + }, + { + "epoch": 0.7606382978723404, + "grad_norm": 2.7348275184631348, + "learning_rate": 4.456629036663537e-06, + "loss": 0.4448447823524475, + "mean_token_accuracy": 0.8453642129898071, + "num_tokens": 9046088.0, + "step": 1001 + }, + { + "epoch": 0.7613981762917933, + "grad_norm": 1.8477401733398438, + "learning_rate": 4.455324673024396e-06, + "loss": 0.5766505002975464, + "mean_token_accuracy": 0.8074213862419128, + "num_tokens": 9055678.0, + "step": 1002 + }, + { + "epoch": 0.7621580547112462, + "grad_norm": 3.134481430053711, + "learning_rate": 4.4540189370631315e-06, + "loss": 0.5690872669219971, + "mean_token_accuracy": 0.8414670825004578, + "num_tokens": 9062006.0, + "step": 1003 + }, + { + "epoch": 0.7629179331306991, + "grad_norm": 1.7933398485183716, + "learning_rate": 4.452711829696158e-06, + "loss": 0.4898291826248169, + "mean_token_accuracy": 0.8259007930755615, + "num_tokens": 9070754.0, + "step": 1004 + }, + { + "epoch": 0.763677811550152, + "grad_norm": 1.2552275657653809, + "learning_rate": 4.451403351840855e-06, + "loss": 0.4280198812484741, + "mean_token_accuracy": 0.8409112691879272, + "num_tokens": 9085306.0, + "step": 1005 + }, + { + "epoch": 0.7644376899696048, + "grad_norm": 1.6749331951141357, + "learning_rate": 4.450093504415562e-06, + "loss": 0.3723178505897522, + "mean_token_accuracy": 0.8545734882354736, + "num_tokens": 9102453.0, + "step": 1006 + }, + { + "epoch": 0.7651975683890577, + "grad_norm": 2.7514500617980957, + "learning_rate": 4.44878228833958e-06, + "loss": 0.5463190674781799, + "mean_token_accuracy": 0.8121639490127563, + "num_tokens": 9108342.0, + "step": 1007 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 1.3322733640670776, + "learning_rate": 4.447469704533172e-06, + "loss": 0.573723316192627, + "mean_token_accuracy": 0.8065711259841919, + "num_tokens": 9123712.0, + "step": 1008 + }, + { + "epoch": 0.7667173252279635, + "grad_norm": 2.6893765926361084, + "learning_rate": 4.446155753917559e-06, + "loss": 0.6856257915496826, + "mean_token_accuracy": 0.7718256711959839, + "num_tokens": 9130728.0, + "step": 1009 + }, + { + "epoch": 0.7674772036474165, + "grad_norm": 1.792765498161316, + "learning_rate": 4.444840437414923e-06, + "loss": 0.48203110694885254, + "mean_token_accuracy": 0.8419194221496582, + "num_tokens": 9137983.0, + "step": 1010 + }, + { + "epoch": 0.7682370820668692, + "grad_norm": 1.4957399368286133, + "learning_rate": 4.443523755948401e-06, + "loss": 0.4372181296348572, + "mean_token_accuracy": 0.8491764664649963, + "num_tokens": 9148081.0, + "step": 1011 + }, + { + "epoch": 0.7689969604863222, + "grad_norm": 1.7294867038726807, + "learning_rate": 4.442205710442095e-06, + "loss": 0.54277503490448, + "mean_token_accuracy": 0.8196806907653809, + "num_tokens": 9158407.0, + "step": 1012 + }, + { + "epoch": 0.7697568389057751, + "grad_norm": 2.2091221809387207, + "learning_rate": 4.4408863018210564e-06, + "loss": 0.4888187646865845, + "mean_token_accuracy": 0.8384175300598145, + "num_tokens": 9164754.0, + "step": 1013 + }, + { + "epoch": 0.770516717325228, + "grad_norm": 1.7615830898284912, + "learning_rate": 4.439565531011299e-06, + "loss": 0.4640008211135864, + "mean_token_accuracy": 0.8424701690673828, + "num_tokens": 9172715.0, + "step": 1014 + }, + { + "epoch": 0.7712765957446809, + "grad_norm": 1.6796128749847412, + "learning_rate": 4.43824339893979e-06, + "loss": 0.5227609276771545, + "mean_token_accuracy": 0.8135923743247986, + "num_tokens": 9183214.0, + "step": 1015 + }, + { + "epoch": 0.7720364741641338, + "grad_norm": 2.1485698223114014, + "learning_rate": 4.436919906534452e-06, + "loss": 0.4857056140899658, + "mean_token_accuracy": 0.8323013782501221, + "num_tokens": 9190360.0, + "step": 1016 + }, + { + "epoch": 0.7727963525835866, + "grad_norm": 2.7842206954956055, + "learning_rate": 4.4355950547241645e-06, + "loss": 0.46406883001327515, + "mean_token_accuracy": 0.859869122505188, + "num_tokens": 9194523.0, + "step": 1017 + }, + { + "epoch": 0.7735562310030395, + "grad_norm": 2.3774640560150146, + "learning_rate": 4.434268844438758e-06, + "loss": 0.5625549554824829, + "mean_token_accuracy": 0.8188897371292114, + "num_tokens": 9201155.0, + "step": 1018 + }, + { + "epoch": 0.7743161094224924, + "grad_norm": 2.004427909851074, + "learning_rate": 4.432941276609018e-06, + "loss": 0.5164387226104736, + "mean_token_accuracy": 0.829569935798645, + "num_tokens": 9209269.0, + "step": 1019 + }, + { + "epoch": 0.7750759878419453, + "grad_norm": 1.7218989133834839, + "learning_rate": 4.431612352166684e-06, + "loss": 0.481005996465683, + "mean_token_accuracy": 0.8359906673431396, + "num_tokens": 9220860.0, + "step": 1020 + }, + { + "epoch": 0.7758358662613982, + "grad_norm": 2.197108507156372, + "learning_rate": 4.4302820720444454e-06, + "loss": 0.440413236618042, + "mean_token_accuracy": 0.8412867784500122, + "num_tokens": 9226414.0, + "step": 1021 + }, + { + "epoch": 0.776595744680851, + "grad_norm": 2.6995162963867188, + "learning_rate": 4.428950437175944e-06, + "loss": 0.3884299397468567, + "mean_token_accuracy": 0.8696021437644958, + "num_tokens": 9230898.0, + "step": 1022 + }, + { + "epoch": 0.7773556231003039, + "grad_norm": 2.1671667098999023, + "learning_rate": 4.427617448495772e-06, + "loss": 0.5747478008270264, + "mean_token_accuracy": 0.7842930555343628, + "num_tokens": 9238479.0, + "step": 1023 + }, + { + "epoch": 0.7781155015197568, + "grad_norm": 1.6299028396606445, + "learning_rate": 4.426283106939474e-06, + "loss": 0.39478403329849243, + "mean_token_accuracy": 0.8685503602027893, + "num_tokens": 9248263.0, + "step": 1024 + }, + { + "epoch": 0.7788753799392097, + "grad_norm": 2.2621798515319824, + "learning_rate": 4.424947413443539e-06, + "loss": 0.4582178592681885, + "mean_token_accuracy": 0.8312377333641052, + "num_tokens": 9254168.0, + "step": 1025 + }, + { + "epoch": 0.7796352583586627, + "grad_norm": 2.121091365814209, + "learning_rate": 4.423610368945411e-06, + "loss": 0.5315121412277222, + "mean_token_accuracy": 0.8121483325958252, + "num_tokens": 9261808.0, + "step": 1026 + }, + { + "epoch": 0.7803951367781155, + "grad_norm": 1.8558297157287598, + "learning_rate": 4.422271974383479e-06, + "loss": 0.4299176037311554, + "mean_token_accuracy": 0.8452648520469666, + "num_tokens": 9269264.0, + "step": 1027 + }, + { + "epoch": 0.7811550151975684, + "grad_norm": 1.9089949131011963, + "learning_rate": 4.420932230697079e-06, + "loss": 0.43876272439956665, + "mean_token_accuracy": 0.8434094190597534, + "num_tokens": 9277381.0, + "step": 1028 + }, + { + "epoch": 0.7819148936170213, + "grad_norm": 1.8619649410247803, + "learning_rate": 4.419591138826495e-06, + "loss": 0.48798668384552, + "mean_token_accuracy": 0.8281317353248596, + "num_tokens": 9285413.0, + "step": 1029 + }, + { + "epoch": 0.7826747720364742, + "grad_norm": 1.3273087739944458, + "learning_rate": 4.418248699712955e-06, + "loss": 0.4611460864543915, + "mean_token_accuracy": 0.8233213424682617, + "num_tokens": 9300805.0, + "step": 1030 + }, + { + "epoch": 0.7834346504559271, + "grad_norm": 1.0473746061325073, + "learning_rate": 4.416904914298637e-06, + "loss": 0.36537665128707886, + "mean_token_accuracy": 0.8671857118606567, + "num_tokens": 9320035.0, + "step": 1031 + }, + { + "epoch": 0.78419452887538, + "grad_norm": 1.9130918979644775, + "learning_rate": 4.415559783526661e-06, + "loss": 0.4916655123233795, + "mean_token_accuracy": 0.8266351222991943, + "num_tokens": 9326795.0, + "step": 1032 + }, + { + "epoch": 0.7849544072948328, + "grad_norm": 2.0001816749572754, + "learning_rate": 4.414213308341092e-06, + "loss": 0.5711008310317993, + "mean_token_accuracy": 0.8093076348304749, + "num_tokens": 9335625.0, + "step": 1033 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 3.933542251586914, + "learning_rate": 4.412865489686936e-06, + "loss": 0.621616542339325, + "mean_token_accuracy": 0.7938898801803589, + "num_tokens": 9339080.0, + "step": 1034 + }, + { + "epoch": 0.7864741641337386, + "grad_norm": 2.061558961868286, + "learning_rate": 4.411516328510145e-06, + "loss": 0.583686113357544, + "mean_token_accuracy": 0.8216883540153503, + "num_tokens": 9348581.0, + "step": 1035 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 1.9401264190673828, + "learning_rate": 4.410165825757613e-06, + "loss": 0.4905240535736084, + "mean_token_accuracy": 0.8229951858520508, + "num_tokens": 9356032.0, + "step": 1036 + }, + { + "epoch": 0.7879939209726444, + "grad_norm": 3.620547294616699, + "learning_rate": 4.408813982377175e-06, + "loss": 0.4269888997077942, + "mean_token_accuracy": 0.8713940978050232, + "num_tokens": 9359061.0, + "step": 1037 + }, + { + "epoch": 0.7887537993920972, + "grad_norm": 1.2027851343154907, + "learning_rate": 4.407460799317605e-06, + "loss": 0.39972418546676636, + "mean_token_accuracy": 0.8610097765922546, + "num_tokens": 9377068.0, + "step": 1038 + }, + { + "epoch": 0.7895136778115501, + "grad_norm": 2.566753387451172, + "learning_rate": 4.40610627752862e-06, + "loss": 0.45267152786254883, + "mean_token_accuracy": 0.83243328332901, + "num_tokens": 9383604.0, + "step": 1039 + }, + { + "epoch": 0.790273556231003, + "grad_norm": 2.940094470977783, + "learning_rate": 4.404750417960876e-06, + "loss": 0.42862242460250854, + "mean_token_accuracy": 0.8582849502563477, + "num_tokens": 9387541.0, + "step": 1040 + }, + { + "epoch": 0.791033434650456, + "grad_norm": 2.0223944187164307, + "learning_rate": 4.403393221565966e-06, + "loss": 0.4349963665008545, + "mean_token_accuracy": 0.8453047871589661, + "num_tokens": 9394382.0, + "step": 1041 + }, + { + "epoch": 0.7917933130699089, + "grad_norm": 2.9399030208587646, + "learning_rate": 4.402034689296425e-06, + "loss": 0.32197174429893494, + "mean_token_accuracy": 0.8953392505645752, + "num_tokens": 9397741.0, + "step": 1042 + }, + { + "epoch": 0.7925531914893617, + "grad_norm": 2.819016456604004, + "learning_rate": 4.400674822105721e-06, + "loss": 0.6790289878845215, + "mean_token_accuracy": 0.8135063648223877, + "num_tokens": 9403509.0, + "step": 1043 + }, + { + "epoch": 0.7933130699088146, + "grad_norm": 1.3225977420806885, + "learning_rate": 4.399313620948262e-06, + "loss": 0.42203834652900696, + "mean_token_accuracy": 0.8399381637573242, + "num_tokens": 9418870.0, + "step": 1044 + }, + { + "epoch": 0.7940729483282675, + "grad_norm": 1.7822176218032837, + "learning_rate": 4.397951086779392e-06, + "loss": 0.4666554927825928, + "mean_token_accuracy": 0.8364764451980591, + "num_tokens": 9427640.0, + "step": 1045 + }, + { + "epoch": 0.7948328267477204, + "grad_norm": 3.186439037322998, + "learning_rate": 4.396587220555389e-06, + "loss": 0.6048363447189331, + "mean_token_accuracy": 0.7806557416915894, + "num_tokens": 9431927.0, + "step": 1046 + }, + { + "epoch": 0.7955927051671733, + "grad_norm": 3.0804805755615234, + "learning_rate": 4.395222023233467e-06, + "loss": 0.445969820022583, + "mean_token_accuracy": 0.850671112537384, + "num_tokens": 9436136.0, + "step": 1047 + }, + { + "epoch": 0.7963525835866262, + "grad_norm": 1.675968885421753, + "learning_rate": 4.393855495771774e-06, + "loss": 0.4311422109603882, + "mean_token_accuracy": 0.8449079990386963, + "num_tokens": 9445189.0, + "step": 1048 + }, + { + "epoch": 0.797112462006079, + "grad_norm": 2.342410087585449, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.5733606219291687, + "mean_token_accuracy": 0.8156592845916748, + "num_tokens": 9451939.0, + "step": 1049 + }, + { + "epoch": 0.7978723404255319, + "grad_norm": 1.5967470407485962, + "learning_rate": 4.391118454266335e-06, + "loss": 0.46664729714393616, + "mean_token_accuracy": 0.8091695308685303, + "num_tokens": 9463968.0, + "step": 1050 + }, + { + "epoch": 0.7986322188449848, + "grad_norm": 1.5777863264083862, + "learning_rate": 4.389747942143549e-06, + "loss": 0.46028903126716614, + "mean_token_accuracy": 0.8347330093383789, + "num_tokens": 9475561.0, + "step": 1051 + }, + { + "epoch": 0.7993920972644377, + "grad_norm": 2.7630488872528076, + "learning_rate": 4.388376103722914e-06, + "loss": 0.5618188977241516, + "mean_token_accuracy": 0.8273467421531677, + "num_tokens": 9480661.0, + "step": 1052 + }, + { + "epoch": 0.8001519756838906, + "grad_norm": 2.093397378921509, + "learning_rate": 4.387002939967237e-06, + "loss": 0.2998353838920593, + "mean_token_accuracy": 0.8905231952667236, + "num_tokens": 9485924.0, + "step": 1053 + }, + { + "epoch": 0.8009118541033434, + "grad_norm": 1.4385871887207031, + "learning_rate": 4.38562845184026e-06, + "loss": 0.4944111704826355, + "mean_token_accuracy": 0.8403056263923645, + "num_tokens": 9500128.0, + "step": 1054 + }, + { + "epoch": 0.8016717325227963, + "grad_norm": 1.6393156051635742, + "learning_rate": 4.384252640306649e-06, + "loss": 0.5727907419204712, + "mean_token_accuracy": 0.7849414348602295, + "num_tokens": 9511569.0, + "step": 1055 + }, + { + "epoch": 0.8024316109422492, + "grad_norm": 2.3909664154052734, + "learning_rate": 4.382875506332002e-06, + "loss": 0.4760419726371765, + "mean_token_accuracy": 0.8408266305923462, + "num_tokens": 9517244.0, + "step": 1056 + }, + { + "epoch": 0.8031914893617021, + "grad_norm": 1.7288594245910645, + "learning_rate": 4.381497050882845e-06, + "loss": 0.5375926494598389, + "mean_token_accuracy": 0.8138614892959595, + "num_tokens": 9528736.0, + "step": 1057 + }, + { + "epoch": 0.8039513677811551, + "grad_norm": 2.093407392501831, + "learning_rate": 4.380117274926632e-06, + "loss": 0.46659404039382935, + "mean_token_accuracy": 0.8450702428817749, + "num_tokens": 9536200.0, + "step": 1058 + }, + { + "epoch": 0.8047112462006079, + "grad_norm": 1.6835898160934448, + "learning_rate": 4.3787361794317405e-06, + "loss": 0.43157699704170227, + "mean_token_accuracy": 0.8279973268508911, + "num_tokens": 9546314.0, + "step": 1059 + }, + { + "epoch": 0.8054711246200608, + "grad_norm": 1.983067512512207, + "learning_rate": 4.377353765367479e-06, + "loss": 0.5021739602088928, + "mean_token_accuracy": 0.8274815082550049, + "num_tokens": 9554375.0, + "step": 1060 + }, + { + "epoch": 0.8062310030395137, + "grad_norm": 2.0472030639648438, + "learning_rate": 4.375970033704078e-06, + "loss": 0.34298190474510193, + "mean_token_accuracy": 0.8900876045227051, + "num_tokens": 9560230.0, + "step": 1061 + }, + { + "epoch": 0.8069908814589666, + "grad_norm": 1.9613717794418335, + "learning_rate": 4.374584985412692e-06, + "loss": 0.3826758861541748, + "mean_token_accuracy": 0.839923620223999, + "num_tokens": 9566809.0, + "step": 1062 + }, + { + "epoch": 0.8077507598784195, + "grad_norm": 1.991289496421814, + "learning_rate": 4.373198621465405e-06, + "loss": 0.5492525100708008, + "mean_token_accuracy": 0.8153272867202759, + "num_tokens": 9576810.0, + "step": 1063 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.421370506286621, + "learning_rate": 4.3718109428352155e-06, + "loss": 0.5240297317504883, + "mean_token_accuracy": 0.8087242245674133, + "num_tokens": 9582906.0, + "step": 1064 + }, + { + "epoch": 0.8092705167173252, + "grad_norm": 3.697765588760376, + "learning_rate": 4.370421950496055e-06, + "loss": 0.6096476912498474, + "mean_token_accuracy": 0.787585973739624, + "num_tokens": 9586920.0, + "step": 1065 + }, + { + "epoch": 0.8100303951367781, + "grad_norm": 2.0767786502838135, + "learning_rate": 4.369031645422768e-06, + "loss": 0.41120079159736633, + "mean_token_accuracy": 0.8513731956481934, + "num_tokens": 9593902.0, + "step": 1066 + }, + { + "epoch": 0.810790273556231, + "grad_norm": 2.5968732833862305, + "learning_rate": 4.367640028591126e-06, + "loss": 0.3364982008934021, + "mean_token_accuracy": 0.8786963224411011, + "num_tokens": 9597745.0, + "step": 1067 + }, + { + "epoch": 0.8115501519756839, + "grad_norm": 2.165742874145508, + "learning_rate": 4.366247100977818e-06, + "loss": 0.406129390001297, + "mean_token_accuracy": 0.868243932723999, + "num_tokens": 9603496.0, + "step": 1068 + }, + { + "epoch": 0.8123100303951368, + "grad_norm": 2.0493404865264893, + "learning_rate": 4.364852863560456e-06, + "loss": 0.5356296300888062, + "mean_token_accuracy": 0.8191947340965271, + "num_tokens": 9610898.0, + "step": 1069 + }, + { + "epoch": 0.8130699088145896, + "grad_norm": 2.3224308490753174, + "learning_rate": 4.363457317317568e-06, + "loss": 0.41461923718452454, + "mean_token_accuracy": 0.8537945747375488, + "num_tokens": 9616626.0, + "step": 1070 + }, + { + "epoch": 0.8138297872340425, + "grad_norm": 1.7387986183166504, + "learning_rate": 4.362060463228603e-06, + "loss": 0.5134786367416382, + "mean_token_accuracy": 0.8511737585067749, + "num_tokens": 9626223.0, + "step": 1071 + }, + { + "epoch": 0.8145896656534954, + "grad_norm": 3.0270655155181885, + "learning_rate": 4.360662302273926e-06, + "loss": 0.3410695791244507, + "mean_token_accuracy": 0.8746449947357178, + "num_tokens": 9629455.0, + "step": 1072 + }, + { + "epoch": 0.8153495440729484, + "grad_norm": 1.7727062702178955, + "learning_rate": 4.35926283543482e-06, + "loss": 0.4610968828201294, + "mean_token_accuracy": 0.8444793224334717, + "num_tokens": 9638070.0, + "step": 1073 + }, + { + "epoch": 0.8161094224924013, + "grad_norm": 3.6333565711975098, + "learning_rate": 4.357862063693486e-06, + "loss": 0.3881273865699768, + "mean_token_accuracy": 0.8757344484329224, + "num_tokens": 9641028.0, + "step": 1074 + }, + { + "epoch": 0.8168693009118541, + "grad_norm": 3.024042844772339, + "learning_rate": 4.356459988033039e-06, + "loss": 0.3853808641433716, + "mean_token_accuracy": 0.8602254390716553, + "num_tokens": 9645730.0, + "step": 1075 + }, + { + "epoch": 0.817629179331307, + "grad_norm": 2.3359482288360596, + "learning_rate": 4.355056609437509e-06, + "loss": 0.4852045476436615, + "mean_token_accuracy": 0.8502728343009949, + "num_tokens": 9650975.0, + "step": 1076 + }, + { + "epoch": 0.8183890577507599, + "grad_norm": 2.2390685081481934, + "learning_rate": 4.353651928891842e-06, + "loss": 0.5287341475486755, + "mean_token_accuracy": 0.8247801065444946, + "num_tokens": 9657471.0, + "step": 1077 + }, + { + "epoch": 0.8191489361702128, + "grad_norm": 2.3809144496917725, + "learning_rate": 4.352245947381897e-06, + "loss": 0.5218510627746582, + "mean_token_accuracy": 0.8149170875549316, + "num_tokens": 9664108.0, + "step": 1078 + }, + { + "epoch": 0.8199088145896657, + "grad_norm": 1.7072309255599976, + "learning_rate": 4.3508386658944455e-06, + "loss": 0.46481168270111084, + "mean_token_accuracy": 0.834963321685791, + "num_tokens": 9673175.0, + "step": 1079 + }, + { + "epoch": 0.8206686930091185, + "grad_norm": 1.7383702993392944, + "learning_rate": 4.349430085417171e-06, + "loss": 0.4505952000617981, + "mean_token_accuracy": 0.8507769107818604, + "num_tokens": 9682800.0, + "step": 1080 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 2.4308547973632812, + "learning_rate": 4.348020206938672e-06, + "loss": 0.4832455515861511, + "mean_token_accuracy": 0.8538393974304199, + "num_tokens": 9688123.0, + "step": 1081 + }, + { + "epoch": 0.8221884498480243, + "grad_norm": 2.2686192989349365, + "learning_rate": 4.3466090314484526e-06, + "loss": 0.5112563371658325, + "mean_token_accuracy": 0.8308460712432861, + "num_tokens": 9694299.0, + "step": 1082 + }, + { + "epoch": 0.8229483282674772, + "grad_norm": 2.806093454360962, + "learning_rate": 4.345196559936931e-06, + "loss": 0.4818246364593506, + "mean_token_accuracy": 0.86617112159729, + "num_tokens": 9698471.0, + "step": 1083 + }, + { + "epoch": 0.8237082066869301, + "grad_norm": 1.7340706586837769, + "learning_rate": 4.343782793395435e-06, + "loss": 0.38246971368789673, + "mean_token_accuracy": 0.8675198554992676, + "num_tokens": 9706444.0, + "step": 1084 + }, + { + "epoch": 0.824468085106383, + "grad_norm": 1.664942741394043, + "learning_rate": 4.3423677328162e-06, + "loss": 0.498797208070755, + "mean_token_accuracy": 0.8447319865226746, + "num_tokens": 9716765.0, + "step": 1085 + }, + { + "epoch": 0.8252279635258358, + "grad_norm": 1.3608235120773315, + "learning_rate": 4.340951379192369e-06, + "loss": 0.41961491107940674, + "mean_token_accuracy": 0.8339346647262573, + "num_tokens": 9729564.0, + "step": 1086 + }, + { + "epoch": 0.8259878419452887, + "grad_norm": 1.642503261566162, + "learning_rate": 4.3395337335179945e-06, + "loss": 0.5477945804595947, + "mean_token_accuracy": 0.8117889761924744, + "num_tokens": 9741217.0, + "step": 1087 + }, + { + "epoch": 0.8267477203647416, + "grad_norm": 3.0345044136047363, + "learning_rate": 4.338114796788035e-06, + "loss": 0.5024623870849609, + "mean_token_accuracy": 0.8333141207695007, + "num_tokens": 9744941.0, + "step": 1088 + }, + { + "epoch": 0.8275075987841946, + "grad_norm": 1.3096630573272705, + "learning_rate": 4.336694569998354e-06, + "loss": 0.44169723987579346, + "mean_token_accuracy": 0.859926700592041, + "num_tokens": 9757854.0, + "step": 1089 + }, + { + "epoch": 0.8282674772036475, + "grad_norm": 2.203279495239258, + "learning_rate": 4.3352730541457215e-06, + "loss": 0.5283265113830566, + "mean_token_accuracy": 0.8053759932518005, + "num_tokens": 9764096.0, + "step": 1090 + }, + { + "epoch": 0.8290273556231003, + "grad_norm": 1.3774312734603882, + "learning_rate": 4.333850250227814e-06, + "loss": 0.4584103226661682, + "mean_token_accuracy": 0.8342611193656921, + "num_tokens": 9777768.0, + "step": 1091 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 1.822637915611267, + "learning_rate": 4.332426159243206e-06, + "loss": 0.5432791709899902, + "mean_token_accuracy": 0.8136210441589355, + "num_tokens": 9791276.0, + "step": 1092 + }, + { + "epoch": 0.8305471124620061, + "grad_norm": 3.0190067291259766, + "learning_rate": 4.331000782191384e-06, + "loss": 0.5018150806427002, + "mean_token_accuracy": 0.8234807252883911, + "num_tokens": 9794902.0, + "step": 1093 + }, + { + "epoch": 0.831306990881459, + "grad_norm": 2.09987735748291, + "learning_rate": 4.329574120072728e-06, + "loss": 0.4270891547203064, + "mean_token_accuracy": 0.8544977903366089, + "num_tokens": 9800903.0, + "step": 1094 + }, + { + "epoch": 0.8320668693009119, + "grad_norm": 1.969549536705017, + "learning_rate": 4.328146173888528e-06, + "loss": 0.45801427960395813, + "mean_token_accuracy": 0.8334714770317078, + "num_tokens": 9808719.0, + "step": 1095 + }, + { + "epoch": 0.8328267477203647, + "grad_norm": 1.4565571546554565, + "learning_rate": 4.32671694464097e-06, + "loss": 0.34864288568496704, + "mean_token_accuracy": 0.8689061999320984, + "num_tokens": 9818262.0, + "step": 1096 + }, + { + "epoch": 0.8335866261398176, + "grad_norm": 1.2163832187652588, + "learning_rate": 4.3252864333331424e-06, + "loss": 0.37953704595565796, + "mean_token_accuracy": 0.866554856300354, + "num_tokens": 9833942.0, + "step": 1097 + }, + { + "epoch": 0.8343465045592705, + "grad_norm": 1.6112010478973389, + "learning_rate": 4.323854640969033e-06, + "loss": 0.5442801713943481, + "mean_token_accuracy": 0.8190416097640991, + "num_tokens": 9844765.0, + "step": 1098 + }, + { + "epoch": 0.8351063829787234, + "grad_norm": 1.8190315961837769, + "learning_rate": 4.322421568553529e-06, + "loss": 0.48271381855010986, + "mean_token_accuracy": 0.8203652501106262, + "num_tokens": 9852625.0, + "step": 1099 + }, + { + "epoch": 0.8358662613981763, + "grad_norm": 2.7897756099700928, + "learning_rate": 4.320987217092416e-06, + "loss": 0.4086323380470276, + "mean_token_accuracy": 0.8504934310913086, + "num_tokens": 9856888.0, + "step": 1100 + }, + { + "epoch": 0.8366261398176292, + "grad_norm": 1.7035977840423584, + "learning_rate": 4.319551587592377e-06, + "loss": 0.6325064301490784, + "mean_token_accuracy": 0.788190484046936, + "num_tokens": 9869419.0, + "step": 1101 + }, + { + "epoch": 0.837386018237082, + "grad_norm": 2.609731912612915, + "learning_rate": 4.318114681060989e-06, + "loss": 0.519314706325531, + "mean_token_accuracy": 0.8469992280006409, + "num_tokens": 9874553.0, + "step": 1102 + }, + { + "epoch": 0.8381458966565349, + "grad_norm": 1.2519766092300415, + "learning_rate": 4.316676498506735e-06, + "loss": 0.3566005825996399, + "mean_token_accuracy": 0.8588439226150513, + "num_tokens": 9886498.0, + "step": 1103 + }, + { + "epoch": 0.8389057750759878, + "grad_norm": 1.430892825126648, + "learning_rate": 4.3152370409389795e-06, + "loss": 0.5250182747840881, + "mean_token_accuracy": 0.8164948225021362, + "num_tokens": 9900256.0, + "step": 1104 + }, + { + "epoch": 0.8396656534954408, + "grad_norm": 3.1245436668395996, + "learning_rate": 4.3137963093679945e-06, + "loss": 0.3173971176147461, + "mean_token_accuracy": 0.8835347890853882, + "num_tokens": 9903899.0, + "step": 1105 + }, + { + "epoch": 0.8404255319148937, + "grad_norm": 3.131812572479248, + "learning_rate": 4.3123543048049395e-06, + "loss": 0.6567763090133667, + "mean_token_accuracy": 0.8233605027198792, + "num_tokens": 9908798.0, + "step": 1106 + }, + { + "epoch": 0.8411854103343465, + "grad_norm": 1.3551725149154663, + "learning_rate": 4.310911028261867e-06, + "loss": 0.3993729054927826, + "mean_token_accuracy": 0.8529655933380127, + "num_tokens": 9922577.0, + "step": 1107 + }, + { + "epoch": 0.8419452887537994, + "grad_norm": 2.572533130645752, + "learning_rate": 4.309466480751726e-06, + "loss": 0.40906503796577454, + "mean_token_accuracy": 0.8630726933479309, + "num_tokens": 9926890.0, + "step": 1108 + }, + { + "epoch": 0.8427051671732523, + "grad_norm": 1.9146469831466675, + "learning_rate": 4.308020663288356e-06, + "loss": 0.48423194885253906, + "mean_token_accuracy": 0.8370280861854553, + "num_tokens": 9934293.0, + "step": 1109 + }, + { + "epoch": 0.8434650455927052, + "grad_norm": 1.6178001165390015, + "learning_rate": 4.306573576886485e-06, + "loss": 0.4262213408946991, + "mean_token_accuracy": 0.839401125907898, + "num_tokens": 9944513.0, + "step": 1110 + }, + { + "epoch": 0.8442249240121581, + "grad_norm": 2.4444572925567627, + "learning_rate": 4.305125222561736e-06, + "loss": 0.5199950933456421, + "mean_token_accuracy": 0.8507720232009888, + "num_tokens": 9949512.0, + "step": 1111 + }, + { + "epoch": 0.8449848024316109, + "grad_norm": 1.7983134984970093, + "learning_rate": 4.303675601330618e-06, + "loss": 0.36155956983566284, + "mean_token_accuracy": 0.8568712472915649, + "num_tokens": 9956402.0, + "step": 1112 + }, + { + "epoch": 0.8457446808510638, + "grad_norm": 2.391096353530884, + "learning_rate": 4.302224714210532e-06, + "loss": 0.5391949415206909, + "mean_token_accuracy": 0.8183057308197021, + "num_tokens": 9961606.0, + "step": 1113 + }, + { + "epoch": 0.8465045592705167, + "grad_norm": 1.8520214557647705, + "learning_rate": 4.3007725622197675e-06, + "loss": 0.5758882761001587, + "mean_token_accuracy": 0.7924330234527588, + "num_tokens": 9971473.0, + "step": 1114 + }, + { + "epoch": 0.8472644376899696, + "grad_norm": 2.436640739440918, + "learning_rate": 4.2993191463775e-06, + "loss": 0.3837985396385193, + "mean_token_accuracy": 0.8620110750198364, + "num_tokens": 9976333.0, + "step": 1115 + }, + { + "epoch": 0.8480243161094225, + "grad_norm": 1.7287120819091797, + "learning_rate": 4.29786446770379e-06, + "loss": 0.40066856145858765, + "mean_token_accuracy": 0.8618333339691162, + "num_tokens": 9985617.0, + "step": 1116 + }, + { + "epoch": 0.8487841945288754, + "grad_norm": 2.0310518741607666, + "learning_rate": 4.296408527219592e-06, + "loss": 0.5465943217277527, + "mean_token_accuracy": 0.812044620513916, + "num_tokens": 9995363.0, + "step": 1117 + }, + { + "epoch": 0.8495440729483282, + "grad_norm": 1.4858589172363281, + "learning_rate": 4.294951325946737e-06, + "loss": 0.45840176939964294, + "mean_token_accuracy": 0.8432979583740234, + "num_tokens": 10006400.0, + "step": 1118 + }, + { + "epoch": 0.8503039513677811, + "grad_norm": 1.6153514385223389, + "learning_rate": 4.293492864907947e-06, + "loss": 0.5225611925125122, + "mean_token_accuracy": 0.8180211186408997, + "num_tokens": 10018352.0, + "step": 1119 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.1178412437438965, + "learning_rate": 4.2920331451268246e-06, + "loss": 0.5580621361732483, + "mean_token_accuracy": 0.8211709260940552, + "num_tokens": 10025614.0, + "step": 1120 + }, + { + "epoch": 0.851823708206687, + "grad_norm": 2.036839246749878, + "learning_rate": 4.2905721676278585e-06, + "loss": 0.4658433198928833, + "mean_token_accuracy": 0.8380423784255981, + "num_tokens": 10032489.0, + "step": 1121 + }, + { + "epoch": 0.8525835866261399, + "grad_norm": 2.0056262016296387, + "learning_rate": 4.28910993343642e-06, + "loss": 0.47023308277130127, + "mean_token_accuracy": 0.8340359926223755, + "num_tokens": 10040050.0, + "step": 1122 + }, + { + "epoch": 0.8533434650455927, + "grad_norm": 2.540024518966675, + "learning_rate": 4.2876464435787576e-06, + "loss": 0.502303957939148, + "mean_token_accuracy": 0.8288739919662476, + "num_tokens": 10045042.0, + "step": 1123 + }, + { + "epoch": 0.8541033434650456, + "grad_norm": 1.7894693613052368, + "learning_rate": 4.286181699082008e-06, + "loss": 0.4732973575592041, + "mean_token_accuracy": 0.8340568542480469, + "num_tokens": 10054424.0, + "step": 1124 + }, + { + "epoch": 0.8548632218844985, + "grad_norm": 1.5601223707199097, + "learning_rate": 4.284715700974186e-06, + "loss": 0.472471684217453, + "mean_token_accuracy": 0.8274722695350647, + "num_tokens": 10065523.0, + "step": 1125 + }, + { + "epoch": 0.8556231003039514, + "grad_norm": 1.7326055765151978, + "learning_rate": 4.283248450284182e-06, + "loss": 0.5924872159957886, + "mean_token_accuracy": 0.7943467497825623, + "num_tokens": 10076839.0, + "step": 1126 + }, + { + "epoch": 0.8563829787234043, + "grad_norm": 1.5165479183197021, + "learning_rate": 4.281779948041772e-06, + "loss": 0.44768425822257996, + "mean_token_accuracy": 0.8394696712493896, + "num_tokens": 10088168.0, + "step": 1127 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.5448920726776123, + "learning_rate": 4.280310195277606e-06, + "loss": 0.4458175003528595, + "mean_token_accuracy": 0.835773229598999, + "num_tokens": 10100306.0, + "step": 1128 + }, + { + "epoch": 0.85790273556231, + "grad_norm": 1.6311609745025635, + "learning_rate": 4.278839193023214e-06, + "loss": 0.4158072769641876, + "mean_token_accuracy": 0.8482539653778076, + "num_tokens": 10110581.0, + "step": 1129 + }, + { + "epoch": 0.8586626139817629, + "grad_norm": 1.6714754104614258, + "learning_rate": 4.277366942311001e-06, + "loss": 0.3686875104904175, + "mean_token_accuracy": 0.8681533336639404, + "num_tokens": 10118799.0, + "step": 1130 + }, + { + "epoch": 0.8594224924012158, + "grad_norm": 2.1604413986206055, + "learning_rate": 4.2758934441742494e-06, + "loss": 0.37267982959747314, + "mean_token_accuracy": 0.8520427346229553, + "num_tokens": 10124734.0, + "step": 1131 + }, + { + "epoch": 0.8601823708206687, + "grad_norm": 2.123013973236084, + "learning_rate": 4.274418699647117e-06, + "loss": 0.49963313341140747, + "mean_token_accuracy": 0.8248758912086487, + "num_tokens": 10131965.0, + "step": 1132 + }, + { + "epoch": 0.8609422492401215, + "grad_norm": 1.4308786392211914, + "learning_rate": 4.272942709764638e-06, + "loss": 0.48666873574256897, + "mean_token_accuracy": 0.8304717540740967, + "num_tokens": 10145164.0, + "step": 1133 + }, + { + "epoch": 0.8617021276595744, + "grad_norm": 1.7952618598937988, + "learning_rate": 4.271465475562716e-06, + "loss": 0.5536223649978638, + "mean_token_accuracy": 0.8093959093093872, + "num_tokens": 10154083.0, + "step": 1134 + }, + { + "epoch": 0.8624620060790273, + "grad_norm": 2.0622456073760986, + "learning_rate": 4.269986998078132e-06, + "loss": 0.5173629522323608, + "mean_token_accuracy": 0.8285619020462036, + "num_tokens": 10161889.0, + "step": 1135 + }, + { + "epoch": 0.8632218844984803, + "grad_norm": 2.0707509517669678, + "learning_rate": 4.268507278348539e-06, + "loss": 0.5871608257293701, + "mean_token_accuracy": 0.7827386856079102, + "num_tokens": 10170726.0, + "step": 1136 + }, + { + "epoch": 0.8639817629179332, + "grad_norm": 2.054368257522583, + "learning_rate": 4.2670263174124615e-06, + "loss": 0.5788969993591309, + "mean_token_accuracy": 0.7967237234115601, + "num_tokens": 10178474.0, + "step": 1137 + }, + { + "epoch": 0.8647416413373861, + "grad_norm": 1.901846170425415, + "learning_rate": 4.265544116309294e-06, + "loss": 0.5405587553977966, + "mean_token_accuracy": 0.8151819705963135, + "num_tokens": 10187013.0, + "step": 1138 + }, + { + "epoch": 0.8655015197568389, + "grad_norm": 2.901285409927368, + "learning_rate": 4.264060676079302e-06, + "loss": 0.44101861119270325, + "mean_token_accuracy": 0.8433429002761841, + "num_tokens": 10191517.0, + "step": 1139 + }, + { + "epoch": 0.8662613981762918, + "grad_norm": 2.4168388843536377, + "learning_rate": 4.262575997763622e-06, + "loss": 0.4686204195022583, + "mean_token_accuracy": 0.8505309820175171, + "num_tokens": 10196948.0, + "step": 1140 + }, + { + "epoch": 0.8670212765957447, + "grad_norm": 1.9588396549224854, + "learning_rate": 4.2610900824042575e-06, + "loss": 0.47056013345718384, + "mean_token_accuracy": 0.8280024528503418, + "num_tokens": 10204292.0, + "step": 1141 + }, + { + "epoch": 0.8677811550151976, + "grad_norm": 2.569150924682617, + "learning_rate": 4.2596029310440826e-06, + "loss": 0.573108434677124, + "mean_token_accuracy": 0.8108246326446533, + "num_tokens": 10209571.0, + "step": 1142 + }, + { + "epoch": 0.8685410334346505, + "grad_norm": 2.038032293319702, + "learning_rate": 4.258114544726835e-06, + "loss": 0.40545332431793213, + "mean_token_accuracy": 0.8611703515052795, + "num_tokens": 10215716.0, + "step": 1143 + }, + { + "epoch": 0.8693009118541033, + "grad_norm": 1.9884231090545654, + "learning_rate": 4.256624924497124e-06, + "loss": 0.40085992217063904, + "mean_token_accuracy": 0.8615031242370605, + "num_tokens": 10222775.0, + "step": 1144 + }, + { + "epoch": 0.8700607902735562, + "grad_norm": 1.912842035293579, + "learning_rate": 4.25513407140042e-06, + "loss": 0.41022324562072754, + "mean_token_accuracy": 0.8459607362747192, + "num_tokens": 10229589.0, + "step": 1145 + }, + { + "epoch": 0.8708206686930091, + "grad_norm": 1.9190576076507568, + "learning_rate": 4.253641986483063e-06, + "loss": 0.5541447401046753, + "mean_token_accuracy": 0.8256468772888184, + "num_tokens": 10240633.0, + "step": 1146 + }, + { + "epoch": 0.871580547112462, + "grad_norm": 1.3742294311523438, + "learning_rate": 4.2521486707922545e-06, + "loss": 0.3680543899536133, + "mean_token_accuracy": 0.8654477596282959, + "num_tokens": 10251252.0, + "step": 1147 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 1.4438525438308716, + "learning_rate": 4.250654125376062e-06, + "loss": 0.45830875635147095, + "mean_token_accuracy": 0.8433834314346313, + "num_tokens": 10263980.0, + "step": 1148 + }, + { + "epoch": 0.8731003039513677, + "grad_norm": 2.1273653507232666, + "learning_rate": 4.249158351283414e-06, + "loss": 0.4129376709461212, + "mean_token_accuracy": 0.861556351184845, + "num_tokens": 10270426.0, + "step": 1149 + }, + { + "epoch": 0.8738601823708206, + "grad_norm": 2.598440647125244, + "learning_rate": 4.247661349564103e-06, + "loss": 0.418030709028244, + "mean_token_accuracy": 0.86553955078125, + "num_tokens": 10275493.0, + "step": 1150 + }, + { + "epoch": 0.8746200607902735, + "grad_norm": 1.6852490901947021, + "learning_rate": 4.246163121268782e-06, + "loss": 0.6403408050537109, + "mean_token_accuracy": 0.7966094017028809, + "num_tokens": 10287989.0, + "step": 1151 + }, + { + "epoch": 0.8753799392097265, + "grad_norm": 2.5013794898986816, + "learning_rate": 4.244663667448965e-06, + "loss": 0.49922505021095276, + "mean_token_accuracy": 0.8318735361099243, + "num_tokens": 10293360.0, + "step": 1152 + }, + { + "epoch": 0.8761398176291794, + "grad_norm": 1.2022709846496582, + "learning_rate": 4.243162989157027e-06, + "loss": 0.4414965510368347, + "mean_token_accuracy": 0.8338693380355835, + "num_tokens": 10310558.0, + "step": 1153 + }, + { + "epoch": 0.8768996960486323, + "grad_norm": 1.9903281927108765, + "learning_rate": 4.241661087446202e-06, + "loss": 0.4277610778808594, + "mean_token_accuracy": 0.8560749292373657, + "num_tokens": 10316983.0, + "step": 1154 + }, + { + "epoch": 0.8776595744680851, + "grad_norm": 2.104923725128174, + "learning_rate": 4.240157963370583e-06, + "loss": 0.44431713223457336, + "mean_token_accuracy": 0.8785282969474792, + "num_tokens": 10323294.0, + "step": 1155 + }, + { + "epoch": 0.878419452887538, + "grad_norm": 2.8364813327789307, + "learning_rate": 4.2386536179851175e-06, + "loss": 0.49948397278785706, + "mean_token_accuracy": 0.8305255174636841, + "num_tokens": 10327662.0, + "step": 1156 + }, + { + "epoch": 0.8791793313069909, + "grad_norm": 1.9493682384490967, + "learning_rate": 4.2371480523456156e-06, + "loss": 0.45867404341697693, + "mean_token_accuracy": 0.8373264074325562, + "num_tokens": 10335699.0, + "step": 1157 + }, + { + "epoch": 0.8799392097264438, + "grad_norm": 2.268616199493408, + "learning_rate": 4.235641267508741e-06, + "loss": 0.4547857940196991, + "mean_token_accuracy": 0.8252766132354736, + "num_tokens": 10342464.0, + "step": 1158 + }, + { + "epoch": 0.8806990881458967, + "grad_norm": 2.1334283351898193, + "learning_rate": 4.234133264532012e-06, + "loss": 0.39503124356269836, + "mean_token_accuracy": 0.8648351430892944, + "num_tokens": 10347514.0, + "step": 1159 + }, + { + "epoch": 0.8814589665653495, + "grad_norm": 1.2775357961654663, + "learning_rate": 4.232624044473805e-06, + "loss": 0.39945733547210693, + "mean_token_accuracy": 0.8369829654693604, + "num_tokens": 10363316.0, + "step": 1160 + }, + { + "epoch": 0.8822188449848024, + "grad_norm": 2.458413600921631, + "learning_rate": 4.231113608393348e-06, + "loss": 0.5020045638084412, + "mean_token_accuracy": 0.8295938968658447, + "num_tokens": 10368401.0, + "step": 1161 + }, + { + "epoch": 0.8829787234042553, + "grad_norm": 1.7464948892593384, + "learning_rate": 4.229601957350722e-06, + "loss": 0.5335392951965332, + "mean_token_accuracy": 0.8134858012199402, + "num_tokens": 10378337.0, + "step": 1162 + }, + { + "epoch": 0.8837386018237082, + "grad_norm": 3.1152119636535645, + "learning_rate": 4.228089092406863e-06, + "loss": 0.4811682105064392, + "mean_token_accuracy": 0.8460187315940857, + "num_tokens": 10382362.0, + "step": 1163 + }, + { + "epoch": 0.8844984802431611, + "grad_norm": 2.190847158432007, + "learning_rate": 4.226575014623557e-06, + "loss": 0.4428049921989441, + "mean_token_accuracy": 0.8382467031478882, + "num_tokens": 10388211.0, + "step": 1164 + }, + { + "epoch": 0.8852583586626139, + "grad_norm": 1.860153079032898, + "learning_rate": 4.225059725063444e-06, + "loss": 0.5265918970108032, + "mean_token_accuracy": 0.8181334733963013, + "num_tokens": 10398873.0, + "step": 1165 + }, + { + "epoch": 0.8860182370820668, + "grad_norm": 1.3372713327407837, + "learning_rate": 4.22354322479001e-06, + "loss": 0.43202850222587585, + "mean_token_accuracy": 0.8432420492172241, + "num_tokens": 10413158.0, + "step": 1166 + }, + { + "epoch": 0.8867781155015197, + "grad_norm": 1.3653379678726196, + "learning_rate": 4.222025514867596e-06, + "loss": 0.43780991435050964, + "mean_token_accuracy": 0.8441485166549683, + "num_tokens": 10428137.0, + "step": 1167 + }, + { + "epoch": 0.8875379939209727, + "grad_norm": 3.0230672359466553, + "learning_rate": 4.220506596361387e-06, + "loss": 0.6039337515830994, + "mean_token_accuracy": 0.8274872303009033, + "num_tokens": 10432586.0, + "step": 1168 + }, + { + "epoch": 0.8882978723404256, + "grad_norm": 2.2180392742156982, + "learning_rate": 4.218986470337419e-06, + "loss": 0.5453792810440063, + "mean_token_accuracy": 0.8127184510231018, + "num_tokens": 10439471.0, + "step": 1169 + }, + { + "epoch": 0.8890577507598785, + "grad_norm": 1.8519103527069092, + "learning_rate": 4.217465137862575e-06, + "loss": 0.5145469903945923, + "mean_token_accuracy": 0.8178654909133911, + "num_tokens": 10450471.0, + "step": 1170 + }, + { + "epoch": 0.8898176291793313, + "grad_norm": 2.034008026123047, + "learning_rate": 4.215942600004586e-06, + "loss": 0.44061461091041565, + "mean_token_accuracy": 0.8572084307670593, + "num_tokens": 10457382.0, + "step": 1171 + }, + { + "epoch": 0.8905775075987842, + "grad_norm": 3.4304304122924805, + "learning_rate": 4.214418857832025e-06, + "loss": 0.44397830963134766, + "mean_token_accuracy": 0.842149019241333, + "num_tokens": 10460650.0, + "step": 1172 + }, + { + "epoch": 0.8913373860182371, + "grad_norm": 1.9021750688552856, + "learning_rate": 4.212893912414316e-06, + "loss": 0.3769867420196533, + "mean_token_accuracy": 0.8806171417236328, + "num_tokens": 10468214.0, + "step": 1173 + }, + { + "epoch": 0.89209726443769, + "grad_norm": 1.9704062938690186, + "learning_rate": 4.211367764821722e-06, + "loss": 0.5501819849014282, + "mean_token_accuracy": 0.8176811337471008, + "num_tokens": 10476739.0, + "step": 1174 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 1.4350415468215942, + "learning_rate": 4.209840416125353e-06, + "loss": 0.41897401213645935, + "mean_token_accuracy": 0.8498011827468872, + "num_tokens": 10491769.0, + "step": 1175 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 3.8237783908843994, + "learning_rate": 4.208311867397162e-06, + "loss": 0.5296977162361145, + "mean_token_accuracy": 0.8168715834617615, + "num_tokens": 10494958.0, + "step": 1176 + }, + { + "epoch": 0.8943768996960486, + "grad_norm": 2.04784893989563, + "learning_rate": 4.206782119709942e-06, + "loss": 0.476105272769928, + "mean_token_accuracy": 0.834011435508728, + "num_tokens": 10502077.0, + "step": 1177 + }, + { + "epoch": 0.8951367781155015, + "grad_norm": 1.8839610815048218, + "learning_rate": 4.205251174137329e-06, + "loss": 0.49628815054893494, + "mean_token_accuracy": 0.8212119936943054, + "num_tokens": 10510077.0, + "step": 1178 + }, + { + "epoch": 0.8958966565349544, + "grad_norm": 1.2100634574890137, + "learning_rate": 4.2037190317538e-06, + "loss": 0.4931519329547882, + "mean_token_accuracy": 0.8170043230056763, + "num_tokens": 10528373.0, + "step": 1179 + }, + { + "epoch": 0.8966565349544073, + "grad_norm": 1.884637713432312, + "learning_rate": 4.202185693634671e-06, + "loss": 0.4913347363471985, + "mean_token_accuracy": 0.8234949707984924, + "num_tokens": 10537108.0, + "step": 1180 + }, + { + "epoch": 0.8974164133738601, + "grad_norm": 1.5062434673309326, + "learning_rate": 4.200651160856099e-06, + "loss": 0.4160492420196533, + "mean_token_accuracy": 0.845937192440033, + "num_tokens": 10547577.0, + "step": 1181 + }, + { + "epoch": 0.898176291793313, + "grad_norm": 2.331169605255127, + "learning_rate": 4.1991154344950755e-06, + "loss": 0.6532632112503052, + "mean_token_accuracy": 0.7743191123008728, + "num_tokens": 10556328.0, + "step": 1182 + }, + { + "epoch": 0.898936170212766, + "grad_norm": 1.3538362979888916, + "learning_rate": 4.197578515629435e-06, + "loss": 0.4437566101551056, + "mean_token_accuracy": 0.8427901268005371, + "num_tokens": 10570026.0, + "step": 1183 + }, + { + "epoch": 0.8996960486322189, + "grad_norm": 2.3828957080841064, + "learning_rate": 4.196040405337846e-06, + "loss": 0.6185290217399597, + "mean_token_accuracy": 0.7969824075698853, + "num_tokens": 10576465.0, + "step": 1184 + }, + { + "epoch": 0.9004559270516718, + "grad_norm": 2.4759042263031006, + "learning_rate": 4.194501104699813e-06, + "loss": 0.46489226818084717, + "mean_token_accuracy": 0.8472316265106201, + "num_tokens": 10582034.0, + "step": 1185 + }, + { + "epoch": 0.9012158054711246, + "grad_norm": 1.9215164184570312, + "learning_rate": 4.192960614795676e-06, + "loss": 0.48001551628112793, + "mean_token_accuracy": 0.8371596336364746, + "num_tokens": 10590556.0, + "step": 1186 + }, + { + "epoch": 0.9019756838905775, + "grad_norm": 2.2717080116271973, + "learning_rate": 4.19141893670661e-06, + "loss": 0.40083563327789307, + "mean_token_accuracy": 0.8464195728302002, + "num_tokens": 10595661.0, + "step": 1187 + }, + { + "epoch": 0.9027355623100304, + "grad_norm": 2.187122344970703, + "learning_rate": 4.189876071514624e-06, + "loss": 0.4942901134490967, + "mean_token_accuracy": 0.8186990022659302, + "num_tokens": 10603366.0, + "step": 1188 + }, + { + "epoch": 0.9034954407294833, + "grad_norm": 1.542414665222168, + "learning_rate": 4.188332020302561e-06, + "loss": 0.4731982946395874, + "mean_token_accuracy": 0.8487229347229004, + "num_tokens": 10616203.0, + "step": 1189 + }, + { + "epoch": 0.9042553191489362, + "grad_norm": 0.9957579970359802, + "learning_rate": 4.186786784154096e-06, + "loss": 0.33211836218833923, + "mean_token_accuracy": 0.870644748210907, + "num_tokens": 10633294.0, + "step": 1190 + }, + { + "epoch": 0.9050151975683891, + "grad_norm": 2.593867540359497, + "learning_rate": 4.1852403641537344e-06, + "loss": 0.6825464963912964, + "mean_token_accuracy": 0.7716869115829468, + "num_tokens": 10640615.0, + "step": 1191 + }, + { + "epoch": 0.9057750759878419, + "grad_norm": 2.0424516201019287, + "learning_rate": 4.183692761386813e-06, + "loss": 0.5672709941864014, + "mean_token_accuracy": 0.7973801493644714, + "num_tokens": 10649845.0, + "step": 1192 + }, + { + "epoch": 0.9065349544072948, + "grad_norm": 1.429018259048462, + "learning_rate": 4.1821439769395e-06, + "loss": 0.5427846908569336, + "mean_token_accuracy": 0.8200292587280273, + "num_tokens": 10665898.0, + "step": 1193 + }, + { + "epoch": 0.9072948328267477, + "grad_norm": 1.9764264822006226, + "learning_rate": 4.180594011898791e-06, + "loss": 0.4784567356109619, + "mean_token_accuracy": 0.82924485206604, + "num_tokens": 10673595.0, + "step": 1194 + }, + { + "epoch": 0.9080547112462006, + "grad_norm": 1.4004309177398682, + "learning_rate": 4.1790428673525104e-06, + "loss": 0.4791432023048401, + "mean_token_accuracy": 0.8334879875183105, + "num_tokens": 10687892.0, + "step": 1195 + }, + { + "epoch": 0.9088145896656535, + "grad_norm": 2.2207727432250977, + "learning_rate": 4.177490544389313e-06, + "loss": 0.5089365243911743, + "mean_token_accuracy": 0.8270776271820068, + "num_tokens": 10694911.0, + "step": 1196 + }, + { + "epoch": 0.9095744680851063, + "grad_norm": 2.2890450954437256, + "learning_rate": 4.175937044098678e-06, + "loss": 0.5152267813682556, + "mean_token_accuracy": 0.8527299165725708, + "num_tokens": 10700512.0, + "step": 1197 + }, + { + "epoch": 0.9103343465045592, + "grad_norm": 1.7938050031661987, + "learning_rate": 4.1743823675709115e-06, + "loss": 0.3507300615310669, + "mean_token_accuracy": 0.8694599866867065, + "num_tokens": 10707953.0, + "step": 1198 + }, + { + "epoch": 0.9110942249240122, + "grad_norm": 1.4368808269500732, + "learning_rate": 4.172826515897146e-06, + "loss": 0.407418429851532, + "mean_token_accuracy": 0.8432893753051758, + "num_tokens": 10717485.0, + "step": 1199 + }, + { + "epoch": 0.9118541033434651, + "grad_norm": 1.735339879989624, + "learning_rate": 4.171269490169337e-06, + "loss": 0.46996885538101196, + "mean_token_accuracy": 0.8331948518753052, + "num_tokens": 10726160.0, + "step": 1200 + }, + { + "epoch": 0.912613981762918, + "grad_norm": 1.7859221696853638, + "learning_rate": 4.1697112914802665e-06, + "loss": 0.5325199365615845, + "mean_token_accuracy": 0.8179605007171631, + "num_tokens": 10736284.0, + "step": 1201 + }, + { + "epoch": 0.9133738601823708, + "grad_norm": 2.6394896507263184, + "learning_rate": 4.168151920923536e-06, + "loss": 0.4039744734764099, + "mean_token_accuracy": 0.8545527458190918, + "num_tokens": 10740673.0, + "step": 1202 + }, + { + "epoch": 0.9141337386018237, + "grad_norm": 1.910988211631775, + "learning_rate": 4.1665913795935755e-06, + "loss": 0.5190291404724121, + "mean_token_accuracy": 0.8203921318054199, + "num_tokens": 10751946.0, + "step": 1203 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 3.0006964206695557, + "learning_rate": 4.16502966858563e-06, + "loss": 0.5856777429580688, + "mean_token_accuracy": 0.8061224222183228, + "num_tokens": 10756795.0, + "step": 1204 + }, + { + "epoch": 0.9156534954407295, + "grad_norm": 1.7396167516708374, + "learning_rate": 4.163466788995768e-06, + "loss": 0.54935222864151, + "mean_token_accuracy": 0.8052443265914917, + "num_tokens": 10767202.0, + "step": 1205 + }, + { + "epoch": 0.9164133738601824, + "grad_norm": 2.143735885620117, + "learning_rate": 4.161902741920881e-06, + "loss": 0.5020298361778259, + "mean_token_accuracy": 0.8249630928039551, + "num_tokens": 10774329.0, + "step": 1206 + }, + { + "epoch": 0.9171732522796353, + "grad_norm": 2.8871893882751465, + "learning_rate": 4.160337528458676e-06, + "loss": 0.5154489278793335, + "mean_token_accuracy": 0.8276848793029785, + "num_tokens": 10778929.0, + "step": 1207 + }, + { + "epoch": 0.9179331306990881, + "grad_norm": 1.4642788171768188, + "learning_rate": 4.15877114970768e-06, + "loss": 0.5033774375915527, + "mean_token_accuracy": 0.8296241164207458, + "num_tokens": 10790928.0, + "step": 1208 + }, + { + "epoch": 0.918693009118541, + "grad_norm": 1.8313497304916382, + "learning_rate": 4.1572036067672386e-06, + "loss": 0.5674909353256226, + "mean_token_accuracy": 0.7975562214851379, + "num_tokens": 10801372.0, + "step": 1209 + }, + { + "epoch": 0.9194528875379939, + "grad_norm": 2.005958080291748, + "learning_rate": 4.155634900737513e-06, + "loss": 0.5557019114494324, + "mean_token_accuracy": 0.8141391277313232, + "num_tokens": 10809150.0, + "step": 1210 + }, + { + "epoch": 0.9202127659574468, + "grad_norm": 2.333519697189331, + "learning_rate": 4.154065032719482e-06, + "loss": 0.6990420818328857, + "mean_token_accuracy": 0.7565394043922424, + "num_tokens": 10816612.0, + "step": 1211 + }, + { + "epoch": 0.9209726443768997, + "grad_norm": 1.4472655057907104, + "learning_rate": 4.152494003814939e-06, + "loss": 0.541398286819458, + "mean_token_accuracy": 0.8027358055114746, + "num_tokens": 10833840.0, + "step": 1212 + }, + { + "epoch": 0.9217325227963525, + "grad_norm": 1.6183619499206543, + "learning_rate": 4.150921815126493e-06, + "loss": 0.6096762418746948, + "mean_token_accuracy": 0.7994354963302612, + "num_tokens": 10846367.0, + "step": 1213 + }, + { + "epoch": 0.9224924012158054, + "grad_norm": 2.614919900894165, + "learning_rate": 4.149348467757566e-06, + "loss": 0.41846764087677, + "mean_token_accuracy": 0.8555068969726562, + "num_tokens": 10850836.0, + "step": 1214 + }, + { + "epoch": 0.9232522796352584, + "grad_norm": 1.4419831037521362, + "learning_rate": 4.147773962812393e-06, + "loss": 0.4139535427093506, + "mean_token_accuracy": 0.845671534538269, + "num_tokens": 10864228.0, + "step": 1215 + }, + { + "epoch": 0.9240121580547113, + "grad_norm": 2.3868865966796875, + "learning_rate": 4.146198301396025e-06, + "loss": 0.3357275128364563, + "mean_token_accuracy": 0.8829520344734192, + "num_tokens": 10868920.0, + "step": 1216 + }, + { + "epoch": 0.9247720364741642, + "grad_norm": 1.7685474157333374, + "learning_rate": 4.14462148461432e-06, + "loss": 0.45333072543144226, + "mean_token_accuracy": 0.8505891561508179, + "num_tokens": 10877286.0, + "step": 1217 + }, + { + "epoch": 0.925531914893617, + "grad_norm": 1.7627625465393066, + "learning_rate": 4.143043513573949e-06, + "loss": 0.5028705596923828, + "mean_token_accuracy": 0.825471043586731, + "num_tokens": 10887047.0, + "step": 1218 + }, + { + "epoch": 0.9262917933130699, + "grad_norm": 1.3168725967407227, + "learning_rate": 4.141464389382392e-06, + "loss": 0.5494637489318848, + "mean_token_accuracy": 0.8121747970581055, + "num_tokens": 10903599.0, + "step": 1219 + }, + { + "epoch": 0.9270516717325228, + "grad_norm": 2.5180399417877197, + "learning_rate": 4.13988411314794e-06, + "loss": 0.6134277582168579, + "mean_token_accuracy": 0.7983006834983826, + "num_tokens": 10909791.0, + "step": 1220 + }, + { + "epoch": 0.9278115501519757, + "grad_norm": 1.1889166831970215, + "learning_rate": 4.13830268597969e-06, + "loss": 0.36713096499443054, + "mean_token_accuracy": 0.8416121006011963, + "num_tokens": 10925794.0, + "step": 1221 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 2.142422676086426, + "learning_rate": 4.136720108987552e-06, + "loss": 0.4427933096885681, + "mean_token_accuracy": 0.8427745699882507, + "num_tokens": 10931622.0, + "step": 1222 + }, + { + "epoch": 0.9293313069908815, + "grad_norm": 1.908564567565918, + "learning_rate": 4.1351363832822364e-06, + "loss": 0.5088109374046326, + "mean_token_accuracy": 0.8309272527694702, + "num_tokens": 10940843.0, + "step": 1223 + }, + { + "epoch": 0.9300911854103343, + "grad_norm": 1.2862322330474854, + "learning_rate": 4.133551509975264e-06, + "loss": 0.3963761329650879, + "mean_token_accuracy": 0.8602159023284912, + "num_tokens": 10954481.0, + "step": 1224 + }, + { + "epoch": 0.9308510638297872, + "grad_norm": 1.5876200199127197, + "learning_rate": 4.13196549017896e-06, + "loss": 0.4311184287071228, + "mean_token_accuracy": 0.8460899591445923, + "num_tokens": 10963501.0, + "step": 1225 + }, + { + "epoch": 0.9316109422492401, + "grad_norm": 2.459878444671631, + "learning_rate": 4.130378325006453e-06, + "loss": 0.5016295313835144, + "mean_token_accuracy": 0.8125218152999878, + "num_tokens": 10968850.0, + "step": 1226 + }, + { + "epoch": 0.932370820668693, + "grad_norm": 2.059718370437622, + "learning_rate": 4.128790015571679e-06, + "loss": 0.48982277512550354, + "mean_token_accuracy": 0.8327049016952515, + "num_tokens": 10976642.0, + "step": 1227 + }, + { + "epoch": 0.9331306990881459, + "grad_norm": 1.3719185590744019, + "learning_rate": 4.127200562989372e-06, + "loss": 0.38778752088546753, + "mean_token_accuracy": 0.8623501062393188, + "num_tokens": 10988703.0, + "step": 1228 + }, + { + "epoch": 0.9338905775075987, + "grad_norm": 1.302140712738037, + "learning_rate": 4.125609968375073e-06, + "loss": 0.4887842535972595, + "mean_token_accuracy": 0.8322232961654663, + "num_tokens": 11005981.0, + "step": 1229 + }, + { + "epoch": 0.9346504559270516, + "grad_norm": 1.819624423980713, + "learning_rate": 4.12401823284512e-06, + "loss": 0.49825209379196167, + "mean_token_accuracy": 0.8278916478157043, + "num_tokens": 11014145.0, + "step": 1230 + }, + { + "epoch": 0.9354103343465046, + "grad_norm": 1.2762807607650757, + "learning_rate": 4.122425357516658e-06, + "loss": 0.433994323015213, + "mean_token_accuracy": 0.853028416633606, + "num_tokens": 11029232.0, + "step": 1231 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.2171671390533447, + "learning_rate": 4.1208313435076255e-06, + "loss": 0.38436949253082275, + "mean_token_accuracy": 0.8616260290145874, + "num_tokens": 11034743.0, + "step": 1232 + }, + { + "epoch": 0.9369300911854104, + "grad_norm": 1.355879545211792, + "learning_rate": 4.119236191936764e-06, + "loss": 0.5378084182739258, + "mean_token_accuracy": 0.8256701231002808, + "num_tokens": 11048149.0, + "step": 1233 + }, + { + "epoch": 0.9376899696048632, + "grad_norm": 2.66812801361084, + "learning_rate": 4.117639903923611e-06, + "loss": 0.5236451625823975, + "mean_token_accuracy": 0.8431973457336426, + "num_tokens": 11052295.0, + "step": 1234 + }, + { + "epoch": 0.9384498480243161, + "grad_norm": 1.5740545988082886, + "learning_rate": 4.116042480588505e-06, + "loss": 0.44322824478149414, + "mean_token_accuracy": 0.8436908721923828, + "num_tokens": 11062066.0, + "step": 1235 + }, + { + "epoch": 0.939209726443769, + "grad_norm": 1.230706810951233, + "learning_rate": 4.114443923052577e-06, + "loss": 0.3325323462486267, + "mean_token_accuracy": 0.8674666881561279, + "num_tokens": 11074300.0, + "step": 1236 + }, + { + "epoch": 0.9399696048632219, + "grad_norm": 1.9870070219039917, + "learning_rate": 4.112844232437757e-06, + "loss": 0.5711548328399658, + "mean_token_accuracy": 0.8081738948822021, + "num_tokens": 11082297.0, + "step": 1237 + }, + { + "epoch": 0.9407294832826748, + "grad_norm": 1.3020970821380615, + "learning_rate": 4.11124340986677e-06, + "loss": 0.4187922477722168, + "mean_token_accuracy": 0.8566171526908875, + "num_tokens": 11096810.0, + "step": 1238 + }, + { + "epoch": 0.9414893617021277, + "grad_norm": 2.1399197578430176, + "learning_rate": 4.109641456463135e-06, + "loss": 0.5293116569519043, + "mean_token_accuracy": 0.8176157474517822, + "num_tokens": 11102761.0, + "step": 1239 + }, + { + "epoch": 0.9422492401215805, + "grad_norm": 1.3503763675689697, + "learning_rate": 4.108038373351163e-06, + "loss": 0.4907652735710144, + "mean_token_accuracy": 0.8204987049102783, + "num_tokens": 11118480.0, + "step": 1240 + }, + { + "epoch": 0.9430091185410334, + "grad_norm": 1.9571399688720703, + "learning_rate": 4.106434161655962e-06, + "loss": 0.4709656536579132, + "mean_token_accuracy": 0.8371885418891907, + "num_tokens": 11126265.0, + "step": 1241 + }, + { + "epoch": 0.9437689969604863, + "grad_norm": 2.1277313232421875, + "learning_rate": 4.104828822503427e-06, + "loss": 0.4010283350944519, + "mean_token_accuracy": 0.8586333990097046, + "num_tokens": 11133022.0, + "step": 1242 + }, + { + "epoch": 0.9445288753799392, + "grad_norm": 1.6745036840438843, + "learning_rate": 4.103222357020248e-06, + "loss": 0.562545657157898, + "mean_token_accuracy": 0.8052060604095459, + "num_tokens": 11145255.0, + "step": 1243 + }, + { + "epoch": 0.9452887537993921, + "grad_norm": 2.3616299629211426, + "learning_rate": 4.101614766333904e-06, + "loss": 0.5878340601921082, + "mean_token_accuracy": 0.796745777130127, + "num_tokens": 11152020.0, + "step": 1244 + }, + { + "epoch": 0.9460486322188449, + "grad_norm": 1.6182078123092651, + "learning_rate": 4.100006051572664e-06, + "loss": 0.5357589721679688, + "mean_token_accuracy": 0.8089962005615234, + "num_tokens": 11163112.0, + "step": 1245 + }, + { + "epoch": 0.9468085106382979, + "grad_norm": 1.911770224571228, + "learning_rate": 4.098396213865587e-06, + "loss": 0.49805426597595215, + "mean_token_accuracy": 0.8289647102355957, + "num_tokens": 11171768.0, + "step": 1246 + }, + { + "epoch": 0.9475683890577508, + "grad_norm": 1.649155616760254, + "learning_rate": 4.096785254342518e-06, + "loss": 0.5756166577339172, + "mean_token_accuracy": 0.807680606842041, + "num_tokens": 11183527.0, + "step": 1247 + }, + { + "epoch": 0.9483282674772037, + "grad_norm": 1.8922761678695679, + "learning_rate": 4.095173174134091e-06, + "loss": 0.44688963890075684, + "mean_token_accuracy": 0.8375608921051025, + "num_tokens": 11191494.0, + "step": 1248 + }, + { + "epoch": 0.9490881458966566, + "grad_norm": 2.9044547080993652, + "learning_rate": 4.093559974371725e-06, + "loss": 0.48609739542007446, + "mean_token_accuracy": 0.8404892086982727, + "num_tokens": 11195837.0, + "step": 1249 + }, + { + "epoch": 0.9498480243161094, + "grad_norm": 2.287506580352783, + "learning_rate": 4.091945656187626e-06, + "loss": 0.5260225534439087, + "mean_token_accuracy": 0.8181945085525513, + "num_tokens": 11202174.0, + "step": 1250 + }, + { + "epoch": 0.9506079027355623, + "grad_norm": 1.7908886671066284, + "learning_rate": 4.090330220714785e-06, + "loss": 0.4207724928855896, + "mean_token_accuracy": 0.8616912364959717, + "num_tokens": 11209995.0, + "step": 1251 + }, + { + "epoch": 0.9513677811550152, + "grad_norm": 2.905418634414673, + "learning_rate": 4.0887136690869774e-06, + "loss": 0.4209241271018982, + "mean_token_accuracy": 0.8561323285102844, + "num_tokens": 11213799.0, + "step": 1252 + }, + { + "epoch": 0.9521276595744681, + "grad_norm": 2.814150333404541, + "learning_rate": 4.08709600243876e-06, + "loss": 0.36855608224868774, + "mean_token_accuracy": 0.8764539361000061, + "num_tokens": 11217643.0, + "step": 1253 + }, + { + "epoch": 0.952887537993921, + "grad_norm": 1.9385707378387451, + "learning_rate": 4.0854772219054735e-06, + "loss": 0.531031608581543, + "mean_token_accuracy": 0.80600905418396, + "num_tokens": 11225871.0, + "step": 1254 + }, + { + "epoch": 0.9536474164133738, + "grad_norm": 2.103058099746704, + "learning_rate": 4.083857328623243e-06, + "loss": 0.4576364755630493, + "mean_token_accuracy": 0.8447524905204773, + "num_tokens": 11231829.0, + "step": 1255 + }, + { + "epoch": 0.9544072948328267, + "grad_norm": 1.7518818378448486, + "learning_rate": 4.082236323728969e-06, + "loss": 0.5386767983436584, + "mean_token_accuracy": 0.8055596351623535, + "num_tokens": 11240977.0, + "step": 1256 + }, + { + "epoch": 0.9551671732522796, + "grad_norm": 1.8434966802597046, + "learning_rate": 4.0806142083603365e-06, + "loss": 0.5415925979614258, + "mean_token_accuracy": 0.809962272644043, + "num_tokens": 11249616.0, + "step": 1257 + }, + { + "epoch": 0.9559270516717325, + "grad_norm": 1.7341015338897705, + "learning_rate": 4.078990983655807e-06, + "loss": 0.4621101915836334, + "mean_token_accuracy": 0.8330386877059937, + "num_tokens": 11258616.0, + "step": 1258 + }, + { + "epoch": 0.9566869300911854, + "grad_norm": 1.8589727878570557, + "learning_rate": 4.077366650754624e-06, + "loss": 0.4031238555908203, + "mean_token_accuracy": 0.842434287071228, + "num_tokens": 11266006.0, + "step": 1259 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 1.657175898551941, + "learning_rate": 4.075741210796806e-06, + "loss": 0.41686388850212097, + "mean_token_accuracy": 0.8443650007247925, + "num_tokens": 11275601.0, + "step": 1260 + }, + { + "epoch": 0.9582066869300911, + "grad_norm": 2.4303717613220215, + "learning_rate": 4.07411466492315e-06, + "loss": 0.4554435610771179, + "mean_token_accuracy": 0.853043794631958, + "num_tokens": 11280650.0, + "step": 1261 + }, + { + "epoch": 0.958966565349544, + "grad_norm": 2.3653745651245117, + "learning_rate": 4.072487014275228e-06, + "loss": 0.4304995536804199, + "mean_token_accuracy": 0.8462260961532593, + "num_tokens": 11285637.0, + "step": 1262 + }, + { + "epoch": 0.959726443768997, + "grad_norm": 1.6689718961715698, + "learning_rate": 4.070858259995388e-06, + "loss": 0.5290807485580444, + "mean_token_accuracy": 0.8176917433738708, + "num_tokens": 11299110.0, + "step": 1263 + }, + { + "epoch": 0.9604863221884499, + "grad_norm": 2.103879451751709, + "learning_rate": 4.069228403226751e-06, + "loss": 0.4620879888534546, + "mean_token_accuracy": 0.835270345211029, + "num_tokens": 11305564.0, + "step": 1264 + }, + { + "epoch": 0.9612462006079028, + "grad_norm": 2.139012575149536, + "learning_rate": 4.067597445113216e-06, + "loss": 0.5143396258354187, + "mean_token_accuracy": 0.8191739320755005, + "num_tokens": 11311870.0, + "step": 1265 + }, + { + "epoch": 0.9620060790273556, + "grad_norm": 1.3971210718154907, + "learning_rate": 4.06596538679945e-06, + "loss": 0.472080260515213, + "mean_token_accuracy": 0.8321092128753662, + "num_tokens": 11323970.0, + "step": 1266 + }, + { + "epoch": 0.9627659574468085, + "grad_norm": 1.4965174198150635, + "learning_rate": 4.064332229430895e-06, + "loss": 0.359701007604599, + "mean_token_accuracy": 0.8903120160102844, + "num_tokens": 11333412.0, + "step": 1267 + }, + { + "epoch": 0.9635258358662614, + "grad_norm": 1.1898726224899292, + "learning_rate": 4.062697974153764e-06, + "loss": 0.3423798084259033, + "mean_token_accuracy": 0.8661491870880127, + "num_tokens": 11347657.0, + "step": 1268 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 1.4952168464660645, + "learning_rate": 4.06106262211504e-06, + "loss": 0.4214417338371277, + "mean_token_accuracy": 0.8362159729003906, + "num_tokens": 11357786.0, + "step": 1269 + }, + { + "epoch": 0.9650455927051672, + "grad_norm": 1.7949583530426025, + "learning_rate": 4.059426174462476e-06, + "loss": 0.59087735414505, + "mean_token_accuracy": 0.7965556979179382, + "num_tokens": 11370561.0, + "step": 1270 + }, + { + "epoch": 0.96580547112462, + "grad_norm": 1.8973214626312256, + "learning_rate": 4.057788632344594e-06, + "loss": 0.47525322437286377, + "mean_token_accuracy": 0.8317365050315857, + "num_tokens": 11378507.0, + "step": 1271 + }, + { + "epoch": 0.9665653495440729, + "grad_norm": 1.8665250539779663, + "learning_rate": 4.056149996910683e-06, + "loss": 0.3537125587463379, + "mean_token_accuracy": 0.8921569585800171, + "num_tokens": 11385186.0, + "step": 1272 + }, + { + "epoch": 0.9673252279635258, + "grad_norm": 1.5072317123413086, + "learning_rate": 4.054510269310803e-06, + "loss": 0.5145624876022339, + "mean_token_accuracy": 0.8265488147735596, + "num_tokens": 11397125.0, + "step": 1273 + }, + { + "epoch": 0.9680851063829787, + "grad_norm": 1.520525574684143, + "learning_rate": 4.052869450695776e-06, + "loss": 0.44322293996810913, + "mean_token_accuracy": 0.8403642177581787, + "num_tokens": 11409919.0, + "step": 1274 + }, + { + "epoch": 0.9688449848024316, + "grad_norm": 1.3764475584030151, + "learning_rate": 4.051227542217192e-06, + "loss": 0.5774400234222412, + "mean_token_accuracy": 0.804118275642395, + "num_tokens": 11425900.0, + "step": 1275 + }, + { + "epoch": 0.9696048632218845, + "grad_norm": 1.3922648429870605, + "learning_rate": 4.049584545027406e-06, + "loss": 0.42727944254875183, + "mean_token_accuracy": 0.8654505014419556, + "num_tokens": 11438787.0, + "step": 1276 + }, + { + "epoch": 0.9703647416413373, + "grad_norm": 1.8505840301513672, + "learning_rate": 4.047940460279537e-06, + "loss": 0.490803062915802, + "mean_token_accuracy": 0.8340574502944946, + "num_tokens": 11447997.0, + "step": 1277 + }, + { + "epoch": 0.9711246200607903, + "grad_norm": 2.28271222114563, + "learning_rate": 4.046295289127466e-06, + "loss": 0.588828444480896, + "mean_token_accuracy": 0.833497166633606, + "num_tokens": 11454072.0, + "step": 1278 + }, + { + "epoch": 0.9718844984802432, + "grad_norm": 2.4242560863494873, + "learning_rate": 4.044649032725836e-06, + "loss": 0.5128831267356873, + "mean_token_accuracy": 0.8225122690200806, + "num_tokens": 11460211.0, + "step": 1279 + }, + { + "epoch": 0.9726443768996961, + "grad_norm": 2.1738455295562744, + "learning_rate": 4.0430016922300566e-06, + "loss": 0.441631942987442, + "mean_token_accuracy": 0.841723620891571, + "num_tokens": 11466814.0, + "step": 1280 + }, + { + "epoch": 0.973404255319149, + "grad_norm": 2.541599988937378, + "learning_rate": 4.0413532687962926e-06, + "loss": 0.5062629580497742, + "mean_token_accuracy": 0.8013502359390259, + "num_tokens": 11472371.0, + "step": 1281 + }, + { + "epoch": 0.9741641337386018, + "grad_norm": 2.8011014461517334, + "learning_rate": 4.039703763581472e-06, + "loss": 0.5061966776847839, + "mean_token_accuracy": 0.829810380935669, + "num_tokens": 11476672.0, + "step": 1282 + }, + { + "epoch": 0.9749240121580547, + "grad_norm": 2.4505462646484375, + "learning_rate": 4.038053177743279e-06, + "loss": 0.43407535552978516, + "mean_token_accuracy": 0.8428469896316528, + "num_tokens": 11481297.0, + "step": 1283 + }, + { + "epoch": 0.9756838905775076, + "grad_norm": 2.1618378162384033, + "learning_rate": 4.036401512440161e-06, + "loss": 0.6056663393974304, + "mean_token_accuracy": 0.7977457642555237, + "num_tokens": 11488657.0, + "step": 1284 + }, + { + "epoch": 0.9764437689969605, + "grad_norm": 1.9192147254943848, + "learning_rate": 4.034748768831319e-06, + "loss": 0.524390697479248, + "mean_token_accuracy": 0.8120636940002441, + "num_tokens": 11496485.0, + "step": 1285 + }, + { + "epoch": 0.9772036474164134, + "grad_norm": 2.766435384750366, + "learning_rate": 4.033094948076713e-06, + "loss": 0.5494908690452576, + "mean_token_accuracy": 0.8141890168190002, + "num_tokens": 11501341.0, + "step": 1286 + }, + { + "epoch": 0.9779635258358662, + "grad_norm": 1.3519539833068848, + "learning_rate": 4.031440051337056e-06, + "loss": 0.4339691400527954, + "mean_token_accuracy": 0.8400131464004517, + "num_tokens": 11512843.0, + "step": 1287 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 1.2492141723632812, + "learning_rate": 4.02978407977382e-06, + "loss": 0.4433518052101135, + "mean_token_accuracy": 0.8432940244674683, + "num_tokens": 11530227.0, + "step": 1288 + }, + { + "epoch": 0.979483282674772, + "grad_norm": 1.6597715616226196, + "learning_rate": 4.02812703454923e-06, + "loss": 0.602222204208374, + "mean_token_accuracy": 0.786965548992157, + "num_tokens": 11543955.0, + "step": 1289 + }, + { + "epoch": 0.9802431610942249, + "grad_norm": 1.6621816158294678, + "learning_rate": 4.026468916826262e-06, + "loss": 0.35662174224853516, + "mean_token_accuracy": 0.8716133832931519, + "num_tokens": 11552064.0, + "step": 1290 + }, + { + "epoch": 0.9810030395136778, + "grad_norm": 4.539844989776611, + "learning_rate": 4.024809727768648e-06, + "loss": 0.543423593044281, + "mean_token_accuracy": 0.8293194770812988, + "num_tokens": 11555595.0, + "step": 1291 + }, + { + "epoch": 0.9817629179331308, + "grad_norm": 1.4026556015014648, + "learning_rate": 4.023149468540871e-06, + "loss": 0.4301237165927887, + "mean_token_accuracy": 0.8358224630355835, + "num_tokens": 11572275.0, + "step": 1292 + }, + { + "epoch": 0.9825227963525835, + "grad_norm": 1.611262321472168, + "learning_rate": 4.021488140308165e-06, + "loss": 0.5378580689430237, + "mean_token_accuracy": 0.8173760771751404, + "num_tokens": 11584299.0, + "step": 1293 + }, + { + "epoch": 0.9832826747720365, + "grad_norm": 4.138631820678711, + "learning_rate": 4.019825744236514e-06, + "loss": 0.40272149443626404, + "mean_token_accuracy": 0.8648844957351685, + "num_tokens": 11586705.0, + "step": 1294 + }, + { + "epoch": 0.9840425531914894, + "grad_norm": 3.177703619003296, + "learning_rate": 4.018162281492651e-06, + "loss": 0.5320103168487549, + "mean_token_accuracy": 0.8250276446342468, + "num_tokens": 11590689.0, + "step": 1295 + }, + { + "epoch": 0.9848024316109423, + "grad_norm": 2.727597713470459, + "learning_rate": 4.016497753244058e-06, + "loss": 0.5662774443626404, + "mean_token_accuracy": 0.8074625730514526, + "num_tokens": 11596092.0, + "step": 1296 + }, + { + "epoch": 0.9855623100303952, + "grad_norm": 1.485139012336731, + "learning_rate": 4.014832160658966e-06, + "loss": 0.5414972305297852, + "mean_token_accuracy": 0.8082696199417114, + "num_tokens": 11613785.0, + "step": 1297 + }, + { + "epoch": 0.986322188449848, + "grad_norm": 2.4025990962982178, + "learning_rate": 4.013165504906352e-06, + "loss": 0.6556503772735596, + "mean_token_accuracy": 0.7785214781761169, + "num_tokens": 11620421.0, + "step": 1298 + }, + { + "epoch": 0.9870820668693009, + "grad_norm": 1.878273606300354, + "learning_rate": 4.011497787155938e-06, + "loss": 0.4221133887767792, + "mean_token_accuracy": 0.850035548210144, + "num_tokens": 11627998.0, + "step": 1299 + }, + { + "epoch": 0.9878419452887538, + "grad_norm": 2.0430715084075928, + "learning_rate": 4.009829008578192e-06, + "loss": 0.5205984711647034, + "mean_token_accuracy": 0.819183349609375, + "num_tokens": 11636279.0, + "step": 1300 + }, + { + "epoch": 0.9886018237082067, + "grad_norm": 3.4769439697265625, + "learning_rate": 4.00815917034433e-06, + "loss": 0.5449948310852051, + "mean_token_accuracy": 0.8240023851394653, + "num_tokens": 11639638.0, + "step": 1301 + }, + { + "epoch": 0.9893617021276596, + "grad_norm": 2.4783987998962402, + "learning_rate": 4.006488273626307e-06, + "loss": 0.4316832423210144, + "mean_token_accuracy": 0.8474695086479187, + "num_tokens": 11645463.0, + "step": 1302 + }, + { + "epoch": 0.9901215805471124, + "grad_norm": 1.881475567817688, + "learning_rate": 4.004816319596822e-06, + "loss": 0.5157331824302673, + "mean_token_accuracy": 0.826042652130127, + "num_tokens": 11653955.0, + "step": 1303 + }, + { + "epoch": 0.9908814589665653, + "grad_norm": 2.6569254398345947, + "learning_rate": 4.003143309429317e-06, + "loss": 0.46492767333984375, + "mean_token_accuracy": 0.8320850133895874, + "num_tokens": 11659357.0, + "step": 1304 + }, + { + "epoch": 0.9916413373860182, + "grad_norm": 2.4917593002319336, + "learning_rate": 4.0014692442979756e-06, + "loss": 0.459585040807724, + "mean_token_accuracy": 0.8457611799240112, + "num_tokens": 11664207.0, + "step": 1305 + }, + { + "epoch": 0.9924012158054711, + "grad_norm": 2.6885526180267334, + "learning_rate": 3.999794125377721e-06, + "loss": 0.4677402973175049, + "mean_token_accuracy": 0.8307361602783203, + "num_tokens": 11668879.0, + "step": 1306 + }, + { + "epoch": 0.993161094224924, + "grad_norm": 1.9737319946289062, + "learning_rate": 3.998117953844215e-06, + "loss": 0.44684839248657227, + "mean_token_accuracy": 0.8367687463760376, + "num_tokens": 11676081.0, + "step": 1307 + }, + { + "epoch": 0.993920972644377, + "grad_norm": 1.4333021640777588, + "learning_rate": 3.996440730873861e-06, + "loss": 0.526146650314331, + "mean_token_accuracy": 0.816251814365387, + "num_tokens": 11689333.0, + "step": 1308 + }, + { + "epoch": 0.9946808510638298, + "grad_norm": 1.3689230680465698, + "learning_rate": 3.9947624576437975e-06, + "loss": 0.40214329957962036, + "mean_token_accuracy": 0.8610327839851379, + "num_tokens": 11701540.0, + "step": 1309 + }, + { + "epoch": 0.9954407294832827, + "grad_norm": 1.2435375452041626, + "learning_rate": 3.9930831353319025e-06, + "loss": 0.4532913267612457, + "mean_token_accuracy": 0.8415389060974121, + "num_tokens": 11717920.0, + "step": 1310 + }, + { + "epoch": 0.9962006079027356, + "grad_norm": 1.9968011379241943, + "learning_rate": 3.9914027651167866e-06, + "loss": 0.46954160928726196, + "mean_token_accuracy": 0.8351103663444519, + "num_tokens": 11724999.0, + "step": 1311 + }, + { + "epoch": 0.9969604863221885, + "grad_norm": 1.9521311521530151, + "learning_rate": 3.989721348177801e-06, + "loss": 0.5068016052246094, + "mean_token_accuracy": 0.8220845460891724, + "num_tokens": 11732569.0, + "step": 1312 + }, + { + "epoch": 0.9977203647416414, + "grad_norm": 2.7332582473754883, + "learning_rate": 3.988038885695028e-06, + "loss": 0.4154692590236664, + "mean_token_accuracy": 0.8493857383728027, + "num_tokens": 11736759.0, + "step": 1313 + }, + { + "epoch": 0.9984802431610942, + "grad_norm": 1.8656952381134033, + "learning_rate": 3.986355378849284e-06, + "loss": 0.4151354134082794, + "mean_token_accuracy": 0.83440101146698, + "num_tokens": 11743827.0, + "step": 1314 + }, + { + "epoch": 0.9992401215805471, + "grad_norm": 1.304006576538086, + "learning_rate": 3.984670828822118e-06, + "loss": 0.4926128089427948, + "mean_token_accuracy": 0.8603005409240723, + "num_tokens": 11757707.0, + "step": 1315 + }, + { + "epoch": 1.0, + "grad_norm": 1.497079610824585, + "learning_rate": 3.982985236795815e-06, + "loss": 0.43342477083206177, + "mean_token_accuracy": 0.8550825119018555, + "num_tokens": 11769678.0, + "step": 1316 + }, + { + "epoch": 1.000759878419453, + "grad_norm": 2.870274543762207, + "learning_rate": 3.981298603953385e-06, + "loss": 0.3723528981208801, + "mean_token_accuracy": 0.8745899796485901, + "num_tokens": 11773290.0, + "step": 1317 + }, + { + "epoch": 1.0015197568389058, + "grad_norm": 1.3442503213882446, + "learning_rate": 3.979610931478574e-06, + "loss": 0.34688329696655273, + "mean_token_accuracy": 0.8749074935913086, + "num_tokens": 11786400.0, + "step": 1318 + }, + { + "epoch": 1.0022796352583587, + "grad_norm": 1.7272238731384277, + "learning_rate": 3.977922220555855e-06, + "loss": 0.28274932503700256, + "mean_token_accuracy": 0.896713137626648, + "num_tokens": 11793059.0, + "step": 1319 + }, + { + "epoch": 1.0030395136778116, + "grad_norm": 1.7362451553344727, + "learning_rate": 3.976232472370431e-06, + "loss": 0.5494794845581055, + "mean_token_accuracy": 0.8341718912124634, + "num_tokens": 11802593.0, + "step": 1320 + }, + { + "epoch": 1.0037993920972645, + "grad_norm": 1.3316494226455688, + "learning_rate": 3.97454168810823e-06, + "loss": 0.41505366563796997, + "mean_token_accuracy": 0.8581969738006592, + "num_tokens": 11813925.0, + "step": 1321 + }, + { + "epoch": 1.0045592705167172, + "grad_norm": 1.6152615547180176, + "learning_rate": 3.972849868955913e-06, + "loss": 0.44761013984680176, + "mean_token_accuracy": 0.8413045406341553, + "num_tokens": 11825709.0, + "step": 1322 + }, + { + "epoch": 1.0053191489361701, + "grad_norm": 2.1172471046447754, + "learning_rate": 3.97115701610086e-06, + "loss": 0.3903353810310364, + "mean_token_accuracy": 0.8662760257720947, + "num_tokens": 11832070.0, + "step": 1323 + }, + { + "epoch": 1.006079027355623, + "grad_norm": 1.5923868417739868, + "learning_rate": 3.969463130731183e-06, + "loss": 0.4491051137447357, + "mean_token_accuracy": 0.8677828311920166, + "num_tokens": 11843154.0, + "step": 1324 + }, + { + "epoch": 1.006838905775076, + "grad_norm": 1.6848995685577393, + "learning_rate": 3.967768214035716e-06, + "loss": 0.45765817165374756, + "mean_token_accuracy": 0.8401060104370117, + "num_tokens": 11854826.0, + "step": 1325 + }, + { + "epoch": 1.0075987841945289, + "grad_norm": 2.3739020824432373, + "learning_rate": 3.966072267204014e-06, + "loss": 0.4482722580432892, + "mean_token_accuracy": 0.8368916511535645, + "num_tokens": 11860559.0, + "step": 1326 + }, + { + "epoch": 1.0083586626139818, + "grad_norm": 1.5403034687042236, + "learning_rate": 3.964375291426361e-06, + "loss": 0.35589972138404846, + "mean_token_accuracy": 0.8728118538856506, + "num_tokens": 11871959.0, + "step": 1327 + }, + { + "epoch": 1.0091185410334347, + "grad_norm": 1.6750119924545288, + "learning_rate": 3.962677287893758e-06, + "loss": 0.35873427987098694, + "mean_token_accuracy": 0.9027186632156372, + "num_tokens": 11881818.0, + "step": 1328 + }, + { + "epoch": 1.0098784194528876, + "grad_norm": 1.5489170551300049, + "learning_rate": 3.9609782577979305e-06, + "loss": 0.3634672462940216, + "mean_token_accuracy": 0.8582607507705688, + "num_tokens": 11891084.0, + "step": 1329 + }, + { + "epoch": 1.0106382978723405, + "grad_norm": 2.43859601020813, + "learning_rate": 3.959278202331323e-06, + "loss": 0.3640799820423126, + "mean_token_accuracy": 0.88062584400177, + "num_tokens": 11896032.0, + "step": 1330 + }, + { + "epoch": 1.0113981762917934, + "grad_norm": 3.612184524536133, + "learning_rate": 3.9575771226870986e-06, + "loss": 0.3733130097389221, + "mean_token_accuracy": 0.8946067094802856, + "num_tokens": 11899479.0, + "step": 1331 + }, + { + "epoch": 1.012158054711246, + "grad_norm": 1.541355848312378, + "learning_rate": 3.955875020059141e-06, + "loss": 0.320593923330307, + "mean_token_accuracy": 0.9057406783103943, + "num_tokens": 11910179.0, + "step": 1332 + }, + { + "epoch": 1.012917933130699, + "grad_norm": 2.0565030574798584, + "learning_rate": 3.954171895642052e-06, + "loss": 0.3341682553291321, + "mean_token_accuracy": 0.8829344511032104, + "num_tokens": 11916489.0, + "step": 1333 + }, + { + "epoch": 1.013677811550152, + "grad_norm": 2.9732539653778076, + "learning_rate": 3.9524677506311505e-06, + "loss": 0.38488566875457764, + "mean_token_accuracy": 0.8752974271774292, + "num_tokens": 11920682.0, + "step": 1334 + }, + { + "epoch": 1.0144376899696048, + "grad_norm": 2.7697458267211914, + "learning_rate": 3.950762586222469e-06, + "loss": 0.39864760637283325, + "mean_token_accuracy": 0.8593167662620544, + "num_tokens": 11925233.0, + "step": 1335 + }, + { + "epoch": 1.0151975683890577, + "grad_norm": 2.2302119731903076, + "learning_rate": 3.949056403612758e-06, + "loss": 0.3985682725906372, + "mean_token_accuracy": 0.8677899837493896, + "num_tokens": 11932000.0, + "step": 1336 + }, + { + "epoch": 1.0159574468085106, + "grad_norm": 2.360572576522827, + "learning_rate": 3.947349203999485e-06, + "loss": 0.36940714716911316, + "mean_token_accuracy": 0.8760676383972168, + "num_tokens": 11937569.0, + "step": 1337 + }, + { + "epoch": 1.0167173252279635, + "grad_norm": 1.3383921384811401, + "learning_rate": 3.945640988580824e-06, + "loss": 0.40628793835639954, + "mean_token_accuracy": 0.866442084312439, + "num_tokens": 11955679.0, + "step": 1338 + }, + { + "epoch": 1.0174772036474165, + "grad_norm": 2.1502623558044434, + "learning_rate": 3.943931758555669e-06, + "loss": 0.4493565559387207, + "mean_token_accuracy": 0.8307522535324097, + "num_tokens": 11962734.0, + "step": 1339 + }, + { + "epoch": 1.0182370820668694, + "grad_norm": 2.4737331867218018, + "learning_rate": 3.942221515123624e-06, + "loss": 0.28508758544921875, + "mean_token_accuracy": 0.8967142105102539, + "num_tokens": 11967783.0, + "step": 1340 + }, + { + "epoch": 1.0189969604863223, + "grad_norm": 2.4525370597839355, + "learning_rate": 3.940510259485002e-06, + "loss": 0.40227818489074707, + "mean_token_accuracy": 0.8618967533111572, + "num_tokens": 11972918.0, + "step": 1341 + }, + { + "epoch": 1.0197568389057752, + "grad_norm": 1.7299731969833374, + "learning_rate": 3.938797992840828e-06, + "loss": 0.26339593529701233, + "mean_token_accuracy": 0.9004406929016113, + "num_tokens": 11981250.0, + "step": 1342 + }, + { + "epoch": 1.0205167173252279, + "grad_norm": 2.8756747245788574, + "learning_rate": 3.937084716392839e-06, + "loss": 0.47792482376098633, + "mean_token_accuracy": 0.8440839052200317, + "num_tokens": 11986356.0, + "step": 1343 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 2.104473114013672, + "learning_rate": 3.935370431343475e-06, + "loss": 0.36723971366882324, + "mean_token_accuracy": 0.8831232786178589, + "num_tokens": 11994495.0, + "step": 1344 + }, + { + "epoch": 1.0220364741641337, + "grad_norm": 1.9173074960708618, + "learning_rate": 3.933655138895889e-06, + "loss": 0.409319669008255, + "mean_token_accuracy": 0.8632645606994629, + "num_tokens": 12002060.0, + "step": 1345 + }, + { + "epoch": 1.0227963525835866, + "grad_norm": 2.958311080932617, + "learning_rate": 3.9319388402539395e-06, + "loss": 0.5390093922615051, + "mean_token_accuracy": 0.8204828500747681, + "num_tokens": 12007588.0, + "step": 1346 + }, + { + "epoch": 1.0235562310030395, + "grad_norm": 1.6470831632614136, + "learning_rate": 3.930221536622192e-06, + "loss": 0.4524633288383484, + "mean_token_accuracy": 0.8516575694084167, + "num_tokens": 12018831.0, + "step": 1347 + }, + { + "epoch": 1.0243161094224924, + "grad_norm": 1.3160780668258667, + "learning_rate": 3.928503229205913e-06, + "loss": 0.4180558919906616, + "mean_token_accuracy": 0.8495022058486938, + "num_tokens": 12033947.0, + "step": 1348 + }, + { + "epoch": 1.0250759878419453, + "grad_norm": 1.9686089754104614, + "learning_rate": 3.92678391921108e-06, + "loss": 0.41927334666252136, + "mean_token_accuracy": 0.8462997674942017, + "num_tokens": 12042005.0, + "step": 1349 + }, + { + "epoch": 1.0258358662613982, + "grad_norm": 2.351778507232666, + "learning_rate": 3.92506360784437e-06, + "loss": 0.2946245074272156, + "mean_token_accuracy": 0.9170923233032227, + "num_tokens": 12046579.0, + "step": 1350 + }, + { + "epoch": 1.0265957446808511, + "grad_norm": 2.0636913776397705, + "learning_rate": 3.923342296313162e-06, + "loss": 0.3422774076461792, + "mean_token_accuracy": 0.8809213638305664, + "num_tokens": 12053214.0, + "step": 1351 + }, + { + "epoch": 1.027355623100304, + "grad_norm": 1.7272592782974243, + "learning_rate": 3.92161998582554e-06, + "loss": 0.5864541530609131, + "mean_token_accuracy": 0.7986117601394653, + "num_tokens": 12068522.0, + "step": 1352 + }, + { + "epoch": 1.028115501519757, + "grad_norm": 0.8980231881141663, + "learning_rate": 3.919896677590289e-06, + "loss": 0.2964550256729126, + "mean_token_accuracy": 0.8911845088005066, + "num_tokens": 12093834.0, + "step": 1353 + }, + { + "epoch": 1.0288753799392096, + "grad_norm": 1.6031712293624878, + "learning_rate": 3.918172372816892e-06, + "loss": 0.37254488468170166, + "mean_token_accuracy": 0.8615843057632446, + "num_tokens": 12104393.0, + "step": 1354 + }, + { + "epoch": 1.0296352583586625, + "grad_norm": 1.282134771347046, + "learning_rate": 3.916447072715531e-06, + "loss": 0.3522927761077881, + "mean_token_accuracy": 0.8713657259941101, + "num_tokens": 12118671.0, + "step": 1355 + }, + { + "epoch": 1.0303951367781155, + "grad_norm": 2.1986680030822754, + "learning_rate": 3.914720778497091e-06, + "loss": 0.3716316223144531, + "mean_token_accuracy": 0.8661249279975891, + "num_tokens": 12125178.0, + "step": 1356 + }, + { + "epoch": 1.0311550151975684, + "grad_norm": 1.5937882661819458, + "learning_rate": 3.91299349137315e-06, + "loss": 0.48067355155944824, + "mean_token_accuracy": 0.8284252882003784, + "num_tokens": 12136785.0, + "step": 1357 + }, + { + "epoch": 1.0319148936170213, + "grad_norm": 1.6743099689483643, + "learning_rate": 3.9112652125559845e-06, + "loss": 0.4461551308631897, + "mean_token_accuracy": 0.8381845355033875, + "num_tokens": 12150066.0, + "step": 1358 + }, + { + "epoch": 1.0326747720364742, + "grad_norm": 2.2346715927124023, + "learning_rate": 3.909535943258567e-06, + "loss": 0.3148220181465149, + "mean_token_accuracy": 0.8797591924667358, + "num_tokens": 12155506.0, + "step": 1359 + }, + { + "epoch": 1.033434650455927, + "grad_norm": 1.9608992338180542, + "learning_rate": 3.907805684694567e-06, + "loss": 0.32598960399627686, + "mean_token_accuracy": 0.8819410800933838, + "num_tokens": 12163261.0, + "step": 1360 + }, + { + "epoch": 1.03419452887538, + "grad_norm": 2.413477897644043, + "learning_rate": 3.906074438078343e-06, + "loss": 0.38179588317871094, + "mean_token_accuracy": 0.8739585876464844, + "num_tokens": 12169254.0, + "step": 1361 + }, + { + "epoch": 1.034954407294833, + "grad_norm": 2.0258278846740723, + "learning_rate": 3.904342204624955e-06, + "loss": 0.33240315318107605, + "mean_token_accuracy": 0.8808181285858154, + "num_tokens": 12175379.0, + "step": 1362 + }, + { + "epoch": 1.0357142857142858, + "grad_norm": 2.4111437797546387, + "learning_rate": 3.9026089855501475e-06, + "loss": 0.412802517414093, + "mean_token_accuracy": 0.8504396677017212, + "num_tokens": 12182007.0, + "step": 1363 + }, + { + "epoch": 1.0364741641337385, + "grad_norm": 2.0424840450286865, + "learning_rate": 3.900874782070362e-06, + "loss": 0.2914797067642212, + "mean_token_accuracy": 0.8731886148452759, + "num_tokens": 12187743.0, + "step": 1364 + }, + { + "epoch": 1.0372340425531914, + "grad_norm": 2.9248716831207275, + "learning_rate": 3.899139595402729e-06, + "loss": 0.34071338176727295, + "mean_token_accuracy": 0.8736443519592285, + "num_tokens": 12191830.0, + "step": 1365 + }, + { + "epoch": 1.0379939209726443, + "grad_norm": 2.240220785140991, + "learning_rate": 3.8974034267650695e-06, + "loss": 0.23049014806747437, + "mean_token_accuracy": 0.9000070691108704, + "num_tokens": 12196460.0, + "step": 1366 + }, + { + "epoch": 1.0387537993920972, + "grad_norm": 1.5038460493087769, + "learning_rate": 3.895666277375892e-06, + "loss": 0.32255327701568604, + "mean_token_accuracy": 0.873004674911499, + "num_tokens": 12206230.0, + "step": 1367 + }, + { + "epoch": 1.0395136778115501, + "grad_norm": 1.2339142560958862, + "learning_rate": 3.893928148454398e-06, + "loss": 0.4069131314754486, + "mean_token_accuracy": 0.8461740016937256, + "num_tokens": 12226502.0, + "step": 1368 + }, + { + "epoch": 1.040273556231003, + "grad_norm": 2.531553268432617, + "learning_rate": 3.89218904122047e-06, + "loss": 0.43681037425994873, + "mean_token_accuracy": 0.8497104048728943, + "num_tokens": 12232241.0, + "step": 1369 + }, + { + "epoch": 1.041033434650456, + "grad_norm": 3.8404815196990967, + "learning_rate": 3.890448956894682e-06, + "loss": 0.3241814970970154, + "mean_token_accuracy": 0.884732723236084, + "num_tokens": 12235126.0, + "step": 1370 + }, + { + "epoch": 1.0417933130699089, + "grad_norm": 2.9608030319213867, + "learning_rate": 3.888707896698293e-06, + "loss": 0.4641021490097046, + "mean_token_accuracy": 0.8496800661087036, + "num_tokens": 12240630.0, + "step": 1371 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 2.1166417598724365, + "learning_rate": 3.886965861853243e-06, + "loss": 0.42038479447364807, + "mean_token_accuracy": 0.8512747287750244, + "num_tokens": 12247969.0, + "step": 1372 + }, + { + "epoch": 1.0433130699088147, + "grad_norm": 2.5918161869049072, + "learning_rate": 3.885222853582163e-06, + "loss": 0.2871917188167572, + "mean_token_accuracy": 0.9129709601402283, + "num_tokens": 12252161.0, + "step": 1373 + }, + { + "epoch": 1.0440729483282676, + "grad_norm": 2.4261348247528076, + "learning_rate": 3.88347887310836e-06, + "loss": 0.4003123342990875, + "mean_token_accuracy": 0.8570356369018555, + "num_tokens": 12258135.0, + "step": 1374 + }, + { + "epoch": 1.0448328267477203, + "grad_norm": 1.3439548015594482, + "learning_rate": 3.881733921655829e-06, + "loss": 0.3278140425682068, + "mean_token_accuracy": 0.8831373453140259, + "num_tokens": 12272849.0, + "step": 1375 + }, + { + "epoch": 1.0455927051671732, + "grad_norm": 1.527989387512207, + "learning_rate": 3.879988000449243e-06, + "loss": 0.33789363503456116, + "mean_token_accuracy": 0.8825669884681702, + "num_tokens": 12283281.0, + "step": 1376 + }, + { + "epoch": 1.046352583586626, + "grad_norm": 1.6755503416061401, + "learning_rate": 3.878241110713957e-06, + "loss": 0.4816160798072815, + "mean_token_accuracy": 0.8193758726119995, + "num_tokens": 12295422.0, + "step": 1377 + }, + { + "epoch": 1.047112462006079, + "grad_norm": 2.8110361099243164, + "learning_rate": 3.876493253676004e-06, + "loss": 0.38662949204444885, + "mean_token_accuracy": 0.8611986637115479, + "num_tokens": 12299806.0, + "step": 1378 + }, + { + "epoch": 1.047872340425532, + "grad_norm": 1.86097252368927, + "learning_rate": 3.8747444305621e-06, + "loss": 0.27612629532814026, + "mean_token_accuracy": 0.8984048366546631, + "num_tokens": 12306599.0, + "step": 1379 + }, + { + "epoch": 1.0486322188449848, + "grad_norm": 2.361828565597534, + "learning_rate": 3.872994642599635e-06, + "loss": 0.469953715801239, + "mean_token_accuracy": 0.8464452028274536, + "num_tokens": 12314249.0, + "step": 1380 + }, + { + "epoch": 1.0493920972644377, + "grad_norm": 1.9524794816970825, + "learning_rate": 3.871243891016676e-06, + "loss": 0.5419625043869019, + "mean_token_accuracy": 0.8468329906463623, + "num_tokens": 12324987.0, + "step": 1381 + }, + { + "epoch": 1.0501519756838906, + "grad_norm": 1.6931511163711548, + "learning_rate": 3.869492177041971e-06, + "loss": 0.3791416883468628, + "mean_token_accuracy": 0.8692882061004639, + "num_tokens": 12336864.0, + "step": 1382 + }, + { + "epoch": 1.0509118541033435, + "grad_norm": 1.909692406654358, + "learning_rate": 3.867739501904938e-06, + "loss": 0.27974557876586914, + "mean_token_accuracy": 0.9004636406898499, + "num_tokens": 12343093.0, + "step": 1383 + }, + { + "epoch": 1.0516717325227964, + "grad_norm": 1.415162205696106, + "learning_rate": 3.8659858668356735e-06, + "loss": 0.38928335905075073, + "mean_token_accuracy": 0.8491984009742737, + "num_tokens": 12356613.0, + "step": 1384 + }, + { + "epoch": 1.0524316109422491, + "grad_norm": 1.8195741176605225, + "learning_rate": 3.864231273064944e-06, + "loss": 0.3798758089542389, + "mean_token_accuracy": 0.8728072047233582, + "num_tokens": 12364860.0, + "step": 1385 + }, + { + "epoch": 1.053191489361702, + "grad_norm": 1.8481454849243164, + "learning_rate": 3.862475721824193e-06, + "loss": 0.269635945558548, + "mean_token_accuracy": 0.899247407913208, + "num_tokens": 12371841.0, + "step": 1386 + }, + { + "epoch": 1.053951367781155, + "grad_norm": 1.7838784456253052, + "learning_rate": 3.8607192143455325e-06, + "loss": 0.36971768736839294, + "mean_token_accuracy": 0.8833638429641724, + "num_tokens": 12380685.0, + "step": 1387 + }, + { + "epoch": 1.0547112462006079, + "grad_norm": 1.333358645439148, + "learning_rate": 3.858961751861748e-06, + "loss": 0.4039418399333954, + "mean_token_accuracy": 0.8541078567504883, + "num_tokens": 12394072.0, + "step": 1388 + }, + { + "epoch": 1.0554711246200608, + "grad_norm": 2.1600265502929688, + "learning_rate": 3.857203335606294e-06, + "loss": 0.38211894035339355, + "mean_token_accuracy": 0.8549972772598267, + "num_tokens": 12400449.0, + "step": 1389 + }, + { + "epoch": 1.0562310030395137, + "grad_norm": 2.914902687072754, + "learning_rate": 3.855443966813295e-06, + "loss": 0.2237374186515808, + "mean_token_accuracy": 0.9253600835800171, + "num_tokens": 12403758.0, + "step": 1390 + }, + { + "epoch": 1.0569908814589666, + "grad_norm": 2.2361080646514893, + "learning_rate": 3.853683646717543e-06, + "loss": 0.3359566926956177, + "mean_token_accuracy": 0.898173451423645, + "num_tokens": 12410374.0, + "step": 1391 + }, + { + "epoch": 1.0577507598784195, + "grad_norm": 2.3639304637908936, + "learning_rate": 3.8519223765544985e-06, + "loss": 0.3844943046569824, + "mean_token_accuracy": 0.863599419593811, + "num_tokens": 12416016.0, + "step": 1392 + }, + { + "epoch": 1.0585106382978724, + "grad_norm": 2.202971935272217, + "learning_rate": 3.85016015756029e-06, + "loss": 0.3546281158924103, + "mean_token_accuracy": 0.8907540440559387, + "num_tokens": 12422026.0, + "step": 1393 + }, + { + "epoch": 1.0592705167173253, + "grad_norm": 1.1279661655426025, + "learning_rate": 3.848396990971709e-06, + "loss": 0.31522464752197266, + "mean_token_accuracy": 0.8662257194519043, + "num_tokens": 12439964.0, + "step": 1394 + }, + { + "epoch": 1.0600303951367782, + "grad_norm": 2.4731740951538086, + "learning_rate": 3.846632878026214e-06, + "loss": 0.456442266702652, + "mean_token_accuracy": 0.8516958951950073, + "num_tokens": 12446231.0, + "step": 1395 + }, + { + "epoch": 1.060790273556231, + "grad_norm": 1.7631878852844238, + "learning_rate": 3.844867819961928e-06, + "loss": 0.487227201461792, + "mean_token_accuracy": 0.8466947078704834, + "num_tokens": 12459989.0, + "step": 1396 + }, + { + "epoch": 1.0615501519756838, + "grad_norm": 2.4468278884887695, + "learning_rate": 3.843101818017637e-06, + "loss": 0.3367291986942291, + "mean_token_accuracy": 0.8734689950942993, + "num_tokens": 12465741.0, + "step": 1397 + }, + { + "epoch": 1.0623100303951367, + "grad_norm": 1.9045145511627197, + "learning_rate": 3.841334873432789e-06, + "loss": 0.4652615487575531, + "mean_token_accuracy": 0.8333107233047485, + "num_tokens": 12474963.0, + "step": 1398 + }, + { + "epoch": 1.0630699088145896, + "grad_norm": 1.6816917657852173, + "learning_rate": 3.839566987447492e-06, + "loss": 0.4144279956817627, + "mean_token_accuracy": 0.8472539186477661, + "num_tokens": 12485521.0, + "step": 1399 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 1.8990092277526855, + "learning_rate": 3.837798161302518e-06, + "loss": 0.4040985405445099, + "mean_token_accuracy": 0.8514704704284668, + "num_tokens": 12493495.0, + "step": 1400 + }, + { + "epoch": 1.0645896656534954, + "grad_norm": 2.27785325050354, + "learning_rate": 3.836028396239297e-06, + "loss": 0.43425723910331726, + "mean_token_accuracy": 0.8795069456100464, + "num_tokens": 12499789.0, + "step": 1401 + }, + { + "epoch": 1.0653495440729484, + "grad_norm": 2.5130882263183594, + "learning_rate": 3.8342576934999184e-06, + "loss": 0.33892524242401123, + "mean_token_accuracy": 0.8717449903488159, + "num_tokens": 12504885.0, + "step": 1402 + }, + { + "epoch": 1.0661094224924013, + "grad_norm": 2.650040864944458, + "learning_rate": 3.832486054327131e-06, + "loss": 0.4200317859649658, + "mean_token_accuracy": 0.8616159558296204, + "num_tokens": 12509783.0, + "step": 1403 + }, + { + "epoch": 1.0668693009118542, + "grad_norm": 2.9176881313323975, + "learning_rate": 3.830713479964335e-06, + "loss": 0.37018489837646484, + "mean_token_accuracy": 0.8676021695137024, + "num_tokens": 12514441.0, + "step": 1404 + }, + { + "epoch": 1.067629179331307, + "grad_norm": 1.6430318355560303, + "learning_rate": 3.828939971655595e-06, + "loss": 0.27539193630218506, + "mean_token_accuracy": 0.9077831506729126, + "num_tokens": 12523677.0, + "step": 1405 + }, + { + "epoch": 1.06838905775076, + "grad_norm": 1.3683708906173706, + "learning_rate": 3.827165530645627e-06, + "loss": 0.4085099697113037, + "mean_token_accuracy": 0.8579255938529968, + "num_tokens": 12540104.0, + "step": 1406 + }, + { + "epoch": 1.0691489361702127, + "grad_norm": 2.528465747833252, + "learning_rate": 3.825390158179802e-06, + "loss": 0.42462456226348877, + "mean_token_accuracy": 0.852813720703125, + "num_tokens": 12548239.0, + "step": 1407 + }, + { + "epoch": 1.0699088145896656, + "grad_norm": 1.8288795948028564, + "learning_rate": 3.823613855504144e-06, + "loss": 0.412417471408844, + "mean_token_accuracy": 0.8622130751609802, + "num_tokens": 12557316.0, + "step": 1408 + }, + { + "epoch": 1.0706686930091185, + "grad_norm": 2.341794490814209, + "learning_rate": 3.82183662386533e-06, + "loss": 0.2996668815612793, + "mean_token_accuracy": 0.8964041471481323, + "num_tokens": 12562377.0, + "step": 1409 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 2.555877208709717, + "learning_rate": 3.82005846451069e-06, + "loss": 0.4184221625328064, + "mean_token_accuracy": 0.8678828477859497, + "num_tokens": 12568516.0, + "step": 1410 + }, + { + "epoch": 1.0721884498480243, + "grad_norm": 2.081308126449585, + "learning_rate": 3.8182793786882065e-06, + "loss": 0.4376835823059082, + "mean_token_accuracy": 0.8409077525138855, + "num_tokens": 12576598.0, + "step": 1411 + }, + { + "epoch": 1.0729483282674772, + "grad_norm": 2.0272316932678223, + "learning_rate": 3.816499367646508e-06, + "loss": 0.3630060851573944, + "mean_token_accuracy": 0.8762413263320923, + "num_tokens": 12584587.0, + "step": 1412 + }, + { + "epoch": 1.0737082066869301, + "grad_norm": 2.6382484436035156, + "learning_rate": 3.814718432634877e-06, + "loss": 0.4244990348815918, + "mean_token_accuracy": 0.8509312272071838, + "num_tokens": 12590028.0, + "step": 1413 + }, + { + "epoch": 1.074468085106383, + "grad_norm": 2.429800271987915, + "learning_rate": 3.8129365749032398e-06, + "loss": 0.36990004777908325, + "mean_token_accuracy": 0.8749774098396301, + "num_tokens": 12594984.0, + "step": 1414 + }, + { + "epoch": 1.075227963525836, + "grad_norm": 3.5939090251922607, + "learning_rate": 3.8111537957021736e-06, + "loss": 0.4245661199092865, + "mean_token_accuracy": 0.8481623530387878, + "num_tokens": 12598494.0, + "step": 1415 + }, + { + "epoch": 1.0759878419452888, + "grad_norm": 2.705955982208252, + "learning_rate": 3.809370096282903e-06, + "loss": 0.41851678490638733, + "mean_token_accuracy": 0.8548051714897156, + "num_tokens": 12603876.0, + "step": 1416 + }, + { + "epoch": 1.0767477203647418, + "grad_norm": 1.7812079191207886, + "learning_rate": 3.807585477897296e-06, + "loss": 0.47113919258117676, + "mean_token_accuracy": 0.8346904516220093, + "num_tokens": 12613402.0, + "step": 1417 + }, + { + "epoch": 1.0775075987841944, + "grad_norm": 1.4335212707519531, + "learning_rate": 3.8057999417978654e-06, + "loss": 0.3802063465118408, + "mean_token_accuracy": 0.8563423156738281, + "num_tokens": 12626865.0, + "step": 1418 + }, + { + "epoch": 1.0782674772036474, + "grad_norm": 1.9171305894851685, + "learning_rate": 3.8040134892377702e-06, + "loss": 0.20898357033729553, + "mean_token_accuracy": 0.9189738035202026, + "num_tokens": 12632593.0, + "step": 1419 + }, + { + "epoch": 1.0790273556231003, + "grad_norm": 1.4996821880340576, + "learning_rate": 3.802226121470811e-06, + "loss": 0.4203261137008667, + "mean_token_accuracy": 0.8479211330413818, + "num_tokens": 12646395.0, + "step": 1420 + }, + { + "epoch": 1.0797872340425532, + "grad_norm": 2.2007253170013428, + "learning_rate": 3.800437839751432e-06, + "loss": 0.40370577573776245, + "mean_token_accuracy": 0.8427679538726807, + "num_tokens": 12653508.0, + "step": 1421 + }, + { + "epoch": 1.080547112462006, + "grad_norm": 1.7266581058502197, + "learning_rate": 3.7986486453347183e-06, + "loss": 0.46750491857528687, + "mean_token_accuracy": 0.8429205417633057, + "num_tokens": 12666329.0, + "step": 1422 + }, + { + "epoch": 1.081306990881459, + "grad_norm": 1.4716318845748901, + "learning_rate": 3.796858539476394e-06, + "loss": 0.3330317735671997, + "mean_token_accuracy": 0.879012942314148, + "num_tokens": 12676741.0, + "step": 1423 + }, + { + "epoch": 1.082066869300912, + "grad_norm": 2.652127265930176, + "learning_rate": 3.795067523432826e-06, + "loss": 0.35365715622901917, + "mean_token_accuracy": 0.8796792030334473, + "num_tokens": 12681479.0, + "step": 1424 + }, + { + "epoch": 1.0828267477203648, + "grad_norm": 1.2937829494476318, + "learning_rate": 3.793275598461017e-06, + "loss": 0.25272446870803833, + "mean_token_accuracy": 0.9231734275817871, + "num_tokens": 12694238.0, + "step": 1425 + }, + { + "epoch": 1.0835866261398177, + "grad_norm": 1.3831220865249634, + "learning_rate": 3.7914827658186104e-06, + "loss": 0.4935331344604492, + "mean_token_accuracy": 0.8417420387268066, + "num_tokens": 12712857.0, + "step": 1426 + }, + { + "epoch": 1.0843465045592706, + "grad_norm": 3.059525728225708, + "learning_rate": 3.7896890267638832e-06, + "loss": 0.2592190206050873, + "mean_token_accuracy": 0.9040263295173645, + "num_tokens": 12716766.0, + "step": 1427 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 2.8399202823638916, + "learning_rate": 3.787894382555752e-06, + "loss": 0.32098138332366943, + "mean_token_accuracy": 0.8838302493095398, + "num_tokens": 12720774.0, + "step": 1428 + }, + { + "epoch": 1.0858662613981762, + "grad_norm": 2.618479013442993, + "learning_rate": 3.7860988344537664e-06, + "loss": 0.425255686044693, + "mean_token_accuracy": 0.8564130067825317, + "num_tokens": 12726506.0, + "step": 1429 + }, + { + "epoch": 1.0866261398176291, + "grad_norm": 1.3108669519424438, + "learning_rate": 3.7843023837181126e-06, + "loss": 0.40220165252685547, + "mean_token_accuracy": 0.8588873147964478, + "num_tokens": 12742814.0, + "step": 1430 + }, + { + "epoch": 1.087386018237082, + "grad_norm": 2.2083566188812256, + "learning_rate": 3.782505031609607e-06, + "loss": 0.318379282951355, + "mean_token_accuracy": 0.8887606859207153, + "num_tokens": 12748388.0, + "step": 1431 + }, + { + "epoch": 1.088145896656535, + "grad_norm": 1.922358751296997, + "learning_rate": 3.7807067793897006e-06, + "loss": 0.2519589364528656, + "mean_token_accuracy": 0.8936764001846313, + "num_tokens": 12754761.0, + "step": 1432 + }, + { + "epoch": 1.0889057750759878, + "grad_norm": 1.7367439270019531, + "learning_rate": 3.778907628320477e-06, + "loss": 0.3970367908477783, + "mean_token_accuracy": 0.858735203742981, + "num_tokens": 12764016.0, + "step": 1433 + }, + { + "epoch": 1.0896656534954408, + "grad_norm": 2.1931066513061523, + "learning_rate": 3.77710757966465e-06, + "loss": 0.5250554084777832, + "mean_token_accuracy": 0.8356746435165405, + "num_tokens": 12772272.0, + "step": 1434 + }, + { + "epoch": 1.0904255319148937, + "grad_norm": 1.718337893486023, + "learning_rate": 3.775306634685562e-06, + "loss": 0.283231645822525, + "mean_token_accuracy": 0.9009919166564941, + "num_tokens": 12780706.0, + "step": 1435 + }, + { + "epoch": 1.0911854103343466, + "grad_norm": 2.1985926628112793, + "learning_rate": 3.773504794647187e-06, + "loss": 0.3913170397281647, + "mean_token_accuracy": 0.8909255266189575, + "num_tokens": 12787052.0, + "step": 1436 + }, + { + "epoch": 1.0919452887537995, + "grad_norm": 2.8687937259674072, + "learning_rate": 3.771702060814123e-06, + "loss": 0.3135771155357361, + "mean_token_accuracy": 0.9016125202178955, + "num_tokens": 12791854.0, + "step": 1437 + }, + { + "epoch": 1.0927051671732522, + "grad_norm": 4.203946590423584, + "learning_rate": 3.7698984344516e-06, + "loss": 0.3642737865447998, + "mean_token_accuracy": 0.8842349052429199, + "num_tokens": 12794969.0, + "step": 1438 + }, + { + "epoch": 1.093465045592705, + "grad_norm": 1.5134642124176025, + "learning_rate": 3.7680939168254733e-06, + "loss": 0.3732057213783264, + "mean_token_accuracy": 0.8671083450317383, + "num_tokens": 12808480.0, + "step": 1439 + }, + { + "epoch": 1.094224924012158, + "grad_norm": 3.2103970050811768, + "learning_rate": 3.7662885092022206e-06, + "loss": 0.3556194603443146, + "mean_token_accuracy": 0.8786529302597046, + "num_tokens": 12812654.0, + "step": 1440 + }, + { + "epoch": 1.094984802431611, + "grad_norm": 2.2774064540863037, + "learning_rate": 3.7644822128489476e-06, + "loss": 0.38409674167633057, + "mean_token_accuracy": 0.866563081741333, + "num_tokens": 12819854.0, + "step": 1441 + }, + { + "epoch": 1.0957446808510638, + "grad_norm": 1.8250885009765625, + "learning_rate": 3.7626750290333824e-06, + "loss": 0.3812350034713745, + "mean_token_accuracy": 0.8676212430000305, + "num_tokens": 12830338.0, + "step": 1442 + }, + { + "epoch": 1.0965045592705167, + "grad_norm": 1.8337891101837158, + "learning_rate": 3.7608669590238765e-06, + "loss": 0.3892471194267273, + "mean_token_accuracy": 0.8616238832473755, + "num_tokens": 12840340.0, + "step": 1443 + }, + { + "epoch": 1.0972644376899696, + "grad_norm": 1.5300254821777344, + "learning_rate": 3.7590580040894025e-06, + "loss": 0.35288217663764954, + "mean_token_accuracy": 0.8625509738922119, + "num_tokens": 12853144.0, + "step": 1444 + }, + { + "epoch": 1.0980243161094225, + "grad_norm": 2.152683734893799, + "learning_rate": 3.7572481654995554e-06, + "loss": 0.4004772901535034, + "mean_token_accuracy": 0.858427107334137, + "num_tokens": 12859970.0, + "step": 1445 + }, + { + "epoch": 1.0987841945288754, + "grad_norm": 1.532832145690918, + "learning_rate": 3.755437444524548e-06, + "loss": 0.46820127964019775, + "mean_token_accuracy": 0.8585472106933594, + "num_tokens": 12875243.0, + "step": 1446 + }, + { + "epoch": 1.0995440729483283, + "grad_norm": 1.6485342979431152, + "learning_rate": 3.7536258424352164e-06, + "loss": 0.46329325437545776, + "mean_token_accuracy": 0.8376060724258423, + "num_tokens": 12886383.0, + "step": 1447 + }, + { + "epoch": 1.1003039513677813, + "grad_norm": 2.402256488800049, + "learning_rate": 3.75181336050301e-06, + "loss": 0.43916207551956177, + "mean_token_accuracy": 0.8448786735534668, + "num_tokens": 12892613.0, + "step": 1448 + }, + { + "epoch": 1.101063829787234, + "grad_norm": 1.3893651962280273, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.3919021785259247, + "mean_token_accuracy": 0.8495820760726929, + "num_tokens": 12905523.0, + "step": 1449 + }, + { + "epoch": 1.1018237082066868, + "grad_norm": 1.5519827604293823, + "learning_rate": 3.7481857621988734e-06, + "loss": 0.4710700809955597, + "mean_token_accuracy": 0.8387632369995117, + "num_tokens": 12918236.0, + "step": 1450 + }, + { + "epoch": 1.1025835866261398, + "grad_norm": 2.0141353607177734, + "learning_rate": 3.74637064837293e-06, + "loss": 0.30866751074790955, + "mean_token_accuracy": 0.9059321880340576, + "num_tokens": 12924391.0, + "step": 1451 + }, + { + "epoch": 1.1033434650455927, + "grad_norm": 1.2201496362686157, + "learning_rate": 3.7445546597960882e-06, + "loss": 0.3938257396221161, + "mean_token_accuracy": 0.8726630210876465, + "num_tokens": 12943338.0, + "step": 1452 + }, + { + "epoch": 1.1041033434650456, + "grad_norm": 2.29434871673584, + "learning_rate": 3.742737797742878e-06, + "loss": 0.4347776174545288, + "mean_token_accuracy": 0.840569257736206, + "num_tokens": 12950636.0, + "step": 1453 + }, + { + "epoch": 1.1048632218844985, + "grad_norm": 2.3875105381011963, + "learning_rate": 3.7409200634884425e-06, + "loss": 0.48353564739227295, + "mean_token_accuracy": 0.8207056522369385, + "num_tokens": 12957635.0, + "step": 1454 + }, + { + "epoch": 1.1056231003039514, + "grad_norm": 2.3539648056030273, + "learning_rate": 3.7391014583085384e-06, + "loss": 0.3532431721687317, + "mean_token_accuracy": 0.8903788924217224, + "num_tokens": 12963032.0, + "step": 1455 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 1.5611135959625244, + "learning_rate": 3.737281983479534e-06, + "loss": 0.4734863042831421, + "mean_token_accuracy": 0.8413879871368408, + "num_tokens": 12977170.0, + "step": 1456 + }, + { + "epoch": 1.1071428571428572, + "grad_norm": 1.474320411682129, + "learning_rate": 3.735461640278404e-06, + "loss": 0.41854286193847656, + "mean_token_accuracy": 0.8499876856803894, + "num_tokens": 12993750.0, + "step": 1457 + }, + { + "epoch": 1.1079027355623101, + "grad_norm": 2.6873273849487305, + "learning_rate": 3.733640429982738e-06, + "loss": 0.47637903690338135, + "mean_token_accuracy": 0.83599853515625, + "num_tokens": 12999058.0, + "step": 1458 + }, + { + "epoch": 1.108662613981763, + "grad_norm": 1.4575026035308838, + "learning_rate": 3.731818353870729e-06, + "loss": 0.38441652059555054, + "mean_token_accuracy": 0.8582364320755005, + "num_tokens": 13013864.0, + "step": 1459 + }, + { + "epoch": 1.1094224924012157, + "grad_norm": 1.7722690105438232, + "learning_rate": 3.729995413221183e-06, + "loss": 0.4224998950958252, + "mean_token_accuracy": 0.8511888384819031, + "num_tokens": 13023714.0, + "step": 1460 + }, + { + "epoch": 1.1101823708206686, + "grad_norm": 2.625760555267334, + "learning_rate": 3.7281716093135068e-06, + "loss": 0.3487582802772522, + "mean_token_accuracy": 0.8834779262542725, + "num_tokens": 13028608.0, + "step": 1461 + }, + { + "epoch": 1.1109422492401215, + "grad_norm": 1.2554056644439697, + "learning_rate": 3.726346943427719e-06, + "loss": 0.33312469720840454, + "mean_token_accuracy": 0.8704153299331665, + "num_tokens": 13044901.0, + "step": 1462 + }, + { + "epoch": 1.1117021276595744, + "grad_norm": 2.1109910011291504, + "learning_rate": 3.7245214168444388e-06, + "loss": 0.387290894985199, + "mean_token_accuracy": 0.860816240310669, + "num_tokens": 13051452.0, + "step": 1463 + }, + { + "epoch": 1.1124620060790273, + "grad_norm": 3.159201145172119, + "learning_rate": 3.722695030844891e-06, + "loss": 0.37690871953964233, + "mean_token_accuracy": 0.8717561960220337, + "num_tokens": 13055131.0, + "step": 1464 + }, + { + "epoch": 1.1132218844984803, + "grad_norm": 1.3810011148452759, + "learning_rate": 3.7208677867109042e-06, + "loss": 0.36598485708236694, + "mean_token_accuracy": 0.8683375120162964, + "num_tokens": 13069798.0, + "step": 1465 + }, + { + "epoch": 1.1139817629179332, + "grad_norm": 2.500849485397339, + "learning_rate": 3.7190396857249087e-06, + "loss": 0.2781746983528137, + "mean_token_accuracy": 0.9026005268096924, + "num_tokens": 13075127.0, + "step": 1466 + }, + { + "epoch": 1.114741641337386, + "grad_norm": 1.7445712089538574, + "learning_rate": 3.7172107291699356e-06, + "loss": 0.5055314302444458, + "mean_token_accuracy": 0.8252174258232117, + "num_tokens": 13084843.0, + "step": 1467 + }, + { + "epoch": 1.115501519756839, + "grad_norm": 1.6386256217956543, + "learning_rate": 3.7153809183296174e-06, + "loss": 0.38478314876556396, + "mean_token_accuracy": 0.8600847721099854, + "num_tokens": 13096517.0, + "step": 1468 + }, + { + "epoch": 1.1162613981762919, + "grad_norm": 2.3818395137786865, + "learning_rate": 3.713550254488185e-06, + "loss": 0.40308547019958496, + "mean_token_accuracy": 0.8628184795379639, + "num_tokens": 13102324.0, + "step": 1469 + }, + { + "epoch": 1.1170212765957448, + "grad_norm": 1.73163640499115, + "learning_rate": 3.7117187389304703e-06, + "loss": 0.5035421848297119, + "mean_token_accuracy": 0.8229597210884094, + "num_tokens": 13113763.0, + "step": 1470 + }, + { + "epoch": 1.1177811550151975, + "grad_norm": 3.147177219390869, + "learning_rate": 3.7098863729418997e-06, + "loss": 0.557449221611023, + "mean_token_accuracy": 0.8266849517822266, + "num_tokens": 13118849.0, + "step": 1471 + }, + { + "epoch": 1.1185410334346504, + "grad_norm": 1.5061391592025757, + "learning_rate": 3.7080531578085e-06, + "loss": 0.3759554922580719, + "mean_token_accuracy": 0.8541903495788574, + "num_tokens": 13131337.0, + "step": 1472 + }, + { + "epoch": 1.1193009118541033, + "grad_norm": 2.172346353530884, + "learning_rate": 3.7062190948168906e-06, + "loss": 0.41491609811782837, + "mean_token_accuracy": 0.8531454801559448, + "num_tokens": 13139767.0, + "step": 1473 + }, + { + "epoch": 1.1200607902735562, + "grad_norm": 2.1527154445648193, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.4309239387512207, + "mean_token_accuracy": 0.8327745199203491, + "num_tokens": 13147210.0, + "step": 1474 + }, + { + "epoch": 1.1208206686930091, + "grad_norm": 1.8342832326889038, + "learning_rate": 3.7025484304085035e-06, + "loss": 0.34393298625946045, + "mean_token_accuracy": 0.8948153257369995, + "num_tokens": 13154831.0, + "step": 1475 + }, + { + "epoch": 1.121580547112462, + "grad_norm": 2.509291172027588, + "learning_rate": 3.7007118315679384e-06, + "loss": 0.4479471445083618, + "mean_token_accuracy": 0.8280234336853027, + "num_tokens": 13161040.0, + "step": 1476 + }, + { + "epoch": 1.122340425531915, + "grad_norm": 2.914710521697998, + "learning_rate": 3.6988743900215895e-06, + "loss": 0.3724832832813263, + "mean_token_accuracy": 0.863893985748291, + "num_tokens": 13164975.0, + "step": 1477 + }, + { + "epoch": 1.1231003039513678, + "grad_norm": 3.274808645248413, + "learning_rate": 3.6970361070590443e-06, + "loss": 0.4088161885738373, + "mean_token_accuracy": 0.8474822044372559, + "num_tokens": 13168826.0, + "step": 1478 + }, + { + "epoch": 1.1238601823708207, + "grad_norm": 2.861546277999878, + "learning_rate": 3.695196983970481e-06, + "loss": 0.45837992429733276, + "mean_token_accuracy": 0.8579759001731873, + "num_tokens": 13173794.0, + "step": 1479 + }, + { + "epoch": 1.1246200607902737, + "grad_norm": 1.9491597414016724, + "learning_rate": 3.6933570220466654e-06, + "loss": 0.4333910346031189, + "mean_token_accuracy": 0.8444236516952515, + "num_tokens": 13181598.0, + "step": 1480 + }, + { + "epoch": 1.1253799392097266, + "grad_norm": 1.329848051071167, + "learning_rate": 3.6915162225789546e-06, + "loss": 0.36404621601104736, + "mean_token_accuracy": 0.8694117069244385, + "num_tokens": 13196381.0, + "step": 1481 + }, + { + "epoch": 1.1261398176291793, + "grad_norm": 1.8854197263717651, + "learning_rate": 3.6896745868592924e-06, + "loss": 0.4085756838321686, + "mean_token_accuracy": 0.855188250541687, + "num_tokens": 13205236.0, + "step": 1482 + }, + { + "epoch": 1.1268996960486322, + "grad_norm": 3.01684832572937, + "learning_rate": 3.6878321161802106e-06, + "loss": 0.28105655312538147, + "mean_token_accuracy": 0.9009426236152649, + "num_tokens": 13209380.0, + "step": 1483 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 1.8051308393478394, + "learning_rate": 3.685988811834823e-06, + "loss": 0.3314531147480011, + "mean_token_accuracy": 0.8805814385414124, + "num_tokens": 13217714.0, + "step": 1484 + }, + { + "epoch": 1.128419452887538, + "grad_norm": 1.61757493019104, + "learning_rate": 3.684144675116836e-06, + "loss": 0.4543863534927368, + "mean_token_accuracy": 0.8400536775588989, + "num_tokens": 13229330.0, + "step": 1485 + }, + { + "epoch": 1.1291793313069909, + "grad_norm": 1.602686882019043, + "learning_rate": 3.682299707320532e-06, + "loss": 0.3653204143047333, + "mean_token_accuracy": 0.8655825853347778, + "num_tokens": 13242872.0, + "step": 1486 + }, + { + "epoch": 1.1299392097264438, + "grad_norm": 2.3093113899230957, + "learning_rate": 3.680453909740782e-06, + "loss": 0.4383693039417267, + "mean_token_accuracy": 0.839782178401947, + "num_tokens": 13248976.0, + "step": 1487 + }, + { + "epoch": 1.1306990881458967, + "grad_norm": 1.180559754371643, + "learning_rate": 3.6786072836730376e-06, + "loss": 0.5354755520820618, + "mean_token_accuracy": 0.8151205778121948, + "num_tokens": 13272896.0, + "step": 1488 + }, + { + "epoch": 1.1314589665653496, + "grad_norm": 1.9554040431976318, + "learning_rate": 3.6767598304133325e-06, + "loss": 0.4485316872596741, + "mean_token_accuracy": 0.8399936556816101, + "num_tokens": 13280757.0, + "step": 1489 + }, + { + "epoch": 1.1322188449848025, + "grad_norm": 2.236471176147461, + "learning_rate": 3.674911551258279e-06, + "loss": 0.45594364404678345, + "mean_token_accuracy": 0.8552400469779968, + "num_tokens": 13287328.0, + "step": 1490 + }, + { + "epoch": 1.1329787234042552, + "grad_norm": 2.5228686332702637, + "learning_rate": 3.673062447505072e-06, + "loss": 0.4048641622066498, + "mean_token_accuracy": 0.8617376685142517, + "num_tokens": 13292716.0, + "step": 1491 + }, + { + "epoch": 1.1337386018237081, + "grad_norm": 1.1274473667144775, + "learning_rate": 3.6712125204514836e-06, + "loss": 0.3848876357078552, + "mean_token_accuracy": 0.8672975301742554, + "num_tokens": 13313403.0, + "step": 1492 + }, + { + "epoch": 1.134498480243161, + "grad_norm": 2.349541425704956, + "learning_rate": 3.6693617713958633e-06, + "loss": 0.3166058361530304, + "mean_token_accuracy": 0.8896721601486206, + "num_tokens": 13318720.0, + "step": 1493 + }, + { + "epoch": 1.135258358662614, + "grad_norm": 2.2438278198242188, + "learning_rate": 3.6675102016371387e-06, + "loss": 0.5418218970298767, + "mean_token_accuracy": 0.8256527185440063, + "num_tokens": 13325360.0, + "step": 1494 + }, + { + "epoch": 1.1360182370820668, + "grad_norm": 2.21268892288208, + "learning_rate": 3.665657812474812e-06, + "loss": 0.48603951930999756, + "mean_token_accuracy": 0.8273470401763916, + "num_tokens": 13333217.0, + "step": 1495 + }, + { + "epoch": 1.1367781155015197, + "grad_norm": 2.6105997562408447, + "learning_rate": 3.6638046052089614e-06, + "loss": 0.31221291422843933, + "mean_token_accuracy": 0.888375997543335, + "num_tokens": 13338413.0, + "step": 1496 + }, + { + "epoch": 1.1375379939209727, + "grad_norm": 3.655658483505249, + "learning_rate": 3.661950581140239e-06, + "loss": 0.3609023988246918, + "mean_token_accuracy": 0.8838576078414917, + "num_tokens": 13341499.0, + "step": 1497 + }, + { + "epoch": 1.1382978723404256, + "grad_norm": 2.242009162902832, + "learning_rate": 3.660095741569871e-06, + "loss": 0.40022802352905273, + "mean_token_accuracy": 0.8559960722923279, + "num_tokens": 13347917.0, + "step": 1498 + }, + { + "epoch": 1.1390577507598785, + "grad_norm": 1.7958979606628418, + "learning_rate": 3.658240087799655e-06, + "loss": 0.499157190322876, + "mean_token_accuracy": 0.8423802256584167, + "num_tokens": 13361570.0, + "step": 1499 + }, + { + "epoch": 1.1398176291793314, + "grad_norm": 2.5406908988952637, + "learning_rate": 3.6563836211319593e-06, + "loss": 0.4090137481689453, + "mean_token_accuracy": 0.8769663572311401, + "num_tokens": 13367183.0, + "step": 1500 + }, + { + "epoch": 1.1405775075987843, + "grad_norm": 1.9861716032028198, + "learning_rate": 3.654526342869724e-06, + "loss": 0.5125207304954529, + "mean_token_accuracy": 0.8315266370773315, + "num_tokens": 13376767.0, + "step": 1501 + }, + { + "epoch": 1.141337386018237, + "grad_norm": 1.731188178062439, + "learning_rate": 3.65266825431646e-06, + "loss": 0.39452576637268066, + "mean_token_accuracy": 0.8585706353187561, + "num_tokens": 13388437.0, + "step": 1502 + }, + { + "epoch": 1.1420972644376899, + "grad_norm": 1.5203773975372314, + "learning_rate": 3.6508093567762425e-06, + "loss": 0.39466819167137146, + "mean_token_accuracy": 0.8584027886390686, + "num_tokens": 13399727.0, + "step": 1503 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 2.606462001800537, + "learning_rate": 3.6489496515537204e-06, + "loss": 0.4521079361438751, + "mean_token_accuracy": 0.8413360118865967, + "num_tokens": 13408426.0, + "step": 1504 + }, + { + "epoch": 1.1436170212765957, + "grad_norm": 2.6207993030548096, + "learning_rate": 3.647089139954104e-06, + "loss": 0.4709353446960449, + "mean_token_accuracy": 0.8397113084793091, + "num_tokens": 13413506.0, + "step": 1505 + }, + { + "epoch": 1.1443768996960486, + "grad_norm": 1.7214165925979614, + "learning_rate": 3.6452278232831734e-06, + "loss": 0.45506367087364197, + "mean_token_accuracy": 0.8466023206710815, + "num_tokens": 13424592.0, + "step": 1506 + }, + { + "epoch": 1.1451367781155015, + "grad_norm": 1.7111759185791016, + "learning_rate": 3.643365702847272e-06, + "loss": 0.5016278624534607, + "mean_token_accuracy": 0.8196234703063965, + "num_tokens": 13434421.0, + "step": 1507 + }, + { + "epoch": 1.1458966565349544, + "grad_norm": 1.7528148889541626, + "learning_rate": 3.641502779953307e-06, + "loss": 0.5020896196365356, + "mean_token_accuracy": 0.826249361038208, + "num_tokens": 13445286.0, + "step": 1508 + }, + { + "epoch": 1.1466565349544073, + "grad_norm": 1.3470909595489502, + "learning_rate": 3.639639055908751e-06, + "loss": 0.45765724778175354, + "mean_token_accuracy": 0.8380560278892517, + "num_tokens": 13465030.0, + "step": 1509 + }, + { + "epoch": 1.1474164133738602, + "grad_norm": 2.4846835136413574, + "learning_rate": 3.6377745320216346e-06, + "loss": 0.46488267183303833, + "mean_token_accuracy": 0.8393925428390503, + "num_tokens": 13470883.0, + "step": 1510 + }, + { + "epoch": 1.1481762917933132, + "grad_norm": 1.770201563835144, + "learning_rate": 3.635909209600555e-06, + "loss": 0.5262179374694824, + "mean_token_accuracy": 0.8201162815093994, + "num_tokens": 13482558.0, + "step": 1511 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 1.5955098867416382, + "learning_rate": 3.6340430899546656e-06, + "loss": 0.430621862411499, + "mean_token_accuracy": 0.8488553762435913, + "num_tokens": 13493003.0, + "step": 1512 + }, + { + "epoch": 1.1496960486322187, + "grad_norm": 2.846176862716675, + "learning_rate": 3.632176174393682e-06, + "loss": 0.23461638391017914, + "mean_token_accuracy": 0.9218817353248596, + "num_tokens": 13496566.0, + "step": 1513 + }, + { + "epoch": 1.1504559270516717, + "grad_norm": 1.9606610536575317, + "learning_rate": 3.630308464227877e-06, + "loss": 0.4940161108970642, + "mean_token_accuracy": 0.8474864959716797, + "num_tokens": 13504843.0, + "step": 1514 + }, + { + "epoch": 1.1512158054711246, + "grad_norm": 1.1588608026504517, + "learning_rate": 3.628439960768082e-06, + "loss": 0.32650992274284363, + "mean_token_accuracy": 0.8797246217727661, + "num_tokens": 13521513.0, + "step": 1515 + }, + { + "epoch": 1.1519756838905775, + "grad_norm": 1.3566495180130005, + "learning_rate": 3.6265706653256837e-06, + "loss": 0.4359064996242523, + "mean_token_accuracy": 0.8379859328269958, + "num_tokens": 13540608.0, + "step": 1516 + }, + { + "epoch": 1.1527355623100304, + "grad_norm": 1.4728609323501587, + "learning_rate": 3.624700579212626e-06, + "loss": 0.29939693212509155, + "mean_token_accuracy": 0.8831408023834229, + "num_tokens": 13550641.0, + "step": 1517 + }, + { + "epoch": 1.1534954407294833, + "grad_norm": 2.162325382232666, + "learning_rate": 3.6228297037414077e-06, + "loss": 0.4097636938095093, + "mean_token_accuracy": 0.8575425148010254, + "num_tokens": 13556931.0, + "step": 1518 + }, + { + "epoch": 1.1542553191489362, + "grad_norm": 1.754439353942871, + "learning_rate": 3.6209580402250816e-06, + "loss": 0.400202214717865, + "mean_token_accuracy": 0.8569821119308472, + "num_tokens": 13565491.0, + "step": 1519 + }, + { + "epoch": 1.155015197568389, + "grad_norm": 1.5250083208084106, + "learning_rate": 3.619085589977251e-06, + "loss": 0.43330419063568115, + "mean_token_accuracy": 0.8492985963821411, + "num_tokens": 13577147.0, + "step": 1520 + }, + { + "epoch": 1.155775075987842, + "grad_norm": 1.9108905792236328, + "learning_rate": 3.617212354312076e-06, + "loss": 0.30567464232444763, + "mean_token_accuracy": 0.8850164413452148, + "num_tokens": 13584366.0, + "step": 1521 + }, + { + "epoch": 1.156534954407295, + "grad_norm": 2.2574243545532227, + "learning_rate": 3.615338334544265e-06, + "loss": 0.4391738772392273, + "mean_token_accuracy": 0.839765727519989, + "num_tokens": 13591816.0, + "step": 1522 + }, + { + "epoch": 1.1572948328267478, + "grad_norm": 2.1235218048095703, + "learning_rate": 3.6134635319890763e-06, + "loss": 0.45043107867240906, + "mean_token_accuracy": 0.8385299444198608, + "num_tokens": 13599736.0, + "step": 1523 + }, + { + "epoch": 1.1580547112462005, + "grad_norm": 2.2274110317230225, + "learning_rate": 3.611587947962319e-06, + "loss": 0.3623226284980774, + "mean_token_accuracy": 0.8724044561386108, + "num_tokens": 13605354.0, + "step": 1524 + }, + { + "epoch": 1.1588145896656534, + "grad_norm": 3.414236545562744, + "learning_rate": 3.6097115837803504e-06, + "loss": 0.30060696601867676, + "mean_token_accuracy": 0.8971061706542969, + "num_tokens": 13608851.0, + "step": 1525 + }, + { + "epoch": 1.1595744680851063, + "grad_norm": 2.496264696121216, + "learning_rate": 3.6078344407600744e-06, + "loss": 0.3567180037498474, + "mean_token_accuracy": 0.8596180081367493, + "num_tokens": 13614339.0, + "step": 1526 + }, + { + "epoch": 1.1603343465045592, + "grad_norm": 2.0191843509674072, + "learning_rate": 3.6059565202189433e-06, + "loss": 0.43206095695495605, + "mean_token_accuracy": 0.8464000821113586, + "num_tokens": 13622395.0, + "step": 1527 + }, + { + "epoch": 1.1610942249240122, + "grad_norm": 1.5475906133651733, + "learning_rate": 3.604077823474954e-06, + "loss": 0.4535648226737976, + "mean_token_accuracy": 0.8391586542129517, + "num_tokens": 13635356.0, + "step": 1528 + }, + { + "epoch": 1.161854103343465, + "grad_norm": 2.1348211765289307, + "learning_rate": 3.6021983518466468e-06, + "loss": 0.2733963429927826, + "mean_token_accuracy": 0.9007417559623718, + "num_tokens": 13640641.0, + "step": 1529 + }, + { + "epoch": 1.162613981762918, + "grad_norm": 2.8452792167663574, + "learning_rate": 3.600318106653108e-06, + "loss": 0.29591235518455505, + "mean_token_accuracy": 0.8934413194656372, + "num_tokens": 13644995.0, + "step": 1530 + }, + { + "epoch": 1.1633738601823709, + "grad_norm": 2.342907190322876, + "learning_rate": 3.5984370892139663e-06, + "loss": 0.4675130248069763, + "mean_token_accuracy": 0.8352028131484985, + "num_tokens": 13652695.0, + "step": 1531 + }, + { + "epoch": 1.1641337386018238, + "grad_norm": 2.3480238914489746, + "learning_rate": 3.5965553008493924e-06, + "loss": 0.3114515542984009, + "mean_token_accuracy": 0.8845353126525879, + "num_tokens": 13658101.0, + "step": 1532 + }, + { + "epoch": 1.1648936170212765, + "grad_norm": 1.8608155250549316, + "learning_rate": 3.594672742880097e-06, + "loss": 0.3864145278930664, + "mean_token_accuracy": 0.867354154586792, + "num_tokens": 13666042.0, + "step": 1533 + }, + { + "epoch": 1.1656534954407296, + "grad_norm": 1.4756088256835938, + "learning_rate": 3.5927894166273324e-06, + "loss": 0.3671600818634033, + "mean_token_accuracy": 0.8695988655090332, + "num_tokens": 13678253.0, + "step": 1534 + }, + { + "epoch": 1.1664133738601823, + "grad_norm": 2.8831355571746826, + "learning_rate": 3.5909053234128893e-06, + "loss": 0.267184317111969, + "mean_token_accuracy": 0.9008115530014038, + "num_tokens": 13681790.0, + "step": 1535 + }, + { + "epoch": 1.1671732522796352, + "grad_norm": 2.1984763145446777, + "learning_rate": 3.5890204645590964e-06, + "loss": 0.4431505799293518, + "mean_token_accuracy": 0.8623673915863037, + "num_tokens": 13688444.0, + "step": 1536 + }, + { + "epoch": 1.167933130699088, + "grad_norm": 1.8271523714065552, + "learning_rate": 3.5871348413888207e-06, + "loss": 0.3861040771007538, + "mean_token_accuracy": 0.8624277114868164, + "num_tokens": 13696872.0, + "step": 1537 + }, + { + "epoch": 1.168693009118541, + "grad_norm": 1.6313756704330444, + "learning_rate": 3.585248455225466e-06, + "loss": 0.3775154948234558, + "mean_token_accuracy": 0.8624461889266968, + "num_tokens": 13706167.0, + "step": 1538 + }, + { + "epoch": 1.169452887537994, + "grad_norm": 2.4377901554107666, + "learning_rate": 3.5833613073929684e-06, + "loss": 0.2308957427740097, + "mean_token_accuracy": 0.920600175857544, + "num_tokens": 13710367.0, + "step": 1539 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 2.2621750831604004, + "learning_rate": 3.5814733992158025e-06, + "loss": 0.33167219161987305, + "mean_token_accuracy": 0.8963261842727661, + "num_tokens": 13716384.0, + "step": 1540 + }, + { + "epoch": 1.1709726443768997, + "grad_norm": 1.3178150653839111, + "learning_rate": 3.579584732018975e-06, + "loss": 0.3276631832122803, + "mean_token_accuracy": 0.8853521347045898, + "num_tokens": 13731031.0, + "step": 1541 + }, + { + "epoch": 1.1717325227963526, + "grad_norm": 2.177750587463379, + "learning_rate": 3.577695307128024e-06, + "loss": 0.48177266120910645, + "mean_token_accuracy": 0.830329418182373, + "num_tokens": 13737925.0, + "step": 1542 + }, + { + "epoch": 1.1724924012158056, + "grad_norm": 2.2268829345703125, + "learning_rate": 3.5758051258690223e-06, + "loss": 0.48843517899513245, + "mean_token_accuracy": 0.8310644030570984, + "num_tokens": 13746039.0, + "step": 1543 + }, + { + "epoch": 1.1732522796352582, + "grad_norm": 1.498701572418213, + "learning_rate": 3.5739141895685708e-06, + "loss": 0.4542962312698364, + "mean_token_accuracy": 0.8500330448150635, + "num_tokens": 13765002.0, + "step": 1544 + }, + { + "epoch": 1.1740121580547112, + "grad_norm": 1.786670446395874, + "learning_rate": 3.5720224995538023e-06, + "loss": 0.27367928624153137, + "mean_token_accuracy": 0.8916142582893372, + "num_tokens": 13774113.0, + "step": 1545 + }, + { + "epoch": 1.174772036474164, + "grad_norm": 2.0311272144317627, + "learning_rate": 3.5701300571523757e-06, + "loss": 0.559987485408783, + "mean_token_accuracy": 0.8266973495483398, + "num_tokens": 13783912.0, + "step": 1546 + }, + { + "epoch": 1.175531914893617, + "grad_norm": 1.8732186555862427, + "learning_rate": 3.5682368636924825e-06, + "loss": 0.5184751152992249, + "mean_token_accuracy": 0.8450918197631836, + "num_tokens": 13792728.0, + "step": 1547 + }, + { + "epoch": 1.1762917933130699, + "grad_norm": 1.4410661458969116, + "learning_rate": 3.566342920502837e-06, + "loss": 0.383536696434021, + "mean_token_accuracy": 0.8672217726707458, + "num_tokens": 13813590.0, + "step": 1548 + }, + { + "epoch": 1.1770516717325228, + "grad_norm": 3.06056547164917, + "learning_rate": 3.564448228912682e-06, + "loss": 0.3941686153411865, + "mean_token_accuracy": 0.8696402311325073, + "num_tokens": 13817704.0, + "step": 1549 + }, + { + "epoch": 1.1778115501519757, + "grad_norm": 1.6150329113006592, + "learning_rate": 3.562552790251785e-06, + "loss": 0.41606605052948, + "mean_token_accuracy": 0.8488572835922241, + "num_tokens": 13831303.0, + "step": 1550 + }, + { + "epoch": 1.1785714285714286, + "grad_norm": 2.1199934482574463, + "learning_rate": 3.5606566058504377e-06, + "loss": 0.3974752426147461, + "mean_token_accuracy": 0.8686345219612122, + "num_tokens": 13837613.0, + "step": 1551 + }, + { + "epoch": 1.1793313069908815, + "grad_norm": 1.5683876276016235, + "learning_rate": 3.558759677039455e-06, + "loss": 0.35225993394851685, + "mean_token_accuracy": 0.8710784316062927, + "num_tokens": 13846779.0, + "step": 1552 + }, + { + "epoch": 1.1800911854103344, + "grad_norm": 1.4644675254821777, + "learning_rate": 3.5568620051501755e-06, + "loss": 0.38400042057037354, + "mean_token_accuracy": 0.8548328876495361, + "num_tokens": 13860713.0, + "step": 1553 + }, + { + "epoch": 1.1808510638297873, + "grad_norm": 1.461491346359253, + "learning_rate": 3.5549635915144578e-06, + "loss": 0.4572640061378479, + "mean_token_accuracy": 0.8506045937538147, + "num_tokens": 13877289.0, + "step": 1554 + }, + { + "epoch": 1.18161094224924, + "grad_norm": 2.6364715099334717, + "learning_rate": 3.553064437464682e-06, + "loss": 0.3954341411590576, + "mean_token_accuracy": 0.8561649322509766, + "num_tokens": 13882064.0, + "step": 1555 + }, + { + "epoch": 1.182370820668693, + "grad_norm": 2.027273654937744, + "learning_rate": 3.551164544333745e-06, + "loss": 0.47625732421875, + "mean_token_accuracy": 0.8349384069442749, + "num_tokens": 13890306.0, + "step": 1556 + }, + { + "epoch": 1.1831306990881458, + "grad_norm": 2.8427743911743164, + "learning_rate": 3.549263913455069e-06, + "loss": 0.4273033142089844, + "mean_token_accuracy": 0.8541387319564819, + "num_tokens": 13894882.0, + "step": 1557 + }, + { + "epoch": 1.1838905775075987, + "grad_norm": 1.6298975944519043, + "learning_rate": 3.5473625461625884e-06, + "loss": 0.4378639757633209, + "mean_token_accuracy": 0.8634963631629944, + "num_tokens": 13906152.0, + "step": 1558 + }, + { + "epoch": 1.1846504559270516, + "grad_norm": 2.4098947048187256, + "learning_rate": 3.5454604437907535e-06, + "loss": 0.47236716747283936, + "mean_token_accuracy": 0.8646864891052246, + "num_tokens": 13911803.0, + "step": 1559 + }, + { + "epoch": 1.1854103343465046, + "grad_norm": 1.5972497463226318, + "learning_rate": 3.543557607674537e-06, + "loss": 0.3001407980918884, + "mean_token_accuracy": 0.8927055597305298, + "num_tokens": 13921304.0, + "step": 1560 + }, + { + "epoch": 1.1861702127659575, + "grad_norm": 2.1140005588531494, + "learning_rate": 3.54165403914942e-06, + "loss": 0.41898271441459656, + "mean_token_accuracy": 0.8542245626449585, + "num_tokens": 13929434.0, + "step": 1561 + }, + { + "epoch": 1.1869300911854104, + "grad_norm": 1.8733803033828735, + "learning_rate": 3.539749739551401e-06, + "loss": 0.35469961166381836, + "mean_token_accuracy": 0.8805290460586548, + "num_tokens": 13937781.0, + "step": 1562 + }, + { + "epoch": 1.1876899696048633, + "grad_norm": 2.2805802822113037, + "learning_rate": 3.53784471021699e-06, + "loss": 0.44496792554855347, + "mean_token_accuracy": 0.8454172611236572, + "num_tokens": 13944394.0, + "step": 1563 + }, + { + "epoch": 1.1884498480243162, + "grad_norm": 0.9728449583053589, + "learning_rate": 3.535938952483211e-06, + "loss": 0.3156968355178833, + "mean_token_accuracy": 0.8739837408065796, + "num_tokens": 13966712.0, + "step": 1564 + }, + { + "epoch": 1.189209726443769, + "grad_norm": 3.025338888168335, + "learning_rate": 3.534032467687597e-06, + "loss": 0.30036938190460205, + "mean_token_accuracy": 0.9058252573013306, + "num_tokens": 13970183.0, + "step": 1565 + }, + { + "epoch": 1.1899696048632218, + "grad_norm": 2.0659425258636475, + "learning_rate": 3.532125257168193e-06, + "loss": 0.30619731545448303, + "mean_token_accuracy": 0.9041587710380554, + "num_tokens": 13976657.0, + "step": 1566 + }, + { + "epoch": 1.1907294832826747, + "grad_norm": 3.2036776542663574, + "learning_rate": 3.5302173222635526e-06, + "loss": 0.4145944118499756, + "mean_token_accuracy": 0.8502328395843506, + "num_tokens": 13981198.0, + "step": 1567 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 1.7767539024353027, + "learning_rate": 3.5283086643127396e-06, + "loss": 0.437128484249115, + "mean_token_accuracy": 0.8965631723403931, + "num_tokens": 13990259.0, + "step": 1568 + }, + { + "epoch": 1.1922492401215805, + "grad_norm": 1.7777384519577026, + "learning_rate": 3.5263992846553203e-06, + "loss": 0.33831220865249634, + "mean_token_accuracy": 0.8734279870986938, + "num_tokens": 13999363.0, + "step": 1569 + }, + { + "epoch": 1.1930091185410334, + "grad_norm": 1.6710708141326904, + "learning_rate": 3.5244891846313733e-06, + "loss": 0.4005590081214905, + "mean_token_accuracy": 0.8820298314094543, + "num_tokens": 14008719.0, + "step": 1570 + }, + { + "epoch": 1.1937689969604863, + "grad_norm": 1.0378777980804443, + "learning_rate": 3.5225783655814798e-06, + "loss": 0.3174915313720703, + "mean_token_accuracy": 0.8894162774085999, + "num_tokens": 14025806.0, + "step": 1571 + }, + { + "epoch": 1.1945288753799392, + "grad_norm": 1.2647521495819092, + "learning_rate": 3.520666828846726e-06, + "loss": 0.4173050820827484, + "mean_token_accuracy": 0.8437265157699585, + "num_tokens": 14046445.0, + "step": 1572 + }, + { + "epoch": 1.1952887537993921, + "grad_norm": 2.8625528812408447, + "learning_rate": 3.518754575768702e-06, + "loss": 0.37182557582855225, + "mean_token_accuracy": 0.8660947680473328, + "num_tokens": 14051197.0, + "step": 1573 + }, + { + "epoch": 1.196048632218845, + "grad_norm": 1.1213171482086182, + "learning_rate": 3.516841607689501e-06, + "loss": 0.332731157541275, + "mean_token_accuracy": 0.8573278784751892, + "num_tokens": 14070817.0, + "step": 1574 + }, + { + "epoch": 1.196808510638298, + "grad_norm": 1.197508692741394, + "learning_rate": 3.5149279259517165e-06, + "loss": 0.34058472514152527, + "mean_token_accuracy": 0.8603571653366089, + "num_tokens": 14085301.0, + "step": 1575 + }, + { + "epoch": 1.1975683890577509, + "grad_norm": 4.019949913024902, + "learning_rate": 3.5130135318984454e-06, + "loss": 0.3094622492790222, + "mean_token_accuracy": 0.8905094861984253, + "num_tokens": 14088107.0, + "step": 1576 + }, + { + "epoch": 1.1983282674772036, + "grad_norm": 2.591181755065918, + "learning_rate": 3.5110984268732827e-06, + "loss": 0.3407078981399536, + "mean_token_accuracy": 0.880385160446167, + "num_tokens": 14092887.0, + "step": 1577 + }, + { + "epoch": 1.1990881458966565, + "grad_norm": 1.3069331645965576, + "learning_rate": 3.509182612220322e-06, + "loss": 0.3761988878250122, + "mean_token_accuracy": 0.862013041973114, + "num_tokens": 14109216.0, + "step": 1578 + }, + { + "epoch": 1.1998480243161094, + "grad_norm": 1.7802022695541382, + "learning_rate": 3.507266089284157e-06, + "loss": 0.3824652135372162, + "mean_token_accuracy": 0.8707721829414368, + "num_tokens": 14119645.0, + "step": 1579 + }, + { + "epoch": 1.2006079027355623, + "grad_norm": 2.7937185764312744, + "learning_rate": 3.5053488594098763e-06, + "loss": 0.33828890323638916, + "mean_token_accuracy": 0.8765541315078735, + "num_tokens": 14124628.0, + "step": 1580 + }, + { + "epoch": 1.2013677811550152, + "grad_norm": 1.892671823501587, + "learning_rate": 3.5034309239430664e-06, + "loss": 0.3476094603538513, + "mean_token_accuracy": 0.9053795337677002, + "num_tokens": 14131756.0, + "step": 1581 + }, + { + "epoch": 1.202127659574468, + "grad_norm": 1.6857695579528809, + "learning_rate": 3.501512284229807e-06, + "loss": 0.5397108793258667, + "mean_token_accuracy": 0.8173421025276184, + "num_tokens": 14143024.0, + "step": 1582 + }, + { + "epoch": 1.202887537993921, + "grad_norm": 2.501737117767334, + "learning_rate": 3.4995929416166756e-06, + "loss": 0.4192458391189575, + "mean_token_accuracy": 0.8558136224746704, + "num_tokens": 14149499.0, + "step": 1583 + }, + { + "epoch": 1.203647416413374, + "grad_norm": 2.0133907794952393, + "learning_rate": 3.4976728974507387e-06, + "loss": 0.4791576564311981, + "mean_token_accuracy": 0.8253597021102905, + "num_tokens": 14158381.0, + "step": 1584 + }, + { + "epoch": 1.2044072948328268, + "grad_norm": 2.984611988067627, + "learning_rate": 3.4957521530795576e-06, + "loss": 0.3040750026702881, + "mean_token_accuracy": 0.8902391791343689, + "num_tokens": 14162419.0, + "step": 1585 + }, + { + "epoch": 1.2051671732522795, + "grad_norm": 1.518591284751892, + "learning_rate": 3.493830709851185e-06, + "loss": 0.35539618134498596, + "mean_token_accuracy": 0.8737183809280396, + "num_tokens": 14173048.0, + "step": 1586 + }, + { + "epoch": 1.2059270516717326, + "grad_norm": 2.628758192062378, + "learning_rate": 3.4919085691141636e-06, + "loss": 0.33340200781822205, + "mean_token_accuracy": 0.8705098628997803, + "num_tokens": 14178255.0, + "step": 1587 + }, + { + "epoch": 1.2066869300911853, + "grad_norm": 2.5565974712371826, + "learning_rate": 3.4899857322175252e-06, + "loss": 0.44939476251602173, + "mean_token_accuracy": 0.8315504193305969, + "num_tokens": 14183808.0, + "step": 1588 + }, + { + "epoch": 1.2074468085106382, + "grad_norm": 1.7521045207977295, + "learning_rate": 3.4880622005107916e-06, + "loss": 0.3168621063232422, + "mean_token_accuracy": 0.8824669122695923, + "num_tokens": 14192186.0, + "step": 1589 + }, + { + "epoch": 1.2082066869300911, + "grad_norm": 1.9816104173660278, + "learning_rate": 3.486137975343971e-06, + "loss": 0.3892582058906555, + "mean_token_accuracy": 0.8524188995361328, + "num_tokens": 14200512.0, + "step": 1590 + }, + { + "epoch": 1.208966565349544, + "grad_norm": 1.459800124168396, + "learning_rate": 3.484213058067559e-06, + "loss": 0.45930033922195435, + "mean_token_accuracy": 0.8408471345901489, + "num_tokens": 14215232.0, + "step": 1591 + }, + { + "epoch": 1.209726443768997, + "grad_norm": 2.015493154525757, + "learning_rate": 3.482287450032536e-06, + "loss": 0.5514016151428223, + "mean_token_accuracy": 0.8456779718399048, + "num_tokens": 14225402.0, + "step": 1592 + }, + { + "epoch": 1.2104863221884499, + "grad_norm": 3.4511911869049072, + "learning_rate": 3.4803611525903687e-06, + "loss": 0.4772771894931793, + "mean_token_accuracy": 0.8558698892593384, + "num_tokens": 14229038.0, + "step": 1593 + }, + { + "epoch": 1.2112462006079028, + "grad_norm": 2.2247982025146484, + "learning_rate": 3.4784341670930067e-06, + "loss": 0.4042825996875763, + "mean_token_accuracy": 0.8635870218276978, + "num_tokens": 14237057.0, + "step": 1594 + }, + { + "epoch": 1.2120060790273557, + "grad_norm": 2.0534820556640625, + "learning_rate": 3.4765064948928813e-06, + "loss": 0.34057414531707764, + "mean_token_accuracy": 0.8800770044326782, + "num_tokens": 14243013.0, + "step": 1595 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 2.594703197479248, + "learning_rate": 3.474578137342909e-06, + "loss": 0.4997410774230957, + "mean_token_accuracy": 0.8302106261253357, + "num_tokens": 14251210.0, + "step": 1596 + }, + { + "epoch": 1.2135258358662613, + "grad_norm": 2.517833948135376, + "learning_rate": 3.4726490957964836e-06, + "loss": 0.3630390465259552, + "mean_token_accuracy": 0.8679884672164917, + "num_tokens": 14255893.0, + "step": 1597 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 1.5177065134048462, + "learning_rate": 3.4707193716074816e-06, + "loss": 0.36218544840812683, + "mean_token_accuracy": 0.879178524017334, + "num_tokens": 14268143.0, + "step": 1598 + }, + { + "epoch": 1.215045592705167, + "grad_norm": 2.215291738510132, + "learning_rate": 3.4687889661302577e-06, + "loss": 0.4166645407676697, + "mean_token_accuracy": 0.8495793342590332, + "num_tokens": 14276794.0, + "step": 1599 + }, + { + "epoch": 1.21580547112462, + "grad_norm": 1.534294843673706, + "learning_rate": 3.466857880719645e-06, + "loss": 0.2635883092880249, + "mean_token_accuracy": 0.8971712589263916, + "num_tokens": 14287000.0, + "step": 1600 + }, + { + "epoch": 1.216565349544073, + "grad_norm": 1.2338658571243286, + "learning_rate": 3.464926116730953e-06, + "loss": 0.339110404253006, + "mean_token_accuracy": 0.895592987537384, + "num_tokens": 14303217.0, + "step": 1601 + }, + { + "epoch": 1.2173252279635258, + "grad_norm": 1.8717178106307983, + "learning_rate": 3.462993675519968e-06, + "loss": 0.41204726696014404, + "mean_token_accuracy": 0.8560728430747986, + "num_tokens": 14311372.0, + "step": 1602 + }, + { + "epoch": 1.2180851063829787, + "grad_norm": 2.844160795211792, + "learning_rate": 3.4610605584429526e-06, + "loss": 0.4129520058631897, + "mean_token_accuracy": 0.8555002212524414, + "num_tokens": 14316244.0, + "step": 1603 + }, + { + "epoch": 1.2188449848024316, + "grad_norm": 1.099926471710205, + "learning_rate": 3.4591267668566412e-06, + "loss": 0.35783132910728455, + "mean_token_accuracy": 0.8693175315856934, + "num_tokens": 14338414.0, + "step": 1604 + }, + { + "epoch": 1.2196048632218845, + "grad_norm": 1.6448384523391724, + "learning_rate": 3.457192302118244e-06, + "loss": 0.42060258984565735, + "mean_token_accuracy": 0.8557323217391968, + "num_tokens": 14349143.0, + "step": 1605 + }, + { + "epoch": 1.2203647416413375, + "grad_norm": 2.097529888153076, + "learning_rate": 3.455257165585444e-06, + "loss": 0.5227499008178711, + "mean_token_accuracy": 0.828961968421936, + "num_tokens": 14360032.0, + "step": 1606 + }, + { + "epoch": 1.2211246200607904, + "grad_norm": 1.602988600730896, + "learning_rate": 3.453321358616393e-06, + "loss": 0.3537187874317169, + "mean_token_accuracy": 0.8776708841323853, + "num_tokens": 14370005.0, + "step": 1607 + }, + { + "epoch": 1.221884498480243, + "grad_norm": 2.358971357345581, + "learning_rate": 3.4513848825697145e-06, + "loss": 0.3448919653892517, + "mean_token_accuracy": 0.8887944221496582, + "num_tokens": 14375718.0, + "step": 1608 + }, + { + "epoch": 1.222644376899696, + "grad_norm": 1.72306227684021, + "learning_rate": 3.4494477388045035e-06, + "loss": 0.36985084414482117, + "mean_token_accuracy": 0.859595537185669, + "num_tokens": 14385016.0, + "step": 1609 + }, + { + "epoch": 1.2234042553191489, + "grad_norm": 1.5494085550308228, + "learning_rate": 3.4475099286803204e-06, + "loss": 0.49003708362579346, + "mean_token_accuracy": 0.8701964616775513, + "num_tokens": 14399277.0, + "step": 1610 + }, + { + "epoch": 1.2241641337386018, + "grad_norm": 2.6874046325683594, + "learning_rate": 3.445571453557196e-06, + "loss": 0.3424490690231323, + "mean_token_accuracy": 0.8835943937301636, + "num_tokens": 14404182.0, + "step": 1611 + }, + { + "epoch": 1.2249240121580547, + "grad_norm": 2.2163190841674805, + "learning_rate": 3.443632314795627e-06, + "loss": 0.40944457054138184, + "mean_token_accuracy": 0.8649888038635254, + "num_tokens": 14410158.0, + "step": 1612 + }, + { + "epoch": 1.2256838905775076, + "grad_norm": 2.7961158752441406, + "learning_rate": 3.4416925137565756e-06, + "loss": 0.17890746891498566, + "mean_token_accuracy": 0.9439430832862854, + "num_tokens": 14413285.0, + "step": 1613 + }, + { + "epoch": 1.2264437689969605, + "grad_norm": 1.421451210975647, + "learning_rate": 3.439752051801467e-06, + "loss": 0.33948683738708496, + "mean_token_accuracy": 0.8754585981369019, + "num_tokens": 14424674.0, + "step": 1614 + }, + { + "epoch": 1.2272036474164134, + "grad_norm": 2.105196237564087, + "learning_rate": 3.4378109302921946e-06, + "loss": 0.40009379386901855, + "mean_token_accuracy": 0.8600341081619263, + "num_tokens": 14432400.0, + "step": 1615 + }, + { + "epoch": 1.2279635258358663, + "grad_norm": 2.004122734069824, + "learning_rate": 3.4358691505911105e-06, + "loss": 0.46013444662094116, + "mean_token_accuracy": 0.8400925993919373, + "num_tokens": 14440741.0, + "step": 1616 + }, + { + "epoch": 1.2287234042553192, + "grad_norm": 1.8407535552978516, + "learning_rate": 3.4339267140610317e-06, + "loss": 0.38828906416893005, + "mean_token_accuracy": 0.8582802414894104, + "num_tokens": 14448698.0, + "step": 1617 + }, + { + "epoch": 1.2294832826747721, + "grad_norm": 2.4285924434661865, + "learning_rate": 3.4319836220652334e-06, + "loss": 0.3109283447265625, + "mean_token_accuracy": 0.8888344764709473, + "num_tokens": 14453674.0, + "step": 1618 + }, + { + "epoch": 1.2302431610942248, + "grad_norm": 1.6322550773620605, + "learning_rate": 3.430039875967454e-06, + "loss": 0.5222204327583313, + "mean_token_accuracy": 0.825019121170044, + "num_tokens": 14465736.0, + "step": 1619 + }, + { + "epoch": 1.2310030395136777, + "grad_norm": 2.307573080062866, + "learning_rate": 3.428095477131888e-06, + "loss": 0.29477375745773315, + "mean_token_accuracy": 0.8899064660072327, + "num_tokens": 14471266.0, + "step": 1620 + }, + { + "epoch": 1.2317629179331306, + "grad_norm": 1.8044531345367432, + "learning_rate": 3.4261504269231904e-06, + "loss": 0.4883342981338501, + "mean_token_accuracy": 0.8310165405273438, + "num_tokens": 14481679.0, + "step": 1621 + }, + { + "epoch": 1.2325227963525835, + "grad_norm": 2.7585411071777344, + "learning_rate": 3.4242047267064714e-06, + "loss": 0.45369645953178406, + "mean_token_accuracy": 0.8432134985923767, + "num_tokens": 14487299.0, + "step": 1622 + }, + { + "epoch": 1.2332826747720365, + "grad_norm": 2.687490701675415, + "learning_rate": 3.4222583778472997e-06, + "loss": 0.5627540349960327, + "mean_token_accuracy": 0.8186438083648682, + "num_tokens": 14494254.0, + "step": 1623 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 2.622443199157715, + "learning_rate": 3.4203113817116955e-06, + "loss": 0.28697147965431213, + "mean_token_accuracy": 0.8861737847328186, + "num_tokens": 14498632.0, + "step": 1624 + }, + { + "epoch": 1.2348024316109423, + "grad_norm": 2.6943359375, + "learning_rate": 3.4183637396661372e-06, + "loss": 0.25273287296295166, + "mean_token_accuracy": 0.9104914665222168, + "num_tokens": 14502797.0, + "step": 1625 + }, + { + "epoch": 1.2355623100303952, + "grad_norm": 2.428189992904663, + "learning_rate": 3.4164154530775552e-06, + "loss": 0.4213451147079468, + "mean_token_accuracy": 0.851524293422699, + "num_tokens": 14508503.0, + "step": 1626 + }, + { + "epoch": 1.236322188449848, + "grad_norm": 2.1722824573516846, + "learning_rate": 3.4144665233133318e-06, + "loss": 0.35238856077194214, + "mean_token_accuracy": 0.8730837106704712, + "num_tokens": 14516126.0, + "step": 1627 + }, + { + "epoch": 1.237082066869301, + "grad_norm": 2.291365146636963, + "learning_rate": 3.4125169517413005e-06, + "loss": 0.43963465094566345, + "mean_token_accuracy": 0.8525444865226746, + "num_tokens": 14522507.0, + "step": 1628 + }, + { + "epoch": 1.237841945288754, + "grad_norm": 1.6181648969650269, + "learning_rate": 3.410566739729746e-06, + "loss": 0.2799680233001709, + "mean_token_accuracy": 0.8915654420852661, + "num_tokens": 14531025.0, + "step": 1629 + }, + { + "epoch": 1.2386018237082066, + "grad_norm": 1.4039218425750732, + "learning_rate": 3.408615888647402e-06, + "loss": 0.29756587743759155, + "mean_token_accuracy": 0.8951715230941772, + "num_tokens": 14543770.0, + "step": 1630 + }, + { + "epoch": 1.2393617021276595, + "grad_norm": 2.148325204849243, + "learning_rate": 3.4066643998634506e-06, + "loss": 0.3983418345451355, + "mean_token_accuracy": 0.8635951280593872, + "num_tokens": 14550896.0, + "step": 1631 + }, + { + "epoch": 1.2401215805471124, + "grad_norm": 1.5225859880447388, + "learning_rate": 3.4047122747475227e-06, + "loss": 0.3247569799423218, + "mean_token_accuracy": 0.8727027177810669, + "num_tokens": 14562181.0, + "step": 1632 + }, + { + "epoch": 1.2408814589665653, + "grad_norm": 3.99835467338562, + "learning_rate": 3.402759514669694e-06, + "loss": 0.4317352771759033, + "mean_token_accuracy": 0.8488142490386963, + "num_tokens": 14565521.0, + "step": 1633 + }, + { + "epoch": 1.2416413373860182, + "grad_norm": 1.7306902408599854, + "learning_rate": 3.4008061210004872e-06, + "loss": 0.389854371547699, + "mean_token_accuracy": 0.8553084135055542, + "num_tokens": 14574633.0, + "step": 1634 + }, + { + "epoch": 1.2424012158054711, + "grad_norm": 2.3614673614501953, + "learning_rate": 3.3988520951108683e-06, + "loss": 0.3150152564048767, + "mean_token_accuracy": 0.8865959644317627, + "num_tokens": 14580240.0, + "step": 1635 + }, + { + "epoch": 1.243161094224924, + "grad_norm": 1.5625747442245483, + "learning_rate": 3.3968974383722497e-06, + "loss": 0.43160033226013184, + "mean_token_accuracy": 0.840155839920044, + "num_tokens": 14594255.0, + "step": 1636 + }, + { + "epoch": 1.243920972644377, + "grad_norm": 1.871620535850525, + "learning_rate": 3.3949421521564825e-06, + "loss": 0.49550193548202515, + "mean_token_accuracy": 0.8315126299858093, + "num_tokens": 14605416.0, + "step": 1637 + }, + { + "epoch": 1.2446808510638299, + "grad_norm": 2.111304759979248, + "learning_rate": 3.392986237835863e-06, + "loss": 0.2794899046421051, + "mean_token_accuracy": 0.9049773216247559, + "num_tokens": 14611711.0, + "step": 1638 + }, + { + "epoch": 1.2454407294832828, + "grad_norm": 3.7479894161224365, + "learning_rate": 3.391029696783127e-06, + "loss": 0.469397634267807, + "mean_token_accuracy": 0.8352956771850586, + "num_tokens": 14615536.0, + "step": 1639 + }, + { + "epoch": 1.2462006079027357, + "grad_norm": 3.277726650238037, + "learning_rate": 3.389072530371451e-06, + "loss": 0.35431790351867676, + "mean_token_accuracy": 0.8822286128997803, + "num_tokens": 14619390.0, + "step": 1640 + }, + { + "epoch": 1.2469604863221884, + "grad_norm": 1.9583072662353516, + "learning_rate": 3.3871147399744482e-06, + "loss": 0.3708694577217102, + "mean_token_accuracy": 0.8720351457595825, + "num_tokens": 14626573.0, + "step": 1641 + }, + { + "epoch": 1.2477203647416413, + "grad_norm": 1.8734042644500732, + "learning_rate": 3.385156326966173e-06, + "loss": 0.48163774609565735, + "mean_token_accuracy": 0.8479621410369873, + "num_tokens": 14636382.0, + "step": 1642 + }, + { + "epoch": 1.2484802431610942, + "grad_norm": 2.0085532665252686, + "learning_rate": 3.383197292721114e-06, + "loss": 0.4893198311328888, + "mean_token_accuracy": 0.838238000869751, + "num_tokens": 14645083.0, + "step": 1643 + }, + { + "epoch": 1.249240121580547, + "grad_norm": 2.0874593257904053, + "learning_rate": 3.3812376386141966e-06, + "loss": 0.4610505700111389, + "mean_token_accuracy": 0.8441368341445923, + "num_tokens": 14654048.0, + "step": 1644 + }, + { + "epoch": 1.25, + "grad_norm": 1.6887420415878296, + "learning_rate": 3.379277366020782e-06, + "loss": 0.3628596067428589, + "mean_token_accuracy": 0.8838590383529663, + "num_tokens": 14662317.0, + "step": 1645 + }, + { + "epoch": 1.250759878419453, + "grad_norm": 2.389002561569214, + "learning_rate": 3.3773164763166653e-06, + "loss": 0.21903495490550995, + "mean_token_accuracy": 0.9249413013458252, + "num_tokens": 14666394.0, + "step": 1646 + }, + { + "epoch": 1.2515197568389058, + "grad_norm": 1.7091087102890015, + "learning_rate": 3.3753549708780736e-06, + "loss": 0.37802332639694214, + "mean_token_accuracy": 0.8644627332687378, + "num_tokens": 14676214.0, + "step": 1647 + }, + { + "epoch": 1.2522796352583587, + "grad_norm": 2.5717999935150146, + "learning_rate": 3.3733928510816677e-06, + "loss": 0.4236462116241455, + "mean_token_accuracy": 0.8519910573959351, + "num_tokens": 14681681.0, + "step": 1648 + }, + { + "epoch": 1.2530395136778116, + "grad_norm": 1.958856463432312, + "learning_rate": 3.3714301183045382e-06, + "loss": 0.3923419415950775, + "mean_token_accuracy": 0.8720202445983887, + "num_tokens": 14690419.0, + "step": 1649 + }, + { + "epoch": 1.2537993920972643, + "grad_norm": 1.5900038480758667, + "learning_rate": 3.369466773924207e-06, + "loss": 0.4182325601577759, + "mean_token_accuracy": 0.8515387177467346, + "num_tokens": 14699790.0, + "step": 1650 + }, + { + "epoch": 1.2545592705167175, + "grad_norm": 1.260547161102295, + "learning_rate": 3.3675028193186243e-06, + "loss": 0.3915718197822571, + "mean_token_accuracy": 0.8536830544471741, + "num_tokens": 14717502.0, + "step": 1651 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 1.8152283430099487, + "learning_rate": 3.365538255866169e-06, + "loss": 0.424524188041687, + "mean_token_accuracy": 0.8434420824050903, + "num_tokens": 14726591.0, + "step": 1652 + }, + { + "epoch": 1.256079027355623, + "grad_norm": 1.3357285261154175, + "learning_rate": 3.3635730849456484e-06, + "loss": 0.2949739396572113, + "mean_token_accuracy": 0.8868321180343628, + "num_tokens": 14739911.0, + "step": 1653 + }, + { + "epoch": 1.256838905775076, + "grad_norm": 1.1770358085632324, + "learning_rate": 3.3616073079362925e-06, + "loss": 0.29939576983451843, + "mean_token_accuracy": 0.8923654556274414, + "num_tokens": 14755521.0, + "step": 1654 + }, + { + "epoch": 1.2575987841945289, + "grad_norm": 2.059162139892578, + "learning_rate": 3.3596409262177633e-06, + "loss": 0.4562555253505707, + "mean_token_accuracy": 0.8585271239280701, + "num_tokens": 14764173.0, + "step": 1655 + }, + { + "epoch": 1.2583586626139818, + "grad_norm": 1.430752158164978, + "learning_rate": 3.357673941170139e-06, + "loss": 0.35301265120506287, + "mean_token_accuracy": 0.8920517563819885, + "num_tokens": 14775596.0, + "step": 1656 + }, + { + "epoch": 1.2591185410334347, + "grad_norm": 1.6066302061080933, + "learning_rate": 3.3557063541739283e-06, + "loss": 0.41129636764526367, + "mean_token_accuracy": 0.8512256145477295, + "num_tokens": 14786289.0, + "step": 1657 + }, + { + "epoch": 1.2598784194528876, + "grad_norm": 1.5471590757369995, + "learning_rate": 3.353738166610058e-06, + "loss": 0.3935067057609558, + "mean_token_accuracy": 0.8514131903648376, + "num_tokens": 14798672.0, + "step": 1658 + }, + { + "epoch": 1.2606382978723405, + "grad_norm": 1.3455181121826172, + "learning_rate": 3.35176937985988e-06, + "loss": 0.3486790657043457, + "mean_token_accuracy": 0.8644362688064575, + "num_tokens": 14811603.0, + "step": 1659 + }, + { + "epoch": 1.2613981762917934, + "grad_norm": 1.891432762145996, + "learning_rate": 3.349799995305162e-06, + "loss": 0.3325638175010681, + "mean_token_accuracy": 0.8844645023345947, + "num_tokens": 14819256.0, + "step": 1660 + }, + { + "epoch": 1.262158054711246, + "grad_norm": 2.600614309310913, + "learning_rate": 3.3478300143280946e-06, + "loss": 0.30310919880867004, + "mean_token_accuracy": 0.9103429317474365, + "num_tokens": 14823706.0, + "step": 1661 + }, + { + "epoch": 1.2629179331306992, + "grad_norm": 3.8636202812194824, + "learning_rate": 3.3458594383112868e-06, + "loss": 0.28377676010131836, + "mean_token_accuracy": 0.9047091007232666, + "num_tokens": 14826688.0, + "step": 1662 + }, + { + "epoch": 1.263677811550152, + "grad_norm": 2.3100268840789795, + "learning_rate": 3.343888268637765e-06, + "loss": 0.4723394513130188, + "mean_token_accuracy": 0.8306777477264404, + "num_tokens": 14835471.0, + "step": 1663 + }, + { + "epoch": 1.2644376899696048, + "grad_norm": 1.7582160234451294, + "learning_rate": 3.341916506690971e-06, + "loss": 0.48168784379959106, + "mean_token_accuracy": 0.8281306028366089, + "num_tokens": 14846513.0, + "step": 1664 + }, + { + "epoch": 1.2651975683890577, + "grad_norm": 2.166055917739868, + "learning_rate": 3.3399441538547638e-06, + "loss": 0.4626024067401886, + "mean_token_accuracy": 0.8377980589866638, + "num_tokens": 14853408.0, + "step": 1665 + }, + { + "epoch": 1.2659574468085106, + "grad_norm": 2.23038911819458, + "learning_rate": 3.337971211513417e-06, + "loss": 0.38434159755706787, + "mean_token_accuracy": 0.8708412647247314, + "num_tokens": 14859919.0, + "step": 1666 + }, + { + "epoch": 1.2667173252279635, + "grad_norm": 2.092505693435669, + "learning_rate": 3.3359976810516164e-06, + "loss": 0.35072219371795654, + "mean_token_accuracy": 0.8761640191078186, + "num_tokens": 14865624.0, + "step": 1667 + }, + { + "epoch": 1.2674772036474165, + "grad_norm": 1.8255130052566528, + "learning_rate": 3.3340235638544633e-06, + "loss": 0.4404270648956299, + "mean_token_accuracy": 0.836356520652771, + "num_tokens": 14874181.0, + "step": 1668 + }, + { + "epoch": 1.2682370820668694, + "grad_norm": 1.9889036417007446, + "learning_rate": 3.332048861307467e-06, + "loss": 0.4199368357658386, + "mean_token_accuracy": 0.8508217334747314, + "num_tokens": 14882275.0, + "step": 1669 + }, + { + "epoch": 1.2689969604863223, + "grad_norm": 4.050281047821045, + "learning_rate": 3.330073574796551e-06, + "loss": 0.4271625280380249, + "mean_token_accuracy": 0.8471108675003052, + "num_tokens": 14893633.0, + "step": 1670 + }, + { + "epoch": 1.2697568389057752, + "grad_norm": 1.998838186264038, + "learning_rate": 3.328097705708047e-06, + "loss": 0.34743767976760864, + "mean_token_accuracy": 0.8771528005599976, + "num_tokens": 14899859.0, + "step": 1671 + }, + { + "epoch": 1.2705167173252279, + "grad_norm": 1.7989062070846558, + "learning_rate": 3.3261212554286977e-06, + "loss": 0.5267184376716614, + "mean_token_accuracy": 0.8323302268981934, + "num_tokens": 14911131.0, + "step": 1672 + }, + { + "epoch": 1.2712765957446808, + "grad_norm": 1.312070369720459, + "learning_rate": 3.324144225345649e-06, + "loss": 0.4675425887107849, + "mean_token_accuracy": 0.8157106637954712, + "num_tokens": 14928955.0, + "step": 1673 + }, + { + "epoch": 1.2720364741641337, + "grad_norm": 2.0547919273376465, + "learning_rate": 3.3221666168464584e-06, + "loss": 0.33704331517219543, + "mean_token_accuracy": 0.8621441125869751, + "num_tokens": 14935536.0, + "step": 1674 + }, + { + "epoch": 1.2727963525835866, + "grad_norm": 2.810413122177124, + "learning_rate": 3.320188431319088e-06, + "loss": 0.4007563292980194, + "mean_token_accuracy": 0.8649672269821167, + "num_tokens": 14940219.0, + "step": 1675 + }, + { + "epoch": 1.2735562310030395, + "grad_norm": 1.3516674041748047, + "learning_rate": 3.318209670151904e-06, + "loss": 0.3457040786743164, + "mean_token_accuracy": 0.8698287010192871, + "num_tokens": 14952904.0, + "step": 1676 + }, + { + "epoch": 1.2743161094224924, + "grad_norm": 2.440643310546875, + "learning_rate": 3.3162303347336765e-06, + "loss": 0.5195086002349854, + "mean_token_accuracy": 0.8348199129104614, + "num_tokens": 14958623.0, + "step": 1677 + }, + { + "epoch": 1.2750759878419453, + "grad_norm": 1.3264343738555908, + "learning_rate": 3.3142504264535808e-06, + "loss": 0.2990425229072571, + "mean_token_accuracy": 0.8961933851242065, + "num_tokens": 14971494.0, + "step": 1678 + }, + { + "epoch": 1.2758358662613982, + "grad_norm": 1.3106894493103027, + "learning_rate": 3.3122699467011913e-06, + "loss": 0.291853666305542, + "mean_token_accuracy": 0.893449068069458, + "num_tokens": 14985239.0, + "step": 1679 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 2.5387396812438965, + "learning_rate": 3.3102888968664857e-06, + "loss": 0.4336916208267212, + "mean_token_accuracy": 0.8447890877723694, + "num_tokens": 14991453.0, + "step": 1680 + }, + { + "epoch": 1.2773556231003038, + "grad_norm": 2.7052135467529297, + "learning_rate": 3.308307278339842e-06, + "loss": 0.3279378116130829, + "mean_token_accuracy": 0.8935879468917847, + "num_tokens": 14995428.0, + "step": 1681 + }, + { + "epoch": 1.278115501519757, + "grad_norm": 1.6251261234283447, + "learning_rate": 3.306325092512034e-06, + "loss": 0.32066458463668823, + "mean_token_accuracy": 0.8909799456596375, + "num_tokens": 15004841.0, + "step": 1682 + }, + { + "epoch": 1.2788753799392096, + "grad_norm": 2.3014605045318604, + "learning_rate": 3.3043423407742374e-06, + "loss": 0.3523373603820801, + "mean_token_accuracy": 0.8810735940933228, + "num_tokens": 15010742.0, + "step": 1683 + }, + { + "epoch": 1.2796352583586625, + "grad_norm": 2.9563019275665283, + "learning_rate": 3.3023590245180237e-06, + "loss": 0.39715707302093506, + "mean_token_accuracy": 0.8779881000518799, + "num_tokens": 15015357.0, + "step": 1684 + }, + { + "epoch": 1.2803951367781155, + "grad_norm": 1.5787957906723022, + "learning_rate": 3.300375145135361e-06, + "loss": 0.44630166888237, + "mean_token_accuracy": 0.8400174975395203, + "num_tokens": 15031360.0, + "step": 1685 + }, + { + "epoch": 1.2811550151975684, + "grad_norm": 1.6753438711166382, + "learning_rate": 3.2983907040186112e-06, + "loss": 0.3235800862312317, + "mean_token_accuracy": 0.8938044309616089, + "num_tokens": 15040276.0, + "step": 1686 + }, + { + "epoch": 1.2819148936170213, + "grad_norm": 1.7331148386001587, + "learning_rate": 3.296405702560532e-06, + "loss": 0.39061424136161804, + "mean_token_accuracy": 0.8599754571914673, + "num_tokens": 15049725.0, + "step": 1687 + }, + { + "epoch": 1.2826747720364742, + "grad_norm": 2.2029430866241455, + "learning_rate": 3.294420142154274e-06, + "loss": 0.43598297238349915, + "mean_token_accuracy": 0.8663698434829712, + "num_tokens": 15058182.0, + "step": 1688 + }, + { + "epoch": 1.283434650455927, + "grad_norm": 2.943964958190918, + "learning_rate": 3.29243402419338e-06, + "loss": 0.405210942029953, + "mean_token_accuracy": 0.854996919631958, + "num_tokens": 15062920.0, + "step": 1689 + }, + { + "epoch": 1.28419452887538, + "grad_norm": 1.9343379735946655, + "learning_rate": 3.2904473500717826e-06, + "loss": 0.35011449456214905, + "mean_token_accuracy": 0.8745867013931274, + "num_tokens": 15070298.0, + "step": 1690 + }, + { + "epoch": 1.284954407294833, + "grad_norm": 2.559859037399292, + "learning_rate": 3.2884601211838087e-06, + "loss": 0.38816407322883606, + "mean_token_accuracy": 0.854763388633728, + "num_tokens": 15075667.0, + "step": 1691 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4357839822769165, + "learning_rate": 3.2864723389241697e-06, + "loss": 0.4512745141983032, + "mean_token_accuracy": 0.8398592472076416, + "num_tokens": 15090291.0, + "step": 1692 + }, + { + "epoch": 1.2864741641337387, + "grad_norm": 1.7643728256225586, + "learning_rate": 3.284484004687969e-06, + "loss": 0.3536742627620697, + "mean_token_accuracy": 0.8726381063461304, + "num_tokens": 15099325.0, + "step": 1693 + }, + { + "epoch": 1.2872340425531914, + "grad_norm": 1.853173017501831, + "learning_rate": 3.2824951198706958e-06, + "loss": 0.36579740047454834, + "mean_token_accuracy": 0.8988048434257507, + "num_tokens": 15107090.0, + "step": 1694 + }, + { + "epoch": 1.2879939209726443, + "grad_norm": 1.6526862382888794, + "learning_rate": 3.280505685868226e-06, + "loss": 0.3853636682033539, + "mean_token_accuracy": 0.8743607997894287, + "num_tokens": 15117818.0, + "step": 1695 + }, + { + "epoch": 1.2887537993920972, + "grad_norm": 2.790398597717285, + "learning_rate": 3.278515704076821e-06, + "loss": 0.2707311511039734, + "mean_token_accuracy": 0.9034668803215027, + "num_tokens": 15121641.0, + "step": 1696 + }, + { + "epoch": 1.2895136778115501, + "grad_norm": 1.69557523727417, + "learning_rate": 3.276525175893126e-06, + "loss": 0.3707970082759857, + "mean_token_accuracy": 0.8617855906486511, + "num_tokens": 15130414.0, + "step": 1697 + }, + { + "epoch": 1.290273556231003, + "grad_norm": 1.1360478401184082, + "learning_rate": 3.274534102714172e-06, + "loss": 0.3368082344532013, + "mean_token_accuracy": 0.8781654834747314, + "num_tokens": 15148307.0, + "step": 1698 + }, + { + "epoch": 1.291033434650456, + "grad_norm": 1.5894653797149658, + "learning_rate": 3.272542485937369e-06, + "loss": 0.3870658278465271, + "mean_token_accuracy": 0.8830926418304443, + "num_tokens": 15161841.0, + "step": 1699 + }, + { + "epoch": 1.2917933130699089, + "grad_norm": 2.3735709190368652, + "learning_rate": 3.270550326960511e-06, + "loss": 0.3873991370201111, + "mean_token_accuracy": 0.8729057908058167, + "num_tokens": 15167733.0, + "step": 1700 + }, + { + "epoch": 1.2925531914893618, + "grad_norm": 1.3739598989486694, + "learning_rate": 3.268557627181772e-06, + "loss": 0.30831626057624817, + "mean_token_accuracy": 0.8695719242095947, + "num_tokens": 15180861.0, + "step": 1701 + }, + { + "epoch": 1.2933130699088147, + "grad_norm": 1.7526969909667969, + "learning_rate": 3.2665643879997054e-06, + "loss": 0.4716024398803711, + "mean_token_accuracy": 0.8303275108337402, + "num_tokens": 15191642.0, + "step": 1702 + }, + { + "epoch": 1.2940729483282674, + "grad_norm": 2.7866084575653076, + "learning_rate": 3.2645706108132426e-06, + "loss": 0.33337634801864624, + "mean_token_accuracy": 0.8790726065635681, + "num_tokens": 15196038.0, + "step": 1703 + }, + { + "epoch": 1.2948328267477205, + "grad_norm": 2.319765090942383, + "learning_rate": 3.2625762970216944e-06, + "loss": 0.3999716639518738, + "mean_token_accuracy": 0.8693568706512451, + "num_tokens": 15202075.0, + "step": 1704 + }, + { + "epoch": 1.2955927051671732, + "grad_norm": 3.18292498588562, + "learning_rate": 3.2605814480247454e-06, + "loss": 0.4579541087150574, + "mean_token_accuracy": 0.8516187071800232, + "num_tokens": 15206886.0, + "step": 1705 + }, + { + "epoch": 1.296352583586626, + "grad_norm": 2.1816933155059814, + "learning_rate": 3.258586065222459e-06, + "loss": 0.5198885202407837, + "mean_token_accuracy": 0.8170592784881592, + "num_tokens": 15214088.0, + "step": 1706 + }, + { + "epoch": 1.297112462006079, + "grad_norm": 1.9076340198516846, + "learning_rate": 3.2565901500152702e-06, + "loss": 0.49752360582351685, + "mean_token_accuracy": 0.8681992292404175, + "num_tokens": 15226046.0, + "step": 1707 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 2.0223331451416016, + "learning_rate": 3.2545937038039904e-06, + "loss": 0.4515793025493622, + "mean_token_accuracy": 0.8429619073867798, + "num_tokens": 15234993.0, + "step": 1708 + }, + { + "epoch": 1.2986322188449848, + "grad_norm": 2.5089669227600098, + "learning_rate": 3.2525967279898017e-06, + "loss": 0.43628376722335815, + "mean_token_accuracy": 0.8493682146072388, + "num_tokens": 15240575.0, + "step": 1709 + }, + { + "epoch": 1.2993920972644377, + "grad_norm": 2.8347091674804688, + "learning_rate": 3.2505992239742582e-06, + "loss": 0.25112441182136536, + "mean_token_accuracy": 0.908825159072876, + "num_tokens": 15244085.0, + "step": 1710 + }, + { + "epoch": 1.3001519756838906, + "grad_norm": 2.3157572746276855, + "learning_rate": 3.2486011931592863e-06, + "loss": 0.482818067073822, + "mean_token_accuracy": 0.8305923938751221, + "num_tokens": 15250377.0, + "step": 1711 + }, + { + "epoch": 1.3009118541033435, + "grad_norm": 3.169052839279175, + "learning_rate": 3.2466026369471804e-06, + "loss": 0.3493242561817169, + "mean_token_accuracy": 0.86913001537323, + "num_tokens": 15255041.0, + "step": 1712 + }, + { + "epoch": 1.3016717325227964, + "grad_norm": 1.4475083351135254, + "learning_rate": 3.2446035567406033e-06, + "loss": 0.4177290201187134, + "mean_token_accuracy": 0.8497589826583862, + "num_tokens": 15266946.0, + "step": 1713 + }, + { + "epoch": 1.3024316109422491, + "grad_norm": 1.6473008394241333, + "learning_rate": 3.2426039539425875e-06, + "loss": 0.5272886753082275, + "mean_token_accuracy": 0.8440133333206177, + "num_tokens": 15279263.0, + "step": 1714 + }, + { + "epoch": 1.3031914893617023, + "grad_norm": 2.3996543884277344, + "learning_rate": 3.240603829956531e-06, + "loss": 0.4272066652774811, + "mean_token_accuracy": 0.8495640754699707, + "num_tokens": 15285213.0, + "step": 1715 + }, + { + "epoch": 1.303951367781155, + "grad_norm": 1.63034987449646, + "learning_rate": 3.238603186186198e-06, + "loss": 0.4034635126590729, + "mean_token_accuracy": 0.8638584613800049, + "num_tokens": 15295974.0, + "step": 1716 + }, + { + "epoch": 1.3047112462006079, + "grad_norm": 2.153608798980713, + "learning_rate": 3.2366020240357166e-06, + "loss": 0.30712565779685974, + "mean_token_accuracy": 0.8863866329193115, + "num_tokens": 15302220.0, + "step": 1717 + }, + { + "epoch": 1.3054711246200608, + "grad_norm": 2.9814558029174805, + "learning_rate": 3.2346003449095803e-06, + "loss": 0.3922840356826782, + "mean_token_accuracy": 0.868030309677124, + "num_tokens": 15306747.0, + "step": 1718 + }, + { + "epoch": 1.3062310030395137, + "grad_norm": 3.3417985439300537, + "learning_rate": 3.2325981502126434e-06, + "loss": 0.30750396847724915, + "mean_token_accuracy": 0.9065356850624084, + "num_tokens": 15310309.0, + "step": 1719 + }, + { + "epoch": 1.3069908814589666, + "grad_norm": 2.237682819366455, + "learning_rate": 3.2305954413501252e-06, + "loss": 0.35068294405937195, + "mean_token_accuracy": 0.8887614011764526, + "num_tokens": 15316463.0, + "step": 1720 + }, + { + "epoch": 1.3077507598784195, + "grad_norm": 1.9526605606079102, + "learning_rate": 3.228592219727602e-06, + "loss": 0.42061835527420044, + "mean_token_accuracy": 0.8456839323043823, + "num_tokens": 15323984.0, + "step": 1721 + }, + { + "epoch": 1.3085106382978724, + "grad_norm": 1.6454212665557861, + "learning_rate": 3.226588486751012e-06, + "loss": 0.5189976692199707, + "mean_token_accuracy": 0.8187375068664551, + "num_tokens": 15338807.0, + "step": 1722 + }, + { + "epoch": 1.3092705167173253, + "grad_norm": 1.4521609544754028, + "learning_rate": 3.2245842438266526e-06, + "loss": 0.329673171043396, + "mean_token_accuracy": 0.853867769241333, + "num_tokens": 15350400.0, + "step": 1723 + }, + { + "epoch": 1.3100303951367782, + "grad_norm": 1.8750989437103271, + "learning_rate": 3.222579492361179e-06, + "loss": 0.4635341167449951, + "mean_token_accuracy": 0.8393422365188599, + "num_tokens": 15360557.0, + "step": 1724 + }, + { + "epoch": 1.310790273556231, + "grad_norm": 1.2728849649429321, + "learning_rate": 3.220574233761603e-06, + "loss": 0.3255572021007538, + "mean_token_accuracy": 0.8989741802215576, + "num_tokens": 15376548.0, + "step": 1725 + }, + { + "epoch": 1.3115501519756838, + "grad_norm": 3.5155694484710693, + "learning_rate": 3.2185684694352913e-06, + "loss": 0.34204089641571045, + "mean_token_accuracy": 0.8781906366348267, + "num_tokens": 15380304.0, + "step": 1726 + }, + { + "epoch": 1.3123100303951367, + "grad_norm": 2.059800148010254, + "learning_rate": 3.216562200789968e-06, + "loss": 0.36288338899612427, + "mean_token_accuracy": 0.8595278263092041, + "num_tokens": 15387653.0, + "step": 1727 + }, + { + "epoch": 1.3130699088145896, + "grad_norm": 3.5388240814208984, + "learning_rate": 3.214555429233707e-06, + "loss": 0.5434849858283997, + "mean_token_accuracy": 0.8074631690979004, + "num_tokens": 15391662.0, + "step": 1728 + }, + { + "epoch": 1.3138297872340425, + "grad_norm": 2.8595592975616455, + "learning_rate": 3.2125481561749406e-06, + "loss": 0.5113687515258789, + "mean_token_accuracy": 0.8448649644851685, + "num_tokens": 15397536.0, + "step": 1729 + }, + { + "epoch": 1.3145896656534954, + "grad_norm": 2.50386905670166, + "learning_rate": 3.210540383022449e-06, + "loss": 0.5293697118759155, + "mean_token_accuracy": 0.8096445798873901, + "num_tokens": 15403478.0, + "step": 1730 + }, + { + "epoch": 1.3153495440729484, + "grad_norm": 1.880035400390625, + "learning_rate": 3.208532111185365e-06, + "loss": 0.5344835519790649, + "mean_token_accuracy": 0.8172965049743652, + "num_tokens": 15413812.0, + "step": 1731 + }, + { + "epoch": 1.3161094224924013, + "grad_norm": 1.3688768148422241, + "learning_rate": 3.2065233420731717e-06, + "loss": 0.2577427327632904, + "mean_token_accuracy": 0.9142681360244751, + "num_tokens": 15423583.0, + "step": 1732 + }, + { + "epoch": 1.3168693009118542, + "grad_norm": 1.7945705652236938, + "learning_rate": 3.2045140770956987e-06, + "loss": 0.3983926773071289, + "mean_token_accuracy": 0.8652000427246094, + "num_tokens": 15432473.0, + "step": 1733 + }, + { + "epoch": 1.3176291793313069, + "grad_norm": 1.8243350982666016, + "learning_rate": 3.2025043176631283e-06, + "loss": 0.48644185066223145, + "mean_token_accuracy": 0.8319193124771118, + "num_tokens": 15445463.0, + "step": 1734 + }, + { + "epoch": 1.31838905775076, + "grad_norm": 2.000094175338745, + "learning_rate": 3.2004940651859844e-06, + "loss": 0.43567317724227905, + "mean_token_accuracy": 0.8857482671737671, + "num_tokens": 15452382.0, + "step": 1735 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 2.379974365234375, + "learning_rate": 3.198483321075141e-06, + "loss": 0.5153506398200989, + "mean_token_accuracy": 0.8295865654945374, + "num_tokens": 15458740.0, + "step": 1736 + }, + { + "epoch": 1.3199088145896656, + "grad_norm": 1.6564184427261353, + "learning_rate": 3.196472086741815e-06, + "loss": 0.508430540561676, + "mean_token_accuracy": 0.8181540369987488, + "num_tokens": 15471844.0, + "step": 1737 + }, + { + "epoch": 1.3206686930091185, + "grad_norm": 2.006925344467163, + "learning_rate": 3.194460363597569e-06, + "loss": 0.34542378783226013, + "mean_token_accuracy": 0.8827437162399292, + "num_tokens": 15478414.0, + "step": 1738 + }, + { + "epoch": 1.3214285714285714, + "grad_norm": 3.589045763015747, + "learning_rate": 3.192448153054306e-06, + "loss": 0.4385780096054077, + "mean_token_accuracy": 0.8480287790298462, + "num_tokens": 15482063.0, + "step": 1739 + }, + { + "epoch": 1.3221884498480243, + "grad_norm": 1.9797427654266357, + "learning_rate": 3.190435456524275e-06, + "loss": 0.4330386519432068, + "mean_token_accuracy": 0.8458058834075928, + "num_tokens": 15489803.0, + "step": 1740 + }, + { + "epoch": 1.3229483282674772, + "grad_norm": 1.4777411222457886, + "learning_rate": 3.188422275420063e-06, + "loss": 0.3997895419597626, + "mean_token_accuracy": 0.8639512062072754, + "num_tokens": 15501103.0, + "step": 1741 + }, + { + "epoch": 1.3237082066869301, + "grad_norm": 2.882338523864746, + "learning_rate": 3.186408611154597e-06, + "loss": 0.2336438149213791, + "mean_token_accuracy": 0.9176726937294006, + "num_tokens": 15504854.0, + "step": 1742 + }, + { + "epoch": 1.324468085106383, + "grad_norm": 2.353503704071045, + "learning_rate": 3.184394465141146e-06, + "loss": 0.4107069671154022, + "mean_token_accuracy": 0.8677014112472534, + "num_tokens": 15510662.0, + "step": 1743 + }, + { + "epoch": 1.325227963525836, + "grad_norm": 2.6551976203918457, + "learning_rate": 3.1823798387933134e-06, + "loss": 0.3862302899360657, + "mean_token_accuracy": 0.8819445371627808, + "num_tokens": 15515681.0, + "step": 1744 + }, + { + "epoch": 1.3259878419452886, + "grad_norm": 1.478572964668274, + "learning_rate": 3.180364733525043e-06, + "loss": 0.43972986936569214, + "mean_token_accuracy": 0.832388162612915, + "num_tokens": 15529542.0, + "step": 1745 + }, + { + "epoch": 1.3267477203647418, + "grad_norm": 1.6003550291061401, + "learning_rate": 3.178349150750612e-06, + "loss": 0.3404902219772339, + "mean_token_accuracy": 0.8764007091522217, + "num_tokens": 15538865.0, + "step": 1746 + }, + { + "epoch": 1.3275075987841944, + "grad_norm": 2.130689859390259, + "learning_rate": 3.1763330918846347e-06, + "loss": 0.383136510848999, + "mean_token_accuracy": 0.8652247190475464, + "num_tokens": 15545567.0, + "step": 1747 + }, + { + "epoch": 1.3282674772036474, + "grad_norm": 2.395937442779541, + "learning_rate": 3.1743165583420586e-06, + "loss": 0.3870319128036499, + "mean_token_accuracy": 0.8618065118789673, + "num_tokens": 15551090.0, + "step": 1748 + }, + { + "epoch": 1.3290273556231003, + "grad_norm": 2.0841057300567627, + "learning_rate": 3.1722995515381644e-06, + "loss": 0.4838739335536957, + "mean_token_accuracy": 0.8548711538314819, + "num_tokens": 15558913.0, + "step": 1749 + }, + { + "epoch": 1.3297872340425532, + "grad_norm": 1.4237847328186035, + "learning_rate": 3.1702820728885657e-06, + "loss": 0.40350261330604553, + "mean_token_accuracy": 0.858984649181366, + "num_tokens": 15572045.0, + "step": 1750 + }, + { + "epoch": 1.330547112462006, + "grad_norm": 2.2641282081604004, + "learning_rate": 3.1682641238092064e-06, + "loss": 0.5117636919021606, + "mean_token_accuracy": 0.8078924417495728, + "num_tokens": 15579753.0, + "step": 1751 + }, + { + "epoch": 1.331306990881459, + "grad_norm": 1.0010309219360352, + "learning_rate": 3.1662457057163603e-06, + "loss": 0.3220978379249573, + "mean_token_accuracy": 0.8786559104919434, + "num_tokens": 15602823.0, + "step": 1752 + }, + { + "epoch": 1.332066869300912, + "grad_norm": 2.441230535507202, + "learning_rate": 3.164226820026632e-06, + "loss": 0.37529727816581726, + "mean_token_accuracy": 0.8886898756027222, + "num_tokens": 15608473.0, + "step": 1753 + }, + { + "epoch": 1.3328267477203648, + "grad_norm": 1.2960991859436035, + "learning_rate": 3.162207468156952e-06, + "loss": 0.3393767476081848, + "mean_token_accuracy": 0.8766993284225464, + "num_tokens": 15620893.0, + "step": 1754 + }, + { + "epoch": 1.3335866261398177, + "grad_norm": 2.0806996822357178, + "learning_rate": 3.16018765152458e-06, + "loss": 0.38034507632255554, + "mean_token_accuracy": 0.8854838609695435, + "num_tokens": 15627068.0, + "step": 1755 + }, + { + "epoch": 1.3343465045592704, + "grad_norm": 1.4316699504852295, + "learning_rate": 3.1581673715471007e-06, + "loss": 0.3665890693664551, + "mean_token_accuracy": 0.870919406414032, + "num_tokens": 15641070.0, + "step": 1756 + }, + { + "epoch": 1.3351063829787235, + "grad_norm": 1.3466622829437256, + "learning_rate": 3.1561466296424247e-06, + "loss": 0.37387198209762573, + "mean_token_accuracy": 0.8633951544761658, + "num_tokens": 15653777.0, + "step": 1757 + }, + { + "epoch": 1.3358662613981762, + "grad_norm": 1.8108628988265991, + "learning_rate": 3.154125427228786e-06, + "loss": 0.38428938388824463, + "mean_token_accuracy": 0.85402512550354, + "num_tokens": 15662494.0, + "step": 1758 + }, + { + "epoch": 1.3366261398176291, + "grad_norm": 1.3221700191497803, + "learning_rate": 3.152103765724743e-06, + "loss": 0.42825520038604736, + "mean_token_accuracy": 0.8435465097427368, + "num_tokens": 15677552.0, + "step": 1759 + }, + { + "epoch": 1.337386018237082, + "grad_norm": 2.6247692108154297, + "learning_rate": 3.150081646549174e-06, + "loss": 0.36186715960502625, + "mean_token_accuracy": 0.8767328262329102, + "num_tokens": 15682103.0, + "step": 1760 + }, + { + "epoch": 1.338145896656535, + "grad_norm": 2.1469814777374268, + "learning_rate": 3.1480590711212823e-06, + "loss": 0.3734385669231415, + "mean_token_accuracy": 0.8711104393005371, + "num_tokens": 15689182.0, + "step": 1761 + }, + { + "epoch": 1.3389057750759878, + "grad_norm": 2.1702585220336914, + "learning_rate": 3.1460360408605866e-06, + "loss": 0.2795315086841583, + "mean_token_accuracy": 0.8892190456390381, + "num_tokens": 15694272.0, + "step": 1762 + }, + { + "epoch": 1.3396656534954408, + "grad_norm": 1.918797254562378, + "learning_rate": 3.144012557186931e-06, + "loss": 0.4363473057746887, + "mean_token_accuracy": 0.8573931455612183, + "num_tokens": 15703532.0, + "step": 1763 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 2.5579960346221924, + "learning_rate": 3.14198862152047e-06, + "loss": 0.406247079372406, + "mean_token_accuracy": 0.8617593050003052, + "num_tokens": 15708652.0, + "step": 1764 + }, + { + "epoch": 1.3411854103343466, + "grad_norm": 2.3617870807647705, + "learning_rate": 3.1399642352816825e-06, + "loss": 0.2839522659778595, + "mean_token_accuracy": 0.8996064066886902, + "num_tokens": 15713598.0, + "step": 1765 + }, + { + "epoch": 1.3419452887537995, + "grad_norm": 1.248302936553955, + "learning_rate": 3.1379393998913594e-06, + "loss": 0.2922290861606598, + "mean_token_accuracy": 0.8948773145675659, + "num_tokens": 15726693.0, + "step": 1766 + }, + { + "epoch": 1.3427051671732522, + "grad_norm": 2.143599510192871, + "learning_rate": 3.135914116770609e-06, + "loss": 0.32176223397254944, + "mean_token_accuracy": 0.8808754682540894, + "num_tokens": 15731901.0, + "step": 1767 + }, + { + "epoch": 1.3434650455927053, + "grad_norm": 4.226369857788086, + "learning_rate": 3.1338883873408517e-06, + "loss": 0.4682556390762329, + "mean_token_accuracy": 0.8566025495529175, + "num_tokens": 15735029.0, + "step": 1768 + }, + { + "epoch": 1.344224924012158, + "grad_norm": 1.8695988655090332, + "learning_rate": 3.1318622130238237e-06, + "loss": 0.4297192394733429, + "mean_token_accuracy": 0.8419148921966553, + "num_tokens": 15744310.0, + "step": 1769 + }, + { + "epoch": 1.344984802431611, + "grad_norm": 2.4321305751800537, + "learning_rate": 3.1298355952415714e-06, + "loss": 0.36076444387435913, + "mean_token_accuracy": 0.8826035261154175, + "num_tokens": 15749337.0, + "step": 1770 + }, + { + "epoch": 1.3457446808510638, + "grad_norm": 1.5500011444091797, + "learning_rate": 3.127808535416454e-06, + "loss": 0.48664039373397827, + "mean_token_accuracy": 0.844344437122345, + "num_tokens": 15761096.0, + "step": 1771 + }, + { + "epoch": 1.3465045592705167, + "grad_norm": 2.1498289108276367, + "learning_rate": 3.1257810349711388e-06, + "loss": 0.4841752052307129, + "mean_token_accuracy": 0.8324567079544067, + "num_tokens": 15768646.0, + "step": 1772 + }, + { + "epoch": 1.3472644376899696, + "grad_norm": 1.2995187044143677, + "learning_rate": 3.1237530953286046e-06, + "loss": 0.492019385099411, + "mean_token_accuracy": 0.8285316228866577, + "num_tokens": 15788401.0, + "step": 1773 + }, + { + "epoch": 1.3480243161094225, + "grad_norm": 2.324819803237915, + "learning_rate": 3.121724717912138e-06, + "loss": 0.33166298270225525, + "mean_token_accuracy": 0.8856451511383057, + "num_tokens": 15794097.0, + "step": 1774 + }, + { + "epoch": 1.3487841945288754, + "grad_norm": 1.9611430168151855, + "learning_rate": 3.11969590414533e-06, + "loss": 0.3974284827709198, + "mean_token_accuracy": 0.8751305937767029, + "num_tokens": 15801065.0, + "step": 1775 + }, + { + "epoch": 1.3495440729483283, + "grad_norm": 1.7084417343139648, + "learning_rate": 3.1176666554520827e-06, + "loss": 0.38729435205459595, + "mean_token_accuracy": 0.8680770397186279, + "num_tokens": 15810353.0, + "step": 1776 + }, + { + "epoch": 1.3503039513677813, + "grad_norm": 1.7616240978240967, + "learning_rate": 3.1156369732566006e-06, + "loss": 0.4271578788757324, + "mean_token_accuracy": 0.843730092048645, + "num_tokens": 15821889.0, + "step": 1777 + }, + { + "epoch": 1.351063829787234, + "grad_norm": 2.030747413635254, + "learning_rate": 3.113606858983391e-06, + "loss": 0.361891508102417, + "mean_token_accuracy": 0.8522407412528992, + "num_tokens": 15830800.0, + "step": 1778 + }, + { + "epoch": 1.3518237082066868, + "grad_norm": 1.4842649698257446, + "learning_rate": 3.1115763140572686e-06, + "loss": 0.466334730386734, + "mean_token_accuracy": 0.8433995246887207, + "num_tokens": 15849422.0, + "step": 1779 + }, + { + "epoch": 1.3525835866261398, + "grad_norm": 1.6595379114151, + "learning_rate": 3.109545339903347e-06, + "loss": 0.4622533321380615, + "mean_token_accuracy": 0.8526314496994019, + "num_tokens": 15860431.0, + "step": 1780 + }, + { + "epoch": 1.3533434650455927, + "grad_norm": 2.1235809326171875, + "learning_rate": 3.107513937947041e-06, + "loss": 0.42694270610809326, + "mean_token_accuracy": 0.854864239692688, + "num_tokens": 15869044.0, + "step": 1781 + }, + { + "epoch": 1.3541033434650456, + "grad_norm": 1.5889263153076172, + "learning_rate": 3.1054821096140675e-06, + "loss": 0.41838499903678894, + "mean_token_accuracy": 0.8671513795852661, + "num_tokens": 15878598.0, + "step": 1782 + }, + { + "epoch": 1.3548632218844985, + "grad_norm": 2.2261741161346436, + "learning_rate": 3.1034498563304435e-06, + "loss": 0.4045066237449646, + "mean_token_accuracy": 0.843826949596405, + "num_tokens": 15885167.0, + "step": 1783 + }, + { + "epoch": 1.3556231003039514, + "grad_norm": 2.2569329738616943, + "learning_rate": 3.1014171795224794e-06, + "loss": 0.36677104234695435, + "mean_token_accuracy": 0.8747833967208862, + "num_tokens": 15891308.0, + "step": 1784 + }, + { + "epoch": 1.3563829787234043, + "grad_norm": 2.1027088165283203, + "learning_rate": 3.0993840806167884e-06, + "loss": 0.437946081161499, + "mean_token_accuracy": 0.8370785117149353, + "num_tokens": 15898952.0, + "step": 1785 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 1.8768929243087769, + "learning_rate": 3.0973505610402767e-06, + "loss": 0.4201734662055969, + "mean_token_accuracy": 0.8474810123443604, + "num_tokens": 15907340.0, + "step": 1786 + }, + { + "epoch": 1.35790273556231, + "grad_norm": 1.7216229438781738, + "learning_rate": 3.0953166222201474e-06, + "loss": 0.4225231409072876, + "mean_token_accuracy": 0.8437749147415161, + "num_tokens": 15917852.0, + "step": 1787 + }, + { + "epoch": 1.358662613981763, + "grad_norm": 2.6256966590881348, + "learning_rate": 3.093282265583895e-06, + "loss": 0.435439795255661, + "mean_token_accuracy": 0.8452040553092957, + "num_tokens": 15923739.0, + "step": 1788 + }, + { + "epoch": 1.3594224924012157, + "grad_norm": 2.90028977394104, + "learning_rate": 3.0912474925593124e-06, + "loss": 0.3730456829071045, + "mean_token_accuracy": 0.8766646385192871, + "num_tokens": 15927943.0, + "step": 1789 + }, + { + "epoch": 1.3601823708206686, + "grad_norm": 1.5966626405715942, + "learning_rate": 3.0892123045744787e-06, + "loss": 0.42150455713272095, + "mean_token_accuracy": 0.854656457901001, + "num_tokens": 15939922.0, + "step": 1790 + }, + { + "epoch": 1.3609422492401215, + "grad_norm": 1.8069748878479004, + "learning_rate": 3.0871767030577686e-06, + "loss": 0.4954872131347656, + "mean_token_accuracy": 0.8289790153503418, + "num_tokens": 15950095.0, + "step": 1791 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 2.0855250358581543, + "learning_rate": 3.085140689437846e-06, + "loss": 0.41999945044517517, + "mean_token_accuracy": 0.8517382144927979, + "num_tokens": 15957972.0, + "step": 1792 + }, + { + "epoch": 1.3624620060790273, + "grad_norm": 2.108659267425537, + "learning_rate": 3.0831042651436634e-06, + "loss": 0.3668023645877838, + "mean_token_accuracy": 0.8710855841636658, + "num_tokens": 15965614.0, + "step": 1793 + }, + { + "epoch": 1.3632218844984803, + "grad_norm": 1.3799632787704468, + "learning_rate": 3.0810674316044602e-06, + "loss": 0.351409375667572, + "mean_token_accuracy": 0.870837390422821, + "num_tokens": 15978854.0, + "step": 1794 + }, + { + "epoch": 1.3639817629179332, + "grad_norm": 1.540397047996521, + "learning_rate": 3.0790301902497664e-06, + "loss": 0.403600811958313, + "mean_token_accuracy": 0.8485002517700195, + "num_tokens": 15993324.0, + "step": 1795 + }, + { + "epoch": 1.364741641337386, + "grad_norm": 1.946882963180542, + "learning_rate": 3.076992542509396e-06, + "loss": 0.40118327736854553, + "mean_token_accuracy": 0.8607497811317444, + "num_tokens": 16001937.0, + "step": 1796 + }, + { + "epoch": 1.365501519756839, + "grad_norm": 2.0464305877685547, + "learning_rate": 3.0749544898134487e-06, + "loss": 0.31742292642593384, + "mean_token_accuracy": 0.8878391981124878, + "num_tokens": 16009277.0, + "step": 1797 + }, + { + "epoch": 1.3662613981762917, + "grad_norm": 2.091754913330078, + "learning_rate": 3.072916033592307e-06, + "loss": 0.31580421328544617, + "mean_token_accuracy": 0.8875244855880737, + "num_tokens": 16015756.0, + "step": 1798 + }, + { + "epoch": 1.3670212765957448, + "grad_norm": 3.4449212551116943, + "learning_rate": 3.0708771752766397e-06, + "loss": 0.4692591726779938, + "mean_token_accuracy": 0.8456202149391174, + "num_tokens": 16019912.0, + "step": 1799 + }, + { + "epoch": 1.3677811550151975, + "grad_norm": 1.600419521331787, + "learning_rate": 3.068837916297396e-06, + "loss": 0.40389442443847656, + "mean_token_accuracy": 0.8378961086273193, + "num_tokens": 16032637.0, + "step": 1800 + }, + { + "epoch": 1.3685410334346504, + "grad_norm": 1.5282686948776245, + "learning_rate": 3.0667982580858047e-06, + "loss": 0.379841685295105, + "mean_token_accuracy": 0.8752143383026123, + "num_tokens": 16045205.0, + "step": 1801 + }, + { + "epoch": 1.3693009118541033, + "grad_norm": 2.486079454421997, + "learning_rate": 3.0647582020733773e-06, + "loss": 0.41060030460357666, + "mean_token_accuracy": 0.8575131893157959, + "num_tokens": 16051189.0, + "step": 1802 + }, + { + "epoch": 1.3700607902735562, + "grad_norm": 1.9458621740341187, + "learning_rate": 3.062717749691904e-06, + "loss": 0.4442213773727417, + "mean_token_accuracy": 0.8451495170593262, + "num_tokens": 16059700.0, + "step": 1803 + }, + { + "epoch": 1.3708206686930091, + "grad_norm": 1.4333001375198364, + "learning_rate": 3.0606769023734535e-06, + "loss": 0.39132001996040344, + "mean_token_accuracy": 0.8609901666641235, + "num_tokens": 16072458.0, + "step": 1804 + }, + { + "epoch": 1.371580547112462, + "grad_norm": 1.490355372428894, + "learning_rate": 3.0586356615503693e-06, + "loss": 0.4108564257621765, + "mean_token_accuracy": 0.8871046304702759, + "num_tokens": 16083142.0, + "step": 1805 + }, + { + "epoch": 1.372340425531915, + "grad_norm": 1.7765129804611206, + "learning_rate": 3.056594028655274e-06, + "loss": 0.3850266635417938, + "mean_token_accuracy": 0.8923365473747253, + "num_tokens": 16092519.0, + "step": 1806 + }, + { + "epoch": 1.3731003039513678, + "grad_norm": 1.955661416053772, + "learning_rate": 3.0545520051210637e-06, + "loss": 0.4665378928184509, + "mean_token_accuracy": 0.837419867515564, + "num_tokens": 16100618.0, + "step": 1807 + }, + { + "epoch": 1.3738601823708207, + "grad_norm": 3.259265422821045, + "learning_rate": 3.052509592380909e-06, + "loss": 0.24722981452941895, + "mean_token_accuracy": 0.9106054306030273, + "num_tokens": 16103836.0, + "step": 1808 + }, + { + "epoch": 1.3746200607902734, + "grad_norm": 1.7995736598968506, + "learning_rate": 3.050466791868254e-06, + "loss": 0.4982220530509949, + "mean_token_accuracy": 0.8298169374465942, + "num_tokens": 16114727.0, + "step": 1809 + }, + { + "epoch": 1.3753799392097266, + "grad_norm": 1.9643093347549438, + "learning_rate": 3.048423605016815e-06, + "loss": 0.5076829195022583, + "mean_token_accuracy": 0.8303098678588867, + "num_tokens": 16129491.0, + "step": 1810 + }, + { + "epoch": 1.3761398176291793, + "grad_norm": 3.505594491958618, + "learning_rate": 3.0463800332605787e-06, + "loss": 0.27466052770614624, + "mean_token_accuracy": 0.9018045663833618, + "num_tokens": 16132640.0, + "step": 1811 + }, + { + "epoch": 1.3768996960486322, + "grad_norm": 1.798437237739563, + "learning_rate": 3.0443360780338034e-06, + "loss": 0.4004853069782257, + "mean_token_accuracy": 0.8569544553756714, + "num_tokens": 16143317.0, + "step": 1812 + }, + { + "epoch": 1.377659574468085, + "grad_norm": 2.276740789413452, + "learning_rate": 3.042291740771014e-06, + "loss": 0.3823797106742859, + "mean_token_accuracy": 0.8764113783836365, + "num_tokens": 16148898.0, + "step": 1813 + }, + { + "epoch": 1.378419452887538, + "grad_norm": 2.5051357746124268, + "learning_rate": 3.0402470229070057e-06, + "loss": 0.40365856885910034, + "mean_token_accuracy": 0.8809891939163208, + "num_tokens": 16153815.0, + "step": 1814 + }, + { + "epoch": 1.3791793313069909, + "grad_norm": 1.2379236221313477, + "learning_rate": 3.03820192587684e-06, + "loss": 0.3955119848251343, + "mean_token_accuracy": 0.8536627292633057, + "num_tokens": 16167783.0, + "step": 1815 + }, + { + "epoch": 1.3799392097264438, + "grad_norm": 2.2286343574523926, + "learning_rate": 3.036156451115846e-06, + "loss": 0.39647501707077026, + "mean_token_accuracy": 0.8621993064880371, + "num_tokens": 16174707.0, + "step": 1816 + }, + { + "epoch": 1.3806990881458967, + "grad_norm": 1.884639024734497, + "learning_rate": 3.034110600059616e-06, + "loss": 0.31612110137939453, + "mean_token_accuracy": 0.8942475318908691, + "num_tokens": 16181919.0, + "step": 1817 + }, + { + "epoch": 1.3814589665653496, + "grad_norm": 1.891312599182129, + "learning_rate": 3.0320643741440052e-06, + "loss": 0.46209126710891724, + "mean_token_accuracy": 0.8374713659286499, + "num_tokens": 16189276.0, + "step": 1818 + }, + { + "epoch": 1.3822188449848025, + "grad_norm": 2.507478713989258, + "learning_rate": 3.0300177748051375e-06, + "loss": 0.37601593136787415, + "mean_token_accuracy": 0.8633589148521423, + "num_tokens": 16194346.0, + "step": 1819 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 1.5046696662902832, + "learning_rate": 3.0279708034793907e-06, + "loss": 0.3284982144832611, + "mean_token_accuracy": 0.8792630434036255, + "num_tokens": 16205457.0, + "step": 1820 + }, + { + "epoch": 1.3837386018237083, + "grad_norm": 2.4244449138641357, + "learning_rate": 3.025923461603412e-06, + "loss": 0.40939009189605713, + "mean_token_accuracy": 0.8596426248550415, + "num_tokens": 16211866.0, + "step": 1821 + }, + { + "epoch": 1.384498480243161, + "grad_norm": 2.8656933307647705, + "learning_rate": 3.0238757506141013e-06, + "loss": 0.4397110044956207, + "mean_token_accuracy": 0.8597331047058105, + "num_tokens": 16216607.0, + "step": 1822 + }, + { + "epoch": 1.385258358662614, + "grad_norm": 2.0718610286712646, + "learning_rate": 3.0218276719486245e-06, + "loss": 0.49057573080062866, + "mean_token_accuracy": 0.8325331211090088, + "num_tokens": 16224014.0, + "step": 1823 + }, + { + "epoch": 1.3860182370820668, + "grad_norm": 1.054450273513794, + "learning_rate": 3.019779227044398e-06, + "loss": 0.3758106827735901, + "mean_token_accuracy": 0.8689473867416382, + "num_tokens": 16248627.0, + "step": 1824 + }, + { + "epoch": 1.3867781155015197, + "grad_norm": 2.1115148067474365, + "learning_rate": 3.0177304173391038e-06, + "loss": 0.502967119216919, + "mean_token_accuracy": 0.823198676109314, + "num_tokens": 16256255.0, + "step": 1825 + }, + { + "epoch": 1.3875379939209727, + "grad_norm": 2.207277297973633, + "learning_rate": 3.015681244270672e-06, + "loss": 0.3458971083164215, + "mean_token_accuracy": 0.8930196762084961, + "num_tokens": 16261823.0, + "step": 1826 + }, + { + "epoch": 1.3882978723404256, + "grad_norm": 1.289669156074524, + "learning_rate": 3.0136317092772923e-06, + "loss": 0.4422765374183655, + "mean_token_accuracy": 0.8358346819877625, + "num_tokens": 16280659.0, + "step": 1827 + }, + { + "epoch": 1.3890577507598785, + "grad_norm": 2.233865737915039, + "learning_rate": 3.0115818137974066e-06, + "loss": 0.3643006384372711, + "mean_token_accuracy": 0.8682862520217896, + "num_tokens": 16286356.0, + "step": 1828 + }, + { + "epoch": 1.3898176291793314, + "grad_norm": 1.0950042009353638, + "learning_rate": 3.0095315592697126e-06, + "loss": 0.34712421894073486, + "mean_token_accuracy": 0.8578766584396362, + "num_tokens": 16307298.0, + "step": 1829 + }, + { + "epoch": 1.3905775075987843, + "grad_norm": 1.1708037853240967, + "learning_rate": 3.007480947133155e-06, + "loss": 0.33152541518211365, + "mean_token_accuracy": 0.894973874092102, + "num_tokens": 16323232.0, + "step": 1830 + }, + { + "epoch": 1.391337386018237, + "grad_norm": 1.2226970195770264, + "learning_rate": 3.0054299788269343e-06, + "loss": 0.3915635943412781, + "mean_token_accuracy": 0.8575779795646667, + "num_tokens": 16339273.0, + "step": 1831 + }, + { + "epoch": 1.39209726443769, + "grad_norm": 1.2226042747497559, + "learning_rate": 3.0033786557904982e-06, + "loss": 0.45846253633499146, + "mean_token_accuracy": 0.8290432691574097, + "num_tokens": 16360145.0, + "step": 1832 + }, + { + "epoch": 1.3928571428571428, + "grad_norm": 2.0117406845092773, + "learning_rate": 3.001326979463545e-06, + "loss": 0.3837882876396179, + "mean_token_accuracy": 0.8941739797592163, + "num_tokens": 16366602.0, + "step": 1833 + }, + { + "epoch": 1.3936170212765957, + "grad_norm": 1.8419997692108154, + "learning_rate": 2.9992749512860177e-06, + "loss": 0.40777021646499634, + "mean_token_accuracy": 0.854655385017395, + "num_tokens": 16375611.0, + "step": 1834 + }, + { + "epoch": 1.3943768996960486, + "grad_norm": 1.9405122995376587, + "learning_rate": 2.9972225726981114e-06, + "loss": 0.46685922145843506, + "mean_token_accuracy": 0.8493201732635498, + "num_tokens": 16384878.0, + "step": 1835 + }, + { + "epoch": 1.3951367781155015, + "grad_norm": 1.2425674200057983, + "learning_rate": 2.995169845140264e-06, + "loss": 0.394692063331604, + "mean_token_accuracy": 0.851348876953125, + "num_tokens": 16404452.0, + "step": 1836 + }, + { + "epoch": 1.3958966565349544, + "grad_norm": 1.2215365171432495, + "learning_rate": 2.9931167700531575e-06, + "loss": 0.31412452459335327, + "mean_token_accuracy": 0.882760763168335, + "num_tokens": 16419358.0, + "step": 1837 + }, + { + "epoch": 1.3966565349544073, + "grad_norm": 1.912168025970459, + "learning_rate": 2.9910633488777198e-06, + "loss": 0.5065487623214722, + "mean_token_accuracy": 0.8524355292320251, + "num_tokens": 16430418.0, + "step": 1838 + }, + { + "epoch": 1.3974164133738602, + "grad_norm": 2.2173948287963867, + "learning_rate": 2.989009583055121e-06, + "loss": 0.4290938377380371, + "mean_token_accuracy": 0.8381836414337158, + "num_tokens": 16438267.0, + "step": 1839 + }, + { + "epoch": 1.3981762917933132, + "grad_norm": 1.8293484449386597, + "learning_rate": 2.9869554740267726e-06, + "loss": 0.41683733463287354, + "mean_token_accuracy": 0.8548779487609863, + "num_tokens": 16447382.0, + "step": 1840 + }, + { + "epoch": 1.398936170212766, + "grad_norm": 1.835015892982483, + "learning_rate": 2.9849010232343274e-06, + "loss": 0.5080599784851074, + "mean_token_accuracy": 0.8193596601486206, + "num_tokens": 16458541.0, + "step": 1841 + }, + { + "epoch": 1.3996960486322187, + "grad_norm": 2.031339645385742, + "learning_rate": 2.982846232119679e-06, + "loss": 0.5168882012367249, + "mean_token_accuracy": 0.8525956869125366, + "num_tokens": 16467747.0, + "step": 1842 + }, + { + "epoch": 1.4004559270516717, + "grad_norm": 1.5554167032241821, + "learning_rate": 2.9807911021249573e-06, + "loss": 0.35098958015441895, + "mean_token_accuracy": 0.888373851776123, + "num_tokens": 16479319.0, + "step": 1843 + }, + { + "epoch": 1.4012158054711246, + "grad_norm": 1.7183740139007568, + "learning_rate": 2.9787356346925327e-06, + "loss": 0.41263148188591003, + "mean_token_accuracy": 0.8478364944458008, + "num_tokens": 16489952.0, + "step": 1844 + }, + { + "epoch": 1.4019756838905775, + "grad_norm": 1.7743209600448608, + "learning_rate": 2.9766798312650112e-06, + "loss": 0.4211183190345764, + "mean_token_accuracy": 0.8641136884689331, + "num_tokens": 16498655.0, + "step": 1845 + }, + { + "epoch": 1.4027355623100304, + "grad_norm": 2.141300916671753, + "learning_rate": 2.9746236932852355e-06, + "loss": 0.49548980593681335, + "mean_token_accuracy": 0.8304252028465271, + "num_tokens": 16506348.0, + "step": 1846 + }, + { + "epoch": 1.4034954407294833, + "grad_norm": 2.341571807861328, + "learning_rate": 2.9725672221962804e-06, + "loss": 0.40804803371429443, + "mean_token_accuracy": 0.8545800447463989, + "num_tokens": 16513091.0, + "step": 1847 + }, + { + "epoch": 1.4042553191489362, + "grad_norm": 1.934428095817566, + "learning_rate": 2.9705104194414587e-06, + "loss": 0.30029812455177307, + "mean_token_accuracy": 0.9032052755355835, + "num_tokens": 16519455.0, + "step": 1848 + }, + { + "epoch": 1.405015197568389, + "grad_norm": 1.420804500579834, + "learning_rate": 2.9684532864643123e-06, + "loss": 0.4384060502052307, + "mean_token_accuracy": 0.8465110063552856, + "num_tokens": 16533222.0, + "step": 1849 + }, + { + "epoch": 1.405775075987842, + "grad_norm": 2.1180737018585205, + "learning_rate": 2.9663958247086165e-06, + "loss": 0.3915565609931946, + "mean_token_accuracy": 0.8633890748023987, + "num_tokens": 16539489.0, + "step": 1850 + }, + { + "epoch": 1.4065349544072947, + "grad_norm": 1.408048152923584, + "learning_rate": 2.964338035618378e-06, + "loss": 0.46166157722473145, + "mean_token_accuracy": 0.8305013179779053, + "num_tokens": 16555785.0, + "step": 1851 + }, + { + "epoch": 1.4072948328267478, + "grad_norm": 1.3418530225753784, + "learning_rate": 2.9622799206378306e-06, + "loss": 0.5314373970031738, + "mean_token_accuracy": 0.81779944896698, + "num_tokens": 16578111.0, + "step": 1852 + }, + { + "epoch": 1.4080547112462005, + "grad_norm": 1.4634262323379517, + "learning_rate": 2.9602214812114414e-06, + "loss": 0.4859408140182495, + "mean_token_accuracy": 0.8261818885803223, + "num_tokens": 16591976.0, + "step": 1853 + }, + { + "epoch": 1.4088145896656534, + "grad_norm": 1.4840295314788818, + "learning_rate": 2.9581627187838997e-06, + "loss": 0.4079628586769104, + "mean_token_accuracy": 0.8549603223800659, + "num_tokens": 16603631.0, + "step": 1854 + }, + { + "epoch": 1.4095744680851063, + "grad_norm": 2.1474642753601074, + "learning_rate": 2.956103634800126e-06, + "loss": 0.32997995615005493, + "mean_token_accuracy": 0.8836915493011475, + "num_tokens": 16609875.0, + "step": 1855 + }, + { + "epoch": 1.4103343465045592, + "grad_norm": 2.627460241317749, + "learning_rate": 2.9540442307052643e-06, + "loss": 0.3229186236858368, + "mean_token_accuracy": 0.8852157592773438, + "num_tokens": 16614113.0, + "step": 1856 + }, + { + "epoch": 1.4110942249240122, + "grad_norm": 1.9569811820983887, + "learning_rate": 2.9519845079446824e-06, + "loss": 0.5057883858680725, + "mean_token_accuracy": 0.8585711717605591, + "num_tokens": 16624611.0, + "step": 1857 + }, + { + "epoch": 1.411854103343465, + "grad_norm": 2.0604090690612793, + "learning_rate": 2.949924467963975e-06, + "loss": 0.4681510329246521, + "mean_token_accuracy": 0.8390560150146484, + "num_tokens": 16632938.0, + "step": 1858 + }, + { + "epoch": 1.412613981762918, + "grad_norm": 2.5430450439453125, + "learning_rate": 2.9478641122089563e-06, + "loss": 0.3090999126434326, + "mean_token_accuracy": 0.8943990468978882, + "num_tokens": 16637135.0, + "step": 1859 + }, + { + "epoch": 1.4133738601823709, + "grad_norm": 1.3275387287139893, + "learning_rate": 2.945803442125663e-06, + "loss": 0.3592180013656616, + "mean_token_accuracy": 0.8678265810012817, + "num_tokens": 16650322.0, + "step": 1860 + }, + { + "epoch": 1.4141337386018238, + "grad_norm": 1.9070929288864136, + "learning_rate": 2.943742459160354e-06, + "loss": 0.5332518815994263, + "mean_token_accuracy": 0.8475706577301025, + "num_tokens": 16660240.0, + "step": 1861 + }, + { + "epoch": 1.4148936170212765, + "grad_norm": 2.8724546432495117, + "learning_rate": 2.9416811647595052e-06, + "loss": 0.5052884817123413, + "mean_token_accuracy": 0.8363175392150879, + "num_tokens": 16665481.0, + "step": 1862 + }, + { + "epoch": 1.4156534954407296, + "grad_norm": 4.203817844390869, + "learning_rate": 2.939619560369813e-06, + "loss": 0.546925961971283, + "mean_token_accuracy": 0.834044337272644, + "num_tokens": 16669615.0, + "step": 1863 + }, + { + "epoch": 1.4164133738601823, + "grad_norm": 1.6466281414031982, + "learning_rate": 2.9375576474381907e-06, + "loss": 0.3474533259868622, + "mean_token_accuracy": 0.8571163415908813, + "num_tokens": 16678893.0, + "step": 1864 + }, + { + "epoch": 1.4171732522796352, + "grad_norm": 1.8885842561721802, + "learning_rate": 2.9354954274117683e-06, + "loss": 0.3726021349430084, + "mean_token_accuracy": 0.8629094958305359, + "num_tokens": 16685939.0, + "step": 1865 + }, + { + "epoch": 1.417933130699088, + "grad_norm": 2.830599784851074, + "learning_rate": 2.9334329017378898e-06, + "loss": 0.4138668477535248, + "mean_token_accuracy": 0.8670746088027954, + "num_tokens": 16690012.0, + "step": 1866 + }, + { + "epoch": 1.418693009118541, + "grad_norm": 1.6838961839675903, + "learning_rate": 2.9313700718641167e-06, + "loss": 0.33954259753227234, + "mean_token_accuracy": 0.8660278916358948, + "num_tokens": 16700061.0, + "step": 1867 + }, + { + "epoch": 1.419452887537994, + "grad_norm": 2.8767011165618896, + "learning_rate": 2.9293069392382224e-06, + "loss": 0.4650302827358246, + "mean_token_accuracy": 0.8448452949523926, + "num_tokens": 16705072.0, + "step": 1868 + }, + { + "epoch": 1.4202127659574468, + "grad_norm": 1.5901305675506592, + "learning_rate": 2.927243505308192e-06, + "loss": 0.40838998556137085, + "mean_token_accuracy": 0.8560664653778076, + "num_tokens": 16714763.0, + "step": 1869 + }, + { + "epoch": 1.4209726443768997, + "grad_norm": 1.3293657302856445, + "learning_rate": 2.925179771522223e-06, + "loss": 0.34712862968444824, + "mean_token_accuracy": 0.8633697032928467, + "num_tokens": 16729575.0, + "step": 1870 + }, + { + "epoch": 1.4217325227963526, + "grad_norm": 1.7465964555740356, + "learning_rate": 2.9231157393287234e-06, + "loss": 0.48190903663635254, + "mean_token_accuracy": 0.8255834579467773, + "num_tokens": 16742529.0, + "step": 1871 + }, + { + "epoch": 1.4224924012158056, + "grad_norm": 1.865749716758728, + "learning_rate": 2.9210514101763116e-06, + "loss": 0.4912028908729553, + "mean_token_accuracy": 0.8309572339057922, + "num_tokens": 16753989.0, + "step": 1872 + }, + { + "epoch": 1.4232522796352582, + "grad_norm": 2.55780291557312, + "learning_rate": 2.9189867855138103e-06, + "loss": 0.4550635814666748, + "mean_token_accuracy": 0.8584091067314148, + "num_tokens": 16758906.0, + "step": 1873 + }, + { + "epoch": 1.4240121580547114, + "grad_norm": 1.867530107498169, + "learning_rate": 2.9169218667902562e-06, + "loss": 0.3524911105632782, + "mean_token_accuracy": 0.8715004920959473, + "num_tokens": 16765969.0, + "step": 1874 + }, + { + "epoch": 1.424772036474164, + "grad_norm": 1.8886862993240356, + "learning_rate": 2.9148566554548857e-06, + "loss": 0.37144535779953003, + "mean_token_accuracy": 0.8640961050987244, + "num_tokens": 16773935.0, + "step": 1875 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 1.266065239906311, + "learning_rate": 2.912791152957145e-06, + "loss": 0.3341747522354126, + "mean_token_accuracy": 0.8929134607315063, + "num_tokens": 16787780.0, + "step": 1876 + }, + { + "epoch": 1.4262917933130699, + "grad_norm": 2.524888753890991, + "learning_rate": 2.9107253607466833e-06, + "loss": 0.33709171414375305, + "mean_token_accuracy": 0.8857531547546387, + "num_tokens": 16792753.0, + "step": 1877 + }, + { + "epoch": 1.4270516717325228, + "grad_norm": 1.9269018173217773, + "learning_rate": 2.908659280273354e-06, + "loss": 0.32599249482154846, + "mean_token_accuracy": 0.8777773380279541, + "num_tokens": 16799904.0, + "step": 1878 + }, + { + "epoch": 1.4278115501519757, + "grad_norm": 1.9844375848770142, + "learning_rate": 2.9065929129872097e-06, + "loss": 0.4086732268333435, + "mean_token_accuracy": 0.8505409955978394, + "num_tokens": 16807774.0, + "step": 1879 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 4.0958662033081055, + "learning_rate": 2.9045262603385073e-06, + "loss": 0.3838827610015869, + "mean_token_accuracy": 0.877601146697998, + "num_tokens": 16810908.0, + "step": 1880 + }, + { + "epoch": 1.4293313069908815, + "grad_norm": 1.7323768138885498, + "learning_rate": 2.902459323777704e-06, + "loss": 0.37459003925323486, + "mean_token_accuracy": 0.8655836582183838, + "num_tokens": 16819494.0, + "step": 1881 + }, + { + "epoch": 1.4300911854103344, + "grad_norm": 2.608043670654297, + "learning_rate": 2.900392104755455e-06, + "loss": 0.5798726677894592, + "mean_token_accuracy": 0.8382592797279358, + "num_tokens": 16827745.0, + "step": 1882 + }, + { + "epoch": 1.4308510638297873, + "grad_norm": 1.3262078762054443, + "learning_rate": 2.8983246047226137e-06, + "loss": 0.3724595904350281, + "mean_token_accuracy": 0.8651963472366333, + "num_tokens": 16844171.0, + "step": 1883 + }, + { + "epoch": 1.43161094224924, + "grad_norm": 1.7250545024871826, + "learning_rate": 2.8962568251302327e-06, + "loss": 0.3478979468345642, + "mean_token_accuracy": 0.8807886242866516, + "num_tokens": 16852838.0, + "step": 1884 + }, + { + "epoch": 1.4323708206686931, + "grad_norm": 2.114525318145752, + "learning_rate": 2.8941887674295573e-06, + "loss": 0.5156140327453613, + "mean_token_accuracy": 0.825178861618042, + "num_tokens": 16861087.0, + "step": 1885 + }, + { + "epoch": 1.4331306990881458, + "grad_norm": 2.400829792022705, + "learning_rate": 2.892120433072031e-06, + "loss": 0.2807392477989197, + "mean_token_accuracy": 0.8907361030578613, + "num_tokens": 16866557.0, + "step": 1886 + }, + { + "epoch": 1.4338905775075987, + "grad_norm": 2.490880012512207, + "learning_rate": 2.8900518235092908e-06, + "loss": 0.2615952491760254, + "mean_token_accuracy": 0.9152894020080566, + "num_tokens": 16871357.0, + "step": 1887 + }, + { + "epoch": 1.4346504559270516, + "grad_norm": 1.9058431386947632, + "learning_rate": 2.887982940193165e-06, + "loss": 0.43623363971710205, + "mean_token_accuracy": 0.84696364402771, + "num_tokens": 16879016.0, + "step": 1888 + }, + { + "epoch": 1.4354103343465046, + "grad_norm": 1.4520210027694702, + "learning_rate": 2.8859137845756785e-06, + "loss": 0.3961856961250305, + "mean_token_accuracy": 0.8518897294998169, + "num_tokens": 16892254.0, + "step": 1889 + }, + { + "epoch": 1.4361702127659575, + "grad_norm": 2.500274896621704, + "learning_rate": 2.8838443581090415e-06, + "loss": 0.41457289457321167, + "mean_token_accuracy": 0.8751448392868042, + "num_tokens": 16897156.0, + "step": 1890 + }, + { + "epoch": 1.4369300911854104, + "grad_norm": 2.9312057495117188, + "learning_rate": 2.8817746622456585e-06, + "loss": 0.45875269174575806, + "mean_token_accuracy": 0.8411039113998413, + "num_tokens": 16902291.0, + "step": 1891 + }, + { + "epoch": 1.4376899696048633, + "grad_norm": 2.367419481277466, + "learning_rate": 2.879704698438121e-06, + "loss": 0.3643629848957062, + "mean_token_accuracy": 0.8771071434020996, + "num_tokens": 16908128.0, + "step": 1892 + }, + { + "epoch": 1.4384498480243162, + "grad_norm": 1.9907705783843994, + "learning_rate": 2.8776344681392106e-06, + "loss": 0.3206835389137268, + "mean_token_accuracy": 0.879996657371521, + "num_tokens": 16914918.0, + "step": 1893 + }, + { + "epoch": 1.439209726443769, + "grad_norm": 3.536956310272217, + "learning_rate": 2.875563972801893e-06, + "loss": 0.3640141785144806, + "mean_token_accuracy": 0.8814959526062012, + "num_tokens": 16918187.0, + "step": 1894 + }, + { + "epoch": 1.4399696048632218, + "grad_norm": 1.3451156616210938, + "learning_rate": 2.8734932138793226e-06, + "loss": 0.3427346348762512, + "mean_token_accuracy": 0.8835382461547852, + "num_tokens": 16931135.0, + "step": 1895 + }, + { + "epoch": 1.4407294832826747, + "grad_norm": 2.0735955238342285, + "learning_rate": 2.871422192824837e-06, + "loss": 0.4265315532684326, + "mean_token_accuracy": 0.8452677726745605, + "num_tokens": 16937995.0, + "step": 1896 + }, + { + "epoch": 1.4414893617021276, + "grad_norm": 1.5124932527542114, + "learning_rate": 2.8693509110919597e-06, + "loss": 0.497121661901474, + "mean_token_accuracy": 0.815092921257019, + "num_tokens": 16952743.0, + "step": 1897 + }, + { + "epoch": 1.4422492401215805, + "grad_norm": 3.716669797897339, + "learning_rate": 2.867279370134395e-06, + "loss": 0.5452651381492615, + "mean_token_accuracy": 0.8150380849838257, + "num_tokens": 16956797.0, + "step": 1898 + }, + { + "epoch": 1.4430091185410334, + "grad_norm": 1.3571398258209229, + "learning_rate": 2.8652075714060296e-06, + "loss": 0.4249724745750427, + "mean_token_accuracy": 0.8675867915153503, + "num_tokens": 16974494.0, + "step": 1899 + }, + { + "epoch": 1.4437689969604863, + "grad_norm": 2.310673475265503, + "learning_rate": 2.863135516360932e-06, + "loss": 0.39368677139282227, + "mean_token_accuracy": 0.878392219543457, + "num_tokens": 16980612.0, + "step": 1900 + }, + { + "epoch": 1.4445288753799392, + "grad_norm": 1.9025533199310303, + "learning_rate": 2.8610632064533517e-06, + "loss": 0.4786127805709839, + "mean_token_accuracy": 0.8720556497573853, + "num_tokens": 16992262.0, + "step": 1901 + }, + { + "epoch": 1.4452887537993921, + "grad_norm": 2.528564453125, + "learning_rate": 2.8589906431377133e-06, + "loss": 0.4223094582557678, + "mean_token_accuracy": 0.8513246178627014, + "num_tokens": 16997717.0, + "step": 1902 + }, + { + "epoch": 1.446048632218845, + "grad_norm": 1.010425329208374, + "learning_rate": 2.8569178278686222e-06, + "loss": 0.3908255696296692, + "mean_token_accuracy": 0.8620463609695435, + "num_tokens": 17020903.0, + "step": 1903 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 1.5760232210159302, + "learning_rate": 2.8548447621008614e-06, + "loss": 0.4134044051170349, + "mean_token_accuracy": 0.8472093343734741, + "num_tokens": 17035250.0, + "step": 1904 + }, + { + "epoch": 1.4475683890577509, + "grad_norm": 2.0668535232543945, + "learning_rate": 2.8527714472893866e-06, + "loss": 0.44095730781555176, + "mean_token_accuracy": 0.881983757019043, + "num_tokens": 17042170.0, + "step": 1905 + }, + { + "epoch": 1.4483282674772036, + "grad_norm": 1.1620599031448364, + "learning_rate": 2.85069788488933e-06, + "loss": 0.3607163429260254, + "mean_token_accuracy": 0.8684282898902893, + "num_tokens": 17061937.0, + "step": 1906 + }, + { + "epoch": 1.4490881458966565, + "grad_norm": 2.1316568851470947, + "learning_rate": 2.8486240763559984e-06, + "loss": 0.3478124141693115, + "mean_token_accuracy": 0.8772403001785278, + "num_tokens": 17068628.0, + "step": 1907 + }, + { + "epoch": 1.4498480243161094, + "grad_norm": 2.4756391048431396, + "learning_rate": 2.8465500231448707e-06, + "loss": 0.46441152691841125, + "mean_token_accuracy": 0.8436450958251953, + "num_tokens": 17075495.0, + "step": 1908 + }, + { + "epoch": 1.4506079027355623, + "grad_norm": 2.249720573425293, + "learning_rate": 2.844475726711595e-06, + "loss": 0.41565513610839844, + "mean_token_accuracy": 0.8525094985961914, + "num_tokens": 17080940.0, + "step": 1909 + }, + { + "epoch": 1.4513677811550152, + "grad_norm": 2.3081841468811035, + "learning_rate": 2.8424011885119956e-06, + "loss": 0.49903199076652527, + "mean_token_accuracy": 0.8212426900863647, + "num_tokens": 17092024.0, + "step": 1910 + }, + { + "epoch": 1.452127659574468, + "grad_norm": 1.2929959297180176, + "learning_rate": 2.8403264100020613e-06, + "loss": 0.47038257122039795, + "mean_token_accuracy": 0.8319816589355469, + "num_tokens": 17108840.0, + "step": 1911 + }, + { + "epoch": 1.452887537993921, + "grad_norm": 1.6476463079452515, + "learning_rate": 2.8382513926379508e-06, + "loss": 0.42287829518318176, + "mean_token_accuracy": 0.8555682897567749, + "num_tokens": 17119704.0, + "step": 1912 + }, + { + "epoch": 1.453647416413374, + "grad_norm": 1.759998083114624, + "learning_rate": 2.836176137875993e-06, + "loss": 0.40904951095581055, + "mean_token_accuracy": 0.8698266744613647, + "num_tokens": 17130676.0, + "step": 1913 + }, + { + "epoch": 1.4544072948328268, + "grad_norm": 1.510909914970398, + "learning_rate": 2.8341006471726817e-06, + "loss": 0.47834792733192444, + "mean_token_accuracy": 0.8335825204849243, + "num_tokens": 17146304.0, + "step": 1914 + }, + { + "epoch": 1.4551671732522795, + "grad_norm": 3.538071632385254, + "learning_rate": 2.832024921984674e-06, + "loss": 0.34059035778045654, + "mean_token_accuracy": 0.8769031763076782, + "num_tokens": 17150458.0, + "step": 1915 + }, + { + "epoch": 1.4559270516717326, + "grad_norm": 2.3368659019470215, + "learning_rate": 2.8299489637687955e-06, + "loss": 0.43068382143974304, + "mean_token_accuracy": 0.845360517501831, + "num_tokens": 17157368.0, + "step": 1916 + }, + { + "epoch": 1.4566869300911853, + "grad_norm": 1.8720396757125854, + "learning_rate": 2.8278727739820334e-06, + "loss": 0.37013399600982666, + "mean_token_accuracy": 0.854241132736206, + "num_tokens": 17166325.0, + "step": 1917 + }, + { + "epoch": 1.4574468085106382, + "grad_norm": 1.6706892251968384, + "learning_rate": 2.825796354081537e-06, + "loss": 0.5397020578384399, + "mean_token_accuracy": 0.8309713006019592, + "num_tokens": 17178920.0, + "step": 1918 + }, + { + "epoch": 1.4582066869300911, + "grad_norm": 2.729210376739502, + "learning_rate": 2.8237197055246175e-06, + "loss": 0.25137859582901, + "mean_token_accuracy": 0.9148792028427124, + "num_tokens": 17183107.0, + "step": 1919 + }, + { + "epoch": 1.458966565349544, + "grad_norm": 3.023500680923462, + "learning_rate": 2.821642829768748e-06, + "loss": 0.43312495946884155, + "mean_token_accuracy": 0.8481811285018921, + "num_tokens": 17187853.0, + "step": 1920 + }, + { + "epoch": 1.459726443768997, + "grad_norm": 1.8108519315719604, + "learning_rate": 2.8195657282715595e-06, + "loss": 0.5101792216300964, + "mean_token_accuracy": 0.8315553069114685, + "num_tokens": 17199247.0, + "step": 1921 + }, + { + "epoch": 1.4604863221884499, + "grad_norm": 2.0262672901153564, + "learning_rate": 2.817488402490841e-06, + "loss": 0.4449934959411621, + "mean_token_accuracy": 0.8634527325630188, + "num_tokens": 17206348.0, + "step": 1922 + }, + { + "epoch": 1.4612462006079028, + "grad_norm": 2.6163926124572754, + "learning_rate": 2.8154108538845405e-06, + "loss": 0.43052345514297485, + "mean_token_accuracy": 0.8375401496887207, + "num_tokens": 17211702.0, + "step": 1923 + }, + { + "epoch": 1.4620060790273557, + "grad_norm": 2.0854408740997314, + "learning_rate": 2.813333083910761e-06, + "loss": 0.5011380910873413, + "mean_token_accuracy": 0.8359915018081665, + "num_tokens": 17219096.0, + "step": 1924 + }, + { + "epoch": 1.4627659574468086, + "grad_norm": 2.2081687450408936, + "learning_rate": 2.8112550940277615e-06, + "loss": 0.5239193439483643, + "mean_token_accuracy": 0.8499593734741211, + "num_tokens": 17229266.0, + "step": 1925 + }, + { + "epoch": 1.4635258358662613, + "grad_norm": 1.798343539237976, + "learning_rate": 2.809176885693956e-06, + "loss": 0.4515029191970825, + "mean_token_accuracy": 0.8400485515594482, + "num_tokens": 17239280.0, + "step": 1926 + }, + { + "epoch": 1.4642857142857144, + "grad_norm": 1.897887945175171, + "learning_rate": 2.807098460367911e-06, + "loss": 0.35935714840888977, + "mean_token_accuracy": 0.8776072263717651, + "num_tokens": 17247132.0, + "step": 1927 + }, + { + "epoch": 1.465045592705167, + "grad_norm": 2.705836296081543, + "learning_rate": 2.8050198195083445e-06, + "loss": 0.3728443682193756, + "mean_token_accuracy": 0.8649885654449463, + "num_tokens": 17251865.0, + "step": 1928 + }, + { + "epoch": 1.46580547112462, + "grad_norm": 1.841178059577942, + "learning_rate": 2.802940964574127e-06, + "loss": 0.40604841709136963, + "mean_token_accuracy": 0.8537783622741699, + "num_tokens": 17260163.0, + "step": 1929 + }, + { + "epoch": 1.466565349544073, + "grad_norm": 2.7393605709075928, + "learning_rate": 2.800861897024279e-06, + "loss": 0.39346879720687866, + "mean_token_accuracy": 0.8628787994384766, + "num_tokens": 17264876.0, + "step": 1930 + }, + { + "epoch": 1.4673252279635258, + "grad_norm": 1.84367835521698, + "learning_rate": 2.798782618317971e-06, + "loss": 0.37411895394325256, + "mean_token_accuracy": 0.8605265617370605, + "num_tokens": 17273049.0, + "step": 1931 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 1.6546733379364014, + "learning_rate": 2.796703129914519e-06, + "loss": 0.4997844099998474, + "mean_token_accuracy": 0.8267433643341064, + "num_tokens": 17285074.0, + "step": 1932 + }, + { + "epoch": 1.4688449848024316, + "grad_norm": 2.2749221324920654, + "learning_rate": 2.79462343327339e-06, + "loss": 0.35453367233276367, + "mean_token_accuracy": 0.8746850490570068, + "num_tokens": 17290273.0, + "step": 1933 + }, + { + "epoch": 1.4696048632218845, + "grad_norm": 1.7142518758773804, + "learning_rate": 2.7925435298541944e-06, + "loss": 0.345878541469574, + "mean_token_accuracy": 0.8600981831550598, + "num_tokens": 17301045.0, + "step": 1934 + }, + { + "epoch": 1.4703647416413375, + "grad_norm": 3.163342237472534, + "learning_rate": 2.7904634211166877e-06, + "loss": 0.4356975853443146, + "mean_token_accuracy": 0.8460350036621094, + "num_tokens": 17305108.0, + "step": 1935 + }, + { + "epoch": 1.4711246200607904, + "grad_norm": 1.6377612352371216, + "learning_rate": 2.7883831085207707e-06, + "loss": 0.4459729790687561, + "mean_token_accuracy": 0.8463394641876221, + "num_tokens": 17315479.0, + "step": 1936 + }, + { + "epoch": 1.471884498480243, + "grad_norm": 1.865268588066101, + "learning_rate": 2.7863025935264876e-06, + "loss": 0.394723117351532, + "mean_token_accuracy": 0.864177942276001, + "num_tokens": 17324795.0, + "step": 1937 + }, + { + "epoch": 1.4726443768996962, + "grad_norm": 1.241937518119812, + "learning_rate": 2.784221877594024e-06, + "loss": 0.2752220630645752, + "mean_token_accuracy": 0.8998259902000427, + "num_tokens": 17338000.0, + "step": 1938 + }, + { + "epoch": 1.4734042553191489, + "grad_norm": 1.8013651371002197, + "learning_rate": 2.7821409621837042e-06, + "loss": 0.4251005947589874, + "mean_token_accuracy": 0.8518919348716736, + "num_tokens": 17347351.0, + "step": 1939 + }, + { + "epoch": 1.4741641337386018, + "grad_norm": 1.2902207374572754, + "learning_rate": 2.7800598487559976e-06, + "loss": 0.3640727400779724, + "mean_token_accuracy": 0.8592870235443115, + "num_tokens": 17362335.0, + "step": 1940 + }, + { + "epoch": 1.4749240121580547, + "grad_norm": 2.5427513122558594, + "learning_rate": 2.777978538771508e-06, + "loss": 0.38166797161102295, + "mean_token_accuracy": 0.8653234839439392, + "num_tokens": 17367733.0, + "step": 1941 + }, + { + "epoch": 1.4756838905775076, + "grad_norm": 1.7793641090393066, + "learning_rate": 2.7758970336909795e-06, + "loss": 0.3113783895969391, + "mean_token_accuracy": 0.8812868595123291, + "num_tokens": 17375267.0, + "step": 1942 + }, + { + "epoch": 1.4764437689969605, + "grad_norm": 3.4031741619110107, + "learning_rate": 2.7738153349752923e-06, + "loss": 0.4800986647605896, + "mean_token_accuracy": 0.8336698412895203, + "num_tokens": 17379549.0, + "step": 1943 + }, + { + "epoch": 1.4772036474164134, + "grad_norm": 1.3451651334762573, + "learning_rate": 2.7717334440854634e-06, + "loss": 0.3115345239639282, + "mean_token_accuracy": 0.908623218536377, + "num_tokens": 17394455.0, + "step": 1944 + }, + { + "epoch": 1.4779635258358663, + "grad_norm": 1.980919599533081, + "learning_rate": 2.7696513624826422e-06, + "loss": 0.391154944896698, + "mean_token_accuracy": 0.8650267720222473, + "num_tokens": 17401931.0, + "step": 1945 + }, + { + "epoch": 1.4787234042553192, + "grad_norm": 1.0118765830993652, + "learning_rate": 2.7675690916281158e-06, + "loss": 0.3157956600189209, + "mean_token_accuracy": 0.8827471733093262, + "num_tokens": 17424144.0, + "step": 1946 + }, + { + "epoch": 1.4794832826747721, + "grad_norm": 1.579654335975647, + "learning_rate": 2.7654866329833e-06, + "loss": 0.4578486382961273, + "mean_token_accuracy": 0.8361750245094299, + "num_tokens": 17435769.0, + "step": 1947 + }, + { + "epoch": 1.4802431610942248, + "grad_norm": 1.7706717252731323, + "learning_rate": 2.763403988009746e-06, + "loss": 0.3564416170120239, + "mean_token_accuracy": 0.8689201474189758, + "num_tokens": 17444088.0, + "step": 1948 + }, + { + "epoch": 1.4810030395136777, + "grad_norm": 1.2264244556427002, + "learning_rate": 2.761321158169134e-06, + "loss": 0.30763837695121765, + "mean_token_accuracy": 0.8960219621658325, + "num_tokens": 17458096.0, + "step": 1949 + }, + { + "epoch": 1.4817629179331306, + "grad_norm": 1.214431881904602, + "learning_rate": 2.759238144923274e-06, + "loss": 0.49099457263946533, + "mean_token_accuracy": 0.8279136419296265, + "num_tokens": 17481062.0, + "step": 1950 + }, + { + "epoch": 1.4825227963525835, + "grad_norm": 1.593892216682434, + "learning_rate": 2.7571549497341044e-06, + "loss": 0.3745320737361908, + "mean_token_accuracy": 0.8690779209136963, + "num_tokens": 17490874.0, + "step": 1951 + }, + { + "epoch": 1.4832826747720365, + "grad_norm": 2.409924268722534, + "learning_rate": 2.755071574063692e-06, + "loss": 0.4310247600078583, + "mean_token_accuracy": 0.8521159291267395, + "num_tokens": 17496942.0, + "step": 1952 + }, + { + "epoch": 1.4840425531914894, + "grad_norm": 1.2557463645935059, + "learning_rate": 2.7529880193742297e-06, + "loss": 0.34304720163345337, + "mean_token_accuracy": 0.8748183250427246, + "num_tokens": 17514391.0, + "step": 1953 + }, + { + "epoch": 1.4848024316109423, + "grad_norm": 1.17310631275177, + "learning_rate": 2.7509042871280373e-06, + "loss": 0.3835817277431488, + "mean_token_accuracy": 0.8853274583816528, + "num_tokens": 17533289.0, + "step": 1954 + }, + { + "epoch": 1.4855623100303952, + "grad_norm": 1.5261479616165161, + "learning_rate": 2.748820378787558e-06, + "loss": 0.4799988865852356, + "mean_token_accuracy": 0.8252149820327759, + "num_tokens": 17544118.0, + "step": 1955 + }, + { + "epoch": 1.486322188449848, + "grad_norm": 2.030930757522583, + "learning_rate": 2.7467362958153585e-06, + "loss": 0.35690805315971375, + "mean_token_accuracy": 0.8959587216377258, + "num_tokens": 17550431.0, + "step": 1956 + }, + { + "epoch": 1.4870820668693008, + "grad_norm": 2.376520872116089, + "learning_rate": 2.7446520396741293e-06, + "loss": 0.262234091758728, + "mean_token_accuracy": 0.9054547548294067, + "num_tokens": 17554853.0, + "step": 1957 + }, + { + "epoch": 1.487841945288754, + "grad_norm": 1.6944479942321777, + "learning_rate": 2.742567611826681e-06, + "loss": 0.529259979724884, + "mean_token_accuracy": 0.8195339441299438, + "num_tokens": 17568016.0, + "step": 1958 + }, + { + "epoch": 1.4886018237082066, + "grad_norm": 2.833029270172119, + "learning_rate": 2.7404830137359445e-06, + "loss": 0.30229634046554565, + "mean_token_accuracy": 0.8933001756668091, + "num_tokens": 17572587.0, + "step": 1959 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 1.7040144205093384, + "learning_rate": 2.7383982468649715e-06, + "loss": 0.3166356682777405, + "mean_token_accuracy": 0.8871906399726868, + "num_tokens": 17580966.0, + "step": 1960 + }, + { + "epoch": 1.4901215805471124, + "grad_norm": 1.7539052963256836, + "learning_rate": 2.7363133126769326e-06, + "loss": 0.4231064021587372, + "mean_token_accuracy": 0.8708304166793823, + "num_tokens": 17590907.0, + "step": 1961 + }, + { + "epoch": 1.4908814589665653, + "grad_norm": 1.6198650598526, + "learning_rate": 2.7342282126351145e-06, + "loss": 0.4198967218399048, + "mean_token_accuracy": 0.8723280429840088, + "num_tokens": 17604291.0, + "step": 1962 + }, + { + "epoch": 1.4916413373860182, + "grad_norm": 1.8437711000442505, + "learning_rate": 2.73214294820292e-06, + "loss": 0.38923323154449463, + "mean_token_accuracy": 0.8697006106376648, + "num_tokens": 17612291.0, + "step": 1963 + }, + { + "epoch": 1.4924012158054711, + "grad_norm": 1.1129369735717773, + "learning_rate": 2.7300575208438684e-06, + "loss": 0.3107512593269348, + "mean_token_accuracy": 0.878618597984314, + "num_tokens": 17630073.0, + "step": 1964 + }, + { + "epoch": 1.493161094224924, + "grad_norm": 3.0210442543029785, + "learning_rate": 2.7279719320215924e-06, + "loss": 0.4630751609802246, + "mean_token_accuracy": 0.8567075729370117, + "num_tokens": 17634758.0, + "step": 1965 + }, + { + "epoch": 1.493920972644377, + "grad_norm": 2.8825972080230713, + "learning_rate": 2.725886183199839e-06, + "loss": 0.35351765155792236, + "mean_token_accuracy": 0.8711981773376465, + "num_tokens": 17639613.0, + "step": 1966 + }, + { + "epoch": 1.4946808510638299, + "grad_norm": 2.111238718032837, + "learning_rate": 2.723800275842468e-06, + "loss": 0.3529569208621979, + "mean_token_accuracy": 0.8679244518280029, + "num_tokens": 17645308.0, + "step": 1967 + }, + { + "epoch": 1.4954407294832825, + "grad_norm": 2.080509901046753, + "learning_rate": 2.7217142114134466e-06, + "loss": 0.43321219086647034, + "mean_token_accuracy": 0.8848220109939575, + "num_tokens": 17652292.0, + "step": 1968 + }, + { + "epoch": 1.4962006079027357, + "grad_norm": 2.8686363697052, + "learning_rate": 2.7196279913768587e-06, + "loss": 0.417035311460495, + "mean_token_accuracy": 0.8724601864814758, + "num_tokens": 17656908.0, + "step": 1969 + }, + { + "epoch": 1.4969604863221884, + "grad_norm": 3.294193744659424, + "learning_rate": 2.717541617196891e-06, + "loss": 0.3551934063434601, + "mean_token_accuracy": 0.8838565349578857, + "num_tokens": 17660590.0, + "step": 1970 + }, + { + "epoch": 1.4977203647416413, + "grad_norm": 1.766292929649353, + "learning_rate": 2.7154550903378425e-06, + "loss": 0.36521971225738525, + "mean_token_accuracy": 0.8810199499130249, + "num_tokens": 17668214.0, + "step": 1971 + }, + { + "epoch": 1.4984802431610942, + "grad_norm": 1.2127676010131836, + "learning_rate": 2.713368412264118e-06, + "loss": 0.35184425115585327, + "mean_token_accuracy": 0.8672580718994141, + "num_tokens": 17684736.0, + "step": 1972 + }, + { + "epoch": 1.499240121580547, + "grad_norm": 2.268256664276123, + "learning_rate": 2.711281584440228e-06, + "loss": 0.40115267038345337, + "mean_token_accuracy": 0.8517841100692749, + "num_tokens": 17691510.0, + "step": 1973 + }, + { + "epoch": 1.5, + "grad_norm": 2.7196054458618164, + "learning_rate": 2.70919460833079e-06, + "loss": 0.3819037675857544, + "mean_token_accuracy": 0.8765411376953125, + "num_tokens": 17696179.0, + "step": 1974 + }, + { + "epoch": 1.500759878419453, + "grad_norm": 2.969406843185425, + "learning_rate": 2.7071074854005206e-06, + "loss": 0.3922455608844757, + "mean_token_accuracy": 0.8796037435531616, + "num_tokens": 17700597.0, + "step": 1975 + }, + { + "epoch": 1.5015197568389058, + "grad_norm": 2.2965853214263916, + "learning_rate": 2.705020217114248e-06, + "loss": 0.5433666110038757, + "mean_token_accuracy": 0.809639036655426, + "num_tokens": 17708895.0, + "step": 1976 + }, + { + "epoch": 1.5022796352583585, + "grad_norm": 1.5584394931793213, + "learning_rate": 2.7029328049368942e-06, + "loss": 0.4736343324184418, + "mean_token_accuracy": 0.8197190761566162, + "num_tokens": 17725202.0, + "step": 1977 + }, + { + "epoch": 1.5030395136778116, + "grad_norm": 1.3903142213821411, + "learning_rate": 2.700845250333486e-06, + "loss": 0.4471571445465088, + "mean_token_accuracy": 0.839043140411377, + "num_tokens": 17742835.0, + "step": 1978 + }, + { + "epoch": 1.5037993920972643, + "grad_norm": 3.080716609954834, + "learning_rate": 2.69875755476915e-06, + "loss": 0.45760005712509155, + "mean_token_accuracy": 0.8366328477859497, + "num_tokens": 17747324.0, + "step": 1979 + }, + { + "epoch": 1.5045592705167175, + "grad_norm": 1.0150405168533325, + "learning_rate": 2.696669719709111e-06, + "loss": 0.33638954162597656, + "mean_token_accuracy": 0.8591676354408264, + "num_tokens": 17765565.0, + "step": 1980 + }, + { + "epoch": 1.5053191489361701, + "grad_norm": 2.402927875518799, + "learning_rate": 2.694581746618691e-06, + "loss": 0.4086601436138153, + "mean_token_accuracy": 0.8769911527633667, + "num_tokens": 17771275.0, + "step": 1981 + }, + { + "epoch": 1.506079027355623, + "grad_norm": 2.030583381652832, + "learning_rate": 2.6924936369633126e-06, + "loss": 0.5115457773208618, + "mean_token_accuracy": 0.8054746389389038, + "num_tokens": 17779999.0, + "step": 1982 + }, + { + "epoch": 1.506838905775076, + "grad_norm": 2.575199604034424, + "learning_rate": 2.6904053922084893e-06, + "loss": 0.363183856010437, + "mean_token_accuracy": 0.8716042637825012, + "num_tokens": 17785473.0, + "step": 1983 + }, + { + "epoch": 1.5075987841945289, + "grad_norm": 1.8497480154037476, + "learning_rate": 2.688317013819832e-06, + "loss": 0.4254384934902191, + "mean_token_accuracy": 0.8549597263336182, + "num_tokens": 17793812.0, + "step": 1984 + }, + { + "epoch": 1.5083586626139818, + "grad_norm": 1.7786511182785034, + "learning_rate": 2.686228503263045e-06, + "loss": 0.33400774002075195, + "mean_token_accuracy": 0.9027615189552307, + "num_tokens": 17801783.0, + "step": 1985 + }, + { + "epoch": 1.5091185410334347, + "grad_norm": 1.8365367650985718, + "learning_rate": 2.684139862003927e-06, + "loss": 0.35765063762664795, + "mean_token_accuracy": 0.8663736581802368, + "num_tokens": 17809562.0, + "step": 1986 + }, + { + "epoch": 1.5098784194528876, + "grad_norm": 1.8817477226257324, + "learning_rate": 2.682051091508365e-06, + "loss": 0.4627506732940674, + "mean_token_accuracy": 0.8358862400054932, + "num_tokens": 17819094.0, + "step": 1987 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 2.221547842025757, + "learning_rate": 2.679962193242338e-06, + "loss": 0.577020525932312, + "mean_token_accuracy": 0.80013108253479, + "num_tokens": 17826666.0, + "step": 1988 + }, + { + "epoch": 1.5113981762917934, + "grad_norm": 2.6618270874023438, + "learning_rate": 2.6778731686719177e-06, + "loss": 0.44632256031036377, + "mean_token_accuracy": 0.8611289262771606, + "num_tokens": 17833172.0, + "step": 1989 + }, + { + "epoch": 1.512158054711246, + "grad_norm": 2.9495689868927, + "learning_rate": 2.67578401926326e-06, + "loss": 0.3482511043548584, + "mean_token_accuracy": 0.8703314661979675, + "num_tokens": 17837220.0, + "step": 1990 + }, + { + "epoch": 1.5129179331306992, + "grad_norm": 2.0943644046783447, + "learning_rate": 2.6736947464826107e-06, + "loss": 0.2354314625263214, + "mean_token_accuracy": 0.9137634038925171, + "num_tokens": 17842712.0, + "step": 1991 + }, + { + "epoch": 1.513677811550152, + "grad_norm": 1.1303033828735352, + "learning_rate": 2.671605351796302e-06, + "loss": 0.3624761700630188, + "mean_token_accuracy": 0.8769594430923462, + "num_tokens": 17860902.0, + "step": 1992 + }, + { + "epoch": 1.5144376899696048, + "grad_norm": 2.8921146392822266, + "learning_rate": 2.6695158366707526e-06, + "loss": 0.2517220973968506, + "mean_token_accuracy": 0.8974182605743408, + "num_tokens": 17865160.0, + "step": 1993 + }, + { + "epoch": 1.5151975683890577, + "grad_norm": 2.320587158203125, + "learning_rate": 2.667426202572463e-06, + "loss": 0.4589889943599701, + "mean_token_accuracy": 0.8379613161087036, + "num_tokens": 17871994.0, + "step": 1994 + }, + { + "epoch": 1.5159574468085106, + "grad_norm": 1.1407674551010132, + "learning_rate": 2.665336450968019e-06, + "loss": 0.34412115812301636, + "mean_token_accuracy": 0.8776306509971619, + "num_tokens": 17889941.0, + "step": 1995 + }, + { + "epoch": 1.5167173252279635, + "grad_norm": 2.069814920425415, + "learning_rate": 2.6632465833240895e-06, + "loss": 0.47524404525756836, + "mean_token_accuracy": 0.830310046672821, + "num_tokens": 17898447.0, + "step": 1996 + }, + { + "epoch": 1.5174772036474165, + "grad_norm": 1.822415828704834, + "learning_rate": 2.661156601107424e-06, + "loss": 0.4541318416595459, + "mean_token_accuracy": 0.8856616020202637, + "num_tokens": 17908729.0, + "step": 1997 + }, + { + "epoch": 1.5182370820668694, + "grad_norm": 2.851428985595703, + "learning_rate": 2.659066505784852e-06, + "loss": 0.41761666536331177, + "mean_token_accuracy": 0.8710572719573975, + "num_tokens": 17913860.0, + "step": 1998 + }, + { + "epoch": 1.518996960486322, + "grad_norm": 1.8483710289001465, + "learning_rate": 2.6569762988232838e-06, + "loss": 0.45517268776893616, + "mean_token_accuracy": 0.8411115407943726, + "num_tokens": 17923497.0, + "step": 1999 + }, + { + "epoch": 1.5197568389057752, + "grad_norm": 1.9044219255447388, + "learning_rate": 2.654885981689706e-06, + "loss": 0.42533189058303833, + "mean_token_accuracy": 0.8597894906997681, + "num_tokens": 17932670.0, + "step": 2000 + } + ], + "logging_steps": 1.0, + "max_steps": 3948, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9547571235271475e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2fc4f538d721f958cdceda5408f2f4e1a35f4210 --- /dev/null +++ b/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021 +size 6225 diff --git a/checkpoint-3000/chat_template.jinja b/checkpoint-3000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..70adff8a08fb31e0636f618564838d4bf3c05286 --- /dev/null +++ b/checkpoint-3000/chat_template.jinja @@ -0,0 +1,61 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-3000/config.json b/checkpoint-3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c351e5fb52f50ea6e07b40981aef81c80f9df7e4 --- /dev/null +++ b/checkpoint-3000/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 262144, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151662, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 5000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.5.3", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-3000/generation_config.json b/checkpoint-3000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2104b83493c2833855e8fe32a7a784805ab5c2ee --- /dev/null +++ b/checkpoint-3000/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151662, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "5.5.3" +} diff --git a/checkpoint-3000/model.safetensors b/checkpoint-3000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..07e0e931ed749c8c0c6c086ebb969bd3c5167e3f --- /dev/null +++ b/checkpoint-3000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a87a133eb5ec5af0878395bc45e179834b11224819f981211f70acdd015060b +size 17645743048 diff --git a/checkpoint-3000/optimizer.bin b/checkpoint-3000/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..18574ad6580a2815e85a104eea5910c353aaf5dc --- /dev/null +++ b/checkpoint-3000/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ff8e5977667fc938b297528391c931889487050b2acf34a78a42a820912cd38 +size 32180124005 diff --git a/checkpoint-3000/pytorch_model_fsdp.bin b/checkpoint-3000/pytorch_model_fsdp.bin new file mode 100644 index 0000000000000000000000000000000000000000..798e41cb07595e0af0eea0bc21a9c2bdffb4914c --- /dev/null +++ b/checkpoint-3000/pytorch_model_fsdp.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3023a52ce183c0d2cddf839ebf937f5047e153db9c651eb9f295b9a386e6b589 +size 17645897996 diff --git a/checkpoint-3000/rng_state_0.pth b/checkpoint-3000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5379ca97bc0c62d226d0fc37920d4937a7bb8b43 --- /dev/null +++ b/checkpoint-3000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61e957b4cd785256be4cb26eb03060ef689e1d58f1766d7f26ca36a62bec4994 +size 14917 diff --git a/checkpoint-3000/rng_state_1.pth b/checkpoint-3000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..662ad0d5b30369c825f66c080779973608c5058e --- /dev/null +++ b/checkpoint-3000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:550c54d430b44b77b0abe44c6e3ceba90a155305315c081b7616b35e2c18d1ce +size 14917 diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..58a045115b7b529e69edb60002fbf90b0935a577 --- /dev/null +++ b/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b07c9eca675fb8c47d0c01728c4ef879c66a752ffdace85e7e9feac32b48ac4b +size 1465 diff --git a/checkpoint-3000/tokenizer.json b/checkpoint-3000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/checkpoint-3000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/checkpoint-3000/tokenizer_config.json b/checkpoint-3000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e47e52c4e7f0b2bcf2103a878790216f3f6436d --- /dev/null +++ b/checkpoint-3000/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 1010000, + "pad_token": "<|fim_pad|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..666130f045326cc7a4d60f3405606f5f0040b4a4 --- /dev/null +++ b/checkpoint-3000/trainer_state.json @@ -0,0 +1,27034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.2796352583586628, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007598784194528875, + "grad_norm": 11.767926216125488, + "learning_rate": 0.0, + "loss": 0.7937269806861877, + "mean_token_accuracy": 0.7822731137275696, + "num_tokens": 10507.0, + "step": 1 + }, + { + "epoch": 0.001519756838905775, + "grad_norm": 14.9199800491333, + "learning_rate": 2.5252525252525256e-08, + "loss": 0.7665389776229858, + "mean_token_accuracy": 0.8342233300209045, + "num_tokens": 14806.0, + "step": 2 + }, + { + "epoch": 0.0022796352583586625, + "grad_norm": 11.991217613220215, + "learning_rate": 5.050505050505051e-08, + "loss": 0.9597002267837524, + "mean_token_accuracy": 0.7054992318153381, + "num_tokens": 27170.0, + "step": 3 + }, + { + "epoch": 0.00303951367781155, + "grad_norm": 12.958333015441895, + "learning_rate": 7.575757575757576e-08, + "loss": 0.9971482753753662, + "mean_token_accuracy": 0.7261134386062622, + "num_tokens": 33729.0, + "step": 4 + }, + { + "epoch": 0.003799392097264438, + "grad_norm": 13.5665283203125, + "learning_rate": 1.0101010101010103e-07, + "loss": 0.9504883885383606, + "mean_token_accuracy": 0.745307445526123, + "num_tokens": 41174.0, + "step": 5 + }, + { + "epoch": 0.004559270516717325, + "grad_norm": 10.09444808959961, + "learning_rate": 1.2626262626262626e-07, + "loss": 0.759548008441925, + "mean_token_accuracy": 0.7842121124267578, + "num_tokens": 47943.0, + "step": 6 + }, + { + "epoch": 0.005319148936170213, + "grad_norm": 10.741650581359863, + "learning_rate": 1.5151515151515152e-07, + "loss": 0.8231598138809204, + "mean_token_accuracy": 0.7550969123840332, + "num_tokens": 56665.0, + "step": 7 + }, + { + "epoch": 0.0060790273556231, + "grad_norm": 12.250170707702637, + "learning_rate": 1.767676767676768e-07, + "loss": 0.8576581478118896, + "mean_token_accuracy": 0.7568671703338623, + "num_tokens": 67606.0, + "step": 8 + }, + { + "epoch": 0.006838905775075988, + "grad_norm": 12.828629493713379, + "learning_rate": 2.0202020202020205e-07, + "loss": 0.9886435866355896, + "mean_token_accuracy": 0.733400285243988, + "num_tokens": 74272.0, + "step": 9 + }, + { + "epoch": 0.007598784194528876, + "grad_norm": 15.966923713684082, + "learning_rate": 2.2727272727272729e-07, + "loss": 1.064985990524292, + "mean_token_accuracy": 0.7101132869720459, + "num_tokens": 80524.0, + "step": 10 + }, + { + "epoch": 0.008358662613981762, + "grad_norm": 10.864850044250488, + "learning_rate": 2.525252525252525e-07, + "loss": 0.8311550617218018, + "mean_token_accuracy": 0.7431639432907104, + "num_tokens": 96292.0, + "step": 11 + }, + { + "epoch": 0.00911854103343465, + "grad_norm": 16.438785552978516, + "learning_rate": 2.7777777777777776e-07, + "loss": 1.0579866170883179, + "mean_token_accuracy": 0.7222976684570312, + "num_tokens": 102992.0, + "step": 12 + }, + { + "epoch": 0.009878419452887538, + "grad_norm": 11.179214477539062, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.9816144704818726, + "mean_token_accuracy": 0.7206371426582336, + "num_tokens": 113571.0, + "step": 13 + }, + { + "epoch": 0.010638297872340425, + "grad_norm": 12.780299186706543, + "learning_rate": 3.2828282828282834e-07, + "loss": 0.847449004650116, + "mean_token_accuracy": 0.7826199531555176, + "num_tokens": 119568.0, + "step": 14 + }, + { + "epoch": 0.011398176291793313, + "grad_norm": 14.800421714782715, + "learning_rate": 3.535353535353536e-07, + "loss": 0.9275516271591187, + "mean_token_accuracy": 0.7655045986175537, + "num_tokens": 126258.0, + "step": 15 + }, + { + "epoch": 0.0121580547112462, + "grad_norm": 11.267602920532227, + "learning_rate": 3.787878787878788e-07, + "loss": 0.8464037179946899, + "mean_token_accuracy": 0.7606508731842041, + "num_tokens": 136831.0, + "step": 16 + }, + { + "epoch": 0.012917933130699088, + "grad_norm": 12.891013145446777, + "learning_rate": 4.040404040404041e-07, + "loss": 0.9903074502944946, + "mean_token_accuracy": 0.7247487306594849, + "num_tokens": 150434.0, + "step": 17 + }, + { + "epoch": 0.013677811550151976, + "grad_norm": 11.13957691192627, + "learning_rate": 4.2929292929292934e-07, + "loss": 0.8287211656570435, + "mean_token_accuracy": 0.7621913552284241, + "num_tokens": 158516.0, + "step": 18 + }, + { + "epoch": 0.014437689969604863, + "grad_norm": 18.39569664001465, + "learning_rate": 4.5454545454545457e-07, + "loss": 1.150015115737915, + "mean_token_accuracy": 0.7349498271942139, + "num_tokens": 162214.0, + "step": 19 + }, + { + "epoch": 0.015197568389057751, + "grad_norm": 9.353750228881836, + "learning_rate": 4.797979797979798e-07, + "loss": 0.7228299379348755, + "mean_token_accuracy": 0.7969573736190796, + "num_tokens": 173035.0, + "step": 20 + }, + { + "epoch": 0.015957446808510637, + "grad_norm": 8.267163276672363, + "learning_rate": 5.05050505050505e-07, + "loss": 0.7358136177062988, + "mean_token_accuracy": 0.7903937101364136, + "num_tokens": 183568.0, + "step": 21 + }, + { + "epoch": 0.016717325227963525, + "grad_norm": 11.137128829956055, + "learning_rate": 5.303030303030304e-07, + "loss": 1.0075397491455078, + "mean_token_accuracy": 0.702807605266571, + "num_tokens": 192759.0, + "step": 22 + }, + { + "epoch": 0.017477203647416412, + "grad_norm": 10.734103202819824, + "learning_rate": 5.555555555555555e-07, + "loss": 0.8925919532775879, + "mean_token_accuracy": 0.7475671768188477, + "num_tokens": 201280.0, + "step": 23 + }, + { + "epoch": 0.0182370820668693, + "grad_norm": 11.945566177368164, + "learning_rate": 5.808080808080809e-07, + "loss": 0.7260514497756958, + "mean_token_accuracy": 0.7859152555465698, + "num_tokens": 218053.0, + "step": 24 + }, + { + "epoch": 0.018996960486322188, + "grad_norm": 18.610652923583984, + "learning_rate": 6.060606060606061e-07, + "loss": 0.8995465636253357, + "mean_token_accuracy": 0.7931990623474121, + "num_tokens": 220953.0, + "step": 25 + }, + { + "epoch": 0.019756838905775075, + "grad_norm": 10.51898193359375, + "learning_rate": 6.313131313131314e-07, + "loss": 0.9532671570777893, + "mean_token_accuracy": 0.7257645726203918, + "num_tokens": 231200.0, + "step": 26 + }, + { + "epoch": 0.020516717325227963, + "grad_norm": 9.581812858581543, + "learning_rate": 6.565656565656567e-07, + "loss": 0.9038010239601135, + "mean_token_accuracy": 0.7390379905700684, + "num_tokens": 237711.0, + "step": 27 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 12.297484397888184, + "learning_rate": 6.818181818181818e-07, + "loss": 1.048936367034912, + "mean_token_accuracy": 0.7175670862197876, + "num_tokens": 242503.0, + "step": 28 + }, + { + "epoch": 0.022036474164133738, + "grad_norm": 7.437953472137451, + "learning_rate": 7.070707070707071e-07, + "loss": 0.8308826684951782, + "mean_token_accuracy": 0.7415335774421692, + "num_tokens": 250842.0, + "step": 29 + }, + { + "epoch": 0.022796352583586626, + "grad_norm": 6.134475231170654, + "learning_rate": 7.323232323232324e-07, + "loss": 0.647913932800293, + "mean_token_accuracy": 0.8124054670333862, + "num_tokens": 267453.0, + "step": 30 + }, + { + "epoch": 0.023556231003039513, + "grad_norm": 6.678966045379639, + "learning_rate": 7.575757575757576e-07, + "loss": 0.7052810192108154, + "mean_token_accuracy": 0.7908754348754883, + "num_tokens": 284416.0, + "step": 31 + }, + { + "epoch": 0.0243161094224924, + "grad_norm": 7.42232084274292, + "learning_rate": 7.82828282828283e-07, + "loss": 1.022383213043213, + "mean_token_accuracy": 0.7053230404853821, + "num_tokens": 292073.0, + "step": 32 + }, + { + "epoch": 0.02507598784194529, + "grad_norm": 6.463219165802002, + "learning_rate": 8.080808080808082e-07, + "loss": 0.7603012323379517, + "mean_token_accuracy": 0.7728140354156494, + "num_tokens": 298550.0, + "step": 33 + }, + { + "epoch": 0.025835866261398176, + "grad_norm": 5.668411731719971, + "learning_rate": 8.333333333333333e-07, + "loss": 0.7707852721214294, + "mean_token_accuracy": 0.7827773094177246, + "num_tokens": 306683.0, + "step": 34 + }, + { + "epoch": 0.026595744680851064, + "grad_norm": 4.984964847564697, + "learning_rate": 8.585858585858587e-07, + "loss": 0.6317349672317505, + "mean_token_accuracy": 0.8106861114501953, + "num_tokens": 318842.0, + "step": 35 + }, + { + "epoch": 0.02735562310030395, + "grad_norm": 4.421732425689697, + "learning_rate": 8.838383838383839e-07, + "loss": 0.6228617429733276, + "mean_token_accuracy": 0.8023355603218079, + "num_tokens": 329850.0, + "step": 36 + }, + { + "epoch": 0.02811550151975684, + "grad_norm": 5.970808029174805, + "learning_rate": 9.090909090909091e-07, + "loss": 0.8443238139152527, + "mean_token_accuracy": 0.7462409734725952, + "num_tokens": 335844.0, + "step": 37 + }, + { + "epoch": 0.028875379939209727, + "grad_norm": 4.5389084815979, + "learning_rate": 9.343434343434345e-07, + "loss": 0.6976436376571655, + "mean_token_accuracy": 0.790410041809082, + "num_tokens": 348768.0, + "step": 38 + }, + { + "epoch": 0.029635258358662615, + "grad_norm": 4.116631507873535, + "learning_rate": 9.595959595959596e-07, + "loss": 0.6698519587516785, + "mean_token_accuracy": 0.7818127870559692, + "num_tokens": 355460.0, + "step": 39 + }, + { + "epoch": 0.030395136778115502, + "grad_norm": 3.3714773654937744, + "learning_rate": 9.84848484848485e-07, + "loss": 0.5723201036453247, + "mean_token_accuracy": 0.8100086450576782, + "num_tokens": 368507.0, + "step": 40 + }, + { + "epoch": 0.03115501519756839, + "grad_norm": 4.4438347816467285, + "learning_rate": 1.01010101010101e-06, + "loss": 0.7508786916732788, + "mean_token_accuracy": 0.7711942791938782, + "num_tokens": 376467.0, + "step": 41 + }, + { + "epoch": 0.031914893617021274, + "grad_norm": 5.609974384307861, + "learning_rate": 1.0353535353535354e-06, + "loss": 0.566256046295166, + "mean_token_accuracy": 0.8319284319877625, + "num_tokens": 381399.0, + "step": 42 + }, + { + "epoch": 0.03267477203647416, + "grad_norm": 5.124386787414551, + "learning_rate": 1.0606060606060608e-06, + "loss": 0.8151067495346069, + "mean_token_accuracy": 0.7537785768508911, + "num_tokens": 387389.0, + "step": 43 + }, + { + "epoch": 0.03343465045592705, + "grad_norm": 3.6318116188049316, + "learning_rate": 1.085858585858586e-06, + "loss": 0.5989949107170105, + "mean_token_accuracy": 0.8129256963729858, + "num_tokens": 395302.0, + "step": 44 + }, + { + "epoch": 0.03419452887537994, + "grad_norm": 2.694424629211426, + "learning_rate": 1.111111111111111e-06, + "loss": 0.5831396579742432, + "mean_token_accuracy": 0.8056820631027222, + "num_tokens": 409920.0, + "step": 45 + }, + { + "epoch": 0.034954407294832825, + "grad_norm": 2.2949178218841553, + "learning_rate": 1.1363636363636364e-06, + "loss": 0.472550630569458, + "mean_token_accuracy": 0.8343006372451782, + "num_tokens": 428323.0, + "step": 46 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 3.3930575847625732, + "learning_rate": 1.1616161616161617e-06, + "loss": 0.6246505379676819, + "mean_token_accuracy": 0.783149003982544, + "num_tokens": 435889.0, + "step": 47 + }, + { + "epoch": 0.0364741641337386, + "grad_norm": 3.692598819732666, + "learning_rate": 1.186868686868687e-06, + "loss": 0.46132946014404297, + "mean_token_accuracy": 0.8583089113235474, + "num_tokens": 441192.0, + "step": 48 + }, + { + "epoch": 0.03723404255319149, + "grad_norm": 6.571533203125, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.9351121783256531, + "mean_token_accuracy": 0.7580878734588623, + "num_tokens": 444277.0, + "step": 49 + }, + { + "epoch": 0.037993920972644375, + "grad_norm": 5.029570579528809, + "learning_rate": 1.2373737373737375e-06, + "loss": 0.6921554803848267, + "mean_token_accuracy": 0.8131166100502014, + "num_tokens": 447646.0, + "step": 50 + }, + { + "epoch": 0.03875379939209726, + "grad_norm": 2.9174208641052246, + "learning_rate": 1.2626262626262629e-06, + "loss": 0.591706395149231, + "mean_token_accuracy": 0.8108617067337036, + "num_tokens": 461397.0, + "step": 51 + }, + { + "epoch": 0.03951367781155015, + "grad_norm": 4.315536022186279, + "learning_rate": 1.287878787878788e-06, + "loss": 0.6986310482025146, + "mean_token_accuracy": 0.7710754871368408, + "num_tokens": 472047.0, + "step": 52 + }, + { + "epoch": 0.04027355623100304, + "grad_norm": 2.6216275691986084, + "learning_rate": 1.3131313131313134e-06, + "loss": 0.5553690791130066, + "mean_token_accuracy": 0.8167896866798401, + "num_tokens": 482795.0, + "step": 53 + }, + { + "epoch": 0.041033434650455926, + "grad_norm": 3.0562477111816406, + "learning_rate": 1.3383838383838385e-06, + "loss": 0.6909202337265015, + "mean_token_accuracy": 0.7859863638877869, + "num_tokens": 494818.0, + "step": 54 + }, + { + "epoch": 0.04179331306990881, + "grad_norm": 2.1420412063598633, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.5415265560150146, + "mean_token_accuracy": 0.818886399269104, + "num_tokens": 513695.0, + "step": 55 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 2.9610488414764404, + "learning_rate": 1.3888888888888892e-06, + "loss": 0.6602212190628052, + "mean_token_accuracy": 0.7830734252929688, + "num_tokens": 523784.0, + "step": 56 + }, + { + "epoch": 0.04331306990881459, + "grad_norm": 2.511972665786743, + "learning_rate": 1.4141414141414143e-06, + "loss": 0.5717809796333313, + "mean_token_accuracy": 0.8053616285324097, + "num_tokens": 546308.0, + "step": 57 + }, + { + "epoch": 0.044072948328267476, + "grad_norm": 3.52642822265625, + "learning_rate": 1.4393939393939396e-06, + "loss": 0.6242594718933105, + "mean_token_accuracy": 0.8162082433700562, + "num_tokens": 552019.0, + "step": 58 + }, + { + "epoch": 0.044832826747720364, + "grad_norm": 3.02362322807312, + "learning_rate": 1.4646464646464648e-06, + "loss": 0.6634255647659302, + "mean_token_accuracy": 0.7682032585144043, + "num_tokens": 560009.0, + "step": 59 + }, + { + "epoch": 0.04559270516717325, + "grad_norm": 2.3910107612609863, + "learning_rate": 1.48989898989899e-06, + "loss": 0.5519146919250488, + "mean_token_accuracy": 0.8270269632339478, + "num_tokens": 571005.0, + "step": 60 + }, + { + "epoch": 0.04635258358662614, + "grad_norm": 4.28154993057251, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.7437789440155029, + "mean_token_accuracy": 0.7782418131828308, + "num_tokens": 574950.0, + "step": 61 + }, + { + "epoch": 0.04711246200607903, + "grad_norm": 3.4078686237335205, + "learning_rate": 1.5404040404040404e-06, + "loss": 0.6345915198326111, + "mean_token_accuracy": 0.7903392314910889, + "num_tokens": 581657.0, + "step": 62 + }, + { + "epoch": 0.047872340425531915, + "grad_norm": 2.6834158897399902, + "learning_rate": 1.565656565656566e-06, + "loss": 0.5981127023696899, + "mean_token_accuracy": 0.7911489605903625, + "num_tokens": 591267.0, + "step": 63 + }, + { + "epoch": 0.0486322188449848, + "grad_norm": 2.1054461002349854, + "learning_rate": 1.590909090909091e-06, + "loss": 0.5523523688316345, + "mean_token_accuracy": 0.8194501399993896, + "num_tokens": 606787.0, + "step": 64 + }, + { + "epoch": 0.04939209726443769, + "grad_norm": 3.322596788406372, + "learning_rate": 1.6161616161616164e-06, + "loss": 0.48417025804519653, + "mean_token_accuracy": 0.8293706178665161, + "num_tokens": 611068.0, + "step": 65 + }, + { + "epoch": 0.05015197568389058, + "grad_norm": 2.302450180053711, + "learning_rate": 1.6414141414141415e-06, + "loss": 0.6498389840126038, + "mean_token_accuracy": 0.7728497385978699, + "num_tokens": 624452.0, + "step": 66 + }, + { + "epoch": 0.050911854103343465, + "grad_norm": 2.680191993713379, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.6347037553787231, + "mean_token_accuracy": 0.8108306527137756, + "num_tokens": 638049.0, + "step": 67 + }, + { + "epoch": 0.05167173252279635, + "grad_norm": 3.0297021865844727, + "learning_rate": 1.6919191919191922e-06, + "loss": 0.5344363451004028, + "mean_token_accuracy": 0.8113535046577454, + "num_tokens": 643892.0, + "step": 68 + }, + { + "epoch": 0.05243161094224924, + "grad_norm": 2.9283676147460938, + "learning_rate": 1.7171717171717173e-06, + "loss": 0.6999260187149048, + "mean_token_accuracy": 0.7782022356987, + "num_tokens": 654418.0, + "step": 69 + }, + { + "epoch": 0.05319148936170213, + "grad_norm": 3.4098572731018066, + "learning_rate": 1.7424242424242427e-06, + "loss": 0.6508946418762207, + "mean_token_accuracy": 0.7942900657653809, + "num_tokens": 659837.0, + "step": 70 + }, + { + "epoch": 0.053951367781155016, + "grad_norm": 2.6756019592285156, + "learning_rate": 1.7676767676767678e-06, + "loss": 0.603486180305481, + "mean_token_accuracy": 0.8015457391738892, + "num_tokens": 668361.0, + "step": 71 + }, + { + "epoch": 0.0547112462006079, + "grad_norm": 2.2630293369293213, + "learning_rate": 1.792929292929293e-06, + "loss": 0.6608274579048157, + "mean_token_accuracy": 0.7753809690475464, + "num_tokens": 679025.0, + "step": 72 + }, + { + "epoch": 0.05547112462006079, + "grad_norm": 2.123962879180908, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.4525482654571533, + "mean_token_accuracy": 0.8425612449645996, + "num_tokens": 688574.0, + "step": 73 + }, + { + "epoch": 0.05623100303951368, + "grad_norm": 7.90519905090332, + "learning_rate": 1.8434343434343434e-06, + "loss": 0.6507195830345154, + "mean_token_accuracy": 0.7714964151382446, + "num_tokens": 694534.0, + "step": 74 + }, + { + "epoch": 0.056990881458966566, + "grad_norm": 2.372203826904297, + "learning_rate": 1.868686868686869e-06, + "loss": 0.4458143413066864, + "mean_token_accuracy": 0.7991449236869812, + "num_tokens": 703114.0, + "step": 75 + }, + { + "epoch": 0.057750759878419454, + "grad_norm": 2.918677568435669, + "learning_rate": 1.8939393939393941e-06, + "loss": 0.5614339113235474, + "mean_token_accuracy": 0.8211464881896973, + "num_tokens": 709038.0, + "step": 76 + }, + { + "epoch": 0.05851063829787234, + "grad_norm": 1.6106709241867065, + "learning_rate": 1.9191919191919192e-06, + "loss": 0.5802098512649536, + "mean_token_accuracy": 0.8055065870285034, + "num_tokens": 730482.0, + "step": 77 + }, + { + "epoch": 0.05927051671732523, + "grad_norm": 2.8069989681243896, + "learning_rate": 1.944444444444445e-06, + "loss": 0.5709059238433838, + "mean_token_accuracy": 0.8024872541427612, + "num_tokens": 751817.0, + "step": 78 + }, + { + "epoch": 0.06003039513677812, + "grad_norm": 2.641667127609253, + "learning_rate": 1.96969696969697e-06, + "loss": 0.6480152606964111, + "mean_token_accuracy": 0.7912271618843079, + "num_tokens": 759236.0, + "step": 79 + }, + { + "epoch": 0.060790273556231005, + "grad_norm": 2.6034350395202637, + "learning_rate": 1.994949494949495e-06, + "loss": 0.5535176396369934, + "mean_token_accuracy": 0.7980542778968811, + "num_tokens": 766496.0, + "step": 80 + }, + { + "epoch": 0.06155015197568389, + "grad_norm": 1.7095069885253906, + "learning_rate": 2.02020202020202e-06, + "loss": 0.4545496106147766, + "mean_token_accuracy": 0.8229660391807556, + "num_tokens": 780124.0, + "step": 81 + }, + { + "epoch": 0.06231003039513678, + "grad_norm": 3.788830518722534, + "learning_rate": 2.0454545454545457e-06, + "loss": 0.6679391264915466, + "mean_token_accuracy": 0.7942397594451904, + "num_tokens": 784555.0, + "step": 82 + }, + { + "epoch": 0.06306990881458967, + "grad_norm": 2.009831666946411, + "learning_rate": 2.070707070707071e-06, + "loss": 0.5067101120948792, + "mean_token_accuracy": 0.8276634216308594, + "num_tokens": 797459.0, + "step": 83 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 2.201627731323242, + "learning_rate": 2.095959595959596e-06, + "loss": 0.5012127161026001, + "mean_token_accuracy": 0.8432504534721375, + "num_tokens": 810817.0, + "step": 84 + }, + { + "epoch": 0.06458966565349544, + "grad_norm": 2.492568016052246, + "learning_rate": 2.1212121212121216e-06, + "loss": 0.6142797470092773, + "mean_token_accuracy": 0.8338661193847656, + "num_tokens": 818191.0, + "step": 85 + }, + { + "epoch": 0.06534954407294832, + "grad_norm": 2.8360862731933594, + "learning_rate": 2.1464646464646467e-06, + "loss": 0.5569300651550293, + "mean_token_accuracy": 0.8121030330657959, + "num_tokens": 825325.0, + "step": 86 + }, + { + "epoch": 0.06610942249240122, + "grad_norm": 2.407548427581787, + "learning_rate": 2.171717171717172e-06, + "loss": 0.6442930102348328, + "mean_token_accuracy": 0.792514443397522, + "num_tokens": 834439.0, + "step": 87 + }, + { + "epoch": 0.0668693009118541, + "grad_norm": 2.340728759765625, + "learning_rate": 2.196969696969697e-06, + "loss": 0.6494365930557251, + "mean_token_accuracy": 0.7746615409851074, + "num_tokens": 843078.0, + "step": 88 + }, + { + "epoch": 0.067629179331307, + "grad_norm": 1.7703697681427002, + "learning_rate": 2.222222222222222e-06, + "loss": 0.598991870880127, + "mean_token_accuracy": 0.7992157340049744, + "num_tokens": 860171.0, + "step": 89 + }, + { + "epoch": 0.06838905775075987, + "grad_norm": 2.5779271125793457, + "learning_rate": 2.2474747474747476e-06, + "loss": 0.5693082809448242, + "mean_token_accuracy": 0.8093700408935547, + "num_tokens": 866669.0, + "step": 90 + }, + { + "epoch": 0.06914893617021277, + "grad_norm": 2.014092206954956, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.5346695780754089, + "mean_token_accuracy": 0.8165590763092041, + "num_tokens": 876698.0, + "step": 91 + }, + { + "epoch": 0.06990881458966565, + "grad_norm": 1.7555919885635376, + "learning_rate": 2.2979797979797983e-06, + "loss": 0.5321458578109741, + "mean_token_accuracy": 0.8166656494140625, + "num_tokens": 889488.0, + "step": 92 + }, + { + "epoch": 0.07066869300911854, + "grad_norm": 1.8631824254989624, + "learning_rate": 2.3232323232323234e-06, + "loss": 0.5246532559394836, + "mean_token_accuracy": 0.8088107705116272, + "num_tokens": 901322.0, + "step": 93 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 3.2332139015197754, + "learning_rate": 2.348484848484849e-06, + "loss": 0.5141711235046387, + "mean_token_accuracy": 0.8382217884063721, + "num_tokens": 905792.0, + "step": 94 + }, + { + "epoch": 0.07218844984802432, + "grad_norm": 1.7806555032730103, + "learning_rate": 2.373737373737374e-06, + "loss": 0.5233149528503418, + "mean_token_accuracy": 0.8101529479026794, + "num_tokens": 917320.0, + "step": 95 + }, + { + "epoch": 0.0729483282674772, + "grad_norm": 1.8169859647750854, + "learning_rate": 2.3989898989898993e-06, + "loss": 0.578881561756134, + "mean_token_accuracy": 0.8044873476028442, + "num_tokens": 931062.0, + "step": 96 + }, + { + "epoch": 0.0737082066869301, + "grad_norm": 4.677402496337891, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.7842556238174438, + "mean_token_accuracy": 0.7579764127731323, + "num_tokens": 934712.0, + "step": 97 + }, + { + "epoch": 0.07446808510638298, + "grad_norm": 2.6987264156341553, + "learning_rate": 2.4494949494949495e-06, + "loss": 0.5669287443161011, + "mean_token_accuracy": 0.8186933994293213, + "num_tokens": 941058.0, + "step": 98 + }, + { + "epoch": 0.07522796352583587, + "grad_norm": 1.6906023025512695, + "learning_rate": 2.474747474747475e-06, + "loss": 0.4976363778114319, + "mean_token_accuracy": 0.8198553323745728, + "num_tokens": 956509.0, + "step": 99 + }, + { + "epoch": 0.07598784194528875, + "grad_norm": 2.7256152629852295, + "learning_rate": 2.5e-06, + "loss": 0.7138420343399048, + "mean_token_accuracy": 0.7752805948257446, + "num_tokens": 963920.0, + "step": 100 + }, + { + "epoch": 0.07674772036474165, + "grad_norm": 2.174870491027832, + "learning_rate": 2.5252525252525258e-06, + "loss": 0.6733541488647461, + "mean_token_accuracy": 0.7745175361633301, + "num_tokens": 975268.0, + "step": 101 + }, + { + "epoch": 0.07750759878419453, + "grad_norm": 1.5587213039398193, + "learning_rate": 2.5505050505050505e-06, + "loss": 0.44223445653915405, + "mean_token_accuracy": 0.8278359174728394, + "num_tokens": 991837.0, + "step": 102 + }, + { + "epoch": 0.07826747720364742, + "grad_norm": 2.181840658187866, + "learning_rate": 2.575757575757576e-06, + "loss": 0.625128448009491, + "mean_token_accuracy": 0.7941786050796509, + "num_tokens": 1004325.0, + "step": 103 + }, + { + "epoch": 0.0790273556231003, + "grad_norm": 1.4986687898635864, + "learning_rate": 2.601010101010101e-06, + "loss": 0.39262527227401733, + "mean_token_accuracy": 0.8412648439407349, + "num_tokens": 1018331.0, + "step": 104 + }, + { + "epoch": 0.0797872340425532, + "grad_norm": 2.3416061401367188, + "learning_rate": 2.6262626262626267e-06, + "loss": 0.5495132803916931, + "mean_token_accuracy": 0.8193322420120239, + "num_tokens": 1026090.0, + "step": 105 + }, + { + "epoch": 0.08054711246200608, + "grad_norm": 3.8168859481811523, + "learning_rate": 2.6515151515151514e-06, + "loss": 0.4898706376552582, + "mean_token_accuracy": 0.8467956185340881, + "num_tokens": 1029955.0, + "step": 106 + }, + { + "epoch": 0.08130699088145897, + "grad_norm": 4.113908767700195, + "learning_rate": 2.676767676767677e-06, + "loss": 0.6189584732055664, + "mean_token_accuracy": 0.8019394278526306, + "num_tokens": 1033598.0, + "step": 107 + }, + { + "epoch": 0.08206686930091185, + "grad_norm": 2.50003981590271, + "learning_rate": 2.7020202020202025e-06, + "loss": 0.6479471921920776, + "mean_token_accuracy": 0.7790026664733887, + "num_tokens": 1042533.0, + "step": 108 + }, + { + "epoch": 0.08282674772036475, + "grad_norm": 1.408934473991394, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.3909248113632202, + "mean_token_accuracy": 0.8477586507797241, + "num_tokens": 1061755.0, + "step": 109 + }, + { + "epoch": 0.08358662613981763, + "grad_norm": 3.360633611679077, + "learning_rate": 2.7525252525252528e-06, + "loss": 0.6952459812164307, + "mean_token_accuracy": 0.777535080909729, + "num_tokens": 1067316.0, + "step": 110 + }, + { + "epoch": 0.08434650455927052, + "grad_norm": 1.8631696701049805, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.5420593023300171, + "mean_token_accuracy": 0.8157662749290466, + "num_tokens": 1079930.0, + "step": 111 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.4308314323425293, + "learning_rate": 2.803030303030303e-06, + "loss": 0.5863882303237915, + "mean_token_accuracy": 0.8206346035003662, + "num_tokens": 1088069.0, + "step": 112 + }, + { + "epoch": 0.0858662613981763, + "grad_norm": 2.922808885574341, + "learning_rate": 2.8282828282828286e-06, + "loss": 0.5217319130897522, + "mean_token_accuracy": 0.8253234028816223, + "num_tokens": 1093607.0, + "step": 113 + }, + { + "epoch": 0.08662613981762918, + "grad_norm": 2.3596107959747314, + "learning_rate": 2.8535353535353537e-06, + "loss": 0.5070714950561523, + "mean_token_accuracy": 0.8258323669433594, + "num_tokens": 1100405.0, + "step": 114 + }, + { + "epoch": 0.08738601823708207, + "grad_norm": 3.0853066444396973, + "learning_rate": 2.8787878787878793e-06, + "loss": 0.591964840888977, + "mean_token_accuracy": 0.8047322630882263, + "num_tokens": 1107535.0, + "step": 115 + }, + { + "epoch": 0.08814589665653495, + "grad_norm": 1.9251092672348022, + "learning_rate": 2.904040404040404e-06, + "loss": 0.5226191878318787, + "mean_token_accuracy": 0.8022720217704773, + "num_tokens": 1118716.0, + "step": 116 + }, + { + "epoch": 0.08890577507598785, + "grad_norm": 1.9692988395690918, + "learning_rate": 2.9292929292929295e-06, + "loss": 0.5462069511413574, + "mean_token_accuracy": 0.8157015442848206, + "num_tokens": 1131917.0, + "step": 117 + }, + { + "epoch": 0.08966565349544073, + "grad_norm": 1.4738909006118774, + "learning_rate": 2.954545454545455e-06, + "loss": 0.4564219117164612, + "mean_token_accuracy": 0.849632978439331, + "num_tokens": 1148534.0, + "step": 118 + }, + { + "epoch": 0.09042553191489362, + "grad_norm": 2.72646164894104, + "learning_rate": 2.97979797979798e-06, + "loss": 0.6654808521270752, + "mean_token_accuracy": 0.7752684354782104, + "num_tokens": 1155438.0, + "step": 119 + }, + { + "epoch": 0.0911854103343465, + "grad_norm": 2.7843852043151855, + "learning_rate": 3.0050505050505054e-06, + "loss": 0.5354680418968201, + "mean_token_accuracy": 0.8196378946304321, + "num_tokens": 1161815.0, + "step": 120 + }, + { + "epoch": 0.0919452887537994, + "grad_norm": 2.8052573204040527, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.6366757154464722, + "mean_token_accuracy": 0.7967483997344971, + "num_tokens": 1168295.0, + "step": 121 + }, + { + "epoch": 0.09270516717325228, + "grad_norm": 2.7462735176086426, + "learning_rate": 3.055555555555556e-06, + "loss": 0.59470534324646, + "mean_token_accuracy": 0.8023771047592163, + "num_tokens": 1174502.0, + "step": 122 + }, + { + "epoch": 0.09346504559270517, + "grad_norm": 2.2743821144104004, + "learning_rate": 3.0808080808080807e-06, + "loss": 0.5720560550689697, + "mean_token_accuracy": 0.8162771463394165, + "num_tokens": 1183615.0, + "step": 123 + }, + { + "epoch": 0.09422492401215805, + "grad_norm": 1.8669533729553223, + "learning_rate": 3.1060606060606063e-06, + "loss": 0.4655378758907318, + "mean_token_accuracy": 0.8360732793807983, + "num_tokens": 1193761.0, + "step": 124 + }, + { + "epoch": 0.09498480243161095, + "grad_norm": 1.7666901350021362, + "learning_rate": 3.131313131313132e-06, + "loss": 0.5524153709411621, + "mean_token_accuracy": 0.8252713680267334, + "num_tokens": 1207870.0, + "step": 125 + }, + { + "epoch": 0.09574468085106383, + "grad_norm": 2.4720070362091064, + "learning_rate": 3.1565656565656566e-06, + "loss": 0.5003011226654053, + "mean_token_accuracy": 0.8491042852401733, + "num_tokens": 1214603.0, + "step": 126 + }, + { + "epoch": 0.09650455927051672, + "grad_norm": 1.6500422954559326, + "learning_rate": 3.181818181818182e-06, + "loss": 0.5137069225311279, + "mean_token_accuracy": 0.8273531198501587, + "num_tokens": 1228717.0, + "step": 127 + }, + { + "epoch": 0.0972644376899696, + "grad_norm": 3.402543067932129, + "learning_rate": 3.2070707070707072e-06, + "loss": 0.708167552947998, + "mean_token_accuracy": 0.7705385684967041, + "num_tokens": 1234361.0, + "step": 128 + }, + { + "epoch": 0.0980243161094225, + "grad_norm": 2.547285795211792, + "learning_rate": 3.232323232323233e-06, + "loss": 0.6020137071609497, + "mean_token_accuracy": 0.7981340289115906, + "num_tokens": 1244169.0, + "step": 129 + }, + { + "epoch": 0.09878419452887538, + "grad_norm": 2.0578792095184326, + "learning_rate": 3.257575757575758e-06, + "loss": 0.4425000250339508, + "mean_token_accuracy": 0.8567807674407959, + "num_tokens": 1252709.0, + "step": 130 + }, + { + "epoch": 0.09954407294832827, + "grad_norm": 1.672614336013794, + "learning_rate": 3.282828282828283e-06, + "loss": 0.4860966205596924, + "mean_token_accuracy": 0.8393139243125916, + "num_tokens": 1265766.0, + "step": 131 + }, + { + "epoch": 0.10030395136778116, + "grad_norm": 3.2560198307037354, + "learning_rate": 3.3080808080808086e-06, + "loss": 0.624736487865448, + "mean_token_accuracy": 0.7875322699546814, + "num_tokens": 1270779.0, + "step": 132 + }, + { + "epoch": 0.10106382978723404, + "grad_norm": 2.4468185901641846, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5062227249145508, + "mean_token_accuracy": 0.8217229843139648, + "num_tokens": 1277113.0, + "step": 133 + }, + { + "epoch": 0.10182370820668693, + "grad_norm": 2.6371328830718994, + "learning_rate": 3.358585858585859e-06, + "loss": 0.477113276720047, + "mean_token_accuracy": 0.8605583906173706, + "num_tokens": 1282514.0, + "step": 134 + }, + { + "epoch": 0.10258358662613981, + "grad_norm": 2.48421311378479, + "learning_rate": 3.3838383838383844e-06, + "loss": 0.40855684876441956, + "mean_token_accuracy": 0.864548921585083, + "num_tokens": 1287859.0, + "step": 135 + }, + { + "epoch": 0.1033434650455927, + "grad_norm": 1.993099331855774, + "learning_rate": 3.409090909090909e-06, + "loss": 0.5913145542144775, + "mean_token_accuracy": 0.8248485922813416, + "num_tokens": 1301074.0, + "step": 136 + }, + { + "epoch": 0.10410334346504559, + "grad_norm": 3.5947680473327637, + "learning_rate": 3.4343434343434347e-06, + "loss": 0.5028599500656128, + "mean_token_accuracy": 0.8367215394973755, + "num_tokens": 1305219.0, + "step": 137 + }, + { + "epoch": 0.10486322188449848, + "grad_norm": 2.5778582096099854, + "learning_rate": 3.45959595959596e-06, + "loss": 0.5297672748565674, + "mean_token_accuracy": 0.8232187032699585, + "num_tokens": 1312482.0, + "step": 138 + }, + { + "epoch": 0.10562310030395136, + "grad_norm": 1.8961588144302368, + "learning_rate": 3.4848484848484854e-06, + "loss": 0.39954107999801636, + "mean_token_accuracy": 0.8605833053588867, + "num_tokens": 1323404.0, + "step": 139 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 1.9687960147857666, + "learning_rate": 3.51010101010101e-06, + "loss": 0.48791587352752686, + "mean_token_accuracy": 0.8200347423553467, + "num_tokens": 1333027.0, + "step": 140 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 2.520242691040039, + "learning_rate": 3.5353535353535356e-06, + "loss": 0.6106002330780029, + "mean_token_accuracy": 0.790692150592804, + "num_tokens": 1340999.0, + "step": 141 + }, + { + "epoch": 0.10790273556231003, + "grad_norm": 3.751617431640625, + "learning_rate": 3.560606060606061e-06, + "loss": 0.48141729831695557, + "mean_token_accuracy": 0.8421382904052734, + "num_tokens": 1344687.0, + "step": 142 + }, + { + "epoch": 0.10866261398176291, + "grad_norm": 2.7101709842681885, + "learning_rate": 3.585858585858586e-06, + "loss": 0.5375241637229919, + "mean_token_accuracy": 0.8061438202857971, + "num_tokens": 1350192.0, + "step": 143 + }, + { + "epoch": 0.1094224924012158, + "grad_norm": 2.583484411239624, + "learning_rate": 3.6111111111111115e-06, + "loss": 0.6492470502853394, + "mean_token_accuracy": 0.7863001823425293, + "num_tokens": 1358148.0, + "step": 144 + }, + { + "epoch": 0.11018237082066869, + "grad_norm": 1.792561650276184, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.48480600118637085, + "mean_token_accuracy": 0.8358709812164307, + "num_tokens": 1369519.0, + "step": 145 + }, + { + "epoch": 0.11094224924012158, + "grad_norm": 2.6480472087860107, + "learning_rate": 3.661616161616162e-06, + "loss": 0.5268933176994324, + "mean_token_accuracy": 0.8214013576507568, + "num_tokens": 1375862.0, + "step": 146 + }, + { + "epoch": 0.11170212765957446, + "grad_norm": 2.3174469470977783, + "learning_rate": 3.686868686868687e-06, + "loss": 0.42517897486686707, + "mean_token_accuracy": 0.8523461222648621, + "num_tokens": 1381546.0, + "step": 147 + }, + { + "epoch": 0.11246200607902736, + "grad_norm": 3.0090949535369873, + "learning_rate": 3.7121212121212124e-06, + "loss": 0.4042336940765381, + "mean_token_accuracy": 0.8670448064804077, + "num_tokens": 1385896.0, + "step": 148 + }, + { + "epoch": 0.11322188449848024, + "grad_norm": 2.4928104877471924, + "learning_rate": 3.737373737373738e-06, + "loss": 0.6498878598213196, + "mean_token_accuracy": 0.7967068552970886, + "num_tokens": 1394169.0, + "step": 149 + }, + { + "epoch": 0.11398176291793313, + "grad_norm": 1.5984913110733032, + "learning_rate": 3.7626262626262627e-06, + "loss": 0.546096920967102, + "mean_token_accuracy": 0.8035850524902344, + "num_tokens": 1408785.0, + "step": 150 + }, + { + "epoch": 0.11474164133738601, + "grad_norm": 2.3663532733917236, + "learning_rate": 3.7878787878787882e-06, + "loss": 0.6111721992492676, + "mean_token_accuracy": 0.8015355467796326, + "num_tokens": 1417510.0, + "step": 151 + }, + { + "epoch": 0.11550151975683891, + "grad_norm": 2.518932819366455, + "learning_rate": 3.8131313131313138e-06, + "loss": 0.5274964570999146, + "mean_token_accuracy": 0.8155480623245239, + "num_tokens": 1424186.0, + "step": 152 + }, + { + "epoch": 0.11626139817629179, + "grad_norm": 2.14353609085083, + "learning_rate": 3.8383838383838385e-06, + "loss": 0.5283297896385193, + "mean_token_accuracy": 0.8275758028030396, + "num_tokens": 1432630.0, + "step": 153 + }, + { + "epoch": 0.11702127659574468, + "grad_norm": 1.8243604898452759, + "learning_rate": 3.863636363636364e-06, + "loss": 0.41854870319366455, + "mean_token_accuracy": 0.8222295045852661, + "num_tokens": 1442691.0, + "step": 154 + }, + { + "epoch": 0.11778115501519756, + "grad_norm": 2.088212251663208, + "learning_rate": 3.88888888888889e-06, + "loss": 0.6062943339347839, + "mean_token_accuracy": 0.8009427785873413, + "num_tokens": 1456890.0, + "step": 155 + }, + { + "epoch": 0.11854103343465046, + "grad_norm": 1.3469511270523071, + "learning_rate": 3.914141414141415e-06, + "loss": 0.4390433728694916, + "mean_token_accuracy": 0.8436295986175537, + "num_tokens": 1475349.0, + "step": 156 + }, + { + "epoch": 0.11930091185410334, + "grad_norm": 3.247023105621338, + "learning_rate": 3.93939393939394e-06, + "loss": 0.6490433216094971, + "mean_token_accuracy": 0.8037861585617065, + "num_tokens": 1479952.0, + "step": 157 + }, + { + "epoch": 0.12006079027355623, + "grad_norm": 2.6610445976257324, + "learning_rate": 3.964646464646465e-06, + "loss": 0.6221826076507568, + "mean_token_accuracy": 0.7848749160766602, + "num_tokens": 1487306.0, + "step": 158 + }, + { + "epoch": 0.12082066869300911, + "grad_norm": 2.3060810565948486, + "learning_rate": 3.98989898989899e-06, + "loss": 0.5052388310432434, + "mean_token_accuracy": 0.8281195759773254, + "num_tokens": 1495367.0, + "step": 159 + }, + { + "epoch": 0.12158054711246201, + "grad_norm": 2.504448652267456, + "learning_rate": 4.015151515151515e-06, + "loss": 0.5005477666854858, + "mean_token_accuracy": 0.8408058881759644, + "num_tokens": 1502069.0, + "step": 160 + }, + { + "epoch": 0.12234042553191489, + "grad_norm": 3.993938446044922, + "learning_rate": 4.04040404040404e-06, + "loss": 0.5569638013839722, + "mean_token_accuracy": 0.8095242977142334, + "num_tokens": 1510224.0, + "step": 161 + }, + { + "epoch": 0.12310030395136778, + "grad_norm": 2.2287683486938477, + "learning_rate": 4.065656565656566e-06, + "loss": 0.524042546749115, + "mean_token_accuracy": 0.8102203607559204, + "num_tokens": 1518364.0, + "step": 162 + }, + { + "epoch": 0.12386018237082067, + "grad_norm": 1.9531738758087158, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.45794573426246643, + "mean_token_accuracy": 0.8560376167297363, + "num_tokens": 1528097.0, + "step": 163 + }, + { + "epoch": 0.12462006079027356, + "grad_norm": 1.5841206312179565, + "learning_rate": 4.116161616161617e-06, + "loss": 0.5420972108840942, + "mean_token_accuracy": 0.8092726469039917, + "num_tokens": 1544119.0, + "step": 164 + }, + { + "epoch": 0.12537993920972645, + "grad_norm": 1.7536218166351318, + "learning_rate": 4.141414141414142e-06, + "loss": 0.554668664932251, + "mean_token_accuracy": 0.8193825483322144, + "num_tokens": 1559140.0, + "step": 165 + }, + { + "epoch": 0.12613981762917933, + "grad_norm": 3.545454740524292, + "learning_rate": 4.166666666666667e-06, + "loss": 0.580947995185852, + "mean_token_accuracy": 0.8286383152008057, + "num_tokens": 1563625.0, + "step": 166 + }, + { + "epoch": 0.12689969604863222, + "grad_norm": 1.6608915328979492, + "learning_rate": 4.191919191919192e-06, + "loss": 0.5523324012756348, + "mean_token_accuracy": 0.8155215978622437, + "num_tokens": 1574945.0, + "step": 167 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 1.4832708835601807, + "learning_rate": 4.217171717171717e-06, + "loss": 0.5133191347122192, + "mean_token_accuracy": 0.8367571830749512, + "num_tokens": 1595865.0, + "step": 168 + }, + { + "epoch": 0.128419452887538, + "grad_norm": 1.7807520627975464, + "learning_rate": 4.242424242424243e-06, + "loss": 0.5131410360336304, + "mean_token_accuracy": 0.8129367232322693, + "num_tokens": 1608723.0, + "step": 169 + }, + { + "epoch": 0.12917933130699089, + "grad_norm": 2.707569122314453, + "learning_rate": 4.267676767676767e-06, + "loss": 0.6129013299942017, + "mean_token_accuracy": 0.7926048040390015, + "num_tokens": 1616136.0, + "step": 170 + }, + { + "epoch": 0.12993920972644377, + "grad_norm": 2.5831644535064697, + "learning_rate": 4.292929292929293e-06, + "loss": 0.6264227628707886, + "mean_token_accuracy": 0.8074911236763, + "num_tokens": 1624228.0, + "step": 171 + }, + { + "epoch": 0.13069908814589665, + "grad_norm": 3.1124250888824463, + "learning_rate": 4.3181818181818185e-06, + "loss": 0.41763827204704285, + "mean_token_accuracy": 0.8565453290939331, + "num_tokens": 1628098.0, + "step": 172 + }, + { + "epoch": 0.13145896656534956, + "grad_norm": 2.3214211463928223, + "learning_rate": 4.343434343434344e-06, + "loss": 0.421974778175354, + "mean_token_accuracy": 0.8391546010971069, + "num_tokens": 1634950.0, + "step": 173 + }, + { + "epoch": 0.13221884498480244, + "grad_norm": 2.1010327339172363, + "learning_rate": 4.368686868686869e-06, + "loss": 0.5307331681251526, + "mean_token_accuracy": 0.8139588236808777, + "num_tokens": 1644132.0, + "step": 174 + }, + { + "epoch": 0.13297872340425532, + "grad_norm": 2.533612012863159, + "learning_rate": 4.393939393939394e-06, + "loss": 0.5626664161682129, + "mean_token_accuracy": 0.8029808402061462, + "num_tokens": 1651637.0, + "step": 175 + }, + { + "epoch": 0.1337386018237082, + "grad_norm": 1.669508457183838, + "learning_rate": 4.41919191919192e-06, + "loss": 0.5351508259773254, + "mean_token_accuracy": 0.8281655311584473, + "num_tokens": 1666776.0, + "step": 176 + }, + { + "epoch": 0.1344984802431611, + "grad_norm": 1.7579659223556519, + "learning_rate": 4.444444444444444e-06, + "loss": 0.5235031247138977, + "mean_token_accuracy": 0.8143284320831299, + "num_tokens": 1679241.0, + "step": 177 + }, + { + "epoch": 0.135258358662614, + "grad_norm": 3.123563528060913, + "learning_rate": 4.46969696969697e-06, + "loss": 0.43051332235336304, + "mean_token_accuracy": 0.8518186211585999, + "num_tokens": 1683317.0, + "step": 178 + }, + { + "epoch": 0.13601823708206687, + "grad_norm": 2.2411575317382812, + "learning_rate": 4.494949494949495e-06, + "loss": 0.5471380949020386, + "mean_token_accuracy": 0.8267596960067749, + "num_tokens": 1691366.0, + "step": 179 + }, + { + "epoch": 0.13677811550151975, + "grad_norm": 2.621973991394043, + "learning_rate": 4.520202020202021e-06, + "loss": 0.5685839653015137, + "mean_token_accuracy": 0.8260642290115356, + "num_tokens": 1698148.0, + "step": 180 + }, + { + "epoch": 0.13753799392097266, + "grad_norm": 2.1553852558135986, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.5703883171081543, + "mean_token_accuracy": 0.8219090700149536, + "num_tokens": 1707225.0, + "step": 181 + }, + { + "epoch": 0.13829787234042554, + "grad_norm": 5.1767897605896, + "learning_rate": 4.5707070707070715e-06, + "loss": 0.32704639434814453, + "mean_token_accuracy": 0.8754568099975586, + "num_tokens": 1712748.0, + "step": 182 + }, + { + "epoch": 0.13905775075987842, + "grad_norm": 2.609168291091919, + "learning_rate": 4.595959595959597e-06, + "loss": 0.5939987301826477, + "mean_token_accuracy": 0.8034975528717041, + "num_tokens": 1719932.0, + "step": 183 + }, + { + "epoch": 0.1398176291793313, + "grad_norm": 2.2059099674224854, + "learning_rate": 4.621212121212122e-06, + "loss": 0.5310720205307007, + "mean_token_accuracy": 0.8177368640899658, + "num_tokens": 1727640.0, + "step": 184 + }, + { + "epoch": 0.1405775075987842, + "grad_norm": 2.6367759704589844, + "learning_rate": 4.646464646464647e-06, + "loss": 0.522086501121521, + "mean_token_accuracy": 0.826233983039856, + "num_tokens": 1733609.0, + "step": 185 + }, + { + "epoch": 0.1413373860182371, + "grad_norm": 3.326732873916626, + "learning_rate": 4.671717171717172e-06, + "loss": 0.4127829074859619, + "mean_token_accuracy": 0.8551101684570312, + "num_tokens": 1737256.0, + "step": 186 + }, + { + "epoch": 0.14209726443768997, + "grad_norm": 1.828412413597107, + "learning_rate": 4.696969696969698e-06, + "loss": 0.5444269180297852, + "mean_token_accuracy": 0.8350818157196045, + "num_tokens": 1750196.0, + "step": 187 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 3.209203004837036, + "learning_rate": 4.722222222222222e-06, + "loss": 0.5087994933128357, + "mean_token_accuracy": 0.8349015712738037, + "num_tokens": 1754836.0, + "step": 188 + }, + { + "epoch": 0.14361702127659576, + "grad_norm": 1.7339166402816772, + "learning_rate": 4.747474747474748e-06, + "loss": 0.5151352286338806, + "mean_token_accuracy": 0.8321266174316406, + "num_tokens": 1766015.0, + "step": 189 + }, + { + "epoch": 0.14437689969604864, + "grad_norm": 2.699068069458008, + "learning_rate": 4.772727272727273e-06, + "loss": 0.4406203031539917, + "mean_token_accuracy": 0.8425000905990601, + "num_tokens": 1771684.0, + "step": 190 + }, + { + "epoch": 0.14513677811550152, + "grad_norm": 2.8117282390594482, + "learning_rate": 4.7979797979797985e-06, + "loss": 0.40428489446640015, + "mean_token_accuracy": 0.8654326796531677, + "num_tokens": 1776301.0, + "step": 191 + }, + { + "epoch": 0.1458966565349544, + "grad_norm": 2.9204647541046143, + "learning_rate": 4.823232323232324e-06, + "loss": 0.4191770553588867, + "mean_token_accuracy": 0.8574687242507935, + "num_tokens": 1781678.0, + "step": 192 + }, + { + "epoch": 0.1466565349544073, + "grad_norm": 2.1648988723754883, + "learning_rate": 4.848484848484849e-06, + "loss": 0.5839012861251831, + "mean_token_accuracy": 0.8053664565086365, + "num_tokens": 1792516.0, + "step": 193 + }, + { + "epoch": 0.1474164133738602, + "grad_norm": 2.3221631050109863, + "learning_rate": 4.873737373737374e-06, + "loss": 0.5037894248962402, + "mean_token_accuracy": 0.8427227139472961, + "num_tokens": 1800192.0, + "step": 194 + }, + { + "epoch": 0.14817629179331307, + "grad_norm": 2.4536430835723877, + "learning_rate": 4.898989898989899e-06, + "loss": 0.42326074838638306, + "mean_token_accuracy": 0.8510633111000061, + "num_tokens": 1806159.0, + "step": 195 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 2.4875805377960205, + "learning_rate": 4.924242424242425e-06, + "loss": 0.539531409740448, + "mean_token_accuracy": 0.8060250282287598, + "num_tokens": 1813392.0, + "step": 196 + }, + { + "epoch": 0.14969604863221886, + "grad_norm": 2.1664798259735107, + "learning_rate": 4.94949494949495e-06, + "loss": 0.42502015829086304, + "mean_token_accuracy": 0.8503251075744629, + "num_tokens": 1821424.0, + "step": 197 + }, + { + "epoch": 0.15045592705167174, + "grad_norm": 2.568808078765869, + "learning_rate": 4.974747474747475e-06, + "loss": 0.5025098323822021, + "mean_token_accuracy": 0.8182311058044434, + "num_tokens": 1827225.0, + "step": 198 + }, + { + "epoch": 0.15121580547112462, + "grad_norm": 1.9116802215576172, + "learning_rate": 5e-06, + "loss": 0.4907258450984955, + "mean_token_accuracy": 0.8310189843177795, + "num_tokens": 1836297.0, + "step": 199 + }, + { + "epoch": 0.1519756838905775, + "grad_norm": 3.150765895843506, + "learning_rate": 4.999999122701883e-06, + "loss": 0.390616774559021, + "mean_token_accuracy": 0.8626647591590881, + "num_tokens": 1839984.0, + "step": 200 + }, + { + "epoch": 0.15273556231003038, + "grad_norm": 3.2229044437408447, + "learning_rate": 4.999996490808146e-06, + "loss": 0.48009657859802246, + "mean_token_accuracy": 0.825214147567749, + "num_tokens": 1844610.0, + "step": 201 + }, + { + "epoch": 0.1534954407294833, + "grad_norm": 1.4473289251327515, + "learning_rate": 4.9999921043206356e-06, + "loss": 0.40135183930397034, + "mean_token_accuracy": 0.8537827730178833, + "num_tokens": 1859573.0, + "step": 202 + }, + { + "epoch": 0.15425531914893617, + "grad_norm": 4.072319507598877, + "learning_rate": 4.999985963242432e-06, + "loss": 0.6158689260482788, + "mean_token_accuracy": 0.8075432777404785, + "num_tokens": 1863147.0, + "step": 203 + }, + { + "epoch": 0.15501519756838905, + "grad_norm": 3.15741229057312, + "learning_rate": 4.999978067577844e-06, + "loss": 0.4603108763694763, + "mean_token_accuracy": 0.8418779373168945, + "num_tokens": 1867201.0, + "step": 204 + }, + { + "epoch": 0.15577507598784193, + "grad_norm": 2.1925418376922607, + "learning_rate": 4.999968417332415e-06, + "loss": 0.5552488565444946, + "mean_token_accuracy": 0.8216016292572021, + "num_tokens": 1874837.0, + "step": 205 + }, + { + "epoch": 0.15653495440729484, + "grad_norm": 2.2518117427825928, + "learning_rate": 4.999957012512916e-06, + "loss": 0.4912569522857666, + "mean_token_accuracy": 0.8284667730331421, + "num_tokens": 1881842.0, + "step": 206 + }, + { + "epoch": 0.15729483282674772, + "grad_norm": 1.8223762512207031, + "learning_rate": 4.999943853127351e-06, + "loss": 0.47709137201309204, + "mean_token_accuracy": 0.8311659097671509, + "num_tokens": 1890805.0, + "step": 207 + }, + { + "epoch": 0.1580547112462006, + "grad_norm": 2.066499948501587, + "learning_rate": 4.999928939184958e-06, + "loss": 0.44794657826423645, + "mean_token_accuracy": 0.8513424396514893, + "num_tokens": 1898264.0, + "step": 208 + }, + { + "epoch": 0.15881458966565348, + "grad_norm": 3.53865909576416, + "learning_rate": 4.999912270696202e-06, + "loss": 0.5978270769119263, + "mean_token_accuracy": 0.8080137968063354, + "num_tokens": 1902435.0, + "step": 209 + }, + { + "epoch": 0.1595744680851064, + "grad_norm": 2.0760679244995117, + "learning_rate": 4.999893847672783e-06, + "loss": 0.5930601358413696, + "mean_token_accuracy": 0.8028650283813477, + "num_tokens": 1912252.0, + "step": 210 + }, + { + "epoch": 0.16033434650455927, + "grad_norm": 2.21551513671875, + "learning_rate": 4.99987367012763e-06, + "loss": 0.6336753964424133, + "mean_token_accuracy": 0.7902286648750305, + "num_tokens": 1922095.0, + "step": 211 + }, + { + "epoch": 0.16109422492401215, + "grad_norm": 1.7654480934143066, + "learning_rate": 4.999851738074904e-06, + "loss": 0.6373403668403625, + "mean_token_accuracy": 0.7802424430847168, + "num_tokens": 1938962.0, + "step": 212 + }, + { + "epoch": 0.16185410334346503, + "grad_norm": 2.852834701538086, + "learning_rate": 4.9998280515300006e-06, + "loss": 0.6418683528900146, + "mean_token_accuracy": 0.7895716428756714, + "num_tokens": 1944668.0, + "step": 213 + }, + { + "epoch": 0.16261398176291794, + "grad_norm": 3.4737212657928467, + "learning_rate": 4.999802610509541e-06, + "loss": 0.6323273181915283, + "mean_token_accuracy": 0.7982614636421204, + "num_tokens": 1949142.0, + "step": 214 + }, + { + "epoch": 0.16337386018237082, + "grad_norm": 3.0802664756774902, + "learning_rate": 4.999775415031381e-06, + "loss": 0.5929068326950073, + "mean_token_accuracy": 0.8112219572067261, + "num_tokens": 1954141.0, + "step": 215 + }, + { + "epoch": 0.1641337386018237, + "grad_norm": 2.9808855056762695, + "learning_rate": 4.999746465114609e-06, + "loss": 0.5556406378746033, + "mean_token_accuracy": 0.8117628693580627, + "num_tokens": 1959406.0, + "step": 216 + }, + { + "epoch": 0.16489361702127658, + "grad_norm": 1.7346166372299194, + "learning_rate": 4.999715760779541e-06, + "loss": 0.5122925043106079, + "mean_token_accuracy": 0.8040724992752075, + "num_tokens": 1971921.0, + "step": 217 + }, + { + "epoch": 0.1656534954407295, + "grad_norm": 1.4183907508850098, + "learning_rate": 4.999683302047729e-06, + "loss": 0.46471893787384033, + "mean_token_accuracy": 0.8381330966949463, + "num_tokens": 1988863.0, + "step": 218 + }, + { + "epoch": 0.16641337386018237, + "grad_norm": 1.6797802448272705, + "learning_rate": 4.999649088941951e-06, + "loss": 0.38348832726478577, + "mean_token_accuracy": 0.8344278931617737, + "num_tokens": 2000003.0, + "step": 219 + }, + { + "epoch": 0.16717325227963525, + "grad_norm": 3.036963939666748, + "learning_rate": 4.999613121486222e-06, + "loss": 0.6062780618667603, + "mean_token_accuracy": 0.8217900991439819, + "num_tokens": 2004813.0, + "step": 220 + }, + { + "epoch": 0.16793313069908813, + "grad_norm": 2.0343217849731445, + "learning_rate": 4.999575399705782e-06, + "loss": 0.5052450895309448, + "mean_token_accuracy": 0.8368623852729797, + "num_tokens": 2013565.0, + "step": 221 + }, + { + "epoch": 0.16869300911854104, + "grad_norm": 2.1162009239196777, + "learning_rate": 4.9995359236271094e-06, + "loss": 0.5169756412506104, + "mean_token_accuracy": 0.8339958190917969, + "num_tokens": 2025763.0, + "step": 222 + }, + { + "epoch": 0.16945288753799392, + "grad_norm": 2.055333375930786, + "learning_rate": 4.9994946932779076e-06, + "loss": 0.6327048540115356, + "mean_token_accuracy": 0.8078711032867432, + "num_tokens": 2037005.0, + "step": 223 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 3.334620475769043, + "learning_rate": 4.999451708687114e-06, + "loss": 0.5688358545303345, + "mean_token_accuracy": 0.8015589714050293, + "num_tokens": 2041473.0, + "step": 224 + }, + { + "epoch": 0.17097264437689969, + "grad_norm": 2.3734676837921143, + "learning_rate": 4.999406969884897e-06, + "loss": 0.5673821568489075, + "mean_token_accuracy": 0.8054057359695435, + "num_tokens": 2049397.0, + "step": 225 + }, + { + "epoch": 0.1717325227963526, + "grad_norm": 1.807358980178833, + "learning_rate": 4.999360476902656e-06, + "loss": 0.4376158118247986, + "mean_token_accuracy": 0.8456039428710938, + "num_tokens": 2058721.0, + "step": 226 + }, + { + "epoch": 0.17249240121580547, + "grad_norm": 3.231638193130493, + "learning_rate": 4.999312229773022e-06, + "loss": 0.5592809915542603, + "mean_token_accuracy": 0.8170154094696045, + "num_tokens": 2063455.0, + "step": 227 + }, + { + "epoch": 0.17325227963525835, + "grad_norm": 2.2717151641845703, + "learning_rate": 4.999262228529855e-06, + "loss": 0.6144396066665649, + "mean_token_accuracy": 0.7948470115661621, + "num_tokens": 2071686.0, + "step": 228 + }, + { + "epoch": 0.17401215805471124, + "grad_norm": 1.4171342849731445, + "learning_rate": 4.99921047320825e-06, + "loss": 0.43680912256240845, + "mean_token_accuracy": 0.84850013256073, + "num_tokens": 2086999.0, + "step": 229 + }, + { + "epoch": 0.17477203647416414, + "grad_norm": 3.162736654281616, + "learning_rate": 4.99915696384453e-06, + "loss": 0.6025407910346985, + "mean_token_accuracy": 0.8042335510253906, + "num_tokens": 2092001.0, + "step": 230 + }, + { + "epoch": 0.17553191489361702, + "grad_norm": 1.8672804832458496, + "learning_rate": 4.99910170047625e-06, + "loss": 0.5843087434768677, + "mean_token_accuracy": 0.8016980886459351, + "num_tokens": 2103372.0, + "step": 231 + }, + { + "epoch": 0.1762917933130699, + "grad_norm": 2.967587471008301, + "learning_rate": 4.999044683142196e-06, + "loss": 0.5123642086982727, + "mean_token_accuracy": 0.8216149806976318, + "num_tokens": 2108008.0, + "step": 232 + }, + { + "epoch": 0.1770516717325228, + "grad_norm": 1.9651981592178345, + "learning_rate": 4.998985911882383e-06, + "loss": 0.5868178606033325, + "mean_token_accuracy": 0.7904198169708252, + "num_tokens": 2119009.0, + "step": 233 + }, + { + "epoch": 0.1778115501519757, + "grad_norm": 2.7785449028015137, + "learning_rate": 4.998925386738063e-06, + "loss": 0.5075510144233704, + "mean_token_accuracy": 0.8280210494995117, + "num_tokens": 2124915.0, + "step": 234 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 2.957470417022705, + "learning_rate": 4.998863107751711e-06, + "loss": 0.5351958274841309, + "mean_token_accuracy": 0.846825122833252, + "num_tokens": 2129905.0, + "step": 235 + }, + { + "epoch": 0.17933130699088146, + "grad_norm": 3.207671880722046, + "learning_rate": 4.99879907496704e-06, + "loss": 0.6209091544151306, + "mean_token_accuracy": 0.789960503578186, + "num_tokens": 2135027.0, + "step": 236 + }, + { + "epoch": 0.18009118541033434, + "grad_norm": 2.018953800201416, + "learning_rate": 4.998733288428987e-06, + "loss": 0.601510763168335, + "mean_token_accuracy": 0.8136930465698242, + "num_tokens": 2147016.0, + "step": 237 + }, + { + "epoch": 0.18085106382978725, + "grad_norm": 2.437281847000122, + "learning_rate": 4.998665748183727e-06, + "loss": 0.5813639163970947, + "mean_token_accuracy": 0.8116716146469116, + "num_tokens": 2155386.0, + "step": 238 + }, + { + "epoch": 0.18161094224924013, + "grad_norm": 1.5708180665969849, + "learning_rate": 4.998596454278661e-06, + "loss": 0.5252395272254944, + "mean_token_accuracy": 0.8193864822387695, + "num_tokens": 2170295.0, + "step": 239 + }, + { + "epoch": 0.182370820668693, + "grad_norm": 1.9921495914459229, + "learning_rate": 4.998525406762422e-06, + "loss": 0.5335029363632202, + "mean_token_accuracy": 0.8120872974395752, + "num_tokens": 2180012.0, + "step": 240 + }, + { + "epoch": 0.1831306990881459, + "grad_norm": 2.6562681198120117, + "learning_rate": 4.998452605684874e-06, + "loss": 0.48021435737609863, + "mean_token_accuracy": 0.8388714790344238, + "num_tokens": 2185607.0, + "step": 241 + }, + { + "epoch": 0.1838905775075988, + "grad_norm": 2.2535853385925293, + "learning_rate": 4.998378051097111e-06, + "loss": 0.5747300386428833, + "mean_token_accuracy": 0.8004639148712158, + "num_tokens": 2194105.0, + "step": 242 + }, + { + "epoch": 0.18465045592705168, + "grad_norm": 1.6151788234710693, + "learning_rate": 4.998301743051459e-06, + "loss": 0.6190565824508667, + "mean_token_accuracy": 0.7816627621650696, + "num_tokens": 2210629.0, + "step": 243 + }, + { + "epoch": 0.18541033434650456, + "grad_norm": 2.1088173389434814, + "learning_rate": 4.9982236816014735e-06, + "loss": 0.4715560972690582, + "mean_token_accuracy": 0.8485721349716187, + "num_tokens": 2218958.0, + "step": 244 + }, + { + "epoch": 0.18617021276595744, + "grad_norm": 2.6168735027313232, + "learning_rate": 4.998143866801941e-06, + "loss": 0.6077103018760681, + "mean_token_accuracy": 0.8057924509048462, + "num_tokens": 2226368.0, + "step": 245 + }, + { + "epoch": 0.18693009118541035, + "grad_norm": 2.5988616943359375, + "learning_rate": 4.99806229870888e-06, + "loss": 0.5021637678146362, + "mean_token_accuracy": 0.8361666202545166, + "num_tokens": 2232485.0, + "step": 246 + }, + { + "epoch": 0.18768996960486323, + "grad_norm": 2.015887498855591, + "learning_rate": 4.9979789773795365e-06, + "loss": 0.4309737980365753, + "mean_token_accuracy": 0.8508044481277466, + "num_tokens": 2240819.0, + "step": 247 + }, + { + "epoch": 0.1884498480243161, + "grad_norm": 2.3115265369415283, + "learning_rate": 4.997893902872389e-06, + "loss": 0.5776500701904297, + "mean_token_accuracy": 0.8079549074172974, + "num_tokens": 2249460.0, + "step": 248 + }, + { + "epoch": 0.189209726443769, + "grad_norm": 1.7387021780014038, + "learning_rate": 4.997807075247147e-06, + "loss": 0.430944561958313, + "mean_token_accuracy": 0.8483544588088989, + "num_tokens": 2259124.0, + "step": 249 + }, + { + "epoch": 0.1899696048632219, + "grad_norm": 1.6378381252288818, + "learning_rate": 4.997718494564747e-06, + "loss": 0.4123363792896271, + "mean_token_accuracy": 0.8557409644126892, + "num_tokens": 2269899.0, + "step": 250 + }, + { + "epoch": 0.19072948328267478, + "grad_norm": 1.336282730102539, + "learning_rate": 4.997628160887361e-06, + "loss": 0.502329409122467, + "mean_token_accuracy": 0.8186938166618347, + "num_tokens": 2292821.0, + "step": 251 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 3.3335583209991455, + "learning_rate": 4.997536074278388e-06, + "loss": 0.584446907043457, + "mean_token_accuracy": 0.8062717318534851, + "num_tokens": 2297175.0, + "step": 252 + }, + { + "epoch": 0.19224924012158054, + "grad_norm": 2.246727228164673, + "learning_rate": 4.9974422348024565e-06, + "loss": 0.5683060884475708, + "mean_token_accuracy": 0.8193703293800354, + "num_tokens": 2305456.0, + "step": 253 + }, + { + "epoch": 0.19300911854103345, + "grad_norm": 2.3520865440368652, + "learning_rate": 4.997346642525429e-06, + "loss": 0.4724946618080139, + "mean_token_accuracy": 0.8426719307899475, + "num_tokens": 2312241.0, + "step": 254 + }, + { + "epoch": 0.19376899696048633, + "grad_norm": 2.7115702629089355, + "learning_rate": 4.9972492975143936e-06, + "loss": 0.5019032955169678, + "mean_token_accuracy": 0.8253573179244995, + "num_tokens": 2318094.0, + "step": 255 + }, + { + "epoch": 0.1945288753799392, + "grad_norm": 1.705528974533081, + "learning_rate": 4.997150199837671e-06, + "loss": 0.45588475465774536, + "mean_token_accuracy": 0.836666464805603, + "num_tokens": 2329025.0, + "step": 256 + }, + { + "epoch": 0.1952887537993921, + "grad_norm": 2.161400318145752, + "learning_rate": 4.997049349564814e-06, + "loss": 0.5170183777809143, + "mean_token_accuracy": 0.8287534117698669, + "num_tokens": 2337448.0, + "step": 257 + }, + { + "epoch": 0.196048632218845, + "grad_norm": 2.629669189453125, + "learning_rate": 4.996946746766602e-06, + "loss": 0.44650501012802124, + "mean_token_accuracy": 0.850114107131958, + "num_tokens": 2343207.0, + "step": 258 + }, + { + "epoch": 0.19680851063829788, + "grad_norm": 1.6735503673553467, + "learning_rate": 4.996842391515045e-06, + "loss": 0.5247820019721985, + "mean_token_accuracy": 0.8285071849822998, + "num_tokens": 2356801.0, + "step": 259 + }, + { + "epoch": 0.19756838905775076, + "grad_norm": 1.2753115892410278, + "learning_rate": 4.996736283883382e-06, + "loss": 0.41870927810668945, + "mean_token_accuracy": 0.8448047637939453, + "num_tokens": 2377306.0, + "step": 260 + }, + { + "epoch": 0.19832826747720364, + "grad_norm": 2.6947314739227295, + "learning_rate": 4.9966284239460875e-06, + "loss": 0.5059205889701843, + "mean_token_accuracy": 0.8430814743041992, + "num_tokens": 2383352.0, + "step": 261 + }, + { + "epoch": 0.19908814589665655, + "grad_norm": 2.0509963035583496, + "learning_rate": 4.996518811778858e-06, + "loss": 0.4565388560295105, + "mean_token_accuracy": 0.8453130722045898, + "num_tokens": 2391149.0, + "step": 262 + }, + { + "epoch": 0.19984802431610943, + "grad_norm": 2.1856348514556885, + "learning_rate": 4.996407447458626e-06, + "loss": 0.531380832195282, + "mean_token_accuracy": 0.8387004137039185, + "num_tokens": 2399875.0, + "step": 263 + }, + { + "epoch": 0.2006079027355623, + "grad_norm": 2.7348573207855225, + "learning_rate": 4.99629433106355e-06, + "loss": 0.5242817401885986, + "mean_token_accuracy": 0.8177423477172852, + "num_tokens": 2406586.0, + "step": 264 + }, + { + "epoch": 0.2013677811550152, + "grad_norm": 1.76587975025177, + "learning_rate": 4.99617946267302e-06, + "loss": 0.49298471212387085, + "mean_token_accuracy": 0.8271149396896362, + "num_tokens": 2418683.0, + "step": 265 + }, + { + "epoch": 0.20212765957446807, + "grad_norm": 2.8129730224609375, + "learning_rate": 4.996062842367655e-06, + "loss": 0.46420302987098694, + "mean_token_accuracy": 0.8453244566917419, + "num_tokens": 2422929.0, + "step": 266 + }, + { + "epoch": 0.20288753799392098, + "grad_norm": 2.575744152069092, + "learning_rate": 4.9959444702293025e-06, + "loss": 0.43208545446395874, + "mean_token_accuracy": 0.8494843244552612, + "num_tokens": 2429567.0, + "step": 267 + }, + { + "epoch": 0.20364741641337386, + "grad_norm": 2.7586750984191895, + "learning_rate": 4.995824346341041e-06, + "loss": 0.4390473961830139, + "mean_token_accuracy": 0.8348895311355591, + "num_tokens": 2434700.0, + "step": 268 + }, + { + "epoch": 0.20440729483282674, + "grad_norm": 1.972145438194275, + "learning_rate": 4.99570247078718e-06, + "loss": 0.6219544410705566, + "mean_token_accuracy": 0.7939999103546143, + "num_tokens": 2447007.0, + "step": 269 + }, + { + "epoch": 0.20516717325227962, + "grad_norm": 2.2963485717773438, + "learning_rate": 4.995578843653255e-06, + "loss": 0.5008970499038696, + "mean_token_accuracy": 0.8255308866500854, + "num_tokens": 2453936.0, + "step": 270 + }, + { + "epoch": 0.20592705167173253, + "grad_norm": 1.8897721767425537, + "learning_rate": 4.995453465026033e-06, + "loss": 0.5436089038848877, + "mean_token_accuracy": 0.819086492061615, + "num_tokens": 2464494.0, + "step": 271 + }, + { + "epoch": 0.2066869300911854, + "grad_norm": 2.319728374481201, + "learning_rate": 4.995326334993508e-06, + "loss": 0.5136368870735168, + "mean_token_accuracy": 0.820817232131958, + "num_tokens": 2470938.0, + "step": 272 + }, + { + "epoch": 0.2074468085106383, + "grad_norm": 2.230414390563965, + "learning_rate": 4.9951974536449055e-06, + "loss": 0.5272846817970276, + "mean_token_accuracy": 0.8203279972076416, + "num_tokens": 2478629.0, + "step": 273 + }, + { + "epoch": 0.20820668693009117, + "grad_norm": 3.401937484741211, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.4389592111110687, + "mean_token_accuracy": 0.8647899031639099, + "num_tokens": 2482193.0, + "step": 274 + }, + { + "epoch": 0.20896656534954408, + "grad_norm": 2.1278507709503174, + "learning_rate": 4.994934437362513e-06, + "loss": 0.598863422870636, + "mean_token_accuracy": 0.7945119738578796, + "num_tokens": 2492465.0, + "step": 275 + }, + { + "epoch": 0.20972644376899696, + "grad_norm": 1.9259960651397705, + "learning_rate": 4.994800302613318e-06, + "loss": 0.49520939588546753, + "mean_token_accuracy": 0.8371536135673523, + "num_tokens": 2500825.0, + "step": 276 + }, + { + "epoch": 0.21048632218844984, + "grad_norm": 2.346418857574463, + "learning_rate": 4.994664416917236e-06, + "loss": 0.5412614345550537, + "mean_token_accuracy": 0.810661792755127, + "num_tokens": 2509513.0, + "step": 277 + }, + { + "epoch": 0.21124620060790272, + "grad_norm": 1.3092039823532104, + "learning_rate": 4.994526780369636e-06, + "loss": 0.46305379271507263, + "mean_token_accuracy": 0.8358527421951294, + "num_tokens": 2531405.0, + "step": 278 + }, + { + "epoch": 0.21200607902735563, + "grad_norm": 2.924611806869507, + "learning_rate": 4.9943873930671175e-06, + "loss": 0.6134544610977173, + "mean_token_accuracy": 0.7947378754615784, + "num_tokens": 2536744.0, + "step": 279 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 2.8290598392486572, + "learning_rate": 4.994246255107506e-06, + "loss": 0.465520441532135, + "mean_token_accuracy": 0.8440108299255371, + "num_tokens": 2541184.0, + "step": 280 + }, + { + "epoch": 0.2135258358662614, + "grad_norm": 3.8081259727478027, + "learning_rate": 4.994103366589859e-06, + "loss": 0.43394139409065247, + "mean_token_accuracy": 0.8579148054122925, + "num_tokens": 2545395.0, + "step": 281 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 1.7994529008865356, + "learning_rate": 4.993958727614462e-06, + "loss": 0.5076484680175781, + "mean_token_accuracy": 0.8270803093910217, + "num_tokens": 2556541.0, + "step": 282 + }, + { + "epoch": 0.21504559270516718, + "grad_norm": 2.5582659244537354, + "learning_rate": 4.993812338282826e-06, + "loss": 0.4453684389591217, + "mean_token_accuracy": 0.8488293886184692, + "num_tokens": 2562949.0, + "step": 283 + }, + { + "epoch": 0.21580547112462006, + "grad_norm": 1.6448938846588135, + "learning_rate": 4.993664198697694e-06, + "loss": 0.461971640586853, + "mean_token_accuracy": 0.824763298034668, + "num_tokens": 2576407.0, + "step": 284 + }, + { + "epoch": 0.21656534954407294, + "grad_norm": 2.1264469623565674, + "learning_rate": 4.993514308963037e-06, + "loss": 0.6241602897644043, + "mean_token_accuracy": 0.7916014790534973, + "num_tokens": 2585695.0, + "step": 285 + }, + { + "epoch": 0.21732522796352582, + "grad_norm": 3.629991292953491, + "learning_rate": 4.993362669184051e-06, + "loss": 0.610355019569397, + "mean_token_accuracy": 0.7847568988800049, + "num_tokens": 2589778.0, + "step": 286 + }, + { + "epoch": 0.21808510638297873, + "grad_norm": 1.9070756435394287, + "learning_rate": 4.993209279467164e-06, + "loss": 0.5513623952865601, + "mean_token_accuracy": 0.7911607027053833, + "num_tokens": 2600920.0, + "step": 287 + }, + { + "epoch": 0.2188449848024316, + "grad_norm": 1.761062741279602, + "learning_rate": 4.993054139920031e-06, + "loss": 0.4579957127571106, + "mean_token_accuracy": 0.8189530372619629, + "num_tokens": 2611856.0, + "step": 288 + }, + { + "epoch": 0.2196048632218845, + "grad_norm": 1.7264713048934937, + "learning_rate": 4.992897250651535e-06, + "loss": 0.5871305465698242, + "mean_token_accuracy": 0.7918527126312256, + "num_tokens": 2624730.0, + "step": 289 + }, + { + "epoch": 0.22036474164133737, + "grad_norm": 1.7455977201461792, + "learning_rate": 4.992738611771787e-06, + "loss": 0.5475119948387146, + "mean_token_accuracy": 0.8226917386054993, + "num_tokens": 2635705.0, + "step": 290 + }, + { + "epoch": 0.22112462006079028, + "grad_norm": 2.095095157623291, + "learning_rate": 4.992578223392124e-06, + "loss": 0.5952225923538208, + "mean_token_accuracy": 0.8078469038009644, + "num_tokens": 2643954.0, + "step": 291 + }, + { + "epoch": 0.22188449848024316, + "grad_norm": 2.994664192199707, + "learning_rate": 4.992416085625115e-06, + "loss": 0.5432442426681519, + "mean_token_accuracy": 0.8329008221626282, + "num_tokens": 2648800.0, + "step": 292 + }, + { + "epoch": 0.22264437689969604, + "grad_norm": 2.796790361404419, + "learning_rate": 4.992252198584554e-06, + "loss": 0.5168961882591248, + "mean_token_accuracy": 0.8393474817276001, + "num_tokens": 2653546.0, + "step": 293 + }, + { + "epoch": 0.22340425531914893, + "grad_norm": 1.8610522747039795, + "learning_rate": 4.992086562385462e-06, + "loss": 0.5728024244308472, + "mean_token_accuracy": 0.797406792640686, + "num_tokens": 2667483.0, + "step": 294 + }, + { + "epoch": 0.22416413373860183, + "grad_norm": 1.695472002029419, + "learning_rate": 4.9919191771440905e-06, + "loss": 0.5460028648376465, + "mean_token_accuracy": 0.8123016357421875, + "num_tokens": 2683574.0, + "step": 295 + }, + { + "epoch": 0.22492401215805471, + "grad_norm": 2.8627376556396484, + "learning_rate": 4.9917500429779165e-06, + "loss": 0.5566985011100769, + "mean_token_accuracy": 0.815531313419342, + "num_tokens": 2688985.0, + "step": 296 + }, + { + "epoch": 0.2256838905775076, + "grad_norm": 2.73323655128479, + "learning_rate": 4.991579160005644e-06, + "loss": 0.48197102546691895, + "mean_token_accuracy": 0.8471829295158386, + "num_tokens": 2694799.0, + "step": 297 + }, + { + "epoch": 0.22644376899696048, + "grad_norm": 1.8436161279678345, + "learning_rate": 4.991406528347206e-06, + "loss": 0.4528339207172394, + "mean_token_accuracy": 0.8603188395500183, + "num_tokens": 2707321.0, + "step": 298 + }, + { + "epoch": 0.22720364741641338, + "grad_norm": 2.6231515407562256, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.5916541814804077, + "mean_token_accuracy": 0.8050242066383362, + "num_tokens": 2714233.0, + "step": 299 + }, + { + "epoch": 0.22796352583586627, + "grad_norm": 3.08776593208313, + "learning_rate": 4.991056019457697e-06, + "loss": 0.4860580563545227, + "mean_token_accuracy": 0.8464088439941406, + "num_tokens": 2718443.0, + "step": 300 + }, + { + "epoch": 0.22872340425531915, + "grad_norm": 2.2537803649902344, + "learning_rate": 4.990878142472628e-06, + "loss": 0.5158311128616333, + "mean_token_accuracy": 0.824694812297821, + "num_tokens": 2726158.0, + "step": 301 + }, + { + "epoch": 0.22948328267477203, + "grad_norm": 2.1122705936431885, + "learning_rate": 4.990698517293394e-06, + "loss": 0.495265394449234, + "mean_token_accuracy": 0.8343238830566406, + "num_tokens": 2735022.0, + "step": 302 + }, + { + "epoch": 0.23024316109422494, + "grad_norm": 3.5503528118133545, + "learning_rate": 4.9905171440460645e-06, + "loss": 0.46063232421875, + "mean_token_accuracy": 0.8420047760009766, + "num_tokens": 2738550.0, + "step": 303 + }, + { + "epoch": 0.23100303951367782, + "grad_norm": 3.9858486652374268, + "learning_rate": 4.990334022857932e-06, + "loss": 0.5832710266113281, + "mean_token_accuracy": 0.8144199848175049, + "num_tokens": 2741720.0, + "step": 304 + }, + { + "epoch": 0.2317629179331307, + "grad_norm": 2.407231330871582, + "learning_rate": 4.990149153857519e-06, + "loss": 0.4692630171775818, + "mean_token_accuracy": 0.8429223299026489, + "num_tokens": 2748693.0, + "step": 305 + }, + { + "epoch": 0.23252279635258358, + "grad_norm": 1.6996397972106934, + "learning_rate": 4.989962537174573e-06, + "loss": 0.49143946170806885, + "mean_token_accuracy": 0.8340128064155579, + "num_tokens": 2761254.0, + "step": 306 + }, + { + "epoch": 0.23328267477203649, + "grad_norm": 3.746432065963745, + "learning_rate": 4.989774172940071e-06, + "loss": 0.6282026767730713, + "mean_token_accuracy": 0.775698184967041, + "num_tokens": 2765115.0, + "step": 307 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 2.212872266769409, + "learning_rate": 4.989584061286211e-06, + "loss": 0.5193763971328735, + "mean_token_accuracy": 0.8168246746063232, + "num_tokens": 2772345.0, + "step": 308 + }, + { + "epoch": 0.23480243161094225, + "grad_norm": 1.752297282218933, + "learning_rate": 4.989392202346423e-06, + "loss": 0.4437984824180603, + "mean_token_accuracy": 0.8451256155967712, + "num_tokens": 2783072.0, + "step": 309 + }, + { + "epoch": 0.23556231003039513, + "grad_norm": 2.386019706726074, + "learning_rate": 4.989198596255361e-06, + "loss": 0.4090752899646759, + "mean_token_accuracy": 0.8480085134506226, + "num_tokens": 2788757.0, + "step": 310 + }, + { + "epoch": 0.23632218844984804, + "grad_norm": 3.9981489181518555, + "learning_rate": 4.989003243148904e-06, + "loss": 0.5149132013320923, + "mean_token_accuracy": 0.8179056644439697, + "num_tokens": 2792096.0, + "step": 311 + }, + { + "epoch": 0.23708206686930092, + "grad_norm": 1.8723100423812866, + "learning_rate": 4.988806143164159e-06, + "loss": 0.4531487822532654, + "mean_token_accuracy": 0.8400167226791382, + "num_tokens": 2802210.0, + "step": 312 + }, + { + "epoch": 0.2378419452887538, + "grad_norm": 2.3415136337280273, + "learning_rate": 4.988607296439459e-06, + "loss": 0.5974439978599548, + "mean_token_accuracy": 0.8035976886749268, + "num_tokens": 2810088.0, + "step": 313 + }, + { + "epoch": 0.23860182370820668, + "grad_norm": 1.5317577123641968, + "learning_rate": 4.98840670311436e-06, + "loss": 0.49247145652770996, + "mean_token_accuracy": 0.8292540311813354, + "num_tokens": 2824005.0, + "step": 314 + }, + { + "epoch": 0.2393617021276596, + "grad_norm": 2.170772075653076, + "learning_rate": 4.988204363329648e-06, + "loss": 0.6359974145889282, + "mean_token_accuracy": 0.7785564661026001, + "num_tokens": 2834680.0, + "step": 315 + }, + { + "epoch": 0.24012158054711247, + "grad_norm": 3.2655932903289795, + "learning_rate": 4.988000277227334e-06, + "loss": 0.5080196857452393, + "mean_token_accuracy": 0.8295877575874329, + "num_tokens": 2838735.0, + "step": 316 + }, + { + "epoch": 0.24088145896656535, + "grad_norm": 3.406589984893799, + "learning_rate": 4.987794444950651e-06, + "loss": 0.3939085006713867, + "mean_token_accuracy": 0.8700719475746155, + "num_tokens": 2842127.0, + "step": 317 + }, + { + "epoch": 0.24164133738601823, + "grad_norm": 1.8211106061935425, + "learning_rate": 4.987586866644061e-06, + "loss": 0.5270540118217468, + "mean_token_accuracy": 0.826683521270752, + "num_tokens": 2853656.0, + "step": 318 + }, + { + "epoch": 0.24240121580547114, + "grad_norm": 1.8429969549179077, + "learning_rate": 4.9873775424532515e-06, + "loss": 0.4705049991607666, + "mean_token_accuracy": 0.8355701565742493, + "num_tokens": 2863513.0, + "step": 319 + }, + { + "epoch": 0.24316109422492402, + "grad_norm": 2.2425320148468018, + "learning_rate": 4.9871664725251314e-06, + "loss": 0.485736608505249, + "mean_token_accuracy": 0.835182785987854, + "num_tokens": 2871556.0, + "step": 320 + }, + { + "epoch": 0.2439209726443769, + "grad_norm": 1.6202056407928467, + "learning_rate": 4.986953657007841e-06, + "loss": 0.4437887370586395, + "mean_token_accuracy": 0.8282591700553894, + "num_tokens": 2884335.0, + "step": 321 + }, + { + "epoch": 0.24468085106382978, + "grad_norm": 1.1027268171310425, + "learning_rate": 4.98673909605074e-06, + "loss": 0.3770800828933716, + "mean_token_accuracy": 0.8325437307357788, + "num_tokens": 2904286.0, + "step": 322 + }, + { + "epoch": 0.2454407294832827, + "grad_norm": 2.3239076137542725, + "learning_rate": 4.986522789804417e-06, + "loss": 0.5387254953384399, + "mean_token_accuracy": 0.806242823600769, + "num_tokens": 2910975.0, + "step": 323 + }, + { + "epoch": 0.24620060790273557, + "grad_norm": 2.243482828140259, + "learning_rate": 4.986304738420684e-06, + "loss": 0.4396553039550781, + "mean_token_accuracy": 0.8561904430389404, + "num_tokens": 2917087.0, + "step": 324 + }, + { + "epoch": 0.24696048632218845, + "grad_norm": 2.537264347076416, + "learning_rate": 4.986084942052577e-06, + "loss": 0.395110160112381, + "mean_token_accuracy": 0.8636915683746338, + "num_tokens": 2921887.0, + "step": 325 + }, + { + "epoch": 0.24772036474164133, + "grad_norm": 2.319399118423462, + "learning_rate": 4.9858634008543574e-06, + "loss": 0.581517219543457, + "mean_token_accuracy": 0.8157487511634827, + "num_tokens": 2928996.0, + "step": 326 + }, + { + "epoch": 0.24848024316109424, + "grad_norm": 1.9787474870681763, + "learning_rate": 4.985640114981513e-06, + "loss": 0.5084106922149658, + "mean_token_accuracy": 0.835221529006958, + "num_tokens": 2940302.0, + "step": 327 + }, + { + "epoch": 0.24924012158054712, + "grad_norm": 2.4783265590667725, + "learning_rate": 4.985415084590752e-06, + "loss": 0.6062222719192505, + "mean_token_accuracy": 0.7885516285896301, + "num_tokens": 2946386.0, + "step": 328 + }, + { + "epoch": 0.25, + "grad_norm": 2.4081411361694336, + "learning_rate": 4.985188309840012e-06, + "loss": 0.5079880356788635, + "mean_token_accuracy": 0.8313904404640198, + "num_tokens": 2952323.0, + "step": 329 + }, + { + "epoch": 0.2507598784194529, + "grad_norm": 2.64993953704834, + "learning_rate": 4.984959790888451e-06, + "loss": 0.5461447834968567, + "mean_token_accuracy": 0.8125468492507935, + "num_tokens": 2958119.0, + "step": 330 + }, + { + "epoch": 0.25151975683890576, + "grad_norm": 2.549734115600586, + "learning_rate": 4.984729527896451e-06, + "loss": 0.5998573303222656, + "mean_token_accuracy": 0.8076666593551636, + "num_tokens": 2964947.0, + "step": 331 + }, + { + "epoch": 0.25227963525835867, + "grad_norm": 3.2185161113739014, + "learning_rate": 4.984497521025622e-06, + "loss": 0.4232945442199707, + "mean_token_accuracy": 0.8543803095817566, + "num_tokens": 2968598.0, + "step": 332 + }, + { + "epoch": 0.2530395136778115, + "grad_norm": 2.588994264602661, + "learning_rate": 4.984263770438793e-06, + "loss": 0.460967481136322, + "mean_token_accuracy": 0.8416207432746887, + "num_tokens": 2974510.0, + "step": 333 + }, + { + "epoch": 0.25379939209726443, + "grad_norm": 2.1373162269592285, + "learning_rate": 4.984028276300021e-06, + "loss": 0.49382102489471436, + "mean_token_accuracy": 0.8388048410415649, + "num_tokens": 2981632.0, + "step": 334 + }, + { + "epoch": 0.25455927051671734, + "grad_norm": 2.2524826526641846, + "learning_rate": 4.983791038774585e-06, + "loss": 0.4947671890258789, + "mean_token_accuracy": 0.8066365122795105, + "num_tokens": 2988736.0, + "step": 335 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 1.7244199514389038, + "learning_rate": 4.983552058028985e-06, + "loss": 0.48096776008605957, + "mean_token_accuracy": 0.830735445022583, + "num_tokens": 3003576.0, + "step": 336 + }, + { + "epoch": 0.2560790273556231, + "grad_norm": 3.0628933906555176, + "learning_rate": 4.9833113342309495e-06, + "loss": 0.6027032136917114, + "mean_token_accuracy": 0.8008694648742676, + "num_tokens": 3009549.0, + "step": 337 + }, + { + "epoch": 0.256838905775076, + "grad_norm": 2.438674211502075, + "learning_rate": 4.983068867549427e-06, + "loss": 0.517090916633606, + "mean_token_accuracy": 0.827893853187561, + "num_tokens": 3015236.0, + "step": 338 + }, + { + "epoch": 0.25759878419452886, + "grad_norm": 2.131535053253174, + "learning_rate": 4.982824658154589e-06, + "loss": 0.6656812429428101, + "mean_token_accuracy": 0.7772425413131714, + "num_tokens": 3028142.0, + "step": 339 + }, + { + "epoch": 0.25835866261398177, + "grad_norm": 2.3206584453582764, + "learning_rate": 4.9825787062178315e-06, + "loss": 0.5757625699043274, + "mean_token_accuracy": 0.8073873519897461, + "num_tokens": 3040996.0, + "step": 340 + }, + { + "epoch": 0.2591185410334346, + "grad_norm": 1.3905521631240845, + "learning_rate": 4.982331011911774e-06, + "loss": 0.4193805456161499, + "mean_token_accuracy": 0.8399466872215271, + "num_tokens": 3061931.0, + "step": 341 + }, + { + "epoch": 0.25987841945288753, + "grad_norm": 2.184173345565796, + "learning_rate": 4.982081575410256e-06, + "loss": 0.4751223921775818, + "mean_token_accuracy": 0.8409271240234375, + "num_tokens": 3069081.0, + "step": 342 + }, + { + "epoch": 0.26063829787234044, + "grad_norm": 3.538764238357544, + "learning_rate": 4.9818303968883445e-06, + "loss": 0.8119601011276245, + "mean_token_accuracy": 0.7442739009857178, + "num_tokens": 3073628.0, + "step": 343 + }, + { + "epoch": 0.2613981762917933, + "grad_norm": 1.8063762187957764, + "learning_rate": 4.981577476522323e-06, + "loss": 0.5615730881690979, + "mean_token_accuracy": 0.8207751512527466, + "num_tokens": 3086596.0, + "step": 344 + }, + { + "epoch": 0.2621580547112462, + "grad_norm": 2.4346961975097656, + "learning_rate": 4.981322814489703e-06, + "loss": 0.5266709327697754, + "mean_token_accuracy": 0.8211277723312378, + "num_tokens": 3092631.0, + "step": 345 + }, + { + "epoch": 0.2629179331306991, + "grad_norm": 1.91289484500885, + "learning_rate": 4.981066410969215e-06, + "loss": 0.5047177672386169, + "mean_token_accuracy": 0.8356877565383911, + "num_tokens": 3101102.0, + "step": 346 + }, + { + "epoch": 0.26367781155015196, + "grad_norm": 2.1495707035064697, + "learning_rate": 4.980808266140813e-06, + "loss": 0.47876280546188354, + "mean_token_accuracy": 0.8364313244819641, + "num_tokens": 3107998.0, + "step": 347 + }, + { + "epoch": 0.26443768996960487, + "grad_norm": 2.5961992740631104, + "learning_rate": 4.9805483801856744e-06, + "loss": 0.5512958765029907, + "mean_token_accuracy": 0.8181467652320862, + "num_tokens": 3113848.0, + "step": 348 + }, + { + "epoch": 0.2651975683890577, + "grad_norm": 3.2828900814056396, + "learning_rate": 4.980286753286196e-06, + "loss": 0.4217945635318756, + "mean_token_accuracy": 0.8617103099822998, + "num_tokens": 3117652.0, + "step": 349 + }, + { + "epoch": 0.26595744680851063, + "grad_norm": 1.425554871559143, + "learning_rate": 4.980023385625996e-06, + "loss": 0.4042487144470215, + "mean_token_accuracy": 0.8492785692214966, + "num_tokens": 3132336.0, + "step": 350 + }, + { + "epoch": 0.26671732522796354, + "grad_norm": 2.933504104614258, + "learning_rate": 4.979758277389919e-06, + "loss": 0.5406704545021057, + "mean_token_accuracy": 0.8035423755645752, + "num_tokens": 3137544.0, + "step": 351 + }, + { + "epoch": 0.2674772036474164, + "grad_norm": 1.9958966970443726, + "learning_rate": 4.9794914287640264e-06, + "loss": 0.5857555270195007, + "mean_token_accuracy": 0.7965140342712402, + "num_tokens": 3149705.0, + "step": 352 + }, + { + "epoch": 0.2682370820668693, + "grad_norm": 2.467694044113159, + "learning_rate": 4.979222839935602e-06, + "loss": 0.6404043436050415, + "mean_token_accuracy": 0.7823755741119385, + "num_tokens": 3158353.0, + "step": 353 + }, + { + "epoch": 0.2689969604863222, + "grad_norm": 2.0102720260620117, + "learning_rate": 4.9789525110931545e-06, + "loss": 0.5681496858596802, + "mean_token_accuracy": 0.8108169436454773, + "num_tokens": 3167121.0, + "step": 354 + }, + { + "epoch": 0.26975683890577506, + "grad_norm": 2.6017866134643555, + "learning_rate": 4.978680442426409e-06, + "loss": 0.6309828162193298, + "mean_token_accuracy": 0.7742617130279541, + "num_tokens": 3175012.0, + "step": 355 + }, + { + "epoch": 0.270516717325228, + "grad_norm": 1.8799268007278442, + "learning_rate": 4.978406634126315e-06, + "loss": 0.524029016494751, + "mean_token_accuracy": 0.8317689895629883, + "num_tokens": 3185331.0, + "step": 356 + }, + { + "epoch": 0.2712765957446808, + "grad_norm": 1.508332371711731, + "learning_rate": 4.978131086385041e-06, + "loss": 0.46656402945518494, + "mean_token_accuracy": 0.8339117765426636, + "num_tokens": 3198813.0, + "step": 357 + }, + { + "epoch": 0.27203647416413373, + "grad_norm": 3.595707654953003, + "learning_rate": 4.977853799395976e-06, + "loss": 0.5101234912872314, + "mean_token_accuracy": 0.8251723051071167, + "num_tokens": 3206557.0, + "step": 358 + }, + { + "epoch": 0.27279635258358664, + "grad_norm": 3.5317916870117188, + "learning_rate": 4.977574773353732e-06, + "loss": 0.5684665441513062, + "mean_token_accuracy": 0.8124493360519409, + "num_tokens": 3210912.0, + "step": 359 + }, + { + "epoch": 0.2735562310030395, + "grad_norm": 2.8606204986572266, + "learning_rate": 4.97729400845414e-06, + "loss": 0.4746384620666504, + "mean_token_accuracy": 0.8195606470108032, + "num_tokens": 3215365.0, + "step": 360 + }, + { + "epoch": 0.2743161094224924, + "grad_norm": 1.8214033842086792, + "learning_rate": 4.977011504894253e-06, + "loss": 0.4842769503593445, + "mean_token_accuracy": 0.82928866147995, + "num_tokens": 3224037.0, + "step": 361 + }, + { + "epoch": 0.2750759878419453, + "grad_norm": 1.628746509552002, + "learning_rate": 4.97672726287234e-06, + "loss": 0.4397493302822113, + "mean_token_accuracy": 0.8606528043746948, + "num_tokens": 3235589.0, + "step": 362 + }, + { + "epoch": 0.27583586626139817, + "grad_norm": 3.557973861694336, + "learning_rate": 4.976441282587894e-06, + "loss": 0.5732032060623169, + "mean_token_accuracy": 0.8041545748710632, + "num_tokens": 3239958.0, + "step": 363 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 1.3467901945114136, + "learning_rate": 4.9761535642416284e-06, + "loss": 0.4525323510169983, + "mean_token_accuracy": 0.8281061053276062, + "num_tokens": 3257703.0, + "step": 364 + }, + { + "epoch": 0.2773556231003039, + "grad_norm": 2.2649986743927, + "learning_rate": 4.9758641080354745e-06, + "loss": 0.5074734687805176, + "mean_token_accuracy": 0.8447474241256714, + "num_tokens": 3264334.0, + "step": 365 + }, + { + "epoch": 0.27811550151975684, + "grad_norm": 2.8667566776275635, + "learning_rate": 4.975572914172581e-06, + "loss": 0.5759559869766235, + "mean_token_accuracy": 0.7976793050765991, + "num_tokens": 3269314.0, + "step": 366 + }, + { + "epoch": 0.27887537993920974, + "grad_norm": 2.2514986991882324, + "learning_rate": 4.975279982857324e-06, + "loss": 0.5786465406417847, + "mean_token_accuracy": 0.8058781623840332, + "num_tokens": 3277324.0, + "step": 367 + }, + { + "epoch": 0.2796352583586626, + "grad_norm": 1.3826723098754883, + "learning_rate": 4.97498531429529e-06, + "loss": 0.40801727771759033, + "mean_token_accuracy": 0.8601310849189758, + "num_tokens": 3290530.0, + "step": 368 + }, + { + "epoch": 0.2803951367781155, + "grad_norm": 2.084092617034912, + "learning_rate": 4.97468890869329e-06, + "loss": 0.47076648473739624, + "mean_token_accuracy": 0.8310186862945557, + "num_tokens": 3298325.0, + "step": 369 + }, + { + "epoch": 0.2811550151975684, + "grad_norm": 1.3467998504638672, + "learning_rate": 4.974390766259353e-06, + "loss": 0.44668465852737427, + "mean_token_accuracy": 0.8275353908538818, + "num_tokens": 3314302.0, + "step": 370 + }, + { + "epoch": 0.28191489361702127, + "grad_norm": 2.5921075344085693, + "learning_rate": 4.974090887202726e-06, + "loss": 0.5343953967094421, + "mean_token_accuracy": 0.8110706806182861, + "num_tokens": 3320963.0, + "step": 371 + }, + { + "epoch": 0.2826747720364742, + "grad_norm": 2.042781352996826, + "learning_rate": 4.973789271733877e-06, + "loss": 0.6293343305587769, + "mean_token_accuracy": 0.7800243496894836, + "num_tokens": 3332742.0, + "step": 372 + }, + { + "epoch": 0.28343465045592703, + "grad_norm": 4.822193145751953, + "learning_rate": 4.973485920064491e-06, + "loss": 0.6256728768348694, + "mean_token_accuracy": 0.7962433099746704, + "num_tokens": 3335872.0, + "step": 373 + }, + { + "epoch": 0.28419452887537994, + "grad_norm": 1.260988473892212, + "learning_rate": 4.973180832407471e-06, + "loss": 0.38731223344802856, + "mean_token_accuracy": 0.8385066986083984, + "num_tokens": 3351884.0, + "step": 374 + }, + { + "epoch": 0.28495440729483285, + "grad_norm": 2.669966697692871, + "learning_rate": 4.97287400897694e-06, + "loss": 0.5594710111618042, + "mean_token_accuracy": 0.8097212314605713, + "num_tokens": 3358197.0, + "step": 375 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 3.0344486236572266, + "learning_rate": 4.972565449988238e-06, + "loss": 0.34449583292007446, + "mean_token_accuracy": 0.8813316822052002, + "num_tokens": 3362133.0, + "step": 376 + }, + { + "epoch": 0.2864741641337386, + "grad_norm": 2.562251091003418, + "learning_rate": 4.972255155657925e-06, + "loss": 0.5331522822380066, + "mean_token_accuracy": 0.8212941288948059, + "num_tokens": 3370346.0, + "step": 377 + }, + { + "epoch": 0.2872340425531915, + "grad_norm": 2.7083740234375, + "learning_rate": 4.9719431262037755e-06, + "loss": 0.5403046011924744, + "mean_token_accuracy": 0.8108335733413696, + "num_tokens": 3375588.0, + "step": 378 + }, + { + "epoch": 0.28799392097264437, + "grad_norm": 1.396430492401123, + "learning_rate": 4.971629361844785e-06, + "loss": 0.4041529893875122, + "mean_token_accuracy": 0.8588063716888428, + "num_tokens": 3390749.0, + "step": 379 + }, + { + "epoch": 0.2887537993920973, + "grad_norm": 1.9872784614562988, + "learning_rate": 4.971313862801166e-06, + "loss": 0.4336993098258972, + "mean_token_accuracy": 0.8511303663253784, + "num_tokens": 3399064.0, + "step": 380 + }, + { + "epoch": 0.28951367781155013, + "grad_norm": 1.9652575254440308, + "learning_rate": 4.9709966292943455e-06, + "loss": 0.4578358232975006, + "mean_token_accuracy": 0.8229440450668335, + "num_tokens": 3407229.0, + "step": 381 + }, + { + "epoch": 0.29027355623100304, + "grad_norm": 1.6626898050308228, + "learning_rate": 4.970677661546972e-06, + "loss": 0.5427594184875488, + "mean_token_accuracy": 0.815427303314209, + "num_tokens": 3422321.0, + "step": 382 + }, + { + "epoch": 0.29103343465045595, + "grad_norm": 3.5265562534332275, + "learning_rate": 4.970356959782909e-06, + "loss": 0.6661460995674133, + "mean_token_accuracy": 0.7856965065002441, + "num_tokens": 3427442.0, + "step": 383 + }, + { + "epoch": 0.2917933130699088, + "grad_norm": 1.667205572128296, + "learning_rate": 4.970034524227239e-06, + "loss": 0.36256325244903564, + "mean_token_accuracy": 0.8711205720901489, + "num_tokens": 3436662.0, + "step": 384 + }, + { + "epoch": 0.2925531914893617, + "grad_norm": 1.3389486074447632, + "learning_rate": 4.969710355106256e-06, + "loss": 0.4282698631286621, + "mean_token_accuracy": 0.838951587677002, + "num_tokens": 3450060.0, + "step": 385 + }, + { + "epoch": 0.2933130699088146, + "grad_norm": 2.5163397789001465, + "learning_rate": 4.969384452647477e-06, + "loss": 0.5176984071731567, + "mean_token_accuracy": 0.8235267996788025, + "num_tokens": 3456990.0, + "step": 386 + }, + { + "epoch": 0.29407294832826747, + "grad_norm": 1.7588495016098022, + "learning_rate": 4.969056817079633e-06, + "loss": 0.49710947275161743, + "mean_token_accuracy": 0.818520724773407, + "num_tokens": 3468098.0, + "step": 387 + }, + { + "epoch": 0.2948328267477204, + "grad_norm": 2.6381046772003174, + "learning_rate": 4.968727448632669e-06, + "loss": 0.4425308108329773, + "mean_token_accuracy": 0.8451643586158752, + "num_tokens": 3472899.0, + "step": 388 + }, + { + "epoch": 0.29559270516717323, + "grad_norm": 1.6345038414001465, + "learning_rate": 4.968396347537751e-06, + "loss": 0.4177059829235077, + "mean_token_accuracy": 0.8498886227607727, + "num_tokens": 3484826.0, + "step": 389 + }, + { + "epoch": 0.29635258358662614, + "grad_norm": 3.0466468334198, + "learning_rate": 4.968063514027258e-06, + "loss": 0.4274463951587677, + "mean_token_accuracy": 0.8387278318405151, + "num_tokens": 3488610.0, + "step": 390 + }, + { + "epoch": 0.29711246200607905, + "grad_norm": 2.6509406566619873, + "learning_rate": 4.967728948334784e-06, + "loss": 0.5401753783226013, + "mean_token_accuracy": 0.8252490162849426, + "num_tokens": 3493657.0, + "step": 391 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 1.6372219324111938, + "learning_rate": 4.967392650695141e-06, + "loss": 0.3862472176551819, + "mean_token_accuracy": 0.8555525541305542, + "num_tokens": 3505588.0, + "step": 392 + }, + { + "epoch": 0.2986322188449848, + "grad_norm": 2.1615452766418457, + "learning_rate": 4.967054621344356e-06, + "loss": 0.57850581407547, + "mean_token_accuracy": 0.8222678899765015, + "num_tokens": 3514396.0, + "step": 393 + }, + { + "epoch": 0.2993920972644377, + "grad_norm": 1.8610916137695312, + "learning_rate": 4.96671486051967e-06, + "loss": 0.5440595149993896, + "mean_token_accuracy": 0.8196715116500854, + "num_tokens": 3523604.0, + "step": 394 + }, + { + "epoch": 0.30015197568389057, + "grad_norm": 2.9585862159729004, + "learning_rate": 4.966373368459542e-06, + "loss": 0.6921588182449341, + "mean_token_accuracy": 0.7816659808158875, + "num_tokens": 3529849.0, + "step": 395 + }, + { + "epoch": 0.3009118541033435, + "grad_norm": 1.9374035596847534, + "learning_rate": 4.966030145403642e-06, + "loss": 0.5494055151939392, + "mean_token_accuracy": 0.8126792907714844, + "num_tokens": 3539529.0, + "step": 396 + }, + { + "epoch": 0.30167173252279633, + "grad_norm": 1.730530023574829, + "learning_rate": 4.965685191592859e-06, + "loss": 0.4271572232246399, + "mean_token_accuracy": 0.8383668661117554, + "num_tokens": 3550972.0, + "step": 397 + }, + { + "epoch": 0.30243161094224924, + "grad_norm": 3.9635560512542725, + "learning_rate": 4.9653385072692935e-06, + "loss": 0.5576210021972656, + "mean_token_accuracy": 0.799404501914978, + "num_tokens": 3554147.0, + "step": 398 + }, + { + "epoch": 0.30319148936170215, + "grad_norm": 2.5731968879699707, + "learning_rate": 4.964990092676263e-06, + "loss": 0.5478942394256592, + "mean_token_accuracy": 0.8220961093902588, + "num_tokens": 3559972.0, + "step": 399 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 2.2096588611602783, + "learning_rate": 4.964639948058297e-06, + "loss": 0.35461270809173584, + "mean_token_accuracy": 0.8640927076339722, + "num_tokens": 3565770.0, + "step": 400 + }, + { + "epoch": 0.3047112462006079, + "grad_norm": 1.7874189615249634, + "learning_rate": 4.964288073661142e-06, + "loss": 0.38849619030952454, + "mean_token_accuracy": 0.8443037271499634, + "num_tokens": 3574514.0, + "step": 401 + }, + { + "epoch": 0.30547112462006076, + "grad_norm": 1.5583146810531616, + "learning_rate": 4.963934469731756e-06, + "loss": 0.48909449577331543, + "mean_token_accuracy": 0.8429768681526184, + "num_tokens": 3585877.0, + "step": 402 + }, + { + "epoch": 0.30623100303951367, + "grad_norm": 3.026599645614624, + "learning_rate": 4.963579136518312e-06, + "loss": 0.5138992071151733, + "mean_token_accuracy": 0.8283728361129761, + "num_tokens": 3590412.0, + "step": 403 + }, + { + "epoch": 0.3069908814589666, + "grad_norm": 2.777505874633789, + "learning_rate": 4.963222074270197e-06, + "loss": 0.6241534948348999, + "mean_token_accuracy": 0.8130464553833008, + "num_tokens": 3596246.0, + "step": 404 + }, + { + "epoch": 0.30775075987841943, + "grad_norm": 2.4772839546203613, + "learning_rate": 4.962863283238011e-06, + "loss": 0.5930814146995544, + "mean_token_accuracy": 0.8036394715309143, + "num_tokens": 3602878.0, + "step": 405 + }, + { + "epoch": 0.30851063829787234, + "grad_norm": 1.5049982070922852, + "learning_rate": 4.962502763673566e-06, + "loss": 0.4903082549571991, + "mean_token_accuracy": 0.8184912204742432, + "num_tokens": 3617018.0, + "step": 406 + }, + { + "epoch": 0.30927051671732525, + "grad_norm": 2.453155040740967, + "learning_rate": 4.96214051582989e-06, + "loss": 0.5138067603111267, + "mean_token_accuracy": 0.8336835503578186, + "num_tokens": 3624188.0, + "step": 407 + }, + { + "epoch": 0.3100303951367781, + "grad_norm": 2.4038336277008057, + "learning_rate": 4.961776539961222e-06, + "loss": 0.5752760171890259, + "mean_token_accuracy": 0.8054730892181396, + "num_tokens": 3634152.0, + "step": 408 + }, + { + "epoch": 0.310790273556231, + "grad_norm": 2.629068374633789, + "learning_rate": 4.961410836323014e-06, + "loss": 0.5580606460571289, + "mean_token_accuracy": 0.8121089935302734, + "num_tokens": 3639528.0, + "step": 409 + }, + { + "epoch": 0.31155015197568386, + "grad_norm": 1.4245928525924683, + "learning_rate": 4.961043405171931e-06, + "loss": 0.5399882793426514, + "mean_token_accuracy": 0.812280535697937, + "num_tokens": 3655744.0, + "step": 410 + }, + { + "epoch": 0.3123100303951368, + "grad_norm": 1.5236459970474243, + "learning_rate": 4.9606742467658505e-06, + "loss": 0.5234690308570862, + "mean_token_accuracy": 0.8188928365707397, + "num_tokens": 3675010.0, + "step": 411 + }, + { + "epoch": 0.3130699088145897, + "grad_norm": 2.27961802482605, + "learning_rate": 4.960303361363863e-06, + "loss": 0.5502505898475647, + "mean_token_accuracy": 0.8161963224411011, + "num_tokens": 3682328.0, + "step": 412 + }, + { + "epoch": 0.31382978723404253, + "grad_norm": 1.554518222808838, + "learning_rate": 4.959930749226269e-06, + "loss": 0.420867919921875, + "mean_token_accuracy": 0.8499157428741455, + "num_tokens": 3694980.0, + "step": 413 + }, + { + "epoch": 0.31458966565349544, + "grad_norm": 2.609218120574951, + "learning_rate": 4.9595564106145825e-06, + "loss": 0.4706704318523407, + "mean_token_accuracy": 0.8412490487098694, + "num_tokens": 3700033.0, + "step": 414 + }, + { + "epoch": 0.31534954407294835, + "grad_norm": 1.5303231477737427, + "learning_rate": 4.959180345791528e-06, + "loss": 0.4668654799461365, + "mean_token_accuracy": 0.8125015497207642, + "num_tokens": 3715012.0, + "step": 415 + }, + { + "epoch": 0.3161094224924012, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.958802555021042e-06, + "loss": 0.4339369237422943, + "mean_token_accuracy": 0.8442851901054382, + "num_tokens": 3733928.0, + "step": 416 + }, + { + "epoch": 0.3168693009118541, + "grad_norm": 2.1240181922912598, + "learning_rate": 4.958423038568274e-06, + "loss": 0.4029104709625244, + "mean_token_accuracy": 0.8627674579620361, + "num_tokens": 3740202.0, + "step": 417 + }, + { + "epoch": 0.31762917933130697, + "grad_norm": 2.00538969039917, + "learning_rate": 4.958041796699583e-06, + "loss": 0.5229607820510864, + "mean_token_accuracy": 0.8282366394996643, + "num_tokens": 3749308.0, + "step": 418 + }, + { + "epoch": 0.3183890577507599, + "grad_norm": 2.6555092334747314, + "learning_rate": 4.957658829682539e-06, + "loss": 0.5344101190567017, + "mean_token_accuracy": 0.8183202743530273, + "num_tokens": 3754595.0, + "step": 419 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 1.7468839883804321, + "learning_rate": 4.9572741377859225e-06, + "loss": 0.5667245984077454, + "mean_token_accuracy": 0.8080123662948608, + "num_tokens": 3765761.0, + "step": 420 + }, + { + "epoch": 0.31990881458966564, + "grad_norm": 2.9612457752227783, + "learning_rate": 4.956887721279726e-06, + "loss": 0.5389559864997864, + "mean_token_accuracy": 0.8019476532936096, + "num_tokens": 3770844.0, + "step": 421 + }, + { + "epoch": 0.32066869300911854, + "grad_norm": 1.842403769493103, + "learning_rate": 4.95649958043515e-06, + "loss": 0.38279837369918823, + "mean_token_accuracy": 0.858866810798645, + "num_tokens": 3778094.0, + "step": 422 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 2.3108131885528564, + "learning_rate": 4.956109715524609e-06, + "loss": 0.5453893542289734, + "mean_token_accuracy": 0.8085013031959534, + "num_tokens": 3785015.0, + "step": 423 + }, + { + "epoch": 0.3221884498480243, + "grad_norm": 3.0326945781707764, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.5550523400306702, + "mean_token_accuracy": 0.8125876188278198, + "num_tokens": 3789830.0, + "step": 424 + }, + { + "epoch": 0.3229483282674772, + "grad_norm": 1.8851977586746216, + "learning_rate": 4.955324814601324e-06, + "loss": 0.4902324974536896, + "mean_token_accuracy": 0.8205406665802002, + "num_tokens": 3799862.0, + "step": 425 + }, + { + "epoch": 0.32370820668693007, + "grad_norm": 2.6018171310424805, + "learning_rate": 4.954929779139455e-06, + "loss": 0.5920133590698242, + "mean_token_accuracy": 0.8340690732002258, + "num_tokens": 3806617.0, + "step": 426 + }, + { + "epoch": 0.324468085106383, + "grad_norm": 2.4283878803253174, + "learning_rate": 4.954533020713367e-06, + "loss": 0.5305854082107544, + "mean_token_accuracy": 0.8137468099594116, + "num_tokens": 3813843.0, + "step": 427 + }, + { + "epoch": 0.3252279635258359, + "grad_norm": 2.667978525161743, + "learning_rate": 4.954134539601519e-06, + "loss": 0.5333638787269592, + "mean_token_accuracy": 0.8402629494667053, + "num_tokens": 3819450.0, + "step": 428 + }, + { + "epoch": 0.32598784194528874, + "grad_norm": 1.7302523851394653, + "learning_rate": 4.953734336083582e-06, + "loss": 0.422895610332489, + "mean_token_accuracy": 0.8709704875946045, + "num_tokens": 3831027.0, + "step": 429 + }, + { + "epoch": 0.32674772036474165, + "grad_norm": 2.427192211151123, + "learning_rate": 4.953332410440434e-06, + "loss": 0.6334598064422607, + "mean_token_accuracy": 0.7817479968070984, + "num_tokens": 3841776.0, + "step": 430 + }, + { + "epoch": 0.32750759878419455, + "grad_norm": 1.460949182510376, + "learning_rate": 4.952928762954161e-06, + "loss": 0.3654777705669403, + "mean_token_accuracy": 0.8780122995376587, + "num_tokens": 3852213.0, + "step": 431 + }, + { + "epoch": 0.3282674772036474, + "grad_norm": 1.9855005741119385, + "learning_rate": 4.952523393908059e-06, + "loss": 0.5117089748382568, + "mean_token_accuracy": 0.811911404132843, + "num_tokens": 3861176.0, + "step": 432 + }, + { + "epoch": 0.3290273556231003, + "grad_norm": 2.2653207778930664, + "learning_rate": 4.952116303586631e-06, + "loss": 0.42514950037002563, + "mean_token_accuracy": 0.8448518514633179, + "num_tokens": 3867164.0, + "step": 433 + }, + { + "epoch": 0.32978723404255317, + "grad_norm": 1.9780964851379395, + "learning_rate": 4.951707492275589e-06, + "loss": 0.5095293521881104, + "mean_token_accuracy": 0.8262748718261719, + "num_tokens": 3876406.0, + "step": 434 + }, + { + "epoch": 0.3305471124620061, + "grad_norm": 2.9480233192443848, + "learning_rate": 4.951296960261853e-06, + "loss": 0.3494448959827423, + "mean_token_accuracy": 0.8781307935714722, + "num_tokens": 3880298.0, + "step": 435 + }, + { + "epoch": 0.331306990881459, + "grad_norm": 2.335571527481079, + "learning_rate": 4.95088470783355e-06, + "loss": 0.5456914901733398, + "mean_token_accuracy": 0.816297173500061, + "num_tokens": 3886487.0, + "step": 436 + }, + { + "epoch": 0.33206686930091184, + "grad_norm": 2.3046419620513916, + "learning_rate": 4.950470735280013e-06, + "loss": 0.4835948944091797, + "mean_token_accuracy": 0.8539175391197205, + "num_tokens": 3892706.0, + "step": 437 + }, + { + "epoch": 0.33282674772036475, + "grad_norm": 2.44047474861145, + "learning_rate": 4.950055042891786e-06, + "loss": 0.5154092907905579, + "mean_token_accuracy": 0.8579919338226318, + "num_tokens": 3899532.0, + "step": 438 + }, + { + "epoch": 0.33358662613981765, + "grad_norm": 4.826764106750488, + "learning_rate": 4.949637630960618e-06, + "loss": 0.5270259976387024, + "mean_token_accuracy": 0.8172192573547363, + "num_tokens": 3902260.0, + "step": 439 + }, + { + "epoch": 0.3343465045592705, + "grad_norm": 2.001574754714966, + "learning_rate": 4.949218499779462e-06, + "loss": 0.5413002967834473, + "mean_token_accuracy": 0.8162837028503418, + "num_tokens": 3911706.0, + "step": 440 + }, + { + "epoch": 0.3351063829787234, + "grad_norm": 1.7998944520950317, + "learning_rate": 4.948797649642484e-06, + "loss": 0.5131614208221436, + "mean_token_accuracy": 0.8367440700531006, + "num_tokens": 3923490.0, + "step": 441 + }, + { + "epoch": 0.33586626139817627, + "grad_norm": 3.4566173553466797, + "learning_rate": 4.94837508084505e-06, + "loss": 0.7258909940719604, + "mean_token_accuracy": 0.771377444267273, + "num_tokens": 3928099.0, + "step": 442 + }, + { + "epoch": 0.3366261398176292, + "grad_norm": 2.0040442943573, + "learning_rate": 4.9479507936837364e-06, + "loss": 0.482135534286499, + "mean_token_accuracy": 0.8339327573776245, + "num_tokens": 3937328.0, + "step": 443 + }, + { + "epoch": 0.3373860182370821, + "grad_norm": 2.949502944946289, + "learning_rate": 4.947524788456325e-06, + "loss": 0.6474795341491699, + "mean_token_accuracy": 0.7951677441596985, + "num_tokens": 3942529.0, + "step": 444 + }, + { + "epoch": 0.33814589665653494, + "grad_norm": 1.5528364181518555, + "learning_rate": 4.947097065461801e-06, + "loss": 0.48791584372520447, + "mean_token_accuracy": 0.8425545692443848, + "num_tokens": 3955200.0, + "step": 445 + }, + { + "epoch": 0.33890577507598785, + "grad_norm": 1.8813284635543823, + "learning_rate": 4.946667625000358e-06, + "loss": 0.45922309160232544, + "mean_token_accuracy": 0.8206527233123779, + "num_tokens": 3962975.0, + "step": 446 + }, + { + "epoch": 0.33966565349544076, + "grad_norm": 1.7157847881317139, + "learning_rate": 4.946236467373392e-06, + "loss": 0.5454182028770447, + "mean_token_accuracy": 0.8049604892730713, + "num_tokens": 3973956.0, + "step": 447 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 2.008857250213623, + "learning_rate": 4.945803592883509e-06, + "loss": 0.5151860117912292, + "mean_token_accuracy": 0.8262045383453369, + "num_tokens": 3982853.0, + "step": 448 + }, + { + "epoch": 0.3411854103343465, + "grad_norm": 1.6632496118545532, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.42710691690444946, + "mean_token_accuracy": 0.8521314859390259, + "num_tokens": 3993838.0, + "step": 449 + }, + { + "epoch": 0.34194528875379937, + "grad_norm": 1.365234375, + "learning_rate": 4.944932694531423e-06, + "loss": 0.5172526836395264, + "mean_token_accuracy": 0.8277045488357544, + "num_tokens": 4014179.0, + "step": 450 + }, + { + "epoch": 0.3427051671732523, + "grad_norm": 1.7610243558883667, + "learning_rate": 4.94449467128045e-06, + "loss": 0.42104798555374146, + "mean_token_accuracy": 0.8552065491676331, + "num_tokens": 4023663.0, + "step": 451 + }, + { + "epoch": 0.3434650455927052, + "grad_norm": 2.3732354640960693, + "learning_rate": 4.944054932389018e-06, + "loss": 0.5471175909042358, + "mean_token_accuracy": 0.8487317562103271, + "num_tokens": 4030100.0, + "step": 452 + }, + { + "epoch": 0.34422492401215804, + "grad_norm": 1.5973623991012573, + "learning_rate": 4.943613478165753e-06, + "loss": 0.419813871383667, + "mean_token_accuracy": 0.8484025001525879, + "num_tokens": 4041124.0, + "step": 453 + }, + { + "epoch": 0.34498480243161095, + "grad_norm": 2.966381549835205, + "learning_rate": 4.943170308920484e-06, + "loss": 0.5370652675628662, + "mean_token_accuracy": 0.8439491987228394, + "num_tokens": 4045675.0, + "step": 454 + }, + { + "epoch": 0.34574468085106386, + "grad_norm": 2.5097248554229736, + "learning_rate": 4.9427254249642445e-06, + "loss": 0.5776349306106567, + "mean_token_accuracy": 0.8060523867607117, + "num_tokens": 4053250.0, + "step": 455 + }, + { + "epoch": 0.3465045592705167, + "grad_norm": 1.6779125928878784, + "learning_rate": 4.942278826609272e-06, + "loss": 0.5245476961135864, + "mean_token_accuracy": 0.8168526887893677, + "num_tokens": 4064106.0, + "step": 456 + }, + { + "epoch": 0.3472644376899696, + "grad_norm": 1.5945546627044678, + "learning_rate": 4.9418305141690045e-06, + "loss": 0.4972047209739685, + "mean_token_accuracy": 0.8257735967636108, + "num_tokens": 4077687.0, + "step": 457 + }, + { + "epoch": 0.34802431610942247, + "grad_norm": 2.864778757095337, + "learning_rate": 4.9413804879580865e-06, + "loss": 0.5372499823570251, + "mean_token_accuracy": 0.8423776626586914, + "num_tokens": 4082632.0, + "step": 458 + }, + { + "epoch": 0.3487841945288754, + "grad_norm": 1.4797078371047974, + "learning_rate": 4.940928748292363e-06, + "loss": 0.5903409719467163, + "mean_token_accuracy": 0.8061295747756958, + "num_tokens": 4104218.0, + "step": 459 + }, + { + "epoch": 0.3495440729483283, + "grad_norm": 2.4376983642578125, + "learning_rate": 4.940475295488882e-06, + "loss": 0.4534894824028015, + "mean_token_accuracy": 0.8395825028419495, + "num_tokens": 4110530.0, + "step": 460 + }, + { + "epoch": 0.35030395136778114, + "grad_norm": 1.2955626249313354, + "learning_rate": 4.940020129865895e-06, + "loss": 0.47155818343162537, + "mean_token_accuracy": 0.8253582715988159, + "num_tokens": 4128398.0, + "step": 461 + }, + { + "epoch": 0.35106382978723405, + "grad_norm": 2.066575527191162, + "learning_rate": 4.9395632517428546e-06, + "loss": 0.5555641651153564, + "mean_token_accuracy": 0.814624547958374, + "num_tokens": 4137623.0, + "step": 462 + }, + { + "epoch": 0.3518237082066869, + "grad_norm": 1.6407525539398193, + "learning_rate": 4.939104661440415e-06, + "loss": 0.4361790418624878, + "mean_token_accuracy": 0.8544459342956543, + "num_tokens": 4152803.0, + "step": 463 + }, + { + "epoch": 0.3525835866261398, + "grad_norm": 2.1685116291046143, + "learning_rate": 4.938644359280433e-06, + "loss": 0.5347012877464294, + "mean_token_accuracy": 0.853853702545166, + "num_tokens": 4160778.0, + "step": 464 + }, + { + "epoch": 0.3533434650455927, + "grad_norm": 1.8824869394302368, + "learning_rate": 4.938182345585967e-06, + "loss": 0.5512481927871704, + "mean_token_accuracy": 0.7985891699790955, + "num_tokens": 4170380.0, + "step": 465 + }, + { + "epoch": 0.3541033434650456, + "grad_norm": 2.2229504585266113, + "learning_rate": 4.937718620681273e-06, + "loss": 0.516828179359436, + "mean_token_accuracy": 0.8265621066093445, + "num_tokens": 4178179.0, + "step": 466 + }, + { + "epoch": 0.3548632218844985, + "grad_norm": 1.955990195274353, + "learning_rate": 4.9372531848918145e-06, + "loss": 0.5586158037185669, + "mean_token_accuracy": 0.8367916345596313, + "num_tokens": 4188626.0, + "step": 467 + }, + { + "epoch": 0.3556231003039514, + "grad_norm": 1.9687023162841797, + "learning_rate": 4.936786038544251e-06, + "loss": 0.5517531633377075, + "mean_token_accuracy": 0.8134098052978516, + "num_tokens": 4198144.0, + "step": 468 + }, + { + "epoch": 0.35638297872340424, + "grad_norm": 1.405516505241394, + "learning_rate": 4.9363171819664434e-06, + "loss": 0.5305492877960205, + "mean_token_accuracy": 0.8014427423477173, + "num_tokens": 4222818.0, + "step": 469 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 2.6355695724487305, + "learning_rate": 4.9358466154874535e-06, + "loss": 0.5303391218185425, + "mean_token_accuracy": 0.8028861284255981, + "num_tokens": 4228318.0, + "step": 470 + }, + { + "epoch": 0.35790273556231, + "grad_norm": 1.5133824348449707, + "learning_rate": 4.935374339437543e-06, + "loss": 0.5329189300537109, + "mean_token_accuracy": 0.8479441404342651, + "num_tokens": 4244527.0, + "step": 471 + }, + { + "epoch": 0.3586626139817629, + "grad_norm": 3.4356725215911865, + "learning_rate": 4.934900354148173e-06, + "loss": 0.5431582927703857, + "mean_token_accuracy": 0.8328983783721924, + "num_tokens": 4248034.0, + "step": 472 + }, + { + "epoch": 0.3594224924012158, + "grad_norm": 2.5789499282836914, + "learning_rate": 4.934424659952006e-06, + "loss": 0.4141455292701721, + "mean_token_accuracy": 0.8658635020256042, + "num_tokens": 4252953.0, + "step": 473 + }, + { + "epoch": 0.3601823708206687, + "grad_norm": 1.145262598991394, + "learning_rate": 4.933947257182901e-06, + "loss": 0.40294092893600464, + "mean_token_accuracy": 0.8565847277641296, + "num_tokens": 4277813.0, + "step": 474 + }, + { + "epoch": 0.3609422492401216, + "grad_norm": 1.7242133617401123, + "learning_rate": 4.933468146175918e-06, + "loss": 0.6036738753318787, + "mean_token_accuracy": 0.8072597980499268, + "num_tokens": 4291088.0, + "step": 475 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 2.3490941524505615, + "learning_rate": 4.932987327267317e-06, + "loss": 0.49456146359443665, + "mean_token_accuracy": 0.8372673988342285, + "num_tokens": 4297376.0, + "step": 476 + }, + { + "epoch": 0.36246200607902734, + "grad_norm": 1.3605526685714722, + "learning_rate": 4.932504800794553e-06, + "loss": 0.43595948815345764, + "mean_token_accuracy": 0.8415953516960144, + "num_tokens": 4312054.0, + "step": 477 + }, + { + "epoch": 0.36322188449848025, + "grad_norm": 1.4525885581970215, + "learning_rate": 4.9320205670962815e-06, + "loss": 0.5390371680259705, + "mean_token_accuracy": 0.8101649284362793, + "num_tokens": 4328701.0, + "step": 478 + }, + { + "epoch": 0.3639817629179331, + "grad_norm": 1.9862419366836548, + "learning_rate": 4.931534626512359e-06, + "loss": 0.45436930656433105, + "mean_token_accuracy": 0.8352861404418945, + "num_tokens": 4338372.0, + "step": 479 + }, + { + "epoch": 0.364741641337386, + "grad_norm": 1.7804961204528809, + "learning_rate": 4.931046979383836e-06, + "loss": 0.4677754044532776, + "mean_token_accuracy": 0.840467095375061, + "num_tokens": 4347897.0, + "step": 480 + }, + { + "epoch": 0.3655015197568389, + "grad_norm": 2.066632032394409, + "learning_rate": 4.930557626052961e-06, + "loss": 0.42418140172958374, + "mean_token_accuracy": 0.8528275489807129, + "num_tokens": 4354061.0, + "step": 481 + }, + { + "epoch": 0.3662613981762918, + "grad_norm": 1.6155282258987427, + "learning_rate": 4.930066566863182e-06, + "loss": 0.5424284934997559, + "mean_token_accuracy": 0.825040876865387, + "num_tokens": 4370400.0, + "step": 482 + }, + { + "epoch": 0.3670212765957447, + "grad_norm": 2.1452953815460205, + "learning_rate": 4.929573802159143e-06, + "loss": 0.5105804204940796, + "mean_token_accuracy": 0.8284053802490234, + "num_tokens": 4377579.0, + "step": 483 + }, + { + "epoch": 0.3677811550151976, + "grad_norm": 1.8940945863723755, + "learning_rate": 4.929079332286685e-06, + "loss": 0.43478304147720337, + "mean_token_accuracy": 0.8505665063858032, + "num_tokens": 4385686.0, + "step": 484 + }, + { + "epoch": 0.36854103343465044, + "grad_norm": 1.6785860061645508, + "learning_rate": 4.928583157592846e-06, + "loss": 0.40227848291397095, + "mean_token_accuracy": 0.8623573780059814, + "num_tokens": 4396128.0, + "step": 485 + }, + { + "epoch": 0.36930091185410335, + "grad_norm": 1.6416733264923096, + "learning_rate": 4.928085278425862e-06, + "loss": 0.526267409324646, + "mean_token_accuracy": 0.8284667730331421, + "num_tokens": 4407963.0, + "step": 486 + }, + { + "epoch": 0.3700607902735562, + "grad_norm": 1.8882389068603516, + "learning_rate": 4.927585695135162e-06, + "loss": 0.5555213093757629, + "mean_token_accuracy": 0.8115293979644775, + "num_tokens": 4418057.0, + "step": 487 + }, + { + "epoch": 0.3708206686930091, + "grad_norm": 2.300248384475708, + "learning_rate": 4.9270844080713735e-06, + "loss": 0.5812339186668396, + "mean_token_accuracy": 0.800270676612854, + "num_tokens": 4425358.0, + "step": 488 + }, + { + "epoch": 0.371580547112462, + "grad_norm": 1.6802922487258911, + "learning_rate": 4.926581417586319e-06, + "loss": 0.5134941935539246, + "mean_token_accuracy": 0.8247408866882324, + "num_tokens": 4437702.0, + "step": 489 + }, + { + "epoch": 0.3723404255319149, + "grad_norm": 1.7620291709899902, + "learning_rate": 4.926076724033016e-06, + "loss": 0.5233973264694214, + "mean_token_accuracy": 0.8102161884307861, + "num_tokens": 4448584.0, + "step": 490 + }, + { + "epoch": 0.3731003039513678, + "grad_norm": 1.6911998987197876, + "learning_rate": 4.925570327765678e-06, + "loss": 0.5337274074554443, + "mean_token_accuracy": 0.845306396484375, + "num_tokens": 4462651.0, + "step": 491 + }, + { + "epoch": 0.3738601823708207, + "grad_norm": 1.7991242408752441, + "learning_rate": 4.9250622291397144e-06, + "loss": 0.31018948554992676, + "mean_token_accuracy": 0.8857606053352356, + "num_tokens": 4469971.0, + "step": 492 + }, + { + "epoch": 0.37462006079027355, + "grad_norm": 4.9776835441589355, + "learning_rate": 4.924552428511727e-06, + "loss": 0.44114983081817627, + "mean_token_accuracy": 0.8429906368255615, + "num_tokens": 4478275.0, + "step": 493 + }, + { + "epoch": 0.37537993920972645, + "grad_norm": 1.8007272481918335, + "learning_rate": 4.924040926239515e-06, + "loss": 0.574328601360321, + "mean_token_accuracy": 0.7669196128845215, + "num_tokens": 4491551.0, + "step": 494 + }, + { + "epoch": 0.3761398176291793, + "grad_norm": 2.021300792694092, + "learning_rate": 4.92352772268207e-06, + "loss": 0.45636120438575745, + "mean_token_accuracy": 0.840438723564148, + "num_tokens": 4498658.0, + "step": 495 + }, + { + "epoch": 0.3768996960486322, + "grad_norm": 2.369748592376709, + "learning_rate": 4.923012818199576e-06, + "loss": 0.5206376910209656, + "mean_token_accuracy": 0.8521823287010193, + "num_tokens": 4504648.0, + "step": 496 + }, + { + "epoch": 0.3776595744680851, + "grad_norm": 2.733485221862793, + "learning_rate": 4.922496213153416e-06, + "loss": 0.5067723989486694, + "mean_token_accuracy": 0.8168281316757202, + "num_tokens": 4509990.0, + "step": 497 + }, + { + "epoch": 0.378419452887538, + "grad_norm": 2.3751676082611084, + "learning_rate": 4.921977907906161e-06, + "loss": 0.49757206439971924, + "mean_token_accuracy": 0.8325017690658569, + "num_tokens": 4518373.0, + "step": 498 + }, + { + "epoch": 0.3791793313069909, + "grad_norm": 2.1672775745391846, + "learning_rate": 4.921457902821578e-06, + "loss": 0.4237566590309143, + "mean_token_accuracy": 0.8404698371887207, + "num_tokens": 4524338.0, + "step": 499 + }, + { + "epoch": 0.3799392097264438, + "grad_norm": 1.8374360799789429, + "learning_rate": 4.9209361982646275e-06, + "loss": 0.4995468854904175, + "mean_token_accuracy": 0.8299649953842163, + "num_tokens": 4533396.0, + "step": 500 + }, + { + "epoch": 0.38069908814589665, + "grad_norm": 2.083967924118042, + "learning_rate": 4.920412794601461e-06, + "loss": 0.489935040473938, + "mean_token_accuracy": 0.8315291404724121, + "num_tokens": 4540941.0, + "step": 501 + }, + { + "epoch": 0.38145896656534956, + "grad_norm": 2.2075610160827637, + "learning_rate": 4.919887692199423e-06, + "loss": 0.5233147740364075, + "mean_token_accuracy": 0.804171085357666, + "num_tokens": 4548215.0, + "step": 502 + }, + { + "epoch": 0.3822188449848024, + "grad_norm": 2.076775312423706, + "learning_rate": 4.9193608914270515e-06, + "loss": 0.5785550475120544, + "mean_token_accuracy": 0.7993186116218567, + "num_tokens": 4558204.0, + "step": 503 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.238546133041382, + "learning_rate": 4.918832392654075e-06, + "loss": 0.5287384390830994, + "mean_token_accuracy": 0.8214945793151855, + "num_tokens": 4565407.0, + "step": 504 + }, + { + "epoch": 0.3837386018237082, + "grad_norm": 1.6783074140548706, + "learning_rate": 4.9183021962514145e-06, + "loss": 0.6063359379768372, + "mean_token_accuracy": 0.7914625406265259, + "num_tokens": 4580991.0, + "step": 505 + }, + { + "epoch": 0.3844984802431611, + "grad_norm": 1.6287449598312378, + "learning_rate": 4.917770302591183e-06, + "loss": 0.3598247766494751, + "mean_token_accuracy": 0.8706809878349304, + "num_tokens": 4590579.0, + "step": 506 + }, + { + "epoch": 0.385258358662614, + "grad_norm": 1.5432041883468628, + "learning_rate": 4.917236712046682e-06, + "loss": 0.5267890095710754, + "mean_token_accuracy": 0.8032117486000061, + "num_tokens": 4608380.0, + "step": 507 + }, + { + "epoch": 0.3860182370820669, + "grad_norm": 1.7664037942886353, + "learning_rate": 4.9167014249924075e-06, + "loss": 0.3552354574203491, + "mean_token_accuracy": 0.8569793701171875, + "num_tokens": 4616426.0, + "step": 508 + }, + { + "epoch": 0.38677811550151975, + "grad_norm": 2.1147472858428955, + "learning_rate": 4.916164441804044e-06, + "loss": 0.5212404727935791, + "mean_token_accuracy": 0.8196578025817871, + "num_tokens": 4623908.0, + "step": 509 + }, + { + "epoch": 0.38753799392097266, + "grad_norm": 2.1092333793640137, + "learning_rate": 4.915625762858467e-06, + "loss": 0.5197038650512695, + "mean_token_accuracy": 0.8245604634284973, + "num_tokens": 4630956.0, + "step": 510 + }, + { + "epoch": 0.3882978723404255, + "grad_norm": 1.23331880569458, + "learning_rate": 4.915085388533743e-06, + "loss": 0.4759839177131653, + "mean_token_accuracy": 0.8192248344421387, + "num_tokens": 4651269.0, + "step": 511 + }, + { + "epoch": 0.3890577507598784, + "grad_norm": 2.424199104309082, + "learning_rate": 4.914543319209126e-06, + "loss": 0.5576270818710327, + "mean_token_accuracy": 0.8203302621841431, + "num_tokens": 4657296.0, + "step": 512 + }, + { + "epoch": 0.3898176291793313, + "grad_norm": 2.725156307220459, + "learning_rate": 4.913999555265062e-06, + "loss": 0.4337949752807617, + "mean_token_accuracy": 0.8382406234741211, + "num_tokens": 4661850.0, + "step": 513 + }, + { + "epoch": 0.3905775075987842, + "grad_norm": 2.3120534420013428, + "learning_rate": 4.913454097083185e-06, + "loss": 0.4941597580909729, + "mean_token_accuracy": 0.8302834033966064, + "num_tokens": 4667769.0, + "step": 514 + }, + { + "epoch": 0.3913373860182371, + "grad_norm": 2.3111207485198975, + "learning_rate": 4.912906945046319e-06, + "loss": 0.5253715515136719, + "mean_token_accuracy": 0.84515380859375, + "num_tokens": 4674537.0, + "step": 515 + }, + { + "epoch": 0.39209726443769, + "grad_norm": 1.4117841720581055, + "learning_rate": 4.912358099538476e-06, + "loss": 0.4521017074584961, + "mean_token_accuracy": 0.8208256959915161, + "num_tokens": 4690605.0, + "step": 516 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 2.3742799758911133, + "learning_rate": 4.911807560944858e-06, + "loss": 0.41572901606559753, + "mean_token_accuracy": 0.8550551533699036, + "num_tokens": 4706437.0, + "step": 517 + }, + { + "epoch": 0.39361702127659576, + "grad_norm": 2.4052202701568604, + "learning_rate": 4.911255329651852e-06, + "loss": 0.6003736257553101, + "mean_token_accuracy": 0.8247885704040527, + "num_tokens": 4712746.0, + "step": 518 + }, + { + "epoch": 0.3943768996960486, + "grad_norm": 1.9335490465164185, + "learning_rate": 4.910701406047037e-06, + "loss": 0.5457713603973389, + "mean_token_accuracy": 0.787429690361023, + "num_tokens": 4731937.0, + "step": 519 + }, + { + "epoch": 0.3951367781155015, + "grad_norm": 2.257706880569458, + "learning_rate": 4.910145790519177e-06, + "loss": 0.5300652980804443, + "mean_token_accuracy": 0.8192912936210632, + "num_tokens": 4739422.0, + "step": 520 + }, + { + "epoch": 0.3958966565349544, + "grad_norm": 1.2099462747573853, + "learning_rate": 4.9095884834582256e-06, + "loss": 0.45872747898101807, + "mean_token_accuracy": 0.8362667560577393, + "num_tokens": 4757113.0, + "step": 521 + }, + { + "epoch": 0.3966565349544073, + "grad_norm": 2.7991135120391846, + "learning_rate": 4.909029485255321e-06, + "loss": 0.49039560556411743, + "mean_token_accuracy": 0.8260016441345215, + "num_tokens": 4761709.0, + "step": 522 + }, + { + "epoch": 0.3974164133738602, + "grad_norm": 2.2360129356384277, + "learning_rate": 4.90846879630279e-06, + "loss": 0.49556830525398254, + "mean_token_accuracy": 0.827864408493042, + "num_tokens": 4769048.0, + "step": 523 + }, + { + "epoch": 0.3981762917933131, + "grad_norm": 2.5953688621520996, + "learning_rate": 4.907906416994146e-06, + "loss": 0.387208491563797, + "mean_token_accuracy": 0.8467001914978027, + "num_tokens": 4774637.0, + "step": 524 + }, + { + "epoch": 0.39893617021276595, + "grad_norm": 2.1046814918518066, + "learning_rate": 4.907342347724088e-06, + "loss": 0.5477259755134583, + "mean_token_accuracy": 0.8060322999954224, + "num_tokens": 4782774.0, + "step": 525 + }, + { + "epoch": 0.39969604863221886, + "grad_norm": 2.5622646808624268, + "learning_rate": 4.906776588888502e-06, + "loss": 0.5684159398078918, + "mean_token_accuracy": 0.8095303177833557, + "num_tokens": 4788766.0, + "step": 526 + }, + { + "epoch": 0.4004559270516717, + "grad_norm": 1.9027913808822632, + "learning_rate": 4.906209140884459e-06, + "loss": 0.535524845123291, + "mean_token_accuracy": 0.815237820148468, + "num_tokens": 4798492.0, + "step": 527 + }, + { + "epoch": 0.4012158054711246, + "grad_norm": 2.1447622776031494, + "learning_rate": 4.905640004110216e-06, + "loss": 0.5628632307052612, + "mean_token_accuracy": 0.8085395097732544, + "num_tokens": 4805737.0, + "step": 528 + }, + { + "epoch": 0.40197568389057753, + "grad_norm": 1.6754741668701172, + "learning_rate": 4.905069178965215e-06, + "loss": 0.5046736598014832, + "mean_token_accuracy": 0.8247535228729248, + "num_tokens": 4816912.0, + "step": 529 + }, + { + "epoch": 0.4027355623100304, + "grad_norm": 2.271230459213257, + "learning_rate": 4.904496665850083e-06, + "loss": 0.6086187958717346, + "mean_token_accuracy": 0.7935276627540588, + "num_tokens": 4824577.0, + "step": 530 + }, + { + "epoch": 0.4034954407294833, + "grad_norm": 2.107595205307007, + "learning_rate": 4.903922465166633e-06, + "loss": 0.5431341528892517, + "mean_token_accuracy": 0.8129537105560303, + "num_tokens": 4831772.0, + "step": 531 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 1.3860732316970825, + "learning_rate": 4.903346577317859e-06, + "loss": 0.45816320180892944, + "mean_token_accuracy": 0.8328287601470947, + "num_tokens": 4850302.0, + "step": 532 + }, + { + "epoch": 0.40501519756838905, + "grad_norm": 1.9186837673187256, + "learning_rate": 4.902769002707942e-06, + "loss": 0.3294633626937866, + "mean_token_accuracy": 0.8853933811187744, + "num_tokens": 4856624.0, + "step": 533 + }, + { + "epoch": 0.40577507598784196, + "grad_norm": 1.516194462776184, + "learning_rate": 4.902189741742247e-06, + "loss": 0.45482105016708374, + "mean_token_accuracy": 0.8370342254638672, + "num_tokens": 4870395.0, + "step": 534 + }, + { + "epoch": 0.4065349544072948, + "grad_norm": 2.3235628604888916, + "learning_rate": 4.901608794827321e-06, + "loss": 0.40688639879226685, + "mean_token_accuracy": 0.8643521666526794, + "num_tokens": 4875645.0, + "step": 535 + }, + { + "epoch": 0.4072948328267477, + "grad_norm": 2.29286527633667, + "learning_rate": 4.9010261623708945e-06, + "loss": 0.45482826232910156, + "mean_token_accuracy": 0.8429383039474487, + "num_tokens": 4881772.0, + "step": 536 + }, + { + "epoch": 0.40805471124620063, + "grad_norm": 1.5907070636749268, + "learning_rate": 4.900441844781882e-06, + "loss": 0.5266948342323303, + "mean_token_accuracy": 0.8348641395568848, + "num_tokens": 4894289.0, + "step": 537 + }, + { + "epoch": 0.4088145896656535, + "grad_norm": 2.1816294193267822, + "learning_rate": 4.89985584247038e-06, + "loss": 0.4797617793083191, + "mean_token_accuracy": 0.8549500703811646, + "num_tokens": 4901106.0, + "step": 538 + }, + { + "epoch": 0.4095744680851064, + "grad_norm": 1.7347146272659302, + "learning_rate": 4.899268155847667e-06, + "loss": 0.4754739999771118, + "mean_token_accuracy": 0.8278418183326721, + "num_tokens": 4912131.0, + "step": 539 + }, + { + "epoch": 0.41033434650455924, + "grad_norm": 2.0694527626037598, + "learning_rate": 4.898678785326205e-06, + "loss": 0.5071008801460266, + "mean_token_accuracy": 0.8157946467399597, + "num_tokens": 4921141.0, + "step": 540 + }, + { + "epoch": 0.41109422492401215, + "grad_norm": 2.570047616958618, + "learning_rate": 4.898087731319637e-06, + "loss": 0.43639278411865234, + "mean_token_accuracy": 0.8682913780212402, + "num_tokens": 4926182.0, + "step": 541 + }, + { + "epoch": 0.41185410334346506, + "grad_norm": 4.064006805419922, + "learning_rate": 4.8974949942427854e-06, + "loss": 0.539260745048523, + "mean_token_accuracy": 0.8225528001785278, + "num_tokens": 4929449.0, + "step": 542 + }, + { + "epoch": 0.4126139817629179, + "grad_norm": 1.7644332647323608, + "learning_rate": 4.896900574511657e-06, + "loss": 0.472618043422699, + "mean_token_accuracy": 0.8332902193069458, + "num_tokens": 4939443.0, + "step": 543 + }, + { + "epoch": 0.4133738601823708, + "grad_norm": 2.879918336868286, + "learning_rate": 4.89630447254344e-06, + "loss": 0.6360667943954468, + "mean_token_accuracy": 0.8215296268463135, + "num_tokens": 4950838.0, + "step": 544 + }, + { + "epoch": 0.41413373860182373, + "grad_norm": 1.4575570821762085, + "learning_rate": 4.8957066887565005e-06, + "loss": 0.45617997646331787, + "mean_token_accuracy": 0.8373187184333801, + "num_tokens": 4965222.0, + "step": 545 + }, + { + "epoch": 0.4148936170212766, + "grad_norm": 2.4829535484313965, + "learning_rate": 4.895107223570386e-06, + "loss": 0.42285341024398804, + "mean_token_accuracy": 0.8686380386352539, + "num_tokens": 4970724.0, + "step": 546 + }, + { + "epoch": 0.4156534954407295, + "grad_norm": 2.639474630355835, + "learning_rate": 4.894506077405824e-06, + "loss": 0.5906289219856262, + "mean_token_accuracy": 0.8174435496330261, + "num_tokens": 4976766.0, + "step": 547 + }, + { + "epoch": 0.41641337386018235, + "grad_norm": 2.7960562705993652, + "learning_rate": 4.893903250684723e-06, + "loss": 0.4518949091434479, + "mean_token_accuracy": 0.8387585282325745, + "num_tokens": 4980991.0, + "step": 548 + }, + { + "epoch": 0.41717325227963525, + "grad_norm": 2.184176206588745, + "learning_rate": 4.893298743830168e-06, + "loss": 0.5223842859268188, + "mean_token_accuracy": 0.8170937299728394, + "num_tokens": 4987781.0, + "step": 549 + }, + { + "epoch": 0.41793313069908816, + "grad_norm": 2.2393438816070557, + "learning_rate": 4.892692557266429e-06, + "loss": 0.5238431692123413, + "mean_token_accuracy": 0.8217905759811401, + "num_tokens": 4994321.0, + "step": 550 + }, + { + "epoch": 0.418693009118541, + "grad_norm": 3.579047441482544, + "learning_rate": 4.8920846914189465e-06, + "loss": 0.5367584228515625, + "mean_token_accuracy": 0.8312011361122131, + "num_tokens": 4997951.0, + "step": 551 + }, + { + "epoch": 0.4194528875379939, + "grad_norm": 1.6330240964889526, + "learning_rate": 4.891475146714348e-06, + "loss": 0.6054705381393433, + "mean_token_accuracy": 0.7938206791877747, + "num_tokens": 5012726.0, + "step": 552 + }, + { + "epoch": 0.42021276595744683, + "grad_norm": 1.5775716304779053, + "learning_rate": 4.8908639235804324e-06, + "loss": 0.4774656891822815, + "mean_token_accuracy": 0.828762948513031, + "num_tokens": 5026751.0, + "step": 553 + }, + { + "epoch": 0.4209726443768997, + "grad_norm": 1.5719101428985596, + "learning_rate": 4.890251022446181e-06, + "loss": 0.549429178237915, + "mean_token_accuracy": 0.8110791444778442, + "num_tokens": 5041861.0, + "step": 554 + }, + { + "epoch": 0.4217325227963526, + "grad_norm": 1.8585275411605835, + "learning_rate": 4.889636443741752e-06, + "loss": 0.4448118805885315, + "mean_token_accuracy": 0.8462690711021423, + "num_tokens": 5052690.0, + "step": 555 + }, + { + "epoch": 0.42249240121580545, + "grad_norm": 2.189202070236206, + "learning_rate": 4.88902018789848e-06, + "loss": 0.4296762943267822, + "mean_token_accuracy": 0.8488791584968567, + "num_tokens": 5058964.0, + "step": 556 + }, + { + "epoch": 0.42325227963525835, + "grad_norm": 1.9328460693359375, + "learning_rate": 4.888402255348877e-06, + "loss": 0.5369474291801453, + "mean_token_accuracy": 0.8184729814529419, + "num_tokens": 5068465.0, + "step": 557 + }, + { + "epoch": 0.42401215805471126, + "grad_norm": 1.6233323812484741, + "learning_rate": 4.887782646526631e-06, + "loss": 0.5284391641616821, + "mean_token_accuracy": 0.8276044726371765, + "num_tokens": 5081052.0, + "step": 558 + }, + { + "epoch": 0.4247720364741641, + "grad_norm": 2.222813844680786, + "learning_rate": 4.887161361866608e-06, + "loss": 0.5679137706756592, + "mean_token_accuracy": 0.8012375831604004, + "num_tokens": 5090001.0, + "step": 559 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.1062207221984863, + "learning_rate": 4.8865384018048494e-06, + "loss": 0.5554201602935791, + "mean_token_accuracy": 0.8128066062927246, + "num_tokens": 5097644.0, + "step": 560 + }, + { + "epoch": 0.42629179331306993, + "grad_norm": 1.5380984544754028, + "learning_rate": 4.8859137667785735e-06, + "loss": 0.4948265850543976, + "mean_token_accuracy": 0.8258291482925415, + "num_tokens": 5110069.0, + "step": 561 + }, + { + "epoch": 0.4270516717325228, + "grad_norm": 2.0290257930755615, + "learning_rate": 4.8852874572261715e-06, + "loss": 0.4969530403614044, + "mean_token_accuracy": 0.8297134637832642, + "num_tokens": 5117452.0, + "step": 562 + }, + { + "epoch": 0.4278115501519757, + "grad_norm": 1.5651452541351318, + "learning_rate": 4.884659473587213e-06, + "loss": 0.5353102087974548, + "mean_token_accuracy": 0.8161719441413879, + "num_tokens": 5133756.0, + "step": 563 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 2.2470998764038086, + "learning_rate": 4.884029816302441e-06, + "loss": 0.5104288458824158, + "mean_token_accuracy": 0.8081635236740112, + "num_tokens": 5140278.0, + "step": 564 + }, + { + "epoch": 0.42933130699088146, + "grad_norm": 1.726891279220581, + "learning_rate": 4.883398485813772e-06, + "loss": 0.4508771002292633, + "mean_token_accuracy": 0.8548800349235535, + "num_tokens": 5150115.0, + "step": 565 + }, + { + "epoch": 0.43009118541033436, + "grad_norm": 1.4779289960861206, + "learning_rate": 4.8827654825642984e-06, + "loss": 0.46861088275909424, + "mean_token_accuracy": 0.8209476470947266, + "num_tokens": 5163225.0, + "step": 566 + }, + { + "epoch": 0.4308510638297872, + "grad_norm": 1.2361034154891968, + "learning_rate": 4.882130806998287e-06, + "loss": 0.4591076672077179, + "mean_token_accuracy": 0.803041934967041, + "num_tokens": 5180342.0, + "step": 567 + }, + { + "epoch": 0.4316109422492401, + "grad_norm": 1.882467269897461, + "learning_rate": 4.881494459561177e-06, + "loss": 0.579258143901825, + "mean_token_accuracy": 0.8007112741470337, + "num_tokens": 5189595.0, + "step": 568 + }, + { + "epoch": 0.43237082066869303, + "grad_norm": 1.095462441444397, + "learning_rate": 4.880856440699582e-06, + "loss": 0.3806574046611786, + "mean_token_accuracy": 0.8650111556053162, + "num_tokens": 5211642.0, + "step": 569 + }, + { + "epoch": 0.4331306990881459, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.880216750861288e-06, + "loss": 0.544589638710022, + "mean_token_accuracy": 0.8060122728347778, + "num_tokens": 5224137.0, + "step": 570 + }, + { + "epoch": 0.4338905775075988, + "grad_norm": 1.8561251163482666, + "learning_rate": 4.879575390495254e-06, + "loss": 0.4094924330711365, + "mean_token_accuracy": 0.8591406345367432, + "num_tokens": 5231588.0, + "step": 571 + }, + { + "epoch": 0.43465045592705165, + "grad_norm": 3.01326847076416, + "learning_rate": 4.878932360051611e-06, + "loss": 0.6139192581176758, + "mean_token_accuracy": 0.8108739852905273, + "num_tokens": 5236853.0, + "step": 572 + }, + { + "epoch": 0.43541033434650456, + "grad_norm": 2.1753034591674805, + "learning_rate": 4.878287659981663e-06, + "loss": 0.49082931876182556, + "mean_token_accuracy": 0.862828254699707, + "num_tokens": 5243264.0, + "step": 573 + }, + { + "epoch": 0.43617021276595747, + "grad_norm": 1.4437755346298218, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.5608728528022766, + "mean_token_accuracy": 0.8271626234054565, + "num_tokens": 5261757.0, + "step": 574 + }, + { + "epoch": 0.4369300911854103, + "grad_norm": 1.786683440208435, + "learning_rate": 4.876993252773923e-06, + "loss": 0.4377627968788147, + "mean_token_accuracy": 0.844936192035675, + "num_tokens": 5271038.0, + "step": 575 + }, + { + "epoch": 0.4376899696048632, + "grad_norm": 1.3425915241241455, + "learning_rate": 4.876343546544596e-06, + "loss": 0.44762521982192993, + "mean_token_accuracy": 0.8397793769836426, + "num_tokens": 5285555.0, + "step": 576 + }, + { + "epoch": 0.43844984802431614, + "grad_norm": 2.1549675464630127, + "learning_rate": 4.8756921725058935e-06, + "loss": 0.5332942008972168, + "mean_token_accuracy": 0.820149302482605, + "num_tokens": 5294595.0, + "step": 577 + }, + { + "epoch": 0.439209726443769, + "grad_norm": 1.5254042148590088, + "learning_rate": 4.875039131114975e-06, + "loss": 0.3646543622016907, + "mean_token_accuracy": 0.8442583084106445, + "num_tokens": 5304955.0, + "step": 578 + }, + { + "epoch": 0.4399696048632219, + "grad_norm": 1.5751557350158691, + "learning_rate": 4.8743844228301676e-06, + "loss": 0.4854734539985657, + "mean_token_accuracy": 0.8317523002624512, + "num_tokens": 5317351.0, + "step": 579 + }, + { + "epoch": 0.44072948328267475, + "grad_norm": 1.6950466632843018, + "learning_rate": 4.873728048110973e-06, + "loss": 0.5907570719718933, + "mean_token_accuracy": 0.7946986556053162, + "num_tokens": 5332542.0, + "step": 580 + }, + { + "epoch": 0.44148936170212766, + "grad_norm": 2.1180708408355713, + "learning_rate": 4.873070007418059e-06, + "loss": 0.5220296382904053, + "mean_token_accuracy": 0.8037363290786743, + "num_tokens": 5341722.0, + "step": 581 + }, + { + "epoch": 0.44224924012158057, + "grad_norm": 1.3643816709518433, + "learning_rate": 4.872410301213265e-06, + "loss": 0.4865502417087555, + "mean_token_accuracy": 0.8377852439880371, + "num_tokens": 5359359.0, + "step": 582 + }, + { + "epoch": 0.4430091185410334, + "grad_norm": 1.483280897140503, + "learning_rate": 4.871748929959598e-06, + "loss": 0.36856764554977417, + "mean_token_accuracy": 0.8709549903869629, + "num_tokens": 5369749.0, + "step": 583 + }, + { + "epoch": 0.44376899696048633, + "grad_norm": 1.6891541481018066, + "learning_rate": 4.871085894121234e-06, + "loss": 0.5768930912017822, + "mean_token_accuracy": 0.8030461668968201, + "num_tokens": 5383912.0, + "step": 584 + }, + { + "epoch": 0.44452887537993924, + "grad_norm": 2.1318740844726562, + "learning_rate": 4.870421194163515e-06, + "loss": 0.4337100386619568, + "mean_token_accuracy": 0.8562518358230591, + "num_tokens": 5389412.0, + "step": 585 + }, + { + "epoch": 0.4452887537993921, + "grad_norm": 2.540255546569824, + "learning_rate": 4.869754830552956e-06, + "loss": 0.4708256125450134, + "mean_token_accuracy": 0.8446552753448486, + "num_tokens": 5394762.0, + "step": 586 + }, + { + "epoch": 0.446048632218845, + "grad_norm": 2.048015594482422, + "learning_rate": 4.869086803757235e-06, + "loss": 0.5265800952911377, + "mean_token_accuracy": 0.8181137442588806, + "num_tokens": 5402379.0, + "step": 587 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 2.9821012020111084, + "learning_rate": 4.868417114245199e-06, + "loss": 0.6299797296524048, + "mean_token_accuracy": 0.8237329125404358, + "num_tokens": 5408229.0, + "step": 588 + }, + { + "epoch": 0.44756838905775076, + "grad_norm": 1.7807202339172363, + "learning_rate": 4.867745762486862e-06, + "loss": 0.5176759958267212, + "mean_token_accuracy": 0.8184244632720947, + "num_tokens": 5418383.0, + "step": 589 + }, + { + "epoch": 0.44832826747720367, + "grad_norm": 1.5466399192810059, + "learning_rate": 4.8670727489534035e-06, + "loss": 0.5137228965759277, + "mean_token_accuracy": 0.8365053534507751, + "num_tokens": 5432127.0, + "step": 590 + }, + { + "epoch": 0.4490881458966565, + "grad_norm": 2.9521141052246094, + "learning_rate": 4.866398074117173e-06, + "loss": 0.4056887924671173, + "mean_token_accuracy": 0.8561501502990723, + "num_tokens": 5436062.0, + "step": 591 + }, + { + "epoch": 0.44984802431610943, + "grad_norm": 2.058743953704834, + "learning_rate": 4.86572173845168e-06, + "loss": 0.6124799251556396, + "mean_token_accuracy": 0.8007957339286804, + "num_tokens": 5444989.0, + "step": 592 + }, + { + "epoch": 0.4506079027355623, + "grad_norm": 2.1243767738342285, + "learning_rate": 4.865043742431605e-06, + "loss": 0.5659694671630859, + "mean_token_accuracy": 0.8084750175476074, + "num_tokens": 5453865.0, + "step": 593 + }, + { + "epoch": 0.4513677811550152, + "grad_norm": 1.6732314825057983, + "learning_rate": 4.864364086532792e-06, + "loss": 0.47879064083099365, + "mean_token_accuracy": 0.8346436023712158, + "num_tokens": 5466398.0, + "step": 594 + }, + { + "epoch": 0.4521276595744681, + "grad_norm": 1.3793858289718628, + "learning_rate": 4.863682771232249e-06, + "loss": 0.45989373326301575, + "mean_token_accuracy": 0.8254791498184204, + "num_tokens": 5482121.0, + "step": 595 + }, + { + "epoch": 0.45288753799392095, + "grad_norm": 1.9812315702438354, + "learning_rate": 4.862999797008149e-06, + "loss": 0.5778874754905701, + "mean_token_accuracy": 0.8041508197784424, + "num_tokens": 5493000.0, + "step": 596 + }, + { + "epoch": 0.45364741641337386, + "grad_norm": 3.3065083026885986, + "learning_rate": 4.862315164339829e-06, + "loss": 0.4623975157737732, + "mean_token_accuracy": 0.8426318168640137, + "num_tokens": 5496723.0, + "step": 597 + }, + { + "epoch": 0.45440729483282677, + "grad_norm": 3.167119026184082, + "learning_rate": 4.861628873707792e-06, + "loss": 0.6984533667564392, + "mean_token_accuracy": 0.772136926651001, + "num_tokens": 5501161.0, + "step": 598 + }, + { + "epoch": 0.4551671732522796, + "grad_norm": 2.2130985260009766, + "learning_rate": 4.860940925593703e-06, + "loss": 0.4823192059993744, + "mean_token_accuracy": 0.8462972640991211, + "num_tokens": 5509544.0, + "step": 599 + }, + { + "epoch": 0.45592705167173253, + "grad_norm": 3.029191732406616, + "learning_rate": 4.86025132048039e-06, + "loss": 0.523664116859436, + "mean_token_accuracy": 0.8229140043258667, + "num_tokens": 5514586.0, + "step": 600 + }, + { + "epoch": 0.4566869300911854, + "grad_norm": 1.6983962059020996, + "learning_rate": 4.859560058851844e-06, + "loss": 0.4832698106765747, + "mean_token_accuracy": 0.8403248190879822, + "num_tokens": 5525773.0, + "step": 601 + }, + { + "epoch": 0.4574468085106383, + "grad_norm": 3.0504038333892822, + "learning_rate": 4.8588671411932195e-06, + "loss": 0.5158926248550415, + "mean_token_accuracy": 0.8098392486572266, + "num_tokens": 5529739.0, + "step": 602 + }, + { + "epoch": 0.4582066869300912, + "grad_norm": 2.584836483001709, + "learning_rate": 4.858172567990832e-06, + "loss": 0.5724587440490723, + "mean_token_accuracy": 0.8128519058227539, + "num_tokens": 5535763.0, + "step": 603 + }, + { + "epoch": 0.45896656534954405, + "grad_norm": 2.0514042377471924, + "learning_rate": 4.857476339732162e-06, + "loss": 0.4337679445743561, + "mean_token_accuracy": 0.8405929207801819, + "num_tokens": 5543075.0, + "step": 604 + }, + { + "epoch": 0.45972644376899696, + "grad_norm": 2.2949347496032715, + "learning_rate": 4.856778456905846e-06, + "loss": 0.46532145142555237, + "mean_token_accuracy": 0.8345137238502502, + "num_tokens": 5549035.0, + "step": 605 + }, + { + "epoch": 0.46048632218844987, + "grad_norm": 2.2067551612854004, + "learning_rate": 4.856078920001689e-06, + "loss": 0.5855136513710022, + "mean_token_accuracy": 0.8043795228004456, + "num_tokens": 5555545.0, + "step": 606 + }, + { + "epoch": 0.4612462006079027, + "grad_norm": 2.101945161819458, + "learning_rate": 4.855377729510648e-06, + "loss": 0.6071814298629761, + "mean_token_accuracy": 0.7973253130912781, + "num_tokens": 5563615.0, + "step": 607 + }, + { + "epoch": 0.46200607902735563, + "grad_norm": 2.5958821773529053, + "learning_rate": 4.8546748859248504e-06, + "loss": 0.6278061866760254, + "mean_token_accuracy": 0.7864972352981567, + "num_tokens": 5570078.0, + "step": 608 + }, + { + "epoch": 0.4627659574468085, + "grad_norm": 2.778101921081543, + "learning_rate": 4.853970389737576e-06, + "loss": 0.35521194338798523, + "mean_token_accuracy": 0.8752605319023132, + "num_tokens": 5573995.0, + "step": 609 + }, + { + "epoch": 0.4635258358662614, + "grad_norm": 2.600534677505493, + "learning_rate": 4.8532642414432675e-06, + "loss": 0.6541563868522644, + "mean_token_accuracy": 0.7843613028526306, + "num_tokens": 5580333.0, + "step": 610 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 1.778337836265564, + "learning_rate": 4.852556441537528e-06, + "loss": 0.3561405837535858, + "mean_token_accuracy": 0.8579353094100952, + "num_tokens": 5588430.0, + "step": 611 + }, + { + "epoch": 0.46504559270516715, + "grad_norm": 1.5653862953186035, + "learning_rate": 4.851846990517118e-06, + "loss": 0.6067906618118286, + "mean_token_accuracy": 0.7919317483901978, + "num_tokens": 5601700.0, + "step": 612 + }, + { + "epoch": 0.46580547112462006, + "grad_norm": 1.6097723245620728, + "learning_rate": 4.851135888879958e-06, + "loss": 0.446664422750473, + "mean_token_accuracy": 0.8441969156265259, + "num_tokens": 5612063.0, + "step": 613 + }, + { + "epoch": 0.46656534954407297, + "grad_norm": 1.961207389831543, + "learning_rate": 4.850423137125126e-06, + "loss": 0.5508605241775513, + "mean_token_accuracy": 0.8240450024604797, + "num_tokens": 5620245.0, + "step": 614 + }, + { + "epoch": 0.4673252279635258, + "grad_norm": 2.2189085483551025, + "learning_rate": 4.8497087357528585e-06, + "loss": 0.6805076599121094, + "mean_token_accuracy": 0.771978497505188, + "num_tokens": 5629590.0, + "step": 615 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 2.5176279544830322, + "learning_rate": 4.8489926852645505e-06, + "loss": 0.4512156844139099, + "mean_token_accuracy": 0.836459755897522, + "num_tokens": 5635259.0, + "step": 616 + }, + { + "epoch": 0.4688449848024316, + "grad_norm": 1.5327287912368774, + "learning_rate": 4.848274986162754e-06, + "loss": 0.4884302616119385, + "mean_token_accuracy": 0.8194037079811096, + "num_tokens": 5649993.0, + "step": 617 + }, + { + "epoch": 0.4696048632218845, + "grad_norm": 2.184554100036621, + "learning_rate": 4.847555638951177e-06, + "loss": 0.5141451358795166, + "mean_token_accuracy": 0.8245922327041626, + "num_tokens": 5657375.0, + "step": 618 + }, + { + "epoch": 0.4703647416413374, + "grad_norm": 1.6143407821655273, + "learning_rate": 4.846834644134686e-06, + "loss": 0.4276641607284546, + "mean_token_accuracy": 0.8481845855712891, + "num_tokens": 5667941.0, + "step": 619 + }, + { + "epoch": 0.47112462006079026, + "grad_norm": 2.3747270107269287, + "learning_rate": 4.846112002219301e-06, + "loss": 0.5608246922492981, + "mean_token_accuracy": 0.8073011040687561, + "num_tokens": 5675042.0, + "step": 620 + }, + { + "epoch": 0.47188449848024316, + "grad_norm": 2.390404224395752, + "learning_rate": 4.845387713712203e-06, + "loss": 0.46616724133491516, + "mean_token_accuracy": 0.8468319177627563, + "num_tokens": 5680207.0, + "step": 621 + }, + { + "epoch": 0.4726443768996961, + "grad_norm": 1.7245099544525146, + "learning_rate": 4.844661779121723e-06, + "loss": 0.5652435421943665, + "mean_token_accuracy": 0.8010749816894531, + "num_tokens": 5693759.0, + "step": 622 + }, + { + "epoch": 0.4734042553191489, + "grad_norm": 2.6923108100891113, + "learning_rate": 4.843934198957351e-06, + "loss": 0.6254661679267883, + "mean_token_accuracy": 0.8236024975776672, + "num_tokens": 5699916.0, + "step": 623 + }, + { + "epoch": 0.47416413373860183, + "grad_norm": 2.516901969909668, + "learning_rate": 4.84320497372973e-06, + "loss": 0.6334252953529358, + "mean_token_accuracy": 0.7803834676742554, + "num_tokens": 5706554.0, + "step": 624 + }, + { + "epoch": 0.4749240121580547, + "grad_norm": 2.3744447231292725, + "learning_rate": 4.842474103950658e-06, + "loss": 0.4221811890602112, + "mean_token_accuracy": 0.8639545440673828, + "num_tokens": 5711756.0, + "step": 625 + }, + { + "epoch": 0.4756838905775076, + "grad_norm": 3.2373476028442383, + "learning_rate": 4.841741590133089e-06, + "loss": 0.6637828946113586, + "mean_token_accuracy": 0.7968347072601318, + "num_tokens": 5716458.0, + "step": 626 + }, + { + "epoch": 0.4764437689969605, + "grad_norm": 2.153888463973999, + "learning_rate": 4.841007432791129e-06, + "loss": 0.4877486228942871, + "mean_token_accuracy": 0.8345249891281128, + "num_tokens": 5723155.0, + "step": 627 + }, + { + "epoch": 0.47720364741641336, + "grad_norm": 2.120497703552246, + "learning_rate": 4.8402716324400375e-06, + "loss": 0.37323033809661865, + "mean_token_accuracy": 0.8734050393104553, + "num_tokens": 5729171.0, + "step": 628 + }, + { + "epoch": 0.47796352583586627, + "grad_norm": 1.5294172763824463, + "learning_rate": 4.839534189596228e-06, + "loss": 0.4057067334651947, + "mean_token_accuracy": 0.8523319959640503, + "num_tokens": 5740112.0, + "step": 629 + }, + { + "epoch": 0.4787234042553192, + "grad_norm": 2.1913886070251465, + "learning_rate": 4.8387951047772656e-06, + "loss": 0.4835960865020752, + "mean_token_accuracy": 0.8438145518302917, + "num_tokens": 5746838.0, + "step": 630 + }, + { + "epoch": 0.479483282674772, + "grad_norm": 1.482897162437439, + "learning_rate": 4.838054378501868e-06, + "loss": 0.46967992186546326, + "mean_token_accuracy": 0.8315759897232056, + "num_tokens": 5760428.0, + "step": 631 + }, + { + "epoch": 0.48024316109422494, + "grad_norm": 1.38850998878479, + "learning_rate": 4.837312011289907e-06, + "loss": 0.41845446825027466, + "mean_token_accuracy": 0.8557186126708984, + "num_tokens": 5773437.0, + "step": 632 + }, + { + "epoch": 0.4810030395136778, + "grad_norm": 3.8337457180023193, + "learning_rate": 4.836568003662403e-06, + "loss": 0.5102912187576294, + "mean_token_accuracy": 0.830644965171814, + "num_tokens": 5776367.0, + "step": 633 + }, + { + "epoch": 0.4817629179331307, + "grad_norm": 1.2084007263183594, + "learning_rate": 4.8358223561415304e-06, + "loss": 0.3835333585739136, + "mean_token_accuracy": 0.8639016151428223, + "num_tokens": 5792246.0, + "step": 634 + }, + { + "epoch": 0.4825227963525836, + "grad_norm": 1.939408540725708, + "learning_rate": 4.835075069250613e-06, + "loss": 0.4044850468635559, + "mean_token_accuracy": 0.8488376140594482, + "num_tokens": 5799853.0, + "step": 635 + }, + { + "epoch": 0.48328267477203646, + "grad_norm": 1.345870852470398, + "learning_rate": 4.8343261435141245e-06, + "loss": 0.46660199761390686, + "mean_token_accuracy": 0.8371681571006775, + "num_tokens": 5817478.0, + "step": 636 + }, + { + "epoch": 0.48404255319148937, + "grad_norm": 1.6531339883804321, + "learning_rate": 4.833575579457691e-06, + "loss": 0.3886989951133728, + "mean_token_accuracy": 0.8763507008552551, + "num_tokens": 5825739.0, + "step": 637 + }, + { + "epoch": 0.4848024316109423, + "grad_norm": 1.6443969011306763, + "learning_rate": 4.832823377608088e-06, + "loss": 0.4070289731025696, + "mean_token_accuracy": 0.8586630821228027, + "num_tokens": 5837917.0, + "step": 638 + }, + { + "epoch": 0.48556231003039513, + "grad_norm": 2.005136013031006, + "learning_rate": 4.832069538493237e-06, + "loss": 0.40616685152053833, + "mean_token_accuracy": 0.8571510314941406, + "num_tokens": 5845250.0, + "step": 639 + }, + { + "epoch": 0.48632218844984804, + "grad_norm": 1.5244266986846924, + "learning_rate": 4.831314062642213e-06, + "loss": 0.49530288577079773, + "mean_token_accuracy": 0.8328841924667358, + "num_tokens": 5857407.0, + "step": 640 + }, + { + "epoch": 0.4870820668693009, + "grad_norm": 1.9876971244812012, + "learning_rate": 4.830556950585239e-06, + "loss": 0.4583776593208313, + "mean_token_accuracy": 0.8427221179008484, + "num_tokens": 5865391.0, + "step": 641 + }, + { + "epoch": 0.4878419452887538, + "grad_norm": 3.023336172103882, + "learning_rate": 4.829798202853683e-06, + "loss": 0.6134771108627319, + "mean_token_accuracy": 0.7981935739517212, + "num_tokens": 5870729.0, + "step": 642 + }, + { + "epoch": 0.4886018237082067, + "grad_norm": 1.8889515399932861, + "learning_rate": 4.829037819980065e-06, + "loss": 0.4420135021209717, + "mean_token_accuracy": 0.8480775356292725, + "num_tokens": 5878982.0, + "step": 643 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.2408435344696045, + "learning_rate": 4.828275802498051e-06, + "loss": 0.525706946849823, + "mean_token_accuracy": 0.8271557092666626, + "num_tokens": 5885097.0, + "step": 644 + }, + { + "epoch": 0.49012158054711247, + "grad_norm": 1.9734224081039429, + "learning_rate": 4.827512150942454e-06, + "loss": 0.44246578216552734, + "mean_token_accuracy": 0.8456668257713318, + "num_tokens": 5893941.0, + "step": 645 + }, + { + "epoch": 0.4908814589665654, + "grad_norm": 1.9618173837661743, + "learning_rate": 4.8267468658492335e-06, + "loss": 0.5119768381118774, + "mean_token_accuracy": 0.8355510830879211, + "num_tokens": 5902829.0, + "step": 646 + }, + { + "epoch": 0.49164133738601823, + "grad_norm": 1.7181587219238281, + "learning_rate": 4.825979947755496e-06, + "loss": 0.5666520595550537, + "mean_token_accuracy": 0.7951971888542175, + "num_tokens": 5915212.0, + "step": 647 + }, + { + "epoch": 0.49240121580547114, + "grad_norm": 3.0121164321899414, + "learning_rate": 4.8252113971994955e-06, + "loss": 0.628632128238678, + "mean_token_accuracy": 0.8041050434112549, + "num_tokens": 5921410.0, + "step": 648 + }, + { + "epoch": 0.493161094224924, + "grad_norm": 2.9980475902557373, + "learning_rate": 4.824441214720629e-06, + "loss": 0.4507424831390381, + "mean_token_accuracy": 0.8636263608932495, + "num_tokens": 5925179.0, + "step": 649 + }, + { + "epoch": 0.4939209726443769, + "grad_norm": 2.0096445083618164, + "learning_rate": 4.823669400859441e-06, + "loss": 0.602759838104248, + "mean_token_accuracy": 0.8104915618896484, + "num_tokens": 5934160.0, + "step": 650 + }, + { + "epoch": 0.4946808510638298, + "grad_norm": 1.1186442375183105, + "learning_rate": 4.8228959561576195e-06, + "loss": 0.41168469190597534, + "mean_token_accuracy": 0.8461419939994812, + "num_tokens": 5954163.0, + "step": 651 + }, + { + "epoch": 0.49544072948328266, + "grad_norm": 1.855465054512024, + "learning_rate": 4.822120881157998e-06, + "loss": 0.5049735307693481, + "mean_token_accuracy": 0.8225747346878052, + "num_tokens": 5963840.0, + "step": 652 + }, + { + "epoch": 0.49620060790273557, + "grad_norm": 3.550563335418701, + "learning_rate": 4.821344176404554e-06, + "loss": 0.49025264382362366, + "mean_token_accuracy": 0.8265978693962097, + "num_tokens": 5967358.0, + "step": 653 + }, + { + "epoch": 0.4969604863221885, + "grad_norm": 3.063910484313965, + "learning_rate": 4.820565842442408e-06, + "loss": 0.5652767419815063, + "mean_token_accuracy": 0.811700701713562, + "num_tokens": 5971858.0, + "step": 654 + }, + { + "epoch": 0.49772036474164133, + "grad_norm": 2.4613308906555176, + "learning_rate": 4.819785879817827e-06, + "loss": 0.5296125411987305, + "mean_token_accuracy": 0.8336488008499146, + "num_tokens": 5977442.0, + "step": 655 + }, + { + "epoch": 0.49848024316109424, + "grad_norm": 2.342519760131836, + "learning_rate": 4.819004289078217e-06, + "loss": 0.5753380060195923, + "mean_token_accuracy": 0.7922406792640686, + "num_tokens": 5984531.0, + "step": 656 + }, + { + "epoch": 0.4992401215805471, + "grad_norm": 2.0410680770874023, + "learning_rate": 4.818221070772129e-06, + "loss": 0.5433275699615479, + "mean_token_accuracy": 0.8043830990791321, + "num_tokens": 5992642.0, + "step": 657 + }, + { + "epoch": 0.5, + "grad_norm": 1.4999698400497437, + "learning_rate": 4.8174362254492555e-06, + "loss": 0.5248899459838867, + "mean_token_accuracy": 0.8107168674468994, + "num_tokens": 6005543.0, + "step": 658 + }, + { + "epoch": 0.5007598784194529, + "grad_norm": 1.9494401216506958, + "learning_rate": 4.816649753660431e-06, + "loss": 0.41291385889053345, + "mean_token_accuracy": 0.8650569915771484, + "num_tokens": 6012185.0, + "step": 659 + }, + { + "epoch": 0.5015197568389058, + "grad_norm": 2.7514095306396484, + "learning_rate": 4.815861655957632e-06, + "loss": 0.4244142770767212, + "mean_token_accuracy": 0.8485112190246582, + "num_tokens": 6016809.0, + "step": 660 + }, + { + "epoch": 0.5022796352583586, + "grad_norm": 1.4354928731918335, + "learning_rate": 4.815071932893976e-06, + "loss": 0.4332060217857361, + "mean_token_accuracy": 0.8386815786361694, + "num_tokens": 6034795.0, + "step": 661 + }, + { + "epoch": 0.5030395136778115, + "grad_norm": 1.3113417625427246, + "learning_rate": 4.81428058502372e-06, + "loss": 0.5415540933609009, + "mean_token_accuracy": 0.8115285038948059, + "num_tokens": 6053624.0, + "step": 662 + }, + { + "epoch": 0.5037993920972644, + "grad_norm": 1.820868730545044, + "learning_rate": 4.813487612902265e-06, + "loss": 0.5360245108604431, + "mean_token_accuracy": 0.8313555717468262, + "num_tokens": 6063399.0, + "step": 663 + }, + { + "epoch": 0.5045592705167173, + "grad_norm": 2.347001552581787, + "learning_rate": 4.812693017086145e-06, + "loss": 0.4926982820034027, + "mean_token_accuracy": 0.8137006759643555, + "num_tokens": 6070111.0, + "step": 664 + }, + { + "epoch": 0.5053191489361702, + "grad_norm": 1.8830888271331787, + "learning_rate": 4.811896798133042e-06, + "loss": 0.5419014692306519, + "mean_token_accuracy": 0.8027454614639282, + "num_tokens": 6081090.0, + "step": 665 + }, + { + "epoch": 0.506079027355623, + "grad_norm": 2.3258056640625, + "learning_rate": 4.811098956601772e-06, + "loss": 0.4629337787628174, + "mean_token_accuracy": 0.8416580557823181, + "num_tokens": 6087921.0, + "step": 666 + }, + { + "epoch": 0.506838905775076, + "grad_norm": 1.9578291177749634, + "learning_rate": 4.810299493052289e-06, + "loss": 0.40305402874946594, + "mean_token_accuracy": 0.8529061079025269, + "num_tokens": 6100034.0, + "step": 667 + }, + { + "epoch": 0.5075987841945289, + "grad_norm": 2.800635576248169, + "learning_rate": 4.809498408045691e-06, + "loss": 0.5087342262268066, + "mean_token_accuracy": 0.8214689493179321, + "num_tokens": 6104742.0, + "step": 668 + }, + { + "epoch": 0.5083586626139818, + "grad_norm": 1.5318149328231812, + "learning_rate": 4.808695702144206e-06, + "loss": 0.4733222723007202, + "mean_token_accuracy": 0.837577223777771, + "num_tokens": 6117242.0, + "step": 669 + }, + { + "epoch": 0.5091185410334347, + "grad_norm": 1.2368661165237427, + "learning_rate": 4.807891375911207e-06, + "loss": 0.3929097056388855, + "mean_token_accuracy": 0.8331400752067566, + "num_tokens": 6133509.0, + "step": 670 + }, + { + "epoch": 0.5098784194528876, + "grad_norm": 2.4711415767669678, + "learning_rate": 4.8070854299112e-06, + "loss": 0.6294851303100586, + "mean_token_accuracy": 0.7956781983375549, + "num_tokens": 6140294.0, + "step": 671 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.590961217880249, + "learning_rate": 4.806277864709828e-06, + "loss": 0.580160915851593, + "mean_token_accuracy": 0.809589684009552, + "num_tokens": 6145803.0, + "step": 672 + }, + { + "epoch": 0.5113981762917933, + "grad_norm": 2.4653842449188232, + "learning_rate": 4.805468680873874e-06, + "loss": 0.5262120366096497, + "mean_token_accuracy": 0.822458803653717, + "num_tokens": 6151236.0, + "step": 673 + }, + { + "epoch": 0.5121580547112462, + "grad_norm": 2.860720157623291, + "learning_rate": 4.804657878971252e-06, + "loss": 0.4007391035556793, + "mean_token_accuracy": 0.8637382984161377, + "num_tokens": 6155310.0, + "step": 674 + }, + { + "epoch": 0.5129179331306991, + "grad_norm": 2.520282030105591, + "learning_rate": 4.803845459571014e-06, + "loss": 0.45798182487487793, + "mean_token_accuracy": 0.8270114660263062, + "num_tokens": 6160326.0, + "step": 675 + }, + { + "epoch": 0.513677811550152, + "grad_norm": 2.7290921211242676, + "learning_rate": 4.803031423243349e-06, + "loss": 0.5745848417282104, + "mean_token_accuracy": 0.8401234745979309, + "num_tokens": 6165709.0, + "step": 676 + }, + { + "epoch": 0.5144376899696048, + "grad_norm": 1.6678650379180908, + "learning_rate": 4.802215770559578e-06, + "loss": 0.5257721543312073, + "mean_token_accuracy": 0.8241991996765137, + "num_tokens": 6177875.0, + "step": 677 + }, + { + "epoch": 0.5151975683890577, + "grad_norm": 2.1720468997955322, + "learning_rate": 4.801398502092156e-06, + "loss": 0.45342206954956055, + "mean_token_accuracy": 0.8463799953460693, + "num_tokens": 6185415.0, + "step": 678 + }, + { + "epoch": 0.5159574468085106, + "grad_norm": 2.282259702682495, + "learning_rate": 4.800579618414677e-06, + "loss": 0.4864169955253601, + "mean_token_accuracy": 0.8300632238388062, + "num_tokens": 6191832.0, + "step": 679 + }, + { + "epoch": 0.5167173252279635, + "grad_norm": 2.0092248916625977, + "learning_rate": 4.799759120101861e-06, + "loss": 0.5781463980674744, + "mean_token_accuracy": 0.8267031908035278, + "num_tokens": 6199440.0, + "step": 680 + }, + { + "epoch": 0.5174772036474165, + "grad_norm": 1.396580696105957, + "learning_rate": 4.798937007729568e-06, + "loss": 0.49689239263534546, + "mean_token_accuracy": 0.8257499933242798, + "num_tokens": 6213840.0, + "step": 681 + }, + { + "epoch": 0.5182370820668692, + "grad_norm": 1.9060769081115723, + "learning_rate": 4.798113281874788e-06, + "loss": 0.48969539999961853, + "mean_token_accuracy": 0.8171790838241577, + "num_tokens": 6223006.0, + "step": 682 + }, + { + "epoch": 0.5189969604863222, + "grad_norm": 1.6255282163619995, + "learning_rate": 4.797287943115642e-06, + "loss": 0.5532330870628357, + "mean_token_accuracy": 0.8173393607139587, + "num_tokens": 6234857.0, + "step": 683 + }, + { + "epoch": 0.5197568389057751, + "grad_norm": 1.6923905611038208, + "learning_rate": 4.796460992031386e-06, + "loss": 0.4880887269973755, + "mean_token_accuracy": 0.834983229637146, + "num_tokens": 6245252.0, + "step": 684 + }, + { + "epoch": 0.520516717325228, + "grad_norm": 2.13161301612854, + "learning_rate": 4.7956324292024045e-06, + "loss": 0.5687593817710876, + "mean_token_accuracy": 0.7996571063995361, + "num_tokens": 6253726.0, + "step": 685 + }, + { + "epoch": 0.5212765957446809, + "grad_norm": 2.509375810623169, + "learning_rate": 4.794802255210217e-06, + "loss": 0.5396929979324341, + "mean_token_accuracy": 0.8007107973098755, + "num_tokens": 6259238.0, + "step": 686 + }, + { + "epoch": 0.5220364741641338, + "grad_norm": 2.393710136413574, + "learning_rate": 4.793970470637469e-06, + "loss": 0.6165191531181335, + "mean_token_accuracy": 0.7891418933868408, + "num_tokens": 6266325.0, + "step": 687 + }, + { + "epoch": 0.5227963525835866, + "grad_norm": 1.511647343635559, + "learning_rate": 4.7931370760679415e-06, + "loss": 0.4773876965045929, + "mean_token_accuracy": 0.8381044864654541, + "num_tokens": 6277447.0, + "step": 688 + }, + { + "epoch": 0.5235562310030395, + "grad_norm": 2.206587314605713, + "learning_rate": 4.792302072086542e-06, + "loss": 0.5482058525085449, + "mean_token_accuracy": 0.8239108920097351, + "num_tokens": 6285163.0, + "step": 689 + }, + { + "epoch": 0.5243161094224924, + "grad_norm": 3.018146514892578, + "learning_rate": 4.7914654592793065e-06, + "loss": 0.4880615472793579, + "mean_token_accuracy": 0.8361308574676514, + "num_tokens": 6289386.0, + "step": 690 + }, + { + "epoch": 0.5250759878419453, + "grad_norm": 1.6469231843948364, + "learning_rate": 4.790627238233405e-06, + "loss": 0.4164774715900421, + "mean_token_accuracy": 0.8496290445327759, + "num_tokens": 6298915.0, + "step": 691 + }, + { + "epoch": 0.5258358662613982, + "grad_norm": 2.352505922317505, + "learning_rate": 4.789787409537131e-06, + "loss": 0.5366303324699402, + "mean_token_accuracy": 0.8350417613983154, + "num_tokens": 6306130.0, + "step": 692 + }, + { + "epoch": 0.526595744680851, + "grad_norm": 1.7463021278381348, + "learning_rate": 4.7889459737799105e-06, + "loss": 0.4389137923717499, + "mean_token_accuracy": 0.8463300466537476, + "num_tokens": 6315503.0, + "step": 693 + }, + { + "epoch": 0.5273556231003039, + "grad_norm": 2.257706642150879, + "learning_rate": 4.788102931552294e-06, + "loss": 0.5309344530105591, + "mean_token_accuracy": 0.8164352178573608, + "num_tokens": 6321852.0, + "step": 694 + }, + { + "epoch": 0.5281155015197568, + "grad_norm": 2.392732620239258, + "learning_rate": 4.787258283445962e-06, + "loss": 0.3956204056739807, + "mean_token_accuracy": 0.8671456575393677, + "num_tokens": 6327380.0, + "step": 695 + }, + { + "epoch": 0.5288753799392097, + "grad_norm": 2.210514545440674, + "learning_rate": 4.786412030053721e-06, + "loss": 0.4842875003814697, + "mean_token_accuracy": 0.8508446216583252, + "num_tokens": 6334898.0, + "step": 696 + }, + { + "epoch": 0.5296352583586627, + "grad_norm": 1.8678946495056152, + "learning_rate": 4.785564171969503e-06, + "loss": 0.47399595379829407, + "mean_token_accuracy": 0.8514996767044067, + "num_tokens": 6346374.0, + "step": 697 + }, + { + "epoch": 0.5303951367781155, + "grad_norm": 2.604079484939575, + "learning_rate": 4.784714709788368e-06, + "loss": 0.5950228571891785, + "mean_token_accuracy": 0.7983481884002686, + "num_tokens": 6351648.0, + "step": 698 + }, + { + "epoch": 0.5311550151975684, + "grad_norm": 1.662381649017334, + "learning_rate": 4.783863644106502e-06, + "loss": 0.41616758704185486, + "mean_token_accuracy": 0.8554803133010864, + "num_tokens": 6360506.0, + "step": 699 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 1.6300342082977295, + "learning_rate": 4.783010975521216e-06, + "loss": 0.43029269576072693, + "mean_token_accuracy": 0.8443028926849365, + "num_tokens": 6370675.0, + "step": 700 + }, + { + "epoch": 0.5326747720364742, + "grad_norm": 1.731873869895935, + "learning_rate": 4.782156704630944e-06, + "loss": 0.4383814334869385, + "mean_token_accuracy": 0.8443183898925781, + "num_tokens": 6381803.0, + "step": 701 + }, + { + "epoch": 0.5334346504559271, + "grad_norm": 3.1788413524627686, + "learning_rate": 4.7813008320352475e-06, + "loss": 0.32194480299949646, + "mean_token_accuracy": 0.8870962858200073, + "num_tokens": 6389263.0, + "step": 702 + }, + { + "epoch": 0.53419452887538, + "grad_norm": 2.099513530731201, + "learning_rate": 4.78044335833481e-06, + "loss": 0.36962923407554626, + "mean_token_accuracy": 0.8661133646965027, + "num_tokens": 6395589.0, + "step": 703 + }, + { + "epoch": 0.5349544072948328, + "grad_norm": 1.4859435558319092, + "learning_rate": 4.77958428413144e-06, + "loss": 0.4619954824447632, + "mean_token_accuracy": 0.8438555002212524, + "num_tokens": 6407470.0, + "step": 704 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 1.2561073303222656, + "learning_rate": 4.7787236100280685e-06, + "loss": 0.3770977258682251, + "mean_token_accuracy": 0.8515733480453491, + "num_tokens": 6422888.0, + "step": 705 + }, + { + "epoch": 0.5364741641337386, + "grad_norm": 1.4455817937850952, + "learning_rate": 4.777861336628751e-06, + "loss": 0.46481069922447205, + "mean_token_accuracy": 0.8502002954483032, + "num_tokens": 6441266.0, + "step": 706 + }, + { + "epoch": 0.5372340425531915, + "grad_norm": 1.1387295722961426, + "learning_rate": 4.7769974645386616e-06, + "loss": 0.36964765191078186, + "mean_token_accuracy": 0.8719524145126343, + "num_tokens": 6463686.0, + "step": 707 + }, + { + "epoch": 0.5379939209726444, + "grad_norm": 1.7179663181304932, + "learning_rate": 4.776131994364102e-06, + "loss": 0.4231719970703125, + "mean_token_accuracy": 0.8416585922241211, + "num_tokens": 6472956.0, + "step": 708 + }, + { + "epoch": 0.5387537993920972, + "grad_norm": 1.6328502893447876, + "learning_rate": 4.775264926712489e-06, + "loss": 0.5836569666862488, + "mean_token_accuracy": 0.8039724230766296, + "num_tokens": 6485773.0, + "step": 709 + }, + { + "epoch": 0.5395136778115501, + "grad_norm": 1.8515360355377197, + "learning_rate": 4.774396262192368e-06, + "loss": 0.5477553009986877, + "mean_token_accuracy": 0.8136521577835083, + "num_tokens": 6496379.0, + "step": 710 + }, + { + "epoch": 0.540273556231003, + "grad_norm": 1.741858959197998, + "learning_rate": 4.7735260014133986e-06, + "loss": 0.4663267731666565, + "mean_token_accuracy": 0.8473691940307617, + "num_tokens": 6507652.0, + "step": 711 + }, + { + "epoch": 0.541033434650456, + "grad_norm": 1.7516659498214722, + "learning_rate": 4.772654144986364e-06, + "loss": 0.374914288520813, + "mean_token_accuracy": 0.8600220680236816, + "num_tokens": 6519030.0, + "step": 712 + }, + { + "epoch": 0.5417933130699089, + "grad_norm": 2.662343978881836, + "learning_rate": 4.7717806935231665e-06, + "loss": 0.4206875264644623, + "mean_token_accuracy": 0.8544126749038696, + "num_tokens": 6523669.0, + "step": 713 + }, + { + "epoch": 0.5425531914893617, + "grad_norm": 1.4088834524154663, + "learning_rate": 4.770905647636828e-06, + "loss": 0.5824331045150757, + "mean_token_accuracy": 0.7857901453971863, + "num_tokens": 6540560.0, + "step": 714 + }, + { + "epoch": 0.5433130699088146, + "grad_norm": 2.173656940460205, + "learning_rate": 4.77002900794149e-06, + "loss": 0.555023729801178, + "mean_token_accuracy": 0.8067290782928467, + "num_tokens": 6548946.0, + "step": 715 + }, + { + "epoch": 0.5440729483282675, + "grad_norm": 2.121018648147583, + "learning_rate": 4.769150775052411e-06, + "loss": 0.559730052947998, + "mean_token_accuracy": 0.8166372776031494, + "num_tokens": 6556065.0, + "step": 716 + }, + { + "epoch": 0.5448328267477204, + "grad_norm": 3.335866928100586, + "learning_rate": 4.768270949585968e-06, + "loss": 0.6442267894744873, + "mean_token_accuracy": 0.7858607769012451, + "num_tokens": 6560615.0, + "step": 717 + }, + { + "epoch": 0.5455927051671733, + "grad_norm": 2.3813695907592773, + "learning_rate": 4.767389532159659e-06, + "loss": 0.4027421474456787, + "mean_token_accuracy": 0.8635619282722473, + "num_tokens": 6565841.0, + "step": 718 + }, + { + "epoch": 0.5463525835866262, + "grad_norm": 2.0657708644866943, + "learning_rate": 4.766506523392095e-06, + "loss": 0.38899827003479004, + "mean_token_accuracy": 0.8660480380058289, + "num_tokens": 6572362.0, + "step": 719 + }, + { + "epoch": 0.547112462006079, + "grad_norm": 1.093705415725708, + "learning_rate": 4.765621923903005e-06, + "loss": 0.45967352390289307, + "mean_token_accuracy": 0.8338102102279663, + "num_tokens": 6595998.0, + "step": 720 + }, + { + "epoch": 0.5478723404255319, + "grad_norm": 2.942065954208374, + "learning_rate": 4.764735734313236e-06, + "loss": 0.42910510301589966, + "mean_token_accuracy": 0.8406122922897339, + "num_tokens": 6601075.0, + "step": 721 + }, + { + "epoch": 0.5486322188449848, + "grad_norm": 2.049011707305908, + "learning_rate": 4.763847955244749e-06, + "loss": 0.5584231615066528, + "mean_token_accuracy": 0.8171684741973877, + "num_tokens": 6609310.0, + "step": 722 + }, + { + "epoch": 0.5493920972644377, + "grad_norm": 2.485543966293335, + "learning_rate": 4.762958587320623e-06, + "loss": 0.5396170020103455, + "mean_token_accuracy": 0.8158525824546814, + "num_tokens": 6616185.0, + "step": 723 + }, + { + "epoch": 0.5501519756838906, + "grad_norm": 1.87015962600708, + "learning_rate": 4.762067631165049e-06, + "loss": 0.49739527702331543, + "mean_token_accuracy": 0.8303765654563904, + "num_tokens": 6625629.0, + "step": 724 + }, + { + "epoch": 0.5509118541033434, + "grad_norm": 4.239654541015625, + "learning_rate": 4.761175087403336e-06, + "loss": 0.6029239296913147, + "mean_token_accuracy": 0.8123486042022705, + "num_tokens": 6629194.0, + "step": 725 + }, + { + "epoch": 0.5516717325227963, + "grad_norm": 2.0134730339050293, + "learning_rate": 4.760280956661904e-06, + "loss": 0.4777873754501343, + "mean_token_accuracy": 0.8283513784408569, + "num_tokens": 6636929.0, + "step": 726 + }, + { + "epoch": 0.5524316109422492, + "grad_norm": 1.991780400276184, + "learning_rate": 4.75938523956829e-06, + "loss": 0.4631248116493225, + "mean_token_accuracy": 0.8275107741355896, + "num_tokens": 6645135.0, + "step": 727 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 1.423792839050293, + "learning_rate": 4.75848793675114e-06, + "loss": 0.49630722403526306, + "mean_token_accuracy": 0.8388000130653381, + "num_tokens": 6662690.0, + "step": 728 + }, + { + "epoch": 0.5539513677811551, + "grad_norm": 2.345294952392578, + "learning_rate": 4.757589048840219e-06, + "loss": 0.37830638885498047, + "mean_token_accuracy": 0.8782080411911011, + "num_tokens": 6667285.0, + "step": 729 + }, + { + "epoch": 0.5547112462006079, + "grad_norm": 2.7452144622802734, + "learning_rate": 4.756688576466398e-06, + "loss": 0.51595538854599, + "mean_token_accuracy": 0.8441770672798157, + "num_tokens": 6672324.0, + "step": 730 + }, + { + "epoch": 0.5554711246200608, + "grad_norm": 1.5247859954833984, + "learning_rate": 4.755786520261666e-06, + "loss": 0.48365193605422974, + "mean_token_accuracy": 0.8276445269584656, + "num_tokens": 6685296.0, + "step": 731 + }, + { + "epoch": 0.5562310030395137, + "grad_norm": 1.4018276929855347, + "learning_rate": 4.75488288085912e-06, + "loss": 0.3876481354236603, + "mean_token_accuracy": 0.8612343072891235, + "num_tokens": 6697515.0, + "step": 732 + }, + { + "epoch": 0.5569908814589666, + "grad_norm": 2.9570324420928955, + "learning_rate": 4.753977658892967e-06, + "loss": 0.5468149185180664, + "mean_token_accuracy": 0.8054271340370178, + "num_tokens": 6702194.0, + "step": 733 + }, + { + "epoch": 0.5577507598784195, + "grad_norm": 1.9282715320587158, + "learning_rate": 4.753070854998529e-06, + "loss": 0.4758574962615967, + "mean_token_accuracy": 0.8379775285720825, + "num_tokens": 6709938.0, + "step": 734 + }, + { + "epoch": 0.5585106382978723, + "grad_norm": 1.981264591217041, + "learning_rate": 4.752162469812234e-06, + "loss": 0.48461222648620605, + "mean_token_accuracy": 0.833509087562561, + "num_tokens": 6718125.0, + "step": 735 + }, + { + "epoch": 0.5592705167173252, + "grad_norm": 1.1643427610397339, + "learning_rate": 4.751252503971624e-06, + "loss": 0.410121887922287, + "mean_token_accuracy": 0.8221402764320374, + "num_tokens": 6735125.0, + "step": 736 + }, + { + "epoch": 0.5600303951367781, + "grad_norm": 1.786566972732544, + "learning_rate": 4.750340958115346e-06, + "loss": 0.5964341163635254, + "mean_token_accuracy": 0.8038164377212524, + "num_tokens": 6747369.0, + "step": 737 + }, + { + "epoch": 0.560790273556231, + "grad_norm": 1.7256991863250732, + "learning_rate": 4.749427832883158e-06, + "loss": 0.48737066984176636, + "mean_token_accuracy": 0.830894947052002, + "num_tokens": 6758115.0, + "step": 738 + }, + { + "epoch": 0.5615501519756839, + "grad_norm": 1.997747540473938, + "learning_rate": 4.748513128915928e-06, + "loss": 0.5238886475563049, + "mean_token_accuracy": 0.8066858053207397, + "num_tokens": 6766111.0, + "step": 739 + }, + { + "epoch": 0.5623100303951368, + "grad_norm": 2.127016305923462, + "learning_rate": 4.747596846855629e-06, + "loss": 0.5045586228370667, + "mean_token_accuracy": 0.821424126625061, + "num_tokens": 6772893.0, + "step": 740 + }, + { + "epoch": 0.5630699088145896, + "grad_norm": 1.7664796113967896, + "learning_rate": 4.7466789873453446e-06, + "loss": 0.42954835295677185, + "mean_token_accuracy": 0.8533384799957275, + "num_tokens": 6785133.0, + "step": 741 + }, + { + "epoch": 0.5638297872340425, + "grad_norm": 1.4987404346466064, + "learning_rate": 4.7457595510292615e-06, + "loss": 0.5378558039665222, + "mean_token_accuracy": 0.8184819221496582, + "num_tokens": 6799563.0, + "step": 742 + }, + { + "epoch": 0.5645896656534954, + "grad_norm": 1.4444655179977417, + "learning_rate": 4.744838538552678e-06, + "loss": 0.42193782329559326, + "mean_token_accuracy": 0.837514340877533, + "num_tokens": 6812470.0, + "step": 743 + }, + { + "epoch": 0.5653495440729484, + "grad_norm": 3.867751121520996, + "learning_rate": 4.7439159505619946e-06, + "loss": 0.4457814693450928, + "mean_token_accuracy": 0.8630104660987854, + "num_tokens": 6815652.0, + "step": 744 + }, + { + "epoch": 0.5661094224924013, + "grad_norm": 2.1250710487365723, + "learning_rate": 4.74299178770472e-06, + "loss": 0.5638922452926636, + "mean_token_accuracy": 0.7969781160354614, + "num_tokens": 6824566.0, + "step": 745 + }, + { + "epoch": 0.5668693009118541, + "grad_norm": 2.547072410583496, + "learning_rate": 4.742066050629465e-06, + "loss": 0.5516207814216614, + "mean_token_accuracy": 0.8160669803619385, + "num_tokens": 6830589.0, + "step": 746 + }, + { + "epoch": 0.567629179331307, + "grad_norm": 1.2975233793258667, + "learning_rate": 4.741138739985951e-06, + "loss": 0.3823344111442566, + "mean_token_accuracy": 0.8668368458747864, + "num_tokens": 6842707.0, + "step": 747 + }, + { + "epoch": 0.5683890577507599, + "grad_norm": 1.3410450220108032, + "learning_rate": 4.740209856424998e-06, + "loss": 0.5148671269416809, + "mean_token_accuracy": 0.8188045024871826, + "num_tokens": 6857624.0, + "step": 748 + }, + { + "epoch": 0.5691489361702128, + "grad_norm": 1.219467282295227, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.3998957872390747, + "mean_token_accuracy": 0.855175256729126, + "num_tokens": 6875064.0, + "step": 749 + }, + { + "epoch": 0.5699088145896657, + "grad_norm": 1.3530343770980835, + "learning_rate": 4.738347373159585e-06, + "loss": 0.5359633564949036, + "mean_token_accuracy": 0.8178457021713257, + "num_tokens": 6890911.0, + "step": 750 + }, + { + "epoch": 0.5706686930091185, + "grad_norm": 2.146988868713379, + "learning_rate": 4.737413774762287e-06, + "loss": 0.4460008144378662, + "mean_token_accuracy": 0.8172903060913086, + "num_tokens": 6896959.0, + "step": 751 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.456023097038269, + "learning_rate": 4.736478606061876e-06, + "loss": 0.43616920709609985, + "mean_token_accuracy": 0.8465108871459961, + "num_tokens": 6908904.0, + "step": 752 + }, + { + "epoch": 0.5721884498480243, + "grad_norm": 2.9696967601776123, + "learning_rate": 4.735541867714687e-06, + "loss": 0.43464532494544983, + "mean_token_accuracy": 0.8608652353286743, + "num_tokens": 6913026.0, + "step": 753 + }, + { + "epoch": 0.5729483282674772, + "grad_norm": 2.2990667819976807, + "learning_rate": 4.73460356037816e-06, + "loss": 0.6619116067886353, + "mean_token_accuracy": 0.7821142673492432, + "num_tokens": 6920588.0, + "step": 754 + }, + { + "epoch": 0.5737082066869301, + "grad_norm": 2.054746389389038, + "learning_rate": 4.733663684710835e-06, + "loss": 0.5304250717163086, + "mean_token_accuracy": 0.8265531063079834, + "num_tokens": 6928910.0, + "step": 755 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.0050594806671143, + "learning_rate": 4.732722241372354e-06, + "loss": 0.6393026113510132, + "mean_token_accuracy": 0.796819806098938, + "num_tokens": 6940217.0, + "step": 756 + }, + { + "epoch": 0.5752279635258358, + "grad_norm": 1.4285320043563843, + "learning_rate": 4.731779231023456e-06, + "loss": 0.5432837009429932, + "mean_token_accuracy": 0.8104778528213501, + "num_tokens": 6959101.0, + "step": 757 + }, + { + "epoch": 0.5759878419452887, + "grad_norm": 2.3941943645477295, + "learning_rate": 4.730834654325984e-06, + "loss": 0.46550673246383667, + "mean_token_accuracy": 0.8444503545761108, + "num_tokens": 6965036.0, + "step": 758 + }, + { + "epoch": 0.5767477203647416, + "grad_norm": 2.3850574493408203, + "learning_rate": 4.729888511942877e-06, + "loss": 0.4916389584541321, + "mean_token_accuracy": 0.8228527307510376, + "num_tokens": 6971184.0, + "step": 759 + }, + { + "epoch": 0.5775075987841946, + "grad_norm": 1.627480149269104, + "learning_rate": 4.728940804538176e-06, + "loss": 0.5863215923309326, + "mean_token_accuracy": 0.7995302677154541, + "num_tokens": 6982569.0, + "step": 760 + }, + { + "epoch": 0.5782674772036475, + "grad_norm": 1.1723195314407349, + "learning_rate": 4.727991532777016e-06, + "loss": 0.36908864974975586, + "mean_token_accuracy": 0.8355655670166016, + "num_tokens": 6998659.0, + "step": 761 + }, + { + "epoch": 0.5790273556231003, + "grad_norm": 1.5324925184249878, + "learning_rate": 4.727040697325634e-06, + "loss": 0.557658851146698, + "mean_token_accuracy": 0.8141458034515381, + "num_tokens": 7012969.0, + "step": 762 + }, + { + "epoch": 0.5797872340425532, + "grad_norm": 2.4106390476226807, + "learning_rate": 4.726088298851362e-06, + "loss": 0.5004243850708008, + "mean_token_accuracy": 0.8376860618591309, + "num_tokens": 7018301.0, + "step": 763 + }, + { + "epoch": 0.5805471124620061, + "grad_norm": 2.2594921588897705, + "learning_rate": 4.725134338022631e-06, + "loss": 0.6067016124725342, + "mean_token_accuracy": 0.8100241422653198, + "num_tokens": 7025201.0, + "step": 764 + }, + { + "epoch": 0.581306990881459, + "grad_norm": 1.4649826288223267, + "learning_rate": 4.724178815508967e-06, + "loss": 0.36200693249702454, + "mean_token_accuracy": 0.8621826171875, + "num_tokens": 7035112.0, + "step": 765 + }, + { + "epoch": 0.5820668693009119, + "grad_norm": 2.3634560108184814, + "learning_rate": 4.723221731980993e-06, + "loss": 0.41862213611602783, + "mean_token_accuracy": 0.8541463613510132, + "num_tokens": 7040339.0, + "step": 766 + }, + { + "epoch": 0.5828267477203647, + "grad_norm": 2.7798104286193848, + "learning_rate": 4.722263088110426e-06, + "loss": 0.4647108018398285, + "mean_token_accuracy": 0.8505672216415405, + "num_tokens": 7044880.0, + "step": 767 + }, + { + "epoch": 0.5835866261398176, + "grad_norm": 2.070528507232666, + "learning_rate": 4.721302884570079e-06, + "loss": 0.5147565007209778, + "mean_token_accuracy": 0.8113877773284912, + "num_tokens": 7052433.0, + "step": 768 + }, + { + "epoch": 0.5843465045592705, + "grad_norm": 2.1953284740448, + "learning_rate": 4.720341122033862e-06, + "loss": 0.5075466632843018, + "mean_token_accuracy": 0.8474211096763611, + "num_tokens": 7058686.0, + "step": 769 + }, + { + "epoch": 0.5851063829787234, + "grad_norm": 1.9287755489349365, + "learning_rate": 4.719377801176774e-06, + "loss": 0.5382202863693237, + "mean_token_accuracy": 0.8148090243339539, + "num_tokens": 7067538.0, + "step": 770 + }, + { + "epoch": 0.5858662613981763, + "grad_norm": 1.5574456453323364, + "learning_rate": 4.718412922674913e-06, + "loss": 0.43406790494918823, + "mean_token_accuracy": 0.8477081060409546, + "num_tokens": 7077853.0, + "step": 771 + }, + { + "epoch": 0.5866261398176292, + "grad_norm": 1.5490336418151855, + "learning_rate": 4.717446487205466e-06, + "loss": 0.43164271116256714, + "mean_token_accuracy": 0.8504570126533508, + "num_tokens": 7091728.0, + "step": 772 + }, + { + "epoch": 0.587386018237082, + "grad_norm": 1.6945984363555908, + "learning_rate": 4.716478495446717e-06, + "loss": 0.5153743624687195, + "mean_token_accuracy": 0.8213579058647156, + "num_tokens": 7108680.0, + "step": 773 + }, + { + "epoch": 0.5881458966565349, + "grad_norm": 2.2633883953094482, + "learning_rate": 4.715508948078037e-06, + "loss": 0.45254790782928467, + "mean_token_accuracy": 0.8392219543457031, + "num_tokens": 7115546.0, + "step": 774 + }, + { + "epoch": 0.5889057750759878, + "grad_norm": 1.5731090307235718, + "learning_rate": 4.714537845779894e-06, + "loss": 0.38678881525993347, + "mean_token_accuracy": 0.8800252676010132, + "num_tokens": 7126360.0, + "step": 775 + }, + { + "epoch": 0.5896656534954408, + "grad_norm": 2.4873392581939697, + "learning_rate": 4.7135651892338445e-06, + "loss": 0.5190927386283875, + "mean_token_accuracy": 0.8145407438278198, + "num_tokens": 7135705.0, + "step": 776 + }, + { + "epoch": 0.5904255319148937, + "grad_norm": 1.2931004762649536, + "learning_rate": 4.712590979122534e-06, + "loss": 0.3686544895172119, + "mean_token_accuracy": 0.8720537424087524, + "num_tokens": 7150688.0, + "step": 777 + }, + { + "epoch": 0.5911854103343465, + "grad_norm": 1.6353671550750732, + "learning_rate": 4.7116152161297045e-06, + "loss": 0.49065062403678894, + "mean_token_accuracy": 0.8203760385513306, + "num_tokens": 7161040.0, + "step": 778 + }, + { + "epoch": 0.5919452887537994, + "grad_norm": 1.2345483303070068, + "learning_rate": 4.710637900940181e-06, + "loss": 0.4004976451396942, + "mean_token_accuracy": 0.8302007913589478, + "num_tokens": 7178074.0, + "step": 779 + }, + { + "epoch": 0.5927051671732523, + "grad_norm": 2.2506837844848633, + "learning_rate": 4.7096590342398825e-06, + "loss": 0.45142874121665955, + "mean_token_accuracy": 0.8481036424636841, + "num_tokens": 7184153.0, + "step": 780 + }, + { + "epoch": 0.5934650455927052, + "grad_norm": 1.420479416847229, + "learning_rate": 4.708678616715815e-06, + "loss": 0.4802100360393524, + "mean_token_accuracy": 0.8586992025375366, + "num_tokens": 7202810.0, + "step": 781 + }, + { + "epoch": 0.5942249240121581, + "grad_norm": 3.457632303237915, + "learning_rate": 4.707696649056073e-06, + "loss": 0.5265094041824341, + "mean_token_accuracy": 0.8260114192962646, + "num_tokens": 7206396.0, + "step": 782 + }, + { + "epoch": 0.5949848024316109, + "grad_norm": 1.1592093706130981, + "learning_rate": 4.706713131949839e-06, + "loss": 0.3708173632621765, + "mean_token_accuracy": 0.8476542234420776, + "num_tokens": 7225034.0, + "step": 783 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 1.6761400699615479, + "learning_rate": 4.705728066087384e-06, + "loss": 0.4137252867221832, + "mean_token_accuracy": 0.8462049961090088, + "num_tokens": 7237101.0, + "step": 784 + }, + { + "epoch": 0.5965045592705167, + "grad_norm": 2.320185422897339, + "learning_rate": 4.704741452160064e-06, + "loss": 0.5157154202461243, + "mean_token_accuracy": 0.8391785621643066, + "num_tokens": 7243826.0, + "step": 785 + }, + { + "epoch": 0.5972644376899696, + "grad_norm": 2.079423427581787, + "learning_rate": 4.703753290860323e-06, + "loss": 0.4734993278980255, + "mean_token_accuracy": 0.8353281021118164, + "num_tokens": 7250175.0, + "step": 786 + }, + { + "epoch": 0.5980243161094225, + "grad_norm": 1.8215159177780151, + "learning_rate": 4.702763582881692e-06, + "loss": 0.520193338394165, + "mean_token_accuracy": 0.844062864780426, + "num_tokens": 7258868.0, + "step": 787 + }, + { + "epoch": 0.5987841945288754, + "grad_norm": 1.3823071718215942, + "learning_rate": 4.701772328918784e-06, + "loss": 0.4177844822406769, + "mean_token_accuracy": 0.8363165259361267, + "num_tokens": 7271744.0, + "step": 788 + }, + { + "epoch": 0.5995440729483282, + "grad_norm": 2.4749298095703125, + "learning_rate": 4.700779529667301e-06, + "loss": 0.5115069150924683, + "mean_token_accuracy": 0.8473520278930664, + "num_tokens": 7277040.0, + "step": 789 + }, + { + "epoch": 0.6003039513677811, + "grad_norm": 1.7072296142578125, + "learning_rate": 4.699785185824026e-06, + "loss": 0.5265800952911377, + "mean_token_accuracy": 0.8161447048187256, + "num_tokens": 7288288.0, + "step": 790 + }, + { + "epoch": 0.601063829787234, + "grad_norm": 1.6479384899139404, + "learning_rate": 4.69878929808683e-06, + "loss": 0.4445168972015381, + "mean_token_accuracy": 0.8381255865097046, + "num_tokens": 7298640.0, + "step": 791 + }, + { + "epoch": 0.601823708206687, + "grad_norm": 1.9095896482467651, + "learning_rate": 4.6977918671546635e-06, + "loss": 0.5841238498687744, + "mean_token_accuracy": 0.7971454858779907, + "num_tokens": 7307220.0, + "step": 792 + }, + { + "epoch": 0.6025835866261399, + "grad_norm": 1.9614146947860718, + "learning_rate": 4.696792893727562e-06, + "loss": 0.34684082865715027, + "mean_token_accuracy": 0.8739526271820068, + "num_tokens": 7313875.0, + "step": 793 + }, + { + "epoch": 0.6033434650455927, + "grad_norm": 2.015570640563965, + "learning_rate": 4.695792378506645e-06, + "loss": 0.42779117822647095, + "mean_token_accuracy": 0.8625012636184692, + "num_tokens": 7321439.0, + "step": 794 + }, + { + "epoch": 0.6041033434650456, + "grad_norm": 2.8581228256225586, + "learning_rate": 4.694790322194111e-06, + "loss": 0.6519991159439087, + "mean_token_accuracy": 0.7629562616348267, + "num_tokens": 7326916.0, + "step": 795 + }, + { + "epoch": 0.6048632218844985, + "grad_norm": 2.482715368270874, + "learning_rate": 4.693786725493242e-06, + "loss": 0.532963216304779, + "mean_token_accuracy": 0.832184910774231, + "num_tokens": 7333311.0, + "step": 796 + }, + { + "epoch": 0.6056231003039514, + "grad_norm": 1.6076741218566895, + "learning_rate": 4.692781589108402e-06, + "loss": 0.43381205201148987, + "mean_token_accuracy": 0.8402494192123413, + "num_tokens": 7343731.0, + "step": 797 + }, + { + "epoch": 0.6063829787234043, + "grad_norm": 2.2133216857910156, + "learning_rate": 4.691774913745033e-06, + "loss": 0.4380851089954376, + "mean_token_accuracy": 0.8600908517837524, + "num_tokens": 7350224.0, + "step": 798 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 2.046280860900879, + "learning_rate": 4.690766700109659e-06, + "loss": 0.3821919560432434, + "mean_token_accuracy": 0.8691814541816711, + "num_tokens": 7356717.0, + "step": 799 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 1.8482693433761597, + "learning_rate": 4.689756948909884e-06, + "loss": 0.5217651128768921, + "mean_token_accuracy": 0.803473711013794, + "num_tokens": 7365806.0, + "step": 800 + }, + { + "epoch": 0.6086626139817629, + "grad_norm": 2.192134141921997, + "learning_rate": 4.688745660854388e-06, + "loss": 0.573980987071991, + "mean_token_accuracy": 0.8198676109313965, + "num_tokens": 7380281.0, + "step": 801 + }, + { + "epoch": 0.6094224924012158, + "grad_norm": 2.363626718521118, + "learning_rate": 4.687732836652935e-06, + "loss": 0.5204599499702454, + "mean_token_accuracy": 0.8373252153396606, + "num_tokens": 7386938.0, + "step": 802 + }, + { + "epoch": 0.6101823708206687, + "grad_norm": 1.9320523738861084, + "learning_rate": 4.686718477016361e-06, + "loss": 0.47316622734069824, + "mean_token_accuracy": 0.830596923828125, + "num_tokens": 7395069.0, + "step": 803 + }, + { + "epoch": 0.6109422492401215, + "grad_norm": 2.6573057174682617, + "learning_rate": 4.6857025826565845e-06, + "loss": 0.5495861768722534, + "mean_token_accuracy": 0.8187421560287476, + "num_tokens": 7400563.0, + "step": 804 + }, + { + "epoch": 0.6117021276595744, + "grad_norm": 2.0893123149871826, + "learning_rate": 4.684685154286599e-06, + "loss": 0.5362675786018372, + "mean_token_accuracy": 0.8394701480865479, + "num_tokens": 7406973.0, + "step": 805 + }, + { + "epoch": 0.6124620060790273, + "grad_norm": 2.455130100250244, + "learning_rate": 4.683666192620474e-06, + "loss": 0.5405995845794678, + "mean_token_accuracy": 0.8079100847244263, + "num_tokens": 7412931.0, + "step": 806 + }, + { + "epoch": 0.6132218844984803, + "grad_norm": 2.311915636062622, + "learning_rate": 4.682645698373357e-06, + "loss": 0.5395106077194214, + "mean_token_accuracy": 0.8156260251998901, + "num_tokens": 7419699.0, + "step": 807 + }, + { + "epoch": 0.6139817629179332, + "grad_norm": 1.686838984489441, + "learning_rate": 4.6816236722614694e-06, + "loss": 0.6034521460533142, + "mean_token_accuracy": 0.7855954170227051, + "num_tokens": 7431899.0, + "step": 808 + }, + { + "epoch": 0.6147416413373861, + "grad_norm": 1.682759165763855, + "learning_rate": 4.680600115002109e-06, + "loss": 0.48593831062316895, + "mean_token_accuracy": 0.8229435682296753, + "num_tokens": 7443187.0, + "step": 809 + }, + { + "epoch": 0.6155015197568389, + "grad_norm": 2.064589738845825, + "learning_rate": 4.679575027313649e-06, + "loss": 0.5098468661308289, + "mean_token_accuracy": 0.8234638571739197, + "num_tokens": 7450868.0, + "step": 810 + }, + { + "epoch": 0.6162613981762918, + "grad_norm": 2.2063486576080322, + "learning_rate": 4.6785484099155324e-06, + "loss": 0.5138497352600098, + "mean_token_accuracy": 0.8152111172676086, + "num_tokens": 7457176.0, + "step": 811 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 1.6258726119995117, + "learning_rate": 4.67752026352828e-06, + "loss": 0.4064181447029114, + "mean_token_accuracy": 0.8720619678497314, + "num_tokens": 7466557.0, + "step": 812 + }, + { + "epoch": 0.6177811550151976, + "grad_norm": 2.3309383392333984, + "learning_rate": 4.676490588873486e-06, + "loss": 0.5180112719535828, + "mean_token_accuracy": 0.8233879804611206, + "num_tokens": 7472650.0, + "step": 813 + }, + { + "epoch": 0.6185410334346505, + "grad_norm": 1.4545246362686157, + "learning_rate": 4.675459386673815e-06, + "loss": 0.37917959690093994, + "mean_token_accuracy": 0.8598103523254395, + "num_tokens": 7485171.0, + "step": 814 + }, + { + "epoch": 0.6193009118541033, + "grad_norm": 2.654231071472168, + "learning_rate": 4.674426657653003e-06, + "loss": 0.554074227809906, + "mean_token_accuracy": 0.8026446104049683, + "num_tokens": 7490787.0, + "step": 815 + }, + { + "epoch": 0.6200607902735562, + "grad_norm": 1.5543994903564453, + "learning_rate": 4.67339240253586e-06, + "loss": 0.6335440278053284, + "mean_token_accuracy": 0.783241868019104, + "num_tokens": 7505975.0, + "step": 816 + }, + { + "epoch": 0.6208206686930091, + "grad_norm": 2.079998016357422, + "learning_rate": 4.672356622048266e-06, + "loss": 0.5169394016265869, + "mean_token_accuracy": 0.8088761568069458, + "num_tokens": 7513470.0, + "step": 817 + }, + { + "epoch": 0.621580547112462, + "grad_norm": 1.5971896648406982, + "learning_rate": 4.671319316917172e-06, + "loss": 0.44588586688041687, + "mean_token_accuracy": 0.8518649339675903, + "num_tokens": 7524352.0, + "step": 818 + }, + { + "epoch": 0.6223404255319149, + "grad_norm": 2.477579116821289, + "learning_rate": 4.670280487870599e-06, + "loss": 0.5713893175125122, + "mean_token_accuracy": 0.8116940259933472, + "num_tokens": 7530359.0, + "step": 819 + }, + { + "epoch": 0.6231003039513677, + "grad_norm": 2.066211700439453, + "learning_rate": 4.669240135637635e-06, + "loss": 0.5295331478118896, + "mean_token_accuracy": 0.819536566734314, + "num_tokens": 7536963.0, + "step": 820 + }, + { + "epoch": 0.6238601823708206, + "grad_norm": 2.1217997074127197, + "learning_rate": 4.668198260948442e-06, + "loss": 0.6146406531333923, + "mean_token_accuracy": 0.7932635545730591, + "num_tokens": 7545800.0, + "step": 821 + }, + { + "epoch": 0.6246200607902735, + "grad_norm": 2.0173542499542236, + "learning_rate": 4.667154864534245e-06, + "loss": 0.6240535974502563, + "mean_token_accuracy": 0.7883644104003906, + "num_tokens": 7556165.0, + "step": 822 + }, + { + "epoch": 0.6253799392097265, + "grad_norm": 2.014526128768921, + "learning_rate": 4.666109947127343e-06, + "loss": 0.40367332100868225, + "mean_token_accuracy": 0.8653522729873657, + "num_tokens": 7562665.0, + "step": 823 + }, + { + "epoch": 0.6261398176291794, + "grad_norm": 2.5078861713409424, + "learning_rate": 4.665063509461098e-06, + "loss": 0.5903617739677429, + "mean_token_accuracy": 0.7902897596359253, + "num_tokens": 7568922.0, + "step": 824 + }, + { + "epoch": 0.6268996960486323, + "grad_norm": 2.454622745513916, + "learning_rate": 4.664015552269938e-06, + "loss": 0.5238361358642578, + "mean_token_accuracy": 0.838546872138977, + "num_tokens": 7575965.0, + "step": 825 + }, + { + "epoch": 0.6276595744680851, + "grad_norm": 2.920919418334961, + "learning_rate": 4.662966076289363e-06, + "loss": 0.5028782486915588, + "mean_token_accuracy": 0.8311152458190918, + "num_tokens": 7580193.0, + "step": 826 + }, + { + "epoch": 0.628419452887538, + "grad_norm": 1.545382022857666, + "learning_rate": 4.661915082255932e-06, + "loss": 0.4817378520965576, + "mean_token_accuracy": 0.8373227119445801, + "num_tokens": 7593024.0, + "step": 827 + }, + { + "epoch": 0.6291793313069909, + "grad_norm": 1.5152469873428345, + "learning_rate": 4.6608625709072766e-06, + "loss": 0.4693033695220947, + "mean_token_accuracy": 0.8150848150253296, + "num_tokens": 7606459.0, + "step": 828 + }, + { + "epoch": 0.6299392097264438, + "grad_norm": 2.1310224533081055, + "learning_rate": 4.659808542982089e-06, + "loss": 0.4653395414352417, + "mean_token_accuracy": 0.8286294341087341, + "num_tokens": 7613036.0, + "step": 829 + }, + { + "epoch": 0.6306990881458967, + "grad_norm": 2.1949679851531982, + "learning_rate": 4.658752999220125e-06, + "loss": 0.3698633909225464, + "mean_token_accuracy": 0.871590793132782, + "num_tokens": 7618527.0, + "step": 830 + }, + { + "epoch": 0.6314589665653495, + "grad_norm": 2.2770416736602783, + "learning_rate": 4.657695940362207e-06, + "loss": 0.5202419757843018, + "mean_token_accuracy": 0.817577600479126, + "num_tokens": 7624459.0, + "step": 831 + }, + { + "epoch": 0.6322188449848024, + "grad_norm": 1.402042269706726, + "learning_rate": 4.65663736715022e-06, + "loss": 0.51531583070755, + "mean_token_accuracy": 0.8228116631507874, + "num_tokens": 7639371.0, + "step": 832 + }, + { + "epoch": 0.6329787234042553, + "grad_norm": 3.3554883003234863, + "learning_rate": 4.65557728032711e-06, + "loss": 0.6771188378334045, + "mean_token_accuracy": 0.7880028486251831, + "num_tokens": 7643924.0, + "step": 833 + }, + { + "epoch": 0.6337386018237082, + "grad_norm": 2.081040143966675, + "learning_rate": 4.654515680636888e-06, + "loss": 0.5712796449661255, + "mean_token_accuracy": 0.8177868127822876, + "num_tokens": 7651881.0, + "step": 834 + }, + { + "epoch": 0.6344984802431611, + "grad_norm": 0.9128716588020325, + "learning_rate": 4.653452568824625e-06, + "loss": 0.3423936069011688, + "mean_token_accuracy": 0.8782886266708374, + "num_tokens": 7677829.0, + "step": 835 + }, + { + "epoch": 0.6352583586626139, + "grad_norm": 3.49015736579895, + "learning_rate": 4.652387945636454e-06, + "loss": 0.34657734632492065, + "mean_token_accuracy": 0.8770567178726196, + "num_tokens": 7680796.0, + "step": 836 + }, + { + "epoch": 0.6360182370820668, + "grad_norm": 2.026247501373291, + "learning_rate": 4.651321811819568e-06, + "loss": 0.5098431706428528, + "mean_token_accuracy": 0.8216961622238159, + "num_tokens": 7688746.0, + "step": 837 + }, + { + "epoch": 0.6367781155015197, + "grad_norm": 2.444343090057373, + "learning_rate": 4.650254168122222e-06, + "loss": 0.5490090250968933, + "mean_token_accuracy": 0.8092857599258423, + "num_tokens": 7695220.0, + "step": 838 + }, + { + "epoch": 0.6375379939209727, + "grad_norm": 2.0171122550964355, + "learning_rate": 4.649185015293728e-06, + "loss": 0.47221142053604126, + "mean_token_accuracy": 0.8514408469200134, + "num_tokens": 7702759.0, + "step": 839 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 1.9800984859466553, + "learning_rate": 4.64811435408446e-06, + "loss": 0.5238803625106812, + "mean_token_accuracy": 0.8479194641113281, + "num_tokens": 7714017.0, + "step": 840 + }, + { + "epoch": 0.6390577507598785, + "grad_norm": 3.0674357414245605, + "learning_rate": 4.647042185245848e-06, + "loss": 0.4668245315551758, + "mean_token_accuracy": 0.8381714820861816, + "num_tokens": 7717801.0, + "step": 841 + }, + { + "epoch": 0.6398176291793313, + "grad_norm": 1.5672820806503296, + "learning_rate": 4.645968509530381e-06, + "loss": 0.4428741931915283, + "mean_token_accuracy": 0.8416479825973511, + "num_tokens": 7728342.0, + "step": 842 + }, + { + "epoch": 0.6405775075987842, + "grad_norm": 2.3042354583740234, + "learning_rate": 4.644893327691608e-06, + "loss": 0.49937760829925537, + "mean_token_accuracy": 0.827070951461792, + "num_tokens": 7734576.0, + "step": 843 + }, + { + "epoch": 0.6413373860182371, + "grad_norm": 2.057772159576416, + "learning_rate": 4.6438166404841316e-06, + "loss": 0.5912986993789673, + "mean_token_accuracy": 0.805509090423584, + "num_tokens": 7742481.0, + "step": 844 + }, + { + "epoch": 0.64209726443769, + "grad_norm": 1.9688186645507812, + "learning_rate": 4.6427384486636115e-06, + "loss": 0.482401967048645, + "mean_token_accuracy": 0.8358086347579956, + "num_tokens": 7750002.0, + "step": 845 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 2.6852948665618896, + "learning_rate": 4.6416587529867665e-06, + "loss": 0.5479315519332886, + "mean_token_accuracy": 0.8091106414794922, + "num_tokens": 7755578.0, + "step": 846 + }, + { + "epoch": 0.6436170212765957, + "grad_norm": 2.0547337532043457, + "learning_rate": 4.640577554211366e-06, + "loss": 0.5327274203300476, + "mean_token_accuracy": 0.8280376195907593, + "num_tokens": 7763513.0, + "step": 847 + }, + { + "epoch": 0.6443768996960486, + "grad_norm": 2.0328633785247803, + "learning_rate": 4.63949485309624e-06, + "loss": 0.4814409613609314, + "mean_token_accuracy": 0.8527672290802002, + "num_tokens": 7771131.0, + "step": 848 + }, + { + "epoch": 0.6451367781155015, + "grad_norm": 1.5892863273620605, + "learning_rate": 4.638410650401267e-06, + "loss": 0.4492785334587097, + "mean_token_accuracy": 0.846997857093811, + "num_tokens": 7781572.0, + "step": 849 + }, + { + "epoch": 0.6458966565349544, + "grad_norm": 1.8295910358428955, + "learning_rate": 4.637324946887384e-06, + "loss": 0.37088239192962646, + "mean_token_accuracy": 0.8616628646850586, + "num_tokens": 7788604.0, + "step": 850 + }, + { + "epoch": 0.6466565349544073, + "grad_norm": 3.380040168762207, + "learning_rate": 4.636237743316578e-06, + "loss": 0.4737280607223511, + "mean_token_accuracy": 0.855940580368042, + "num_tokens": 7792504.0, + "step": 851 + }, + { + "epoch": 0.6474164133738601, + "grad_norm": 2.8790009021759033, + "learning_rate": 4.635149040451891e-06, + "loss": 0.39790448546409607, + "mean_token_accuracy": 0.8710698485374451, + "num_tokens": 7796333.0, + "step": 852 + }, + { + "epoch": 0.648176291793313, + "grad_norm": 1.914914608001709, + "learning_rate": 4.634058839057417e-06, + "loss": 0.2954312562942505, + "mean_token_accuracy": 0.8880234956741333, + "num_tokens": 7802456.0, + "step": 853 + }, + { + "epoch": 0.648936170212766, + "grad_norm": 1.3709120750427246, + "learning_rate": 4.632967139898301e-06, + "loss": 0.43224576115608215, + "mean_token_accuracy": 0.8446190357208252, + "num_tokens": 7816770.0, + "step": 854 + }, + { + "epoch": 0.6496960486322189, + "grad_norm": 1.6579312086105347, + "learning_rate": 4.63187394374074e-06, + "loss": 0.3535553514957428, + "mean_token_accuracy": 0.8738704919815063, + "num_tokens": 7824963.0, + "step": 855 + }, + { + "epoch": 0.6504559270516718, + "grad_norm": 2.4055678844451904, + "learning_rate": 4.63077925135198e-06, + "loss": 0.5078744292259216, + "mean_token_accuracy": 0.8430874347686768, + "num_tokens": 7830962.0, + "step": 856 + }, + { + "epoch": 0.6512158054711246, + "grad_norm": 2.5171499252319336, + "learning_rate": 4.629683063500319e-06, + "loss": 0.5172419548034668, + "mean_token_accuracy": 0.8087141513824463, + "num_tokens": 7836638.0, + "step": 857 + }, + { + "epoch": 0.6519756838905775, + "grad_norm": 1.7588486671447754, + "learning_rate": 4.628585380955104e-06, + "loss": 0.5759496092796326, + "mean_token_accuracy": 0.8043236136436462, + "num_tokens": 7844654.0, + "step": 858 + }, + { + "epoch": 0.6527355623100304, + "grad_norm": 1.5887070894241333, + "learning_rate": 4.62748620448673e-06, + "loss": 0.41849038004875183, + "mean_token_accuracy": 0.8556643724441528, + "num_tokens": 7855642.0, + "step": 859 + }, + { + "epoch": 0.6534954407294833, + "grad_norm": 3.227942705154419, + "learning_rate": 4.626385534866642e-06, + "loss": 0.5279449224472046, + "mean_token_accuracy": 0.8250958323478699, + "num_tokens": 7859890.0, + "step": 860 + }, + { + "epoch": 0.6542553191489362, + "grad_norm": 2.440467119216919, + "learning_rate": 4.625283372867333e-06, + "loss": 0.5294933319091797, + "mean_token_accuracy": 0.8235013484954834, + "num_tokens": 7866766.0, + "step": 861 + }, + { + "epoch": 0.6550151975683891, + "grad_norm": 2.4106903076171875, + "learning_rate": 4.624179719262342e-06, + "loss": 0.5662813186645508, + "mean_token_accuracy": 0.8061668872833252, + "num_tokens": 7872809.0, + "step": 862 + }, + { + "epoch": 0.6557750759878419, + "grad_norm": 3.5151145458221436, + "learning_rate": 4.623074574826254e-06, + "loss": 0.5471097230911255, + "mean_token_accuracy": 0.8220691084861755, + "num_tokens": 7876136.0, + "step": 863 + }, + { + "epoch": 0.6565349544072948, + "grad_norm": 1.5319840908050537, + "learning_rate": 4.621967940334705e-06, + "loss": 0.4178982377052307, + "mean_token_accuracy": 0.8517135977745056, + "num_tokens": 7886113.0, + "step": 864 + }, + { + "epoch": 0.6572948328267477, + "grad_norm": 1.63701331615448, + "learning_rate": 4.620859816564371e-06, + "loss": 0.4666512608528137, + "mean_token_accuracy": 0.8223508596420288, + "num_tokens": 7897982.0, + "step": 865 + }, + { + "epoch": 0.6580547112462006, + "grad_norm": 2.1515414714813232, + "learning_rate": 4.619750204292978e-06, + "loss": 0.5359305143356323, + "mean_token_accuracy": 0.8192868232727051, + "num_tokens": 7904947.0, + "step": 866 + }, + { + "epoch": 0.6588145896656535, + "grad_norm": 2.2140955924987793, + "learning_rate": 4.618639104299294e-06, + "loss": 0.5275633931159973, + "mean_token_accuracy": 0.8120715618133545, + "num_tokens": 7913913.0, + "step": 867 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 1.3956893682479858, + "learning_rate": 4.6175265173631304e-06, + "loss": 0.4378768503665924, + "mean_token_accuracy": 0.8479125499725342, + "num_tokens": 7927979.0, + "step": 868 + }, + { + "epoch": 0.6603343465045592, + "grad_norm": 2.98103928565979, + "learning_rate": 4.616412444265344e-06, + "loss": 0.42614591121673584, + "mean_token_accuracy": 0.8595094680786133, + "num_tokens": 7934293.0, + "step": 869 + }, + { + "epoch": 0.6610942249240122, + "grad_norm": 2.554845094680786, + "learning_rate": 4.6152968857878365e-06, + "loss": 0.3698030412197113, + "mean_token_accuracy": 0.8717041015625, + "num_tokens": 7938547.0, + "step": 870 + }, + { + "epoch": 0.6618541033434651, + "grad_norm": 3.0901825428009033, + "learning_rate": 4.6141798427135475e-06, + "loss": 0.5037497282028198, + "mean_token_accuracy": 0.8354041576385498, + "num_tokens": 7942829.0, + "step": 871 + }, + { + "epoch": 0.662613981762918, + "grad_norm": 2.8692073822021484, + "learning_rate": 4.6130613158264605e-06, + "loss": 0.5418164134025574, + "mean_token_accuracy": 0.8298909664154053, + "num_tokens": 7949303.0, + "step": 872 + }, + { + "epoch": 0.6633738601823708, + "grad_norm": 3.960404396057129, + "learning_rate": 4.611941305911602e-06, + "loss": 0.6284480094909668, + "mean_token_accuracy": 0.837495744228363, + "num_tokens": 7952486.0, + "step": 873 + }, + { + "epoch": 0.6641337386018237, + "grad_norm": 2.6690115928649902, + "learning_rate": 4.610819813755038e-06, + "loss": 0.5214360952377319, + "mean_token_accuracy": 0.8213508129119873, + "num_tokens": 7957559.0, + "step": 874 + }, + { + "epoch": 0.6648936170212766, + "grad_norm": 2.3376171588897705, + "learning_rate": 4.609696840143875e-06, + "loss": 0.46887528896331787, + "mean_token_accuracy": 0.8438819646835327, + "num_tokens": 7962826.0, + "step": 875 + }, + { + "epoch": 0.6656534954407295, + "grad_norm": 2.2222683429718018, + "learning_rate": 4.6085723858662575e-06, + "loss": 0.5607719421386719, + "mean_token_accuracy": 0.8128405809402466, + "num_tokens": 7970131.0, + "step": 876 + }, + { + "epoch": 0.6664133738601824, + "grad_norm": 2.069091558456421, + "learning_rate": 4.607446451711372e-06, + "loss": 0.506301760673523, + "mean_token_accuracy": 0.8256827592849731, + "num_tokens": 7977524.0, + "step": 877 + }, + { + "epoch": 0.6671732522796353, + "grad_norm": 1.3724967241287231, + "learning_rate": 4.606319038469443e-06, + "loss": 0.43285101652145386, + "mean_token_accuracy": 0.8525032997131348, + "num_tokens": 7989174.0, + "step": 878 + }, + { + "epoch": 0.6679331306990881, + "grad_norm": 2.278205156326294, + "learning_rate": 4.605190146931731e-06, + "loss": 0.4845905303955078, + "mean_token_accuracy": 0.8284652829170227, + "num_tokens": 7998524.0, + "step": 879 + }, + { + "epoch": 0.668693009118541, + "grad_norm": 1.3871766328811646, + "learning_rate": 4.604059777890537e-06, + "loss": 0.5736679434776306, + "mean_token_accuracy": 0.8223285675048828, + "num_tokens": 8015776.0, + "step": 880 + }, + { + "epoch": 0.6694528875379939, + "grad_norm": 1.926164984703064, + "learning_rate": 4.602927932139197e-06, + "loss": 0.4133230447769165, + "mean_token_accuracy": 0.8653768301010132, + "num_tokens": 8022979.0, + "step": 881 + }, + { + "epoch": 0.6702127659574468, + "grad_norm": 2.109272003173828, + "learning_rate": 4.601794610472083e-06, + "loss": 0.7005600929260254, + "mean_token_accuracy": 0.7777010202407837, + "num_tokens": 8032618.0, + "step": 882 + }, + { + "epoch": 0.6709726443768997, + "grad_norm": 2.077977418899536, + "learning_rate": 4.6006598136846056e-06, + "loss": 0.5278208255767822, + "mean_token_accuracy": 0.8230358958244324, + "num_tokens": 8040534.0, + "step": 883 + }, + { + "epoch": 0.6717325227963525, + "grad_norm": 1.678581714630127, + "learning_rate": 4.599523542573207e-06, + "loss": 0.4955351650714874, + "mean_token_accuracy": 0.8270003795623779, + "num_tokens": 8052249.0, + "step": 884 + }, + { + "epoch": 0.6724924012158054, + "grad_norm": 2.0751662254333496, + "learning_rate": 4.598385797935368e-06, + "loss": 0.5266247987747192, + "mean_token_accuracy": 0.8263581991195679, + "num_tokens": 8060600.0, + "step": 885 + }, + { + "epoch": 0.6732522796352584, + "grad_norm": 2.418405771255493, + "learning_rate": 4.5972465805696e-06, + "loss": 0.4481425881385803, + "mean_token_accuracy": 0.846164345741272, + "num_tokens": 8066025.0, + "step": 886 + }, + { + "epoch": 0.6740121580547113, + "grad_norm": 2.3936474323272705, + "learning_rate": 4.596105891275449e-06, + "loss": 0.4553404450416565, + "mean_token_accuracy": 0.8412896394729614, + "num_tokens": 8071544.0, + "step": 887 + }, + { + "epoch": 0.6747720364741642, + "grad_norm": 2.2024407386779785, + "learning_rate": 4.594963730853497e-06, + "loss": 0.6218541860580444, + "mean_token_accuracy": 0.7890232801437378, + "num_tokens": 8079061.0, + "step": 888 + }, + { + "epoch": 0.675531914893617, + "grad_norm": 2.51015567779541, + "learning_rate": 4.593820100105355e-06, + "loss": 0.5149124264717102, + "mean_token_accuracy": 0.8241918087005615, + "num_tokens": 8084293.0, + "step": 889 + }, + { + "epoch": 0.6762917933130699, + "grad_norm": 1.8748939037322998, + "learning_rate": 4.5926749998336665e-06, + "loss": 0.50836181640625, + "mean_token_accuracy": 0.8067223429679871, + "num_tokens": 8092511.0, + "step": 890 + }, + { + "epoch": 0.6770516717325228, + "grad_norm": 1.801193118095398, + "learning_rate": 4.5915284308421075e-06, + "loss": 0.4372861683368683, + "mean_token_accuracy": 0.8510604500770569, + "num_tokens": 8101174.0, + "step": 891 + }, + { + "epoch": 0.6778115501519757, + "grad_norm": 2.6476457118988037, + "learning_rate": 4.590380393935383e-06, + "loss": 0.38700711727142334, + "mean_token_accuracy": 0.8659796714782715, + "num_tokens": 8105398.0, + "step": 892 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 1.1147183179855347, + "learning_rate": 4.589230889919232e-06, + "loss": 0.38546115159988403, + "mean_token_accuracy": 0.8570581674575806, + "num_tokens": 8127394.0, + "step": 893 + }, + { + "epoch": 0.6793313069908815, + "grad_norm": 2.908905506134033, + "learning_rate": 4.588079919600419e-06, + "loss": 0.5108504295349121, + "mean_token_accuracy": 0.8121406435966492, + "num_tokens": 8131801.0, + "step": 894 + }, + { + "epoch": 0.6800911854103343, + "grad_norm": 3.1522326469421387, + "learning_rate": 4.586927483786739e-06, + "loss": 0.44059112668037415, + "mean_token_accuracy": 0.8448011875152588, + "num_tokens": 8154416.0, + "step": 895 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 1.5142440795898438, + "learning_rate": 4.585773583287017e-06, + "loss": 0.513217568397522, + "mean_token_accuracy": 0.8386049270629883, + "num_tokens": 8171156.0, + "step": 896 + }, + { + "epoch": 0.6816109422492401, + "grad_norm": 2.597881317138672, + "learning_rate": 4.584618218911104e-06, + "loss": 0.4937712550163269, + "mean_token_accuracy": 0.8223681449890137, + "num_tokens": 8176124.0, + "step": 897 + }, + { + "epoch": 0.682370820668693, + "grad_norm": 1.8185619115829468, + "learning_rate": 4.583461391469879e-06, + "loss": 0.519811749458313, + "mean_token_accuracy": 0.8169777393341064, + "num_tokens": 8185136.0, + "step": 898 + }, + { + "epoch": 0.6831306990881459, + "grad_norm": 3.2061994075775146, + "learning_rate": 4.582303101775249e-06, + "loss": 0.4655115008354187, + "mean_token_accuracy": 0.8425977230072021, + "num_tokens": 8188864.0, + "step": 899 + }, + { + "epoch": 0.6838905775075987, + "grad_norm": 1.3485229015350342, + "learning_rate": 4.581143350640146e-06, + "loss": 0.5014470815658569, + "mean_token_accuracy": 0.8273109197616577, + "num_tokens": 8203460.0, + "step": 900 + }, + { + "epoch": 0.6846504559270516, + "grad_norm": 1.3264713287353516, + "learning_rate": 4.579982138878527e-06, + "loss": 0.5073703527450562, + "mean_token_accuracy": 0.8259357213973999, + "num_tokens": 8219348.0, + "step": 901 + }, + { + "epoch": 0.6854103343465046, + "grad_norm": 2.4436347484588623, + "learning_rate": 4.578819467305375e-06, + "loss": 0.47020310163497925, + "mean_token_accuracy": 0.8567265272140503, + "num_tokens": 8224427.0, + "step": 902 + }, + { + "epoch": 0.6861702127659575, + "grad_norm": 1.921749234199524, + "learning_rate": 4.5776553367367e-06, + "loss": 0.622514009475708, + "mean_token_accuracy": 0.7863982319831848, + "num_tokens": 8233151.0, + "step": 903 + }, + { + "epoch": 0.6869300911854104, + "grad_norm": 1.8815616369247437, + "learning_rate": 4.576489747989532e-06, + "loss": 0.4910545349121094, + "mean_token_accuracy": 0.8147122859954834, + "num_tokens": 8240762.0, + "step": 904 + }, + { + "epoch": 0.6876899696048632, + "grad_norm": 1.2366989850997925, + "learning_rate": 4.575322701881926e-06, + "loss": 0.3947566747665405, + "mean_token_accuracy": 0.873993992805481, + "num_tokens": 8259381.0, + "step": 905 + }, + { + "epoch": 0.6884498480243161, + "grad_norm": 1.5767735242843628, + "learning_rate": 4.57415419923296e-06, + "loss": 0.57136070728302, + "mean_token_accuracy": 0.8028088808059692, + "num_tokens": 8273296.0, + "step": 906 + }, + { + "epoch": 0.689209726443769, + "grad_norm": 2.378675699234009, + "learning_rate": 4.572984240862733e-06, + "loss": 0.5894849896430969, + "mean_token_accuracy": 0.7977708578109741, + "num_tokens": 8280083.0, + "step": 907 + }, + { + "epoch": 0.6899696048632219, + "grad_norm": 2.0401132106781006, + "learning_rate": 4.57181282759237e-06, + "loss": 0.5524613261222839, + "mean_token_accuracy": 0.8138598203659058, + "num_tokens": 8288236.0, + "step": 908 + }, + { + "epoch": 0.6907294832826748, + "grad_norm": 2.293701648712158, + "learning_rate": 4.570639960244011e-06, + "loss": 0.5154546499252319, + "mean_token_accuracy": 0.8234660625457764, + "num_tokens": 8294493.0, + "step": 909 + }, + { + "epoch": 0.6914893617021277, + "grad_norm": 1.9286527633666992, + "learning_rate": 4.56946563964082e-06, + "loss": 0.5364264845848083, + "mean_token_accuracy": 0.8147368431091309, + "num_tokens": 8303441.0, + "step": 910 + }, + { + "epoch": 0.6922492401215805, + "grad_norm": 1.2571251392364502, + "learning_rate": 4.5682898666069815e-06, + "loss": 0.43535223603248596, + "mean_token_accuracy": 0.859239935874939, + "num_tokens": 8321548.0, + "step": 911 + }, + { + "epoch": 0.6930091185410334, + "grad_norm": 1.2224860191345215, + "learning_rate": 4.567112641967697e-06, + "loss": 0.40205076336860657, + "mean_token_accuracy": 0.8724711537361145, + "num_tokens": 8335205.0, + "step": 912 + }, + { + "epoch": 0.6937689969604863, + "grad_norm": 1.2064491510391235, + "learning_rate": 4.5659339665491894e-06, + "loss": 0.37790587544441223, + "mean_token_accuracy": 0.8464339971542358, + "num_tokens": 8350926.0, + "step": 913 + }, + { + "epoch": 0.6945288753799392, + "grad_norm": 2.1755270957946777, + "learning_rate": 4.5647538411786965e-06, + "loss": 0.42034298181533813, + "mean_token_accuracy": 0.84148108959198, + "num_tokens": 8356739.0, + "step": 914 + }, + { + "epoch": 0.6952887537993921, + "grad_norm": 1.234864592552185, + "learning_rate": 4.563572266684478e-06, + "loss": 0.5062938332557678, + "mean_token_accuracy": 0.8132052421569824, + "num_tokens": 8373660.0, + "step": 915 + }, + { + "epoch": 0.6960486322188449, + "grad_norm": 2.4250621795654297, + "learning_rate": 4.562389243895807e-06, + "loss": 0.4907791018486023, + "mean_token_accuracy": 0.8337979912757874, + "num_tokens": 8378661.0, + "step": 916 + }, + { + "epoch": 0.6968085106382979, + "grad_norm": 1.5018314123153687, + "learning_rate": 4.561204773642974e-06, + "loss": 0.41041281819343567, + "mean_token_accuracy": 0.8569784164428711, + "num_tokens": 8390322.0, + "step": 917 + }, + { + "epoch": 0.6975683890577508, + "grad_norm": 2.797269344329834, + "learning_rate": 4.5600188567572874e-06, + "loss": 0.3146931529045105, + "mean_token_accuracy": 0.8913302421569824, + "num_tokens": 8393567.0, + "step": 918 + }, + { + "epoch": 0.6983282674772037, + "grad_norm": 1.4002827405929565, + "learning_rate": 4.558831494071069e-06, + "loss": 0.4275597333908081, + "mean_token_accuracy": 0.8504893779754639, + "num_tokens": 8407119.0, + "step": 919 + }, + { + "epoch": 0.6990881458966566, + "grad_norm": 1.7045831680297852, + "learning_rate": 4.557642686417654e-06, + "loss": 0.49593430757522583, + "mean_token_accuracy": 0.8185091018676758, + "num_tokens": 8417408.0, + "step": 920 + }, + { + "epoch": 0.6998480243161094, + "grad_norm": 2.8818066120147705, + "learning_rate": 4.556452434631396e-06, + "loss": 0.637908935546875, + "mean_token_accuracy": 0.7883946895599365, + "num_tokens": 8422319.0, + "step": 921 + }, + { + "epoch": 0.7006079027355623, + "grad_norm": 2.3587265014648438, + "learning_rate": 4.555260739547657e-06, + "loss": 0.38749319314956665, + "mean_token_accuracy": 0.8774704933166504, + "num_tokens": 8427315.0, + "step": 922 + }, + { + "epoch": 0.7013677811550152, + "grad_norm": 1.6648749113082886, + "learning_rate": 4.554067602002815e-06, + "loss": 0.4044865369796753, + "mean_token_accuracy": 0.8524141311645508, + "num_tokens": 8438662.0, + "step": 923 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 3.467787742614746, + "learning_rate": 4.55287302283426e-06, + "loss": 0.591016411781311, + "mean_token_accuracy": 0.81184983253479, + "num_tokens": 8442237.0, + "step": 924 + }, + { + "epoch": 0.702887537993921, + "grad_norm": 2.1458635330200195, + "learning_rate": 4.551677002880395e-06, + "loss": 0.5017476677894592, + "mean_token_accuracy": 0.822914183139801, + "num_tokens": 8449494.0, + "step": 925 + }, + { + "epoch": 0.7036474164133738, + "grad_norm": 2.521714448928833, + "learning_rate": 4.550479542980632e-06, + "loss": 0.531912088394165, + "mean_token_accuracy": 0.8225687742233276, + "num_tokens": 8454983.0, + "step": 926 + }, + { + "epoch": 0.7044072948328267, + "grad_norm": 3.5248100757598877, + "learning_rate": 4.549280643975394e-06, + "loss": 0.4631815254688263, + "mean_token_accuracy": 0.8443771600723267, + "num_tokens": 8458504.0, + "step": 927 + }, + { + "epoch": 0.7051671732522796, + "grad_norm": 2.5105819702148438, + "learning_rate": 4.548080306706114e-06, + "loss": 0.30487123131752014, + "mean_token_accuracy": 0.9018767476081848, + "num_tokens": 8462589.0, + "step": 928 + }, + { + "epoch": 0.7059270516717325, + "grad_norm": 1.3367713689804077, + "learning_rate": 4.5468785320152365e-06, + "loss": 0.4355026185512543, + "mean_token_accuracy": 0.8323584794998169, + "num_tokens": 8478450.0, + "step": 929 + }, + { + "epoch": 0.7066869300911854, + "grad_norm": 2.2506282329559326, + "learning_rate": 4.545675320746212e-06, + "loss": 0.5082957744598389, + "mean_token_accuracy": 0.823430597782135, + "num_tokens": 8485991.0, + "step": 930 + }, + { + "epoch": 0.7074468085106383, + "grad_norm": 1.7164632081985474, + "learning_rate": 4.544470673743502e-06, + "loss": 0.3960164785385132, + "mean_token_accuracy": 0.8592486381530762, + "num_tokens": 8495217.0, + "step": 931 + }, + { + "epoch": 0.7082066869300911, + "grad_norm": 1.5864969491958618, + "learning_rate": 4.543264591852572e-06, + "loss": 0.49114471673965454, + "mean_token_accuracy": 0.8330780267715454, + "num_tokens": 8508904.0, + "step": 932 + }, + { + "epoch": 0.708966565349544, + "grad_norm": 2.1707003116607666, + "learning_rate": 4.542057075919898e-06, + "loss": 0.49895772337913513, + "mean_token_accuracy": 0.8327431082725525, + "num_tokens": 8515792.0, + "step": 933 + }, + { + "epoch": 0.709726443768997, + "grad_norm": 1.9002083539962769, + "learning_rate": 4.54084812679296e-06, + "loss": 0.4548531472682953, + "mean_token_accuracy": 0.834532618522644, + "num_tokens": 8524006.0, + "step": 934 + }, + { + "epoch": 0.7104863221884499, + "grad_norm": 1.8505141735076904, + "learning_rate": 4.539637745320247e-06, + "loss": 0.35716521739959717, + "mean_token_accuracy": 0.872222900390625, + "num_tokens": 8533647.0, + "step": 935 + }, + { + "epoch": 0.7112462006079028, + "grad_norm": 2.092620849609375, + "learning_rate": 4.53842593235125e-06, + "loss": 0.4673694372177124, + "mean_token_accuracy": 0.8460999131202698, + "num_tokens": 8540734.0, + "step": 936 + }, + { + "epoch": 0.7120060790273556, + "grad_norm": 2.689514636993408, + "learning_rate": 4.537212688736466e-06, + "loss": 0.45461273193359375, + "mean_token_accuracy": 0.8450704216957092, + "num_tokens": 8544948.0, + "step": 937 + }, + { + "epoch": 0.7127659574468085, + "grad_norm": 2.4507734775543213, + "learning_rate": 4.535998015327396e-06, + "loss": 0.4571906626224518, + "mean_token_accuracy": 0.8429360389709473, + "num_tokens": 8550445.0, + "step": 938 + }, + { + "epoch": 0.7135258358662614, + "grad_norm": 1.8960013389587402, + "learning_rate": 4.534781912976546e-06, + "loss": 0.4461391568183899, + "mean_token_accuracy": 0.8487973213195801, + "num_tokens": 8557630.0, + "step": 939 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.602611780166626, + "learning_rate": 4.533564382537421e-06, + "loss": 0.5277102589607239, + "mean_token_accuracy": 0.8330916166305542, + "num_tokens": 8570397.0, + "step": 940 + }, + { + "epoch": 0.7150455927051672, + "grad_norm": 1.8936395645141602, + "learning_rate": 4.532345424864533e-06, + "loss": 0.38619571924209595, + "mean_token_accuracy": 0.8514572381973267, + "num_tokens": 8582673.0, + "step": 941 + }, + { + "epoch": 0.71580547112462, + "grad_norm": 1.3898619413375854, + "learning_rate": 4.531125040813392e-06, + "loss": 0.4825032949447632, + "mean_token_accuracy": 0.833012580871582, + "num_tokens": 8597239.0, + "step": 942 + }, + { + "epoch": 0.7165653495440729, + "grad_norm": 2.128230571746826, + "learning_rate": 4.529903231240511e-06, + "loss": 0.4862118065357208, + "mean_token_accuracy": 0.8210917711257935, + "num_tokens": 8605877.0, + "step": 943 + }, + { + "epoch": 0.7173252279635258, + "grad_norm": 1.6552259922027588, + "learning_rate": 4.528679997003403e-06, + "loss": 0.5092059373855591, + "mean_token_accuracy": 0.8247389793395996, + "num_tokens": 8617060.0, + "step": 944 + }, + { + "epoch": 0.7180851063829787, + "grad_norm": 2.1174771785736084, + "learning_rate": 4.52745533896058e-06, + "loss": 0.39110174775123596, + "mean_token_accuracy": 0.8672944903373718, + "num_tokens": 8623306.0, + "step": 945 + }, + { + "epoch": 0.7188449848024316, + "grad_norm": 2.8648383617401123, + "learning_rate": 4.526229257971556e-06, + "loss": 0.49864327907562256, + "mean_token_accuracy": 0.8305130004882812, + "num_tokens": 8627466.0, + "step": 946 + }, + { + "epoch": 0.7196048632218845, + "grad_norm": 2.155514717102051, + "learning_rate": 4.52500175489684e-06, + "loss": 0.5070191025733948, + "mean_token_accuracy": 0.8311188817024231, + "num_tokens": 8634759.0, + "step": 947 + }, + { + "epoch": 0.7203647416413373, + "grad_norm": 1.8432683944702148, + "learning_rate": 4.523772830597942e-06, + "loss": 0.5569252371788025, + "mean_token_accuracy": 0.8070821762084961, + "num_tokens": 8644160.0, + "step": 948 + }, + { + "epoch": 0.7211246200607903, + "grad_norm": 2.8912241458892822, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4799427390098572, + "mean_token_accuracy": 0.8443552851676941, + "num_tokens": 8648377.0, + "step": 949 + }, + { + "epoch": 0.7218844984802432, + "grad_norm": 3.3449625968933105, + "learning_rate": 4.521310721778622e-06, + "loss": 0.44043463468551636, + "mean_token_accuracy": 0.8521315455436707, + "num_tokens": 8651846.0, + "step": 950 + }, + { + "epoch": 0.7226443768996961, + "grad_norm": 1.4127917289733887, + "learning_rate": 4.520077538986203e-06, + "loss": 0.4700999855995178, + "mean_token_accuracy": 0.8377952575683594, + "num_tokens": 8665199.0, + "step": 951 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.1607301235198975, + "learning_rate": 4.518842938425606e-06, + "loss": 0.4374256730079651, + "mean_token_accuracy": 0.8448896408081055, + "num_tokens": 8672158.0, + "step": 952 + }, + { + "epoch": 0.7241641337386018, + "grad_norm": 1.3442779779434204, + "learning_rate": 4.51760692096332e-06, + "loss": 0.38948923349380493, + "mean_token_accuracy": 0.8598923683166504, + "num_tokens": 8684532.0, + "step": 953 + }, + { + "epoch": 0.7249240121580547, + "grad_norm": 2.0003178119659424, + "learning_rate": 4.516369487466832e-06, + "loss": 0.3797217011451721, + "mean_token_accuracy": 0.8652102947235107, + "num_tokens": 8691460.0, + "step": 954 + }, + { + "epoch": 0.7256838905775076, + "grad_norm": 1.8196535110473633, + "learning_rate": 4.5151306388046175e-06, + "loss": 0.5676811933517456, + "mean_token_accuracy": 0.818500816822052, + "num_tokens": 8701624.0, + "step": 955 + }, + { + "epoch": 0.7264437689969605, + "grad_norm": 2.1962296962738037, + "learning_rate": 4.513890375846152e-06, + "loss": 0.45399484038352966, + "mean_token_accuracy": 0.8463879227638245, + "num_tokens": 8707410.0, + "step": 956 + }, + { + "epoch": 0.7272036474164134, + "grad_norm": 1.8798872232437134, + "learning_rate": 4.512648699461897e-06, + "loss": 0.5679811239242554, + "mean_token_accuracy": 0.8089900016784668, + "num_tokens": 8715630.0, + "step": 957 + }, + { + "epoch": 0.7279635258358662, + "grad_norm": 2.3540258407592773, + "learning_rate": 4.511405610523309e-06, + "loss": 0.5282865762710571, + "mean_token_accuracy": 0.8196114301681519, + "num_tokens": 8721934.0, + "step": 958 + }, + { + "epoch": 0.7287234042553191, + "grad_norm": 2.5630908012390137, + "learning_rate": 4.510161109902837e-06, + "loss": 0.39442378282546997, + "mean_token_accuracy": 0.8400980830192566, + "num_tokens": 8726511.0, + "step": 959 + }, + { + "epoch": 0.729483282674772, + "grad_norm": 1.9829226732254028, + "learning_rate": 4.508915198473919e-06, + "loss": 0.4611976742744446, + "mean_token_accuracy": 0.8439624309539795, + "num_tokens": 8733460.0, + "step": 960 + }, + { + "epoch": 0.7302431610942249, + "grad_norm": 3.0291950702667236, + "learning_rate": 4.507667877110982e-06, + "loss": 0.5158340930938721, + "mean_token_accuracy": 0.8300060033798218, + "num_tokens": 8737629.0, + "step": 961 + }, + { + "epoch": 0.7310030395136778, + "grad_norm": 1.9208252429962158, + "learning_rate": 4.506419146689445e-06, + "loss": 0.3807099163532257, + "mean_token_accuracy": 0.871469259262085, + "num_tokens": 8744615.0, + "step": 962 + }, + { + "epoch": 0.7317629179331308, + "grad_norm": 3.051565408706665, + "learning_rate": 4.505169008085717e-06, + "loss": 0.38461726903915405, + "mean_token_accuracy": 0.874465823173523, + "num_tokens": 8748154.0, + "step": 963 + }, + { + "epoch": 0.7325227963525835, + "grad_norm": 1.375466227531433, + "learning_rate": 4.503917462177192e-06, + "loss": 0.42490679025650024, + "mean_token_accuracy": 0.8457326889038086, + "num_tokens": 8760965.0, + "step": 964 + }, + { + "epoch": 0.7332826747720365, + "grad_norm": 2.216681957244873, + "learning_rate": 4.5026645098422515e-06, + "loss": 0.43149900436401367, + "mean_token_accuracy": 0.8527278900146484, + "num_tokens": 8766996.0, + "step": 965 + }, + { + "epoch": 0.7340425531914894, + "grad_norm": 1.9422595500946045, + "learning_rate": 4.5014101519602684e-06, + "loss": 0.4964504539966583, + "mean_token_accuracy": 0.8137556314468384, + "num_tokens": 8774411.0, + "step": 966 + }, + { + "epoch": 0.7348024316109423, + "grad_norm": 2.058887004852295, + "learning_rate": 4.500154389411598e-06, + "loss": 0.4977570176124573, + "mean_token_accuracy": 0.8254626989364624, + "num_tokens": 8782220.0, + "step": 967 + }, + { + "epoch": 0.7355623100303952, + "grad_norm": 2.9977786540985107, + "learning_rate": 4.498897223077582e-06, + "loss": 0.4061415195465088, + "mean_token_accuracy": 0.8752427101135254, + "num_tokens": 8786120.0, + "step": 968 + }, + { + "epoch": 0.736322188449848, + "grad_norm": 2.2636303901672363, + "learning_rate": 4.49763865384055e-06, + "loss": 0.5062161087989807, + "mean_token_accuracy": 0.8171653747558594, + "num_tokens": 8792459.0, + "step": 969 + }, + { + "epoch": 0.7370820668693009, + "grad_norm": 1.8850842714309692, + "learning_rate": 4.496378682583813e-06, + "loss": 0.5014280676841736, + "mean_token_accuracy": 0.8547511100769043, + "num_tokens": 8800675.0, + "step": 970 + }, + { + "epoch": 0.7378419452887538, + "grad_norm": 1.191985011100769, + "learning_rate": 4.495117310191667e-06, + "loss": 0.4713883101940155, + "mean_token_accuracy": 0.8213596343994141, + "num_tokens": 8820740.0, + "step": 971 + }, + { + "epoch": 0.7386018237082067, + "grad_norm": 1.823000192642212, + "learning_rate": 4.493854537549393e-06, + "loss": 0.46332645416259766, + "mean_token_accuracy": 0.8359860777854919, + "num_tokens": 8828884.0, + "step": 972 + }, + { + "epoch": 0.7393617021276596, + "grad_norm": 2.590446949005127, + "learning_rate": 4.492590365543253e-06, + "loss": 0.49074703454971313, + "mean_token_accuracy": 0.8433758020401001, + "num_tokens": 8833859.0, + "step": 973 + }, + { + "epoch": 0.7401215805471124, + "grad_norm": 2.2762670516967773, + "learning_rate": 4.491324795060491e-06, + "loss": 0.39465656876564026, + "mean_token_accuracy": 0.8734766244888306, + "num_tokens": 8839350.0, + "step": 974 + }, + { + "epoch": 0.7408814589665653, + "grad_norm": 2.698725461959839, + "learning_rate": 4.490057826989333e-06, + "loss": 0.5552085041999817, + "mean_token_accuracy": 0.8132266998291016, + "num_tokens": 8844373.0, + "step": 975 + }, + { + "epoch": 0.7416413373860182, + "grad_norm": 2.704606294631958, + "learning_rate": 4.488789462218988e-06, + "loss": 0.3447791635990143, + "mean_token_accuracy": 0.8736170530319214, + "num_tokens": 8848236.0, + "step": 976 + }, + { + "epoch": 0.7424012158054711, + "grad_norm": 3.1260716915130615, + "learning_rate": 4.487519701639641e-06, + "loss": 0.5945233702659607, + "mean_token_accuracy": 0.7997599840164185, + "num_tokens": 8852935.0, + "step": 977 + }, + { + "epoch": 0.743161094224924, + "grad_norm": 1.6895452737808228, + "learning_rate": 4.486248546142459e-06, + "loss": 0.4823892116546631, + "mean_token_accuracy": 0.8279662132263184, + "num_tokens": 8861743.0, + "step": 978 + }, + { + "epoch": 0.743920972644377, + "grad_norm": 1.9161452054977417, + "learning_rate": 4.4849759966195885e-06, + "loss": 0.5266581773757935, + "mean_token_accuracy": 0.8218623399734497, + "num_tokens": 8870601.0, + "step": 979 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 1.6894301176071167, + "learning_rate": 4.483702053964154e-06, + "loss": 0.4186219573020935, + "mean_token_accuracy": 0.8471781015396118, + "num_tokens": 8885617.0, + "step": 980 + }, + { + "epoch": 0.7454407294832827, + "grad_norm": 1.6319992542266846, + "learning_rate": 4.482426719070258e-06, + "loss": 0.541317880153656, + "mean_token_accuracy": 0.8216162323951721, + "num_tokens": 8897595.0, + "step": 981 + }, + { + "epoch": 0.7462006079027356, + "grad_norm": 5.102413177490234, + "learning_rate": 4.4811499928329775e-06, + "loss": 0.3928517699241638, + "mean_token_accuracy": 0.858033299446106, + "num_tokens": 8901682.0, + "step": 982 + }, + { + "epoch": 0.7469604863221885, + "grad_norm": 2.213860273361206, + "learning_rate": 4.479871876148368e-06, + "loss": 0.4276347756385803, + "mean_token_accuracy": 0.8529798984527588, + "num_tokens": 8908088.0, + "step": 983 + }, + { + "epoch": 0.7477203647416414, + "grad_norm": 1.2180038690567017, + "learning_rate": 4.478592369913464e-06, + "loss": 0.3941590189933777, + "mean_token_accuracy": 0.8608149290084839, + "num_tokens": 8925876.0, + "step": 984 + }, + { + "epoch": 0.7484802431610942, + "grad_norm": 2.849802255630493, + "learning_rate": 4.477311475026271e-06, + "loss": 0.42190325260162354, + "mean_token_accuracy": 0.860505223274231, + "num_tokens": 8930190.0, + "step": 985 + }, + { + "epoch": 0.7492401215805471, + "grad_norm": 1.704128384590149, + "learning_rate": 4.476029192385769e-06, + "loss": 0.4786282777786255, + "mean_token_accuracy": 0.8302322626113892, + "num_tokens": 8938340.0, + "step": 986 + }, + { + "epoch": 0.75, + "grad_norm": 2.06322979927063, + "learning_rate": 4.474745522891915e-06, + "loss": 0.4648786187171936, + "mean_token_accuracy": 0.8366481065750122, + "num_tokens": 8944633.0, + "step": 987 + }, + { + "epoch": 0.7507598784194529, + "grad_norm": 2.0745396614074707, + "learning_rate": 4.473460467445637e-06, + "loss": 0.5744885206222534, + "mean_token_accuracy": 0.8357284069061279, + "num_tokens": 8954457.0, + "step": 988 + }, + { + "epoch": 0.7515197568389058, + "grad_norm": 1.9281407594680786, + "learning_rate": 4.472174026948836e-06, + "loss": 0.528974175453186, + "mean_token_accuracy": 0.8083580732345581, + "num_tokens": 8962701.0, + "step": 989 + }, + { + "epoch": 0.7522796352583586, + "grad_norm": 3.012381076812744, + "learning_rate": 4.470886202304385e-06, + "loss": 0.48754751682281494, + "mean_token_accuracy": 0.8368391990661621, + "num_tokens": 8967272.0, + "step": 990 + }, + { + "epoch": 0.7530395136778115, + "grad_norm": 1.691826581954956, + "learning_rate": 4.469596994416131e-06, + "loss": 0.484740674495697, + "mean_token_accuracy": 0.8500643968582153, + "num_tokens": 8976615.0, + "step": 991 + }, + { + "epoch": 0.7537993920972644, + "grad_norm": 2.4961965084075928, + "learning_rate": 4.468306404188887e-06, + "loss": 0.50777268409729, + "mean_token_accuracy": 0.8168395757675171, + "num_tokens": 8983235.0, + "step": 992 + }, + { + "epoch": 0.7545592705167173, + "grad_norm": 1.512007713317871, + "learning_rate": 4.467014432528441e-06, + "loss": 0.4583340287208557, + "mean_token_accuracy": 0.8465162515640259, + "num_tokens": 8993815.0, + "step": 993 + }, + { + "epoch": 0.7553191489361702, + "grad_norm": 1.9362257719039917, + "learning_rate": 4.465721080341547e-06, + "loss": 0.6027892827987671, + "mean_token_accuracy": 0.8052380084991455, + "num_tokens": 9002697.0, + "step": 994 + }, + { + "epoch": 0.756079027355623, + "grad_norm": 2.473632335662842, + "learning_rate": 4.4644263485359316e-06, + "loss": 0.5394320487976074, + "mean_token_accuracy": 0.834665834903717, + "num_tokens": 9007428.0, + "step": 995 + }, + { + "epoch": 0.756838905775076, + "grad_norm": 2.2527434825897217, + "learning_rate": 4.463130238020284e-06, + "loss": 0.5485198497772217, + "mean_token_accuracy": 0.8090173006057739, + "num_tokens": 9013570.0, + "step": 996 + }, + { + "epoch": 0.7575987841945289, + "grad_norm": 1.4130940437316895, + "learning_rate": 4.4618327497042676e-06, + "loss": 0.37994423508644104, + "mean_token_accuracy": 0.8625167012214661, + "num_tokens": 9025485.0, + "step": 997 + }, + { + "epoch": 0.7583586626139818, + "grad_norm": 2.685115098953247, + "learning_rate": 4.460533884498509e-06, + "loss": 0.447973370552063, + "mean_token_accuracy": 0.8564165234565735, + "num_tokens": 9030355.0, + "step": 998 + }, + { + "epoch": 0.7591185410334347, + "grad_norm": 3.2743139266967773, + "learning_rate": 4.4592336433146e-06, + "loss": 0.45275989174842834, + "mean_token_accuracy": 0.8462578058242798, + "num_tokens": 9034406.0, + "step": 999 + }, + { + "epoch": 0.7598784194528876, + "grad_norm": 1.9383049011230469, + "learning_rate": 4.457932027065102e-06, + "loss": 0.5387729406356812, + "mean_token_accuracy": 0.8357330560684204, + "num_tokens": 9041502.0, + "step": 1000 + }, + { + "epoch": 0.7606382978723404, + "grad_norm": 2.7348275184631348, + "learning_rate": 4.456629036663537e-06, + "loss": 0.4448447823524475, + "mean_token_accuracy": 0.8453642129898071, + "num_tokens": 9046088.0, + "step": 1001 + }, + { + "epoch": 0.7613981762917933, + "grad_norm": 1.8477401733398438, + "learning_rate": 4.455324673024396e-06, + "loss": 0.5766505002975464, + "mean_token_accuracy": 0.8074213862419128, + "num_tokens": 9055678.0, + "step": 1002 + }, + { + "epoch": 0.7621580547112462, + "grad_norm": 3.134481430053711, + "learning_rate": 4.4540189370631315e-06, + "loss": 0.5690872669219971, + "mean_token_accuracy": 0.8414670825004578, + "num_tokens": 9062006.0, + "step": 1003 + }, + { + "epoch": 0.7629179331306991, + "grad_norm": 1.7933398485183716, + "learning_rate": 4.452711829696158e-06, + "loss": 0.4898291826248169, + "mean_token_accuracy": 0.8259007930755615, + "num_tokens": 9070754.0, + "step": 1004 + }, + { + "epoch": 0.763677811550152, + "grad_norm": 1.2552275657653809, + "learning_rate": 4.451403351840855e-06, + "loss": 0.4280198812484741, + "mean_token_accuracy": 0.8409112691879272, + "num_tokens": 9085306.0, + "step": 1005 + }, + { + "epoch": 0.7644376899696048, + "grad_norm": 1.6749331951141357, + "learning_rate": 4.450093504415562e-06, + "loss": 0.3723178505897522, + "mean_token_accuracy": 0.8545734882354736, + "num_tokens": 9102453.0, + "step": 1006 + }, + { + "epoch": 0.7651975683890577, + "grad_norm": 2.7514500617980957, + "learning_rate": 4.44878228833958e-06, + "loss": 0.5463190674781799, + "mean_token_accuracy": 0.8121639490127563, + "num_tokens": 9108342.0, + "step": 1007 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 1.3322733640670776, + "learning_rate": 4.447469704533172e-06, + "loss": 0.573723316192627, + "mean_token_accuracy": 0.8065711259841919, + "num_tokens": 9123712.0, + "step": 1008 + }, + { + "epoch": 0.7667173252279635, + "grad_norm": 2.6893765926361084, + "learning_rate": 4.446155753917559e-06, + "loss": 0.6856257915496826, + "mean_token_accuracy": 0.7718256711959839, + "num_tokens": 9130728.0, + "step": 1009 + }, + { + "epoch": 0.7674772036474165, + "grad_norm": 1.792765498161316, + "learning_rate": 4.444840437414923e-06, + "loss": 0.48203110694885254, + "mean_token_accuracy": 0.8419194221496582, + "num_tokens": 9137983.0, + "step": 1010 + }, + { + "epoch": 0.7682370820668692, + "grad_norm": 1.4957399368286133, + "learning_rate": 4.443523755948401e-06, + "loss": 0.4372181296348572, + "mean_token_accuracy": 0.8491764664649963, + "num_tokens": 9148081.0, + "step": 1011 + }, + { + "epoch": 0.7689969604863222, + "grad_norm": 1.7294867038726807, + "learning_rate": 4.442205710442095e-06, + "loss": 0.54277503490448, + "mean_token_accuracy": 0.8196806907653809, + "num_tokens": 9158407.0, + "step": 1012 + }, + { + "epoch": 0.7697568389057751, + "grad_norm": 2.2091221809387207, + "learning_rate": 4.4408863018210564e-06, + "loss": 0.4888187646865845, + "mean_token_accuracy": 0.8384175300598145, + "num_tokens": 9164754.0, + "step": 1013 + }, + { + "epoch": 0.770516717325228, + "grad_norm": 1.7615830898284912, + "learning_rate": 4.439565531011299e-06, + "loss": 0.4640008211135864, + "mean_token_accuracy": 0.8424701690673828, + "num_tokens": 9172715.0, + "step": 1014 + }, + { + "epoch": 0.7712765957446809, + "grad_norm": 1.6796128749847412, + "learning_rate": 4.43824339893979e-06, + "loss": 0.5227609276771545, + "mean_token_accuracy": 0.8135923743247986, + "num_tokens": 9183214.0, + "step": 1015 + }, + { + "epoch": 0.7720364741641338, + "grad_norm": 2.1485698223114014, + "learning_rate": 4.436919906534452e-06, + "loss": 0.4857056140899658, + "mean_token_accuracy": 0.8323013782501221, + "num_tokens": 9190360.0, + "step": 1016 + }, + { + "epoch": 0.7727963525835866, + "grad_norm": 2.7842206954956055, + "learning_rate": 4.4355950547241645e-06, + "loss": 0.46406883001327515, + "mean_token_accuracy": 0.859869122505188, + "num_tokens": 9194523.0, + "step": 1017 + }, + { + "epoch": 0.7735562310030395, + "grad_norm": 2.3774640560150146, + "learning_rate": 4.434268844438758e-06, + "loss": 0.5625549554824829, + "mean_token_accuracy": 0.8188897371292114, + "num_tokens": 9201155.0, + "step": 1018 + }, + { + "epoch": 0.7743161094224924, + "grad_norm": 2.004427909851074, + "learning_rate": 4.432941276609018e-06, + "loss": 0.5164387226104736, + "mean_token_accuracy": 0.829569935798645, + "num_tokens": 9209269.0, + "step": 1019 + }, + { + "epoch": 0.7750759878419453, + "grad_norm": 1.7218989133834839, + "learning_rate": 4.431612352166684e-06, + "loss": 0.481005996465683, + "mean_token_accuracy": 0.8359906673431396, + "num_tokens": 9220860.0, + "step": 1020 + }, + { + "epoch": 0.7758358662613982, + "grad_norm": 2.197108507156372, + "learning_rate": 4.4302820720444454e-06, + "loss": 0.440413236618042, + "mean_token_accuracy": 0.8412867784500122, + "num_tokens": 9226414.0, + "step": 1021 + }, + { + "epoch": 0.776595744680851, + "grad_norm": 2.6995162963867188, + "learning_rate": 4.428950437175944e-06, + "loss": 0.3884299397468567, + "mean_token_accuracy": 0.8696021437644958, + "num_tokens": 9230898.0, + "step": 1022 + }, + { + "epoch": 0.7773556231003039, + "grad_norm": 2.1671667098999023, + "learning_rate": 4.427617448495772e-06, + "loss": 0.5747478008270264, + "mean_token_accuracy": 0.7842930555343628, + "num_tokens": 9238479.0, + "step": 1023 + }, + { + "epoch": 0.7781155015197568, + "grad_norm": 1.6299028396606445, + "learning_rate": 4.426283106939474e-06, + "loss": 0.39478403329849243, + "mean_token_accuracy": 0.8685503602027893, + "num_tokens": 9248263.0, + "step": 1024 + }, + { + "epoch": 0.7788753799392097, + "grad_norm": 2.2621798515319824, + "learning_rate": 4.424947413443539e-06, + "loss": 0.4582178592681885, + "mean_token_accuracy": 0.8312377333641052, + "num_tokens": 9254168.0, + "step": 1025 + }, + { + "epoch": 0.7796352583586627, + "grad_norm": 2.121091365814209, + "learning_rate": 4.423610368945411e-06, + "loss": 0.5315121412277222, + "mean_token_accuracy": 0.8121483325958252, + "num_tokens": 9261808.0, + "step": 1026 + }, + { + "epoch": 0.7803951367781155, + "grad_norm": 1.8558297157287598, + "learning_rate": 4.422271974383479e-06, + "loss": 0.4299176037311554, + "mean_token_accuracy": 0.8452648520469666, + "num_tokens": 9269264.0, + "step": 1027 + }, + { + "epoch": 0.7811550151975684, + "grad_norm": 1.9089949131011963, + "learning_rate": 4.420932230697079e-06, + "loss": 0.43876272439956665, + "mean_token_accuracy": 0.8434094190597534, + "num_tokens": 9277381.0, + "step": 1028 + }, + { + "epoch": 0.7819148936170213, + "grad_norm": 1.8619649410247803, + "learning_rate": 4.419591138826495e-06, + "loss": 0.48798668384552, + "mean_token_accuracy": 0.8281317353248596, + "num_tokens": 9285413.0, + "step": 1029 + }, + { + "epoch": 0.7826747720364742, + "grad_norm": 1.3273087739944458, + "learning_rate": 4.418248699712955e-06, + "loss": 0.4611460864543915, + "mean_token_accuracy": 0.8233213424682617, + "num_tokens": 9300805.0, + "step": 1030 + }, + { + "epoch": 0.7834346504559271, + "grad_norm": 1.0473746061325073, + "learning_rate": 4.416904914298637e-06, + "loss": 0.36537665128707886, + "mean_token_accuracy": 0.8671857118606567, + "num_tokens": 9320035.0, + "step": 1031 + }, + { + "epoch": 0.78419452887538, + "grad_norm": 1.9130918979644775, + "learning_rate": 4.415559783526661e-06, + "loss": 0.4916655123233795, + "mean_token_accuracy": 0.8266351222991943, + "num_tokens": 9326795.0, + "step": 1032 + }, + { + "epoch": 0.7849544072948328, + "grad_norm": 2.0001816749572754, + "learning_rate": 4.414213308341092e-06, + "loss": 0.5711008310317993, + "mean_token_accuracy": 0.8093076348304749, + "num_tokens": 9335625.0, + "step": 1033 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 3.933542251586914, + "learning_rate": 4.412865489686936e-06, + "loss": 0.621616542339325, + "mean_token_accuracy": 0.7938898801803589, + "num_tokens": 9339080.0, + "step": 1034 + }, + { + "epoch": 0.7864741641337386, + "grad_norm": 2.061558961868286, + "learning_rate": 4.411516328510145e-06, + "loss": 0.583686113357544, + "mean_token_accuracy": 0.8216883540153503, + "num_tokens": 9348581.0, + "step": 1035 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 1.9401264190673828, + "learning_rate": 4.410165825757613e-06, + "loss": 0.4905240535736084, + "mean_token_accuracy": 0.8229951858520508, + "num_tokens": 9356032.0, + "step": 1036 + }, + { + "epoch": 0.7879939209726444, + "grad_norm": 3.620547294616699, + "learning_rate": 4.408813982377175e-06, + "loss": 0.4269888997077942, + "mean_token_accuracy": 0.8713940978050232, + "num_tokens": 9359061.0, + "step": 1037 + }, + { + "epoch": 0.7887537993920972, + "grad_norm": 1.2027851343154907, + "learning_rate": 4.407460799317605e-06, + "loss": 0.39972418546676636, + "mean_token_accuracy": 0.8610097765922546, + "num_tokens": 9377068.0, + "step": 1038 + }, + { + "epoch": 0.7895136778115501, + "grad_norm": 2.566753387451172, + "learning_rate": 4.40610627752862e-06, + "loss": 0.45267152786254883, + "mean_token_accuracy": 0.83243328332901, + "num_tokens": 9383604.0, + "step": 1039 + }, + { + "epoch": 0.790273556231003, + "grad_norm": 2.940094470977783, + "learning_rate": 4.404750417960876e-06, + "loss": 0.42862242460250854, + "mean_token_accuracy": 0.8582849502563477, + "num_tokens": 9387541.0, + "step": 1040 + }, + { + "epoch": 0.791033434650456, + "grad_norm": 2.0223944187164307, + "learning_rate": 4.403393221565966e-06, + "loss": 0.4349963665008545, + "mean_token_accuracy": 0.8453047871589661, + "num_tokens": 9394382.0, + "step": 1041 + }, + { + "epoch": 0.7917933130699089, + "grad_norm": 2.9399030208587646, + "learning_rate": 4.402034689296425e-06, + "loss": 0.32197174429893494, + "mean_token_accuracy": 0.8953392505645752, + "num_tokens": 9397741.0, + "step": 1042 + }, + { + "epoch": 0.7925531914893617, + "grad_norm": 2.819016456604004, + "learning_rate": 4.400674822105721e-06, + "loss": 0.6790289878845215, + "mean_token_accuracy": 0.8135063648223877, + "num_tokens": 9403509.0, + "step": 1043 + }, + { + "epoch": 0.7933130699088146, + "grad_norm": 1.3225977420806885, + "learning_rate": 4.399313620948262e-06, + "loss": 0.42203834652900696, + "mean_token_accuracy": 0.8399381637573242, + "num_tokens": 9418870.0, + "step": 1044 + }, + { + "epoch": 0.7940729483282675, + "grad_norm": 1.7822176218032837, + "learning_rate": 4.397951086779392e-06, + "loss": 0.4666554927825928, + "mean_token_accuracy": 0.8364764451980591, + "num_tokens": 9427640.0, + "step": 1045 + }, + { + "epoch": 0.7948328267477204, + "grad_norm": 3.186439037322998, + "learning_rate": 4.396587220555389e-06, + "loss": 0.6048363447189331, + "mean_token_accuracy": 0.7806557416915894, + "num_tokens": 9431927.0, + "step": 1046 + }, + { + "epoch": 0.7955927051671733, + "grad_norm": 3.0804805755615234, + "learning_rate": 4.395222023233467e-06, + "loss": 0.445969820022583, + "mean_token_accuracy": 0.850671112537384, + "num_tokens": 9436136.0, + "step": 1047 + }, + { + "epoch": 0.7963525835866262, + "grad_norm": 1.675968885421753, + "learning_rate": 4.393855495771774e-06, + "loss": 0.4311422109603882, + "mean_token_accuracy": 0.8449079990386963, + "num_tokens": 9445189.0, + "step": 1048 + }, + { + "epoch": 0.797112462006079, + "grad_norm": 2.342410087585449, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.5733606219291687, + "mean_token_accuracy": 0.8156592845916748, + "num_tokens": 9451939.0, + "step": 1049 + }, + { + "epoch": 0.7978723404255319, + "grad_norm": 1.5967470407485962, + "learning_rate": 4.391118454266335e-06, + "loss": 0.46664729714393616, + "mean_token_accuracy": 0.8091695308685303, + "num_tokens": 9463968.0, + "step": 1050 + }, + { + "epoch": 0.7986322188449848, + "grad_norm": 1.5777863264083862, + "learning_rate": 4.389747942143549e-06, + "loss": 0.46028903126716614, + "mean_token_accuracy": 0.8347330093383789, + "num_tokens": 9475561.0, + "step": 1051 + }, + { + "epoch": 0.7993920972644377, + "grad_norm": 2.7630488872528076, + "learning_rate": 4.388376103722914e-06, + "loss": 0.5618188977241516, + "mean_token_accuracy": 0.8273467421531677, + "num_tokens": 9480661.0, + "step": 1052 + }, + { + "epoch": 0.8001519756838906, + "grad_norm": 2.093397378921509, + "learning_rate": 4.387002939967237e-06, + "loss": 0.2998353838920593, + "mean_token_accuracy": 0.8905231952667236, + "num_tokens": 9485924.0, + "step": 1053 + }, + { + "epoch": 0.8009118541033434, + "grad_norm": 1.4385871887207031, + "learning_rate": 4.38562845184026e-06, + "loss": 0.4944111704826355, + "mean_token_accuracy": 0.8403056263923645, + "num_tokens": 9500128.0, + "step": 1054 + }, + { + "epoch": 0.8016717325227963, + "grad_norm": 1.6393156051635742, + "learning_rate": 4.384252640306649e-06, + "loss": 0.5727907419204712, + "mean_token_accuracy": 0.7849414348602295, + "num_tokens": 9511569.0, + "step": 1055 + }, + { + "epoch": 0.8024316109422492, + "grad_norm": 2.3909664154052734, + "learning_rate": 4.382875506332002e-06, + "loss": 0.4760419726371765, + "mean_token_accuracy": 0.8408266305923462, + "num_tokens": 9517244.0, + "step": 1056 + }, + { + "epoch": 0.8031914893617021, + "grad_norm": 1.7288594245910645, + "learning_rate": 4.381497050882845e-06, + "loss": 0.5375926494598389, + "mean_token_accuracy": 0.8138614892959595, + "num_tokens": 9528736.0, + "step": 1057 + }, + { + "epoch": 0.8039513677811551, + "grad_norm": 2.093407392501831, + "learning_rate": 4.380117274926632e-06, + "loss": 0.46659404039382935, + "mean_token_accuracy": 0.8450702428817749, + "num_tokens": 9536200.0, + "step": 1058 + }, + { + "epoch": 0.8047112462006079, + "grad_norm": 1.6835898160934448, + "learning_rate": 4.3787361794317405e-06, + "loss": 0.43157699704170227, + "mean_token_accuracy": 0.8279973268508911, + "num_tokens": 9546314.0, + "step": 1059 + }, + { + "epoch": 0.8054711246200608, + "grad_norm": 1.983067512512207, + "learning_rate": 4.377353765367479e-06, + "loss": 0.5021739602088928, + "mean_token_accuracy": 0.8274815082550049, + "num_tokens": 9554375.0, + "step": 1060 + }, + { + "epoch": 0.8062310030395137, + "grad_norm": 2.0472030639648438, + "learning_rate": 4.375970033704078e-06, + "loss": 0.34298190474510193, + "mean_token_accuracy": 0.8900876045227051, + "num_tokens": 9560230.0, + "step": 1061 + }, + { + "epoch": 0.8069908814589666, + "grad_norm": 1.9613717794418335, + "learning_rate": 4.374584985412692e-06, + "loss": 0.3826758861541748, + "mean_token_accuracy": 0.839923620223999, + "num_tokens": 9566809.0, + "step": 1062 + }, + { + "epoch": 0.8077507598784195, + "grad_norm": 1.991289496421814, + "learning_rate": 4.373198621465405e-06, + "loss": 0.5492525100708008, + "mean_token_accuracy": 0.8153272867202759, + "num_tokens": 9576810.0, + "step": 1063 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.421370506286621, + "learning_rate": 4.3718109428352155e-06, + "loss": 0.5240297317504883, + "mean_token_accuracy": 0.8087242245674133, + "num_tokens": 9582906.0, + "step": 1064 + }, + { + "epoch": 0.8092705167173252, + "grad_norm": 3.697765588760376, + "learning_rate": 4.370421950496055e-06, + "loss": 0.6096476912498474, + "mean_token_accuracy": 0.787585973739624, + "num_tokens": 9586920.0, + "step": 1065 + }, + { + "epoch": 0.8100303951367781, + "grad_norm": 2.0767786502838135, + "learning_rate": 4.369031645422768e-06, + "loss": 0.41120079159736633, + "mean_token_accuracy": 0.8513731956481934, + "num_tokens": 9593902.0, + "step": 1066 + }, + { + "epoch": 0.810790273556231, + "grad_norm": 2.5968732833862305, + "learning_rate": 4.367640028591126e-06, + "loss": 0.3364982008934021, + "mean_token_accuracy": 0.8786963224411011, + "num_tokens": 9597745.0, + "step": 1067 + }, + { + "epoch": 0.8115501519756839, + "grad_norm": 2.165742874145508, + "learning_rate": 4.366247100977818e-06, + "loss": 0.406129390001297, + "mean_token_accuracy": 0.868243932723999, + "num_tokens": 9603496.0, + "step": 1068 + }, + { + "epoch": 0.8123100303951368, + "grad_norm": 2.0493404865264893, + "learning_rate": 4.364852863560456e-06, + "loss": 0.5356296300888062, + "mean_token_accuracy": 0.8191947340965271, + "num_tokens": 9610898.0, + "step": 1069 + }, + { + "epoch": 0.8130699088145896, + "grad_norm": 2.3224308490753174, + "learning_rate": 4.363457317317568e-06, + "loss": 0.41461923718452454, + "mean_token_accuracy": 0.8537945747375488, + "num_tokens": 9616626.0, + "step": 1070 + }, + { + "epoch": 0.8138297872340425, + "grad_norm": 1.7387986183166504, + "learning_rate": 4.362060463228603e-06, + "loss": 0.5134786367416382, + "mean_token_accuracy": 0.8511737585067749, + "num_tokens": 9626223.0, + "step": 1071 + }, + { + "epoch": 0.8145896656534954, + "grad_norm": 3.0270655155181885, + "learning_rate": 4.360662302273926e-06, + "loss": 0.3410695791244507, + "mean_token_accuracy": 0.8746449947357178, + "num_tokens": 9629455.0, + "step": 1072 + }, + { + "epoch": 0.8153495440729484, + "grad_norm": 1.7727062702178955, + "learning_rate": 4.35926283543482e-06, + "loss": 0.4610968828201294, + "mean_token_accuracy": 0.8444793224334717, + "num_tokens": 9638070.0, + "step": 1073 + }, + { + "epoch": 0.8161094224924013, + "grad_norm": 3.6333565711975098, + "learning_rate": 4.357862063693486e-06, + "loss": 0.3881273865699768, + "mean_token_accuracy": 0.8757344484329224, + "num_tokens": 9641028.0, + "step": 1074 + }, + { + "epoch": 0.8168693009118541, + "grad_norm": 3.024042844772339, + "learning_rate": 4.356459988033039e-06, + "loss": 0.3853808641433716, + "mean_token_accuracy": 0.8602254390716553, + "num_tokens": 9645730.0, + "step": 1075 + }, + { + "epoch": 0.817629179331307, + "grad_norm": 2.3359482288360596, + "learning_rate": 4.355056609437509e-06, + "loss": 0.4852045476436615, + "mean_token_accuracy": 0.8502728343009949, + "num_tokens": 9650975.0, + "step": 1076 + }, + { + "epoch": 0.8183890577507599, + "grad_norm": 2.2390685081481934, + "learning_rate": 4.353651928891842e-06, + "loss": 0.5287341475486755, + "mean_token_accuracy": 0.8247801065444946, + "num_tokens": 9657471.0, + "step": 1077 + }, + { + "epoch": 0.8191489361702128, + "grad_norm": 2.3809144496917725, + "learning_rate": 4.352245947381897e-06, + "loss": 0.5218510627746582, + "mean_token_accuracy": 0.8149170875549316, + "num_tokens": 9664108.0, + "step": 1078 + }, + { + "epoch": 0.8199088145896657, + "grad_norm": 1.7072309255599976, + "learning_rate": 4.3508386658944455e-06, + "loss": 0.46481168270111084, + "mean_token_accuracy": 0.834963321685791, + "num_tokens": 9673175.0, + "step": 1079 + }, + { + "epoch": 0.8206686930091185, + "grad_norm": 1.7383702993392944, + "learning_rate": 4.349430085417171e-06, + "loss": 0.4505952000617981, + "mean_token_accuracy": 0.8507769107818604, + "num_tokens": 9682800.0, + "step": 1080 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 2.4308547973632812, + "learning_rate": 4.348020206938672e-06, + "loss": 0.4832455515861511, + "mean_token_accuracy": 0.8538393974304199, + "num_tokens": 9688123.0, + "step": 1081 + }, + { + "epoch": 0.8221884498480243, + "grad_norm": 2.2686192989349365, + "learning_rate": 4.3466090314484526e-06, + "loss": 0.5112563371658325, + "mean_token_accuracy": 0.8308460712432861, + "num_tokens": 9694299.0, + "step": 1082 + }, + { + "epoch": 0.8229483282674772, + "grad_norm": 2.806093454360962, + "learning_rate": 4.345196559936931e-06, + "loss": 0.4818246364593506, + "mean_token_accuracy": 0.86617112159729, + "num_tokens": 9698471.0, + "step": 1083 + }, + { + "epoch": 0.8237082066869301, + "grad_norm": 1.7340706586837769, + "learning_rate": 4.343782793395435e-06, + "loss": 0.38246971368789673, + "mean_token_accuracy": 0.8675198554992676, + "num_tokens": 9706444.0, + "step": 1084 + }, + { + "epoch": 0.824468085106383, + "grad_norm": 1.664942741394043, + "learning_rate": 4.3423677328162e-06, + "loss": 0.498797208070755, + "mean_token_accuracy": 0.8447319865226746, + "num_tokens": 9716765.0, + "step": 1085 + }, + { + "epoch": 0.8252279635258358, + "grad_norm": 1.3608235120773315, + "learning_rate": 4.340951379192369e-06, + "loss": 0.41961491107940674, + "mean_token_accuracy": 0.8339346647262573, + "num_tokens": 9729564.0, + "step": 1086 + }, + { + "epoch": 0.8259878419452887, + "grad_norm": 1.642503261566162, + "learning_rate": 4.3395337335179945e-06, + "loss": 0.5477945804595947, + "mean_token_accuracy": 0.8117889761924744, + "num_tokens": 9741217.0, + "step": 1087 + }, + { + "epoch": 0.8267477203647416, + "grad_norm": 3.0345044136047363, + "learning_rate": 4.338114796788035e-06, + "loss": 0.5024623870849609, + "mean_token_accuracy": 0.8333141207695007, + "num_tokens": 9744941.0, + "step": 1088 + }, + { + "epoch": 0.8275075987841946, + "grad_norm": 1.3096630573272705, + "learning_rate": 4.336694569998354e-06, + "loss": 0.44169723987579346, + "mean_token_accuracy": 0.859926700592041, + "num_tokens": 9757854.0, + "step": 1089 + }, + { + "epoch": 0.8282674772036475, + "grad_norm": 2.203279495239258, + "learning_rate": 4.3352730541457215e-06, + "loss": 0.5283265113830566, + "mean_token_accuracy": 0.8053759932518005, + "num_tokens": 9764096.0, + "step": 1090 + }, + { + "epoch": 0.8290273556231003, + "grad_norm": 1.3774312734603882, + "learning_rate": 4.333850250227814e-06, + "loss": 0.4584103226661682, + "mean_token_accuracy": 0.8342611193656921, + "num_tokens": 9777768.0, + "step": 1091 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 1.822637915611267, + "learning_rate": 4.332426159243206e-06, + "loss": 0.5432791709899902, + "mean_token_accuracy": 0.8136210441589355, + "num_tokens": 9791276.0, + "step": 1092 + }, + { + "epoch": 0.8305471124620061, + "grad_norm": 3.0190067291259766, + "learning_rate": 4.331000782191384e-06, + "loss": 0.5018150806427002, + "mean_token_accuracy": 0.8234807252883911, + "num_tokens": 9794902.0, + "step": 1093 + }, + { + "epoch": 0.831306990881459, + "grad_norm": 2.09987735748291, + "learning_rate": 4.329574120072728e-06, + "loss": 0.4270891547203064, + "mean_token_accuracy": 0.8544977903366089, + "num_tokens": 9800903.0, + "step": 1094 + }, + { + "epoch": 0.8320668693009119, + "grad_norm": 1.969549536705017, + "learning_rate": 4.328146173888528e-06, + "loss": 0.45801427960395813, + "mean_token_accuracy": 0.8334714770317078, + "num_tokens": 9808719.0, + "step": 1095 + }, + { + "epoch": 0.8328267477203647, + "grad_norm": 1.4565571546554565, + "learning_rate": 4.32671694464097e-06, + "loss": 0.34864288568496704, + "mean_token_accuracy": 0.8689061999320984, + "num_tokens": 9818262.0, + "step": 1096 + }, + { + "epoch": 0.8335866261398176, + "grad_norm": 1.2163832187652588, + "learning_rate": 4.3252864333331424e-06, + "loss": 0.37953704595565796, + "mean_token_accuracy": 0.866554856300354, + "num_tokens": 9833942.0, + "step": 1097 + }, + { + "epoch": 0.8343465045592705, + "grad_norm": 1.6112010478973389, + "learning_rate": 4.323854640969033e-06, + "loss": 0.5442801713943481, + "mean_token_accuracy": 0.8190416097640991, + "num_tokens": 9844765.0, + "step": 1098 + }, + { + "epoch": 0.8351063829787234, + "grad_norm": 1.8190315961837769, + "learning_rate": 4.322421568553529e-06, + "loss": 0.48271381855010986, + "mean_token_accuracy": 0.8203652501106262, + "num_tokens": 9852625.0, + "step": 1099 + }, + { + "epoch": 0.8358662613981763, + "grad_norm": 2.7897756099700928, + "learning_rate": 4.320987217092416e-06, + "loss": 0.4086323380470276, + "mean_token_accuracy": 0.8504934310913086, + "num_tokens": 9856888.0, + "step": 1100 + }, + { + "epoch": 0.8366261398176292, + "grad_norm": 1.7035977840423584, + "learning_rate": 4.319551587592377e-06, + "loss": 0.6325064301490784, + "mean_token_accuracy": 0.788190484046936, + "num_tokens": 9869419.0, + "step": 1101 + }, + { + "epoch": 0.837386018237082, + "grad_norm": 2.609731912612915, + "learning_rate": 4.318114681060989e-06, + "loss": 0.519314706325531, + "mean_token_accuracy": 0.8469992280006409, + "num_tokens": 9874553.0, + "step": 1102 + }, + { + "epoch": 0.8381458966565349, + "grad_norm": 1.2519766092300415, + "learning_rate": 4.316676498506735e-06, + "loss": 0.3566005825996399, + "mean_token_accuracy": 0.8588439226150513, + "num_tokens": 9886498.0, + "step": 1103 + }, + { + "epoch": 0.8389057750759878, + "grad_norm": 1.430892825126648, + "learning_rate": 4.3152370409389795e-06, + "loss": 0.5250182747840881, + "mean_token_accuracy": 0.8164948225021362, + "num_tokens": 9900256.0, + "step": 1104 + }, + { + "epoch": 0.8396656534954408, + "grad_norm": 3.1245436668395996, + "learning_rate": 4.3137963093679945e-06, + "loss": 0.3173971176147461, + "mean_token_accuracy": 0.8835347890853882, + "num_tokens": 9903899.0, + "step": 1105 + }, + { + "epoch": 0.8404255319148937, + "grad_norm": 3.131812572479248, + "learning_rate": 4.3123543048049395e-06, + "loss": 0.6567763090133667, + "mean_token_accuracy": 0.8233605027198792, + "num_tokens": 9908798.0, + "step": 1106 + }, + { + "epoch": 0.8411854103343465, + "grad_norm": 1.3551725149154663, + "learning_rate": 4.310911028261867e-06, + "loss": 0.3993729054927826, + "mean_token_accuracy": 0.8529655933380127, + "num_tokens": 9922577.0, + "step": 1107 + }, + { + "epoch": 0.8419452887537994, + "grad_norm": 2.572533130645752, + "learning_rate": 4.309466480751726e-06, + "loss": 0.40906503796577454, + "mean_token_accuracy": 0.8630726933479309, + "num_tokens": 9926890.0, + "step": 1108 + }, + { + "epoch": 0.8427051671732523, + "grad_norm": 1.9146469831466675, + "learning_rate": 4.308020663288356e-06, + "loss": 0.48423194885253906, + "mean_token_accuracy": 0.8370280861854553, + "num_tokens": 9934293.0, + "step": 1109 + }, + { + "epoch": 0.8434650455927052, + "grad_norm": 1.6178001165390015, + "learning_rate": 4.306573576886485e-06, + "loss": 0.4262213408946991, + "mean_token_accuracy": 0.839401125907898, + "num_tokens": 9944513.0, + "step": 1110 + }, + { + "epoch": 0.8442249240121581, + "grad_norm": 2.4444572925567627, + "learning_rate": 4.305125222561736e-06, + "loss": 0.5199950933456421, + "mean_token_accuracy": 0.8507720232009888, + "num_tokens": 9949512.0, + "step": 1111 + }, + { + "epoch": 0.8449848024316109, + "grad_norm": 1.7983134984970093, + "learning_rate": 4.303675601330618e-06, + "loss": 0.36155956983566284, + "mean_token_accuracy": 0.8568712472915649, + "num_tokens": 9956402.0, + "step": 1112 + }, + { + "epoch": 0.8457446808510638, + "grad_norm": 2.391096353530884, + "learning_rate": 4.302224714210532e-06, + "loss": 0.5391949415206909, + "mean_token_accuracy": 0.8183057308197021, + "num_tokens": 9961606.0, + "step": 1113 + }, + { + "epoch": 0.8465045592705167, + "grad_norm": 1.8520214557647705, + "learning_rate": 4.3007725622197675e-06, + "loss": 0.5758882761001587, + "mean_token_accuracy": 0.7924330234527588, + "num_tokens": 9971473.0, + "step": 1114 + }, + { + "epoch": 0.8472644376899696, + "grad_norm": 2.436640739440918, + "learning_rate": 4.2993191463775e-06, + "loss": 0.3837985396385193, + "mean_token_accuracy": 0.8620110750198364, + "num_tokens": 9976333.0, + "step": 1115 + }, + { + "epoch": 0.8480243161094225, + "grad_norm": 1.7287120819091797, + "learning_rate": 4.29786446770379e-06, + "loss": 0.40066856145858765, + "mean_token_accuracy": 0.8618333339691162, + "num_tokens": 9985617.0, + "step": 1116 + }, + { + "epoch": 0.8487841945288754, + "grad_norm": 2.0310518741607666, + "learning_rate": 4.296408527219592e-06, + "loss": 0.5465943217277527, + "mean_token_accuracy": 0.812044620513916, + "num_tokens": 9995363.0, + "step": 1117 + }, + { + "epoch": 0.8495440729483282, + "grad_norm": 1.4858589172363281, + "learning_rate": 4.294951325946737e-06, + "loss": 0.45840176939964294, + "mean_token_accuracy": 0.8432979583740234, + "num_tokens": 10006400.0, + "step": 1118 + }, + { + "epoch": 0.8503039513677811, + "grad_norm": 1.6153514385223389, + "learning_rate": 4.293492864907947e-06, + "loss": 0.5225611925125122, + "mean_token_accuracy": 0.8180211186408997, + "num_tokens": 10018352.0, + "step": 1119 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.1178412437438965, + "learning_rate": 4.2920331451268246e-06, + "loss": 0.5580621361732483, + "mean_token_accuracy": 0.8211709260940552, + "num_tokens": 10025614.0, + "step": 1120 + }, + { + "epoch": 0.851823708206687, + "grad_norm": 2.036839246749878, + "learning_rate": 4.2905721676278585e-06, + "loss": 0.4658433198928833, + "mean_token_accuracy": 0.8380423784255981, + "num_tokens": 10032489.0, + "step": 1121 + }, + { + "epoch": 0.8525835866261399, + "grad_norm": 2.0056262016296387, + "learning_rate": 4.28910993343642e-06, + "loss": 0.47023308277130127, + "mean_token_accuracy": 0.8340359926223755, + "num_tokens": 10040050.0, + "step": 1122 + }, + { + "epoch": 0.8533434650455927, + "grad_norm": 2.540024518966675, + "learning_rate": 4.2876464435787576e-06, + "loss": 0.502303957939148, + "mean_token_accuracy": 0.8288739919662476, + "num_tokens": 10045042.0, + "step": 1123 + }, + { + "epoch": 0.8541033434650456, + "grad_norm": 1.7894693613052368, + "learning_rate": 4.286181699082008e-06, + "loss": 0.4732973575592041, + "mean_token_accuracy": 0.8340568542480469, + "num_tokens": 10054424.0, + "step": 1124 + }, + { + "epoch": 0.8548632218844985, + "grad_norm": 1.5601223707199097, + "learning_rate": 4.284715700974186e-06, + "loss": 0.472471684217453, + "mean_token_accuracy": 0.8274722695350647, + "num_tokens": 10065523.0, + "step": 1125 + }, + { + "epoch": 0.8556231003039514, + "grad_norm": 1.7326055765151978, + "learning_rate": 4.283248450284182e-06, + "loss": 0.5924872159957886, + "mean_token_accuracy": 0.7943467497825623, + "num_tokens": 10076839.0, + "step": 1126 + }, + { + "epoch": 0.8563829787234043, + "grad_norm": 1.5165479183197021, + "learning_rate": 4.281779948041772e-06, + "loss": 0.44768425822257996, + "mean_token_accuracy": 0.8394696712493896, + "num_tokens": 10088168.0, + "step": 1127 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.5448920726776123, + "learning_rate": 4.280310195277606e-06, + "loss": 0.4458175003528595, + "mean_token_accuracy": 0.835773229598999, + "num_tokens": 10100306.0, + "step": 1128 + }, + { + "epoch": 0.85790273556231, + "grad_norm": 1.6311609745025635, + "learning_rate": 4.278839193023214e-06, + "loss": 0.4158072769641876, + "mean_token_accuracy": 0.8482539653778076, + "num_tokens": 10110581.0, + "step": 1129 + }, + { + "epoch": 0.8586626139817629, + "grad_norm": 1.6714754104614258, + "learning_rate": 4.277366942311001e-06, + "loss": 0.3686875104904175, + "mean_token_accuracy": 0.8681533336639404, + "num_tokens": 10118799.0, + "step": 1130 + }, + { + "epoch": 0.8594224924012158, + "grad_norm": 2.1604413986206055, + "learning_rate": 4.2758934441742494e-06, + "loss": 0.37267982959747314, + "mean_token_accuracy": 0.8520427346229553, + "num_tokens": 10124734.0, + "step": 1131 + }, + { + "epoch": 0.8601823708206687, + "grad_norm": 2.123013973236084, + "learning_rate": 4.274418699647117e-06, + "loss": 0.49963313341140747, + "mean_token_accuracy": 0.8248758912086487, + "num_tokens": 10131965.0, + "step": 1132 + }, + { + "epoch": 0.8609422492401215, + "grad_norm": 1.4308786392211914, + "learning_rate": 4.272942709764638e-06, + "loss": 0.48666873574256897, + "mean_token_accuracy": 0.8304717540740967, + "num_tokens": 10145164.0, + "step": 1133 + }, + { + "epoch": 0.8617021276595744, + "grad_norm": 1.7952618598937988, + "learning_rate": 4.271465475562716e-06, + "loss": 0.5536223649978638, + "mean_token_accuracy": 0.8093959093093872, + "num_tokens": 10154083.0, + "step": 1134 + }, + { + "epoch": 0.8624620060790273, + "grad_norm": 2.0622456073760986, + "learning_rate": 4.269986998078132e-06, + "loss": 0.5173629522323608, + "mean_token_accuracy": 0.8285619020462036, + "num_tokens": 10161889.0, + "step": 1135 + }, + { + "epoch": 0.8632218844984803, + "grad_norm": 2.0707509517669678, + "learning_rate": 4.268507278348539e-06, + "loss": 0.5871608257293701, + "mean_token_accuracy": 0.7827386856079102, + "num_tokens": 10170726.0, + "step": 1136 + }, + { + "epoch": 0.8639817629179332, + "grad_norm": 2.054368257522583, + "learning_rate": 4.2670263174124615e-06, + "loss": 0.5788969993591309, + "mean_token_accuracy": 0.7967237234115601, + "num_tokens": 10178474.0, + "step": 1137 + }, + { + "epoch": 0.8647416413373861, + "grad_norm": 1.901846170425415, + "learning_rate": 4.265544116309294e-06, + "loss": 0.5405587553977966, + "mean_token_accuracy": 0.8151819705963135, + "num_tokens": 10187013.0, + "step": 1138 + }, + { + "epoch": 0.8655015197568389, + "grad_norm": 2.901285409927368, + "learning_rate": 4.264060676079302e-06, + "loss": 0.44101861119270325, + "mean_token_accuracy": 0.8433429002761841, + "num_tokens": 10191517.0, + "step": 1139 + }, + { + "epoch": 0.8662613981762918, + "grad_norm": 2.4168388843536377, + "learning_rate": 4.262575997763622e-06, + "loss": 0.4686204195022583, + "mean_token_accuracy": 0.8505309820175171, + "num_tokens": 10196948.0, + "step": 1140 + }, + { + "epoch": 0.8670212765957447, + "grad_norm": 1.9588396549224854, + "learning_rate": 4.2610900824042575e-06, + "loss": 0.47056013345718384, + "mean_token_accuracy": 0.8280024528503418, + "num_tokens": 10204292.0, + "step": 1141 + }, + { + "epoch": 0.8677811550151976, + "grad_norm": 2.569150924682617, + "learning_rate": 4.2596029310440826e-06, + "loss": 0.573108434677124, + "mean_token_accuracy": 0.8108246326446533, + "num_tokens": 10209571.0, + "step": 1142 + }, + { + "epoch": 0.8685410334346505, + "grad_norm": 2.038032293319702, + "learning_rate": 4.258114544726835e-06, + "loss": 0.40545332431793213, + "mean_token_accuracy": 0.8611703515052795, + "num_tokens": 10215716.0, + "step": 1143 + }, + { + "epoch": 0.8693009118541033, + "grad_norm": 1.9884231090545654, + "learning_rate": 4.256624924497124e-06, + "loss": 0.40085992217063904, + "mean_token_accuracy": 0.8615031242370605, + "num_tokens": 10222775.0, + "step": 1144 + }, + { + "epoch": 0.8700607902735562, + "grad_norm": 1.912842035293579, + "learning_rate": 4.25513407140042e-06, + "loss": 0.41022324562072754, + "mean_token_accuracy": 0.8459607362747192, + "num_tokens": 10229589.0, + "step": 1145 + }, + { + "epoch": 0.8708206686930091, + "grad_norm": 1.9190576076507568, + "learning_rate": 4.253641986483063e-06, + "loss": 0.5541447401046753, + "mean_token_accuracy": 0.8256468772888184, + "num_tokens": 10240633.0, + "step": 1146 + }, + { + "epoch": 0.871580547112462, + "grad_norm": 1.3742294311523438, + "learning_rate": 4.2521486707922545e-06, + "loss": 0.3680543899536133, + "mean_token_accuracy": 0.8654477596282959, + "num_tokens": 10251252.0, + "step": 1147 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 1.4438525438308716, + "learning_rate": 4.250654125376062e-06, + "loss": 0.45830875635147095, + "mean_token_accuracy": 0.8433834314346313, + "num_tokens": 10263980.0, + "step": 1148 + }, + { + "epoch": 0.8731003039513677, + "grad_norm": 2.1273653507232666, + "learning_rate": 4.249158351283414e-06, + "loss": 0.4129376709461212, + "mean_token_accuracy": 0.861556351184845, + "num_tokens": 10270426.0, + "step": 1149 + }, + { + "epoch": 0.8738601823708206, + "grad_norm": 2.598440647125244, + "learning_rate": 4.247661349564103e-06, + "loss": 0.418030709028244, + "mean_token_accuracy": 0.86553955078125, + "num_tokens": 10275493.0, + "step": 1150 + }, + { + "epoch": 0.8746200607902735, + "grad_norm": 1.6852490901947021, + "learning_rate": 4.246163121268782e-06, + "loss": 0.6403408050537109, + "mean_token_accuracy": 0.7966094017028809, + "num_tokens": 10287989.0, + "step": 1151 + }, + { + "epoch": 0.8753799392097265, + "grad_norm": 2.5013794898986816, + "learning_rate": 4.244663667448965e-06, + "loss": 0.49922505021095276, + "mean_token_accuracy": 0.8318735361099243, + "num_tokens": 10293360.0, + "step": 1152 + }, + { + "epoch": 0.8761398176291794, + "grad_norm": 1.2022709846496582, + "learning_rate": 4.243162989157027e-06, + "loss": 0.4414965510368347, + "mean_token_accuracy": 0.8338693380355835, + "num_tokens": 10310558.0, + "step": 1153 + }, + { + "epoch": 0.8768996960486323, + "grad_norm": 1.9903281927108765, + "learning_rate": 4.241661087446202e-06, + "loss": 0.4277610778808594, + "mean_token_accuracy": 0.8560749292373657, + "num_tokens": 10316983.0, + "step": 1154 + }, + { + "epoch": 0.8776595744680851, + "grad_norm": 2.104923725128174, + "learning_rate": 4.240157963370583e-06, + "loss": 0.44431713223457336, + "mean_token_accuracy": 0.8785282969474792, + "num_tokens": 10323294.0, + "step": 1155 + }, + { + "epoch": 0.878419452887538, + "grad_norm": 2.8364813327789307, + "learning_rate": 4.2386536179851175e-06, + "loss": 0.49948397278785706, + "mean_token_accuracy": 0.8305255174636841, + "num_tokens": 10327662.0, + "step": 1156 + }, + { + "epoch": 0.8791793313069909, + "grad_norm": 1.9493682384490967, + "learning_rate": 4.2371480523456156e-06, + "loss": 0.45867404341697693, + "mean_token_accuracy": 0.8373264074325562, + "num_tokens": 10335699.0, + "step": 1157 + }, + { + "epoch": 0.8799392097264438, + "grad_norm": 2.268616199493408, + "learning_rate": 4.235641267508741e-06, + "loss": 0.4547857940196991, + "mean_token_accuracy": 0.8252766132354736, + "num_tokens": 10342464.0, + "step": 1158 + }, + { + "epoch": 0.8806990881458967, + "grad_norm": 2.1334283351898193, + "learning_rate": 4.234133264532012e-06, + "loss": 0.39503124356269836, + "mean_token_accuracy": 0.8648351430892944, + "num_tokens": 10347514.0, + "step": 1159 + }, + { + "epoch": 0.8814589665653495, + "grad_norm": 1.2775357961654663, + "learning_rate": 4.232624044473805e-06, + "loss": 0.39945733547210693, + "mean_token_accuracy": 0.8369829654693604, + "num_tokens": 10363316.0, + "step": 1160 + }, + { + "epoch": 0.8822188449848024, + "grad_norm": 2.458413600921631, + "learning_rate": 4.231113608393348e-06, + "loss": 0.5020045638084412, + "mean_token_accuracy": 0.8295938968658447, + "num_tokens": 10368401.0, + "step": 1161 + }, + { + "epoch": 0.8829787234042553, + "grad_norm": 1.7464948892593384, + "learning_rate": 4.229601957350722e-06, + "loss": 0.5335392951965332, + "mean_token_accuracy": 0.8134858012199402, + "num_tokens": 10378337.0, + "step": 1162 + }, + { + "epoch": 0.8837386018237082, + "grad_norm": 3.1152119636535645, + "learning_rate": 4.228089092406863e-06, + "loss": 0.4811682105064392, + "mean_token_accuracy": 0.8460187315940857, + "num_tokens": 10382362.0, + "step": 1163 + }, + { + "epoch": 0.8844984802431611, + "grad_norm": 2.190847158432007, + "learning_rate": 4.226575014623557e-06, + "loss": 0.4428049921989441, + "mean_token_accuracy": 0.8382467031478882, + "num_tokens": 10388211.0, + "step": 1164 + }, + { + "epoch": 0.8852583586626139, + "grad_norm": 1.860153079032898, + "learning_rate": 4.225059725063444e-06, + "loss": 0.5265918970108032, + "mean_token_accuracy": 0.8181334733963013, + "num_tokens": 10398873.0, + "step": 1165 + }, + { + "epoch": 0.8860182370820668, + "grad_norm": 1.3372713327407837, + "learning_rate": 4.22354322479001e-06, + "loss": 0.43202850222587585, + "mean_token_accuracy": 0.8432420492172241, + "num_tokens": 10413158.0, + "step": 1166 + }, + { + "epoch": 0.8867781155015197, + "grad_norm": 1.3653379678726196, + "learning_rate": 4.222025514867596e-06, + "loss": 0.43780991435050964, + "mean_token_accuracy": 0.8441485166549683, + "num_tokens": 10428137.0, + "step": 1167 + }, + { + "epoch": 0.8875379939209727, + "grad_norm": 3.0230672359466553, + "learning_rate": 4.220506596361387e-06, + "loss": 0.6039337515830994, + "mean_token_accuracy": 0.8274872303009033, + "num_tokens": 10432586.0, + "step": 1168 + }, + { + "epoch": 0.8882978723404256, + "grad_norm": 2.2180392742156982, + "learning_rate": 4.218986470337419e-06, + "loss": 0.5453792810440063, + "mean_token_accuracy": 0.8127184510231018, + "num_tokens": 10439471.0, + "step": 1169 + }, + { + "epoch": 0.8890577507598785, + "grad_norm": 1.8519103527069092, + "learning_rate": 4.217465137862575e-06, + "loss": 0.5145469903945923, + "mean_token_accuracy": 0.8178654909133911, + "num_tokens": 10450471.0, + "step": 1170 + }, + { + "epoch": 0.8898176291793313, + "grad_norm": 2.034008026123047, + "learning_rate": 4.215942600004586e-06, + "loss": 0.44061461091041565, + "mean_token_accuracy": 0.8572084307670593, + "num_tokens": 10457382.0, + "step": 1171 + }, + { + "epoch": 0.8905775075987842, + "grad_norm": 3.4304304122924805, + "learning_rate": 4.214418857832025e-06, + "loss": 0.44397830963134766, + "mean_token_accuracy": 0.842149019241333, + "num_tokens": 10460650.0, + "step": 1172 + }, + { + "epoch": 0.8913373860182371, + "grad_norm": 1.9021750688552856, + "learning_rate": 4.212893912414316e-06, + "loss": 0.3769867420196533, + "mean_token_accuracy": 0.8806171417236328, + "num_tokens": 10468214.0, + "step": 1173 + }, + { + "epoch": 0.89209726443769, + "grad_norm": 1.9704062938690186, + "learning_rate": 4.211367764821722e-06, + "loss": 0.5501819849014282, + "mean_token_accuracy": 0.8176811337471008, + "num_tokens": 10476739.0, + "step": 1174 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 1.4350415468215942, + "learning_rate": 4.209840416125353e-06, + "loss": 0.41897401213645935, + "mean_token_accuracy": 0.8498011827468872, + "num_tokens": 10491769.0, + "step": 1175 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 3.8237783908843994, + "learning_rate": 4.208311867397162e-06, + "loss": 0.5296977162361145, + "mean_token_accuracy": 0.8168715834617615, + "num_tokens": 10494958.0, + "step": 1176 + }, + { + "epoch": 0.8943768996960486, + "grad_norm": 2.04784893989563, + "learning_rate": 4.206782119709942e-06, + "loss": 0.476105272769928, + "mean_token_accuracy": 0.834011435508728, + "num_tokens": 10502077.0, + "step": 1177 + }, + { + "epoch": 0.8951367781155015, + "grad_norm": 1.8839610815048218, + "learning_rate": 4.205251174137329e-06, + "loss": 0.49628815054893494, + "mean_token_accuracy": 0.8212119936943054, + "num_tokens": 10510077.0, + "step": 1178 + }, + { + "epoch": 0.8958966565349544, + "grad_norm": 1.2100634574890137, + "learning_rate": 4.2037190317538e-06, + "loss": 0.4931519329547882, + "mean_token_accuracy": 0.8170043230056763, + "num_tokens": 10528373.0, + "step": 1179 + }, + { + "epoch": 0.8966565349544073, + "grad_norm": 1.884637713432312, + "learning_rate": 4.202185693634671e-06, + "loss": 0.4913347363471985, + "mean_token_accuracy": 0.8234949707984924, + "num_tokens": 10537108.0, + "step": 1180 + }, + { + "epoch": 0.8974164133738601, + "grad_norm": 1.5062434673309326, + "learning_rate": 4.200651160856099e-06, + "loss": 0.4160492420196533, + "mean_token_accuracy": 0.845937192440033, + "num_tokens": 10547577.0, + "step": 1181 + }, + { + "epoch": 0.898176291793313, + "grad_norm": 2.331169605255127, + "learning_rate": 4.1991154344950755e-06, + "loss": 0.6532632112503052, + "mean_token_accuracy": 0.7743191123008728, + "num_tokens": 10556328.0, + "step": 1182 + }, + { + "epoch": 0.898936170212766, + "grad_norm": 1.3538362979888916, + "learning_rate": 4.197578515629435e-06, + "loss": 0.4437566101551056, + "mean_token_accuracy": 0.8427901268005371, + "num_tokens": 10570026.0, + "step": 1183 + }, + { + "epoch": 0.8996960486322189, + "grad_norm": 2.3828957080841064, + "learning_rate": 4.196040405337846e-06, + "loss": 0.6185290217399597, + "mean_token_accuracy": 0.7969824075698853, + "num_tokens": 10576465.0, + "step": 1184 + }, + { + "epoch": 0.9004559270516718, + "grad_norm": 2.4759042263031006, + "learning_rate": 4.194501104699813e-06, + "loss": 0.46489226818084717, + "mean_token_accuracy": 0.8472316265106201, + "num_tokens": 10582034.0, + "step": 1185 + }, + { + "epoch": 0.9012158054711246, + "grad_norm": 1.9215164184570312, + "learning_rate": 4.192960614795676e-06, + "loss": 0.48001551628112793, + "mean_token_accuracy": 0.8371596336364746, + "num_tokens": 10590556.0, + "step": 1186 + }, + { + "epoch": 0.9019756838905775, + "grad_norm": 2.2717080116271973, + "learning_rate": 4.19141893670661e-06, + "loss": 0.40083563327789307, + "mean_token_accuracy": 0.8464195728302002, + "num_tokens": 10595661.0, + "step": 1187 + }, + { + "epoch": 0.9027355623100304, + "grad_norm": 2.187122344970703, + "learning_rate": 4.189876071514624e-06, + "loss": 0.4942901134490967, + "mean_token_accuracy": 0.8186990022659302, + "num_tokens": 10603366.0, + "step": 1188 + }, + { + "epoch": 0.9034954407294833, + "grad_norm": 1.542414665222168, + "learning_rate": 4.188332020302561e-06, + "loss": 0.4731982946395874, + "mean_token_accuracy": 0.8487229347229004, + "num_tokens": 10616203.0, + "step": 1189 + }, + { + "epoch": 0.9042553191489362, + "grad_norm": 0.9957579970359802, + "learning_rate": 4.186786784154096e-06, + "loss": 0.33211836218833923, + "mean_token_accuracy": 0.870644748210907, + "num_tokens": 10633294.0, + "step": 1190 + }, + { + "epoch": 0.9050151975683891, + "grad_norm": 2.593867540359497, + "learning_rate": 4.1852403641537344e-06, + "loss": 0.6825464963912964, + "mean_token_accuracy": 0.7716869115829468, + "num_tokens": 10640615.0, + "step": 1191 + }, + { + "epoch": 0.9057750759878419, + "grad_norm": 2.0424516201019287, + "learning_rate": 4.183692761386813e-06, + "loss": 0.5672709941864014, + "mean_token_accuracy": 0.7973801493644714, + "num_tokens": 10649845.0, + "step": 1192 + }, + { + "epoch": 0.9065349544072948, + "grad_norm": 1.429018259048462, + "learning_rate": 4.1821439769395e-06, + "loss": 0.5427846908569336, + "mean_token_accuracy": 0.8200292587280273, + "num_tokens": 10665898.0, + "step": 1193 + }, + { + "epoch": 0.9072948328267477, + "grad_norm": 1.9764264822006226, + "learning_rate": 4.180594011898791e-06, + "loss": 0.4784567356109619, + "mean_token_accuracy": 0.82924485206604, + "num_tokens": 10673595.0, + "step": 1194 + }, + { + "epoch": 0.9080547112462006, + "grad_norm": 1.4004309177398682, + "learning_rate": 4.1790428673525104e-06, + "loss": 0.4791432023048401, + "mean_token_accuracy": 0.8334879875183105, + "num_tokens": 10687892.0, + "step": 1195 + }, + { + "epoch": 0.9088145896656535, + "grad_norm": 2.2207727432250977, + "learning_rate": 4.177490544389313e-06, + "loss": 0.5089365243911743, + "mean_token_accuracy": 0.8270776271820068, + "num_tokens": 10694911.0, + "step": 1196 + }, + { + "epoch": 0.9095744680851063, + "grad_norm": 2.2890450954437256, + "learning_rate": 4.175937044098678e-06, + "loss": 0.5152267813682556, + "mean_token_accuracy": 0.8527299165725708, + "num_tokens": 10700512.0, + "step": 1197 + }, + { + "epoch": 0.9103343465045592, + "grad_norm": 1.7938050031661987, + "learning_rate": 4.1743823675709115e-06, + "loss": 0.3507300615310669, + "mean_token_accuracy": 0.8694599866867065, + "num_tokens": 10707953.0, + "step": 1198 + }, + { + "epoch": 0.9110942249240122, + "grad_norm": 1.4368808269500732, + "learning_rate": 4.172826515897146e-06, + "loss": 0.407418429851532, + "mean_token_accuracy": 0.8432893753051758, + "num_tokens": 10717485.0, + "step": 1199 + }, + { + "epoch": 0.9118541033434651, + "grad_norm": 1.735339879989624, + "learning_rate": 4.171269490169337e-06, + "loss": 0.46996885538101196, + "mean_token_accuracy": 0.8331948518753052, + "num_tokens": 10726160.0, + "step": 1200 + }, + { + "epoch": 0.912613981762918, + "grad_norm": 1.7859221696853638, + "learning_rate": 4.1697112914802665e-06, + "loss": 0.5325199365615845, + "mean_token_accuracy": 0.8179605007171631, + "num_tokens": 10736284.0, + "step": 1201 + }, + { + "epoch": 0.9133738601823708, + "grad_norm": 2.6394896507263184, + "learning_rate": 4.168151920923536e-06, + "loss": 0.4039744734764099, + "mean_token_accuracy": 0.8545527458190918, + "num_tokens": 10740673.0, + "step": 1202 + }, + { + "epoch": 0.9141337386018237, + "grad_norm": 1.910988211631775, + "learning_rate": 4.1665913795935755e-06, + "loss": 0.5190291404724121, + "mean_token_accuracy": 0.8203921318054199, + "num_tokens": 10751946.0, + "step": 1203 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 3.0006964206695557, + "learning_rate": 4.16502966858563e-06, + "loss": 0.5856777429580688, + "mean_token_accuracy": 0.8061224222183228, + "num_tokens": 10756795.0, + "step": 1204 + }, + { + "epoch": 0.9156534954407295, + "grad_norm": 1.7396167516708374, + "learning_rate": 4.163466788995768e-06, + "loss": 0.54935222864151, + "mean_token_accuracy": 0.8052443265914917, + "num_tokens": 10767202.0, + "step": 1205 + }, + { + "epoch": 0.9164133738601824, + "grad_norm": 2.143735885620117, + "learning_rate": 4.161902741920881e-06, + "loss": 0.5020298361778259, + "mean_token_accuracy": 0.8249630928039551, + "num_tokens": 10774329.0, + "step": 1206 + }, + { + "epoch": 0.9171732522796353, + "grad_norm": 2.8871893882751465, + "learning_rate": 4.160337528458676e-06, + "loss": 0.5154489278793335, + "mean_token_accuracy": 0.8276848793029785, + "num_tokens": 10778929.0, + "step": 1207 + }, + { + "epoch": 0.9179331306990881, + "grad_norm": 1.4642788171768188, + "learning_rate": 4.15877114970768e-06, + "loss": 0.5033774375915527, + "mean_token_accuracy": 0.8296241164207458, + "num_tokens": 10790928.0, + "step": 1208 + }, + { + "epoch": 0.918693009118541, + "grad_norm": 1.8313497304916382, + "learning_rate": 4.1572036067672386e-06, + "loss": 0.5674909353256226, + "mean_token_accuracy": 0.7975562214851379, + "num_tokens": 10801372.0, + "step": 1209 + }, + { + "epoch": 0.9194528875379939, + "grad_norm": 2.005958080291748, + "learning_rate": 4.155634900737513e-06, + "loss": 0.5557019114494324, + "mean_token_accuracy": 0.8141391277313232, + "num_tokens": 10809150.0, + "step": 1210 + }, + { + "epoch": 0.9202127659574468, + "grad_norm": 2.333519697189331, + "learning_rate": 4.154065032719482e-06, + "loss": 0.6990420818328857, + "mean_token_accuracy": 0.7565394043922424, + "num_tokens": 10816612.0, + "step": 1211 + }, + { + "epoch": 0.9209726443768997, + "grad_norm": 1.4472655057907104, + "learning_rate": 4.152494003814939e-06, + "loss": 0.541398286819458, + "mean_token_accuracy": 0.8027358055114746, + "num_tokens": 10833840.0, + "step": 1212 + }, + { + "epoch": 0.9217325227963525, + "grad_norm": 1.6183619499206543, + "learning_rate": 4.150921815126493e-06, + "loss": 0.6096762418746948, + "mean_token_accuracy": 0.7994354963302612, + "num_tokens": 10846367.0, + "step": 1213 + }, + { + "epoch": 0.9224924012158054, + "grad_norm": 2.614919900894165, + "learning_rate": 4.149348467757566e-06, + "loss": 0.41846764087677, + "mean_token_accuracy": 0.8555068969726562, + "num_tokens": 10850836.0, + "step": 1214 + }, + { + "epoch": 0.9232522796352584, + "grad_norm": 1.4419831037521362, + "learning_rate": 4.147773962812393e-06, + "loss": 0.4139535427093506, + "mean_token_accuracy": 0.845671534538269, + "num_tokens": 10864228.0, + "step": 1215 + }, + { + "epoch": 0.9240121580547113, + "grad_norm": 2.3868865966796875, + "learning_rate": 4.146198301396025e-06, + "loss": 0.3357275128364563, + "mean_token_accuracy": 0.8829520344734192, + "num_tokens": 10868920.0, + "step": 1216 + }, + { + "epoch": 0.9247720364741642, + "grad_norm": 1.7685474157333374, + "learning_rate": 4.14462148461432e-06, + "loss": 0.45333072543144226, + "mean_token_accuracy": 0.8505891561508179, + "num_tokens": 10877286.0, + "step": 1217 + }, + { + "epoch": 0.925531914893617, + "grad_norm": 1.7627625465393066, + "learning_rate": 4.143043513573949e-06, + "loss": 0.5028705596923828, + "mean_token_accuracy": 0.825471043586731, + "num_tokens": 10887047.0, + "step": 1218 + }, + { + "epoch": 0.9262917933130699, + "grad_norm": 1.3168725967407227, + "learning_rate": 4.141464389382392e-06, + "loss": 0.5494637489318848, + "mean_token_accuracy": 0.8121747970581055, + "num_tokens": 10903599.0, + "step": 1219 + }, + { + "epoch": 0.9270516717325228, + "grad_norm": 2.5180399417877197, + "learning_rate": 4.13988411314794e-06, + "loss": 0.6134277582168579, + "mean_token_accuracy": 0.7983006834983826, + "num_tokens": 10909791.0, + "step": 1220 + }, + { + "epoch": 0.9278115501519757, + "grad_norm": 1.1889166831970215, + "learning_rate": 4.13830268597969e-06, + "loss": 0.36713096499443054, + "mean_token_accuracy": 0.8416121006011963, + "num_tokens": 10925794.0, + "step": 1221 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 2.142422676086426, + "learning_rate": 4.136720108987552e-06, + "loss": 0.4427933096885681, + "mean_token_accuracy": 0.8427745699882507, + "num_tokens": 10931622.0, + "step": 1222 + }, + { + "epoch": 0.9293313069908815, + "grad_norm": 1.908564567565918, + "learning_rate": 4.1351363832822364e-06, + "loss": 0.5088109374046326, + "mean_token_accuracy": 0.8309272527694702, + "num_tokens": 10940843.0, + "step": 1223 + }, + { + "epoch": 0.9300911854103343, + "grad_norm": 1.2862322330474854, + "learning_rate": 4.133551509975264e-06, + "loss": 0.3963761329650879, + "mean_token_accuracy": 0.8602159023284912, + "num_tokens": 10954481.0, + "step": 1224 + }, + { + "epoch": 0.9308510638297872, + "grad_norm": 1.5876200199127197, + "learning_rate": 4.13196549017896e-06, + "loss": 0.4311184287071228, + "mean_token_accuracy": 0.8460899591445923, + "num_tokens": 10963501.0, + "step": 1225 + }, + { + "epoch": 0.9316109422492401, + "grad_norm": 2.459878444671631, + "learning_rate": 4.130378325006453e-06, + "loss": 0.5016295313835144, + "mean_token_accuracy": 0.8125218152999878, + "num_tokens": 10968850.0, + "step": 1226 + }, + { + "epoch": 0.932370820668693, + "grad_norm": 2.059718370437622, + "learning_rate": 4.128790015571679e-06, + "loss": 0.48982277512550354, + "mean_token_accuracy": 0.8327049016952515, + "num_tokens": 10976642.0, + "step": 1227 + }, + { + "epoch": 0.9331306990881459, + "grad_norm": 1.3719185590744019, + "learning_rate": 4.127200562989372e-06, + "loss": 0.38778752088546753, + "mean_token_accuracy": 0.8623501062393188, + "num_tokens": 10988703.0, + "step": 1228 + }, + { + "epoch": 0.9338905775075987, + "grad_norm": 1.302140712738037, + "learning_rate": 4.125609968375073e-06, + "loss": 0.4887842535972595, + "mean_token_accuracy": 0.8322232961654663, + "num_tokens": 11005981.0, + "step": 1229 + }, + { + "epoch": 0.9346504559270516, + "grad_norm": 1.819624423980713, + "learning_rate": 4.12401823284512e-06, + "loss": 0.49825209379196167, + "mean_token_accuracy": 0.8278916478157043, + "num_tokens": 11014145.0, + "step": 1230 + }, + { + "epoch": 0.9354103343465046, + "grad_norm": 1.2762807607650757, + "learning_rate": 4.122425357516658e-06, + "loss": 0.433994323015213, + "mean_token_accuracy": 0.853028416633606, + "num_tokens": 11029232.0, + "step": 1231 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.2171671390533447, + "learning_rate": 4.1208313435076255e-06, + "loss": 0.38436949253082275, + "mean_token_accuracy": 0.8616260290145874, + "num_tokens": 11034743.0, + "step": 1232 + }, + { + "epoch": 0.9369300911854104, + "grad_norm": 1.355879545211792, + "learning_rate": 4.119236191936764e-06, + "loss": 0.5378084182739258, + "mean_token_accuracy": 0.8256701231002808, + "num_tokens": 11048149.0, + "step": 1233 + }, + { + "epoch": 0.9376899696048632, + "grad_norm": 2.66812801361084, + "learning_rate": 4.117639903923611e-06, + "loss": 0.5236451625823975, + "mean_token_accuracy": 0.8431973457336426, + "num_tokens": 11052295.0, + "step": 1234 + }, + { + "epoch": 0.9384498480243161, + "grad_norm": 1.5740545988082886, + "learning_rate": 4.116042480588505e-06, + "loss": 0.44322824478149414, + "mean_token_accuracy": 0.8436908721923828, + "num_tokens": 11062066.0, + "step": 1235 + }, + { + "epoch": 0.939209726443769, + "grad_norm": 1.230706810951233, + "learning_rate": 4.114443923052577e-06, + "loss": 0.3325323462486267, + "mean_token_accuracy": 0.8674666881561279, + "num_tokens": 11074300.0, + "step": 1236 + }, + { + "epoch": 0.9399696048632219, + "grad_norm": 1.9870070219039917, + "learning_rate": 4.112844232437757e-06, + "loss": 0.5711548328399658, + "mean_token_accuracy": 0.8081738948822021, + "num_tokens": 11082297.0, + "step": 1237 + }, + { + "epoch": 0.9407294832826748, + "grad_norm": 1.3020970821380615, + "learning_rate": 4.11124340986677e-06, + "loss": 0.4187922477722168, + "mean_token_accuracy": 0.8566171526908875, + "num_tokens": 11096810.0, + "step": 1238 + }, + { + "epoch": 0.9414893617021277, + "grad_norm": 2.1399197578430176, + "learning_rate": 4.109641456463135e-06, + "loss": 0.5293116569519043, + "mean_token_accuracy": 0.8176157474517822, + "num_tokens": 11102761.0, + "step": 1239 + }, + { + "epoch": 0.9422492401215805, + "grad_norm": 1.3503763675689697, + "learning_rate": 4.108038373351163e-06, + "loss": 0.4907652735710144, + "mean_token_accuracy": 0.8204987049102783, + "num_tokens": 11118480.0, + "step": 1240 + }, + { + "epoch": 0.9430091185410334, + "grad_norm": 1.9571399688720703, + "learning_rate": 4.106434161655962e-06, + "loss": 0.4709656536579132, + "mean_token_accuracy": 0.8371885418891907, + "num_tokens": 11126265.0, + "step": 1241 + }, + { + "epoch": 0.9437689969604863, + "grad_norm": 2.1277313232421875, + "learning_rate": 4.104828822503427e-06, + "loss": 0.4010283350944519, + "mean_token_accuracy": 0.8586333990097046, + "num_tokens": 11133022.0, + "step": 1242 + }, + { + "epoch": 0.9445288753799392, + "grad_norm": 1.6745036840438843, + "learning_rate": 4.103222357020248e-06, + "loss": 0.562545657157898, + "mean_token_accuracy": 0.8052060604095459, + "num_tokens": 11145255.0, + "step": 1243 + }, + { + "epoch": 0.9452887537993921, + "grad_norm": 2.3616299629211426, + "learning_rate": 4.101614766333904e-06, + "loss": 0.5878340601921082, + "mean_token_accuracy": 0.796745777130127, + "num_tokens": 11152020.0, + "step": 1244 + }, + { + "epoch": 0.9460486322188449, + "grad_norm": 1.6182078123092651, + "learning_rate": 4.100006051572664e-06, + "loss": 0.5357589721679688, + "mean_token_accuracy": 0.8089962005615234, + "num_tokens": 11163112.0, + "step": 1245 + }, + { + "epoch": 0.9468085106382979, + "grad_norm": 1.911770224571228, + "learning_rate": 4.098396213865587e-06, + "loss": 0.49805426597595215, + "mean_token_accuracy": 0.8289647102355957, + "num_tokens": 11171768.0, + "step": 1246 + }, + { + "epoch": 0.9475683890577508, + "grad_norm": 1.649155616760254, + "learning_rate": 4.096785254342518e-06, + "loss": 0.5756166577339172, + "mean_token_accuracy": 0.807680606842041, + "num_tokens": 11183527.0, + "step": 1247 + }, + { + "epoch": 0.9483282674772037, + "grad_norm": 1.8922761678695679, + "learning_rate": 4.095173174134091e-06, + "loss": 0.44688963890075684, + "mean_token_accuracy": 0.8375608921051025, + "num_tokens": 11191494.0, + "step": 1248 + }, + { + "epoch": 0.9490881458966566, + "grad_norm": 2.9044547080993652, + "learning_rate": 4.093559974371725e-06, + "loss": 0.48609739542007446, + "mean_token_accuracy": 0.8404892086982727, + "num_tokens": 11195837.0, + "step": 1249 + }, + { + "epoch": 0.9498480243161094, + "grad_norm": 2.287506580352783, + "learning_rate": 4.091945656187626e-06, + "loss": 0.5260225534439087, + "mean_token_accuracy": 0.8181945085525513, + "num_tokens": 11202174.0, + "step": 1250 + }, + { + "epoch": 0.9506079027355623, + "grad_norm": 1.7908886671066284, + "learning_rate": 4.090330220714785e-06, + "loss": 0.4207724928855896, + "mean_token_accuracy": 0.8616912364959717, + "num_tokens": 11209995.0, + "step": 1251 + }, + { + "epoch": 0.9513677811550152, + "grad_norm": 2.905418634414673, + "learning_rate": 4.0887136690869774e-06, + "loss": 0.4209241271018982, + "mean_token_accuracy": 0.8561323285102844, + "num_tokens": 11213799.0, + "step": 1252 + }, + { + "epoch": 0.9521276595744681, + "grad_norm": 2.814150333404541, + "learning_rate": 4.08709600243876e-06, + "loss": 0.36855608224868774, + "mean_token_accuracy": 0.8764539361000061, + "num_tokens": 11217643.0, + "step": 1253 + }, + { + "epoch": 0.952887537993921, + "grad_norm": 1.9385707378387451, + "learning_rate": 4.0854772219054735e-06, + "loss": 0.531031608581543, + "mean_token_accuracy": 0.80600905418396, + "num_tokens": 11225871.0, + "step": 1254 + }, + { + "epoch": 0.9536474164133738, + "grad_norm": 2.103058099746704, + "learning_rate": 4.083857328623243e-06, + "loss": 0.4576364755630493, + "mean_token_accuracy": 0.8447524905204773, + "num_tokens": 11231829.0, + "step": 1255 + }, + { + "epoch": 0.9544072948328267, + "grad_norm": 1.7518818378448486, + "learning_rate": 4.082236323728969e-06, + "loss": 0.5386767983436584, + "mean_token_accuracy": 0.8055596351623535, + "num_tokens": 11240977.0, + "step": 1256 + }, + { + "epoch": 0.9551671732522796, + "grad_norm": 1.8434966802597046, + "learning_rate": 4.0806142083603365e-06, + "loss": 0.5415925979614258, + "mean_token_accuracy": 0.809962272644043, + "num_tokens": 11249616.0, + "step": 1257 + }, + { + "epoch": 0.9559270516717325, + "grad_norm": 1.7341015338897705, + "learning_rate": 4.078990983655807e-06, + "loss": 0.4621101915836334, + "mean_token_accuracy": 0.8330386877059937, + "num_tokens": 11258616.0, + "step": 1258 + }, + { + "epoch": 0.9566869300911854, + "grad_norm": 1.8589727878570557, + "learning_rate": 4.077366650754624e-06, + "loss": 0.4031238555908203, + "mean_token_accuracy": 0.842434287071228, + "num_tokens": 11266006.0, + "step": 1259 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 1.657175898551941, + "learning_rate": 4.075741210796806e-06, + "loss": 0.41686388850212097, + "mean_token_accuracy": 0.8443650007247925, + "num_tokens": 11275601.0, + "step": 1260 + }, + { + "epoch": 0.9582066869300911, + "grad_norm": 2.4303717613220215, + "learning_rate": 4.07411466492315e-06, + "loss": 0.4554435610771179, + "mean_token_accuracy": 0.853043794631958, + "num_tokens": 11280650.0, + "step": 1261 + }, + { + "epoch": 0.958966565349544, + "grad_norm": 2.3653745651245117, + "learning_rate": 4.072487014275228e-06, + "loss": 0.4304995536804199, + "mean_token_accuracy": 0.8462260961532593, + "num_tokens": 11285637.0, + "step": 1262 + }, + { + "epoch": 0.959726443768997, + "grad_norm": 1.6689718961715698, + "learning_rate": 4.070858259995388e-06, + "loss": 0.5290807485580444, + "mean_token_accuracy": 0.8176917433738708, + "num_tokens": 11299110.0, + "step": 1263 + }, + { + "epoch": 0.9604863221884499, + "grad_norm": 2.103879451751709, + "learning_rate": 4.069228403226751e-06, + "loss": 0.4620879888534546, + "mean_token_accuracy": 0.835270345211029, + "num_tokens": 11305564.0, + "step": 1264 + }, + { + "epoch": 0.9612462006079028, + "grad_norm": 2.139012575149536, + "learning_rate": 4.067597445113216e-06, + "loss": 0.5143396258354187, + "mean_token_accuracy": 0.8191739320755005, + "num_tokens": 11311870.0, + "step": 1265 + }, + { + "epoch": 0.9620060790273556, + "grad_norm": 1.3971210718154907, + "learning_rate": 4.06596538679945e-06, + "loss": 0.472080260515213, + "mean_token_accuracy": 0.8321092128753662, + "num_tokens": 11323970.0, + "step": 1266 + }, + { + "epoch": 0.9627659574468085, + "grad_norm": 1.4965174198150635, + "learning_rate": 4.064332229430895e-06, + "loss": 0.359701007604599, + "mean_token_accuracy": 0.8903120160102844, + "num_tokens": 11333412.0, + "step": 1267 + }, + { + "epoch": 0.9635258358662614, + "grad_norm": 1.1898726224899292, + "learning_rate": 4.062697974153764e-06, + "loss": 0.3423798084259033, + "mean_token_accuracy": 0.8661491870880127, + "num_tokens": 11347657.0, + "step": 1268 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 1.4952168464660645, + "learning_rate": 4.06106262211504e-06, + "loss": 0.4214417338371277, + "mean_token_accuracy": 0.8362159729003906, + "num_tokens": 11357786.0, + "step": 1269 + }, + { + "epoch": 0.9650455927051672, + "grad_norm": 1.7949583530426025, + "learning_rate": 4.059426174462476e-06, + "loss": 0.59087735414505, + "mean_token_accuracy": 0.7965556979179382, + "num_tokens": 11370561.0, + "step": 1270 + }, + { + "epoch": 0.96580547112462, + "grad_norm": 1.8973214626312256, + "learning_rate": 4.057788632344594e-06, + "loss": 0.47525322437286377, + "mean_token_accuracy": 0.8317365050315857, + "num_tokens": 11378507.0, + "step": 1271 + }, + { + "epoch": 0.9665653495440729, + "grad_norm": 1.8665250539779663, + "learning_rate": 4.056149996910683e-06, + "loss": 0.3537125587463379, + "mean_token_accuracy": 0.8921569585800171, + "num_tokens": 11385186.0, + "step": 1272 + }, + { + "epoch": 0.9673252279635258, + "grad_norm": 1.5072317123413086, + "learning_rate": 4.054510269310803e-06, + "loss": 0.5145624876022339, + "mean_token_accuracy": 0.8265488147735596, + "num_tokens": 11397125.0, + "step": 1273 + }, + { + "epoch": 0.9680851063829787, + "grad_norm": 1.520525574684143, + "learning_rate": 4.052869450695776e-06, + "loss": 0.44322293996810913, + "mean_token_accuracy": 0.8403642177581787, + "num_tokens": 11409919.0, + "step": 1274 + }, + { + "epoch": 0.9688449848024316, + "grad_norm": 1.3764475584030151, + "learning_rate": 4.051227542217192e-06, + "loss": 0.5774400234222412, + "mean_token_accuracy": 0.804118275642395, + "num_tokens": 11425900.0, + "step": 1275 + }, + { + "epoch": 0.9696048632218845, + "grad_norm": 1.3922648429870605, + "learning_rate": 4.049584545027406e-06, + "loss": 0.42727944254875183, + "mean_token_accuracy": 0.8654505014419556, + "num_tokens": 11438787.0, + "step": 1276 + }, + { + "epoch": 0.9703647416413373, + "grad_norm": 1.8505840301513672, + "learning_rate": 4.047940460279537e-06, + "loss": 0.490803062915802, + "mean_token_accuracy": 0.8340574502944946, + "num_tokens": 11447997.0, + "step": 1277 + }, + { + "epoch": 0.9711246200607903, + "grad_norm": 2.28271222114563, + "learning_rate": 4.046295289127466e-06, + "loss": 0.588828444480896, + "mean_token_accuracy": 0.833497166633606, + "num_tokens": 11454072.0, + "step": 1278 + }, + { + "epoch": 0.9718844984802432, + "grad_norm": 2.4242560863494873, + "learning_rate": 4.044649032725836e-06, + "loss": 0.5128831267356873, + "mean_token_accuracy": 0.8225122690200806, + "num_tokens": 11460211.0, + "step": 1279 + }, + { + "epoch": 0.9726443768996961, + "grad_norm": 2.1738455295562744, + "learning_rate": 4.0430016922300566e-06, + "loss": 0.441631942987442, + "mean_token_accuracy": 0.841723620891571, + "num_tokens": 11466814.0, + "step": 1280 + }, + { + "epoch": 0.973404255319149, + "grad_norm": 2.541599988937378, + "learning_rate": 4.0413532687962926e-06, + "loss": 0.5062629580497742, + "mean_token_accuracy": 0.8013502359390259, + "num_tokens": 11472371.0, + "step": 1281 + }, + { + "epoch": 0.9741641337386018, + "grad_norm": 2.8011014461517334, + "learning_rate": 4.039703763581472e-06, + "loss": 0.5061966776847839, + "mean_token_accuracy": 0.829810380935669, + "num_tokens": 11476672.0, + "step": 1282 + }, + { + "epoch": 0.9749240121580547, + "grad_norm": 2.4505462646484375, + "learning_rate": 4.038053177743279e-06, + "loss": 0.43407535552978516, + "mean_token_accuracy": 0.8428469896316528, + "num_tokens": 11481297.0, + "step": 1283 + }, + { + "epoch": 0.9756838905775076, + "grad_norm": 2.1618378162384033, + "learning_rate": 4.036401512440161e-06, + "loss": 0.6056663393974304, + "mean_token_accuracy": 0.7977457642555237, + "num_tokens": 11488657.0, + "step": 1284 + }, + { + "epoch": 0.9764437689969605, + "grad_norm": 1.9192147254943848, + "learning_rate": 4.034748768831319e-06, + "loss": 0.524390697479248, + "mean_token_accuracy": 0.8120636940002441, + "num_tokens": 11496485.0, + "step": 1285 + }, + { + "epoch": 0.9772036474164134, + "grad_norm": 2.766435384750366, + "learning_rate": 4.033094948076713e-06, + "loss": 0.5494908690452576, + "mean_token_accuracy": 0.8141890168190002, + "num_tokens": 11501341.0, + "step": 1286 + }, + { + "epoch": 0.9779635258358662, + "grad_norm": 1.3519539833068848, + "learning_rate": 4.031440051337056e-06, + "loss": 0.4339691400527954, + "mean_token_accuracy": 0.8400131464004517, + "num_tokens": 11512843.0, + "step": 1287 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 1.2492141723632812, + "learning_rate": 4.02978407977382e-06, + "loss": 0.4433518052101135, + "mean_token_accuracy": 0.8432940244674683, + "num_tokens": 11530227.0, + "step": 1288 + }, + { + "epoch": 0.979483282674772, + "grad_norm": 1.6597715616226196, + "learning_rate": 4.02812703454923e-06, + "loss": 0.602222204208374, + "mean_token_accuracy": 0.786965548992157, + "num_tokens": 11543955.0, + "step": 1289 + }, + { + "epoch": 0.9802431610942249, + "grad_norm": 1.6621816158294678, + "learning_rate": 4.026468916826262e-06, + "loss": 0.35662174224853516, + "mean_token_accuracy": 0.8716133832931519, + "num_tokens": 11552064.0, + "step": 1290 + }, + { + "epoch": 0.9810030395136778, + "grad_norm": 4.539844989776611, + "learning_rate": 4.024809727768648e-06, + "loss": 0.543423593044281, + "mean_token_accuracy": 0.8293194770812988, + "num_tokens": 11555595.0, + "step": 1291 + }, + { + "epoch": 0.9817629179331308, + "grad_norm": 1.4026556015014648, + "learning_rate": 4.023149468540871e-06, + "loss": 0.4301237165927887, + "mean_token_accuracy": 0.8358224630355835, + "num_tokens": 11572275.0, + "step": 1292 + }, + { + "epoch": 0.9825227963525835, + "grad_norm": 1.611262321472168, + "learning_rate": 4.021488140308165e-06, + "loss": 0.5378580689430237, + "mean_token_accuracy": 0.8173760771751404, + "num_tokens": 11584299.0, + "step": 1293 + }, + { + "epoch": 0.9832826747720365, + "grad_norm": 4.138631820678711, + "learning_rate": 4.019825744236514e-06, + "loss": 0.40272149443626404, + "mean_token_accuracy": 0.8648844957351685, + "num_tokens": 11586705.0, + "step": 1294 + }, + { + "epoch": 0.9840425531914894, + "grad_norm": 3.177703619003296, + "learning_rate": 4.018162281492651e-06, + "loss": 0.5320103168487549, + "mean_token_accuracy": 0.8250276446342468, + "num_tokens": 11590689.0, + "step": 1295 + }, + { + "epoch": 0.9848024316109423, + "grad_norm": 2.727597713470459, + "learning_rate": 4.016497753244058e-06, + "loss": 0.5662774443626404, + "mean_token_accuracy": 0.8074625730514526, + "num_tokens": 11596092.0, + "step": 1296 + }, + { + "epoch": 0.9855623100303952, + "grad_norm": 1.485139012336731, + "learning_rate": 4.014832160658966e-06, + "loss": 0.5414972305297852, + "mean_token_accuracy": 0.8082696199417114, + "num_tokens": 11613785.0, + "step": 1297 + }, + { + "epoch": 0.986322188449848, + "grad_norm": 2.4025990962982178, + "learning_rate": 4.013165504906352e-06, + "loss": 0.6556503772735596, + "mean_token_accuracy": 0.7785214781761169, + "num_tokens": 11620421.0, + "step": 1298 + }, + { + "epoch": 0.9870820668693009, + "grad_norm": 1.878273606300354, + "learning_rate": 4.011497787155938e-06, + "loss": 0.4221133887767792, + "mean_token_accuracy": 0.850035548210144, + "num_tokens": 11627998.0, + "step": 1299 + }, + { + "epoch": 0.9878419452887538, + "grad_norm": 2.0430715084075928, + "learning_rate": 4.009829008578192e-06, + "loss": 0.5205984711647034, + "mean_token_accuracy": 0.819183349609375, + "num_tokens": 11636279.0, + "step": 1300 + }, + { + "epoch": 0.9886018237082067, + "grad_norm": 3.4769439697265625, + "learning_rate": 4.00815917034433e-06, + "loss": 0.5449948310852051, + "mean_token_accuracy": 0.8240023851394653, + "num_tokens": 11639638.0, + "step": 1301 + }, + { + "epoch": 0.9893617021276596, + "grad_norm": 2.4783987998962402, + "learning_rate": 4.006488273626307e-06, + "loss": 0.4316832423210144, + "mean_token_accuracy": 0.8474695086479187, + "num_tokens": 11645463.0, + "step": 1302 + }, + { + "epoch": 0.9901215805471124, + "grad_norm": 1.881475567817688, + "learning_rate": 4.004816319596822e-06, + "loss": 0.5157331824302673, + "mean_token_accuracy": 0.826042652130127, + "num_tokens": 11653955.0, + "step": 1303 + }, + { + "epoch": 0.9908814589665653, + "grad_norm": 2.6569254398345947, + "learning_rate": 4.003143309429317e-06, + "loss": 0.46492767333984375, + "mean_token_accuracy": 0.8320850133895874, + "num_tokens": 11659357.0, + "step": 1304 + }, + { + "epoch": 0.9916413373860182, + "grad_norm": 2.4917593002319336, + "learning_rate": 4.0014692442979756e-06, + "loss": 0.459585040807724, + "mean_token_accuracy": 0.8457611799240112, + "num_tokens": 11664207.0, + "step": 1305 + }, + { + "epoch": 0.9924012158054711, + "grad_norm": 2.6885526180267334, + "learning_rate": 3.999794125377721e-06, + "loss": 0.4677402973175049, + "mean_token_accuracy": 0.8307361602783203, + "num_tokens": 11668879.0, + "step": 1306 + }, + { + "epoch": 0.993161094224924, + "grad_norm": 1.9737319946289062, + "learning_rate": 3.998117953844215e-06, + "loss": 0.44684839248657227, + "mean_token_accuracy": 0.8367687463760376, + "num_tokens": 11676081.0, + "step": 1307 + }, + { + "epoch": 0.993920972644377, + "grad_norm": 1.4333021640777588, + "learning_rate": 3.996440730873861e-06, + "loss": 0.526146650314331, + "mean_token_accuracy": 0.816251814365387, + "num_tokens": 11689333.0, + "step": 1308 + }, + { + "epoch": 0.9946808510638298, + "grad_norm": 1.3689230680465698, + "learning_rate": 3.9947624576437975e-06, + "loss": 0.40214329957962036, + "mean_token_accuracy": 0.8610327839851379, + "num_tokens": 11701540.0, + "step": 1309 + }, + { + "epoch": 0.9954407294832827, + "grad_norm": 1.2435375452041626, + "learning_rate": 3.9930831353319025e-06, + "loss": 0.4532913267612457, + "mean_token_accuracy": 0.8415389060974121, + "num_tokens": 11717920.0, + "step": 1310 + }, + { + "epoch": 0.9962006079027356, + "grad_norm": 1.9968011379241943, + "learning_rate": 3.9914027651167866e-06, + "loss": 0.46954160928726196, + "mean_token_accuracy": 0.8351103663444519, + "num_tokens": 11724999.0, + "step": 1311 + }, + { + "epoch": 0.9969604863221885, + "grad_norm": 1.9521311521530151, + "learning_rate": 3.989721348177801e-06, + "loss": 0.5068016052246094, + "mean_token_accuracy": 0.8220845460891724, + "num_tokens": 11732569.0, + "step": 1312 + }, + { + "epoch": 0.9977203647416414, + "grad_norm": 2.7332582473754883, + "learning_rate": 3.988038885695028e-06, + "loss": 0.4154692590236664, + "mean_token_accuracy": 0.8493857383728027, + "num_tokens": 11736759.0, + "step": 1313 + }, + { + "epoch": 0.9984802431610942, + "grad_norm": 1.8656952381134033, + "learning_rate": 3.986355378849284e-06, + "loss": 0.4151354134082794, + "mean_token_accuracy": 0.83440101146698, + "num_tokens": 11743827.0, + "step": 1314 + }, + { + "epoch": 0.9992401215805471, + "grad_norm": 1.304006576538086, + "learning_rate": 3.984670828822118e-06, + "loss": 0.4926128089427948, + "mean_token_accuracy": 0.8603005409240723, + "num_tokens": 11757707.0, + "step": 1315 + }, + { + "epoch": 1.0, + "grad_norm": 1.497079610824585, + "learning_rate": 3.982985236795815e-06, + "loss": 0.43342477083206177, + "mean_token_accuracy": 0.8550825119018555, + "num_tokens": 11769678.0, + "step": 1316 + }, + { + "epoch": 1.000759878419453, + "grad_norm": 2.870274543762207, + "learning_rate": 3.981298603953385e-06, + "loss": 0.3723528981208801, + "mean_token_accuracy": 0.8745899796485901, + "num_tokens": 11773290.0, + "step": 1317 + }, + { + "epoch": 1.0015197568389058, + "grad_norm": 1.3442503213882446, + "learning_rate": 3.979610931478574e-06, + "loss": 0.34688329696655273, + "mean_token_accuracy": 0.8749074935913086, + "num_tokens": 11786400.0, + "step": 1318 + }, + { + "epoch": 1.0022796352583587, + "grad_norm": 1.7272238731384277, + "learning_rate": 3.977922220555855e-06, + "loss": 0.28274932503700256, + "mean_token_accuracy": 0.896713137626648, + "num_tokens": 11793059.0, + "step": 1319 + }, + { + "epoch": 1.0030395136778116, + "grad_norm": 1.7362451553344727, + "learning_rate": 3.976232472370431e-06, + "loss": 0.5494794845581055, + "mean_token_accuracy": 0.8341718912124634, + "num_tokens": 11802593.0, + "step": 1320 + }, + { + "epoch": 1.0037993920972645, + "grad_norm": 1.3316494226455688, + "learning_rate": 3.97454168810823e-06, + "loss": 0.41505366563796997, + "mean_token_accuracy": 0.8581969738006592, + "num_tokens": 11813925.0, + "step": 1321 + }, + { + "epoch": 1.0045592705167172, + "grad_norm": 1.6152615547180176, + "learning_rate": 3.972849868955913e-06, + "loss": 0.44761013984680176, + "mean_token_accuracy": 0.8413045406341553, + "num_tokens": 11825709.0, + "step": 1322 + }, + { + "epoch": 1.0053191489361701, + "grad_norm": 2.1172471046447754, + "learning_rate": 3.97115701610086e-06, + "loss": 0.3903353810310364, + "mean_token_accuracy": 0.8662760257720947, + "num_tokens": 11832070.0, + "step": 1323 + }, + { + "epoch": 1.006079027355623, + "grad_norm": 1.5923868417739868, + "learning_rate": 3.969463130731183e-06, + "loss": 0.4491051137447357, + "mean_token_accuracy": 0.8677828311920166, + "num_tokens": 11843154.0, + "step": 1324 + }, + { + "epoch": 1.006838905775076, + "grad_norm": 1.6848995685577393, + "learning_rate": 3.967768214035716e-06, + "loss": 0.45765817165374756, + "mean_token_accuracy": 0.8401060104370117, + "num_tokens": 11854826.0, + "step": 1325 + }, + { + "epoch": 1.0075987841945289, + "grad_norm": 2.3739020824432373, + "learning_rate": 3.966072267204014e-06, + "loss": 0.4482722580432892, + "mean_token_accuracy": 0.8368916511535645, + "num_tokens": 11860559.0, + "step": 1326 + }, + { + "epoch": 1.0083586626139818, + "grad_norm": 1.5403034687042236, + "learning_rate": 3.964375291426361e-06, + "loss": 0.35589972138404846, + "mean_token_accuracy": 0.8728118538856506, + "num_tokens": 11871959.0, + "step": 1327 + }, + { + "epoch": 1.0091185410334347, + "grad_norm": 1.6750119924545288, + "learning_rate": 3.962677287893758e-06, + "loss": 0.35873427987098694, + "mean_token_accuracy": 0.9027186632156372, + "num_tokens": 11881818.0, + "step": 1328 + }, + { + "epoch": 1.0098784194528876, + "grad_norm": 1.5489170551300049, + "learning_rate": 3.9609782577979305e-06, + "loss": 0.3634672462940216, + "mean_token_accuracy": 0.8582607507705688, + "num_tokens": 11891084.0, + "step": 1329 + }, + { + "epoch": 1.0106382978723405, + "grad_norm": 2.43859601020813, + "learning_rate": 3.959278202331323e-06, + "loss": 0.3640799820423126, + "mean_token_accuracy": 0.88062584400177, + "num_tokens": 11896032.0, + "step": 1330 + }, + { + "epoch": 1.0113981762917934, + "grad_norm": 3.612184524536133, + "learning_rate": 3.9575771226870986e-06, + "loss": 0.3733130097389221, + "mean_token_accuracy": 0.8946067094802856, + "num_tokens": 11899479.0, + "step": 1331 + }, + { + "epoch": 1.012158054711246, + "grad_norm": 1.541355848312378, + "learning_rate": 3.955875020059141e-06, + "loss": 0.320593923330307, + "mean_token_accuracy": 0.9057406783103943, + "num_tokens": 11910179.0, + "step": 1332 + }, + { + "epoch": 1.012917933130699, + "grad_norm": 2.0565030574798584, + "learning_rate": 3.954171895642052e-06, + "loss": 0.3341682553291321, + "mean_token_accuracy": 0.8829344511032104, + "num_tokens": 11916489.0, + "step": 1333 + }, + { + "epoch": 1.013677811550152, + "grad_norm": 2.9732539653778076, + "learning_rate": 3.9524677506311505e-06, + "loss": 0.38488566875457764, + "mean_token_accuracy": 0.8752974271774292, + "num_tokens": 11920682.0, + "step": 1334 + }, + { + "epoch": 1.0144376899696048, + "grad_norm": 2.7697458267211914, + "learning_rate": 3.950762586222469e-06, + "loss": 0.39864760637283325, + "mean_token_accuracy": 0.8593167662620544, + "num_tokens": 11925233.0, + "step": 1335 + }, + { + "epoch": 1.0151975683890577, + "grad_norm": 2.2302119731903076, + "learning_rate": 3.949056403612758e-06, + "loss": 0.3985682725906372, + "mean_token_accuracy": 0.8677899837493896, + "num_tokens": 11932000.0, + "step": 1336 + }, + { + "epoch": 1.0159574468085106, + "grad_norm": 2.360572576522827, + "learning_rate": 3.947349203999485e-06, + "loss": 0.36940714716911316, + "mean_token_accuracy": 0.8760676383972168, + "num_tokens": 11937569.0, + "step": 1337 + }, + { + "epoch": 1.0167173252279635, + "grad_norm": 1.3383921384811401, + "learning_rate": 3.945640988580824e-06, + "loss": 0.40628793835639954, + "mean_token_accuracy": 0.866442084312439, + "num_tokens": 11955679.0, + "step": 1338 + }, + { + "epoch": 1.0174772036474165, + "grad_norm": 2.1502623558044434, + "learning_rate": 3.943931758555669e-06, + "loss": 0.4493565559387207, + "mean_token_accuracy": 0.8307522535324097, + "num_tokens": 11962734.0, + "step": 1339 + }, + { + "epoch": 1.0182370820668694, + "grad_norm": 2.4737331867218018, + "learning_rate": 3.942221515123624e-06, + "loss": 0.28508758544921875, + "mean_token_accuracy": 0.8967142105102539, + "num_tokens": 11967783.0, + "step": 1340 + }, + { + "epoch": 1.0189969604863223, + "grad_norm": 2.4525370597839355, + "learning_rate": 3.940510259485002e-06, + "loss": 0.40227818489074707, + "mean_token_accuracy": 0.8618967533111572, + "num_tokens": 11972918.0, + "step": 1341 + }, + { + "epoch": 1.0197568389057752, + "grad_norm": 1.7299731969833374, + "learning_rate": 3.938797992840828e-06, + "loss": 0.26339593529701233, + "mean_token_accuracy": 0.9004406929016113, + "num_tokens": 11981250.0, + "step": 1342 + }, + { + "epoch": 1.0205167173252279, + "grad_norm": 2.8756747245788574, + "learning_rate": 3.937084716392839e-06, + "loss": 0.47792482376098633, + "mean_token_accuracy": 0.8440839052200317, + "num_tokens": 11986356.0, + "step": 1343 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 2.104473114013672, + "learning_rate": 3.935370431343475e-06, + "loss": 0.36723971366882324, + "mean_token_accuracy": 0.8831232786178589, + "num_tokens": 11994495.0, + "step": 1344 + }, + { + "epoch": 1.0220364741641337, + "grad_norm": 1.9173074960708618, + "learning_rate": 3.933655138895889e-06, + "loss": 0.409319669008255, + "mean_token_accuracy": 0.8632645606994629, + "num_tokens": 12002060.0, + "step": 1345 + }, + { + "epoch": 1.0227963525835866, + "grad_norm": 2.958311080932617, + "learning_rate": 3.9319388402539395e-06, + "loss": 0.5390093922615051, + "mean_token_accuracy": 0.8204828500747681, + "num_tokens": 12007588.0, + "step": 1346 + }, + { + "epoch": 1.0235562310030395, + "grad_norm": 1.6470831632614136, + "learning_rate": 3.930221536622192e-06, + "loss": 0.4524633288383484, + "mean_token_accuracy": 0.8516575694084167, + "num_tokens": 12018831.0, + "step": 1347 + }, + { + "epoch": 1.0243161094224924, + "grad_norm": 1.3160780668258667, + "learning_rate": 3.928503229205913e-06, + "loss": 0.4180558919906616, + "mean_token_accuracy": 0.8495022058486938, + "num_tokens": 12033947.0, + "step": 1348 + }, + { + "epoch": 1.0250759878419453, + "grad_norm": 1.9686089754104614, + "learning_rate": 3.92678391921108e-06, + "loss": 0.41927334666252136, + "mean_token_accuracy": 0.8462997674942017, + "num_tokens": 12042005.0, + "step": 1349 + }, + { + "epoch": 1.0258358662613982, + "grad_norm": 2.351778507232666, + "learning_rate": 3.92506360784437e-06, + "loss": 0.2946245074272156, + "mean_token_accuracy": 0.9170923233032227, + "num_tokens": 12046579.0, + "step": 1350 + }, + { + "epoch": 1.0265957446808511, + "grad_norm": 2.0636913776397705, + "learning_rate": 3.923342296313162e-06, + "loss": 0.3422774076461792, + "mean_token_accuracy": 0.8809213638305664, + "num_tokens": 12053214.0, + "step": 1351 + }, + { + "epoch": 1.027355623100304, + "grad_norm": 1.7272592782974243, + "learning_rate": 3.92161998582554e-06, + "loss": 0.5864541530609131, + "mean_token_accuracy": 0.7986117601394653, + "num_tokens": 12068522.0, + "step": 1352 + }, + { + "epoch": 1.028115501519757, + "grad_norm": 0.8980231881141663, + "learning_rate": 3.919896677590289e-06, + "loss": 0.2964550256729126, + "mean_token_accuracy": 0.8911845088005066, + "num_tokens": 12093834.0, + "step": 1353 + }, + { + "epoch": 1.0288753799392096, + "grad_norm": 1.6031712293624878, + "learning_rate": 3.918172372816892e-06, + "loss": 0.37254488468170166, + "mean_token_accuracy": 0.8615843057632446, + "num_tokens": 12104393.0, + "step": 1354 + }, + { + "epoch": 1.0296352583586625, + "grad_norm": 1.282134771347046, + "learning_rate": 3.916447072715531e-06, + "loss": 0.3522927761077881, + "mean_token_accuracy": 0.8713657259941101, + "num_tokens": 12118671.0, + "step": 1355 + }, + { + "epoch": 1.0303951367781155, + "grad_norm": 2.1986680030822754, + "learning_rate": 3.914720778497091e-06, + "loss": 0.3716316223144531, + "mean_token_accuracy": 0.8661249279975891, + "num_tokens": 12125178.0, + "step": 1356 + }, + { + "epoch": 1.0311550151975684, + "grad_norm": 1.5937882661819458, + "learning_rate": 3.91299349137315e-06, + "loss": 0.48067355155944824, + "mean_token_accuracy": 0.8284252882003784, + "num_tokens": 12136785.0, + "step": 1357 + }, + { + "epoch": 1.0319148936170213, + "grad_norm": 1.6743099689483643, + "learning_rate": 3.9112652125559845e-06, + "loss": 0.4461551308631897, + "mean_token_accuracy": 0.8381845355033875, + "num_tokens": 12150066.0, + "step": 1358 + }, + { + "epoch": 1.0326747720364742, + "grad_norm": 2.2346715927124023, + "learning_rate": 3.909535943258567e-06, + "loss": 0.3148220181465149, + "mean_token_accuracy": 0.8797591924667358, + "num_tokens": 12155506.0, + "step": 1359 + }, + { + "epoch": 1.033434650455927, + "grad_norm": 1.9608992338180542, + "learning_rate": 3.907805684694567e-06, + "loss": 0.32598960399627686, + "mean_token_accuracy": 0.8819410800933838, + "num_tokens": 12163261.0, + "step": 1360 + }, + { + "epoch": 1.03419452887538, + "grad_norm": 2.413477897644043, + "learning_rate": 3.906074438078343e-06, + "loss": 0.38179588317871094, + "mean_token_accuracy": 0.8739585876464844, + "num_tokens": 12169254.0, + "step": 1361 + }, + { + "epoch": 1.034954407294833, + "grad_norm": 2.0258278846740723, + "learning_rate": 3.904342204624955e-06, + "loss": 0.33240315318107605, + "mean_token_accuracy": 0.8808181285858154, + "num_tokens": 12175379.0, + "step": 1362 + }, + { + "epoch": 1.0357142857142858, + "grad_norm": 2.4111437797546387, + "learning_rate": 3.9026089855501475e-06, + "loss": 0.412802517414093, + "mean_token_accuracy": 0.8504396677017212, + "num_tokens": 12182007.0, + "step": 1363 + }, + { + "epoch": 1.0364741641337385, + "grad_norm": 2.0424840450286865, + "learning_rate": 3.900874782070362e-06, + "loss": 0.2914797067642212, + "mean_token_accuracy": 0.8731886148452759, + "num_tokens": 12187743.0, + "step": 1364 + }, + { + "epoch": 1.0372340425531914, + "grad_norm": 2.9248716831207275, + "learning_rate": 3.899139595402729e-06, + "loss": 0.34071338176727295, + "mean_token_accuracy": 0.8736443519592285, + "num_tokens": 12191830.0, + "step": 1365 + }, + { + "epoch": 1.0379939209726443, + "grad_norm": 2.240220785140991, + "learning_rate": 3.8974034267650695e-06, + "loss": 0.23049014806747437, + "mean_token_accuracy": 0.9000070691108704, + "num_tokens": 12196460.0, + "step": 1366 + }, + { + "epoch": 1.0387537993920972, + "grad_norm": 1.5038460493087769, + "learning_rate": 3.895666277375892e-06, + "loss": 0.32255327701568604, + "mean_token_accuracy": 0.873004674911499, + "num_tokens": 12206230.0, + "step": 1367 + }, + { + "epoch": 1.0395136778115501, + "grad_norm": 1.2339142560958862, + "learning_rate": 3.893928148454398e-06, + "loss": 0.4069131314754486, + "mean_token_accuracy": 0.8461740016937256, + "num_tokens": 12226502.0, + "step": 1368 + }, + { + "epoch": 1.040273556231003, + "grad_norm": 2.531553268432617, + "learning_rate": 3.89218904122047e-06, + "loss": 0.43681037425994873, + "mean_token_accuracy": 0.8497104048728943, + "num_tokens": 12232241.0, + "step": 1369 + }, + { + "epoch": 1.041033434650456, + "grad_norm": 3.8404815196990967, + "learning_rate": 3.890448956894682e-06, + "loss": 0.3241814970970154, + "mean_token_accuracy": 0.884732723236084, + "num_tokens": 12235126.0, + "step": 1370 + }, + { + "epoch": 1.0417933130699089, + "grad_norm": 2.9608030319213867, + "learning_rate": 3.888707896698293e-06, + "loss": 0.4641021490097046, + "mean_token_accuracy": 0.8496800661087036, + "num_tokens": 12240630.0, + "step": 1371 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 2.1166417598724365, + "learning_rate": 3.886965861853243e-06, + "loss": 0.42038479447364807, + "mean_token_accuracy": 0.8512747287750244, + "num_tokens": 12247969.0, + "step": 1372 + }, + { + "epoch": 1.0433130699088147, + "grad_norm": 2.5918161869049072, + "learning_rate": 3.885222853582163e-06, + "loss": 0.2871917188167572, + "mean_token_accuracy": 0.9129709601402283, + "num_tokens": 12252161.0, + "step": 1373 + }, + { + "epoch": 1.0440729483282676, + "grad_norm": 2.4261348247528076, + "learning_rate": 3.88347887310836e-06, + "loss": 0.4003123342990875, + "mean_token_accuracy": 0.8570356369018555, + "num_tokens": 12258135.0, + "step": 1374 + }, + { + "epoch": 1.0448328267477203, + "grad_norm": 1.3439548015594482, + "learning_rate": 3.881733921655829e-06, + "loss": 0.3278140425682068, + "mean_token_accuracy": 0.8831373453140259, + "num_tokens": 12272849.0, + "step": 1375 + }, + { + "epoch": 1.0455927051671732, + "grad_norm": 1.527989387512207, + "learning_rate": 3.879988000449243e-06, + "loss": 0.33789363503456116, + "mean_token_accuracy": 0.8825669884681702, + "num_tokens": 12283281.0, + "step": 1376 + }, + { + "epoch": 1.046352583586626, + "grad_norm": 1.6755503416061401, + "learning_rate": 3.878241110713957e-06, + "loss": 0.4816160798072815, + "mean_token_accuracy": 0.8193758726119995, + "num_tokens": 12295422.0, + "step": 1377 + }, + { + "epoch": 1.047112462006079, + "grad_norm": 2.8110361099243164, + "learning_rate": 3.876493253676004e-06, + "loss": 0.38662949204444885, + "mean_token_accuracy": 0.8611986637115479, + "num_tokens": 12299806.0, + "step": 1378 + }, + { + "epoch": 1.047872340425532, + "grad_norm": 1.86097252368927, + "learning_rate": 3.8747444305621e-06, + "loss": 0.27612629532814026, + "mean_token_accuracy": 0.8984048366546631, + "num_tokens": 12306599.0, + "step": 1379 + }, + { + "epoch": 1.0486322188449848, + "grad_norm": 2.361828565597534, + "learning_rate": 3.872994642599635e-06, + "loss": 0.469953715801239, + "mean_token_accuracy": 0.8464452028274536, + "num_tokens": 12314249.0, + "step": 1380 + }, + { + "epoch": 1.0493920972644377, + "grad_norm": 1.9524794816970825, + "learning_rate": 3.871243891016676e-06, + "loss": 0.5419625043869019, + "mean_token_accuracy": 0.8468329906463623, + "num_tokens": 12324987.0, + "step": 1381 + }, + { + "epoch": 1.0501519756838906, + "grad_norm": 1.6931511163711548, + "learning_rate": 3.869492177041971e-06, + "loss": 0.3791416883468628, + "mean_token_accuracy": 0.8692882061004639, + "num_tokens": 12336864.0, + "step": 1382 + }, + { + "epoch": 1.0509118541033435, + "grad_norm": 1.909692406654358, + "learning_rate": 3.867739501904938e-06, + "loss": 0.27974557876586914, + "mean_token_accuracy": 0.9004636406898499, + "num_tokens": 12343093.0, + "step": 1383 + }, + { + "epoch": 1.0516717325227964, + "grad_norm": 1.415162205696106, + "learning_rate": 3.8659858668356735e-06, + "loss": 0.38928335905075073, + "mean_token_accuracy": 0.8491984009742737, + "num_tokens": 12356613.0, + "step": 1384 + }, + { + "epoch": 1.0524316109422491, + "grad_norm": 1.8195741176605225, + "learning_rate": 3.864231273064944e-06, + "loss": 0.3798758089542389, + "mean_token_accuracy": 0.8728072047233582, + "num_tokens": 12364860.0, + "step": 1385 + }, + { + "epoch": 1.053191489361702, + "grad_norm": 1.8481454849243164, + "learning_rate": 3.862475721824193e-06, + "loss": 0.269635945558548, + "mean_token_accuracy": 0.899247407913208, + "num_tokens": 12371841.0, + "step": 1386 + }, + { + "epoch": 1.053951367781155, + "grad_norm": 1.7838784456253052, + "learning_rate": 3.8607192143455325e-06, + "loss": 0.36971768736839294, + "mean_token_accuracy": 0.8833638429641724, + "num_tokens": 12380685.0, + "step": 1387 + }, + { + "epoch": 1.0547112462006079, + "grad_norm": 1.333358645439148, + "learning_rate": 3.858961751861748e-06, + "loss": 0.4039418399333954, + "mean_token_accuracy": 0.8541078567504883, + "num_tokens": 12394072.0, + "step": 1388 + }, + { + "epoch": 1.0554711246200608, + "grad_norm": 2.1600265502929688, + "learning_rate": 3.857203335606294e-06, + "loss": 0.38211894035339355, + "mean_token_accuracy": 0.8549972772598267, + "num_tokens": 12400449.0, + "step": 1389 + }, + { + "epoch": 1.0562310030395137, + "grad_norm": 2.914902687072754, + "learning_rate": 3.855443966813295e-06, + "loss": 0.2237374186515808, + "mean_token_accuracy": 0.9253600835800171, + "num_tokens": 12403758.0, + "step": 1390 + }, + { + "epoch": 1.0569908814589666, + "grad_norm": 2.2361080646514893, + "learning_rate": 3.853683646717543e-06, + "loss": 0.3359566926956177, + "mean_token_accuracy": 0.898173451423645, + "num_tokens": 12410374.0, + "step": 1391 + }, + { + "epoch": 1.0577507598784195, + "grad_norm": 2.3639304637908936, + "learning_rate": 3.8519223765544985e-06, + "loss": 0.3844943046569824, + "mean_token_accuracy": 0.863599419593811, + "num_tokens": 12416016.0, + "step": 1392 + }, + { + "epoch": 1.0585106382978724, + "grad_norm": 2.202971935272217, + "learning_rate": 3.85016015756029e-06, + "loss": 0.3546281158924103, + "mean_token_accuracy": 0.8907540440559387, + "num_tokens": 12422026.0, + "step": 1393 + }, + { + "epoch": 1.0592705167173253, + "grad_norm": 1.1279661655426025, + "learning_rate": 3.848396990971709e-06, + "loss": 0.31522464752197266, + "mean_token_accuracy": 0.8662257194519043, + "num_tokens": 12439964.0, + "step": 1394 + }, + { + "epoch": 1.0600303951367782, + "grad_norm": 2.4731740951538086, + "learning_rate": 3.846632878026214e-06, + "loss": 0.456442266702652, + "mean_token_accuracy": 0.8516958951950073, + "num_tokens": 12446231.0, + "step": 1395 + }, + { + "epoch": 1.060790273556231, + "grad_norm": 1.7631878852844238, + "learning_rate": 3.844867819961928e-06, + "loss": 0.487227201461792, + "mean_token_accuracy": 0.8466947078704834, + "num_tokens": 12459989.0, + "step": 1396 + }, + { + "epoch": 1.0615501519756838, + "grad_norm": 2.4468278884887695, + "learning_rate": 3.843101818017637e-06, + "loss": 0.3367291986942291, + "mean_token_accuracy": 0.8734689950942993, + "num_tokens": 12465741.0, + "step": 1397 + }, + { + "epoch": 1.0623100303951367, + "grad_norm": 1.9045145511627197, + "learning_rate": 3.841334873432789e-06, + "loss": 0.4652615487575531, + "mean_token_accuracy": 0.8333107233047485, + "num_tokens": 12474963.0, + "step": 1398 + }, + { + "epoch": 1.0630699088145896, + "grad_norm": 1.6816917657852173, + "learning_rate": 3.839566987447492e-06, + "loss": 0.4144279956817627, + "mean_token_accuracy": 0.8472539186477661, + "num_tokens": 12485521.0, + "step": 1399 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 1.8990092277526855, + "learning_rate": 3.837798161302518e-06, + "loss": 0.4040985405445099, + "mean_token_accuracy": 0.8514704704284668, + "num_tokens": 12493495.0, + "step": 1400 + }, + { + "epoch": 1.0645896656534954, + "grad_norm": 2.27785325050354, + "learning_rate": 3.836028396239297e-06, + "loss": 0.43425723910331726, + "mean_token_accuracy": 0.8795069456100464, + "num_tokens": 12499789.0, + "step": 1401 + }, + { + "epoch": 1.0653495440729484, + "grad_norm": 2.5130882263183594, + "learning_rate": 3.8342576934999184e-06, + "loss": 0.33892524242401123, + "mean_token_accuracy": 0.8717449903488159, + "num_tokens": 12504885.0, + "step": 1402 + }, + { + "epoch": 1.0661094224924013, + "grad_norm": 2.650040864944458, + "learning_rate": 3.832486054327131e-06, + "loss": 0.4200317859649658, + "mean_token_accuracy": 0.8616159558296204, + "num_tokens": 12509783.0, + "step": 1403 + }, + { + "epoch": 1.0668693009118542, + "grad_norm": 2.9176881313323975, + "learning_rate": 3.830713479964335e-06, + "loss": 0.37018489837646484, + "mean_token_accuracy": 0.8676021695137024, + "num_tokens": 12514441.0, + "step": 1404 + }, + { + "epoch": 1.067629179331307, + "grad_norm": 1.6430318355560303, + "learning_rate": 3.828939971655595e-06, + "loss": 0.27539193630218506, + "mean_token_accuracy": 0.9077831506729126, + "num_tokens": 12523677.0, + "step": 1405 + }, + { + "epoch": 1.06838905775076, + "grad_norm": 1.3683708906173706, + "learning_rate": 3.827165530645627e-06, + "loss": 0.4085099697113037, + "mean_token_accuracy": 0.8579255938529968, + "num_tokens": 12540104.0, + "step": 1406 + }, + { + "epoch": 1.0691489361702127, + "grad_norm": 2.528465747833252, + "learning_rate": 3.825390158179802e-06, + "loss": 0.42462456226348877, + "mean_token_accuracy": 0.852813720703125, + "num_tokens": 12548239.0, + "step": 1407 + }, + { + "epoch": 1.0699088145896656, + "grad_norm": 1.8288795948028564, + "learning_rate": 3.823613855504144e-06, + "loss": 0.412417471408844, + "mean_token_accuracy": 0.8622130751609802, + "num_tokens": 12557316.0, + "step": 1408 + }, + { + "epoch": 1.0706686930091185, + "grad_norm": 2.341794490814209, + "learning_rate": 3.82183662386533e-06, + "loss": 0.2996668815612793, + "mean_token_accuracy": 0.8964041471481323, + "num_tokens": 12562377.0, + "step": 1409 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 2.555877208709717, + "learning_rate": 3.82005846451069e-06, + "loss": 0.4184221625328064, + "mean_token_accuracy": 0.8678828477859497, + "num_tokens": 12568516.0, + "step": 1410 + }, + { + "epoch": 1.0721884498480243, + "grad_norm": 2.081308126449585, + "learning_rate": 3.8182793786882065e-06, + "loss": 0.4376835823059082, + "mean_token_accuracy": 0.8409077525138855, + "num_tokens": 12576598.0, + "step": 1411 + }, + { + "epoch": 1.0729483282674772, + "grad_norm": 2.0272316932678223, + "learning_rate": 3.816499367646508e-06, + "loss": 0.3630060851573944, + "mean_token_accuracy": 0.8762413263320923, + "num_tokens": 12584587.0, + "step": 1412 + }, + { + "epoch": 1.0737082066869301, + "grad_norm": 2.6382484436035156, + "learning_rate": 3.814718432634877e-06, + "loss": 0.4244990348815918, + "mean_token_accuracy": 0.8509312272071838, + "num_tokens": 12590028.0, + "step": 1413 + }, + { + "epoch": 1.074468085106383, + "grad_norm": 2.429800271987915, + "learning_rate": 3.8129365749032398e-06, + "loss": 0.36990004777908325, + "mean_token_accuracy": 0.8749774098396301, + "num_tokens": 12594984.0, + "step": 1414 + }, + { + "epoch": 1.075227963525836, + "grad_norm": 3.5939090251922607, + "learning_rate": 3.8111537957021736e-06, + "loss": 0.4245661199092865, + "mean_token_accuracy": 0.8481623530387878, + "num_tokens": 12598494.0, + "step": 1415 + }, + { + "epoch": 1.0759878419452888, + "grad_norm": 2.705955982208252, + "learning_rate": 3.809370096282903e-06, + "loss": 0.41851678490638733, + "mean_token_accuracy": 0.8548051714897156, + "num_tokens": 12603876.0, + "step": 1416 + }, + { + "epoch": 1.0767477203647418, + "grad_norm": 1.7812079191207886, + "learning_rate": 3.807585477897296e-06, + "loss": 0.47113919258117676, + "mean_token_accuracy": 0.8346904516220093, + "num_tokens": 12613402.0, + "step": 1417 + }, + { + "epoch": 1.0775075987841944, + "grad_norm": 1.4335212707519531, + "learning_rate": 3.8057999417978654e-06, + "loss": 0.3802063465118408, + "mean_token_accuracy": 0.8563423156738281, + "num_tokens": 12626865.0, + "step": 1418 + }, + { + "epoch": 1.0782674772036474, + "grad_norm": 1.9171305894851685, + "learning_rate": 3.8040134892377702e-06, + "loss": 0.20898357033729553, + "mean_token_accuracy": 0.9189738035202026, + "num_tokens": 12632593.0, + "step": 1419 + }, + { + "epoch": 1.0790273556231003, + "grad_norm": 1.4996821880340576, + "learning_rate": 3.802226121470811e-06, + "loss": 0.4203261137008667, + "mean_token_accuracy": 0.8479211330413818, + "num_tokens": 12646395.0, + "step": 1420 + }, + { + "epoch": 1.0797872340425532, + "grad_norm": 2.2007253170013428, + "learning_rate": 3.800437839751432e-06, + "loss": 0.40370577573776245, + "mean_token_accuracy": 0.8427679538726807, + "num_tokens": 12653508.0, + "step": 1421 + }, + { + "epoch": 1.080547112462006, + "grad_norm": 1.7266581058502197, + "learning_rate": 3.7986486453347183e-06, + "loss": 0.46750491857528687, + "mean_token_accuracy": 0.8429205417633057, + "num_tokens": 12666329.0, + "step": 1422 + }, + { + "epoch": 1.081306990881459, + "grad_norm": 1.4716318845748901, + "learning_rate": 3.796858539476394e-06, + "loss": 0.3330317735671997, + "mean_token_accuracy": 0.879012942314148, + "num_tokens": 12676741.0, + "step": 1423 + }, + { + "epoch": 1.082066869300912, + "grad_norm": 2.652127265930176, + "learning_rate": 3.795067523432826e-06, + "loss": 0.35365715622901917, + "mean_token_accuracy": 0.8796792030334473, + "num_tokens": 12681479.0, + "step": 1424 + }, + { + "epoch": 1.0828267477203648, + "grad_norm": 1.2937829494476318, + "learning_rate": 3.793275598461017e-06, + "loss": 0.25272446870803833, + "mean_token_accuracy": 0.9231734275817871, + "num_tokens": 12694238.0, + "step": 1425 + }, + { + "epoch": 1.0835866261398177, + "grad_norm": 1.3831220865249634, + "learning_rate": 3.7914827658186104e-06, + "loss": 0.4935331344604492, + "mean_token_accuracy": 0.8417420387268066, + "num_tokens": 12712857.0, + "step": 1426 + }, + { + "epoch": 1.0843465045592706, + "grad_norm": 3.059525728225708, + "learning_rate": 3.7896890267638832e-06, + "loss": 0.2592190206050873, + "mean_token_accuracy": 0.9040263295173645, + "num_tokens": 12716766.0, + "step": 1427 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 2.8399202823638916, + "learning_rate": 3.787894382555752e-06, + "loss": 0.32098138332366943, + "mean_token_accuracy": 0.8838302493095398, + "num_tokens": 12720774.0, + "step": 1428 + }, + { + "epoch": 1.0858662613981762, + "grad_norm": 2.618479013442993, + "learning_rate": 3.7860988344537664e-06, + "loss": 0.425255686044693, + "mean_token_accuracy": 0.8564130067825317, + "num_tokens": 12726506.0, + "step": 1429 + }, + { + "epoch": 1.0866261398176291, + "grad_norm": 1.3108669519424438, + "learning_rate": 3.7843023837181126e-06, + "loss": 0.40220165252685547, + "mean_token_accuracy": 0.8588873147964478, + "num_tokens": 12742814.0, + "step": 1430 + }, + { + "epoch": 1.087386018237082, + "grad_norm": 2.2083566188812256, + "learning_rate": 3.782505031609607e-06, + "loss": 0.318379282951355, + "mean_token_accuracy": 0.8887606859207153, + "num_tokens": 12748388.0, + "step": 1431 + }, + { + "epoch": 1.088145896656535, + "grad_norm": 1.922358751296997, + "learning_rate": 3.7807067793897006e-06, + "loss": 0.2519589364528656, + "mean_token_accuracy": 0.8936764001846313, + "num_tokens": 12754761.0, + "step": 1432 + }, + { + "epoch": 1.0889057750759878, + "grad_norm": 1.7367439270019531, + "learning_rate": 3.778907628320477e-06, + "loss": 0.3970367908477783, + "mean_token_accuracy": 0.858735203742981, + "num_tokens": 12764016.0, + "step": 1433 + }, + { + "epoch": 1.0896656534954408, + "grad_norm": 2.1931066513061523, + "learning_rate": 3.77710757966465e-06, + "loss": 0.5250554084777832, + "mean_token_accuracy": 0.8356746435165405, + "num_tokens": 12772272.0, + "step": 1434 + }, + { + "epoch": 1.0904255319148937, + "grad_norm": 1.718337893486023, + "learning_rate": 3.775306634685562e-06, + "loss": 0.283231645822525, + "mean_token_accuracy": 0.9009919166564941, + "num_tokens": 12780706.0, + "step": 1435 + }, + { + "epoch": 1.0911854103343466, + "grad_norm": 2.1985926628112793, + "learning_rate": 3.773504794647187e-06, + "loss": 0.3913170397281647, + "mean_token_accuracy": 0.8909255266189575, + "num_tokens": 12787052.0, + "step": 1436 + }, + { + "epoch": 1.0919452887537995, + "grad_norm": 2.8687937259674072, + "learning_rate": 3.771702060814123e-06, + "loss": 0.3135771155357361, + "mean_token_accuracy": 0.9016125202178955, + "num_tokens": 12791854.0, + "step": 1437 + }, + { + "epoch": 1.0927051671732522, + "grad_norm": 4.203946590423584, + "learning_rate": 3.7698984344516e-06, + "loss": 0.3642737865447998, + "mean_token_accuracy": 0.8842349052429199, + "num_tokens": 12794969.0, + "step": 1438 + }, + { + "epoch": 1.093465045592705, + "grad_norm": 1.5134642124176025, + "learning_rate": 3.7680939168254733e-06, + "loss": 0.3732057213783264, + "mean_token_accuracy": 0.8671083450317383, + "num_tokens": 12808480.0, + "step": 1439 + }, + { + "epoch": 1.094224924012158, + "grad_norm": 3.2103970050811768, + "learning_rate": 3.7662885092022206e-06, + "loss": 0.3556194603443146, + "mean_token_accuracy": 0.8786529302597046, + "num_tokens": 12812654.0, + "step": 1440 + }, + { + "epoch": 1.094984802431611, + "grad_norm": 2.2774064540863037, + "learning_rate": 3.7644822128489476e-06, + "loss": 0.38409674167633057, + "mean_token_accuracy": 0.866563081741333, + "num_tokens": 12819854.0, + "step": 1441 + }, + { + "epoch": 1.0957446808510638, + "grad_norm": 1.8250885009765625, + "learning_rate": 3.7626750290333824e-06, + "loss": 0.3812350034713745, + "mean_token_accuracy": 0.8676212430000305, + "num_tokens": 12830338.0, + "step": 1442 + }, + { + "epoch": 1.0965045592705167, + "grad_norm": 1.8337891101837158, + "learning_rate": 3.7608669590238765e-06, + "loss": 0.3892471194267273, + "mean_token_accuracy": 0.8616238832473755, + "num_tokens": 12840340.0, + "step": 1443 + }, + { + "epoch": 1.0972644376899696, + "grad_norm": 1.5300254821777344, + "learning_rate": 3.7590580040894025e-06, + "loss": 0.35288217663764954, + "mean_token_accuracy": 0.8625509738922119, + "num_tokens": 12853144.0, + "step": 1444 + }, + { + "epoch": 1.0980243161094225, + "grad_norm": 2.152683734893799, + "learning_rate": 3.7572481654995554e-06, + "loss": 0.4004772901535034, + "mean_token_accuracy": 0.858427107334137, + "num_tokens": 12859970.0, + "step": 1445 + }, + { + "epoch": 1.0987841945288754, + "grad_norm": 1.532832145690918, + "learning_rate": 3.755437444524548e-06, + "loss": 0.46820127964019775, + "mean_token_accuracy": 0.8585472106933594, + "num_tokens": 12875243.0, + "step": 1446 + }, + { + "epoch": 1.0995440729483283, + "grad_norm": 1.6485342979431152, + "learning_rate": 3.7536258424352164e-06, + "loss": 0.46329325437545776, + "mean_token_accuracy": 0.8376060724258423, + "num_tokens": 12886383.0, + "step": 1447 + }, + { + "epoch": 1.1003039513677813, + "grad_norm": 2.402256488800049, + "learning_rate": 3.75181336050301e-06, + "loss": 0.43916207551956177, + "mean_token_accuracy": 0.8448786735534668, + "num_tokens": 12892613.0, + "step": 1448 + }, + { + "epoch": 1.101063829787234, + "grad_norm": 1.3893651962280273, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.3919021785259247, + "mean_token_accuracy": 0.8495820760726929, + "num_tokens": 12905523.0, + "step": 1449 + }, + { + "epoch": 1.1018237082066868, + "grad_norm": 1.5519827604293823, + "learning_rate": 3.7481857621988734e-06, + "loss": 0.4710700809955597, + "mean_token_accuracy": 0.8387632369995117, + "num_tokens": 12918236.0, + "step": 1450 + }, + { + "epoch": 1.1025835866261398, + "grad_norm": 2.0141353607177734, + "learning_rate": 3.74637064837293e-06, + "loss": 0.30866751074790955, + "mean_token_accuracy": 0.9059321880340576, + "num_tokens": 12924391.0, + "step": 1451 + }, + { + "epoch": 1.1033434650455927, + "grad_norm": 1.2201496362686157, + "learning_rate": 3.7445546597960882e-06, + "loss": 0.3938257396221161, + "mean_token_accuracy": 0.8726630210876465, + "num_tokens": 12943338.0, + "step": 1452 + }, + { + "epoch": 1.1041033434650456, + "grad_norm": 2.29434871673584, + "learning_rate": 3.742737797742878e-06, + "loss": 0.4347776174545288, + "mean_token_accuracy": 0.840569257736206, + "num_tokens": 12950636.0, + "step": 1453 + }, + { + "epoch": 1.1048632218844985, + "grad_norm": 2.3875105381011963, + "learning_rate": 3.7409200634884425e-06, + "loss": 0.48353564739227295, + "mean_token_accuracy": 0.8207056522369385, + "num_tokens": 12957635.0, + "step": 1454 + }, + { + "epoch": 1.1056231003039514, + "grad_norm": 2.3539648056030273, + "learning_rate": 3.7391014583085384e-06, + "loss": 0.3532431721687317, + "mean_token_accuracy": 0.8903788924217224, + "num_tokens": 12963032.0, + "step": 1455 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 1.5611135959625244, + "learning_rate": 3.737281983479534e-06, + "loss": 0.4734863042831421, + "mean_token_accuracy": 0.8413879871368408, + "num_tokens": 12977170.0, + "step": 1456 + }, + { + "epoch": 1.1071428571428572, + "grad_norm": 1.474320411682129, + "learning_rate": 3.735461640278404e-06, + "loss": 0.41854286193847656, + "mean_token_accuracy": 0.8499876856803894, + "num_tokens": 12993750.0, + "step": 1457 + }, + { + "epoch": 1.1079027355623101, + "grad_norm": 2.6873273849487305, + "learning_rate": 3.733640429982738e-06, + "loss": 0.47637903690338135, + "mean_token_accuracy": 0.83599853515625, + "num_tokens": 12999058.0, + "step": 1458 + }, + { + "epoch": 1.108662613981763, + "grad_norm": 1.4575026035308838, + "learning_rate": 3.731818353870729e-06, + "loss": 0.38441652059555054, + "mean_token_accuracy": 0.8582364320755005, + "num_tokens": 13013864.0, + "step": 1459 + }, + { + "epoch": 1.1094224924012157, + "grad_norm": 1.7722690105438232, + "learning_rate": 3.729995413221183e-06, + "loss": 0.4224998950958252, + "mean_token_accuracy": 0.8511888384819031, + "num_tokens": 13023714.0, + "step": 1460 + }, + { + "epoch": 1.1101823708206686, + "grad_norm": 2.625760555267334, + "learning_rate": 3.7281716093135068e-06, + "loss": 0.3487582802772522, + "mean_token_accuracy": 0.8834779262542725, + "num_tokens": 13028608.0, + "step": 1461 + }, + { + "epoch": 1.1109422492401215, + "grad_norm": 1.2554056644439697, + "learning_rate": 3.726346943427719e-06, + "loss": 0.33312469720840454, + "mean_token_accuracy": 0.8704153299331665, + "num_tokens": 13044901.0, + "step": 1462 + }, + { + "epoch": 1.1117021276595744, + "grad_norm": 2.1109910011291504, + "learning_rate": 3.7245214168444388e-06, + "loss": 0.387290894985199, + "mean_token_accuracy": 0.860816240310669, + "num_tokens": 13051452.0, + "step": 1463 + }, + { + "epoch": 1.1124620060790273, + "grad_norm": 3.159201145172119, + "learning_rate": 3.722695030844891e-06, + "loss": 0.37690871953964233, + "mean_token_accuracy": 0.8717561960220337, + "num_tokens": 13055131.0, + "step": 1464 + }, + { + "epoch": 1.1132218844984803, + "grad_norm": 1.3810011148452759, + "learning_rate": 3.7208677867109042e-06, + "loss": 0.36598485708236694, + "mean_token_accuracy": 0.8683375120162964, + "num_tokens": 13069798.0, + "step": 1465 + }, + { + "epoch": 1.1139817629179332, + "grad_norm": 2.500849485397339, + "learning_rate": 3.7190396857249087e-06, + "loss": 0.2781746983528137, + "mean_token_accuracy": 0.9026005268096924, + "num_tokens": 13075127.0, + "step": 1466 + }, + { + "epoch": 1.114741641337386, + "grad_norm": 1.7445712089538574, + "learning_rate": 3.7172107291699356e-06, + "loss": 0.5055314302444458, + "mean_token_accuracy": 0.8252174258232117, + "num_tokens": 13084843.0, + "step": 1467 + }, + { + "epoch": 1.115501519756839, + "grad_norm": 1.6386256217956543, + "learning_rate": 3.7153809183296174e-06, + "loss": 0.38478314876556396, + "mean_token_accuracy": 0.8600847721099854, + "num_tokens": 13096517.0, + "step": 1468 + }, + { + "epoch": 1.1162613981762919, + "grad_norm": 2.3818395137786865, + "learning_rate": 3.713550254488185e-06, + "loss": 0.40308547019958496, + "mean_token_accuracy": 0.8628184795379639, + "num_tokens": 13102324.0, + "step": 1469 + }, + { + "epoch": 1.1170212765957448, + "grad_norm": 1.73163640499115, + "learning_rate": 3.7117187389304703e-06, + "loss": 0.5035421848297119, + "mean_token_accuracy": 0.8229597210884094, + "num_tokens": 13113763.0, + "step": 1470 + }, + { + "epoch": 1.1177811550151975, + "grad_norm": 3.147177219390869, + "learning_rate": 3.7098863729418997e-06, + "loss": 0.557449221611023, + "mean_token_accuracy": 0.8266849517822266, + "num_tokens": 13118849.0, + "step": 1471 + }, + { + "epoch": 1.1185410334346504, + "grad_norm": 1.5061391592025757, + "learning_rate": 3.7080531578085e-06, + "loss": 0.3759554922580719, + "mean_token_accuracy": 0.8541903495788574, + "num_tokens": 13131337.0, + "step": 1472 + }, + { + "epoch": 1.1193009118541033, + "grad_norm": 2.172346353530884, + "learning_rate": 3.7062190948168906e-06, + "loss": 0.41491609811782837, + "mean_token_accuracy": 0.8531454801559448, + "num_tokens": 13139767.0, + "step": 1473 + }, + { + "epoch": 1.1200607902735562, + "grad_norm": 2.1527154445648193, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.4309239387512207, + "mean_token_accuracy": 0.8327745199203491, + "num_tokens": 13147210.0, + "step": 1474 + }, + { + "epoch": 1.1208206686930091, + "grad_norm": 1.8342832326889038, + "learning_rate": 3.7025484304085035e-06, + "loss": 0.34393298625946045, + "mean_token_accuracy": 0.8948153257369995, + "num_tokens": 13154831.0, + "step": 1475 + }, + { + "epoch": 1.121580547112462, + "grad_norm": 2.509291172027588, + "learning_rate": 3.7007118315679384e-06, + "loss": 0.4479471445083618, + "mean_token_accuracy": 0.8280234336853027, + "num_tokens": 13161040.0, + "step": 1476 + }, + { + "epoch": 1.122340425531915, + "grad_norm": 2.914710521697998, + "learning_rate": 3.6988743900215895e-06, + "loss": 0.3724832832813263, + "mean_token_accuracy": 0.863893985748291, + "num_tokens": 13164975.0, + "step": 1477 + }, + { + "epoch": 1.1231003039513678, + "grad_norm": 3.274808645248413, + "learning_rate": 3.6970361070590443e-06, + "loss": 0.4088161885738373, + "mean_token_accuracy": 0.8474822044372559, + "num_tokens": 13168826.0, + "step": 1478 + }, + { + "epoch": 1.1238601823708207, + "grad_norm": 2.861546277999878, + "learning_rate": 3.695196983970481e-06, + "loss": 0.45837992429733276, + "mean_token_accuracy": 0.8579759001731873, + "num_tokens": 13173794.0, + "step": 1479 + }, + { + "epoch": 1.1246200607902737, + "grad_norm": 1.9491597414016724, + "learning_rate": 3.6933570220466654e-06, + "loss": 0.4333910346031189, + "mean_token_accuracy": 0.8444236516952515, + "num_tokens": 13181598.0, + "step": 1480 + }, + { + "epoch": 1.1253799392097266, + "grad_norm": 1.329848051071167, + "learning_rate": 3.6915162225789546e-06, + "loss": 0.36404621601104736, + "mean_token_accuracy": 0.8694117069244385, + "num_tokens": 13196381.0, + "step": 1481 + }, + { + "epoch": 1.1261398176291793, + "grad_norm": 1.8854197263717651, + "learning_rate": 3.6896745868592924e-06, + "loss": 0.4085756838321686, + "mean_token_accuracy": 0.855188250541687, + "num_tokens": 13205236.0, + "step": 1482 + }, + { + "epoch": 1.1268996960486322, + "grad_norm": 3.01684832572937, + "learning_rate": 3.6878321161802106e-06, + "loss": 0.28105655312538147, + "mean_token_accuracy": 0.9009426236152649, + "num_tokens": 13209380.0, + "step": 1483 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 1.8051308393478394, + "learning_rate": 3.685988811834823e-06, + "loss": 0.3314531147480011, + "mean_token_accuracy": 0.8805814385414124, + "num_tokens": 13217714.0, + "step": 1484 + }, + { + "epoch": 1.128419452887538, + "grad_norm": 1.61757493019104, + "learning_rate": 3.684144675116836e-06, + "loss": 0.4543863534927368, + "mean_token_accuracy": 0.8400536775588989, + "num_tokens": 13229330.0, + "step": 1485 + }, + { + "epoch": 1.1291793313069909, + "grad_norm": 1.602686882019043, + "learning_rate": 3.682299707320532e-06, + "loss": 0.3653204143047333, + "mean_token_accuracy": 0.8655825853347778, + "num_tokens": 13242872.0, + "step": 1486 + }, + { + "epoch": 1.1299392097264438, + "grad_norm": 2.3093113899230957, + "learning_rate": 3.680453909740782e-06, + "loss": 0.4383693039417267, + "mean_token_accuracy": 0.839782178401947, + "num_tokens": 13248976.0, + "step": 1487 + }, + { + "epoch": 1.1306990881458967, + "grad_norm": 1.180559754371643, + "learning_rate": 3.6786072836730376e-06, + "loss": 0.5354755520820618, + "mean_token_accuracy": 0.8151205778121948, + "num_tokens": 13272896.0, + "step": 1488 + }, + { + "epoch": 1.1314589665653496, + "grad_norm": 1.9554040431976318, + "learning_rate": 3.6767598304133325e-06, + "loss": 0.4485316872596741, + "mean_token_accuracy": 0.8399936556816101, + "num_tokens": 13280757.0, + "step": 1489 + }, + { + "epoch": 1.1322188449848025, + "grad_norm": 2.236471176147461, + "learning_rate": 3.674911551258279e-06, + "loss": 0.45594364404678345, + "mean_token_accuracy": 0.8552400469779968, + "num_tokens": 13287328.0, + "step": 1490 + }, + { + "epoch": 1.1329787234042552, + "grad_norm": 2.5228686332702637, + "learning_rate": 3.673062447505072e-06, + "loss": 0.4048641622066498, + "mean_token_accuracy": 0.8617376685142517, + "num_tokens": 13292716.0, + "step": 1491 + }, + { + "epoch": 1.1337386018237081, + "grad_norm": 1.1274473667144775, + "learning_rate": 3.6712125204514836e-06, + "loss": 0.3848876357078552, + "mean_token_accuracy": 0.8672975301742554, + "num_tokens": 13313403.0, + "step": 1492 + }, + { + "epoch": 1.134498480243161, + "grad_norm": 2.349541425704956, + "learning_rate": 3.6693617713958633e-06, + "loss": 0.3166058361530304, + "mean_token_accuracy": 0.8896721601486206, + "num_tokens": 13318720.0, + "step": 1493 + }, + { + "epoch": 1.135258358662614, + "grad_norm": 2.2438278198242188, + "learning_rate": 3.6675102016371387e-06, + "loss": 0.5418218970298767, + "mean_token_accuracy": 0.8256527185440063, + "num_tokens": 13325360.0, + "step": 1494 + }, + { + "epoch": 1.1360182370820668, + "grad_norm": 2.21268892288208, + "learning_rate": 3.665657812474812e-06, + "loss": 0.48603951930999756, + "mean_token_accuracy": 0.8273470401763916, + "num_tokens": 13333217.0, + "step": 1495 + }, + { + "epoch": 1.1367781155015197, + "grad_norm": 2.6105997562408447, + "learning_rate": 3.6638046052089614e-06, + "loss": 0.31221291422843933, + "mean_token_accuracy": 0.888375997543335, + "num_tokens": 13338413.0, + "step": 1496 + }, + { + "epoch": 1.1375379939209727, + "grad_norm": 3.655658483505249, + "learning_rate": 3.661950581140239e-06, + "loss": 0.3609023988246918, + "mean_token_accuracy": 0.8838576078414917, + "num_tokens": 13341499.0, + "step": 1497 + }, + { + "epoch": 1.1382978723404256, + "grad_norm": 2.242009162902832, + "learning_rate": 3.660095741569871e-06, + "loss": 0.40022802352905273, + "mean_token_accuracy": 0.8559960722923279, + "num_tokens": 13347917.0, + "step": 1498 + }, + { + "epoch": 1.1390577507598785, + "grad_norm": 1.7958979606628418, + "learning_rate": 3.658240087799655e-06, + "loss": 0.499157190322876, + "mean_token_accuracy": 0.8423802256584167, + "num_tokens": 13361570.0, + "step": 1499 + }, + { + "epoch": 1.1398176291793314, + "grad_norm": 2.5406908988952637, + "learning_rate": 3.6563836211319593e-06, + "loss": 0.4090137481689453, + "mean_token_accuracy": 0.8769663572311401, + "num_tokens": 13367183.0, + "step": 1500 + }, + { + "epoch": 1.1405775075987843, + "grad_norm": 1.9861716032028198, + "learning_rate": 3.654526342869724e-06, + "loss": 0.5125207304954529, + "mean_token_accuracy": 0.8315266370773315, + "num_tokens": 13376767.0, + "step": 1501 + }, + { + "epoch": 1.141337386018237, + "grad_norm": 1.731188178062439, + "learning_rate": 3.65266825431646e-06, + "loss": 0.39452576637268066, + "mean_token_accuracy": 0.8585706353187561, + "num_tokens": 13388437.0, + "step": 1502 + }, + { + "epoch": 1.1420972644376899, + "grad_norm": 1.5203773975372314, + "learning_rate": 3.6508093567762425e-06, + "loss": 0.39466819167137146, + "mean_token_accuracy": 0.8584027886390686, + "num_tokens": 13399727.0, + "step": 1503 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 2.606462001800537, + "learning_rate": 3.6489496515537204e-06, + "loss": 0.4521079361438751, + "mean_token_accuracy": 0.8413360118865967, + "num_tokens": 13408426.0, + "step": 1504 + }, + { + "epoch": 1.1436170212765957, + "grad_norm": 2.6207993030548096, + "learning_rate": 3.647089139954104e-06, + "loss": 0.4709353446960449, + "mean_token_accuracy": 0.8397113084793091, + "num_tokens": 13413506.0, + "step": 1505 + }, + { + "epoch": 1.1443768996960486, + "grad_norm": 1.7214165925979614, + "learning_rate": 3.6452278232831734e-06, + "loss": 0.45506367087364197, + "mean_token_accuracy": 0.8466023206710815, + "num_tokens": 13424592.0, + "step": 1506 + }, + { + "epoch": 1.1451367781155015, + "grad_norm": 1.7111759185791016, + "learning_rate": 3.643365702847272e-06, + "loss": 0.5016278624534607, + "mean_token_accuracy": 0.8196234703063965, + "num_tokens": 13434421.0, + "step": 1507 + }, + { + "epoch": 1.1458966565349544, + "grad_norm": 1.7528148889541626, + "learning_rate": 3.641502779953307e-06, + "loss": 0.5020896196365356, + "mean_token_accuracy": 0.826249361038208, + "num_tokens": 13445286.0, + "step": 1508 + }, + { + "epoch": 1.1466565349544073, + "grad_norm": 1.3470909595489502, + "learning_rate": 3.639639055908751e-06, + "loss": 0.45765724778175354, + "mean_token_accuracy": 0.8380560278892517, + "num_tokens": 13465030.0, + "step": 1509 + }, + { + "epoch": 1.1474164133738602, + "grad_norm": 2.4846835136413574, + "learning_rate": 3.6377745320216346e-06, + "loss": 0.46488267183303833, + "mean_token_accuracy": 0.8393925428390503, + "num_tokens": 13470883.0, + "step": 1510 + }, + { + "epoch": 1.1481762917933132, + "grad_norm": 1.770201563835144, + "learning_rate": 3.635909209600555e-06, + "loss": 0.5262179374694824, + "mean_token_accuracy": 0.8201162815093994, + "num_tokens": 13482558.0, + "step": 1511 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 1.5955098867416382, + "learning_rate": 3.6340430899546656e-06, + "loss": 0.430621862411499, + "mean_token_accuracy": 0.8488553762435913, + "num_tokens": 13493003.0, + "step": 1512 + }, + { + "epoch": 1.1496960486322187, + "grad_norm": 2.846176862716675, + "learning_rate": 3.632176174393682e-06, + "loss": 0.23461638391017914, + "mean_token_accuracy": 0.9218817353248596, + "num_tokens": 13496566.0, + "step": 1513 + }, + { + "epoch": 1.1504559270516717, + "grad_norm": 1.9606610536575317, + "learning_rate": 3.630308464227877e-06, + "loss": 0.4940161108970642, + "mean_token_accuracy": 0.8474864959716797, + "num_tokens": 13504843.0, + "step": 1514 + }, + { + "epoch": 1.1512158054711246, + "grad_norm": 1.1588608026504517, + "learning_rate": 3.628439960768082e-06, + "loss": 0.32650992274284363, + "mean_token_accuracy": 0.8797246217727661, + "num_tokens": 13521513.0, + "step": 1515 + }, + { + "epoch": 1.1519756838905775, + "grad_norm": 1.3566495180130005, + "learning_rate": 3.6265706653256837e-06, + "loss": 0.4359064996242523, + "mean_token_accuracy": 0.8379859328269958, + "num_tokens": 13540608.0, + "step": 1516 + }, + { + "epoch": 1.1527355623100304, + "grad_norm": 1.4728609323501587, + "learning_rate": 3.624700579212626e-06, + "loss": 0.29939693212509155, + "mean_token_accuracy": 0.8831408023834229, + "num_tokens": 13550641.0, + "step": 1517 + }, + { + "epoch": 1.1534954407294833, + "grad_norm": 2.162325382232666, + "learning_rate": 3.6228297037414077e-06, + "loss": 0.4097636938095093, + "mean_token_accuracy": 0.8575425148010254, + "num_tokens": 13556931.0, + "step": 1518 + }, + { + "epoch": 1.1542553191489362, + "grad_norm": 1.754439353942871, + "learning_rate": 3.6209580402250816e-06, + "loss": 0.400202214717865, + "mean_token_accuracy": 0.8569821119308472, + "num_tokens": 13565491.0, + "step": 1519 + }, + { + "epoch": 1.155015197568389, + "grad_norm": 1.5250083208084106, + "learning_rate": 3.619085589977251e-06, + "loss": 0.43330419063568115, + "mean_token_accuracy": 0.8492985963821411, + "num_tokens": 13577147.0, + "step": 1520 + }, + { + "epoch": 1.155775075987842, + "grad_norm": 1.9108905792236328, + "learning_rate": 3.617212354312076e-06, + "loss": 0.30567464232444763, + "mean_token_accuracy": 0.8850164413452148, + "num_tokens": 13584366.0, + "step": 1521 + }, + { + "epoch": 1.156534954407295, + "grad_norm": 2.2574243545532227, + "learning_rate": 3.615338334544265e-06, + "loss": 0.4391738772392273, + "mean_token_accuracy": 0.839765727519989, + "num_tokens": 13591816.0, + "step": 1522 + }, + { + "epoch": 1.1572948328267478, + "grad_norm": 2.1235218048095703, + "learning_rate": 3.6134635319890763e-06, + "loss": 0.45043107867240906, + "mean_token_accuracy": 0.8385299444198608, + "num_tokens": 13599736.0, + "step": 1523 + }, + { + "epoch": 1.1580547112462005, + "grad_norm": 2.2274110317230225, + "learning_rate": 3.611587947962319e-06, + "loss": 0.3623226284980774, + "mean_token_accuracy": 0.8724044561386108, + "num_tokens": 13605354.0, + "step": 1524 + }, + { + "epoch": 1.1588145896656534, + "grad_norm": 3.414236545562744, + "learning_rate": 3.6097115837803504e-06, + "loss": 0.30060696601867676, + "mean_token_accuracy": 0.8971061706542969, + "num_tokens": 13608851.0, + "step": 1525 + }, + { + "epoch": 1.1595744680851063, + "grad_norm": 2.496264696121216, + "learning_rate": 3.6078344407600744e-06, + "loss": 0.3567180037498474, + "mean_token_accuracy": 0.8596180081367493, + "num_tokens": 13614339.0, + "step": 1526 + }, + { + "epoch": 1.1603343465045592, + "grad_norm": 2.0191843509674072, + "learning_rate": 3.6059565202189433e-06, + "loss": 0.43206095695495605, + "mean_token_accuracy": 0.8464000821113586, + "num_tokens": 13622395.0, + "step": 1527 + }, + { + "epoch": 1.1610942249240122, + "grad_norm": 1.5475906133651733, + "learning_rate": 3.604077823474954e-06, + "loss": 0.4535648226737976, + "mean_token_accuracy": 0.8391586542129517, + "num_tokens": 13635356.0, + "step": 1528 + }, + { + "epoch": 1.161854103343465, + "grad_norm": 2.1348211765289307, + "learning_rate": 3.6021983518466468e-06, + "loss": 0.2733963429927826, + "mean_token_accuracy": 0.9007417559623718, + "num_tokens": 13640641.0, + "step": 1529 + }, + { + "epoch": 1.162613981762918, + "grad_norm": 2.8452792167663574, + "learning_rate": 3.600318106653108e-06, + "loss": 0.29591235518455505, + "mean_token_accuracy": 0.8934413194656372, + "num_tokens": 13644995.0, + "step": 1530 + }, + { + "epoch": 1.1633738601823709, + "grad_norm": 2.342907190322876, + "learning_rate": 3.5984370892139663e-06, + "loss": 0.4675130248069763, + "mean_token_accuracy": 0.8352028131484985, + "num_tokens": 13652695.0, + "step": 1531 + }, + { + "epoch": 1.1641337386018238, + "grad_norm": 2.3480238914489746, + "learning_rate": 3.5965553008493924e-06, + "loss": 0.3114515542984009, + "mean_token_accuracy": 0.8845353126525879, + "num_tokens": 13658101.0, + "step": 1532 + }, + { + "epoch": 1.1648936170212765, + "grad_norm": 1.8608155250549316, + "learning_rate": 3.594672742880097e-06, + "loss": 0.3864145278930664, + "mean_token_accuracy": 0.867354154586792, + "num_tokens": 13666042.0, + "step": 1533 + }, + { + "epoch": 1.1656534954407296, + "grad_norm": 1.4756088256835938, + "learning_rate": 3.5927894166273324e-06, + "loss": 0.3671600818634033, + "mean_token_accuracy": 0.8695988655090332, + "num_tokens": 13678253.0, + "step": 1534 + }, + { + "epoch": 1.1664133738601823, + "grad_norm": 2.8831355571746826, + "learning_rate": 3.5909053234128893e-06, + "loss": 0.267184317111969, + "mean_token_accuracy": 0.9008115530014038, + "num_tokens": 13681790.0, + "step": 1535 + }, + { + "epoch": 1.1671732522796352, + "grad_norm": 2.1984763145446777, + "learning_rate": 3.5890204645590964e-06, + "loss": 0.4431505799293518, + "mean_token_accuracy": 0.8623673915863037, + "num_tokens": 13688444.0, + "step": 1536 + }, + { + "epoch": 1.167933130699088, + "grad_norm": 1.8271523714065552, + "learning_rate": 3.5871348413888207e-06, + "loss": 0.3861040771007538, + "mean_token_accuracy": 0.8624277114868164, + "num_tokens": 13696872.0, + "step": 1537 + }, + { + "epoch": 1.168693009118541, + "grad_norm": 1.6313756704330444, + "learning_rate": 3.585248455225466e-06, + "loss": 0.3775154948234558, + "mean_token_accuracy": 0.8624461889266968, + "num_tokens": 13706167.0, + "step": 1538 + }, + { + "epoch": 1.169452887537994, + "grad_norm": 2.4377901554107666, + "learning_rate": 3.5833613073929684e-06, + "loss": 0.2308957427740097, + "mean_token_accuracy": 0.920600175857544, + "num_tokens": 13710367.0, + "step": 1539 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 2.2621750831604004, + "learning_rate": 3.5814733992158025e-06, + "loss": 0.33167219161987305, + "mean_token_accuracy": 0.8963261842727661, + "num_tokens": 13716384.0, + "step": 1540 + }, + { + "epoch": 1.1709726443768997, + "grad_norm": 1.3178150653839111, + "learning_rate": 3.579584732018975e-06, + "loss": 0.3276631832122803, + "mean_token_accuracy": 0.8853521347045898, + "num_tokens": 13731031.0, + "step": 1541 + }, + { + "epoch": 1.1717325227963526, + "grad_norm": 2.177750587463379, + "learning_rate": 3.577695307128024e-06, + "loss": 0.48177266120910645, + "mean_token_accuracy": 0.830329418182373, + "num_tokens": 13737925.0, + "step": 1542 + }, + { + "epoch": 1.1724924012158056, + "grad_norm": 2.2268829345703125, + "learning_rate": 3.5758051258690223e-06, + "loss": 0.48843517899513245, + "mean_token_accuracy": 0.8310644030570984, + "num_tokens": 13746039.0, + "step": 1543 + }, + { + "epoch": 1.1732522796352582, + "grad_norm": 1.498701572418213, + "learning_rate": 3.5739141895685708e-06, + "loss": 0.4542962312698364, + "mean_token_accuracy": 0.8500330448150635, + "num_tokens": 13765002.0, + "step": 1544 + }, + { + "epoch": 1.1740121580547112, + "grad_norm": 1.786670446395874, + "learning_rate": 3.5720224995538023e-06, + "loss": 0.27367928624153137, + "mean_token_accuracy": 0.8916142582893372, + "num_tokens": 13774113.0, + "step": 1545 + }, + { + "epoch": 1.174772036474164, + "grad_norm": 2.0311272144317627, + "learning_rate": 3.5701300571523757e-06, + "loss": 0.559987485408783, + "mean_token_accuracy": 0.8266973495483398, + "num_tokens": 13783912.0, + "step": 1546 + }, + { + "epoch": 1.175531914893617, + "grad_norm": 1.8732186555862427, + "learning_rate": 3.5682368636924825e-06, + "loss": 0.5184751152992249, + "mean_token_accuracy": 0.8450918197631836, + "num_tokens": 13792728.0, + "step": 1547 + }, + { + "epoch": 1.1762917933130699, + "grad_norm": 1.4410661458969116, + "learning_rate": 3.566342920502837e-06, + "loss": 0.383536696434021, + "mean_token_accuracy": 0.8672217726707458, + "num_tokens": 13813590.0, + "step": 1548 + }, + { + "epoch": 1.1770516717325228, + "grad_norm": 3.06056547164917, + "learning_rate": 3.564448228912682e-06, + "loss": 0.3941686153411865, + "mean_token_accuracy": 0.8696402311325073, + "num_tokens": 13817704.0, + "step": 1549 + }, + { + "epoch": 1.1778115501519757, + "grad_norm": 1.6150329113006592, + "learning_rate": 3.562552790251785e-06, + "loss": 0.41606605052948, + "mean_token_accuracy": 0.8488572835922241, + "num_tokens": 13831303.0, + "step": 1550 + }, + { + "epoch": 1.1785714285714286, + "grad_norm": 2.1199934482574463, + "learning_rate": 3.5606566058504377e-06, + "loss": 0.3974752426147461, + "mean_token_accuracy": 0.8686345219612122, + "num_tokens": 13837613.0, + "step": 1551 + }, + { + "epoch": 1.1793313069908815, + "grad_norm": 1.5683876276016235, + "learning_rate": 3.558759677039455e-06, + "loss": 0.35225993394851685, + "mean_token_accuracy": 0.8710784316062927, + "num_tokens": 13846779.0, + "step": 1552 + }, + { + "epoch": 1.1800911854103344, + "grad_norm": 1.4644675254821777, + "learning_rate": 3.5568620051501755e-06, + "loss": 0.38400042057037354, + "mean_token_accuracy": 0.8548328876495361, + "num_tokens": 13860713.0, + "step": 1553 + }, + { + "epoch": 1.1808510638297873, + "grad_norm": 1.461491346359253, + "learning_rate": 3.5549635915144578e-06, + "loss": 0.4572640061378479, + "mean_token_accuracy": 0.8506045937538147, + "num_tokens": 13877289.0, + "step": 1554 + }, + { + "epoch": 1.18161094224924, + "grad_norm": 2.6364715099334717, + "learning_rate": 3.553064437464682e-06, + "loss": 0.3954341411590576, + "mean_token_accuracy": 0.8561649322509766, + "num_tokens": 13882064.0, + "step": 1555 + }, + { + "epoch": 1.182370820668693, + "grad_norm": 2.027273654937744, + "learning_rate": 3.551164544333745e-06, + "loss": 0.47625732421875, + "mean_token_accuracy": 0.8349384069442749, + "num_tokens": 13890306.0, + "step": 1556 + }, + { + "epoch": 1.1831306990881458, + "grad_norm": 2.8427743911743164, + "learning_rate": 3.549263913455069e-06, + "loss": 0.4273033142089844, + "mean_token_accuracy": 0.8541387319564819, + "num_tokens": 13894882.0, + "step": 1557 + }, + { + "epoch": 1.1838905775075987, + "grad_norm": 1.6298975944519043, + "learning_rate": 3.5473625461625884e-06, + "loss": 0.4378639757633209, + "mean_token_accuracy": 0.8634963631629944, + "num_tokens": 13906152.0, + "step": 1558 + }, + { + "epoch": 1.1846504559270516, + "grad_norm": 2.4098947048187256, + "learning_rate": 3.5454604437907535e-06, + "loss": 0.47236716747283936, + "mean_token_accuracy": 0.8646864891052246, + "num_tokens": 13911803.0, + "step": 1559 + }, + { + "epoch": 1.1854103343465046, + "grad_norm": 1.5972497463226318, + "learning_rate": 3.543557607674537e-06, + "loss": 0.3001407980918884, + "mean_token_accuracy": 0.8927055597305298, + "num_tokens": 13921304.0, + "step": 1560 + }, + { + "epoch": 1.1861702127659575, + "grad_norm": 2.1140005588531494, + "learning_rate": 3.54165403914942e-06, + "loss": 0.41898271441459656, + "mean_token_accuracy": 0.8542245626449585, + "num_tokens": 13929434.0, + "step": 1561 + }, + { + "epoch": 1.1869300911854104, + "grad_norm": 1.8733803033828735, + "learning_rate": 3.539749739551401e-06, + "loss": 0.35469961166381836, + "mean_token_accuracy": 0.8805290460586548, + "num_tokens": 13937781.0, + "step": 1562 + }, + { + "epoch": 1.1876899696048633, + "grad_norm": 2.2805802822113037, + "learning_rate": 3.53784471021699e-06, + "loss": 0.44496792554855347, + "mean_token_accuracy": 0.8454172611236572, + "num_tokens": 13944394.0, + "step": 1563 + }, + { + "epoch": 1.1884498480243162, + "grad_norm": 0.9728449583053589, + "learning_rate": 3.535938952483211e-06, + "loss": 0.3156968355178833, + "mean_token_accuracy": 0.8739837408065796, + "num_tokens": 13966712.0, + "step": 1564 + }, + { + "epoch": 1.189209726443769, + "grad_norm": 3.025338888168335, + "learning_rate": 3.534032467687597e-06, + "loss": 0.30036938190460205, + "mean_token_accuracy": 0.9058252573013306, + "num_tokens": 13970183.0, + "step": 1565 + }, + { + "epoch": 1.1899696048632218, + "grad_norm": 2.0659425258636475, + "learning_rate": 3.532125257168193e-06, + "loss": 0.30619731545448303, + "mean_token_accuracy": 0.9041587710380554, + "num_tokens": 13976657.0, + "step": 1566 + }, + { + "epoch": 1.1907294832826747, + "grad_norm": 3.2036776542663574, + "learning_rate": 3.5302173222635526e-06, + "loss": 0.4145944118499756, + "mean_token_accuracy": 0.8502328395843506, + "num_tokens": 13981198.0, + "step": 1567 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 1.7767539024353027, + "learning_rate": 3.5283086643127396e-06, + "loss": 0.437128484249115, + "mean_token_accuracy": 0.8965631723403931, + "num_tokens": 13990259.0, + "step": 1568 + }, + { + "epoch": 1.1922492401215805, + "grad_norm": 1.7777384519577026, + "learning_rate": 3.5263992846553203e-06, + "loss": 0.33831220865249634, + "mean_token_accuracy": 0.8734279870986938, + "num_tokens": 13999363.0, + "step": 1569 + }, + { + "epoch": 1.1930091185410334, + "grad_norm": 1.6710708141326904, + "learning_rate": 3.5244891846313733e-06, + "loss": 0.4005590081214905, + "mean_token_accuracy": 0.8820298314094543, + "num_tokens": 14008719.0, + "step": 1570 + }, + { + "epoch": 1.1937689969604863, + "grad_norm": 1.0378777980804443, + "learning_rate": 3.5225783655814798e-06, + "loss": 0.3174915313720703, + "mean_token_accuracy": 0.8894162774085999, + "num_tokens": 14025806.0, + "step": 1571 + }, + { + "epoch": 1.1945288753799392, + "grad_norm": 1.2647521495819092, + "learning_rate": 3.520666828846726e-06, + "loss": 0.4173050820827484, + "mean_token_accuracy": 0.8437265157699585, + "num_tokens": 14046445.0, + "step": 1572 + }, + { + "epoch": 1.1952887537993921, + "grad_norm": 2.8625528812408447, + "learning_rate": 3.518754575768702e-06, + "loss": 0.37182557582855225, + "mean_token_accuracy": 0.8660947680473328, + "num_tokens": 14051197.0, + "step": 1573 + }, + { + "epoch": 1.196048632218845, + "grad_norm": 1.1213171482086182, + "learning_rate": 3.516841607689501e-06, + "loss": 0.332731157541275, + "mean_token_accuracy": 0.8573278784751892, + "num_tokens": 14070817.0, + "step": 1574 + }, + { + "epoch": 1.196808510638298, + "grad_norm": 1.197508692741394, + "learning_rate": 3.5149279259517165e-06, + "loss": 0.34058472514152527, + "mean_token_accuracy": 0.8603571653366089, + "num_tokens": 14085301.0, + "step": 1575 + }, + { + "epoch": 1.1975683890577509, + "grad_norm": 4.019949913024902, + "learning_rate": 3.5130135318984454e-06, + "loss": 0.3094622492790222, + "mean_token_accuracy": 0.8905094861984253, + "num_tokens": 14088107.0, + "step": 1576 + }, + { + "epoch": 1.1983282674772036, + "grad_norm": 2.591181755065918, + "learning_rate": 3.5110984268732827e-06, + "loss": 0.3407078981399536, + "mean_token_accuracy": 0.880385160446167, + "num_tokens": 14092887.0, + "step": 1577 + }, + { + "epoch": 1.1990881458966565, + "grad_norm": 1.3069331645965576, + "learning_rate": 3.509182612220322e-06, + "loss": 0.3761988878250122, + "mean_token_accuracy": 0.862013041973114, + "num_tokens": 14109216.0, + "step": 1578 + }, + { + "epoch": 1.1998480243161094, + "grad_norm": 1.7802022695541382, + "learning_rate": 3.507266089284157e-06, + "loss": 0.3824652135372162, + "mean_token_accuracy": 0.8707721829414368, + "num_tokens": 14119645.0, + "step": 1579 + }, + { + "epoch": 1.2006079027355623, + "grad_norm": 2.7937185764312744, + "learning_rate": 3.5053488594098763e-06, + "loss": 0.33828890323638916, + "mean_token_accuracy": 0.8765541315078735, + "num_tokens": 14124628.0, + "step": 1580 + }, + { + "epoch": 1.2013677811550152, + "grad_norm": 1.892671823501587, + "learning_rate": 3.5034309239430664e-06, + "loss": 0.3476094603538513, + "mean_token_accuracy": 0.9053795337677002, + "num_tokens": 14131756.0, + "step": 1581 + }, + { + "epoch": 1.202127659574468, + "grad_norm": 1.6857695579528809, + "learning_rate": 3.501512284229807e-06, + "loss": 0.5397108793258667, + "mean_token_accuracy": 0.8173421025276184, + "num_tokens": 14143024.0, + "step": 1582 + }, + { + "epoch": 1.202887537993921, + "grad_norm": 2.501737117767334, + "learning_rate": 3.4995929416166756e-06, + "loss": 0.4192458391189575, + "mean_token_accuracy": 0.8558136224746704, + "num_tokens": 14149499.0, + "step": 1583 + }, + { + "epoch": 1.203647416413374, + "grad_norm": 2.0133907794952393, + "learning_rate": 3.4976728974507387e-06, + "loss": 0.4791576564311981, + "mean_token_accuracy": 0.8253597021102905, + "num_tokens": 14158381.0, + "step": 1584 + }, + { + "epoch": 1.2044072948328268, + "grad_norm": 2.984611988067627, + "learning_rate": 3.4957521530795576e-06, + "loss": 0.3040750026702881, + "mean_token_accuracy": 0.8902391791343689, + "num_tokens": 14162419.0, + "step": 1585 + }, + { + "epoch": 1.2051671732522795, + "grad_norm": 1.518591284751892, + "learning_rate": 3.493830709851185e-06, + "loss": 0.35539618134498596, + "mean_token_accuracy": 0.8737183809280396, + "num_tokens": 14173048.0, + "step": 1586 + }, + { + "epoch": 1.2059270516717326, + "grad_norm": 2.628758192062378, + "learning_rate": 3.4919085691141636e-06, + "loss": 0.33340200781822205, + "mean_token_accuracy": 0.8705098628997803, + "num_tokens": 14178255.0, + "step": 1587 + }, + { + "epoch": 1.2066869300911853, + "grad_norm": 2.5565974712371826, + "learning_rate": 3.4899857322175252e-06, + "loss": 0.44939476251602173, + "mean_token_accuracy": 0.8315504193305969, + "num_tokens": 14183808.0, + "step": 1588 + }, + { + "epoch": 1.2074468085106382, + "grad_norm": 1.7521045207977295, + "learning_rate": 3.4880622005107916e-06, + "loss": 0.3168621063232422, + "mean_token_accuracy": 0.8824669122695923, + "num_tokens": 14192186.0, + "step": 1589 + }, + { + "epoch": 1.2082066869300911, + "grad_norm": 1.9816104173660278, + "learning_rate": 3.486137975343971e-06, + "loss": 0.3892582058906555, + "mean_token_accuracy": 0.8524188995361328, + "num_tokens": 14200512.0, + "step": 1590 + }, + { + "epoch": 1.208966565349544, + "grad_norm": 1.459800124168396, + "learning_rate": 3.484213058067559e-06, + "loss": 0.45930033922195435, + "mean_token_accuracy": 0.8408471345901489, + "num_tokens": 14215232.0, + "step": 1591 + }, + { + "epoch": 1.209726443768997, + "grad_norm": 2.015493154525757, + "learning_rate": 3.482287450032536e-06, + "loss": 0.5514016151428223, + "mean_token_accuracy": 0.8456779718399048, + "num_tokens": 14225402.0, + "step": 1592 + }, + { + "epoch": 1.2104863221884499, + "grad_norm": 3.4511911869049072, + "learning_rate": 3.4803611525903687e-06, + "loss": 0.4772771894931793, + "mean_token_accuracy": 0.8558698892593384, + "num_tokens": 14229038.0, + "step": 1593 + }, + { + "epoch": 1.2112462006079028, + "grad_norm": 2.2247982025146484, + "learning_rate": 3.4784341670930067e-06, + "loss": 0.4042825996875763, + "mean_token_accuracy": 0.8635870218276978, + "num_tokens": 14237057.0, + "step": 1594 + }, + { + "epoch": 1.2120060790273557, + "grad_norm": 2.0534820556640625, + "learning_rate": 3.4765064948928813e-06, + "loss": 0.34057414531707764, + "mean_token_accuracy": 0.8800770044326782, + "num_tokens": 14243013.0, + "step": 1595 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 2.594703197479248, + "learning_rate": 3.474578137342909e-06, + "loss": 0.4997410774230957, + "mean_token_accuracy": 0.8302106261253357, + "num_tokens": 14251210.0, + "step": 1596 + }, + { + "epoch": 1.2135258358662613, + "grad_norm": 2.517833948135376, + "learning_rate": 3.4726490957964836e-06, + "loss": 0.3630390465259552, + "mean_token_accuracy": 0.8679884672164917, + "num_tokens": 14255893.0, + "step": 1597 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 1.5177065134048462, + "learning_rate": 3.4707193716074816e-06, + "loss": 0.36218544840812683, + "mean_token_accuracy": 0.879178524017334, + "num_tokens": 14268143.0, + "step": 1598 + }, + { + "epoch": 1.215045592705167, + "grad_norm": 2.215291738510132, + "learning_rate": 3.4687889661302577e-06, + "loss": 0.4166645407676697, + "mean_token_accuracy": 0.8495793342590332, + "num_tokens": 14276794.0, + "step": 1599 + }, + { + "epoch": 1.21580547112462, + "grad_norm": 1.534294843673706, + "learning_rate": 3.466857880719645e-06, + "loss": 0.2635883092880249, + "mean_token_accuracy": 0.8971712589263916, + "num_tokens": 14287000.0, + "step": 1600 + }, + { + "epoch": 1.216565349544073, + "grad_norm": 1.2338658571243286, + "learning_rate": 3.464926116730953e-06, + "loss": 0.339110404253006, + "mean_token_accuracy": 0.895592987537384, + "num_tokens": 14303217.0, + "step": 1601 + }, + { + "epoch": 1.2173252279635258, + "grad_norm": 1.8717178106307983, + "learning_rate": 3.462993675519968e-06, + "loss": 0.41204726696014404, + "mean_token_accuracy": 0.8560728430747986, + "num_tokens": 14311372.0, + "step": 1602 + }, + { + "epoch": 1.2180851063829787, + "grad_norm": 2.844160795211792, + "learning_rate": 3.4610605584429526e-06, + "loss": 0.4129520058631897, + "mean_token_accuracy": 0.8555002212524414, + "num_tokens": 14316244.0, + "step": 1603 + }, + { + "epoch": 1.2188449848024316, + "grad_norm": 1.099926471710205, + "learning_rate": 3.4591267668566412e-06, + "loss": 0.35783132910728455, + "mean_token_accuracy": 0.8693175315856934, + "num_tokens": 14338414.0, + "step": 1604 + }, + { + "epoch": 1.2196048632218845, + "grad_norm": 1.6448384523391724, + "learning_rate": 3.457192302118244e-06, + "loss": 0.42060258984565735, + "mean_token_accuracy": 0.8557323217391968, + "num_tokens": 14349143.0, + "step": 1605 + }, + { + "epoch": 1.2203647416413375, + "grad_norm": 2.097529888153076, + "learning_rate": 3.455257165585444e-06, + "loss": 0.5227499008178711, + "mean_token_accuracy": 0.828961968421936, + "num_tokens": 14360032.0, + "step": 1606 + }, + { + "epoch": 1.2211246200607904, + "grad_norm": 1.602988600730896, + "learning_rate": 3.453321358616393e-06, + "loss": 0.3537187874317169, + "mean_token_accuracy": 0.8776708841323853, + "num_tokens": 14370005.0, + "step": 1607 + }, + { + "epoch": 1.221884498480243, + "grad_norm": 2.358971357345581, + "learning_rate": 3.4513848825697145e-06, + "loss": 0.3448919653892517, + "mean_token_accuracy": 0.8887944221496582, + "num_tokens": 14375718.0, + "step": 1608 + }, + { + "epoch": 1.222644376899696, + "grad_norm": 1.72306227684021, + "learning_rate": 3.4494477388045035e-06, + "loss": 0.36985084414482117, + "mean_token_accuracy": 0.859595537185669, + "num_tokens": 14385016.0, + "step": 1609 + }, + { + "epoch": 1.2234042553191489, + "grad_norm": 1.5494085550308228, + "learning_rate": 3.4475099286803204e-06, + "loss": 0.49003708362579346, + "mean_token_accuracy": 0.8701964616775513, + "num_tokens": 14399277.0, + "step": 1610 + }, + { + "epoch": 1.2241641337386018, + "grad_norm": 2.6874046325683594, + "learning_rate": 3.445571453557196e-06, + "loss": 0.3424490690231323, + "mean_token_accuracy": 0.8835943937301636, + "num_tokens": 14404182.0, + "step": 1611 + }, + { + "epoch": 1.2249240121580547, + "grad_norm": 2.2163190841674805, + "learning_rate": 3.443632314795627e-06, + "loss": 0.40944457054138184, + "mean_token_accuracy": 0.8649888038635254, + "num_tokens": 14410158.0, + "step": 1612 + }, + { + "epoch": 1.2256838905775076, + "grad_norm": 2.7961158752441406, + "learning_rate": 3.4416925137565756e-06, + "loss": 0.17890746891498566, + "mean_token_accuracy": 0.9439430832862854, + "num_tokens": 14413285.0, + "step": 1613 + }, + { + "epoch": 1.2264437689969605, + "grad_norm": 1.421451210975647, + "learning_rate": 3.439752051801467e-06, + "loss": 0.33948683738708496, + "mean_token_accuracy": 0.8754585981369019, + "num_tokens": 14424674.0, + "step": 1614 + }, + { + "epoch": 1.2272036474164134, + "grad_norm": 2.105196237564087, + "learning_rate": 3.4378109302921946e-06, + "loss": 0.40009379386901855, + "mean_token_accuracy": 0.8600341081619263, + "num_tokens": 14432400.0, + "step": 1615 + }, + { + "epoch": 1.2279635258358663, + "grad_norm": 2.004122734069824, + "learning_rate": 3.4358691505911105e-06, + "loss": 0.46013444662094116, + "mean_token_accuracy": 0.8400925993919373, + "num_tokens": 14440741.0, + "step": 1616 + }, + { + "epoch": 1.2287234042553192, + "grad_norm": 1.8407535552978516, + "learning_rate": 3.4339267140610317e-06, + "loss": 0.38828906416893005, + "mean_token_accuracy": 0.8582802414894104, + "num_tokens": 14448698.0, + "step": 1617 + }, + { + "epoch": 1.2294832826747721, + "grad_norm": 2.4285924434661865, + "learning_rate": 3.4319836220652334e-06, + "loss": 0.3109283447265625, + "mean_token_accuracy": 0.8888344764709473, + "num_tokens": 14453674.0, + "step": 1618 + }, + { + "epoch": 1.2302431610942248, + "grad_norm": 1.6322550773620605, + "learning_rate": 3.430039875967454e-06, + "loss": 0.5222204327583313, + "mean_token_accuracy": 0.825019121170044, + "num_tokens": 14465736.0, + "step": 1619 + }, + { + "epoch": 1.2310030395136777, + "grad_norm": 2.307573080062866, + "learning_rate": 3.428095477131888e-06, + "loss": 0.29477375745773315, + "mean_token_accuracy": 0.8899064660072327, + "num_tokens": 14471266.0, + "step": 1620 + }, + { + "epoch": 1.2317629179331306, + "grad_norm": 1.8044531345367432, + "learning_rate": 3.4261504269231904e-06, + "loss": 0.4883342981338501, + "mean_token_accuracy": 0.8310165405273438, + "num_tokens": 14481679.0, + "step": 1621 + }, + { + "epoch": 1.2325227963525835, + "grad_norm": 2.7585411071777344, + "learning_rate": 3.4242047267064714e-06, + "loss": 0.45369645953178406, + "mean_token_accuracy": 0.8432134985923767, + "num_tokens": 14487299.0, + "step": 1622 + }, + { + "epoch": 1.2332826747720365, + "grad_norm": 2.687490701675415, + "learning_rate": 3.4222583778472997e-06, + "loss": 0.5627540349960327, + "mean_token_accuracy": 0.8186438083648682, + "num_tokens": 14494254.0, + "step": 1623 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 2.622443199157715, + "learning_rate": 3.4203113817116955e-06, + "loss": 0.28697147965431213, + "mean_token_accuracy": 0.8861737847328186, + "num_tokens": 14498632.0, + "step": 1624 + }, + { + "epoch": 1.2348024316109423, + "grad_norm": 2.6943359375, + "learning_rate": 3.4183637396661372e-06, + "loss": 0.25273287296295166, + "mean_token_accuracy": 0.9104914665222168, + "num_tokens": 14502797.0, + "step": 1625 + }, + { + "epoch": 1.2355623100303952, + "grad_norm": 2.428189992904663, + "learning_rate": 3.4164154530775552e-06, + "loss": 0.4213451147079468, + "mean_token_accuracy": 0.851524293422699, + "num_tokens": 14508503.0, + "step": 1626 + }, + { + "epoch": 1.236322188449848, + "grad_norm": 2.1722824573516846, + "learning_rate": 3.4144665233133318e-06, + "loss": 0.35238856077194214, + "mean_token_accuracy": 0.8730837106704712, + "num_tokens": 14516126.0, + "step": 1627 + }, + { + "epoch": 1.237082066869301, + "grad_norm": 2.291365146636963, + "learning_rate": 3.4125169517413005e-06, + "loss": 0.43963465094566345, + "mean_token_accuracy": 0.8525444865226746, + "num_tokens": 14522507.0, + "step": 1628 + }, + { + "epoch": 1.237841945288754, + "grad_norm": 1.6181648969650269, + "learning_rate": 3.410566739729746e-06, + "loss": 0.2799680233001709, + "mean_token_accuracy": 0.8915654420852661, + "num_tokens": 14531025.0, + "step": 1629 + }, + { + "epoch": 1.2386018237082066, + "grad_norm": 1.4039218425750732, + "learning_rate": 3.408615888647402e-06, + "loss": 0.29756587743759155, + "mean_token_accuracy": 0.8951715230941772, + "num_tokens": 14543770.0, + "step": 1630 + }, + { + "epoch": 1.2393617021276595, + "grad_norm": 2.148325204849243, + "learning_rate": 3.4066643998634506e-06, + "loss": 0.3983418345451355, + "mean_token_accuracy": 0.8635951280593872, + "num_tokens": 14550896.0, + "step": 1631 + }, + { + "epoch": 1.2401215805471124, + "grad_norm": 1.5225859880447388, + "learning_rate": 3.4047122747475227e-06, + "loss": 0.3247569799423218, + "mean_token_accuracy": 0.8727027177810669, + "num_tokens": 14562181.0, + "step": 1632 + }, + { + "epoch": 1.2408814589665653, + "grad_norm": 3.99835467338562, + "learning_rate": 3.402759514669694e-06, + "loss": 0.4317352771759033, + "mean_token_accuracy": 0.8488142490386963, + "num_tokens": 14565521.0, + "step": 1633 + }, + { + "epoch": 1.2416413373860182, + "grad_norm": 1.7306902408599854, + "learning_rate": 3.4008061210004872e-06, + "loss": 0.389854371547699, + "mean_token_accuracy": 0.8553084135055542, + "num_tokens": 14574633.0, + "step": 1634 + }, + { + "epoch": 1.2424012158054711, + "grad_norm": 2.3614673614501953, + "learning_rate": 3.3988520951108683e-06, + "loss": 0.3150152564048767, + "mean_token_accuracy": 0.8865959644317627, + "num_tokens": 14580240.0, + "step": 1635 + }, + { + "epoch": 1.243161094224924, + "grad_norm": 1.5625747442245483, + "learning_rate": 3.3968974383722497e-06, + "loss": 0.43160033226013184, + "mean_token_accuracy": 0.840155839920044, + "num_tokens": 14594255.0, + "step": 1636 + }, + { + "epoch": 1.243920972644377, + "grad_norm": 1.871620535850525, + "learning_rate": 3.3949421521564825e-06, + "loss": 0.49550193548202515, + "mean_token_accuracy": 0.8315126299858093, + "num_tokens": 14605416.0, + "step": 1637 + }, + { + "epoch": 1.2446808510638299, + "grad_norm": 2.111304759979248, + "learning_rate": 3.392986237835863e-06, + "loss": 0.2794899046421051, + "mean_token_accuracy": 0.9049773216247559, + "num_tokens": 14611711.0, + "step": 1638 + }, + { + "epoch": 1.2454407294832828, + "grad_norm": 3.7479894161224365, + "learning_rate": 3.391029696783127e-06, + "loss": 0.469397634267807, + "mean_token_accuracy": 0.8352956771850586, + "num_tokens": 14615536.0, + "step": 1639 + }, + { + "epoch": 1.2462006079027357, + "grad_norm": 3.277726650238037, + "learning_rate": 3.389072530371451e-06, + "loss": 0.35431790351867676, + "mean_token_accuracy": 0.8822286128997803, + "num_tokens": 14619390.0, + "step": 1640 + }, + { + "epoch": 1.2469604863221884, + "grad_norm": 1.9583072662353516, + "learning_rate": 3.3871147399744482e-06, + "loss": 0.3708694577217102, + "mean_token_accuracy": 0.8720351457595825, + "num_tokens": 14626573.0, + "step": 1641 + }, + { + "epoch": 1.2477203647416413, + "grad_norm": 1.8734042644500732, + "learning_rate": 3.385156326966173e-06, + "loss": 0.48163774609565735, + "mean_token_accuracy": 0.8479621410369873, + "num_tokens": 14636382.0, + "step": 1642 + }, + { + "epoch": 1.2484802431610942, + "grad_norm": 2.0085532665252686, + "learning_rate": 3.383197292721114e-06, + "loss": 0.4893198311328888, + "mean_token_accuracy": 0.838238000869751, + "num_tokens": 14645083.0, + "step": 1643 + }, + { + "epoch": 1.249240121580547, + "grad_norm": 2.0874593257904053, + "learning_rate": 3.3812376386141966e-06, + "loss": 0.4610505700111389, + "mean_token_accuracy": 0.8441368341445923, + "num_tokens": 14654048.0, + "step": 1644 + }, + { + "epoch": 1.25, + "grad_norm": 1.6887420415878296, + "learning_rate": 3.379277366020782e-06, + "loss": 0.3628596067428589, + "mean_token_accuracy": 0.8838590383529663, + "num_tokens": 14662317.0, + "step": 1645 + }, + { + "epoch": 1.250759878419453, + "grad_norm": 2.389002561569214, + "learning_rate": 3.3773164763166653e-06, + "loss": 0.21903495490550995, + "mean_token_accuracy": 0.9249413013458252, + "num_tokens": 14666394.0, + "step": 1646 + }, + { + "epoch": 1.2515197568389058, + "grad_norm": 1.7091087102890015, + "learning_rate": 3.3753549708780736e-06, + "loss": 0.37802332639694214, + "mean_token_accuracy": 0.8644627332687378, + "num_tokens": 14676214.0, + "step": 1647 + }, + { + "epoch": 1.2522796352583587, + "grad_norm": 2.5717999935150146, + "learning_rate": 3.3733928510816677e-06, + "loss": 0.4236462116241455, + "mean_token_accuracy": 0.8519910573959351, + "num_tokens": 14681681.0, + "step": 1648 + }, + { + "epoch": 1.2530395136778116, + "grad_norm": 1.958856463432312, + "learning_rate": 3.3714301183045382e-06, + "loss": 0.3923419415950775, + "mean_token_accuracy": 0.8720202445983887, + "num_tokens": 14690419.0, + "step": 1649 + }, + { + "epoch": 1.2537993920972643, + "grad_norm": 1.5900038480758667, + "learning_rate": 3.369466773924207e-06, + "loss": 0.4182325601577759, + "mean_token_accuracy": 0.8515387177467346, + "num_tokens": 14699790.0, + "step": 1650 + }, + { + "epoch": 1.2545592705167175, + "grad_norm": 1.260547161102295, + "learning_rate": 3.3675028193186243e-06, + "loss": 0.3915718197822571, + "mean_token_accuracy": 0.8536830544471741, + "num_tokens": 14717502.0, + "step": 1651 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 1.8152283430099487, + "learning_rate": 3.365538255866169e-06, + "loss": 0.424524188041687, + "mean_token_accuracy": 0.8434420824050903, + "num_tokens": 14726591.0, + "step": 1652 + }, + { + "epoch": 1.256079027355623, + "grad_norm": 1.3357285261154175, + "learning_rate": 3.3635730849456484e-06, + "loss": 0.2949739396572113, + "mean_token_accuracy": 0.8868321180343628, + "num_tokens": 14739911.0, + "step": 1653 + }, + { + "epoch": 1.256838905775076, + "grad_norm": 1.1770358085632324, + "learning_rate": 3.3616073079362925e-06, + "loss": 0.29939576983451843, + "mean_token_accuracy": 0.8923654556274414, + "num_tokens": 14755521.0, + "step": 1654 + }, + { + "epoch": 1.2575987841945289, + "grad_norm": 2.059162139892578, + "learning_rate": 3.3596409262177633e-06, + "loss": 0.4562555253505707, + "mean_token_accuracy": 0.8585271239280701, + "num_tokens": 14764173.0, + "step": 1655 + }, + { + "epoch": 1.2583586626139818, + "grad_norm": 1.430752158164978, + "learning_rate": 3.357673941170139e-06, + "loss": 0.35301265120506287, + "mean_token_accuracy": 0.8920517563819885, + "num_tokens": 14775596.0, + "step": 1656 + }, + { + "epoch": 1.2591185410334347, + "grad_norm": 1.6066302061080933, + "learning_rate": 3.3557063541739283e-06, + "loss": 0.41129636764526367, + "mean_token_accuracy": 0.8512256145477295, + "num_tokens": 14786289.0, + "step": 1657 + }, + { + "epoch": 1.2598784194528876, + "grad_norm": 1.5471590757369995, + "learning_rate": 3.353738166610058e-06, + "loss": 0.3935067057609558, + "mean_token_accuracy": 0.8514131903648376, + "num_tokens": 14798672.0, + "step": 1658 + }, + { + "epoch": 1.2606382978723405, + "grad_norm": 1.3455181121826172, + "learning_rate": 3.35176937985988e-06, + "loss": 0.3486790657043457, + "mean_token_accuracy": 0.8644362688064575, + "num_tokens": 14811603.0, + "step": 1659 + }, + { + "epoch": 1.2613981762917934, + "grad_norm": 1.891432762145996, + "learning_rate": 3.349799995305162e-06, + "loss": 0.3325638175010681, + "mean_token_accuracy": 0.8844645023345947, + "num_tokens": 14819256.0, + "step": 1660 + }, + { + "epoch": 1.262158054711246, + "grad_norm": 2.600614309310913, + "learning_rate": 3.3478300143280946e-06, + "loss": 0.30310919880867004, + "mean_token_accuracy": 0.9103429317474365, + "num_tokens": 14823706.0, + "step": 1661 + }, + { + "epoch": 1.2629179331306992, + "grad_norm": 3.8636202812194824, + "learning_rate": 3.3458594383112868e-06, + "loss": 0.28377676010131836, + "mean_token_accuracy": 0.9047091007232666, + "num_tokens": 14826688.0, + "step": 1662 + }, + { + "epoch": 1.263677811550152, + "grad_norm": 2.3100268840789795, + "learning_rate": 3.343888268637765e-06, + "loss": 0.4723394513130188, + "mean_token_accuracy": 0.8306777477264404, + "num_tokens": 14835471.0, + "step": 1663 + }, + { + "epoch": 1.2644376899696048, + "grad_norm": 1.7582160234451294, + "learning_rate": 3.341916506690971e-06, + "loss": 0.48168784379959106, + "mean_token_accuracy": 0.8281306028366089, + "num_tokens": 14846513.0, + "step": 1664 + }, + { + "epoch": 1.2651975683890577, + "grad_norm": 2.166055917739868, + "learning_rate": 3.3399441538547638e-06, + "loss": 0.4626024067401886, + "mean_token_accuracy": 0.8377980589866638, + "num_tokens": 14853408.0, + "step": 1665 + }, + { + "epoch": 1.2659574468085106, + "grad_norm": 2.23038911819458, + "learning_rate": 3.337971211513417e-06, + "loss": 0.38434159755706787, + "mean_token_accuracy": 0.8708412647247314, + "num_tokens": 14859919.0, + "step": 1666 + }, + { + "epoch": 1.2667173252279635, + "grad_norm": 2.092505693435669, + "learning_rate": 3.3359976810516164e-06, + "loss": 0.35072219371795654, + "mean_token_accuracy": 0.8761640191078186, + "num_tokens": 14865624.0, + "step": 1667 + }, + { + "epoch": 1.2674772036474165, + "grad_norm": 1.8255130052566528, + "learning_rate": 3.3340235638544633e-06, + "loss": 0.4404270648956299, + "mean_token_accuracy": 0.836356520652771, + "num_tokens": 14874181.0, + "step": 1668 + }, + { + "epoch": 1.2682370820668694, + "grad_norm": 1.9889036417007446, + "learning_rate": 3.332048861307467e-06, + "loss": 0.4199368357658386, + "mean_token_accuracy": 0.8508217334747314, + "num_tokens": 14882275.0, + "step": 1669 + }, + { + "epoch": 1.2689969604863223, + "grad_norm": 4.050281047821045, + "learning_rate": 3.330073574796551e-06, + "loss": 0.4271625280380249, + "mean_token_accuracy": 0.8471108675003052, + "num_tokens": 14893633.0, + "step": 1670 + }, + { + "epoch": 1.2697568389057752, + "grad_norm": 1.998838186264038, + "learning_rate": 3.328097705708047e-06, + "loss": 0.34743767976760864, + "mean_token_accuracy": 0.8771528005599976, + "num_tokens": 14899859.0, + "step": 1671 + }, + { + "epoch": 1.2705167173252279, + "grad_norm": 1.7989062070846558, + "learning_rate": 3.3261212554286977e-06, + "loss": 0.5267184376716614, + "mean_token_accuracy": 0.8323302268981934, + "num_tokens": 14911131.0, + "step": 1672 + }, + { + "epoch": 1.2712765957446808, + "grad_norm": 1.312070369720459, + "learning_rate": 3.324144225345649e-06, + "loss": 0.4675425887107849, + "mean_token_accuracy": 0.8157106637954712, + "num_tokens": 14928955.0, + "step": 1673 + }, + { + "epoch": 1.2720364741641337, + "grad_norm": 2.0547919273376465, + "learning_rate": 3.3221666168464584e-06, + "loss": 0.33704331517219543, + "mean_token_accuracy": 0.8621441125869751, + "num_tokens": 14935536.0, + "step": 1674 + }, + { + "epoch": 1.2727963525835866, + "grad_norm": 2.810413122177124, + "learning_rate": 3.320188431319088e-06, + "loss": 0.4007563292980194, + "mean_token_accuracy": 0.8649672269821167, + "num_tokens": 14940219.0, + "step": 1675 + }, + { + "epoch": 1.2735562310030395, + "grad_norm": 1.3516674041748047, + "learning_rate": 3.318209670151904e-06, + "loss": 0.3457040786743164, + "mean_token_accuracy": 0.8698287010192871, + "num_tokens": 14952904.0, + "step": 1676 + }, + { + "epoch": 1.2743161094224924, + "grad_norm": 2.440643310546875, + "learning_rate": 3.3162303347336765e-06, + "loss": 0.5195086002349854, + "mean_token_accuracy": 0.8348199129104614, + "num_tokens": 14958623.0, + "step": 1677 + }, + { + "epoch": 1.2750759878419453, + "grad_norm": 1.3264343738555908, + "learning_rate": 3.3142504264535808e-06, + "loss": 0.2990425229072571, + "mean_token_accuracy": 0.8961933851242065, + "num_tokens": 14971494.0, + "step": 1678 + }, + { + "epoch": 1.2758358662613982, + "grad_norm": 1.3106894493103027, + "learning_rate": 3.3122699467011913e-06, + "loss": 0.291853666305542, + "mean_token_accuracy": 0.893449068069458, + "num_tokens": 14985239.0, + "step": 1679 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 2.5387396812438965, + "learning_rate": 3.3102888968664857e-06, + "loss": 0.4336916208267212, + "mean_token_accuracy": 0.8447890877723694, + "num_tokens": 14991453.0, + "step": 1680 + }, + { + "epoch": 1.2773556231003038, + "grad_norm": 2.7052135467529297, + "learning_rate": 3.308307278339842e-06, + "loss": 0.3279378116130829, + "mean_token_accuracy": 0.8935879468917847, + "num_tokens": 14995428.0, + "step": 1681 + }, + { + "epoch": 1.278115501519757, + "grad_norm": 1.6251261234283447, + "learning_rate": 3.306325092512034e-06, + "loss": 0.32066458463668823, + "mean_token_accuracy": 0.8909799456596375, + "num_tokens": 15004841.0, + "step": 1682 + }, + { + "epoch": 1.2788753799392096, + "grad_norm": 2.3014605045318604, + "learning_rate": 3.3043423407742374e-06, + "loss": 0.3523373603820801, + "mean_token_accuracy": 0.8810735940933228, + "num_tokens": 15010742.0, + "step": 1683 + }, + { + "epoch": 1.2796352583586625, + "grad_norm": 2.9563019275665283, + "learning_rate": 3.3023590245180237e-06, + "loss": 0.39715707302093506, + "mean_token_accuracy": 0.8779881000518799, + "num_tokens": 15015357.0, + "step": 1684 + }, + { + "epoch": 1.2803951367781155, + "grad_norm": 1.5787957906723022, + "learning_rate": 3.300375145135361e-06, + "loss": 0.44630166888237, + "mean_token_accuracy": 0.8400174975395203, + "num_tokens": 15031360.0, + "step": 1685 + }, + { + "epoch": 1.2811550151975684, + "grad_norm": 1.6753438711166382, + "learning_rate": 3.2983907040186112e-06, + "loss": 0.3235800862312317, + "mean_token_accuracy": 0.8938044309616089, + "num_tokens": 15040276.0, + "step": 1686 + }, + { + "epoch": 1.2819148936170213, + "grad_norm": 1.7331148386001587, + "learning_rate": 3.296405702560532e-06, + "loss": 0.39061424136161804, + "mean_token_accuracy": 0.8599754571914673, + "num_tokens": 15049725.0, + "step": 1687 + }, + { + "epoch": 1.2826747720364742, + "grad_norm": 2.2029430866241455, + "learning_rate": 3.294420142154274e-06, + "loss": 0.43598297238349915, + "mean_token_accuracy": 0.8663698434829712, + "num_tokens": 15058182.0, + "step": 1688 + }, + { + "epoch": 1.283434650455927, + "grad_norm": 2.943964958190918, + "learning_rate": 3.29243402419338e-06, + "loss": 0.405210942029953, + "mean_token_accuracy": 0.854996919631958, + "num_tokens": 15062920.0, + "step": 1689 + }, + { + "epoch": 1.28419452887538, + "grad_norm": 1.9343379735946655, + "learning_rate": 3.2904473500717826e-06, + "loss": 0.35011449456214905, + "mean_token_accuracy": 0.8745867013931274, + "num_tokens": 15070298.0, + "step": 1690 + }, + { + "epoch": 1.284954407294833, + "grad_norm": 2.559859037399292, + "learning_rate": 3.2884601211838087e-06, + "loss": 0.38816407322883606, + "mean_token_accuracy": 0.854763388633728, + "num_tokens": 15075667.0, + "step": 1691 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4357839822769165, + "learning_rate": 3.2864723389241697e-06, + "loss": 0.4512745141983032, + "mean_token_accuracy": 0.8398592472076416, + "num_tokens": 15090291.0, + "step": 1692 + }, + { + "epoch": 1.2864741641337387, + "grad_norm": 1.7643728256225586, + "learning_rate": 3.284484004687969e-06, + "loss": 0.3536742627620697, + "mean_token_accuracy": 0.8726381063461304, + "num_tokens": 15099325.0, + "step": 1693 + }, + { + "epoch": 1.2872340425531914, + "grad_norm": 1.853173017501831, + "learning_rate": 3.2824951198706958e-06, + "loss": 0.36579740047454834, + "mean_token_accuracy": 0.8988048434257507, + "num_tokens": 15107090.0, + "step": 1694 + }, + { + "epoch": 1.2879939209726443, + "grad_norm": 1.6526862382888794, + "learning_rate": 3.280505685868226e-06, + "loss": 0.3853636682033539, + "mean_token_accuracy": 0.8743607997894287, + "num_tokens": 15117818.0, + "step": 1695 + }, + { + "epoch": 1.2887537993920972, + "grad_norm": 2.790398597717285, + "learning_rate": 3.278515704076821e-06, + "loss": 0.2707311511039734, + "mean_token_accuracy": 0.9034668803215027, + "num_tokens": 15121641.0, + "step": 1696 + }, + { + "epoch": 1.2895136778115501, + "grad_norm": 1.69557523727417, + "learning_rate": 3.276525175893126e-06, + "loss": 0.3707970082759857, + "mean_token_accuracy": 0.8617855906486511, + "num_tokens": 15130414.0, + "step": 1697 + }, + { + "epoch": 1.290273556231003, + "grad_norm": 1.1360478401184082, + "learning_rate": 3.274534102714172e-06, + "loss": 0.3368082344532013, + "mean_token_accuracy": 0.8781654834747314, + "num_tokens": 15148307.0, + "step": 1698 + }, + { + "epoch": 1.291033434650456, + "grad_norm": 1.5894653797149658, + "learning_rate": 3.272542485937369e-06, + "loss": 0.3870658278465271, + "mean_token_accuracy": 0.8830926418304443, + "num_tokens": 15161841.0, + "step": 1699 + }, + { + "epoch": 1.2917933130699089, + "grad_norm": 2.3735709190368652, + "learning_rate": 3.270550326960511e-06, + "loss": 0.3873991370201111, + "mean_token_accuracy": 0.8729057908058167, + "num_tokens": 15167733.0, + "step": 1700 + }, + { + "epoch": 1.2925531914893618, + "grad_norm": 1.3739598989486694, + "learning_rate": 3.268557627181772e-06, + "loss": 0.30831626057624817, + "mean_token_accuracy": 0.8695719242095947, + "num_tokens": 15180861.0, + "step": 1701 + }, + { + "epoch": 1.2933130699088147, + "grad_norm": 1.7526969909667969, + "learning_rate": 3.2665643879997054e-06, + "loss": 0.4716024398803711, + "mean_token_accuracy": 0.8303275108337402, + "num_tokens": 15191642.0, + "step": 1702 + }, + { + "epoch": 1.2940729483282674, + "grad_norm": 2.7866084575653076, + "learning_rate": 3.2645706108132426e-06, + "loss": 0.33337634801864624, + "mean_token_accuracy": 0.8790726065635681, + "num_tokens": 15196038.0, + "step": 1703 + }, + { + "epoch": 1.2948328267477205, + "grad_norm": 2.319765090942383, + "learning_rate": 3.2625762970216944e-06, + "loss": 0.3999716639518738, + "mean_token_accuracy": 0.8693568706512451, + "num_tokens": 15202075.0, + "step": 1704 + }, + { + "epoch": 1.2955927051671732, + "grad_norm": 3.18292498588562, + "learning_rate": 3.2605814480247454e-06, + "loss": 0.4579541087150574, + "mean_token_accuracy": 0.8516187071800232, + "num_tokens": 15206886.0, + "step": 1705 + }, + { + "epoch": 1.296352583586626, + "grad_norm": 2.1816933155059814, + "learning_rate": 3.258586065222459e-06, + "loss": 0.5198885202407837, + "mean_token_accuracy": 0.8170592784881592, + "num_tokens": 15214088.0, + "step": 1706 + }, + { + "epoch": 1.297112462006079, + "grad_norm": 1.9076340198516846, + "learning_rate": 3.2565901500152702e-06, + "loss": 0.49752360582351685, + "mean_token_accuracy": 0.8681992292404175, + "num_tokens": 15226046.0, + "step": 1707 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 2.0223331451416016, + "learning_rate": 3.2545937038039904e-06, + "loss": 0.4515793025493622, + "mean_token_accuracy": 0.8429619073867798, + "num_tokens": 15234993.0, + "step": 1708 + }, + { + "epoch": 1.2986322188449848, + "grad_norm": 2.5089669227600098, + "learning_rate": 3.2525967279898017e-06, + "loss": 0.43628376722335815, + "mean_token_accuracy": 0.8493682146072388, + "num_tokens": 15240575.0, + "step": 1709 + }, + { + "epoch": 1.2993920972644377, + "grad_norm": 2.8347091674804688, + "learning_rate": 3.2505992239742582e-06, + "loss": 0.25112441182136536, + "mean_token_accuracy": 0.908825159072876, + "num_tokens": 15244085.0, + "step": 1710 + }, + { + "epoch": 1.3001519756838906, + "grad_norm": 2.3157572746276855, + "learning_rate": 3.2486011931592863e-06, + "loss": 0.482818067073822, + "mean_token_accuracy": 0.8305923938751221, + "num_tokens": 15250377.0, + "step": 1711 + }, + { + "epoch": 1.3009118541033435, + "grad_norm": 3.169052839279175, + "learning_rate": 3.2466026369471804e-06, + "loss": 0.3493242561817169, + "mean_token_accuracy": 0.86913001537323, + "num_tokens": 15255041.0, + "step": 1712 + }, + { + "epoch": 1.3016717325227964, + "grad_norm": 1.4475083351135254, + "learning_rate": 3.2446035567406033e-06, + "loss": 0.4177290201187134, + "mean_token_accuracy": 0.8497589826583862, + "num_tokens": 15266946.0, + "step": 1713 + }, + { + "epoch": 1.3024316109422491, + "grad_norm": 1.6473008394241333, + "learning_rate": 3.2426039539425875e-06, + "loss": 0.5272886753082275, + "mean_token_accuracy": 0.8440133333206177, + "num_tokens": 15279263.0, + "step": 1714 + }, + { + "epoch": 1.3031914893617023, + "grad_norm": 2.3996543884277344, + "learning_rate": 3.240603829956531e-06, + "loss": 0.4272066652774811, + "mean_token_accuracy": 0.8495640754699707, + "num_tokens": 15285213.0, + "step": 1715 + }, + { + "epoch": 1.303951367781155, + "grad_norm": 1.63034987449646, + "learning_rate": 3.238603186186198e-06, + "loss": 0.4034635126590729, + "mean_token_accuracy": 0.8638584613800049, + "num_tokens": 15295974.0, + "step": 1716 + }, + { + "epoch": 1.3047112462006079, + "grad_norm": 2.153608798980713, + "learning_rate": 3.2366020240357166e-06, + "loss": 0.30712565779685974, + "mean_token_accuracy": 0.8863866329193115, + "num_tokens": 15302220.0, + "step": 1717 + }, + { + "epoch": 1.3054711246200608, + "grad_norm": 2.9814558029174805, + "learning_rate": 3.2346003449095803e-06, + "loss": 0.3922840356826782, + "mean_token_accuracy": 0.868030309677124, + "num_tokens": 15306747.0, + "step": 1718 + }, + { + "epoch": 1.3062310030395137, + "grad_norm": 3.3417985439300537, + "learning_rate": 3.2325981502126434e-06, + "loss": 0.30750396847724915, + "mean_token_accuracy": 0.9065356850624084, + "num_tokens": 15310309.0, + "step": 1719 + }, + { + "epoch": 1.3069908814589666, + "grad_norm": 2.237682819366455, + "learning_rate": 3.2305954413501252e-06, + "loss": 0.35068294405937195, + "mean_token_accuracy": 0.8887614011764526, + "num_tokens": 15316463.0, + "step": 1720 + }, + { + "epoch": 1.3077507598784195, + "grad_norm": 1.9526605606079102, + "learning_rate": 3.228592219727602e-06, + "loss": 0.42061835527420044, + "mean_token_accuracy": 0.8456839323043823, + "num_tokens": 15323984.0, + "step": 1721 + }, + { + "epoch": 1.3085106382978724, + "grad_norm": 1.6454212665557861, + "learning_rate": 3.226588486751012e-06, + "loss": 0.5189976692199707, + "mean_token_accuracy": 0.8187375068664551, + "num_tokens": 15338807.0, + "step": 1722 + }, + { + "epoch": 1.3092705167173253, + "grad_norm": 1.4521609544754028, + "learning_rate": 3.2245842438266526e-06, + "loss": 0.329673171043396, + "mean_token_accuracy": 0.853867769241333, + "num_tokens": 15350400.0, + "step": 1723 + }, + { + "epoch": 1.3100303951367782, + "grad_norm": 1.8750989437103271, + "learning_rate": 3.222579492361179e-06, + "loss": 0.4635341167449951, + "mean_token_accuracy": 0.8393422365188599, + "num_tokens": 15360557.0, + "step": 1724 + }, + { + "epoch": 1.310790273556231, + "grad_norm": 1.2728849649429321, + "learning_rate": 3.220574233761603e-06, + "loss": 0.3255572021007538, + "mean_token_accuracy": 0.8989741802215576, + "num_tokens": 15376548.0, + "step": 1725 + }, + { + "epoch": 1.3115501519756838, + "grad_norm": 3.5155694484710693, + "learning_rate": 3.2185684694352913e-06, + "loss": 0.34204089641571045, + "mean_token_accuracy": 0.8781906366348267, + "num_tokens": 15380304.0, + "step": 1726 + }, + { + "epoch": 1.3123100303951367, + "grad_norm": 2.059800148010254, + "learning_rate": 3.216562200789968e-06, + "loss": 0.36288338899612427, + "mean_token_accuracy": 0.8595278263092041, + "num_tokens": 15387653.0, + "step": 1727 + }, + { + "epoch": 1.3130699088145896, + "grad_norm": 3.5388240814208984, + "learning_rate": 3.214555429233707e-06, + "loss": 0.5434849858283997, + "mean_token_accuracy": 0.8074631690979004, + "num_tokens": 15391662.0, + "step": 1728 + }, + { + "epoch": 1.3138297872340425, + "grad_norm": 2.8595592975616455, + "learning_rate": 3.2125481561749406e-06, + "loss": 0.5113687515258789, + "mean_token_accuracy": 0.8448649644851685, + "num_tokens": 15397536.0, + "step": 1729 + }, + { + "epoch": 1.3145896656534954, + "grad_norm": 2.50386905670166, + "learning_rate": 3.210540383022449e-06, + "loss": 0.5293697118759155, + "mean_token_accuracy": 0.8096445798873901, + "num_tokens": 15403478.0, + "step": 1730 + }, + { + "epoch": 1.3153495440729484, + "grad_norm": 1.880035400390625, + "learning_rate": 3.208532111185365e-06, + "loss": 0.5344835519790649, + "mean_token_accuracy": 0.8172965049743652, + "num_tokens": 15413812.0, + "step": 1731 + }, + { + "epoch": 1.3161094224924013, + "grad_norm": 1.3688768148422241, + "learning_rate": 3.2065233420731717e-06, + "loss": 0.2577427327632904, + "mean_token_accuracy": 0.9142681360244751, + "num_tokens": 15423583.0, + "step": 1732 + }, + { + "epoch": 1.3168693009118542, + "grad_norm": 1.7945705652236938, + "learning_rate": 3.2045140770956987e-06, + "loss": 0.3983926773071289, + "mean_token_accuracy": 0.8652000427246094, + "num_tokens": 15432473.0, + "step": 1733 + }, + { + "epoch": 1.3176291793313069, + "grad_norm": 1.8243350982666016, + "learning_rate": 3.2025043176631283e-06, + "loss": 0.48644185066223145, + "mean_token_accuracy": 0.8319193124771118, + "num_tokens": 15445463.0, + "step": 1734 + }, + { + "epoch": 1.31838905775076, + "grad_norm": 2.000094175338745, + "learning_rate": 3.2004940651859844e-06, + "loss": 0.43567317724227905, + "mean_token_accuracy": 0.8857482671737671, + "num_tokens": 15452382.0, + "step": 1735 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 2.379974365234375, + "learning_rate": 3.198483321075141e-06, + "loss": 0.5153506398200989, + "mean_token_accuracy": 0.8295865654945374, + "num_tokens": 15458740.0, + "step": 1736 + }, + { + "epoch": 1.3199088145896656, + "grad_norm": 1.6564184427261353, + "learning_rate": 3.196472086741815e-06, + "loss": 0.508430540561676, + "mean_token_accuracy": 0.8181540369987488, + "num_tokens": 15471844.0, + "step": 1737 + }, + { + "epoch": 1.3206686930091185, + "grad_norm": 2.006925344467163, + "learning_rate": 3.194460363597569e-06, + "loss": 0.34542378783226013, + "mean_token_accuracy": 0.8827437162399292, + "num_tokens": 15478414.0, + "step": 1738 + }, + { + "epoch": 1.3214285714285714, + "grad_norm": 3.589045763015747, + "learning_rate": 3.192448153054306e-06, + "loss": 0.4385780096054077, + "mean_token_accuracy": 0.8480287790298462, + "num_tokens": 15482063.0, + "step": 1739 + }, + { + "epoch": 1.3221884498480243, + "grad_norm": 1.9797427654266357, + "learning_rate": 3.190435456524275e-06, + "loss": 0.4330386519432068, + "mean_token_accuracy": 0.8458058834075928, + "num_tokens": 15489803.0, + "step": 1740 + }, + { + "epoch": 1.3229483282674772, + "grad_norm": 1.4777411222457886, + "learning_rate": 3.188422275420063e-06, + "loss": 0.3997895419597626, + "mean_token_accuracy": 0.8639512062072754, + "num_tokens": 15501103.0, + "step": 1741 + }, + { + "epoch": 1.3237082066869301, + "grad_norm": 2.882338523864746, + "learning_rate": 3.186408611154597e-06, + "loss": 0.2336438149213791, + "mean_token_accuracy": 0.9176726937294006, + "num_tokens": 15504854.0, + "step": 1742 + }, + { + "epoch": 1.324468085106383, + "grad_norm": 2.353503704071045, + "learning_rate": 3.184394465141146e-06, + "loss": 0.4107069671154022, + "mean_token_accuracy": 0.8677014112472534, + "num_tokens": 15510662.0, + "step": 1743 + }, + { + "epoch": 1.325227963525836, + "grad_norm": 2.6551976203918457, + "learning_rate": 3.1823798387933134e-06, + "loss": 0.3862302899360657, + "mean_token_accuracy": 0.8819445371627808, + "num_tokens": 15515681.0, + "step": 1744 + }, + { + "epoch": 1.3259878419452886, + "grad_norm": 1.478572964668274, + "learning_rate": 3.180364733525043e-06, + "loss": 0.43972986936569214, + "mean_token_accuracy": 0.832388162612915, + "num_tokens": 15529542.0, + "step": 1745 + }, + { + "epoch": 1.3267477203647418, + "grad_norm": 1.6003550291061401, + "learning_rate": 3.178349150750612e-06, + "loss": 0.3404902219772339, + "mean_token_accuracy": 0.8764007091522217, + "num_tokens": 15538865.0, + "step": 1746 + }, + { + "epoch": 1.3275075987841944, + "grad_norm": 2.130689859390259, + "learning_rate": 3.1763330918846347e-06, + "loss": 0.383136510848999, + "mean_token_accuracy": 0.8652247190475464, + "num_tokens": 15545567.0, + "step": 1747 + }, + { + "epoch": 1.3282674772036474, + "grad_norm": 2.395937442779541, + "learning_rate": 3.1743165583420586e-06, + "loss": 0.3870319128036499, + "mean_token_accuracy": 0.8618065118789673, + "num_tokens": 15551090.0, + "step": 1748 + }, + { + "epoch": 1.3290273556231003, + "grad_norm": 2.0841057300567627, + "learning_rate": 3.1722995515381644e-06, + "loss": 0.4838739335536957, + "mean_token_accuracy": 0.8548711538314819, + "num_tokens": 15558913.0, + "step": 1749 + }, + { + "epoch": 1.3297872340425532, + "grad_norm": 1.4237847328186035, + "learning_rate": 3.1702820728885657e-06, + "loss": 0.40350261330604553, + "mean_token_accuracy": 0.858984649181366, + "num_tokens": 15572045.0, + "step": 1750 + }, + { + "epoch": 1.330547112462006, + "grad_norm": 2.2641282081604004, + "learning_rate": 3.1682641238092064e-06, + "loss": 0.5117636919021606, + "mean_token_accuracy": 0.8078924417495728, + "num_tokens": 15579753.0, + "step": 1751 + }, + { + "epoch": 1.331306990881459, + "grad_norm": 1.0010309219360352, + "learning_rate": 3.1662457057163603e-06, + "loss": 0.3220978379249573, + "mean_token_accuracy": 0.8786559104919434, + "num_tokens": 15602823.0, + "step": 1752 + }, + { + "epoch": 1.332066869300912, + "grad_norm": 2.441230535507202, + "learning_rate": 3.164226820026632e-06, + "loss": 0.37529727816581726, + "mean_token_accuracy": 0.8886898756027222, + "num_tokens": 15608473.0, + "step": 1753 + }, + { + "epoch": 1.3328267477203648, + "grad_norm": 1.2960991859436035, + "learning_rate": 3.162207468156952e-06, + "loss": 0.3393767476081848, + "mean_token_accuracy": 0.8766993284225464, + "num_tokens": 15620893.0, + "step": 1754 + }, + { + "epoch": 1.3335866261398177, + "grad_norm": 2.0806996822357178, + "learning_rate": 3.16018765152458e-06, + "loss": 0.38034507632255554, + "mean_token_accuracy": 0.8854838609695435, + "num_tokens": 15627068.0, + "step": 1755 + }, + { + "epoch": 1.3343465045592704, + "grad_norm": 1.4316699504852295, + "learning_rate": 3.1581673715471007e-06, + "loss": 0.3665890693664551, + "mean_token_accuracy": 0.870919406414032, + "num_tokens": 15641070.0, + "step": 1756 + }, + { + "epoch": 1.3351063829787235, + "grad_norm": 1.3466622829437256, + "learning_rate": 3.1561466296424247e-06, + "loss": 0.37387198209762573, + "mean_token_accuracy": 0.8633951544761658, + "num_tokens": 15653777.0, + "step": 1757 + }, + { + "epoch": 1.3358662613981762, + "grad_norm": 1.8108628988265991, + "learning_rate": 3.154125427228786e-06, + "loss": 0.38428938388824463, + "mean_token_accuracy": 0.85402512550354, + "num_tokens": 15662494.0, + "step": 1758 + }, + { + "epoch": 1.3366261398176291, + "grad_norm": 1.3221700191497803, + "learning_rate": 3.152103765724743e-06, + "loss": 0.42825520038604736, + "mean_token_accuracy": 0.8435465097427368, + "num_tokens": 15677552.0, + "step": 1759 + }, + { + "epoch": 1.337386018237082, + "grad_norm": 2.6247692108154297, + "learning_rate": 3.150081646549174e-06, + "loss": 0.36186715960502625, + "mean_token_accuracy": 0.8767328262329102, + "num_tokens": 15682103.0, + "step": 1760 + }, + { + "epoch": 1.338145896656535, + "grad_norm": 2.1469814777374268, + "learning_rate": 3.1480590711212823e-06, + "loss": 0.3734385669231415, + "mean_token_accuracy": 0.8711104393005371, + "num_tokens": 15689182.0, + "step": 1761 + }, + { + "epoch": 1.3389057750759878, + "grad_norm": 2.1702585220336914, + "learning_rate": 3.1460360408605866e-06, + "loss": 0.2795315086841583, + "mean_token_accuracy": 0.8892190456390381, + "num_tokens": 15694272.0, + "step": 1762 + }, + { + "epoch": 1.3396656534954408, + "grad_norm": 1.918797254562378, + "learning_rate": 3.144012557186931e-06, + "loss": 0.4363473057746887, + "mean_token_accuracy": 0.8573931455612183, + "num_tokens": 15703532.0, + "step": 1763 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 2.5579960346221924, + "learning_rate": 3.14198862152047e-06, + "loss": 0.406247079372406, + "mean_token_accuracy": 0.8617593050003052, + "num_tokens": 15708652.0, + "step": 1764 + }, + { + "epoch": 1.3411854103343466, + "grad_norm": 2.3617870807647705, + "learning_rate": 3.1399642352816825e-06, + "loss": 0.2839522659778595, + "mean_token_accuracy": 0.8996064066886902, + "num_tokens": 15713598.0, + "step": 1765 + }, + { + "epoch": 1.3419452887537995, + "grad_norm": 1.248302936553955, + "learning_rate": 3.1379393998913594e-06, + "loss": 0.2922290861606598, + "mean_token_accuracy": 0.8948773145675659, + "num_tokens": 15726693.0, + "step": 1766 + }, + { + "epoch": 1.3427051671732522, + "grad_norm": 2.143599510192871, + "learning_rate": 3.135914116770609e-06, + "loss": 0.32176223397254944, + "mean_token_accuracy": 0.8808754682540894, + "num_tokens": 15731901.0, + "step": 1767 + }, + { + "epoch": 1.3434650455927053, + "grad_norm": 4.226369857788086, + "learning_rate": 3.1338883873408517e-06, + "loss": 0.4682556390762329, + "mean_token_accuracy": 0.8566025495529175, + "num_tokens": 15735029.0, + "step": 1768 + }, + { + "epoch": 1.344224924012158, + "grad_norm": 1.8695988655090332, + "learning_rate": 3.1318622130238237e-06, + "loss": 0.4297192394733429, + "mean_token_accuracy": 0.8419148921966553, + "num_tokens": 15744310.0, + "step": 1769 + }, + { + "epoch": 1.344984802431611, + "grad_norm": 2.4321305751800537, + "learning_rate": 3.1298355952415714e-06, + "loss": 0.36076444387435913, + "mean_token_accuracy": 0.8826035261154175, + "num_tokens": 15749337.0, + "step": 1770 + }, + { + "epoch": 1.3457446808510638, + "grad_norm": 1.5500011444091797, + "learning_rate": 3.127808535416454e-06, + "loss": 0.48664039373397827, + "mean_token_accuracy": 0.844344437122345, + "num_tokens": 15761096.0, + "step": 1771 + }, + { + "epoch": 1.3465045592705167, + "grad_norm": 2.1498289108276367, + "learning_rate": 3.1257810349711388e-06, + "loss": 0.4841752052307129, + "mean_token_accuracy": 0.8324567079544067, + "num_tokens": 15768646.0, + "step": 1772 + }, + { + "epoch": 1.3472644376899696, + "grad_norm": 1.2995187044143677, + "learning_rate": 3.1237530953286046e-06, + "loss": 0.492019385099411, + "mean_token_accuracy": 0.8285316228866577, + "num_tokens": 15788401.0, + "step": 1773 + }, + { + "epoch": 1.3480243161094225, + "grad_norm": 2.324819803237915, + "learning_rate": 3.121724717912138e-06, + "loss": 0.33166298270225525, + "mean_token_accuracy": 0.8856451511383057, + "num_tokens": 15794097.0, + "step": 1774 + }, + { + "epoch": 1.3487841945288754, + "grad_norm": 1.9611430168151855, + "learning_rate": 3.11969590414533e-06, + "loss": 0.3974284827709198, + "mean_token_accuracy": 0.8751305937767029, + "num_tokens": 15801065.0, + "step": 1775 + }, + { + "epoch": 1.3495440729483283, + "grad_norm": 1.7084417343139648, + "learning_rate": 3.1176666554520827e-06, + "loss": 0.38729435205459595, + "mean_token_accuracy": 0.8680770397186279, + "num_tokens": 15810353.0, + "step": 1776 + }, + { + "epoch": 1.3503039513677813, + "grad_norm": 1.7616240978240967, + "learning_rate": 3.1156369732566006e-06, + "loss": 0.4271578788757324, + "mean_token_accuracy": 0.843730092048645, + "num_tokens": 15821889.0, + "step": 1777 + }, + { + "epoch": 1.351063829787234, + "grad_norm": 2.030747413635254, + "learning_rate": 3.113606858983391e-06, + "loss": 0.361891508102417, + "mean_token_accuracy": 0.8522407412528992, + "num_tokens": 15830800.0, + "step": 1778 + }, + { + "epoch": 1.3518237082066868, + "grad_norm": 1.4842649698257446, + "learning_rate": 3.1115763140572686e-06, + "loss": 0.466334730386734, + "mean_token_accuracy": 0.8433995246887207, + "num_tokens": 15849422.0, + "step": 1779 + }, + { + "epoch": 1.3525835866261398, + "grad_norm": 1.6595379114151, + "learning_rate": 3.109545339903347e-06, + "loss": 0.4622533321380615, + "mean_token_accuracy": 0.8526314496994019, + "num_tokens": 15860431.0, + "step": 1780 + }, + { + "epoch": 1.3533434650455927, + "grad_norm": 2.1235809326171875, + "learning_rate": 3.107513937947041e-06, + "loss": 0.42694270610809326, + "mean_token_accuracy": 0.854864239692688, + "num_tokens": 15869044.0, + "step": 1781 + }, + { + "epoch": 1.3541033434650456, + "grad_norm": 1.5889263153076172, + "learning_rate": 3.1054821096140675e-06, + "loss": 0.41838499903678894, + "mean_token_accuracy": 0.8671513795852661, + "num_tokens": 15878598.0, + "step": 1782 + }, + { + "epoch": 1.3548632218844985, + "grad_norm": 2.2261741161346436, + "learning_rate": 3.1034498563304435e-06, + "loss": 0.4045066237449646, + "mean_token_accuracy": 0.843826949596405, + "num_tokens": 15885167.0, + "step": 1783 + }, + { + "epoch": 1.3556231003039514, + "grad_norm": 2.2569329738616943, + "learning_rate": 3.1014171795224794e-06, + "loss": 0.36677104234695435, + "mean_token_accuracy": 0.8747833967208862, + "num_tokens": 15891308.0, + "step": 1784 + }, + { + "epoch": 1.3563829787234043, + "grad_norm": 2.1027088165283203, + "learning_rate": 3.0993840806167884e-06, + "loss": 0.437946081161499, + "mean_token_accuracy": 0.8370785117149353, + "num_tokens": 15898952.0, + "step": 1785 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 1.8768929243087769, + "learning_rate": 3.0973505610402767e-06, + "loss": 0.4201734662055969, + "mean_token_accuracy": 0.8474810123443604, + "num_tokens": 15907340.0, + "step": 1786 + }, + { + "epoch": 1.35790273556231, + "grad_norm": 1.7216229438781738, + "learning_rate": 3.0953166222201474e-06, + "loss": 0.4225231409072876, + "mean_token_accuracy": 0.8437749147415161, + "num_tokens": 15917852.0, + "step": 1787 + }, + { + "epoch": 1.358662613981763, + "grad_norm": 2.6256966590881348, + "learning_rate": 3.093282265583895e-06, + "loss": 0.435439795255661, + "mean_token_accuracy": 0.8452040553092957, + "num_tokens": 15923739.0, + "step": 1788 + }, + { + "epoch": 1.3594224924012157, + "grad_norm": 2.90028977394104, + "learning_rate": 3.0912474925593124e-06, + "loss": 0.3730456829071045, + "mean_token_accuracy": 0.8766646385192871, + "num_tokens": 15927943.0, + "step": 1789 + }, + { + "epoch": 1.3601823708206686, + "grad_norm": 1.5966626405715942, + "learning_rate": 3.0892123045744787e-06, + "loss": 0.42150455713272095, + "mean_token_accuracy": 0.854656457901001, + "num_tokens": 15939922.0, + "step": 1790 + }, + { + "epoch": 1.3609422492401215, + "grad_norm": 1.8069748878479004, + "learning_rate": 3.0871767030577686e-06, + "loss": 0.4954872131347656, + "mean_token_accuracy": 0.8289790153503418, + "num_tokens": 15950095.0, + "step": 1791 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 2.0855250358581543, + "learning_rate": 3.085140689437846e-06, + "loss": 0.41999945044517517, + "mean_token_accuracy": 0.8517382144927979, + "num_tokens": 15957972.0, + "step": 1792 + }, + { + "epoch": 1.3624620060790273, + "grad_norm": 2.108659267425537, + "learning_rate": 3.0831042651436634e-06, + "loss": 0.3668023645877838, + "mean_token_accuracy": 0.8710855841636658, + "num_tokens": 15965614.0, + "step": 1793 + }, + { + "epoch": 1.3632218844984803, + "grad_norm": 1.3799632787704468, + "learning_rate": 3.0810674316044602e-06, + "loss": 0.351409375667572, + "mean_token_accuracy": 0.870837390422821, + "num_tokens": 15978854.0, + "step": 1794 + }, + { + "epoch": 1.3639817629179332, + "grad_norm": 1.540397047996521, + "learning_rate": 3.0790301902497664e-06, + "loss": 0.403600811958313, + "mean_token_accuracy": 0.8485002517700195, + "num_tokens": 15993324.0, + "step": 1795 + }, + { + "epoch": 1.364741641337386, + "grad_norm": 1.946882963180542, + "learning_rate": 3.076992542509396e-06, + "loss": 0.40118327736854553, + "mean_token_accuracy": 0.8607497811317444, + "num_tokens": 16001937.0, + "step": 1796 + }, + { + "epoch": 1.365501519756839, + "grad_norm": 2.0464305877685547, + "learning_rate": 3.0749544898134487e-06, + "loss": 0.31742292642593384, + "mean_token_accuracy": 0.8878391981124878, + "num_tokens": 16009277.0, + "step": 1797 + }, + { + "epoch": 1.3662613981762917, + "grad_norm": 2.091754913330078, + "learning_rate": 3.072916033592307e-06, + "loss": 0.31580421328544617, + "mean_token_accuracy": 0.8875244855880737, + "num_tokens": 16015756.0, + "step": 1798 + }, + { + "epoch": 1.3670212765957448, + "grad_norm": 3.4449212551116943, + "learning_rate": 3.0708771752766397e-06, + "loss": 0.4692591726779938, + "mean_token_accuracy": 0.8456202149391174, + "num_tokens": 16019912.0, + "step": 1799 + }, + { + "epoch": 1.3677811550151975, + "grad_norm": 1.600419521331787, + "learning_rate": 3.068837916297396e-06, + "loss": 0.40389442443847656, + "mean_token_accuracy": 0.8378961086273193, + "num_tokens": 16032637.0, + "step": 1800 + }, + { + "epoch": 1.3685410334346504, + "grad_norm": 1.5282686948776245, + "learning_rate": 3.0667982580858047e-06, + "loss": 0.379841685295105, + "mean_token_accuracy": 0.8752143383026123, + "num_tokens": 16045205.0, + "step": 1801 + }, + { + "epoch": 1.3693009118541033, + "grad_norm": 2.486079454421997, + "learning_rate": 3.0647582020733773e-06, + "loss": 0.41060030460357666, + "mean_token_accuracy": 0.8575131893157959, + "num_tokens": 16051189.0, + "step": 1802 + }, + { + "epoch": 1.3700607902735562, + "grad_norm": 1.9458621740341187, + "learning_rate": 3.062717749691904e-06, + "loss": 0.4442213773727417, + "mean_token_accuracy": 0.8451495170593262, + "num_tokens": 16059700.0, + "step": 1803 + }, + { + "epoch": 1.3708206686930091, + "grad_norm": 1.4333001375198364, + "learning_rate": 3.0606769023734535e-06, + "loss": 0.39132001996040344, + "mean_token_accuracy": 0.8609901666641235, + "num_tokens": 16072458.0, + "step": 1804 + }, + { + "epoch": 1.371580547112462, + "grad_norm": 1.490355372428894, + "learning_rate": 3.0586356615503693e-06, + "loss": 0.4108564257621765, + "mean_token_accuracy": 0.8871046304702759, + "num_tokens": 16083142.0, + "step": 1805 + }, + { + "epoch": 1.372340425531915, + "grad_norm": 1.7765129804611206, + "learning_rate": 3.056594028655274e-06, + "loss": 0.3850266635417938, + "mean_token_accuracy": 0.8923365473747253, + "num_tokens": 16092519.0, + "step": 1806 + }, + { + "epoch": 1.3731003039513678, + "grad_norm": 1.955661416053772, + "learning_rate": 3.0545520051210637e-06, + "loss": 0.4665378928184509, + "mean_token_accuracy": 0.837419867515564, + "num_tokens": 16100618.0, + "step": 1807 + }, + { + "epoch": 1.3738601823708207, + "grad_norm": 3.259265422821045, + "learning_rate": 3.052509592380909e-06, + "loss": 0.24722981452941895, + "mean_token_accuracy": 0.9106054306030273, + "num_tokens": 16103836.0, + "step": 1808 + }, + { + "epoch": 1.3746200607902734, + "grad_norm": 1.7995736598968506, + "learning_rate": 3.050466791868254e-06, + "loss": 0.4982220530509949, + "mean_token_accuracy": 0.8298169374465942, + "num_tokens": 16114727.0, + "step": 1809 + }, + { + "epoch": 1.3753799392097266, + "grad_norm": 1.9643093347549438, + "learning_rate": 3.048423605016815e-06, + "loss": 0.5076829195022583, + "mean_token_accuracy": 0.8303098678588867, + "num_tokens": 16129491.0, + "step": 1810 + }, + { + "epoch": 1.3761398176291793, + "grad_norm": 3.505594491958618, + "learning_rate": 3.0463800332605787e-06, + "loss": 0.27466052770614624, + "mean_token_accuracy": 0.9018045663833618, + "num_tokens": 16132640.0, + "step": 1811 + }, + { + "epoch": 1.3768996960486322, + "grad_norm": 1.798437237739563, + "learning_rate": 3.0443360780338034e-06, + "loss": 0.4004853069782257, + "mean_token_accuracy": 0.8569544553756714, + "num_tokens": 16143317.0, + "step": 1812 + }, + { + "epoch": 1.377659574468085, + "grad_norm": 2.276740789413452, + "learning_rate": 3.042291740771014e-06, + "loss": 0.3823797106742859, + "mean_token_accuracy": 0.8764113783836365, + "num_tokens": 16148898.0, + "step": 1813 + }, + { + "epoch": 1.378419452887538, + "grad_norm": 2.5051357746124268, + "learning_rate": 3.0402470229070057e-06, + "loss": 0.40365856885910034, + "mean_token_accuracy": 0.8809891939163208, + "num_tokens": 16153815.0, + "step": 1814 + }, + { + "epoch": 1.3791793313069909, + "grad_norm": 1.2379236221313477, + "learning_rate": 3.03820192587684e-06, + "loss": 0.3955119848251343, + "mean_token_accuracy": 0.8536627292633057, + "num_tokens": 16167783.0, + "step": 1815 + }, + { + "epoch": 1.3799392097264438, + "grad_norm": 2.2286343574523926, + "learning_rate": 3.036156451115846e-06, + "loss": 0.39647501707077026, + "mean_token_accuracy": 0.8621993064880371, + "num_tokens": 16174707.0, + "step": 1816 + }, + { + "epoch": 1.3806990881458967, + "grad_norm": 1.884639024734497, + "learning_rate": 3.034110600059616e-06, + "loss": 0.31612110137939453, + "mean_token_accuracy": 0.8942475318908691, + "num_tokens": 16181919.0, + "step": 1817 + }, + { + "epoch": 1.3814589665653496, + "grad_norm": 1.891312599182129, + "learning_rate": 3.0320643741440052e-06, + "loss": 0.46209126710891724, + "mean_token_accuracy": 0.8374713659286499, + "num_tokens": 16189276.0, + "step": 1818 + }, + { + "epoch": 1.3822188449848025, + "grad_norm": 2.507478713989258, + "learning_rate": 3.0300177748051375e-06, + "loss": 0.37601593136787415, + "mean_token_accuracy": 0.8633589148521423, + "num_tokens": 16194346.0, + "step": 1819 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 1.5046696662902832, + "learning_rate": 3.0279708034793907e-06, + "loss": 0.3284982144832611, + "mean_token_accuracy": 0.8792630434036255, + "num_tokens": 16205457.0, + "step": 1820 + }, + { + "epoch": 1.3837386018237083, + "grad_norm": 2.4244449138641357, + "learning_rate": 3.025923461603412e-06, + "loss": 0.40939009189605713, + "mean_token_accuracy": 0.8596426248550415, + "num_tokens": 16211866.0, + "step": 1821 + }, + { + "epoch": 1.384498480243161, + "grad_norm": 2.8656933307647705, + "learning_rate": 3.0238757506141013e-06, + "loss": 0.4397110044956207, + "mean_token_accuracy": 0.8597331047058105, + "num_tokens": 16216607.0, + "step": 1822 + }, + { + "epoch": 1.385258358662614, + "grad_norm": 2.0718610286712646, + "learning_rate": 3.0218276719486245e-06, + "loss": 0.49057573080062866, + "mean_token_accuracy": 0.8325331211090088, + "num_tokens": 16224014.0, + "step": 1823 + }, + { + "epoch": 1.3860182370820668, + "grad_norm": 1.054450273513794, + "learning_rate": 3.019779227044398e-06, + "loss": 0.3758106827735901, + "mean_token_accuracy": 0.8689473867416382, + "num_tokens": 16248627.0, + "step": 1824 + }, + { + "epoch": 1.3867781155015197, + "grad_norm": 2.1115148067474365, + "learning_rate": 3.0177304173391038e-06, + "loss": 0.502967119216919, + "mean_token_accuracy": 0.823198676109314, + "num_tokens": 16256255.0, + "step": 1825 + }, + { + "epoch": 1.3875379939209727, + "grad_norm": 2.207277297973633, + "learning_rate": 3.015681244270672e-06, + "loss": 0.3458971083164215, + "mean_token_accuracy": 0.8930196762084961, + "num_tokens": 16261823.0, + "step": 1826 + }, + { + "epoch": 1.3882978723404256, + "grad_norm": 1.289669156074524, + "learning_rate": 3.0136317092772923e-06, + "loss": 0.4422765374183655, + "mean_token_accuracy": 0.8358346819877625, + "num_tokens": 16280659.0, + "step": 1827 + }, + { + "epoch": 1.3890577507598785, + "grad_norm": 2.233865737915039, + "learning_rate": 3.0115818137974066e-06, + "loss": 0.3643006384372711, + "mean_token_accuracy": 0.8682862520217896, + "num_tokens": 16286356.0, + "step": 1828 + }, + { + "epoch": 1.3898176291793314, + "grad_norm": 1.0950042009353638, + "learning_rate": 3.0095315592697126e-06, + "loss": 0.34712421894073486, + "mean_token_accuracy": 0.8578766584396362, + "num_tokens": 16307298.0, + "step": 1829 + }, + { + "epoch": 1.3905775075987843, + "grad_norm": 1.1708037853240967, + "learning_rate": 3.007480947133155e-06, + "loss": 0.33152541518211365, + "mean_token_accuracy": 0.894973874092102, + "num_tokens": 16323232.0, + "step": 1830 + }, + { + "epoch": 1.391337386018237, + "grad_norm": 1.2226970195770264, + "learning_rate": 3.0054299788269343e-06, + "loss": 0.3915635943412781, + "mean_token_accuracy": 0.8575779795646667, + "num_tokens": 16339273.0, + "step": 1831 + }, + { + "epoch": 1.39209726443769, + "grad_norm": 1.2226042747497559, + "learning_rate": 3.0033786557904982e-06, + "loss": 0.45846253633499146, + "mean_token_accuracy": 0.8290432691574097, + "num_tokens": 16360145.0, + "step": 1832 + }, + { + "epoch": 1.3928571428571428, + "grad_norm": 2.0117406845092773, + "learning_rate": 3.001326979463545e-06, + "loss": 0.3837882876396179, + "mean_token_accuracy": 0.8941739797592163, + "num_tokens": 16366602.0, + "step": 1833 + }, + { + "epoch": 1.3936170212765957, + "grad_norm": 1.8419997692108154, + "learning_rate": 2.9992749512860177e-06, + "loss": 0.40777021646499634, + "mean_token_accuracy": 0.854655385017395, + "num_tokens": 16375611.0, + "step": 1834 + }, + { + "epoch": 1.3943768996960486, + "grad_norm": 1.9405122995376587, + "learning_rate": 2.9972225726981114e-06, + "loss": 0.46685922145843506, + "mean_token_accuracy": 0.8493201732635498, + "num_tokens": 16384878.0, + "step": 1835 + }, + { + "epoch": 1.3951367781155015, + "grad_norm": 1.2425674200057983, + "learning_rate": 2.995169845140264e-06, + "loss": 0.394692063331604, + "mean_token_accuracy": 0.851348876953125, + "num_tokens": 16404452.0, + "step": 1836 + }, + { + "epoch": 1.3958966565349544, + "grad_norm": 1.2215365171432495, + "learning_rate": 2.9931167700531575e-06, + "loss": 0.31412452459335327, + "mean_token_accuracy": 0.882760763168335, + "num_tokens": 16419358.0, + "step": 1837 + }, + { + "epoch": 1.3966565349544073, + "grad_norm": 1.912168025970459, + "learning_rate": 2.9910633488777198e-06, + "loss": 0.5065487623214722, + "mean_token_accuracy": 0.8524355292320251, + "num_tokens": 16430418.0, + "step": 1838 + }, + { + "epoch": 1.3974164133738602, + "grad_norm": 2.2173948287963867, + "learning_rate": 2.989009583055121e-06, + "loss": 0.4290938377380371, + "mean_token_accuracy": 0.8381836414337158, + "num_tokens": 16438267.0, + "step": 1839 + }, + { + "epoch": 1.3981762917933132, + "grad_norm": 1.8293484449386597, + "learning_rate": 2.9869554740267726e-06, + "loss": 0.41683733463287354, + "mean_token_accuracy": 0.8548779487609863, + "num_tokens": 16447382.0, + "step": 1840 + }, + { + "epoch": 1.398936170212766, + "grad_norm": 1.835015892982483, + "learning_rate": 2.9849010232343274e-06, + "loss": 0.5080599784851074, + "mean_token_accuracy": 0.8193596601486206, + "num_tokens": 16458541.0, + "step": 1841 + }, + { + "epoch": 1.3996960486322187, + "grad_norm": 2.031339645385742, + "learning_rate": 2.982846232119679e-06, + "loss": 0.5168882012367249, + "mean_token_accuracy": 0.8525956869125366, + "num_tokens": 16467747.0, + "step": 1842 + }, + { + "epoch": 1.4004559270516717, + "grad_norm": 1.5554167032241821, + "learning_rate": 2.9807911021249573e-06, + "loss": 0.35098958015441895, + "mean_token_accuracy": 0.888373851776123, + "num_tokens": 16479319.0, + "step": 1843 + }, + { + "epoch": 1.4012158054711246, + "grad_norm": 1.7183740139007568, + "learning_rate": 2.9787356346925327e-06, + "loss": 0.41263148188591003, + "mean_token_accuracy": 0.8478364944458008, + "num_tokens": 16489952.0, + "step": 1844 + }, + { + "epoch": 1.4019756838905775, + "grad_norm": 1.7743209600448608, + "learning_rate": 2.9766798312650112e-06, + "loss": 0.4211183190345764, + "mean_token_accuracy": 0.8641136884689331, + "num_tokens": 16498655.0, + "step": 1845 + }, + { + "epoch": 1.4027355623100304, + "grad_norm": 2.141300916671753, + "learning_rate": 2.9746236932852355e-06, + "loss": 0.49548980593681335, + "mean_token_accuracy": 0.8304252028465271, + "num_tokens": 16506348.0, + "step": 1846 + }, + { + "epoch": 1.4034954407294833, + "grad_norm": 2.341571807861328, + "learning_rate": 2.9725672221962804e-06, + "loss": 0.40804803371429443, + "mean_token_accuracy": 0.8545800447463989, + "num_tokens": 16513091.0, + "step": 1847 + }, + { + "epoch": 1.4042553191489362, + "grad_norm": 1.934428095817566, + "learning_rate": 2.9705104194414587e-06, + "loss": 0.30029812455177307, + "mean_token_accuracy": 0.9032052755355835, + "num_tokens": 16519455.0, + "step": 1848 + }, + { + "epoch": 1.405015197568389, + "grad_norm": 1.420804500579834, + "learning_rate": 2.9684532864643123e-06, + "loss": 0.4384060502052307, + "mean_token_accuracy": 0.8465110063552856, + "num_tokens": 16533222.0, + "step": 1849 + }, + { + "epoch": 1.405775075987842, + "grad_norm": 2.1180737018585205, + "learning_rate": 2.9663958247086165e-06, + "loss": 0.3915565609931946, + "mean_token_accuracy": 0.8633890748023987, + "num_tokens": 16539489.0, + "step": 1850 + }, + { + "epoch": 1.4065349544072947, + "grad_norm": 1.408048152923584, + "learning_rate": 2.964338035618378e-06, + "loss": 0.46166157722473145, + "mean_token_accuracy": 0.8305013179779053, + "num_tokens": 16555785.0, + "step": 1851 + }, + { + "epoch": 1.4072948328267478, + "grad_norm": 1.3418530225753784, + "learning_rate": 2.9622799206378306e-06, + "loss": 0.5314373970031738, + "mean_token_accuracy": 0.81779944896698, + "num_tokens": 16578111.0, + "step": 1852 + }, + { + "epoch": 1.4080547112462005, + "grad_norm": 1.4634262323379517, + "learning_rate": 2.9602214812114414e-06, + "loss": 0.4859408140182495, + "mean_token_accuracy": 0.8261818885803223, + "num_tokens": 16591976.0, + "step": 1853 + }, + { + "epoch": 1.4088145896656534, + "grad_norm": 1.4840295314788818, + "learning_rate": 2.9581627187838997e-06, + "loss": 0.4079628586769104, + "mean_token_accuracy": 0.8549603223800659, + "num_tokens": 16603631.0, + "step": 1854 + }, + { + "epoch": 1.4095744680851063, + "grad_norm": 2.1474642753601074, + "learning_rate": 2.956103634800126e-06, + "loss": 0.32997995615005493, + "mean_token_accuracy": 0.8836915493011475, + "num_tokens": 16609875.0, + "step": 1855 + }, + { + "epoch": 1.4103343465045592, + "grad_norm": 2.627460241317749, + "learning_rate": 2.9540442307052643e-06, + "loss": 0.3229186236858368, + "mean_token_accuracy": 0.8852157592773438, + "num_tokens": 16614113.0, + "step": 1856 + }, + { + "epoch": 1.4110942249240122, + "grad_norm": 1.9569811820983887, + "learning_rate": 2.9519845079446824e-06, + "loss": 0.5057883858680725, + "mean_token_accuracy": 0.8585711717605591, + "num_tokens": 16624611.0, + "step": 1857 + }, + { + "epoch": 1.411854103343465, + "grad_norm": 2.0604090690612793, + "learning_rate": 2.949924467963975e-06, + "loss": 0.4681510329246521, + "mean_token_accuracy": 0.8390560150146484, + "num_tokens": 16632938.0, + "step": 1858 + }, + { + "epoch": 1.412613981762918, + "grad_norm": 2.5430450439453125, + "learning_rate": 2.9478641122089563e-06, + "loss": 0.3090999126434326, + "mean_token_accuracy": 0.8943990468978882, + "num_tokens": 16637135.0, + "step": 1859 + }, + { + "epoch": 1.4133738601823709, + "grad_norm": 1.3275387287139893, + "learning_rate": 2.945803442125663e-06, + "loss": 0.3592180013656616, + "mean_token_accuracy": 0.8678265810012817, + "num_tokens": 16650322.0, + "step": 1860 + }, + { + "epoch": 1.4141337386018238, + "grad_norm": 1.9070929288864136, + "learning_rate": 2.943742459160354e-06, + "loss": 0.5332518815994263, + "mean_token_accuracy": 0.8475706577301025, + "num_tokens": 16660240.0, + "step": 1861 + }, + { + "epoch": 1.4148936170212765, + "grad_norm": 2.8724546432495117, + "learning_rate": 2.9416811647595052e-06, + "loss": 0.5052884817123413, + "mean_token_accuracy": 0.8363175392150879, + "num_tokens": 16665481.0, + "step": 1862 + }, + { + "epoch": 1.4156534954407296, + "grad_norm": 4.203817844390869, + "learning_rate": 2.939619560369813e-06, + "loss": 0.546925961971283, + "mean_token_accuracy": 0.834044337272644, + "num_tokens": 16669615.0, + "step": 1863 + }, + { + "epoch": 1.4164133738601823, + "grad_norm": 1.6466281414031982, + "learning_rate": 2.9375576474381907e-06, + "loss": 0.3474533259868622, + "mean_token_accuracy": 0.8571163415908813, + "num_tokens": 16678893.0, + "step": 1864 + }, + { + "epoch": 1.4171732522796352, + "grad_norm": 1.8885842561721802, + "learning_rate": 2.9354954274117683e-06, + "loss": 0.3726021349430084, + "mean_token_accuracy": 0.8629094958305359, + "num_tokens": 16685939.0, + "step": 1865 + }, + { + "epoch": 1.417933130699088, + "grad_norm": 2.830599784851074, + "learning_rate": 2.9334329017378898e-06, + "loss": 0.4138668477535248, + "mean_token_accuracy": 0.8670746088027954, + "num_tokens": 16690012.0, + "step": 1866 + }, + { + "epoch": 1.418693009118541, + "grad_norm": 1.6838961839675903, + "learning_rate": 2.9313700718641167e-06, + "loss": 0.33954259753227234, + "mean_token_accuracy": 0.8660278916358948, + "num_tokens": 16700061.0, + "step": 1867 + }, + { + "epoch": 1.419452887537994, + "grad_norm": 2.8767011165618896, + "learning_rate": 2.9293069392382224e-06, + "loss": 0.4650302827358246, + "mean_token_accuracy": 0.8448452949523926, + "num_tokens": 16705072.0, + "step": 1868 + }, + { + "epoch": 1.4202127659574468, + "grad_norm": 1.5901305675506592, + "learning_rate": 2.927243505308192e-06, + "loss": 0.40838998556137085, + "mean_token_accuracy": 0.8560664653778076, + "num_tokens": 16714763.0, + "step": 1869 + }, + { + "epoch": 1.4209726443768997, + "grad_norm": 1.3293657302856445, + "learning_rate": 2.925179771522223e-06, + "loss": 0.34712862968444824, + "mean_token_accuracy": 0.8633697032928467, + "num_tokens": 16729575.0, + "step": 1870 + }, + { + "epoch": 1.4217325227963526, + "grad_norm": 1.7465964555740356, + "learning_rate": 2.9231157393287234e-06, + "loss": 0.48190903663635254, + "mean_token_accuracy": 0.8255834579467773, + "num_tokens": 16742529.0, + "step": 1871 + }, + { + "epoch": 1.4224924012158056, + "grad_norm": 1.865749716758728, + "learning_rate": 2.9210514101763116e-06, + "loss": 0.4912028908729553, + "mean_token_accuracy": 0.8309572339057922, + "num_tokens": 16753989.0, + "step": 1872 + }, + { + "epoch": 1.4232522796352582, + "grad_norm": 2.55780291557312, + "learning_rate": 2.9189867855138103e-06, + "loss": 0.4550635814666748, + "mean_token_accuracy": 0.8584091067314148, + "num_tokens": 16758906.0, + "step": 1873 + }, + { + "epoch": 1.4240121580547114, + "grad_norm": 1.867530107498169, + "learning_rate": 2.9169218667902562e-06, + "loss": 0.3524911105632782, + "mean_token_accuracy": 0.8715004920959473, + "num_tokens": 16765969.0, + "step": 1874 + }, + { + "epoch": 1.424772036474164, + "grad_norm": 1.8886862993240356, + "learning_rate": 2.9148566554548857e-06, + "loss": 0.37144535779953003, + "mean_token_accuracy": 0.8640961050987244, + "num_tokens": 16773935.0, + "step": 1875 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 1.266065239906311, + "learning_rate": 2.912791152957145e-06, + "loss": 0.3341747522354126, + "mean_token_accuracy": 0.8929134607315063, + "num_tokens": 16787780.0, + "step": 1876 + }, + { + "epoch": 1.4262917933130699, + "grad_norm": 2.524888753890991, + "learning_rate": 2.9107253607466833e-06, + "loss": 0.33709171414375305, + "mean_token_accuracy": 0.8857531547546387, + "num_tokens": 16792753.0, + "step": 1877 + }, + { + "epoch": 1.4270516717325228, + "grad_norm": 1.9269018173217773, + "learning_rate": 2.908659280273354e-06, + "loss": 0.32599249482154846, + "mean_token_accuracy": 0.8777773380279541, + "num_tokens": 16799904.0, + "step": 1878 + }, + { + "epoch": 1.4278115501519757, + "grad_norm": 1.9844375848770142, + "learning_rate": 2.9065929129872097e-06, + "loss": 0.4086732268333435, + "mean_token_accuracy": 0.8505409955978394, + "num_tokens": 16807774.0, + "step": 1879 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 4.0958662033081055, + "learning_rate": 2.9045262603385073e-06, + "loss": 0.3838827610015869, + "mean_token_accuracy": 0.877601146697998, + "num_tokens": 16810908.0, + "step": 1880 + }, + { + "epoch": 1.4293313069908815, + "grad_norm": 1.7323768138885498, + "learning_rate": 2.902459323777704e-06, + "loss": 0.37459003925323486, + "mean_token_accuracy": 0.8655836582183838, + "num_tokens": 16819494.0, + "step": 1881 + }, + { + "epoch": 1.4300911854103344, + "grad_norm": 2.608043670654297, + "learning_rate": 2.900392104755455e-06, + "loss": 0.5798726677894592, + "mean_token_accuracy": 0.8382592797279358, + "num_tokens": 16827745.0, + "step": 1882 + }, + { + "epoch": 1.4308510638297873, + "grad_norm": 1.3262078762054443, + "learning_rate": 2.8983246047226137e-06, + "loss": 0.3724595904350281, + "mean_token_accuracy": 0.8651963472366333, + "num_tokens": 16844171.0, + "step": 1883 + }, + { + "epoch": 1.43161094224924, + "grad_norm": 1.7250545024871826, + "learning_rate": 2.8962568251302327e-06, + "loss": 0.3478979468345642, + "mean_token_accuracy": 0.8807886242866516, + "num_tokens": 16852838.0, + "step": 1884 + }, + { + "epoch": 1.4323708206686931, + "grad_norm": 2.114525318145752, + "learning_rate": 2.8941887674295573e-06, + "loss": 0.5156140327453613, + "mean_token_accuracy": 0.825178861618042, + "num_tokens": 16861087.0, + "step": 1885 + }, + { + "epoch": 1.4331306990881458, + "grad_norm": 2.400829792022705, + "learning_rate": 2.892120433072031e-06, + "loss": 0.2807392477989197, + "mean_token_accuracy": 0.8907361030578613, + "num_tokens": 16866557.0, + "step": 1886 + }, + { + "epoch": 1.4338905775075987, + "grad_norm": 2.490880012512207, + "learning_rate": 2.8900518235092908e-06, + "loss": 0.2615952491760254, + "mean_token_accuracy": 0.9152894020080566, + "num_tokens": 16871357.0, + "step": 1887 + }, + { + "epoch": 1.4346504559270516, + "grad_norm": 1.9058431386947632, + "learning_rate": 2.887982940193165e-06, + "loss": 0.43623363971710205, + "mean_token_accuracy": 0.84696364402771, + "num_tokens": 16879016.0, + "step": 1888 + }, + { + "epoch": 1.4354103343465046, + "grad_norm": 1.4520210027694702, + "learning_rate": 2.8859137845756785e-06, + "loss": 0.3961856961250305, + "mean_token_accuracy": 0.8518897294998169, + "num_tokens": 16892254.0, + "step": 1889 + }, + { + "epoch": 1.4361702127659575, + "grad_norm": 2.500274896621704, + "learning_rate": 2.8838443581090415e-06, + "loss": 0.41457289457321167, + "mean_token_accuracy": 0.8751448392868042, + "num_tokens": 16897156.0, + "step": 1890 + }, + { + "epoch": 1.4369300911854104, + "grad_norm": 2.9312057495117188, + "learning_rate": 2.8817746622456585e-06, + "loss": 0.45875269174575806, + "mean_token_accuracy": 0.8411039113998413, + "num_tokens": 16902291.0, + "step": 1891 + }, + { + "epoch": 1.4376899696048633, + "grad_norm": 2.367419481277466, + "learning_rate": 2.879704698438121e-06, + "loss": 0.3643629848957062, + "mean_token_accuracy": 0.8771071434020996, + "num_tokens": 16908128.0, + "step": 1892 + }, + { + "epoch": 1.4384498480243162, + "grad_norm": 1.9907705783843994, + "learning_rate": 2.8776344681392106e-06, + "loss": 0.3206835389137268, + "mean_token_accuracy": 0.879996657371521, + "num_tokens": 16914918.0, + "step": 1893 + }, + { + "epoch": 1.439209726443769, + "grad_norm": 3.536956310272217, + "learning_rate": 2.875563972801893e-06, + "loss": 0.3640141785144806, + "mean_token_accuracy": 0.8814959526062012, + "num_tokens": 16918187.0, + "step": 1894 + }, + { + "epoch": 1.4399696048632218, + "grad_norm": 1.3451156616210938, + "learning_rate": 2.8734932138793226e-06, + "loss": 0.3427346348762512, + "mean_token_accuracy": 0.8835382461547852, + "num_tokens": 16931135.0, + "step": 1895 + }, + { + "epoch": 1.4407294832826747, + "grad_norm": 2.0735955238342285, + "learning_rate": 2.871422192824837e-06, + "loss": 0.4265315532684326, + "mean_token_accuracy": 0.8452677726745605, + "num_tokens": 16937995.0, + "step": 1896 + }, + { + "epoch": 1.4414893617021276, + "grad_norm": 1.5124932527542114, + "learning_rate": 2.8693509110919597e-06, + "loss": 0.497121661901474, + "mean_token_accuracy": 0.815092921257019, + "num_tokens": 16952743.0, + "step": 1897 + }, + { + "epoch": 1.4422492401215805, + "grad_norm": 3.716669797897339, + "learning_rate": 2.867279370134395e-06, + "loss": 0.5452651381492615, + "mean_token_accuracy": 0.8150380849838257, + "num_tokens": 16956797.0, + "step": 1898 + }, + { + "epoch": 1.4430091185410334, + "grad_norm": 1.3571398258209229, + "learning_rate": 2.8652075714060296e-06, + "loss": 0.4249724745750427, + "mean_token_accuracy": 0.8675867915153503, + "num_tokens": 16974494.0, + "step": 1899 + }, + { + "epoch": 1.4437689969604863, + "grad_norm": 2.310673475265503, + "learning_rate": 2.863135516360932e-06, + "loss": 0.39368677139282227, + "mean_token_accuracy": 0.878392219543457, + "num_tokens": 16980612.0, + "step": 1900 + }, + { + "epoch": 1.4445288753799392, + "grad_norm": 1.9025533199310303, + "learning_rate": 2.8610632064533517e-06, + "loss": 0.4786127805709839, + "mean_token_accuracy": 0.8720556497573853, + "num_tokens": 16992262.0, + "step": 1901 + }, + { + "epoch": 1.4452887537993921, + "grad_norm": 2.528564453125, + "learning_rate": 2.8589906431377133e-06, + "loss": 0.4223094582557678, + "mean_token_accuracy": 0.8513246178627014, + "num_tokens": 16997717.0, + "step": 1902 + }, + { + "epoch": 1.446048632218845, + "grad_norm": 1.010425329208374, + "learning_rate": 2.8569178278686222e-06, + "loss": 0.3908255696296692, + "mean_token_accuracy": 0.8620463609695435, + "num_tokens": 17020903.0, + "step": 1903 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 1.5760232210159302, + "learning_rate": 2.8548447621008614e-06, + "loss": 0.4134044051170349, + "mean_token_accuracy": 0.8472093343734741, + "num_tokens": 17035250.0, + "step": 1904 + }, + { + "epoch": 1.4475683890577509, + "grad_norm": 2.0668535232543945, + "learning_rate": 2.8527714472893866e-06, + "loss": 0.44095730781555176, + "mean_token_accuracy": 0.881983757019043, + "num_tokens": 17042170.0, + "step": 1905 + }, + { + "epoch": 1.4483282674772036, + "grad_norm": 1.1620599031448364, + "learning_rate": 2.85069788488933e-06, + "loss": 0.3607163429260254, + "mean_token_accuracy": 0.8684282898902893, + "num_tokens": 17061937.0, + "step": 1906 + }, + { + "epoch": 1.4490881458966565, + "grad_norm": 2.1316568851470947, + "learning_rate": 2.8486240763559984e-06, + "loss": 0.3478124141693115, + "mean_token_accuracy": 0.8772403001785278, + "num_tokens": 17068628.0, + "step": 1907 + }, + { + "epoch": 1.4498480243161094, + "grad_norm": 2.4756391048431396, + "learning_rate": 2.8465500231448707e-06, + "loss": 0.46441152691841125, + "mean_token_accuracy": 0.8436450958251953, + "num_tokens": 17075495.0, + "step": 1908 + }, + { + "epoch": 1.4506079027355623, + "grad_norm": 2.249720573425293, + "learning_rate": 2.844475726711595e-06, + "loss": 0.41565513610839844, + "mean_token_accuracy": 0.8525094985961914, + "num_tokens": 17080940.0, + "step": 1909 + }, + { + "epoch": 1.4513677811550152, + "grad_norm": 2.3081841468811035, + "learning_rate": 2.8424011885119956e-06, + "loss": 0.49903199076652527, + "mean_token_accuracy": 0.8212426900863647, + "num_tokens": 17092024.0, + "step": 1910 + }, + { + "epoch": 1.452127659574468, + "grad_norm": 1.2929959297180176, + "learning_rate": 2.8403264100020613e-06, + "loss": 0.47038257122039795, + "mean_token_accuracy": 0.8319816589355469, + "num_tokens": 17108840.0, + "step": 1911 + }, + { + "epoch": 1.452887537993921, + "grad_norm": 1.6476463079452515, + "learning_rate": 2.8382513926379508e-06, + "loss": 0.42287829518318176, + "mean_token_accuracy": 0.8555682897567749, + "num_tokens": 17119704.0, + "step": 1912 + }, + { + "epoch": 1.453647416413374, + "grad_norm": 1.759998083114624, + "learning_rate": 2.836176137875993e-06, + "loss": 0.40904951095581055, + "mean_token_accuracy": 0.8698266744613647, + "num_tokens": 17130676.0, + "step": 1913 + }, + { + "epoch": 1.4544072948328268, + "grad_norm": 1.510909914970398, + "learning_rate": 2.8341006471726817e-06, + "loss": 0.47834792733192444, + "mean_token_accuracy": 0.8335825204849243, + "num_tokens": 17146304.0, + "step": 1914 + }, + { + "epoch": 1.4551671732522795, + "grad_norm": 3.538071632385254, + "learning_rate": 2.832024921984674e-06, + "loss": 0.34059035778045654, + "mean_token_accuracy": 0.8769031763076782, + "num_tokens": 17150458.0, + "step": 1915 + }, + { + "epoch": 1.4559270516717326, + "grad_norm": 2.3368659019470215, + "learning_rate": 2.8299489637687955e-06, + "loss": 0.43068382143974304, + "mean_token_accuracy": 0.845360517501831, + "num_tokens": 17157368.0, + "step": 1916 + }, + { + "epoch": 1.4566869300911853, + "grad_norm": 1.8720396757125854, + "learning_rate": 2.8278727739820334e-06, + "loss": 0.37013399600982666, + "mean_token_accuracy": 0.854241132736206, + "num_tokens": 17166325.0, + "step": 1917 + }, + { + "epoch": 1.4574468085106382, + "grad_norm": 1.6706892251968384, + "learning_rate": 2.825796354081537e-06, + "loss": 0.5397020578384399, + "mean_token_accuracy": 0.8309713006019592, + "num_tokens": 17178920.0, + "step": 1918 + }, + { + "epoch": 1.4582066869300911, + "grad_norm": 2.729210376739502, + "learning_rate": 2.8237197055246175e-06, + "loss": 0.25137859582901, + "mean_token_accuracy": 0.9148792028427124, + "num_tokens": 17183107.0, + "step": 1919 + }, + { + "epoch": 1.458966565349544, + "grad_norm": 3.023500680923462, + "learning_rate": 2.821642829768748e-06, + "loss": 0.43312495946884155, + "mean_token_accuracy": 0.8481811285018921, + "num_tokens": 17187853.0, + "step": 1920 + }, + { + "epoch": 1.459726443768997, + "grad_norm": 1.8108519315719604, + "learning_rate": 2.8195657282715595e-06, + "loss": 0.5101792216300964, + "mean_token_accuracy": 0.8315553069114685, + "num_tokens": 17199247.0, + "step": 1921 + }, + { + "epoch": 1.4604863221884499, + "grad_norm": 2.0262672901153564, + "learning_rate": 2.817488402490841e-06, + "loss": 0.4449934959411621, + "mean_token_accuracy": 0.8634527325630188, + "num_tokens": 17206348.0, + "step": 1922 + }, + { + "epoch": 1.4612462006079028, + "grad_norm": 2.6163926124572754, + "learning_rate": 2.8154108538845405e-06, + "loss": 0.43052345514297485, + "mean_token_accuracy": 0.8375401496887207, + "num_tokens": 17211702.0, + "step": 1923 + }, + { + "epoch": 1.4620060790273557, + "grad_norm": 2.0854408740997314, + "learning_rate": 2.813333083910761e-06, + "loss": 0.5011380910873413, + "mean_token_accuracy": 0.8359915018081665, + "num_tokens": 17219096.0, + "step": 1924 + }, + { + "epoch": 1.4627659574468086, + "grad_norm": 2.2081687450408936, + "learning_rate": 2.8112550940277615e-06, + "loss": 0.5239193439483643, + "mean_token_accuracy": 0.8499593734741211, + "num_tokens": 17229266.0, + "step": 1925 + }, + { + "epoch": 1.4635258358662613, + "grad_norm": 1.798343539237976, + "learning_rate": 2.809176885693956e-06, + "loss": 0.4515029191970825, + "mean_token_accuracy": 0.8400485515594482, + "num_tokens": 17239280.0, + "step": 1926 + }, + { + "epoch": 1.4642857142857144, + "grad_norm": 1.897887945175171, + "learning_rate": 2.807098460367911e-06, + "loss": 0.35935714840888977, + "mean_token_accuracy": 0.8776072263717651, + "num_tokens": 17247132.0, + "step": 1927 + }, + { + "epoch": 1.465045592705167, + "grad_norm": 2.705836296081543, + "learning_rate": 2.8050198195083445e-06, + "loss": 0.3728443682193756, + "mean_token_accuracy": 0.8649885654449463, + "num_tokens": 17251865.0, + "step": 1928 + }, + { + "epoch": 1.46580547112462, + "grad_norm": 1.841178059577942, + "learning_rate": 2.802940964574127e-06, + "loss": 0.40604841709136963, + "mean_token_accuracy": 0.8537783622741699, + "num_tokens": 17260163.0, + "step": 1929 + }, + { + "epoch": 1.466565349544073, + "grad_norm": 2.7393605709075928, + "learning_rate": 2.800861897024279e-06, + "loss": 0.39346879720687866, + "mean_token_accuracy": 0.8628787994384766, + "num_tokens": 17264876.0, + "step": 1930 + }, + { + "epoch": 1.4673252279635258, + "grad_norm": 1.84367835521698, + "learning_rate": 2.798782618317971e-06, + "loss": 0.37411895394325256, + "mean_token_accuracy": 0.8605265617370605, + "num_tokens": 17273049.0, + "step": 1931 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 1.6546733379364014, + "learning_rate": 2.796703129914519e-06, + "loss": 0.4997844099998474, + "mean_token_accuracy": 0.8267433643341064, + "num_tokens": 17285074.0, + "step": 1932 + }, + { + "epoch": 1.4688449848024316, + "grad_norm": 2.2749221324920654, + "learning_rate": 2.79462343327339e-06, + "loss": 0.35453367233276367, + "mean_token_accuracy": 0.8746850490570068, + "num_tokens": 17290273.0, + "step": 1933 + }, + { + "epoch": 1.4696048632218845, + "grad_norm": 1.7142518758773804, + "learning_rate": 2.7925435298541944e-06, + "loss": 0.345878541469574, + "mean_token_accuracy": 0.8600981831550598, + "num_tokens": 17301045.0, + "step": 1934 + }, + { + "epoch": 1.4703647416413375, + "grad_norm": 3.163342237472534, + "learning_rate": 2.7904634211166877e-06, + "loss": 0.4356975853443146, + "mean_token_accuracy": 0.8460350036621094, + "num_tokens": 17305108.0, + "step": 1935 + }, + { + "epoch": 1.4711246200607904, + "grad_norm": 1.6377612352371216, + "learning_rate": 2.7883831085207707e-06, + "loss": 0.4459729790687561, + "mean_token_accuracy": 0.8463394641876221, + "num_tokens": 17315479.0, + "step": 1936 + }, + { + "epoch": 1.471884498480243, + "grad_norm": 1.865268588066101, + "learning_rate": 2.7863025935264876e-06, + "loss": 0.394723117351532, + "mean_token_accuracy": 0.864177942276001, + "num_tokens": 17324795.0, + "step": 1937 + }, + { + "epoch": 1.4726443768996962, + "grad_norm": 1.241937518119812, + "learning_rate": 2.784221877594024e-06, + "loss": 0.2752220630645752, + "mean_token_accuracy": 0.8998259902000427, + "num_tokens": 17338000.0, + "step": 1938 + }, + { + "epoch": 1.4734042553191489, + "grad_norm": 1.8013651371002197, + "learning_rate": 2.7821409621837042e-06, + "loss": 0.4251005947589874, + "mean_token_accuracy": 0.8518919348716736, + "num_tokens": 17347351.0, + "step": 1939 + }, + { + "epoch": 1.4741641337386018, + "grad_norm": 1.2902207374572754, + "learning_rate": 2.7800598487559976e-06, + "loss": 0.3640727400779724, + "mean_token_accuracy": 0.8592870235443115, + "num_tokens": 17362335.0, + "step": 1940 + }, + { + "epoch": 1.4749240121580547, + "grad_norm": 2.5427513122558594, + "learning_rate": 2.777978538771508e-06, + "loss": 0.38166797161102295, + "mean_token_accuracy": 0.8653234839439392, + "num_tokens": 17367733.0, + "step": 1941 + }, + { + "epoch": 1.4756838905775076, + "grad_norm": 1.7793641090393066, + "learning_rate": 2.7758970336909795e-06, + "loss": 0.3113783895969391, + "mean_token_accuracy": 0.8812868595123291, + "num_tokens": 17375267.0, + "step": 1942 + }, + { + "epoch": 1.4764437689969605, + "grad_norm": 3.4031741619110107, + "learning_rate": 2.7738153349752923e-06, + "loss": 0.4800986647605896, + "mean_token_accuracy": 0.8336698412895203, + "num_tokens": 17379549.0, + "step": 1943 + }, + { + "epoch": 1.4772036474164134, + "grad_norm": 1.3451651334762573, + "learning_rate": 2.7717334440854634e-06, + "loss": 0.3115345239639282, + "mean_token_accuracy": 0.908623218536377, + "num_tokens": 17394455.0, + "step": 1944 + }, + { + "epoch": 1.4779635258358663, + "grad_norm": 1.980919599533081, + "learning_rate": 2.7696513624826422e-06, + "loss": 0.391154944896698, + "mean_token_accuracy": 0.8650267720222473, + "num_tokens": 17401931.0, + "step": 1945 + }, + { + "epoch": 1.4787234042553192, + "grad_norm": 1.0118765830993652, + "learning_rate": 2.7675690916281158e-06, + "loss": 0.3157956600189209, + "mean_token_accuracy": 0.8827471733093262, + "num_tokens": 17424144.0, + "step": 1946 + }, + { + "epoch": 1.4794832826747721, + "grad_norm": 1.579654335975647, + "learning_rate": 2.7654866329833e-06, + "loss": 0.4578486382961273, + "mean_token_accuracy": 0.8361750245094299, + "num_tokens": 17435769.0, + "step": 1947 + }, + { + "epoch": 1.4802431610942248, + "grad_norm": 1.7706717252731323, + "learning_rate": 2.763403988009746e-06, + "loss": 0.3564416170120239, + "mean_token_accuracy": 0.8689201474189758, + "num_tokens": 17444088.0, + "step": 1948 + }, + { + "epoch": 1.4810030395136777, + "grad_norm": 1.2264244556427002, + "learning_rate": 2.761321158169134e-06, + "loss": 0.30763837695121765, + "mean_token_accuracy": 0.8960219621658325, + "num_tokens": 17458096.0, + "step": 1949 + }, + { + "epoch": 1.4817629179331306, + "grad_norm": 1.214431881904602, + "learning_rate": 2.759238144923274e-06, + "loss": 0.49099457263946533, + "mean_token_accuracy": 0.8279136419296265, + "num_tokens": 17481062.0, + "step": 1950 + }, + { + "epoch": 1.4825227963525835, + "grad_norm": 1.593892216682434, + "learning_rate": 2.7571549497341044e-06, + "loss": 0.3745320737361908, + "mean_token_accuracy": 0.8690779209136963, + "num_tokens": 17490874.0, + "step": 1951 + }, + { + "epoch": 1.4832826747720365, + "grad_norm": 2.409924268722534, + "learning_rate": 2.755071574063692e-06, + "loss": 0.4310247600078583, + "mean_token_accuracy": 0.8521159291267395, + "num_tokens": 17496942.0, + "step": 1952 + }, + { + "epoch": 1.4840425531914894, + "grad_norm": 1.2557463645935059, + "learning_rate": 2.7529880193742297e-06, + "loss": 0.34304720163345337, + "mean_token_accuracy": 0.8748183250427246, + "num_tokens": 17514391.0, + "step": 1953 + }, + { + "epoch": 1.4848024316109423, + "grad_norm": 1.17310631275177, + "learning_rate": 2.7509042871280373e-06, + "loss": 0.3835817277431488, + "mean_token_accuracy": 0.8853274583816528, + "num_tokens": 17533289.0, + "step": 1954 + }, + { + "epoch": 1.4855623100303952, + "grad_norm": 1.5261479616165161, + "learning_rate": 2.748820378787558e-06, + "loss": 0.4799988865852356, + "mean_token_accuracy": 0.8252149820327759, + "num_tokens": 17544118.0, + "step": 1955 + }, + { + "epoch": 1.486322188449848, + "grad_norm": 2.030930757522583, + "learning_rate": 2.7467362958153585e-06, + "loss": 0.35690805315971375, + "mean_token_accuracy": 0.8959587216377258, + "num_tokens": 17550431.0, + "step": 1956 + }, + { + "epoch": 1.4870820668693008, + "grad_norm": 2.376520872116089, + "learning_rate": 2.7446520396741293e-06, + "loss": 0.262234091758728, + "mean_token_accuracy": 0.9054547548294067, + "num_tokens": 17554853.0, + "step": 1957 + }, + { + "epoch": 1.487841945288754, + "grad_norm": 1.6944479942321777, + "learning_rate": 2.742567611826681e-06, + "loss": 0.529259979724884, + "mean_token_accuracy": 0.8195339441299438, + "num_tokens": 17568016.0, + "step": 1958 + }, + { + "epoch": 1.4886018237082066, + "grad_norm": 2.833029270172119, + "learning_rate": 2.7404830137359445e-06, + "loss": 0.30229634046554565, + "mean_token_accuracy": 0.8933001756668091, + "num_tokens": 17572587.0, + "step": 1959 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 1.7040144205093384, + "learning_rate": 2.7383982468649715e-06, + "loss": 0.3166356682777405, + "mean_token_accuracy": 0.8871906399726868, + "num_tokens": 17580966.0, + "step": 1960 + }, + { + "epoch": 1.4901215805471124, + "grad_norm": 1.7539052963256836, + "learning_rate": 2.7363133126769326e-06, + "loss": 0.4231064021587372, + "mean_token_accuracy": 0.8708304166793823, + "num_tokens": 17590907.0, + "step": 1961 + }, + { + "epoch": 1.4908814589665653, + "grad_norm": 1.6198650598526, + "learning_rate": 2.7342282126351145e-06, + "loss": 0.4198967218399048, + "mean_token_accuracy": 0.8723280429840088, + "num_tokens": 17604291.0, + "step": 1962 + }, + { + "epoch": 1.4916413373860182, + "grad_norm": 1.8437711000442505, + "learning_rate": 2.73214294820292e-06, + "loss": 0.38923323154449463, + "mean_token_accuracy": 0.8697006106376648, + "num_tokens": 17612291.0, + "step": 1963 + }, + { + "epoch": 1.4924012158054711, + "grad_norm": 1.1129369735717773, + "learning_rate": 2.7300575208438684e-06, + "loss": 0.3107512593269348, + "mean_token_accuracy": 0.878618597984314, + "num_tokens": 17630073.0, + "step": 1964 + }, + { + "epoch": 1.493161094224924, + "grad_norm": 3.0210442543029785, + "learning_rate": 2.7279719320215924e-06, + "loss": 0.4630751609802246, + "mean_token_accuracy": 0.8567075729370117, + "num_tokens": 17634758.0, + "step": 1965 + }, + { + "epoch": 1.493920972644377, + "grad_norm": 2.8825972080230713, + "learning_rate": 2.725886183199839e-06, + "loss": 0.35351765155792236, + "mean_token_accuracy": 0.8711981773376465, + "num_tokens": 17639613.0, + "step": 1966 + }, + { + "epoch": 1.4946808510638299, + "grad_norm": 2.111238718032837, + "learning_rate": 2.723800275842468e-06, + "loss": 0.3529569208621979, + "mean_token_accuracy": 0.8679244518280029, + "num_tokens": 17645308.0, + "step": 1967 + }, + { + "epoch": 1.4954407294832825, + "grad_norm": 2.080509901046753, + "learning_rate": 2.7217142114134466e-06, + "loss": 0.43321219086647034, + "mean_token_accuracy": 0.8848220109939575, + "num_tokens": 17652292.0, + "step": 1968 + }, + { + "epoch": 1.4962006079027357, + "grad_norm": 2.8686363697052, + "learning_rate": 2.7196279913768587e-06, + "loss": 0.417035311460495, + "mean_token_accuracy": 0.8724601864814758, + "num_tokens": 17656908.0, + "step": 1969 + }, + { + "epoch": 1.4969604863221884, + "grad_norm": 3.294193744659424, + "learning_rate": 2.717541617196891e-06, + "loss": 0.3551934063434601, + "mean_token_accuracy": 0.8838565349578857, + "num_tokens": 17660590.0, + "step": 1970 + }, + { + "epoch": 1.4977203647416413, + "grad_norm": 1.766292929649353, + "learning_rate": 2.7154550903378425e-06, + "loss": 0.36521971225738525, + "mean_token_accuracy": 0.8810199499130249, + "num_tokens": 17668214.0, + "step": 1971 + }, + { + "epoch": 1.4984802431610942, + "grad_norm": 1.2127676010131836, + "learning_rate": 2.713368412264118e-06, + "loss": 0.35184425115585327, + "mean_token_accuracy": 0.8672580718994141, + "num_tokens": 17684736.0, + "step": 1972 + }, + { + "epoch": 1.499240121580547, + "grad_norm": 2.268256664276123, + "learning_rate": 2.711281584440228e-06, + "loss": 0.40115267038345337, + "mean_token_accuracy": 0.8517841100692749, + "num_tokens": 17691510.0, + "step": 1973 + }, + { + "epoch": 1.5, + "grad_norm": 2.7196054458618164, + "learning_rate": 2.70919460833079e-06, + "loss": 0.3819037675857544, + "mean_token_accuracy": 0.8765411376953125, + "num_tokens": 17696179.0, + "step": 1974 + }, + { + "epoch": 1.500759878419453, + "grad_norm": 2.969406843185425, + "learning_rate": 2.7071074854005206e-06, + "loss": 0.3922455608844757, + "mean_token_accuracy": 0.8796037435531616, + "num_tokens": 17700597.0, + "step": 1975 + }, + { + "epoch": 1.5015197568389058, + "grad_norm": 2.2965853214263916, + "learning_rate": 2.705020217114248e-06, + "loss": 0.5433666110038757, + "mean_token_accuracy": 0.809639036655426, + "num_tokens": 17708895.0, + "step": 1976 + }, + { + "epoch": 1.5022796352583585, + "grad_norm": 1.5584394931793213, + "learning_rate": 2.7029328049368942e-06, + "loss": 0.4736343324184418, + "mean_token_accuracy": 0.8197190761566162, + "num_tokens": 17725202.0, + "step": 1977 + }, + { + "epoch": 1.5030395136778116, + "grad_norm": 1.3903142213821411, + "learning_rate": 2.700845250333486e-06, + "loss": 0.4471571445465088, + "mean_token_accuracy": 0.839043140411377, + "num_tokens": 17742835.0, + "step": 1978 + }, + { + "epoch": 1.5037993920972643, + "grad_norm": 3.080716609954834, + "learning_rate": 2.69875755476915e-06, + "loss": 0.45760005712509155, + "mean_token_accuracy": 0.8366328477859497, + "num_tokens": 17747324.0, + "step": 1979 + }, + { + "epoch": 1.5045592705167175, + "grad_norm": 1.0150405168533325, + "learning_rate": 2.696669719709111e-06, + "loss": 0.33638954162597656, + "mean_token_accuracy": 0.8591676354408264, + "num_tokens": 17765565.0, + "step": 1980 + }, + { + "epoch": 1.5053191489361701, + "grad_norm": 2.402927875518799, + "learning_rate": 2.694581746618691e-06, + "loss": 0.4086601436138153, + "mean_token_accuracy": 0.8769911527633667, + "num_tokens": 17771275.0, + "step": 1981 + }, + { + "epoch": 1.506079027355623, + "grad_norm": 2.030583381652832, + "learning_rate": 2.6924936369633126e-06, + "loss": 0.5115457773208618, + "mean_token_accuracy": 0.8054746389389038, + "num_tokens": 17779999.0, + "step": 1982 + }, + { + "epoch": 1.506838905775076, + "grad_norm": 2.575199604034424, + "learning_rate": 2.6904053922084893e-06, + "loss": 0.363183856010437, + "mean_token_accuracy": 0.8716042637825012, + "num_tokens": 17785473.0, + "step": 1983 + }, + { + "epoch": 1.5075987841945289, + "grad_norm": 1.8497480154037476, + "learning_rate": 2.688317013819832e-06, + "loss": 0.4254384934902191, + "mean_token_accuracy": 0.8549597263336182, + "num_tokens": 17793812.0, + "step": 1984 + }, + { + "epoch": 1.5083586626139818, + "grad_norm": 1.7786511182785034, + "learning_rate": 2.686228503263045e-06, + "loss": 0.33400774002075195, + "mean_token_accuracy": 0.9027615189552307, + "num_tokens": 17801783.0, + "step": 1985 + }, + { + "epoch": 1.5091185410334347, + "grad_norm": 1.8365367650985718, + "learning_rate": 2.684139862003927e-06, + "loss": 0.35765063762664795, + "mean_token_accuracy": 0.8663736581802368, + "num_tokens": 17809562.0, + "step": 1986 + }, + { + "epoch": 1.5098784194528876, + "grad_norm": 1.8817477226257324, + "learning_rate": 2.682051091508365e-06, + "loss": 0.4627506732940674, + "mean_token_accuracy": 0.8358862400054932, + "num_tokens": 17819094.0, + "step": 1987 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 2.221547842025757, + "learning_rate": 2.679962193242338e-06, + "loss": 0.577020525932312, + "mean_token_accuracy": 0.80013108253479, + "num_tokens": 17826666.0, + "step": 1988 + }, + { + "epoch": 1.5113981762917934, + "grad_norm": 2.6618270874023438, + "learning_rate": 2.6778731686719177e-06, + "loss": 0.44632256031036377, + "mean_token_accuracy": 0.8611289262771606, + "num_tokens": 17833172.0, + "step": 1989 + }, + { + "epoch": 1.512158054711246, + "grad_norm": 2.9495689868927, + "learning_rate": 2.67578401926326e-06, + "loss": 0.3482511043548584, + "mean_token_accuracy": 0.8703314661979675, + "num_tokens": 17837220.0, + "step": 1990 + }, + { + "epoch": 1.5129179331306992, + "grad_norm": 2.0943644046783447, + "learning_rate": 2.6736947464826107e-06, + "loss": 0.2354314625263214, + "mean_token_accuracy": 0.9137634038925171, + "num_tokens": 17842712.0, + "step": 1991 + }, + { + "epoch": 1.513677811550152, + "grad_norm": 1.1303033828735352, + "learning_rate": 2.671605351796302e-06, + "loss": 0.3624761700630188, + "mean_token_accuracy": 0.8769594430923462, + "num_tokens": 17860902.0, + "step": 1992 + }, + { + "epoch": 1.5144376899696048, + "grad_norm": 2.8921146392822266, + "learning_rate": 2.6695158366707526e-06, + "loss": 0.2517220973968506, + "mean_token_accuracy": 0.8974182605743408, + "num_tokens": 17865160.0, + "step": 1993 + }, + { + "epoch": 1.5151975683890577, + "grad_norm": 2.320587158203125, + "learning_rate": 2.667426202572463e-06, + "loss": 0.4589889943599701, + "mean_token_accuracy": 0.8379613161087036, + "num_tokens": 17871994.0, + "step": 1994 + }, + { + "epoch": 1.5159574468085106, + "grad_norm": 1.1407674551010132, + "learning_rate": 2.665336450968019e-06, + "loss": 0.34412115812301636, + "mean_token_accuracy": 0.8776306509971619, + "num_tokens": 17889941.0, + "step": 1995 + }, + { + "epoch": 1.5167173252279635, + "grad_norm": 2.069814920425415, + "learning_rate": 2.6632465833240895e-06, + "loss": 0.47524404525756836, + "mean_token_accuracy": 0.830310046672821, + "num_tokens": 17898447.0, + "step": 1996 + }, + { + "epoch": 1.5174772036474165, + "grad_norm": 1.822415828704834, + "learning_rate": 2.661156601107424e-06, + "loss": 0.4541318416595459, + "mean_token_accuracy": 0.8856616020202637, + "num_tokens": 17908729.0, + "step": 1997 + }, + { + "epoch": 1.5182370820668694, + "grad_norm": 2.851428985595703, + "learning_rate": 2.659066505784852e-06, + "loss": 0.41761666536331177, + "mean_token_accuracy": 0.8710572719573975, + "num_tokens": 17913860.0, + "step": 1998 + }, + { + "epoch": 1.518996960486322, + "grad_norm": 1.8483710289001465, + "learning_rate": 2.6569762988232838e-06, + "loss": 0.45517268776893616, + "mean_token_accuracy": 0.8411115407943726, + "num_tokens": 17923497.0, + "step": 1999 + }, + { + "epoch": 1.5197568389057752, + "grad_norm": 1.9044219255447388, + "learning_rate": 2.654885981689706e-06, + "loss": 0.42533189058303833, + "mean_token_accuracy": 0.8597894906997681, + "num_tokens": 17932670.0, + "step": 2000 + }, + { + "epoch": 1.5205167173252279, + "grad_norm": 1.8170348405838013, + "learning_rate": 2.652795555851184e-06, + "loss": 0.4009692072868347, + "mean_token_accuracy": 0.8553036451339722, + "num_tokens": 17941616.0, + "step": 2001 + }, + { + "epoch": 1.521276595744681, + "grad_norm": 1.4704090356826782, + "learning_rate": 2.6507050227748595e-06, + "loss": 0.3732764720916748, + "mean_token_accuracy": 0.8788566589355469, + "num_tokens": 17957187.0, + "step": 2002 + }, + { + "epoch": 1.5220364741641337, + "grad_norm": 1.6681534051895142, + "learning_rate": 2.648614383927949e-06, + "loss": 0.341326504945755, + "mean_token_accuracy": 0.874875545501709, + "num_tokens": 17966668.0, + "step": 2003 + }, + { + "epoch": 1.5227963525835866, + "grad_norm": 1.8578619956970215, + "learning_rate": 2.646523640777741e-06, + "loss": 0.3937399983406067, + "mean_token_accuracy": 0.8656851053237915, + "num_tokens": 17976194.0, + "step": 2004 + }, + { + "epoch": 1.5235562310030395, + "grad_norm": 1.7520431280136108, + "learning_rate": 2.6444327947916037e-06, + "loss": 0.3392767906188965, + "mean_token_accuracy": 0.8799679279327393, + "num_tokens": 17984492.0, + "step": 2005 + }, + { + "epoch": 1.5243161094224924, + "grad_norm": 3.4649906158447266, + "learning_rate": 2.6423418474369707e-06, + "loss": 0.3451516032218933, + "mean_token_accuracy": 0.8753262758255005, + "num_tokens": 17988240.0, + "step": 2006 + }, + { + "epoch": 1.5250759878419453, + "grad_norm": 1.8037052154541016, + "learning_rate": 2.64025080018135e-06, + "loss": 0.34428173303604126, + "mean_token_accuracy": 0.8719067573547363, + "num_tokens": 17996644.0, + "step": 2007 + }, + { + "epoch": 1.5258358662613982, + "grad_norm": 1.743722677230835, + "learning_rate": 2.6381596544923184e-06, + "loss": 0.4446655213832855, + "mean_token_accuracy": 0.8612518906593323, + "num_tokens": 18005109.0, + "step": 2008 + }, + { + "epoch": 1.5265957446808511, + "grad_norm": 1.3357981443405151, + "learning_rate": 2.636068411837523e-06, + "loss": 0.38647788763046265, + "mean_token_accuracy": 0.858294665813446, + "num_tokens": 18018193.0, + "step": 2009 + }, + { + "epoch": 1.5273556231003038, + "grad_norm": 1.4848440885543823, + "learning_rate": 2.6339770736846794e-06, + "loss": 0.3597261607646942, + "mean_token_accuracy": 0.8760983943939209, + "num_tokens": 18028959.0, + "step": 2010 + }, + { + "epoch": 1.528115501519757, + "grad_norm": 2.356933832168579, + "learning_rate": 2.6318856415015664e-06, + "loss": 0.2697138488292694, + "mean_token_accuracy": 0.9078473448753357, + "num_tokens": 18033946.0, + "step": 2011 + }, + { + "epoch": 1.5288753799392096, + "grad_norm": 1.964368224143982, + "learning_rate": 2.629794116756035e-06, + "loss": 0.41349685192108154, + "mean_token_accuracy": 0.8567900657653809, + "num_tokens": 18042724.0, + "step": 2012 + }, + { + "epoch": 1.5296352583586628, + "grad_norm": 1.5630402565002441, + "learning_rate": 2.627702500915995e-06, + "loss": 0.49310681223869324, + "mean_token_accuracy": 0.8229681253433228, + "num_tokens": 18054396.0, + "step": 2013 + }, + { + "epoch": 1.5303951367781155, + "grad_norm": 1.6657718420028687, + "learning_rate": 2.625610795449424e-06, + "loss": 0.4263935387134552, + "mean_token_accuracy": 0.8634918332099915, + "num_tokens": 18064347.0, + "step": 2014 + }, + { + "epoch": 1.5311550151975684, + "grad_norm": 1.3684180974960327, + "learning_rate": 2.6235190018243623e-06, + "loss": 0.2903984487056732, + "mean_token_accuracy": 0.8930408358573914, + "num_tokens": 18076826.0, + "step": 2015 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 1.635044813156128, + "learning_rate": 2.6214271215089106e-06, + "loss": 0.3066539168357849, + "mean_token_accuracy": 0.8912158012390137, + "num_tokens": 18085761.0, + "step": 2016 + }, + { + "epoch": 1.5326747720364742, + "grad_norm": 2.431518316268921, + "learning_rate": 2.6193351559712294e-06, + "loss": 0.31123271584510803, + "mean_token_accuracy": 0.8865828514099121, + "num_tokens": 18091715.0, + "step": 2017 + }, + { + "epoch": 1.533434650455927, + "grad_norm": 1.8317419290542603, + "learning_rate": 2.6172431066795428e-06, + "loss": 0.5042020082473755, + "mean_token_accuracy": 0.8245081901550293, + "num_tokens": 18102095.0, + "step": 2018 + }, + { + "epoch": 1.53419452887538, + "grad_norm": 3.4221980571746826, + "learning_rate": 2.6151509751021307e-06, + "loss": 0.2885819971561432, + "mean_token_accuracy": 0.8997149467468262, + "num_tokens": 18105456.0, + "step": 2019 + }, + { + "epoch": 1.534954407294833, + "grad_norm": 1.4435855150222778, + "learning_rate": 2.6130587627073315e-06, + "loss": 0.45573529601097107, + "mean_token_accuracy": 0.837191104888916, + "num_tokens": 18119039.0, + "step": 2020 + }, + { + "epoch": 1.5357142857142856, + "grad_norm": 1.5748237371444702, + "learning_rate": 2.6109664709635413e-06, + "loss": 0.4561889171600342, + "mean_token_accuracy": 0.8334558010101318, + "num_tokens": 18132150.0, + "step": 2021 + }, + { + "epoch": 1.5364741641337387, + "grad_norm": 2.8278751373291016, + "learning_rate": 2.60887410133921e-06, + "loss": 0.3495104908943176, + "mean_token_accuracy": 0.8926796913146973, + "num_tokens": 18136528.0, + "step": 2022 + }, + { + "epoch": 1.5372340425531914, + "grad_norm": 2.5045573711395264, + "learning_rate": 2.606781655302843e-06, + "loss": 0.45362481474876404, + "mean_token_accuracy": 0.8379551768302917, + "num_tokens": 18142581.0, + "step": 2023 + }, + { + "epoch": 1.5379939209726445, + "grad_norm": 2.5984106063842773, + "learning_rate": 2.604689134322999e-06, + "loss": 0.4210243821144104, + "mean_token_accuracy": 0.8571645021438599, + "num_tokens": 18148152.0, + "step": 2024 + }, + { + "epoch": 1.5387537993920972, + "grad_norm": 1.7180702686309814, + "learning_rate": 2.602596539868292e-06, + "loss": 0.2478562295436859, + "mean_token_accuracy": 0.9227135181427002, + "num_tokens": 18155435.0, + "step": 2025 + }, + { + "epoch": 1.5395136778115501, + "grad_norm": 2.3721933364868164, + "learning_rate": 2.6005038734073833e-06, + "loss": 0.3820664584636688, + "mean_token_accuracy": 0.8788443803787231, + "num_tokens": 18161403.0, + "step": 2026 + }, + { + "epoch": 1.540273556231003, + "grad_norm": 1.4967509508132935, + "learning_rate": 2.5984111364089875e-06, + "loss": 0.34247124195098877, + "mean_token_accuracy": 0.8809049129486084, + "num_tokens": 18173724.0, + "step": 2027 + }, + { + "epoch": 1.541033434650456, + "grad_norm": 2.5226845741271973, + "learning_rate": 2.5963183303418682e-06, + "loss": 0.2647642493247986, + "mean_token_accuracy": 0.8988642692565918, + "num_tokens": 18178927.0, + "step": 2028 + }, + { + "epoch": 1.5417933130699089, + "grad_norm": 2.217228412628174, + "learning_rate": 2.594225456674837e-06, + "loss": 0.37754058837890625, + "mean_token_accuracy": 0.8660204410552979, + "num_tokens": 18185268.0, + "step": 2029 + }, + { + "epoch": 1.5425531914893615, + "grad_norm": 2.336409091949463, + "learning_rate": 2.592132516876753e-06, + "loss": 0.45098528265953064, + "mean_token_accuracy": 0.842115044593811, + "num_tokens": 18192372.0, + "step": 2030 + }, + { + "epoch": 1.5433130699088147, + "grad_norm": 3.5437142848968506, + "learning_rate": 2.5900395124165216e-06, + "loss": 0.5326460003852844, + "mean_token_accuracy": 0.8125103712081909, + "num_tokens": 18199182.0, + "step": 2031 + }, + { + "epoch": 1.5440729483282674, + "grad_norm": 1.5785651206970215, + "learning_rate": 2.5879464447630947e-06, + "loss": 0.3714991509914398, + "mean_token_accuracy": 0.8711390495300293, + "num_tokens": 18209045.0, + "step": 2032 + }, + { + "epoch": 1.5448328267477205, + "grad_norm": 2.3616182804107666, + "learning_rate": 2.5858533153854676e-06, + "loss": 0.4548399746417999, + "mean_token_accuracy": 0.8411449193954468, + "num_tokens": 18215487.0, + "step": 2033 + }, + { + "epoch": 1.5455927051671732, + "grad_norm": 2.0750479698181152, + "learning_rate": 2.583760125752679e-06, + "loss": 0.3980535566806793, + "mean_token_accuracy": 0.8603327870368958, + "num_tokens": 18222606.0, + "step": 2034 + }, + { + "epoch": 1.5463525835866263, + "grad_norm": 2.609295129776001, + "learning_rate": 2.58166687733381e-06, + "loss": 0.40177756547927856, + "mean_token_accuracy": 0.8652099370956421, + "num_tokens": 18227341.0, + "step": 2035 + }, + { + "epoch": 1.547112462006079, + "grad_norm": 2.1621339321136475, + "learning_rate": 2.5795735715979826e-06, + "loss": 0.45104342699050903, + "mean_token_accuracy": 0.8481369018554688, + "num_tokens": 18235820.0, + "step": 2036 + }, + { + "epoch": 1.547872340425532, + "grad_norm": 1.0381370782852173, + "learning_rate": 2.577480210014359e-06, + "loss": 0.32621103525161743, + "mean_token_accuracy": 0.8867391347885132, + "num_tokens": 18258307.0, + "step": 2037 + }, + { + "epoch": 1.5486322188449848, + "grad_norm": 1.7634375095367432, + "learning_rate": 2.575386794052142e-06, + "loss": 0.5115169882774353, + "mean_token_accuracy": 0.818779468536377, + "num_tokens": 18272782.0, + "step": 2038 + }, + { + "epoch": 1.5493920972644377, + "grad_norm": 1.874875545501709, + "learning_rate": 2.5732933251805716e-06, + "loss": 0.4381459951400757, + "mean_token_accuracy": 0.8594684600830078, + "num_tokens": 18282618.0, + "step": 2039 + }, + { + "epoch": 1.5501519756838906, + "grad_norm": 2.1316351890563965, + "learning_rate": 2.571199804868923e-06, + "loss": 0.5410124063491821, + "mean_token_accuracy": 0.8247587084770203, + "num_tokens": 18289750.0, + "step": 2040 + }, + { + "epoch": 1.5509118541033433, + "grad_norm": 1.7574573755264282, + "learning_rate": 2.569106234586511e-06, + "loss": 0.29967373609542847, + "mean_token_accuracy": 0.8913218975067139, + "num_tokens": 18298110.0, + "step": 2041 + }, + { + "epoch": 1.5516717325227964, + "grad_norm": 1.929626703262329, + "learning_rate": 2.5670126158026843e-06, + "loss": 0.3287760019302368, + "mean_token_accuracy": 0.8870488405227661, + "num_tokens": 18305702.0, + "step": 2042 + }, + { + "epoch": 1.5524316109422491, + "grad_norm": 3.020153284072876, + "learning_rate": 2.5649189499868233e-06, + "loss": 0.38523542881011963, + "mean_token_accuracy": 0.854824960231781, + "num_tokens": 18309830.0, + "step": 2043 + }, + { + "epoch": 1.5531914893617023, + "grad_norm": 1.6378421783447266, + "learning_rate": 2.5628252386083443e-06, + "loss": 0.47371378540992737, + "mean_token_accuracy": 0.8627713918685913, + "num_tokens": 18322820.0, + "step": 2044 + }, + { + "epoch": 1.553951367781155, + "grad_norm": 1.3711130619049072, + "learning_rate": 2.560731483136694e-06, + "loss": 0.3319293260574341, + "mean_token_accuracy": 0.8704103231430054, + "num_tokens": 18335074.0, + "step": 2045 + }, + { + "epoch": 1.5547112462006079, + "grad_norm": 1.7589185237884521, + "learning_rate": 2.558637685041352e-06, + "loss": 0.4446021020412445, + "mean_token_accuracy": 0.8446722626686096, + "num_tokens": 18344115.0, + "step": 2046 + }, + { + "epoch": 1.5554711246200608, + "grad_norm": 2.5249195098876953, + "learning_rate": 2.5565438457918247e-06, + "loss": 0.4625541865825653, + "mean_token_accuracy": 0.8451195359230042, + "num_tokens": 18349235.0, + "step": 2047 + }, + { + "epoch": 1.5562310030395137, + "grad_norm": 1.0562543869018555, + "learning_rate": 2.5544499668576508e-06, + "loss": 0.33747735619544983, + "mean_token_accuracy": 0.8503615856170654, + "num_tokens": 18368253.0, + "step": 2048 + }, + { + "epoch": 1.5569908814589666, + "grad_norm": 2.9451215267181396, + "learning_rate": 2.5523560497083927e-06, + "loss": 0.3958815932273865, + "mean_token_accuracy": 0.8393744826316833, + "num_tokens": 18372887.0, + "step": 2049 + }, + { + "epoch": 1.5577507598784195, + "grad_norm": 1.3597660064697266, + "learning_rate": 2.5502620958136444e-06, + "loss": 0.46281275153160095, + "mean_token_accuracy": 0.8269470930099487, + "num_tokens": 18388074.0, + "step": 2050 + }, + { + "epoch": 1.5585106382978724, + "grad_norm": 3.269068717956543, + "learning_rate": 2.548168106643022e-06, + "loss": 0.2309008538722992, + "mean_token_accuracy": 0.9178205728530884, + "num_tokens": 18391406.0, + "step": 2051 + }, + { + "epoch": 1.559270516717325, + "grad_norm": 2.1459391117095947, + "learning_rate": 2.546074083666169e-06, + "loss": 0.4006733298301697, + "mean_token_accuracy": 0.8631902933120728, + "num_tokens": 18397497.0, + "step": 2052 + }, + { + "epoch": 1.5600303951367782, + "grad_norm": 1.4614566564559937, + "learning_rate": 2.5439800283527495e-06, + "loss": 0.40810418128967285, + "mean_token_accuracy": 0.8473483920097351, + "num_tokens": 18409474.0, + "step": 2053 + }, + { + "epoch": 1.560790273556231, + "grad_norm": 2.084808826446533, + "learning_rate": 2.541885942172454e-06, + "loss": 0.34967708587646484, + "mean_token_accuracy": 0.8707003593444824, + "num_tokens": 18416400.0, + "step": 2054 + }, + { + "epoch": 1.561550151975684, + "grad_norm": 1.90664541721344, + "learning_rate": 2.539791826594991e-06, + "loss": 0.37694251537323, + "mean_token_accuracy": 0.8704941272735596, + "num_tokens": 18424206.0, + "step": 2055 + }, + { + "epoch": 1.5623100303951367, + "grad_norm": 1.880176305770874, + "learning_rate": 2.537697683090093e-06, + "loss": 0.32510411739349365, + "mean_token_accuracy": 0.8848961591720581, + "num_tokens": 18431676.0, + "step": 2056 + }, + { + "epoch": 1.5630699088145896, + "grad_norm": 2.133375406265259, + "learning_rate": 2.5356035131275096e-06, + "loss": 0.30538493394851685, + "mean_token_accuracy": 0.8890067338943481, + "num_tokens": 18438014.0, + "step": 2057 + }, + { + "epoch": 1.5638297872340425, + "grad_norm": 2.3495655059814453, + "learning_rate": 2.5335093181770105e-06, + "loss": 0.3126775324344635, + "mean_token_accuracy": 0.8865689039230347, + "num_tokens": 18443604.0, + "step": 2058 + }, + { + "epoch": 1.5645896656534954, + "grad_norm": 2.37949538230896, + "learning_rate": 2.531415099708382e-06, + "loss": 0.3257793188095093, + "mean_token_accuracy": 0.8809669017791748, + "num_tokens": 18448654.0, + "step": 2059 + }, + { + "epoch": 1.5653495440729484, + "grad_norm": 1.8285472393035889, + "learning_rate": 2.5293208591914265e-06, + "loss": 0.32376936078071594, + "mean_token_accuracy": 0.8816431760787964, + "num_tokens": 18456619.0, + "step": 2060 + }, + { + "epoch": 1.5661094224924013, + "grad_norm": 2.3238534927368164, + "learning_rate": 2.5272265980959644e-06, + "loss": 0.40366506576538086, + "mean_token_accuracy": 0.8496750593185425, + "num_tokens": 18462788.0, + "step": 2061 + }, + { + "epoch": 1.5668693009118542, + "grad_norm": 1.8954942226409912, + "learning_rate": 2.525132317891827e-06, + "loss": 0.3405473828315735, + "mean_token_accuracy": 0.8849360942840576, + "num_tokens": 18470719.0, + "step": 2062 + }, + { + "epoch": 1.5676291793313069, + "grad_norm": 1.6268190145492554, + "learning_rate": 2.523038020048861e-06, + "loss": 0.3662685751914978, + "mean_token_accuracy": 0.8865662813186646, + "num_tokens": 18482095.0, + "step": 2063 + }, + { + "epoch": 1.56838905775076, + "grad_norm": 2.5198733806610107, + "learning_rate": 2.5209437060369266e-06, + "loss": 0.3968311548233032, + "mean_token_accuracy": 0.8643308281898499, + "num_tokens": 18488069.0, + "step": 2064 + }, + { + "epoch": 1.5691489361702127, + "grad_norm": 2.9197335243225098, + "learning_rate": 2.518849377325893e-06, + "loss": 0.24738386273384094, + "mean_token_accuracy": 0.91959547996521, + "num_tokens": 18491762.0, + "step": 2065 + }, + { + "epoch": 1.5699088145896658, + "grad_norm": 1.5914254188537598, + "learning_rate": 2.51675503538564e-06, + "loss": 0.33473581075668335, + "mean_token_accuracy": 0.8794662952423096, + "num_tokens": 18501316.0, + "step": 2066 + }, + { + "epoch": 1.5706686930091185, + "grad_norm": 2.5130460262298584, + "learning_rate": 2.5146606816860597e-06, + "loss": 0.4067240357398987, + "mean_token_accuracy": 0.8564209342002869, + "num_tokens": 18507169.0, + "step": 2067 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 2.093353509902954, + "learning_rate": 2.5125663176970475e-06, + "loss": 0.4312136769294739, + "mean_token_accuracy": 0.8540225028991699, + "num_tokens": 18514536.0, + "step": 2068 + }, + { + "epoch": 1.5721884498480243, + "grad_norm": 1.284495234489441, + "learning_rate": 2.5104719448885103e-06, + "loss": 0.3813856542110443, + "mean_token_accuracy": 0.8435653448104858, + "num_tokens": 18529947.0, + "step": 2069 + }, + { + "epoch": 1.5729483282674772, + "grad_norm": 2.0383973121643066, + "learning_rate": 2.5083775647303583e-06, + "loss": 0.4428079426288605, + "mean_token_accuracy": 0.8841741681098938, + "num_tokens": 18537109.0, + "step": 2070 + }, + { + "epoch": 1.5737082066869301, + "grad_norm": 1.7991697788238525, + "learning_rate": 2.5062831786925102e-06, + "loss": 0.460052490234375, + "mean_token_accuracy": 0.8459943532943726, + "num_tokens": 18547108.0, + "step": 2071 + }, + { + "epoch": 1.574468085106383, + "grad_norm": 2.2168822288513184, + "learning_rate": 2.5041887882448845e-06, + "loss": 0.2863885462284088, + "mean_token_accuracy": 0.906816840171814, + "num_tokens": 18552357.0, + "step": 2072 + }, + { + "epoch": 1.575227963525836, + "grad_norm": 3.918499708175659, + "learning_rate": 2.5020943948574056e-06, + "loss": 0.3439999222755432, + "mean_token_accuracy": 0.8742123246192932, + "num_tokens": 18555272.0, + "step": 2073 + }, + { + "epoch": 1.5759878419452886, + "grad_norm": 1.773869514465332, + "learning_rate": 2.5e-06, + "loss": 0.2815646827220917, + "mean_token_accuracy": 0.8939872980117798, + "num_tokens": 18562989.0, + "step": 2074 + }, + { + "epoch": 1.5767477203647418, + "grad_norm": 1.8675572872161865, + "learning_rate": 2.497905605142595e-06, + "loss": 0.5005829930305481, + "mean_token_accuracy": 0.8242729902267456, + "num_tokens": 18575587.0, + "step": 2075 + }, + { + "epoch": 1.5775075987841944, + "grad_norm": 2.3143508434295654, + "learning_rate": 2.4958112117551163e-06, + "loss": 0.42472895979881287, + "mean_token_accuracy": 0.8540043830871582, + "num_tokens": 18581666.0, + "step": 2076 + }, + { + "epoch": 1.5782674772036476, + "grad_norm": 2.529740333557129, + "learning_rate": 2.4937168213074906e-06, + "loss": 0.24539905786514282, + "mean_token_accuracy": 0.9041235446929932, + "num_tokens": 18585773.0, + "step": 2077 + }, + { + "epoch": 1.5790273556231003, + "grad_norm": 2.5188395977020264, + "learning_rate": 2.491622435269642e-06, + "loss": 0.23059265315532684, + "mean_token_accuracy": 0.9204603433609009, + "num_tokens": 18589915.0, + "step": 2078 + }, + { + "epoch": 1.5797872340425532, + "grad_norm": 2.7752444744110107, + "learning_rate": 2.489528055111491e-06, + "loss": 0.452225923538208, + "mean_token_accuracy": 0.8444918990135193, + "num_tokens": 18595488.0, + "step": 2079 + }, + { + "epoch": 1.580547112462006, + "grad_norm": 1.174774408340454, + "learning_rate": 2.487433682302953e-06, + "loss": 0.3399246633052826, + "mean_token_accuracy": 0.8608446717262268, + "num_tokens": 18613756.0, + "step": 2080 + }, + { + "epoch": 1.581306990881459, + "grad_norm": 1.515575647354126, + "learning_rate": 2.485339318313941e-06, + "loss": 0.45886170864105225, + "mean_token_accuracy": 0.8479131460189819, + "num_tokens": 18629610.0, + "step": 2081 + }, + { + "epoch": 1.582066869300912, + "grad_norm": 1.7039403915405273, + "learning_rate": 2.4832449646143605e-06, + "loss": 0.349803626537323, + "mean_token_accuracy": 0.8721815347671509, + "num_tokens": 18637523.0, + "step": 2082 + }, + { + "epoch": 1.5828267477203646, + "grad_norm": 3.2289421558380127, + "learning_rate": 2.4811506226741077e-06, + "loss": 0.4967171549797058, + "mean_token_accuracy": 0.8303675651550293, + "num_tokens": 18641826.0, + "step": 2083 + }, + { + "epoch": 1.5835866261398177, + "grad_norm": 1.71235990524292, + "learning_rate": 2.4790562939630738e-06, + "loss": 0.4202485680580139, + "mean_token_accuracy": 0.8581224679946899, + "num_tokens": 18653146.0, + "step": 2084 + }, + { + "epoch": 1.5843465045592704, + "grad_norm": 1.710036277770996, + "learning_rate": 2.4769619799511392e-06, + "loss": 0.3942421078681946, + "mean_token_accuracy": 0.8553562164306641, + "num_tokens": 18663826.0, + "step": 2085 + }, + { + "epoch": 1.5851063829787235, + "grad_norm": 1.464859127998352, + "learning_rate": 2.474867682108174e-06, + "loss": 0.4093329906463623, + "mean_token_accuracy": 0.8598780632019043, + "num_tokens": 18675325.0, + "step": 2086 + }, + { + "epoch": 1.5858662613981762, + "grad_norm": 2.083707809448242, + "learning_rate": 2.472773401904037e-06, + "loss": 0.4252093434333801, + "mean_token_accuracy": 0.8433356881141663, + "num_tokens": 18682416.0, + "step": 2087 + }, + { + "epoch": 1.5866261398176293, + "grad_norm": 1.5577973127365112, + "learning_rate": 2.470679140808574e-06, + "loss": 0.3680085241794586, + "mean_token_accuracy": 0.8609116077423096, + "num_tokens": 18694445.0, + "step": 2088 + }, + { + "epoch": 1.587386018237082, + "grad_norm": 2.1617276668548584, + "learning_rate": 2.4685849002916184e-06, + "loss": 0.40488749742507935, + "mean_token_accuracy": 0.8429721593856812, + "num_tokens": 18701204.0, + "step": 2089 + }, + { + "epoch": 1.588145896656535, + "grad_norm": 2.046678304672241, + "learning_rate": 2.4664906818229903e-06, + "loss": 0.329141229391098, + "mean_token_accuracy": 0.8830771446228027, + "num_tokens": 18708354.0, + "step": 2090 + }, + { + "epoch": 1.5889057750759878, + "grad_norm": 2.7741200923919678, + "learning_rate": 2.4643964868724916e-06, + "loss": 0.42294493317604065, + "mean_token_accuracy": 0.8612706065177917, + "num_tokens": 18713017.0, + "step": 2091 + }, + { + "epoch": 1.5896656534954408, + "grad_norm": 2.085151433944702, + "learning_rate": 2.4623023169099074e-06, + "loss": 0.39038220047950745, + "mean_token_accuracy": 0.861169695854187, + "num_tokens": 18721423.0, + "step": 2092 + }, + { + "epoch": 1.5904255319148937, + "grad_norm": 2.8721165657043457, + "learning_rate": 2.4602081734050093e-06, + "loss": 0.27753859758377075, + "mean_token_accuracy": 0.8959167003631592, + "num_tokens": 18725044.0, + "step": 2093 + }, + { + "epoch": 1.5911854103343464, + "grad_norm": 1.7388207912445068, + "learning_rate": 2.4581140578275473e-06, + "loss": 0.3570033311843872, + "mean_token_accuracy": 0.8715590238571167, + "num_tokens": 18733891.0, + "step": 2094 + }, + { + "epoch": 1.5919452887537995, + "grad_norm": 2.3645241260528564, + "learning_rate": 2.456019971647251e-06, + "loss": 0.38982006907463074, + "mean_token_accuracy": 0.8734139800071716, + "num_tokens": 18740464.0, + "step": 2095 + }, + { + "epoch": 1.5927051671732522, + "grad_norm": 3.674072027206421, + "learning_rate": 2.4539259163338317e-06, + "loss": 0.4068281650543213, + "mean_token_accuracy": 0.8397839069366455, + "num_tokens": 18744857.0, + "step": 2096 + }, + { + "epoch": 1.5934650455927053, + "grad_norm": 1.8209186792373657, + "learning_rate": 2.4518318933569786e-06, + "loss": 0.3471015691757202, + "mean_token_accuracy": 0.8709044456481934, + "num_tokens": 18752414.0, + "step": 2097 + }, + { + "epoch": 1.594224924012158, + "grad_norm": 1.8138704299926758, + "learning_rate": 2.449737904186357e-06, + "loss": 0.3438487648963928, + "mean_token_accuracy": 0.8766711950302124, + "num_tokens": 18760587.0, + "step": 2098 + }, + { + "epoch": 1.594984802431611, + "grad_norm": 1.7893842458724976, + "learning_rate": 2.447643950291608e-06, + "loss": 0.43519508838653564, + "mean_token_accuracy": 0.8682907819747925, + "num_tokens": 18770293.0, + "step": 2099 + }, + { + "epoch": 1.5957446808510638, + "grad_norm": 1.4305094480514526, + "learning_rate": 2.4455500331423505e-06, + "loss": 0.37106508016586304, + "mean_token_accuracy": 0.8611354827880859, + "num_tokens": 18782456.0, + "step": 2100 + }, + { + "epoch": 1.5965045592705167, + "grad_norm": 2.0797057151794434, + "learning_rate": 2.4434561542081765e-06, + "loss": 0.43942689895629883, + "mean_token_accuracy": 0.8477288484573364, + "num_tokens": 18789547.0, + "step": 2101 + }, + { + "epoch": 1.5972644376899696, + "grad_norm": 1.2983288764953613, + "learning_rate": 2.441362314958649e-06, + "loss": 0.46385765075683594, + "mean_token_accuracy": 0.8340978622436523, + "num_tokens": 18809456.0, + "step": 2102 + }, + { + "epoch": 1.5980243161094225, + "grad_norm": 2.60866641998291, + "learning_rate": 2.439268516863306e-06, + "loss": 0.3106239140033722, + "mean_token_accuracy": 0.8859497308731079, + "num_tokens": 18813781.0, + "step": 2103 + }, + { + "epoch": 1.5987841945288754, + "grad_norm": 3.389376163482666, + "learning_rate": 2.4371747613916566e-06, + "loss": 0.44926169514656067, + "mean_token_accuracy": 0.8664819002151489, + "num_tokens": 18817666.0, + "step": 2104 + }, + { + "epoch": 1.5995440729483281, + "grad_norm": 3.3417351245880127, + "learning_rate": 2.4350810500131776e-06, + "loss": 0.4786076545715332, + "mean_token_accuracy": 0.8357523679733276, + "num_tokens": 18823717.0, + "step": 2105 + }, + { + "epoch": 1.6003039513677813, + "grad_norm": 1.5215197801589966, + "learning_rate": 2.4329873841973174e-06, + "loss": 0.4123923182487488, + "mean_token_accuracy": 0.853337287902832, + "num_tokens": 18835163.0, + "step": 2106 + }, + { + "epoch": 1.601063829787234, + "grad_norm": 1.8798415660858154, + "learning_rate": 2.4308937654134893e-06, + "loss": 0.45594000816345215, + "mean_token_accuracy": 0.8553717732429504, + "num_tokens": 18843923.0, + "step": 2107 + }, + { + "epoch": 1.601823708206687, + "grad_norm": 2.1012487411499023, + "learning_rate": 2.428800195131078e-06, + "loss": 0.4340161085128784, + "mean_token_accuracy": 0.8448120355606079, + "num_tokens": 18851852.0, + "step": 2108 + }, + { + "epoch": 1.6025835866261398, + "grad_norm": 2.827080726623535, + "learning_rate": 2.4267066748194297e-06, + "loss": 0.25922513008117676, + "mean_token_accuracy": 0.9024698734283447, + "num_tokens": 18856113.0, + "step": 2109 + }, + { + "epoch": 1.6033434650455927, + "grad_norm": 1.641032338142395, + "learning_rate": 2.4246132059478582e-06, + "loss": 0.591558575630188, + "mean_token_accuracy": 0.7960667610168457, + "num_tokens": 18870618.0, + "step": 2110 + }, + { + "epoch": 1.6041033434650456, + "grad_norm": 2.600771188735962, + "learning_rate": 2.4225197899856416e-06, + "loss": 0.382815957069397, + "mean_token_accuracy": 0.8654585480690002, + "num_tokens": 18875456.0, + "step": 2111 + }, + { + "epoch": 1.6048632218844985, + "grad_norm": 1.5125449895858765, + "learning_rate": 2.4204264284020182e-06, + "loss": 0.4643454849720001, + "mean_token_accuracy": 0.837038516998291, + "num_tokens": 18887979.0, + "step": 2112 + }, + { + "epoch": 1.6056231003039514, + "grad_norm": 1.7571941614151, + "learning_rate": 2.4183331226661913e-06, + "loss": 0.30713701248168945, + "mean_token_accuracy": 0.8856921195983887, + "num_tokens": 18896143.0, + "step": 2113 + }, + { + "epoch": 1.6063829787234043, + "grad_norm": 2.124593496322632, + "learning_rate": 2.4162398742473216e-06, + "loss": 0.2873607575893402, + "mean_token_accuracy": 0.8986717462539673, + "num_tokens": 18902364.0, + "step": 2114 + }, + { + "epoch": 1.6071428571428572, + "grad_norm": 2.3496272563934326, + "learning_rate": 2.4141466846145332e-06, + "loss": 0.33715200424194336, + "mean_token_accuracy": 0.8816461563110352, + "num_tokens": 18908038.0, + "step": 2115 + }, + { + "epoch": 1.60790273556231, + "grad_norm": 1.2783573865890503, + "learning_rate": 2.4120535552369057e-06, + "loss": 0.45153388381004333, + "mean_token_accuracy": 0.8345640897750854, + "num_tokens": 18926687.0, + "step": 2116 + }, + { + "epoch": 1.608662613981763, + "grad_norm": 2.1481080055236816, + "learning_rate": 2.4099604875834796e-06, + "loss": 0.43976694345474243, + "mean_token_accuracy": 0.847899317741394, + "num_tokens": 18932974.0, + "step": 2117 + }, + { + "epoch": 1.6094224924012157, + "grad_norm": 1.8669065237045288, + "learning_rate": 2.407867483123248e-06, + "loss": 0.4649358093738556, + "mean_token_accuracy": 0.8310785293579102, + "num_tokens": 18942551.0, + "step": 2118 + }, + { + "epoch": 1.6101823708206688, + "grad_norm": 2.7667746543884277, + "learning_rate": 2.4057745433251637e-06, + "loss": 0.4542210102081299, + "mean_token_accuracy": 0.8450086116790771, + "num_tokens": 18947525.0, + "step": 2119 + }, + { + "epoch": 1.6109422492401215, + "grad_norm": 2.2865076065063477, + "learning_rate": 2.4036816696581326e-06, + "loss": 0.34291431307792664, + "mean_token_accuracy": 0.8741394281387329, + "num_tokens": 18952967.0, + "step": 2120 + }, + { + "epoch": 1.6117021276595744, + "grad_norm": 3.055197238922119, + "learning_rate": 2.401588863591013e-06, + "loss": 0.4686807692050934, + "mean_token_accuracy": 0.8440030217170715, + "num_tokens": 18958257.0, + "step": 2121 + }, + { + "epoch": 1.6124620060790273, + "grad_norm": 2.268456220626831, + "learning_rate": 2.3994961265926166e-06, + "loss": 0.440069317817688, + "mean_token_accuracy": 0.8534891605377197, + "num_tokens": 18964745.0, + "step": 2122 + }, + { + "epoch": 1.6132218844984803, + "grad_norm": 2.061185359954834, + "learning_rate": 2.3974034601317085e-06, + "loss": 0.4383159279823303, + "mean_token_accuracy": 0.8484808802604675, + "num_tokens": 18972136.0, + "step": 2123 + }, + { + "epoch": 1.6139817629179332, + "grad_norm": 1.5121275186538696, + "learning_rate": 2.3953108656770018e-06, + "loss": 0.42403632402420044, + "mean_token_accuracy": 0.8467602133750916, + "num_tokens": 18985353.0, + "step": 2124 + }, + { + "epoch": 1.614741641337386, + "grad_norm": 1.9965397119522095, + "learning_rate": 2.3932183446971584e-06, + "loss": 0.3915751576423645, + "mean_token_accuracy": 0.8622956275939941, + "num_tokens": 18992017.0, + "step": 2125 + }, + { + "epoch": 1.615501519756839, + "grad_norm": 1.6688618659973145, + "learning_rate": 2.3911258986607907e-06, + "loss": 0.468288391828537, + "mean_token_accuracy": 0.8372251987457275, + "num_tokens": 19001930.0, + "step": 2126 + }, + { + "epoch": 1.6162613981762917, + "grad_norm": 1.8984699249267578, + "learning_rate": 2.3890335290364596e-06, + "loss": 0.3082895278930664, + "mean_token_accuracy": 0.8815990686416626, + "num_tokens": 19009712.0, + "step": 2127 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 2.6934773921966553, + "learning_rate": 2.386941237292669e-06, + "loss": 0.48406022787094116, + "mean_token_accuracy": 0.8300775289535522, + "num_tokens": 19015212.0, + "step": 2128 + }, + { + "epoch": 1.6177811550151975, + "grad_norm": 1.6615487337112427, + "learning_rate": 2.3848490248978693e-06, + "loss": 0.45227736234664917, + "mean_token_accuracy": 0.8421006798744202, + "num_tokens": 19027115.0, + "step": 2129 + }, + { + "epoch": 1.6185410334346506, + "grad_norm": 1.4625248908996582, + "learning_rate": 2.3827568933204576e-06, + "loss": 0.4141014814376831, + "mean_token_accuracy": 0.8479453325271606, + "num_tokens": 19041103.0, + "step": 2130 + }, + { + "epoch": 1.6193009118541033, + "grad_norm": 1.856701135635376, + "learning_rate": 2.3806648440287715e-06, + "loss": 0.3440483808517456, + "mean_token_accuracy": 0.8978210687637329, + "num_tokens": 19048124.0, + "step": 2131 + }, + { + "epoch": 1.6200607902735562, + "grad_norm": 1.7056550979614258, + "learning_rate": 2.378572878491091e-06, + "loss": 0.4136195182800293, + "mean_token_accuracy": 0.8579289317131042, + "num_tokens": 19057113.0, + "step": 2132 + }, + { + "epoch": 1.6208206686930091, + "grad_norm": 1.4673033952713013, + "learning_rate": 2.376480998175638e-06, + "loss": 0.40176504850387573, + "mean_token_accuracy": 0.8677150011062622, + "num_tokens": 19068258.0, + "step": 2133 + }, + { + "epoch": 1.621580547112462, + "grad_norm": 2.12859845161438, + "learning_rate": 2.3743892045505764e-06, + "loss": 0.39754825830459595, + "mean_token_accuracy": 0.8486959934234619, + "num_tokens": 19075469.0, + "step": 2134 + }, + { + "epoch": 1.622340425531915, + "grad_norm": 1.474247694015503, + "learning_rate": 2.372297499084006e-06, + "loss": 0.3546760678291321, + "mean_token_accuracy": 0.8767229318618774, + "num_tokens": 19086744.0, + "step": 2135 + }, + { + "epoch": 1.6231003039513676, + "grad_norm": 1.9945709705352783, + "learning_rate": 2.3702058832439667e-06, + "loss": 0.4200798273086548, + "mean_token_accuracy": 0.8435655832290649, + "num_tokens": 19095903.0, + "step": 2136 + }, + { + "epoch": 1.6238601823708207, + "grad_norm": 2.71991229057312, + "learning_rate": 2.368114358498434e-06, + "loss": 0.44925457239151, + "mean_token_accuracy": 0.8348450660705566, + "num_tokens": 19100864.0, + "step": 2137 + }, + { + "epoch": 1.6246200607902734, + "grad_norm": 2.817664623260498, + "learning_rate": 2.366022926315322e-06, + "loss": 0.44386279582977295, + "mean_token_accuracy": 0.8739628791809082, + "num_tokens": 19105355.0, + "step": 2138 + }, + { + "epoch": 1.6253799392097266, + "grad_norm": 1.3673229217529297, + "learning_rate": 2.3639315881624776e-06, + "loss": 0.3693230152130127, + "mean_token_accuracy": 0.8698620796203613, + "num_tokens": 19116748.0, + "step": 2139 + }, + { + "epoch": 1.6261398176291793, + "grad_norm": 2.712531805038452, + "learning_rate": 2.361840345507683e-06, + "loss": 0.4442938268184662, + "mean_token_accuracy": 0.8433241844177246, + "num_tokens": 19121437.0, + "step": 2140 + }, + { + "epoch": 1.6268996960486324, + "grad_norm": 2.2885231971740723, + "learning_rate": 2.359749199818651e-06, + "loss": 0.4021872878074646, + "mean_token_accuracy": 0.8605252504348755, + "num_tokens": 19127633.0, + "step": 2141 + }, + { + "epoch": 1.627659574468085, + "grad_norm": 1.9257299900054932, + "learning_rate": 2.3576581525630297e-06, + "loss": 0.3577788472175598, + "mean_token_accuracy": 0.8691596388816833, + "num_tokens": 19134450.0, + "step": 2142 + }, + { + "epoch": 1.628419452887538, + "grad_norm": 1.5035467147827148, + "learning_rate": 2.355567205208397e-06, + "loss": 0.3800235986709595, + "mean_token_accuracy": 0.867794394493103, + "num_tokens": 19146149.0, + "step": 2143 + }, + { + "epoch": 1.6291793313069909, + "grad_norm": 2.110445737838745, + "learning_rate": 2.353476359222259e-06, + "loss": 0.34394145011901855, + "mean_token_accuracy": 0.8777303695678711, + "num_tokens": 19152017.0, + "step": 2144 + }, + { + "epoch": 1.6299392097264438, + "grad_norm": 1.1713787317276, + "learning_rate": 2.351385616072052e-06, + "loss": 0.4060516357421875, + "mean_token_accuracy": 0.8411345481872559, + "num_tokens": 19172089.0, + "step": 2145 + }, + { + "epoch": 1.6306990881458967, + "grad_norm": 1.7600529193878174, + "learning_rate": 2.3492949772251418e-06, + "loss": 0.5299694538116455, + "mean_token_accuracy": 0.8218191862106323, + "num_tokens": 19184041.0, + "step": 2146 + }, + { + "epoch": 1.6314589665653494, + "grad_norm": 1.7126617431640625, + "learning_rate": 2.3472044441488175e-06, + "loss": 0.38628721237182617, + "mean_token_accuracy": 0.8526935577392578, + "num_tokens": 19193101.0, + "step": 2147 + }, + { + "epoch": 1.6322188449848025, + "grad_norm": 1.210344672203064, + "learning_rate": 2.345114018310295e-06, + "loss": 0.2732373774051666, + "mean_token_accuracy": 0.8903822898864746, + "num_tokens": 19206697.0, + "step": 2148 + }, + { + "epoch": 1.6329787234042552, + "grad_norm": 1.6693075895309448, + "learning_rate": 2.3430237011767166e-06, + "loss": 0.3472709655761719, + "mean_token_accuracy": 0.8767187595367432, + "num_tokens": 19217008.0, + "step": 2149 + }, + { + "epoch": 1.6337386018237083, + "grad_norm": 1.5242515802383423, + "learning_rate": 2.3409334942151485e-06, + "loss": 0.4345507025718689, + "mean_token_accuracy": 0.8481311202049255, + "num_tokens": 19231573.0, + "step": 2150 + }, + { + "epoch": 1.634498480243161, + "grad_norm": 2.470122814178467, + "learning_rate": 2.3388433988925767e-06, + "loss": 0.4453052878379822, + "mean_token_accuracy": 0.8411355018615723, + "num_tokens": 19237076.0, + "step": 2151 + }, + { + "epoch": 1.635258358662614, + "grad_norm": 2.4177467823028564, + "learning_rate": 2.3367534166759105e-06, + "loss": 0.454534113407135, + "mean_token_accuracy": 0.8635509014129639, + "num_tokens": 19242890.0, + "step": 2152 + }, + { + "epoch": 1.6360182370820668, + "grad_norm": 2.8036744594573975, + "learning_rate": 2.3346635490319815e-06, + "loss": 0.4396413564682007, + "mean_token_accuracy": 0.8491836786270142, + "num_tokens": 19247492.0, + "step": 2153 + }, + { + "epoch": 1.6367781155015197, + "grad_norm": 1.9286335706710815, + "learning_rate": 2.3325737974275382e-06, + "loss": 0.34988659620285034, + "mean_token_accuracy": 0.8704243898391724, + "num_tokens": 19254966.0, + "step": 2154 + }, + { + "epoch": 1.6375379939209727, + "grad_norm": 1.8929904699325562, + "learning_rate": 2.3304841633292487e-06, + "loss": 0.4195491671562195, + "mean_token_accuracy": 0.857181966304779, + "num_tokens": 19263324.0, + "step": 2155 + }, + { + "epoch": 1.6382978723404256, + "grad_norm": 2.2598466873168945, + "learning_rate": 2.328394648203698e-06, + "loss": 0.37977826595306396, + "mean_token_accuracy": 0.8626722097396851, + "num_tokens": 19269363.0, + "step": 2156 + }, + { + "epoch": 1.6390577507598785, + "grad_norm": 1.8118126392364502, + "learning_rate": 2.32630525351739e-06, + "loss": 0.3532063364982605, + "mean_token_accuracy": 0.8677854537963867, + "num_tokens": 19277360.0, + "step": 2157 + }, + { + "epoch": 1.6398176291793312, + "grad_norm": 1.5216798782348633, + "learning_rate": 2.324215980736741e-06, + "loss": 0.38609349727630615, + "mean_token_accuracy": 0.8685325980186462, + "num_tokens": 19292159.0, + "step": 2158 + }, + { + "epoch": 1.6405775075987843, + "grad_norm": 3.0511462688446045, + "learning_rate": 2.3221268313280836e-06, + "loss": 0.21988365054130554, + "mean_token_accuracy": 0.9172534942626953, + "num_tokens": 19295735.0, + "step": 2159 + }, + { + "epoch": 1.641337386018237, + "grad_norm": 1.957828164100647, + "learning_rate": 2.320037806757662e-06, + "loss": 0.3868909478187561, + "mean_token_accuracy": 0.8605331182479858, + "num_tokens": 19303287.0, + "step": 2160 + }, + { + "epoch": 1.64209726443769, + "grad_norm": 2.590040922164917, + "learning_rate": 2.317948908491636e-06, + "loss": 0.3940129578113556, + "mean_token_accuracy": 0.8814224004745483, + "num_tokens": 19308101.0, + "step": 2161 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 2.859248161315918, + "learning_rate": 2.315860137996074e-06, + "loss": 0.3437344431877136, + "mean_token_accuracy": 0.8789017200469971, + "num_tokens": 19313026.0, + "step": 2162 + }, + { + "epoch": 1.6436170212765957, + "grad_norm": 1.1788666248321533, + "learning_rate": 2.3137714967369544e-06, + "loss": 0.3976179361343384, + "mean_token_accuracy": 0.8383771181106567, + "num_tokens": 19331103.0, + "step": 2163 + }, + { + "epoch": 1.6443768996960486, + "grad_norm": 1.8409802913665771, + "learning_rate": 2.3116829861801687e-06, + "loss": 0.41898879408836365, + "mean_token_accuracy": 0.8575010299682617, + "num_tokens": 19340866.0, + "step": 2164 + }, + { + "epoch": 1.6451367781155015, + "grad_norm": 1.4124691486358643, + "learning_rate": 2.3095946077915115e-06, + "loss": 0.333813339471817, + "mean_token_accuracy": 0.8766071796417236, + "num_tokens": 19353673.0, + "step": 2165 + }, + { + "epoch": 1.6458966565349544, + "grad_norm": 1.76325261592865, + "learning_rate": 2.307506363036688e-06, + "loss": 0.4158991575241089, + "mean_token_accuracy": 0.8522704839706421, + "num_tokens": 19363635.0, + "step": 2166 + }, + { + "epoch": 1.6466565349544073, + "grad_norm": 1.758833885192871, + "learning_rate": 2.305418253381309e-06, + "loss": 0.298480749130249, + "mean_token_accuracy": 0.888424277305603, + "num_tokens": 19372291.0, + "step": 2167 + }, + { + "epoch": 1.6474164133738602, + "grad_norm": 1.6387488842010498, + "learning_rate": 2.3033302802908895e-06, + "loss": 0.4309447109699249, + "mean_token_accuracy": 0.8672212362289429, + "num_tokens": 19383480.0, + "step": 2168 + }, + { + "epoch": 1.648176291793313, + "grad_norm": 1.5251084566116333, + "learning_rate": 2.301242445230851e-06, + "loss": 0.44890880584716797, + "mean_token_accuracy": 0.847392737865448, + "num_tokens": 19394810.0, + "step": 2169 + }, + { + "epoch": 1.648936170212766, + "grad_norm": 1.6106950044631958, + "learning_rate": 2.299154749666515e-06, + "loss": 0.4403916597366333, + "mean_token_accuracy": 0.8379756212234497, + "num_tokens": 19405551.0, + "step": 2170 + }, + { + "epoch": 1.6496960486322187, + "grad_norm": 1.4238437414169312, + "learning_rate": 2.2970671950631066e-06, + "loss": 0.4015567898750305, + "mean_token_accuracy": 0.851482629776001, + "num_tokens": 19418621.0, + "step": 2171 + }, + { + "epoch": 1.6504559270516719, + "grad_norm": 1.3026156425476074, + "learning_rate": 2.2949797828857527e-06, + "loss": 0.3680947422981262, + "mean_token_accuracy": 0.8641397953033447, + "num_tokens": 19432118.0, + "step": 2172 + }, + { + "epoch": 1.6512158054711246, + "grad_norm": 2.1265358924865723, + "learning_rate": 2.2928925145994798e-06, + "loss": 0.43980664014816284, + "mean_token_accuracy": 0.8358430862426758, + "num_tokens": 19439069.0, + "step": 2173 + }, + { + "epoch": 1.6519756838905775, + "grad_norm": 1.8399443626403809, + "learning_rate": 2.290805391669212e-06, + "loss": 0.29801061749458313, + "mean_token_accuracy": 0.8773187398910522, + "num_tokens": 19446745.0, + "step": 2174 + }, + { + "epoch": 1.6527355623100304, + "grad_norm": 1.8680047988891602, + "learning_rate": 2.2887184155597725e-06, + "loss": 0.3235543966293335, + "mean_token_accuracy": 0.8754611015319824, + "num_tokens": 19455266.0, + "step": 2175 + }, + { + "epoch": 1.6534954407294833, + "grad_norm": 2.3048481941223145, + "learning_rate": 2.286631587735883e-06, + "loss": 0.4011988043785095, + "mean_token_accuracy": 0.8531811237335205, + "num_tokens": 19461049.0, + "step": 2176 + }, + { + "epoch": 1.6542553191489362, + "grad_norm": 2.6067066192626953, + "learning_rate": 2.2845449096621583e-06, + "loss": 0.4957500696182251, + "mean_token_accuracy": 0.8255549073219299, + "num_tokens": 19466884.0, + "step": 2177 + }, + { + "epoch": 1.655015197568389, + "grad_norm": 1.5211488008499146, + "learning_rate": 2.282458382803109e-06, + "loss": 0.32245099544525146, + "mean_token_accuracy": 0.8865629434585571, + "num_tokens": 19477294.0, + "step": 2178 + }, + { + "epoch": 1.655775075987842, + "grad_norm": 2.245542526245117, + "learning_rate": 2.280372008623142e-06, + "loss": 0.3790864944458008, + "mean_token_accuracy": 0.8766552209854126, + "num_tokens": 19483385.0, + "step": 2179 + }, + { + "epoch": 1.6565349544072947, + "grad_norm": 2.1158151626586914, + "learning_rate": 2.2782857885865538e-06, + "loss": 0.4726812243461609, + "mean_token_accuracy": 0.8384029865264893, + "num_tokens": 19491367.0, + "step": 2180 + }, + { + "epoch": 1.6572948328267478, + "grad_norm": 3.301389694213867, + "learning_rate": 2.2761997241575335e-06, + "loss": 0.37664809823036194, + "mean_token_accuracy": 0.8913813829421997, + "num_tokens": 19494876.0, + "step": 2181 + }, + { + "epoch": 1.6580547112462005, + "grad_norm": 2.2964162826538086, + "learning_rate": 2.274113816800161e-06, + "loss": 0.4110721945762634, + "mean_token_accuracy": 0.8551756143569946, + "num_tokens": 19500546.0, + "step": 2182 + }, + { + "epoch": 1.6588145896656536, + "grad_norm": 3.368161916732788, + "learning_rate": 2.272028067978408e-06, + "loss": 0.39089250564575195, + "mean_token_accuracy": 0.8786845207214355, + "num_tokens": 19504142.0, + "step": 2183 + }, + { + "epoch": 1.6595744680851063, + "grad_norm": 1.7299834489822388, + "learning_rate": 2.2699424791561324e-06, + "loss": 0.5205090641975403, + "mean_token_accuracy": 0.8394201993942261, + "num_tokens": 19514523.0, + "step": 2184 + }, + { + "epoch": 1.6603343465045592, + "grad_norm": 2.045919418334961, + "learning_rate": 2.267857051797081e-06, + "loss": 0.49093255400657654, + "mean_token_accuracy": 0.8338311910629272, + "num_tokens": 19522439.0, + "step": 2185 + }, + { + "epoch": 1.6610942249240122, + "grad_norm": 1.2035714387893677, + "learning_rate": 2.265771787364886e-06, + "loss": 0.37247753143310547, + "mean_token_accuracy": 0.8873692750930786, + "num_tokens": 19536717.0, + "step": 2186 + }, + { + "epoch": 1.661854103343465, + "grad_norm": 2.6186633110046387, + "learning_rate": 2.263686687323068e-06, + "loss": 0.3318040370941162, + "mean_token_accuracy": 0.8720577955245972, + "num_tokens": 19541966.0, + "step": 2187 + }, + { + "epoch": 1.662613981762918, + "grad_norm": 2.6845929622650146, + "learning_rate": 2.261601753135029e-06, + "loss": 0.32441991567611694, + "mean_token_accuracy": 0.8700553178787231, + "num_tokens": 19546644.0, + "step": 2188 + }, + { + "epoch": 1.6633738601823707, + "grad_norm": 2.078998327255249, + "learning_rate": 2.259516986264057e-06, + "loss": 0.3424156904220581, + "mean_token_accuracy": 0.8707810044288635, + "num_tokens": 19553472.0, + "step": 2189 + }, + { + "epoch": 1.6641337386018238, + "grad_norm": 2.380747079849243, + "learning_rate": 2.2574323881733202e-06, + "loss": 0.4994799494743347, + "mean_token_accuracy": 0.817003607749939, + "num_tokens": 19560502.0, + "step": 2190 + }, + { + "epoch": 1.6648936170212765, + "grad_norm": 1.2984378337860107, + "learning_rate": 2.255347960325871e-06, + "loss": 0.33139657974243164, + "mean_token_accuracy": 0.8763977289199829, + "num_tokens": 19575624.0, + "step": 2191 + }, + { + "epoch": 1.6656534954407296, + "grad_norm": 1.3232799768447876, + "learning_rate": 2.2532637041846423e-06, + "loss": 0.32994017004966736, + "mean_token_accuracy": 0.8790634274482727, + "num_tokens": 19588636.0, + "step": 2192 + }, + { + "epoch": 1.6664133738601823, + "grad_norm": 2.11212158203125, + "learning_rate": 2.2511796212124424e-06, + "loss": 0.3140082359313965, + "mean_token_accuracy": 0.8946622014045715, + "num_tokens": 19594917.0, + "step": 2193 + }, + { + "epoch": 1.6671732522796354, + "grad_norm": 2.7206521034240723, + "learning_rate": 2.2490957128719627e-06, + "loss": 0.3723612427711487, + "mean_token_accuracy": 0.8781955242156982, + "num_tokens": 19599310.0, + "step": 2194 + }, + { + "epoch": 1.667933130699088, + "grad_norm": 2.6681952476501465, + "learning_rate": 2.247011980625771e-06, + "loss": 0.3740317225456238, + "mean_token_accuracy": 0.8780536651611328, + "num_tokens": 19604172.0, + "step": 2195 + }, + { + "epoch": 1.668693009118541, + "grad_norm": 1.8933384418487549, + "learning_rate": 2.2449284259363093e-06, + "loss": 0.3359421491622925, + "mean_token_accuracy": 0.8785334825515747, + "num_tokens": 19612030.0, + "step": 2196 + }, + { + "epoch": 1.669452887537994, + "grad_norm": 2.4779889583587646, + "learning_rate": 2.2428450502658964e-06, + "loss": 0.3724144399166107, + "mean_token_accuracy": 0.8739810585975647, + "num_tokens": 19617800.0, + "step": 2197 + }, + { + "epoch": 1.6702127659574468, + "grad_norm": 3.0661120414733887, + "learning_rate": 2.240761855076727e-06, + "loss": 0.3627531826496124, + "mean_token_accuracy": 0.865296483039856, + "num_tokens": 19621885.0, + "step": 2198 + }, + { + "epoch": 1.6709726443768997, + "grad_norm": 2.431708574295044, + "learning_rate": 2.238678841830867e-06, + "loss": 0.31396129727363586, + "mean_token_accuracy": 0.9026765823364258, + "num_tokens": 19627122.0, + "step": 2199 + }, + { + "epoch": 1.6717325227963524, + "grad_norm": 2.5498745441436768, + "learning_rate": 2.2365960119902543e-06, + "loss": 0.3193191885948181, + "mean_token_accuracy": 0.8750600218772888, + "num_tokens": 19631771.0, + "step": 2200 + }, + { + "epoch": 1.6724924012158056, + "grad_norm": 2.0419046878814697, + "learning_rate": 2.2345133670167e-06, + "loss": 0.32747960090637207, + "mean_token_accuracy": 0.8603148460388184, + "num_tokens": 19638972.0, + "step": 2201 + }, + { + "epoch": 1.6732522796352582, + "grad_norm": 2.0412306785583496, + "learning_rate": 2.232430908371885e-06, + "loss": 0.4701780676841736, + "mean_token_accuracy": 0.8318476676940918, + "num_tokens": 19647968.0, + "step": 2202 + }, + { + "epoch": 1.6740121580547114, + "grad_norm": 2.054070472717285, + "learning_rate": 2.2303486375173586e-06, + "loss": 0.33284813165664673, + "mean_token_accuracy": 0.8760920763015747, + "num_tokens": 19654032.0, + "step": 2203 + }, + { + "epoch": 1.674772036474164, + "grad_norm": 1.6053217649459839, + "learning_rate": 2.228266555914538e-06, + "loss": 0.34431374073028564, + "mean_token_accuracy": 0.8764770030975342, + "num_tokens": 19663785.0, + "step": 2204 + }, + { + "epoch": 1.675531914893617, + "grad_norm": 1.474494457244873, + "learning_rate": 2.2261846650247077e-06, + "loss": 0.3541037440299988, + "mean_token_accuracy": 0.8782497644424438, + "num_tokens": 19675498.0, + "step": 2205 + }, + { + "epoch": 1.6762917933130699, + "grad_norm": 1.9318026304244995, + "learning_rate": 2.224102966309021e-06, + "loss": 0.4291660189628601, + "mean_token_accuracy": 0.8424201607704163, + "num_tokens": 19684576.0, + "step": 2206 + }, + { + "epoch": 1.6770516717325228, + "grad_norm": 2.2150020599365234, + "learning_rate": 2.2220214612284925e-06, + "loss": 0.46187907457351685, + "mean_token_accuracy": 0.840459942817688, + "num_tokens": 19690412.0, + "step": 2207 + }, + { + "epoch": 1.6778115501519757, + "grad_norm": 1.667281150817871, + "learning_rate": 2.2199401512440037e-06, + "loss": 0.37440744042396545, + "mean_token_accuracy": 0.8694081902503967, + "num_tokens": 19699600.0, + "step": 2208 + }, + { + "epoch": 1.6785714285714286, + "grad_norm": 2.6446619033813477, + "learning_rate": 2.2178590378162957e-06, + "loss": 0.3301953077316284, + "mean_token_accuracy": 0.8992182016372681, + "num_tokens": 19704162.0, + "step": 2209 + }, + { + "epoch": 1.6793313069908815, + "grad_norm": 1.4266780614852905, + "learning_rate": 2.215778122405977e-06, + "loss": 0.3811204135417938, + "mean_token_accuracy": 0.861638069152832, + "num_tokens": 19716511.0, + "step": 2210 + }, + { + "epoch": 1.6800911854103342, + "grad_norm": 1.826087474822998, + "learning_rate": 2.2136974064735132e-06, + "loss": 0.4790012836456299, + "mean_token_accuracy": 0.8404909372329712, + "num_tokens": 19726645.0, + "step": 2211 + }, + { + "epoch": 1.6808510638297873, + "grad_norm": 1.8551808595657349, + "learning_rate": 2.2116168914792293e-06, + "loss": 0.40999075770378113, + "mean_token_accuracy": 0.8419463634490967, + "num_tokens": 19735601.0, + "step": 2212 + }, + { + "epoch": 1.68161094224924, + "grad_norm": 2.560124158859253, + "learning_rate": 2.209536578883313e-06, + "loss": 0.43428558111190796, + "mean_token_accuracy": 0.8689159750938416, + "num_tokens": 19741138.0, + "step": 2213 + }, + { + "epoch": 1.6823708206686931, + "grad_norm": 2.0154869556427, + "learning_rate": 2.207456470145807e-06, + "loss": 0.43633338809013367, + "mean_token_accuracy": 0.8646916151046753, + "num_tokens": 19751929.0, + "step": 2214 + }, + { + "epoch": 1.6831306990881458, + "grad_norm": 1.3583155870437622, + "learning_rate": 2.205376566726611e-06, + "loss": 0.3050280511379242, + "mean_token_accuracy": 0.8998798727989197, + "num_tokens": 19764012.0, + "step": 2215 + }, + { + "epoch": 1.6838905775075987, + "grad_norm": 1.266262173652649, + "learning_rate": 2.2032968700854813e-06, + "loss": 0.4039713144302368, + "mean_token_accuracy": 0.8571382164955139, + "num_tokens": 19780683.0, + "step": 2216 + }, + { + "epoch": 1.6846504559270516, + "grad_norm": 1.864356517791748, + "learning_rate": 2.2012173816820297e-06, + "loss": 0.361503541469574, + "mean_token_accuracy": 0.868161678314209, + "num_tokens": 19788907.0, + "step": 2217 + }, + { + "epoch": 1.6854103343465046, + "grad_norm": 1.320155382156372, + "learning_rate": 2.1991381029757216e-06, + "loss": 0.28228244185447693, + "mean_token_accuracy": 0.8945217132568359, + "num_tokens": 19800354.0, + "step": 2218 + }, + { + "epoch": 1.6861702127659575, + "grad_norm": 1.9706367254257202, + "learning_rate": 2.1970590354258745e-06, + "loss": 0.2849377989768982, + "mean_token_accuracy": 0.9065699577331543, + "num_tokens": 19806735.0, + "step": 2219 + }, + { + "epoch": 1.6869300911854104, + "grad_norm": 1.9150370359420776, + "learning_rate": 2.1949801804916563e-06, + "loss": 0.4125257730484009, + "mean_token_accuracy": 0.8642163872718811, + "num_tokens": 19814056.0, + "step": 2220 + }, + { + "epoch": 1.6876899696048633, + "grad_norm": 2.062589645385742, + "learning_rate": 2.19290153963209e-06, + "loss": 0.451707124710083, + "mean_token_accuracy": 0.8311163187026978, + "num_tokens": 19821263.0, + "step": 2221 + }, + { + "epoch": 1.688449848024316, + "grad_norm": 1.3959208726882935, + "learning_rate": 2.190823114306045e-06, + "loss": 0.3326707184314728, + "mean_token_accuracy": 0.9037837982177734, + "num_tokens": 19835163.0, + "step": 2222 + }, + { + "epoch": 1.689209726443769, + "grad_norm": 2.09995698928833, + "learning_rate": 2.188744905972239e-06, + "loss": 0.4144105315208435, + "mean_token_accuracy": 0.8512029051780701, + "num_tokens": 19843164.0, + "step": 2223 + }, + { + "epoch": 1.6899696048632218, + "grad_norm": 1.4759427309036255, + "learning_rate": 2.186666916089239e-06, + "loss": 0.4707002639770508, + "mean_token_accuracy": 0.8371601104736328, + "num_tokens": 19858551.0, + "step": 2224 + }, + { + "epoch": 1.690729483282675, + "grad_norm": 2.3398702144622803, + "learning_rate": 2.1845891461154604e-06, + "loss": 0.34672820568084717, + "mean_token_accuracy": 0.879936695098877, + "num_tokens": 19864348.0, + "step": 2225 + }, + { + "epoch": 1.6914893617021276, + "grad_norm": 1.6283963918685913, + "learning_rate": 2.1825115975091594e-06, + "loss": 0.31835079193115234, + "mean_token_accuracy": 0.8695961833000183, + "num_tokens": 19873560.0, + "step": 2226 + }, + { + "epoch": 1.6922492401215805, + "grad_norm": 2.035759687423706, + "learning_rate": 2.1804342717284414e-06, + "loss": 0.43110257387161255, + "mean_token_accuracy": 0.8593922853469849, + "num_tokens": 19880796.0, + "step": 2227 + }, + { + "epoch": 1.6930091185410334, + "grad_norm": 2.1340725421905518, + "learning_rate": 2.1783571702312523e-06, + "loss": 0.46967440843582153, + "mean_token_accuracy": 0.8839266300201416, + "num_tokens": 19887911.0, + "step": 2228 + }, + { + "epoch": 1.6937689969604863, + "grad_norm": 1.710340142250061, + "learning_rate": 2.176280294475383e-06, + "loss": 0.4167519807815552, + "mean_token_accuracy": 0.8526116609573364, + "num_tokens": 19896674.0, + "step": 2229 + }, + { + "epoch": 1.6945288753799392, + "grad_norm": 1.7793304920196533, + "learning_rate": 2.174203645918464e-06, + "loss": 0.3875434994697571, + "mean_token_accuracy": 0.8637192249298096, + "num_tokens": 19904825.0, + "step": 2230 + }, + { + "epoch": 1.6952887537993921, + "grad_norm": 1.7908778190612793, + "learning_rate": 2.172127226017967e-06, + "loss": 0.42065349221229553, + "mean_token_accuracy": 0.850834846496582, + "num_tokens": 19914377.0, + "step": 2231 + }, + { + "epoch": 1.696048632218845, + "grad_norm": 3.0943970680236816, + "learning_rate": 2.1700510362312053e-06, + "loss": 0.44845050573349, + "mean_token_accuracy": 0.8460367918014526, + "num_tokens": 19918929.0, + "step": 2232 + }, + { + "epoch": 1.6968085106382977, + "grad_norm": 1.5586018562316895, + "learning_rate": 2.1679750780153265e-06, + "loss": 0.4723482131958008, + "mean_token_accuracy": 0.871384859085083, + "num_tokens": 19932738.0, + "step": 2233 + }, + { + "epoch": 1.6975683890577509, + "grad_norm": 2.014230728149414, + "learning_rate": 2.1658993528273196e-06, + "loss": 0.43307146430015564, + "mean_token_accuracy": 0.8677935600280762, + "num_tokens": 19940246.0, + "step": 2234 + }, + { + "epoch": 1.6983282674772036, + "grad_norm": 1.528979778289795, + "learning_rate": 2.163823862124007e-06, + "loss": 0.3897377550601959, + "mean_token_accuracy": 0.8737689256668091, + "num_tokens": 19951187.0, + "step": 2235 + }, + { + "epoch": 1.6990881458966567, + "grad_norm": 1.9856207370758057, + "learning_rate": 2.1617486073620496e-06, + "loss": 0.4285745620727539, + "mean_token_accuracy": 0.8744081258773804, + "num_tokens": 19957768.0, + "step": 2236 + }, + { + "epoch": 1.6998480243161094, + "grad_norm": 2.130525827407837, + "learning_rate": 2.15967358999794e-06, + "loss": 0.405293732881546, + "mean_token_accuracy": 0.8588452935218811, + "num_tokens": 19965354.0, + "step": 2237 + }, + { + "epoch": 1.7006079027355623, + "grad_norm": 1.665329098701477, + "learning_rate": 2.1575988114880057e-06, + "loss": 0.42987754940986633, + "mean_token_accuracy": 0.846322238445282, + "num_tokens": 19975780.0, + "step": 2238 + }, + { + "epoch": 1.7013677811550152, + "grad_norm": 1.0725677013397217, + "learning_rate": 2.155524273288405e-06, + "loss": 0.31892159581184387, + "mean_token_accuracy": 0.8692483305931091, + "num_tokens": 19995875.0, + "step": 2239 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 2.282604455947876, + "learning_rate": 2.15344997685513e-06, + "loss": 0.4460654556751251, + "mean_token_accuracy": 0.8623759746551514, + "num_tokens": 20001466.0, + "step": 2240 + }, + { + "epoch": 1.702887537993921, + "grad_norm": 1.1385949850082397, + "learning_rate": 2.1513759236440024e-06, + "loss": 0.37046104669570923, + "mean_token_accuracy": 0.8637164831161499, + "num_tokens": 20020998.0, + "step": 2241 + }, + { + "epoch": 1.7036474164133737, + "grad_norm": 1.5521315336227417, + "learning_rate": 2.1493021151106704e-06, + "loss": 0.4526556134223938, + "mean_token_accuracy": 0.8675785064697266, + "num_tokens": 20032750.0, + "step": 2242 + }, + { + "epoch": 1.7044072948328268, + "grad_norm": 1.7777446508407593, + "learning_rate": 2.147228552710614e-06, + "loss": 0.41294580698013306, + "mean_token_accuracy": 0.8597785234451294, + "num_tokens": 20041901.0, + "step": 2243 + }, + { + "epoch": 1.7051671732522795, + "grad_norm": 1.5157700777053833, + "learning_rate": 2.145155237899139e-06, + "loss": 0.4158926010131836, + "mean_token_accuracy": 0.8512611985206604, + "num_tokens": 20053705.0, + "step": 2244 + }, + { + "epoch": 1.7059270516717326, + "grad_norm": 1.5116809606552124, + "learning_rate": 2.143082172131378e-06, + "loss": 0.43943172693252563, + "mean_token_accuracy": 0.8429899215698242, + "num_tokens": 20069468.0, + "step": 2245 + }, + { + "epoch": 1.7066869300911853, + "grad_norm": 1.6095285415649414, + "learning_rate": 2.141009356862288e-06, + "loss": 0.41325604915618896, + "mean_token_accuracy": 0.8832963705062866, + "num_tokens": 20080596.0, + "step": 2246 + }, + { + "epoch": 1.7074468085106385, + "grad_norm": 1.39210844039917, + "learning_rate": 2.138936793546649e-06, + "loss": 0.3945302963256836, + "mean_token_accuracy": 0.8698325753211975, + "num_tokens": 20094158.0, + "step": 2247 + }, + { + "epoch": 1.7082066869300911, + "grad_norm": 2.9576594829559326, + "learning_rate": 2.1368644836390684e-06, + "loss": 0.16507276892662048, + "mean_token_accuracy": 0.9410445690155029, + "num_tokens": 20097002.0, + "step": 2248 + }, + { + "epoch": 1.708966565349544, + "grad_norm": 1.7631266117095947, + "learning_rate": 2.134792428593971e-06, + "loss": 0.519780695438385, + "mean_token_accuracy": 0.8276066780090332, + "num_tokens": 20107947.0, + "step": 2249 + }, + { + "epoch": 1.709726443768997, + "grad_norm": 2.144636869430542, + "learning_rate": 2.1327206298656055e-06, + "loss": 0.32923734188079834, + "mean_token_accuracy": 0.8766019344329834, + "num_tokens": 20113676.0, + "step": 2250 + }, + { + "epoch": 1.7104863221884499, + "grad_norm": 1.9511034488677979, + "learning_rate": 2.130649088908041e-06, + "loss": 0.4043842554092407, + "mean_token_accuracy": 0.8525843620300293, + "num_tokens": 20120787.0, + "step": 2251 + }, + { + "epoch": 1.7112462006079028, + "grad_norm": 1.5001336336135864, + "learning_rate": 2.1285778071751638e-06, + "loss": 0.4800187051296234, + "mean_token_accuracy": 0.8398486375808716, + "num_tokens": 20133534.0, + "step": 2252 + }, + { + "epoch": 1.7120060790273555, + "grad_norm": 1.435195803642273, + "learning_rate": 2.126506786120678e-06, + "loss": 0.44489604234695435, + "mean_token_accuracy": 0.8444881439208984, + "num_tokens": 20151787.0, + "step": 2253 + }, + { + "epoch": 1.7127659574468086, + "grad_norm": 1.3056137561798096, + "learning_rate": 2.1244360271981073e-06, + "loss": 0.300567090511322, + "mean_token_accuracy": 0.8903113007545471, + "num_tokens": 20163390.0, + "step": 2254 + }, + { + "epoch": 1.7135258358662613, + "grad_norm": 1.7347925901412964, + "learning_rate": 2.1223655318607907e-06, + "loss": 0.30601179599761963, + "mean_token_accuracy": 0.8845717906951904, + "num_tokens": 20171354.0, + "step": 2255 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.316306471824646, + "learning_rate": 2.1202953015618794e-06, + "loss": 0.3972984552383423, + "mean_token_accuracy": 0.845410943031311, + "num_tokens": 20184464.0, + "step": 2256 + }, + { + "epoch": 1.715045592705167, + "grad_norm": 2.1052892208099365, + "learning_rate": 2.1182253377543428e-06, + "loss": 0.3357020616531372, + "mean_token_accuracy": 0.8853542804718018, + "num_tokens": 20190539.0, + "step": 2257 + }, + { + "epoch": 1.71580547112462, + "grad_norm": 1.4192553758621216, + "learning_rate": 2.116155641890959e-06, + "loss": 0.3881692588329315, + "mean_token_accuracy": 0.8442144989967346, + "num_tokens": 20204570.0, + "step": 2258 + }, + { + "epoch": 1.716565349544073, + "grad_norm": 2.134113311767578, + "learning_rate": 2.1140862154243223e-06, + "loss": 0.37803274393081665, + "mean_token_accuracy": 0.8703107237815857, + "num_tokens": 20210535.0, + "step": 2259 + }, + { + "epoch": 1.7173252279635258, + "grad_norm": 2.9149155616760254, + "learning_rate": 2.1120170598068353e-06, + "loss": 0.34860676527023315, + "mean_token_accuracy": 0.8734345436096191, + "num_tokens": 20214375.0, + "step": 2260 + }, + { + "epoch": 1.7180851063829787, + "grad_norm": 1.6855589151382446, + "learning_rate": 2.109948176490711e-06, + "loss": 0.3676984906196594, + "mean_token_accuracy": 0.8531560301780701, + "num_tokens": 20223791.0, + "step": 2261 + }, + { + "epoch": 1.7188449848024316, + "grad_norm": 2.09671950340271, + "learning_rate": 2.10787956692797e-06, + "loss": 0.41744115948677063, + "mean_token_accuracy": 0.8570001125335693, + "num_tokens": 20231254.0, + "step": 2262 + }, + { + "epoch": 1.7196048632218845, + "grad_norm": 3.148813009262085, + "learning_rate": 2.1058112325704436e-06, + "loss": 0.20556189119815826, + "mean_token_accuracy": 0.926898717880249, + "num_tokens": 20234470.0, + "step": 2263 + }, + { + "epoch": 1.7203647416413372, + "grad_norm": 1.9707107543945312, + "learning_rate": 2.103743174869769e-06, + "loss": 0.40733110904693604, + "mean_token_accuracy": 0.8740406036376953, + "num_tokens": 20242286.0, + "step": 2264 + }, + { + "epoch": 1.7211246200607904, + "grad_norm": 1.2756069898605347, + "learning_rate": 2.1016753952773867e-06, + "loss": 0.3940718173980713, + "mean_token_accuracy": 0.860906720161438, + "num_tokens": 20260382.0, + "step": 2265 + }, + { + "epoch": 1.721884498480243, + "grad_norm": 1.5074653625488281, + "learning_rate": 2.0996078952445453e-06, + "loss": 0.3353617191314697, + "mean_token_accuracy": 0.8809853792190552, + "num_tokens": 20271665.0, + "step": 2266 + }, + { + "epoch": 1.7226443768996962, + "grad_norm": 1.4331210851669312, + "learning_rate": 2.0975406762222966e-06, + "loss": 0.32260069251060486, + "mean_token_accuracy": 0.901330828666687, + "num_tokens": 20283122.0, + "step": 2267 + }, + { + "epoch": 1.7234042553191489, + "grad_norm": 2.2378969192504883, + "learning_rate": 2.095473739661494e-06, + "loss": 0.39086243510246277, + "mean_token_accuracy": 0.8681687116622925, + "num_tokens": 20289243.0, + "step": 2268 + }, + { + "epoch": 1.7241641337386018, + "grad_norm": 2.754582405090332, + "learning_rate": 2.093407087012791e-06, + "loss": 0.42927244305610657, + "mean_token_accuracy": 0.8594136834144592, + "num_tokens": 20294537.0, + "step": 2269 + }, + { + "epoch": 1.7249240121580547, + "grad_norm": 2.2721824645996094, + "learning_rate": 2.091340719726647e-06, + "loss": 0.42479783296585083, + "mean_token_accuracy": 0.8411722183227539, + "num_tokens": 20301502.0, + "step": 2270 + }, + { + "epoch": 1.7256838905775076, + "grad_norm": 2.3230299949645996, + "learning_rate": 2.089274639253317e-06, + "loss": 0.4218963384628296, + "mean_token_accuracy": 0.8498032093048096, + "num_tokens": 20307710.0, + "step": 2271 + }, + { + "epoch": 1.7264437689969605, + "grad_norm": 2.3499748706817627, + "learning_rate": 2.0872088470428553e-06, + "loss": 0.4472277760505676, + "mean_token_accuracy": 0.8487255573272705, + "num_tokens": 20313945.0, + "step": 2272 + }, + { + "epoch": 1.7272036474164134, + "grad_norm": 1.3709690570831299, + "learning_rate": 2.0851433445451142e-06, + "loss": 0.38701117038726807, + "mean_token_accuracy": 0.8592075109481812, + "num_tokens": 20328023.0, + "step": 2273 + }, + { + "epoch": 1.7279635258358663, + "grad_norm": 1.1293425559997559, + "learning_rate": 2.0830781332097446e-06, + "loss": 0.34000539779663086, + "mean_token_accuracy": 0.8779317140579224, + "num_tokens": 20346767.0, + "step": 2274 + }, + { + "epoch": 1.728723404255319, + "grad_norm": 2.9770123958587646, + "learning_rate": 2.08101321448619e-06, + "loss": 0.4437636733055115, + "mean_token_accuracy": 0.8398602604866028, + "num_tokens": 20352306.0, + "step": 2275 + }, + { + "epoch": 1.7294832826747721, + "grad_norm": 3.510955572128296, + "learning_rate": 2.0789485898236897e-06, + "loss": 0.3359706401824951, + "mean_token_accuracy": 0.8872498273849487, + "num_tokens": 20355560.0, + "step": 2276 + }, + { + "epoch": 1.7302431610942248, + "grad_norm": 2.0873279571533203, + "learning_rate": 2.076884260671276e-06, + "loss": 0.38720619678497314, + "mean_token_accuracy": 0.865881621837616, + "num_tokens": 20362802.0, + "step": 2277 + }, + { + "epoch": 1.731003039513678, + "grad_norm": 2.4871230125427246, + "learning_rate": 2.0748202284777775e-06, + "loss": 0.3250775933265686, + "mean_token_accuracy": 0.8867610692977905, + "num_tokens": 20367080.0, + "step": 2278 + }, + { + "epoch": 1.7317629179331306, + "grad_norm": 3.5603582859039307, + "learning_rate": 2.072756494691809e-06, + "loss": 0.35600754618644714, + "mean_token_accuracy": 0.8781189918518066, + "num_tokens": 20370625.0, + "step": 2279 + }, + { + "epoch": 1.7325227963525835, + "grad_norm": 2.0948755741119385, + "learning_rate": 2.070693060761779e-06, + "loss": 0.3558604419231415, + "mean_token_accuracy": 0.902066707611084, + "num_tokens": 20376835.0, + "step": 2280 + }, + { + "epoch": 1.7332826747720365, + "grad_norm": 2.391188859939575, + "learning_rate": 2.0686299281358837e-06, + "loss": 0.36596938967704773, + "mean_token_accuracy": 0.8741272687911987, + "num_tokens": 20382282.0, + "step": 2281 + }, + { + "epoch": 1.7340425531914894, + "grad_norm": 1.6906369924545288, + "learning_rate": 2.0665670982621107e-06, + "loss": 0.5241266489028931, + "mean_token_accuracy": 0.8091107606887817, + "num_tokens": 20393736.0, + "step": 2282 + }, + { + "epoch": 1.7348024316109423, + "grad_norm": 1.7578394412994385, + "learning_rate": 2.0645045725882334e-06, + "loss": 0.37041786313056946, + "mean_token_accuracy": 0.8907113075256348, + "num_tokens": 20402715.0, + "step": 2283 + }, + { + "epoch": 1.7355623100303952, + "grad_norm": 2.191727638244629, + "learning_rate": 2.0624423525618097e-06, + "loss": 0.43301627039909363, + "mean_token_accuracy": 0.8706433773040771, + "num_tokens": 20409976.0, + "step": 2284 + }, + { + "epoch": 1.736322188449848, + "grad_norm": 1.958005666732788, + "learning_rate": 2.0603804396301875e-06, + "loss": 0.29002684354782104, + "mean_token_accuracy": 0.8914110660552979, + "num_tokens": 20417099.0, + "step": 2285 + }, + { + "epoch": 1.7370820668693008, + "grad_norm": 2.477837085723877, + "learning_rate": 2.058318835240495e-06, + "loss": 0.2953898310661316, + "mean_token_accuracy": 0.8975275754928589, + "num_tokens": 20422251.0, + "step": 2286 + }, + { + "epoch": 1.737841945288754, + "grad_norm": 2.156764268875122, + "learning_rate": 2.0562575408396475e-06, + "loss": 0.4063698649406433, + "mean_token_accuracy": 0.8497642278671265, + "num_tokens": 20429338.0, + "step": 2287 + }, + { + "epoch": 1.7386018237082066, + "grad_norm": 1.6748939752578735, + "learning_rate": 2.0541965578743373e-06, + "loss": 0.3272587060928345, + "mean_token_accuracy": 0.8646700382232666, + "num_tokens": 20439680.0, + "step": 2288 + }, + { + "epoch": 1.7393617021276597, + "grad_norm": 1.9948776960372925, + "learning_rate": 2.0521358877910446e-06, + "loss": 0.36843347549438477, + "mean_token_accuracy": 0.8613901138305664, + "num_tokens": 20448492.0, + "step": 2289 + }, + { + "epoch": 1.7401215805471124, + "grad_norm": 2.231428623199463, + "learning_rate": 2.0500755320360263e-06, + "loss": 0.3905152380466461, + "mean_token_accuracy": 0.8980990052223206, + "num_tokens": 20453945.0, + "step": 2290 + }, + { + "epoch": 1.7408814589665653, + "grad_norm": 2.2187650203704834, + "learning_rate": 2.048015492055319e-06, + "loss": 0.45920854806900024, + "mean_token_accuracy": 0.8282852172851562, + "num_tokens": 20462378.0, + "step": 2291 + }, + { + "epoch": 1.7416413373860182, + "grad_norm": 2.0668466091156006, + "learning_rate": 2.045955769294737e-06, + "loss": 0.3227751553058624, + "mean_token_accuracy": 0.8805934190750122, + "num_tokens": 20469822.0, + "step": 2292 + }, + { + "epoch": 1.7424012158054711, + "grad_norm": 1.9162774085998535, + "learning_rate": 2.0438963651998747e-06, + "loss": 0.4604800343513489, + "mean_token_accuracy": 0.8441175818443298, + "num_tokens": 20479099.0, + "step": 2293 + }, + { + "epoch": 1.743161094224924, + "grad_norm": 2.645329713821411, + "learning_rate": 2.0418372812161015e-06, + "loss": 0.3239654004573822, + "mean_token_accuracy": 0.8888648748397827, + "num_tokens": 20483926.0, + "step": 2294 + }, + { + "epoch": 1.743920972644377, + "grad_norm": 1.39468514919281, + "learning_rate": 2.03977851878856e-06, + "loss": 0.4003690183162689, + "mean_token_accuracy": 0.8769714832305908, + "num_tokens": 20496501.0, + "step": 2295 + }, + { + "epoch": 1.7446808510638299, + "grad_norm": 3.509174346923828, + "learning_rate": 2.0377200793621694e-06, + "loss": 0.2948213517665863, + "mean_token_accuracy": 0.8972329497337341, + "num_tokens": 20500000.0, + "step": 2296 + }, + { + "epoch": 1.7454407294832825, + "grad_norm": 1.5033894777297974, + "learning_rate": 2.0356619643816234e-06, + "loss": 0.40694737434387207, + "mean_token_accuracy": 0.8607243895530701, + "num_tokens": 20513473.0, + "step": 2297 + }, + { + "epoch": 1.7462006079027357, + "grad_norm": 1.4324895143508911, + "learning_rate": 2.0336041752913843e-06, + "loss": 0.3899157643318176, + "mean_token_accuracy": 0.858935534954071, + "num_tokens": 20524516.0, + "step": 2298 + }, + { + "epoch": 1.7469604863221884, + "grad_norm": 2.359544277191162, + "learning_rate": 2.031546713535688e-06, + "loss": 0.369213342666626, + "mean_token_accuracy": 0.8741403818130493, + "num_tokens": 20530421.0, + "step": 2299 + }, + { + "epoch": 1.7477203647416415, + "grad_norm": 2.282637357711792, + "learning_rate": 2.029489580558542e-06, + "loss": 0.3255441188812256, + "mean_token_accuracy": 0.9045462608337402, + "num_tokens": 20535954.0, + "step": 2300 + }, + { + "epoch": 1.7484802431610942, + "grad_norm": 1.7367198467254639, + "learning_rate": 2.0274327778037204e-06, + "loss": 0.43890488147735596, + "mean_token_accuracy": 0.8494667410850525, + "num_tokens": 20548638.0, + "step": 2301 + }, + { + "epoch": 1.749240121580547, + "grad_norm": 1.6236488819122314, + "learning_rate": 2.0253763067147657e-06, + "loss": 0.4440777897834778, + "mean_token_accuracy": 0.8414230942726135, + "num_tokens": 20559263.0, + "step": 2302 + }, + { + "epoch": 1.75, + "grad_norm": 1.3755455017089844, + "learning_rate": 2.0233201687349888e-06, + "loss": 0.3473797142505646, + "mean_token_accuracy": 0.8742472529411316, + "num_tokens": 20573109.0, + "step": 2303 + }, + { + "epoch": 1.750759878419453, + "grad_norm": 3.271153688430786, + "learning_rate": 2.0212643653074677e-06, + "loss": 0.4965784549713135, + "mean_token_accuracy": 0.8596988916397095, + "num_tokens": 20578525.0, + "step": 2304 + }, + { + "epoch": 1.7515197568389058, + "grad_norm": 2.6341168880462646, + "learning_rate": 2.019208897875043e-06, + "loss": 0.37775442004203796, + "mean_token_accuracy": 0.8721816539764404, + "num_tokens": 20583641.0, + "step": 2305 + }, + { + "epoch": 1.7522796352583585, + "grad_norm": 1.8308569192886353, + "learning_rate": 2.0171537678803222e-06, + "loss": 0.3243415355682373, + "mean_token_accuracy": 0.8837124109268188, + "num_tokens": 20591725.0, + "step": 2306 + }, + { + "epoch": 1.7530395136778116, + "grad_norm": 2.4362998008728027, + "learning_rate": 2.015098976765673e-06, + "loss": 0.3738787770271301, + "mean_token_accuracy": 0.8974303007125854, + "num_tokens": 20596587.0, + "step": 2307 + }, + { + "epoch": 1.7537993920972643, + "grad_norm": 3.2920920848846436, + "learning_rate": 2.0130445259732282e-06, + "loss": 0.33901530504226685, + "mean_token_accuracy": 0.9019063115119934, + "num_tokens": 20600379.0, + "step": 2308 + }, + { + "epoch": 1.7545592705167175, + "grad_norm": 1.290475606918335, + "learning_rate": 2.01099041694488e-06, + "loss": 0.37150678038597107, + "mean_token_accuracy": 0.8542044758796692, + "num_tokens": 20614340.0, + "step": 2309 + }, + { + "epoch": 1.7553191489361701, + "grad_norm": 2.7794933319091797, + "learning_rate": 2.0089366511222815e-06, + "loss": 0.3746095895767212, + "mean_token_accuracy": 0.8653185367584229, + "num_tokens": 20622056.0, + "step": 2310 + }, + { + "epoch": 1.756079027355623, + "grad_norm": 2.2112278938293457, + "learning_rate": 2.006883229946843e-06, + "loss": 0.35793858766555786, + "mean_token_accuracy": 0.875727653503418, + "num_tokens": 20628930.0, + "step": 2311 + }, + { + "epoch": 1.756838905775076, + "grad_norm": 1.5240603685379028, + "learning_rate": 2.0048301548597365e-06, + "loss": 0.512831449508667, + "mean_token_accuracy": 0.8139172792434692, + "num_tokens": 20643159.0, + "step": 2312 + }, + { + "epoch": 1.7575987841945289, + "grad_norm": 1.810485601425171, + "learning_rate": 2.0027774273018894e-06, + "loss": 0.43870818614959717, + "mean_token_accuracy": 0.8313089609146118, + "num_tokens": 20651914.0, + "step": 2313 + }, + { + "epoch": 1.7583586626139818, + "grad_norm": 1.748178243637085, + "learning_rate": 2.0007250487139827e-06, + "loss": 0.42277514934539795, + "mean_token_accuracy": 0.8463197946548462, + "num_tokens": 20660054.0, + "step": 2314 + }, + { + "epoch": 1.7591185410334347, + "grad_norm": 1.511717677116394, + "learning_rate": 1.998673020536456e-06, + "loss": 0.38304439187049866, + "mean_token_accuracy": 0.8508470058441162, + "num_tokens": 20673371.0, + "step": 2315 + }, + { + "epoch": 1.7598784194528876, + "grad_norm": 1.7790700197219849, + "learning_rate": 1.996621344209503e-06, + "loss": 0.3838311433792114, + "mean_token_accuracy": 0.8676829934120178, + "num_tokens": 20682072.0, + "step": 2316 + }, + { + "epoch": 1.7606382978723403, + "grad_norm": 1.9128468036651611, + "learning_rate": 1.994570021173067e-06, + "loss": 0.40384364128112793, + "mean_token_accuracy": 0.8747294545173645, + "num_tokens": 20689000.0, + "step": 2317 + }, + { + "epoch": 1.7613981762917934, + "grad_norm": 3.286569118499756, + "learning_rate": 1.9925190528668455e-06, + "loss": 0.38019680976867676, + "mean_token_accuracy": 0.8678069114685059, + "num_tokens": 20692763.0, + "step": 2318 + }, + { + "epoch": 1.762158054711246, + "grad_norm": 1.6108927726745605, + "learning_rate": 1.990468440730288e-06, + "loss": 0.3144170045852661, + "mean_token_accuracy": 0.8695170879364014, + "num_tokens": 20702620.0, + "step": 2319 + }, + { + "epoch": 1.7629179331306992, + "grad_norm": 3.185225009918213, + "learning_rate": 1.9884181862025938e-06, + "loss": 0.41619348526000977, + "mean_token_accuracy": 0.8543670177459717, + "num_tokens": 20706857.0, + "step": 2320 + }, + { + "epoch": 1.763677811550152, + "grad_norm": 2.3699469566345215, + "learning_rate": 1.986368290722709e-06, + "loss": 0.5115842819213867, + "mean_token_accuracy": 0.8141909837722778, + "num_tokens": 20713997.0, + "step": 2321 + }, + { + "epoch": 1.7644376899696048, + "grad_norm": 1.4449706077575684, + "learning_rate": 1.9843187557293286e-06, + "loss": 0.419655442237854, + "mean_token_accuracy": 0.8545533418655396, + "num_tokens": 20726548.0, + "step": 2322 + }, + { + "epoch": 1.7651975683890577, + "grad_norm": 2.127614974975586, + "learning_rate": 1.9822695826608975e-06, + "loss": 0.43722522258758545, + "mean_token_accuracy": 0.8542283773422241, + "num_tokens": 20733469.0, + "step": 2323 + }, + { + "epoch": 1.7659574468085106, + "grad_norm": 3.3081557750701904, + "learning_rate": 1.9802207729556023e-06, + "loss": 0.30904972553253174, + "mean_token_accuracy": 0.8896352648735046, + "num_tokens": 20737190.0, + "step": 2324 + }, + { + "epoch": 1.7667173252279635, + "grad_norm": 2.603506326675415, + "learning_rate": 1.978172328051377e-06, + "loss": 0.30952537059783936, + "mean_token_accuracy": 0.8868587017059326, + "num_tokens": 20741780.0, + "step": 2325 + }, + { + "epoch": 1.7674772036474165, + "grad_norm": 2.576824903488159, + "learning_rate": 1.9761242493858987e-06, + "loss": 0.29593953490257263, + "mean_token_accuracy": 0.888198733329773, + "num_tokens": 20746324.0, + "step": 2326 + }, + { + "epoch": 1.7682370820668694, + "grad_norm": 1.6168320178985596, + "learning_rate": 1.9740765383965894e-06, + "loss": 0.5093998908996582, + "mean_token_accuracy": 0.8301646709442139, + "num_tokens": 20760140.0, + "step": 2327 + }, + { + "epoch": 1.768996960486322, + "grad_norm": 2.1162400245666504, + "learning_rate": 1.9720291965206097e-06, + "loss": 0.36714404821395874, + "mean_token_accuracy": 0.8699671626091003, + "num_tokens": 20766961.0, + "step": 2328 + }, + { + "epoch": 1.7697568389057752, + "grad_norm": 1.046911597251892, + "learning_rate": 1.969982225194864e-06, + "loss": 0.40783989429473877, + "mean_token_accuracy": 0.8474892377853394, + "num_tokens": 20786737.0, + "step": 2329 + }, + { + "epoch": 1.7705167173252279, + "grad_norm": 1.7059568166732788, + "learning_rate": 1.9679356258559943e-06, + "loss": 0.44083845615386963, + "mean_token_accuracy": 0.841221034526825, + "num_tokens": 20798907.0, + "step": 2330 + }, + { + "epoch": 1.771276595744681, + "grad_norm": 1.5157767534255981, + "learning_rate": 1.9658893999403847e-06, + "loss": 0.4671107828617096, + "mean_token_accuracy": 0.8252813816070557, + "num_tokens": 20814304.0, + "step": 2331 + }, + { + "epoch": 1.7720364741641337, + "grad_norm": 2.1340525150299072, + "learning_rate": 1.9638435488841543e-06, + "loss": 0.4088709354400635, + "mean_token_accuracy": 0.8595127463340759, + "num_tokens": 20821827.0, + "step": 2332 + }, + { + "epoch": 1.7727963525835866, + "grad_norm": 1.948072910308838, + "learning_rate": 1.96179807412316e-06, + "loss": 0.3692860007286072, + "mean_token_accuracy": 0.8678920269012451, + "num_tokens": 20828612.0, + "step": 2333 + }, + { + "epoch": 1.7735562310030395, + "grad_norm": 1.5731977224349976, + "learning_rate": 1.959752977092995e-06, + "loss": 0.3743135929107666, + "mean_token_accuracy": 0.8723479509353638, + "num_tokens": 20838497.0, + "step": 2334 + }, + { + "epoch": 1.7743161094224924, + "grad_norm": 1.5506012439727783, + "learning_rate": 1.957708259228987e-06, + "loss": 0.4403391182422638, + "mean_token_accuracy": 0.854604959487915, + "num_tokens": 20851603.0, + "step": 2335 + }, + { + "epoch": 1.7750759878419453, + "grad_norm": 1.154336929321289, + "learning_rate": 1.9556639219661983e-06, + "loss": 0.5281188488006592, + "mean_token_accuracy": 0.8101300001144409, + "num_tokens": 20875661.0, + "step": 2336 + }, + { + "epoch": 1.7758358662613982, + "grad_norm": 4.720771312713623, + "learning_rate": 1.9536199667394217e-06, + "loss": 0.44419822096824646, + "mean_token_accuracy": 0.8740090131759644, + "num_tokens": 20886971.0, + "step": 2337 + }, + { + "epoch": 1.7765957446808511, + "grad_norm": 1.5492230653762817, + "learning_rate": 1.9515763949831852e-06, + "loss": 0.4538637697696686, + "mean_token_accuracy": 0.8362185955047607, + "num_tokens": 20899212.0, + "step": 2338 + }, + { + "epoch": 1.7773556231003038, + "grad_norm": 1.354101538658142, + "learning_rate": 1.9495332081317466e-06, + "loss": 0.4341534376144409, + "mean_token_accuracy": 0.8380170464515686, + "num_tokens": 20913065.0, + "step": 2339 + }, + { + "epoch": 1.778115501519757, + "grad_norm": 1.5805599689483643, + "learning_rate": 1.947490407619092e-06, + "loss": 0.40928739309310913, + "mean_token_accuracy": 0.8524469137191772, + "num_tokens": 20922919.0, + "step": 2340 + }, + { + "epoch": 1.7788753799392096, + "grad_norm": 2.097221851348877, + "learning_rate": 1.945447994878937e-06, + "loss": 0.4816104769706726, + "mean_token_accuracy": 0.888654351234436, + "num_tokens": 20931350.0, + "step": 2341 + }, + { + "epoch": 1.7796352583586628, + "grad_norm": 1.7193297147750854, + "learning_rate": 1.9434059713447264e-06, + "loss": 0.44925639033317566, + "mean_token_accuracy": 0.8500319123268127, + "num_tokens": 20940546.0, + "step": 2342 + }, + { + "epoch": 1.7803951367781155, + "grad_norm": 1.5971747636795044, + "learning_rate": 1.9413643384496315e-06, + "loss": 0.29559412598609924, + "mean_token_accuracy": 0.8871279954910278, + "num_tokens": 20950604.0, + "step": 2343 + }, + { + "epoch": 1.7811550151975684, + "grad_norm": 2.788029670715332, + "learning_rate": 1.9393230976265478e-06, + "loss": 0.31713539361953735, + "mean_token_accuracy": 0.8866176605224609, + "num_tokens": 20955296.0, + "step": 2344 + }, + { + "epoch": 1.7819148936170213, + "grad_norm": 1.5747952461242676, + "learning_rate": 1.937282250308096e-06, + "loss": 0.41813358664512634, + "mean_token_accuracy": 0.8418053984642029, + "num_tokens": 20967664.0, + "step": 2345 + }, + { + "epoch": 1.7826747720364742, + "grad_norm": 2.0813145637512207, + "learning_rate": 1.935241797926623e-06, + "loss": 0.39056286215782166, + "mean_token_accuracy": 0.8601781129837036, + "num_tokens": 20975895.0, + "step": 2346 + }, + { + "epoch": 1.783434650455927, + "grad_norm": 2.143022298812866, + "learning_rate": 1.933201741914196e-06, + "loss": 0.40797823667526245, + "mean_token_accuracy": 0.8846398591995239, + "num_tokens": 20983683.0, + "step": 2347 + }, + { + "epoch": 1.78419452887538, + "grad_norm": 1.8451775312423706, + "learning_rate": 1.931162083702606e-06, + "loss": 0.34083136916160583, + "mean_token_accuracy": 0.8643462657928467, + "num_tokens": 20992621.0, + "step": 2348 + }, + { + "epoch": 1.784954407294833, + "grad_norm": 1.8603935241699219, + "learning_rate": 1.9291228247233607e-06, + "loss": 0.4860231280326843, + "mean_token_accuracy": 0.8391251564025879, + "num_tokens": 21002427.0, + "step": 2349 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 2.751711845397949, + "learning_rate": 1.9270839664076937e-06, + "loss": 0.30588358640670776, + "mean_token_accuracy": 0.8836315274238586, + "num_tokens": 21006898.0, + "step": 2350 + }, + { + "epoch": 1.7864741641337387, + "grad_norm": 1.0335345268249512, + "learning_rate": 1.9250455101865526e-06, + "loss": 0.3119634985923767, + "mean_token_accuracy": 0.8912283182144165, + "num_tokens": 21024930.0, + "step": 2351 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 2.4693806171417236, + "learning_rate": 1.9230074574906043e-06, + "loss": 0.1976669877767563, + "mean_token_accuracy": 0.928974986076355, + "num_tokens": 21029027.0, + "step": 2352 + }, + { + "epoch": 1.7879939209726445, + "grad_norm": 1.2892690896987915, + "learning_rate": 1.920969809750234e-06, + "loss": 0.46008217334747314, + "mean_token_accuracy": 0.8299605846405029, + "num_tokens": 21047671.0, + "step": 2353 + }, + { + "epoch": 1.7887537993920972, + "grad_norm": 3.162534713745117, + "learning_rate": 1.91893256839554e-06, + "loss": 0.2916071116924286, + "mean_token_accuracy": 0.8932807445526123, + "num_tokens": 21051555.0, + "step": 2354 + }, + { + "epoch": 1.7895136778115501, + "grad_norm": 1.7627713680267334, + "learning_rate": 1.916895734856338e-06, + "loss": 0.3223535120487213, + "mean_token_accuracy": 0.8852578401565552, + "num_tokens": 21060056.0, + "step": 2355 + }, + { + "epoch": 1.790273556231003, + "grad_norm": 1.9448071718215942, + "learning_rate": 1.9148593105621542e-06, + "loss": 0.3650452196598053, + "mean_token_accuracy": 0.8709862232208252, + "num_tokens": 21067190.0, + "step": 2356 + }, + { + "epoch": 1.791033434650456, + "grad_norm": 2.026644229888916, + "learning_rate": 1.9128232969422318e-06, + "loss": 0.3620566427707672, + "mean_token_accuracy": 0.865707516670227, + "num_tokens": 21075197.0, + "step": 2357 + }, + { + "epoch": 1.7917933130699089, + "grad_norm": 2.2628564834594727, + "learning_rate": 1.9107876954255217e-06, + "loss": 0.353444367647171, + "mean_token_accuracy": 0.8590385913848877, + "num_tokens": 21080823.0, + "step": 2358 + }, + { + "epoch": 1.7925531914893615, + "grad_norm": 2.5959067344665527, + "learning_rate": 1.908752507440689e-06, + "loss": 0.43711763620376587, + "mean_token_accuracy": 0.8539710640907288, + "num_tokens": 21086016.0, + "step": 2359 + }, + { + "epoch": 1.7933130699088147, + "grad_norm": 1.6228864192962646, + "learning_rate": 1.906717734416105e-06, + "loss": 0.38630396127700806, + "mean_token_accuracy": 0.8611987829208374, + "num_tokens": 21096573.0, + "step": 2360 + }, + { + "epoch": 1.7940729483282674, + "grad_norm": 1.8471404314041138, + "learning_rate": 1.9046833777798534e-06, + "loss": 0.46608641743659973, + "mean_token_accuracy": 0.8782031536102295, + "num_tokens": 21105817.0, + "step": 2361 + }, + { + "epoch": 1.7948328267477205, + "grad_norm": 2.6532235145568848, + "learning_rate": 1.9026494389597239e-06, + "loss": 0.3310372829437256, + "mean_token_accuracy": 0.8781720399856567, + "num_tokens": 21111192.0, + "step": 2362 + }, + { + "epoch": 1.7955927051671732, + "grad_norm": 2.172534942626953, + "learning_rate": 1.9006159193832124e-06, + "loss": 0.49921661615371704, + "mean_token_accuracy": 0.8215196132659912, + "num_tokens": 21117878.0, + "step": 2363 + }, + { + "epoch": 1.7963525835866263, + "grad_norm": 1.6507720947265625, + "learning_rate": 1.8985828204775206e-06, + "loss": 0.4189162850379944, + "mean_token_accuracy": 0.8520572185516357, + "num_tokens": 21128287.0, + "step": 2364 + }, + { + "epoch": 1.797112462006079, + "grad_norm": 1.5932034254074097, + "learning_rate": 1.8965501436695578e-06, + "loss": 0.45531854033470154, + "mean_token_accuracy": 0.8391242027282715, + "num_tokens": 21140605.0, + "step": 2365 + }, + { + "epoch": 1.797872340425532, + "grad_norm": 2.4680638313293457, + "learning_rate": 1.894517890385933e-06, + "loss": 0.41174983978271484, + "mean_token_accuracy": 0.8616886138916016, + "num_tokens": 21147045.0, + "step": 2366 + }, + { + "epoch": 1.7986322188449848, + "grad_norm": 1.61875319480896, + "learning_rate": 1.8924860620529594e-06, + "loss": 0.47573935985565186, + "mean_token_accuracy": 0.8347671031951904, + "num_tokens": 21157253.0, + "step": 2367 + }, + { + "epoch": 1.7993920972644377, + "grad_norm": 3.4389333724975586, + "learning_rate": 1.8904546600966539e-06, + "loss": 0.34975939989089966, + "mean_token_accuracy": 0.8915865421295166, + "num_tokens": 21160486.0, + "step": 2368 + }, + { + "epoch": 1.8001519756838906, + "grad_norm": 2.0069527626037598, + "learning_rate": 1.888423685942732e-06, + "loss": 0.379585325717926, + "mean_token_accuracy": 0.8605983257293701, + "num_tokens": 21168016.0, + "step": 2369 + }, + { + "epoch": 1.8009118541033433, + "grad_norm": 3.0740530490875244, + "learning_rate": 1.886393141016609e-06, + "loss": 0.5244829058647156, + "mean_token_accuracy": 0.8282772302627563, + "num_tokens": 21172851.0, + "step": 2370 + }, + { + "epoch": 1.8016717325227964, + "grad_norm": 1.5724968910217285, + "learning_rate": 1.8843630267434e-06, + "loss": 0.2020694762468338, + "mean_token_accuracy": 0.8882503509521484, + "num_tokens": 21179866.0, + "step": 2371 + }, + { + "epoch": 1.8024316109422491, + "grad_norm": 2.1539509296417236, + "learning_rate": 1.8823333445479175e-06, + "loss": 0.37903186678886414, + "mean_token_accuracy": 0.8525497317314148, + "num_tokens": 21186941.0, + "step": 2372 + }, + { + "epoch": 1.8031914893617023, + "grad_norm": 2.0247764587402344, + "learning_rate": 1.8803040958546708e-06, + "loss": 0.293364018201828, + "mean_token_accuracy": 0.8954306244850159, + "num_tokens": 21193659.0, + "step": 2373 + }, + { + "epoch": 1.803951367781155, + "grad_norm": 1.7034926414489746, + "learning_rate": 1.8782752820878636e-06, + "loss": 0.33828210830688477, + "mean_token_accuracy": 0.9032940864562988, + "num_tokens": 21201399.0, + "step": 2374 + }, + { + "epoch": 1.8047112462006079, + "grad_norm": 1.7864601612091064, + "learning_rate": 1.8762469046713954e-06, + "loss": 0.3165147006511688, + "mean_token_accuracy": 0.8997465372085571, + "num_tokens": 21209105.0, + "step": 2375 + }, + { + "epoch": 1.8054711246200608, + "grad_norm": 2.3371729850769043, + "learning_rate": 1.8742189650288617e-06, + "loss": 0.4036901593208313, + "mean_token_accuracy": 0.8549420833587646, + "num_tokens": 21215429.0, + "step": 2376 + }, + { + "epoch": 1.8062310030395137, + "grad_norm": 1.7922348976135254, + "learning_rate": 1.872191464583547e-06, + "loss": 0.4366671144962311, + "mean_token_accuracy": 0.8614166975021362, + "num_tokens": 21226823.0, + "step": 2377 + }, + { + "epoch": 1.8069908814589666, + "grad_norm": 2.1667943000793457, + "learning_rate": 1.8701644047584294e-06, + "loss": 0.3543647825717926, + "mean_token_accuracy": 0.9031318426132202, + "num_tokens": 21232823.0, + "step": 2378 + }, + { + "epoch": 1.8077507598784195, + "grad_norm": 1.7554421424865723, + "learning_rate": 1.868137786976177e-06, + "loss": 0.32704365253448486, + "mean_token_accuracy": 0.8990532755851746, + "num_tokens": 21242036.0, + "step": 2379 + }, + { + "epoch": 1.8085106382978724, + "grad_norm": 1.6723839044570923, + "learning_rate": 1.8661116126591492e-06, + "loss": 0.3665752410888672, + "mean_token_accuracy": 0.8828305006027222, + "num_tokens": 21251290.0, + "step": 2380 + }, + { + "epoch": 1.809270516717325, + "grad_norm": 1.5078409910202026, + "learning_rate": 1.8640858832293924e-06, + "loss": 0.368108332157135, + "mean_token_accuracy": 0.8720884323120117, + "num_tokens": 21263510.0, + "step": 2381 + }, + { + "epoch": 1.8100303951367782, + "grad_norm": 2.245553493499756, + "learning_rate": 1.8620606001086423e-06, + "loss": 0.3189915716648102, + "mean_token_accuracy": 0.9015103578567505, + "num_tokens": 21269690.0, + "step": 2382 + }, + { + "epoch": 1.810790273556231, + "grad_norm": 1.780027151107788, + "learning_rate": 1.8600357647183188e-06, + "loss": 0.40369710326194763, + "mean_token_accuracy": 0.8539618253707886, + "num_tokens": 21278523.0, + "step": 2383 + }, + { + "epoch": 1.811550151975684, + "grad_norm": 2.1727912425994873, + "learning_rate": 1.8580113784795306e-06, + "loss": 0.29285651445388794, + "mean_token_accuracy": 0.8954071998596191, + "num_tokens": 21284717.0, + "step": 2384 + }, + { + "epoch": 1.8123100303951367, + "grad_norm": 2.310225248336792, + "learning_rate": 1.8559874428130708e-06, + "loss": 0.3090948760509491, + "mean_token_accuracy": 0.8853784203529358, + "num_tokens": 21290484.0, + "step": 2385 + }, + { + "epoch": 1.8130699088145896, + "grad_norm": 1.6556873321533203, + "learning_rate": 1.8539639591394131e-06, + "loss": 0.4425269663333893, + "mean_token_accuracy": 0.8488757610321045, + "num_tokens": 21302588.0, + "step": 2386 + }, + { + "epoch": 1.8138297872340425, + "grad_norm": 1.9238256216049194, + "learning_rate": 1.8519409288787182e-06, + "loss": 0.4781329929828644, + "mean_token_accuracy": 0.8392970561981201, + "num_tokens": 21310598.0, + "step": 2387 + }, + { + "epoch": 1.8145896656534954, + "grad_norm": 1.4976142644882202, + "learning_rate": 1.8499183534508263e-06, + "loss": 0.36829859018325806, + "mean_token_accuracy": 0.8687542676925659, + "num_tokens": 21322668.0, + "step": 2388 + }, + { + "epoch": 1.8153495440729484, + "grad_norm": 2.0216941833496094, + "learning_rate": 1.8478962342752584e-06, + "loss": 0.385962575674057, + "mean_token_accuracy": 0.8908089399337769, + "num_tokens": 21330378.0, + "step": 2389 + }, + { + "epoch": 1.8161094224924013, + "grad_norm": 1.647863507270813, + "learning_rate": 1.8458745727712142e-06, + "loss": 0.30903705954551697, + "mean_token_accuracy": 0.8914397954940796, + "num_tokens": 21339932.0, + "step": 2390 + }, + { + "epoch": 1.8168693009118542, + "grad_norm": 1.5832399129867554, + "learning_rate": 1.8438533703575757e-06, + "loss": 0.3636384606361389, + "mean_token_accuracy": 0.8611595630645752, + "num_tokens": 21351557.0, + "step": 2391 + }, + { + "epoch": 1.8176291793313069, + "grad_norm": 3.0069241523742676, + "learning_rate": 1.8418326284528997e-06, + "loss": 0.37970617413520813, + "mean_token_accuracy": 0.8620643615722656, + "num_tokens": 21355704.0, + "step": 2392 + }, + { + "epoch": 1.81838905775076, + "grad_norm": 2.004526376724243, + "learning_rate": 1.8398123484754204e-06, + "loss": 0.5333225131034851, + "mean_token_accuracy": 0.8062554597854614, + "num_tokens": 21364640.0, + "step": 2393 + }, + { + "epoch": 1.8191489361702127, + "grad_norm": 1.449981689453125, + "learning_rate": 1.8377925318430478e-06, + "loss": 0.3736325800418854, + "mean_token_accuracy": 0.858788251876831, + "num_tokens": 21377025.0, + "step": 2394 + }, + { + "epoch": 1.8199088145896658, + "grad_norm": 1.1959524154663086, + "learning_rate": 1.8357731799733686e-06, + "loss": 0.3272058963775635, + "mean_token_accuracy": 0.8840590715408325, + "num_tokens": 21395378.0, + "step": 2395 + }, + { + "epoch": 1.8206686930091185, + "grad_norm": 2.134742498397827, + "learning_rate": 1.8337542942836406e-06, + "loss": 0.3737856149673462, + "mean_token_accuracy": 0.8674061298370361, + "num_tokens": 21402106.0, + "step": 2396 + }, + { + "epoch": 1.8214285714285714, + "grad_norm": 2.2179460525512695, + "learning_rate": 1.8317358761907945e-06, + "loss": 0.37301796674728394, + "mean_token_accuracy": 0.8605623245239258, + "num_tokens": 21408367.0, + "step": 2397 + }, + { + "epoch": 1.8221884498480243, + "grad_norm": 2.1718010902404785, + "learning_rate": 1.8297179271114345e-06, + "loss": 0.2772231101989746, + "mean_token_accuracy": 0.8997501730918884, + "num_tokens": 21414274.0, + "step": 2398 + }, + { + "epoch": 1.8229483282674772, + "grad_norm": 1.410933494567871, + "learning_rate": 1.827700448461836e-06, + "loss": 0.4834601581096649, + "mean_token_accuracy": 0.8382522463798523, + "num_tokens": 21429120.0, + "step": 2399 + }, + { + "epoch": 1.8237082066869301, + "grad_norm": 3.4779679775238037, + "learning_rate": 1.8256834416579423e-06, + "loss": 0.44643428921699524, + "mean_token_accuracy": 0.8308249711990356, + "num_tokens": 21432437.0, + "step": 2400 + }, + { + "epoch": 1.824468085106383, + "grad_norm": 1.374484658241272, + "learning_rate": 1.8236669081153657e-06, + "loss": 0.3947869837284088, + "mean_token_accuracy": 0.8605848550796509, + "num_tokens": 21445656.0, + "step": 2401 + }, + { + "epoch": 1.825227963525836, + "grad_norm": 1.9599316120147705, + "learning_rate": 1.8216508492493887e-06, + "loss": 0.49040719866752625, + "mean_token_accuracy": 0.839459240436554, + "num_tokens": 21452889.0, + "step": 2402 + }, + { + "epoch": 1.8259878419452886, + "grad_norm": 2.1267881393432617, + "learning_rate": 1.8196352664749578e-06, + "loss": 0.3233179450035095, + "mean_token_accuracy": 0.8841243386268616, + "num_tokens": 21458788.0, + "step": 2403 + }, + { + "epoch": 1.8267477203647418, + "grad_norm": 2.6356115341186523, + "learning_rate": 1.8176201612066874e-06, + "loss": 0.43436336517333984, + "mean_token_accuracy": 0.850265622138977, + "num_tokens": 21464305.0, + "step": 2404 + }, + { + "epoch": 1.8275075987841944, + "grad_norm": 2.0232386589050293, + "learning_rate": 1.8156055348588548e-06, + "loss": 0.37281763553619385, + "mean_token_accuracy": 0.8616300821304321, + "num_tokens": 21471722.0, + "step": 2405 + }, + { + "epoch": 1.8282674772036476, + "grad_norm": 3.2616260051727295, + "learning_rate": 1.8135913888454034e-06, + "loss": 0.2882898151874542, + "mean_token_accuracy": 0.9001147747039795, + "num_tokens": 21475400.0, + "step": 2406 + }, + { + "epoch": 1.8290273556231003, + "grad_norm": 2.1665611267089844, + "learning_rate": 1.8115777245799383e-06, + "loss": 0.45269185304641724, + "mean_token_accuracy": 0.8420798778533936, + "num_tokens": 21481827.0, + "step": 2407 + }, + { + "epoch": 1.8297872340425532, + "grad_norm": 1.4406569004058838, + "learning_rate": 1.8095645434757261e-06, + "loss": 0.43665701150894165, + "mean_token_accuracy": 0.8401381969451904, + "num_tokens": 21496441.0, + "step": 2408 + }, + { + "epoch": 1.830547112462006, + "grad_norm": 1.6756342649459839, + "learning_rate": 1.8075518469456944e-06, + "loss": 0.3521783947944641, + "mean_token_accuracy": 0.8737466335296631, + "num_tokens": 21505568.0, + "step": 2409 + }, + { + "epoch": 1.831306990881459, + "grad_norm": 1.6623140573501587, + "learning_rate": 1.8055396364024318e-06, + "loss": 0.344537615776062, + "mean_token_accuracy": 0.886972188949585, + "num_tokens": 21513252.0, + "step": 2410 + }, + { + "epoch": 1.832066869300912, + "grad_norm": 2.064835548400879, + "learning_rate": 1.803527913258186e-06, + "loss": 0.3252706229686737, + "mean_token_accuracy": 0.885245680809021, + "num_tokens": 21520242.0, + "step": 2411 + }, + { + "epoch": 1.8328267477203646, + "grad_norm": 1.9969112873077393, + "learning_rate": 1.8015166789248606e-06, + "loss": 0.34694376587867737, + "mean_token_accuracy": 0.8818766474723816, + "num_tokens": 21527524.0, + "step": 2412 + }, + { + "epoch": 1.8335866261398177, + "grad_norm": 2.086148977279663, + "learning_rate": 1.7995059348140165e-06, + "loss": 0.23109188675880432, + "mean_token_accuracy": 0.912773609161377, + "num_tokens": 21532829.0, + "step": 2413 + }, + { + "epoch": 1.8343465045592704, + "grad_norm": 1.80828058719635, + "learning_rate": 1.7974956823368728e-06, + "loss": 0.5422223210334778, + "mean_token_accuracy": 0.8058640956878662, + "num_tokens": 21544440.0, + "step": 2414 + }, + { + "epoch": 1.8351063829787235, + "grad_norm": 1.8121788501739502, + "learning_rate": 1.7954859229043017e-06, + "loss": 0.3674035668373108, + "mean_token_accuracy": 0.8628277778625488, + "num_tokens": 21553160.0, + "step": 2415 + }, + { + "epoch": 1.8358662613981762, + "grad_norm": 1.9307979345321655, + "learning_rate": 1.7934766579268292e-06, + "loss": 0.4528796672821045, + "mean_token_accuracy": 0.8328302502632141, + "num_tokens": 21563485.0, + "step": 2416 + }, + { + "epoch": 1.8366261398176293, + "grad_norm": 1.2312756776809692, + "learning_rate": 1.7914678888146347e-06, + "loss": 0.40424543619155884, + "mean_token_accuracy": 0.8571025133132935, + "num_tokens": 21582662.0, + "step": 2417 + }, + { + "epoch": 1.837386018237082, + "grad_norm": 1.6305770874023438, + "learning_rate": 1.7894596169775514e-06, + "loss": 0.36575305461883545, + "mean_token_accuracy": 0.8768579959869385, + "num_tokens": 21592930.0, + "step": 2418 + }, + { + "epoch": 1.838145896656535, + "grad_norm": 1.8107178211212158, + "learning_rate": 1.7874518438250598e-06, + "loss": 0.3260963261127472, + "mean_token_accuracy": 0.896018385887146, + "num_tokens": 21600509.0, + "step": 2419 + }, + { + "epoch": 1.8389057750759878, + "grad_norm": 2.7195847034454346, + "learning_rate": 1.785444570766293e-06, + "loss": 0.2728347182273865, + "mean_token_accuracy": 0.9178709983825684, + "num_tokens": 21604489.0, + "step": 2420 + }, + { + "epoch": 1.8396656534954408, + "grad_norm": 1.9783591032028198, + "learning_rate": 1.7834377992100332e-06, + "loss": 0.3136378526687622, + "mean_token_accuracy": 0.8844017386436462, + "num_tokens": 21612060.0, + "step": 2421 + }, + { + "epoch": 1.8404255319148937, + "grad_norm": 2.1911418437957764, + "learning_rate": 1.7814315305647095e-06, + "loss": 0.39013993740081787, + "mean_token_accuracy": 0.8688976764678955, + "num_tokens": 21618778.0, + "step": 2422 + }, + { + "epoch": 1.8411854103343464, + "grad_norm": 1.9143604040145874, + "learning_rate": 1.779425766238398e-06, + "loss": 0.5113036632537842, + "mean_token_accuracy": 0.8329141139984131, + "num_tokens": 21628976.0, + "step": 2423 + }, + { + "epoch": 1.8419452887537995, + "grad_norm": 1.4184197187423706, + "learning_rate": 1.7774205076388207e-06, + "loss": 0.3821067810058594, + "mean_token_accuracy": 0.8604007959365845, + "num_tokens": 21643145.0, + "step": 2424 + }, + { + "epoch": 1.8427051671732522, + "grad_norm": 2.45896577835083, + "learning_rate": 1.7754157561733476e-06, + "loss": 0.3004961311817169, + "mean_token_accuracy": 0.89884352684021, + "num_tokens": 21647441.0, + "step": 2425 + }, + { + "epoch": 1.8434650455927053, + "grad_norm": 1.7999277114868164, + "learning_rate": 1.7734115132489887e-06, + "loss": 0.42533132433891296, + "mean_token_accuracy": 0.8838746547698975, + "num_tokens": 21657445.0, + "step": 2426 + }, + { + "epoch": 1.844224924012158, + "grad_norm": 2.099728584289551, + "learning_rate": 1.7714077802723994e-06, + "loss": 0.36200380325317383, + "mean_token_accuracy": 0.86548912525177, + "num_tokens": 21663966.0, + "step": 2427 + }, + { + "epoch": 1.844984802431611, + "grad_norm": 2.1970369815826416, + "learning_rate": 1.7694045586498754e-06, + "loss": 0.34944331645965576, + "mean_token_accuracy": 0.8670865297317505, + "num_tokens": 21670051.0, + "step": 2428 + }, + { + "epoch": 1.8457446808510638, + "grad_norm": 2.2928519248962402, + "learning_rate": 1.7674018497873568e-06, + "loss": 0.39500880241394043, + "mean_token_accuracy": 0.8744652271270752, + "num_tokens": 21676054.0, + "step": 2429 + }, + { + "epoch": 1.8465045592705167, + "grad_norm": 1.7598960399627686, + "learning_rate": 1.7653996550904208e-06, + "loss": 0.40113672614097595, + "mean_token_accuracy": 0.8552819490432739, + "num_tokens": 21685514.0, + "step": 2430 + }, + { + "epoch": 1.8472644376899696, + "grad_norm": 2.0529749393463135, + "learning_rate": 1.7633979759642844e-06, + "loss": 0.47586584091186523, + "mean_token_accuracy": 0.8412872552871704, + "num_tokens": 21693282.0, + "step": 2431 + }, + { + "epoch": 1.8480243161094225, + "grad_norm": 2.2423181533813477, + "learning_rate": 1.7613968138138027e-06, + "loss": 0.2757381796836853, + "mean_token_accuracy": 0.8992017507553101, + "num_tokens": 21698439.0, + "step": 2432 + }, + { + "epoch": 1.8487841945288754, + "grad_norm": 1.3280467987060547, + "learning_rate": 1.7593961700434692e-06, + "loss": 0.29535043239593506, + "mean_token_accuracy": 0.8943840861320496, + "num_tokens": 21711823.0, + "step": 2433 + }, + { + "epoch": 1.8495440729483281, + "grad_norm": 2.589221715927124, + "learning_rate": 1.7573960460574133e-06, + "loss": 0.46775516867637634, + "mean_token_accuracy": 0.8654797673225403, + "num_tokens": 21717180.0, + "step": 2434 + }, + { + "epoch": 1.8503039513677813, + "grad_norm": 2.1137642860412598, + "learning_rate": 1.7553964432593976e-06, + "loss": 0.3808780610561371, + "mean_token_accuracy": 0.8759565353393555, + "num_tokens": 21723980.0, + "step": 2435 + }, + { + "epoch": 1.851063829787234, + "grad_norm": 2.386967182159424, + "learning_rate": 1.75339736305282e-06, + "loss": 0.42688336968421936, + "mean_token_accuracy": 0.8488960266113281, + "num_tokens": 21730411.0, + "step": 2436 + }, + { + "epoch": 1.851823708206687, + "grad_norm": 1.586552619934082, + "learning_rate": 1.7513988068407145e-06, + "loss": 0.33497530221939087, + "mean_token_accuracy": 0.8809621334075928, + "num_tokens": 21740228.0, + "step": 2437 + }, + { + "epoch": 1.8525835866261398, + "grad_norm": 2.107167959213257, + "learning_rate": 1.7494007760257428e-06, + "loss": 0.3801528513431549, + "mean_token_accuracy": 0.8666986227035522, + "num_tokens": 21746718.0, + "step": 2438 + }, + { + "epoch": 1.8533434650455927, + "grad_norm": 2.514514684677124, + "learning_rate": 1.7474032720101991e-06, + "loss": 0.285498708486557, + "mean_token_accuracy": 0.901540219783783, + "num_tokens": 21751009.0, + "step": 2439 + }, + { + "epoch": 1.8541033434650456, + "grad_norm": 1.8152034282684326, + "learning_rate": 1.7454062961960102e-06, + "loss": 0.3704795241355896, + "mean_token_accuracy": 0.8630262613296509, + "num_tokens": 21760164.0, + "step": 2440 + }, + { + "epoch": 1.8548632218844985, + "grad_norm": 2.714531183242798, + "learning_rate": 1.7434098499847308e-06, + "loss": 0.5070809125900269, + "mean_token_accuracy": 0.8408594131469727, + "num_tokens": 21765602.0, + "step": 2441 + }, + { + "epoch": 1.8556231003039514, + "grad_norm": 2.173832893371582, + "learning_rate": 1.7414139347775423e-06, + "loss": 0.3500945568084717, + "mean_token_accuracy": 0.8733699321746826, + "num_tokens": 21772029.0, + "step": 2442 + }, + { + "epoch": 1.8563829787234043, + "grad_norm": 1.580376148223877, + "learning_rate": 1.7394185519752546e-06, + "loss": 0.5137908458709717, + "mean_token_accuracy": 0.8141944408416748, + "num_tokens": 21784531.0, + "step": 2443 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 2.079318046569824, + "learning_rate": 1.7374237029783064e-06, + "loss": 0.41820770502090454, + "mean_token_accuracy": 0.8513275384902954, + "num_tokens": 21792047.0, + "step": 2444 + }, + { + "epoch": 1.85790273556231, + "grad_norm": 2.6890387535095215, + "learning_rate": 1.7354293891867582e-06, + "loss": 0.3810037672519684, + "mean_token_accuracy": 0.8790096044540405, + "num_tokens": 21796634.0, + "step": 2445 + }, + { + "epoch": 1.858662613981763, + "grad_norm": 2.161081552505493, + "learning_rate": 1.7334356120002956e-06, + "loss": 0.48064762353897095, + "mean_token_accuracy": 0.8329977989196777, + "num_tokens": 21803509.0, + "step": 2446 + }, + { + "epoch": 1.8594224924012157, + "grad_norm": 1.9201551675796509, + "learning_rate": 1.7314423728182283e-06, + "loss": 0.36369895935058594, + "mean_token_accuracy": 0.8713955879211426, + "num_tokens": 21810528.0, + "step": 2447 + }, + { + "epoch": 1.8601823708206688, + "grad_norm": 1.8095223903656006, + "learning_rate": 1.7294496730394897e-06, + "loss": 0.41493499279022217, + "mean_token_accuracy": 0.855312705039978, + "num_tokens": 21821176.0, + "step": 2448 + }, + { + "epoch": 1.8609422492401215, + "grad_norm": 2.172389507293701, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.3467463552951813, + "mean_token_accuracy": 0.8801594972610474, + "num_tokens": 21827486.0, + "step": 2449 + }, + { + "epoch": 1.8617021276595744, + "grad_norm": 2.8139185905456543, + "learning_rate": 1.7254658972858293e-06, + "loss": 0.35121995210647583, + "mean_token_accuracy": 0.8741901516914368, + "num_tokens": 21831915.0, + "step": 2450 + }, + { + "epoch": 1.8624620060790273, + "grad_norm": 1.2572762966156006, + "learning_rate": 1.7234748241068742e-06, + "loss": 0.3775328993797302, + "mean_token_accuracy": 0.8547425866127014, + "num_tokens": 21849623.0, + "step": 2451 + }, + { + "epoch": 1.8632218844984803, + "grad_norm": 1.2357900142669678, + "learning_rate": 1.7214842959231796e-06, + "loss": 0.28715917468070984, + "mean_token_accuracy": 0.9034290313720703, + "num_tokens": 21864507.0, + "step": 2452 + }, + { + "epoch": 1.8639817629179332, + "grad_norm": 1.2349165678024292, + "learning_rate": 1.719494314131775e-06, + "loss": 0.27918580174446106, + "mean_token_accuracy": 0.9073119759559631, + "num_tokens": 21878519.0, + "step": 2453 + }, + { + "epoch": 1.864741641337386, + "grad_norm": 1.960353136062622, + "learning_rate": 1.7175048801293042e-06, + "loss": 0.49304282665252686, + "mean_token_accuracy": 0.8193954229354858, + "num_tokens": 21886861.0, + "step": 2454 + }, + { + "epoch": 1.865501519756839, + "grad_norm": 1.480118751525879, + "learning_rate": 1.7155159953120315e-06, + "loss": 0.39433127641677856, + "mean_token_accuracy": 0.8674266338348389, + "num_tokens": 21899131.0, + "step": 2455 + }, + { + "epoch": 1.8662613981762917, + "grad_norm": 2.3136367797851562, + "learning_rate": 1.7135276610758309e-06, + "loss": 0.40943437814712524, + "mean_token_accuracy": 0.8511340022087097, + "num_tokens": 21905550.0, + "step": 2456 + }, + { + "epoch": 1.8670212765957448, + "grad_norm": 1.3622872829437256, + "learning_rate": 1.7115398788161923e-06, + "loss": 0.4255254566669464, + "mean_token_accuracy": 0.8457357883453369, + "num_tokens": 21919943.0, + "step": 2457 + }, + { + "epoch": 1.8677811550151975, + "grad_norm": 1.8197853565216064, + "learning_rate": 1.7095526499282172e-06, + "loss": 0.33384573459625244, + "mean_token_accuracy": 0.8757365942001343, + "num_tokens": 21928368.0, + "step": 2458 + }, + { + "epoch": 1.8685410334346506, + "grad_norm": 1.8771090507507324, + "learning_rate": 1.7075659758066207e-06, + "loss": 0.38854318857192993, + "mean_token_accuracy": 0.8565001487731934, + "num_tokens": 21936624.0, + "step": 2459 + }, + { + "epoch": 1.8693009118541033, + "grad_norm": 1.449811577796936, + "learning_rate": 1.7055798578457267e-06, + "loss": 0.45504286885261536, + "mean_token_accuracy": 0.8338158130645752, + "num_tokens": 21952192.0, + "step": 2460 + }, + { + "epoch": 1.8700607902735562, + "grad_norm": 2.253678321838379, + "learning_rate": 1.703594297439469e-06, + "loss": 0.44300752878189087, + "mean_token_accuracy": 0.8451106548309326, + "num_tokens": 21959107.0, + "step": 2461 + }, + { + "epoch": 1.8708206686930091, + "grad_norm": 2.5431747436523438, + "learning_rate": 1.7016092959813892e-06, + "loss": 0.34692925214767456, + "mean_token_accuracy": 0.8823766708374023, + "num_tokens": 21964543.0, + "step": 2462 + }, + { + "epoch": 1.871580547112462, + "grad_norm": 2.7001953125, + "learning_rate": 1.6996248548646393e-06, + "loss": 0.5270686745643616, + "mean_token_accuracy": 0.8366886377334595, + "num_tokens": 21970157.0, + "step": 2463 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 2.3855581283569336, + "learning_rate": 1.6976409754819767e-06, + "loss": 0.40109893679618835, + "mean_token_accuracy": 0.8477234840393066, + "num_tokens": 21976046.0, + "step": 2464 + }, + { + "epoch": 1.8731003039513676, + "grad_norm": 1.6014364957809448, + "learning_rate": 1.6956576592257635e-06, + "loss": 0.4344262480735779, + "mean_token_accuracy": 0.8464433550834656, + "num_tokens": 21986299.0, + "step": 2465 + }, + { + "epoch": 1.8738601823708207, + "grad_norm": 2.221372127532959, + "learning_rate": 1.6936749074879663e-06, + "loss": 0.24239015579223633, + "mean_token_accuracy": 0.9185566306114197, + "num_tokens": 21991541.0, + "step": 2466 + }, + { + "epoch": 1.8746200607902734, + "grad_norm": 1.6672178506851196, + "learning_rate": 1.6916927216601593e-06, + "loss": 0.35219496488571167, + "mean_token_accuracy": 0.8668237328529358, + "num_tokens": 22000797.0, + "step": 2467 + }, + { + "epoch": 1.8753799392097266, + "grad_norm": 1.364131212234497, + "learning_rate": 1.6897111031335145e-06, + "loss": 0.4456409513950348, + "mean_token_accuracy": 0.8350487947463989, + "num_tokens": 22018297.0, + "step": 2468 + }, + { + "epoch": 1.8761398176291793, + "grad_norm": 1.4535794258117676, + "learning_rate": 1.6877300532988095e-06, + "loss": 0.395782470703125, + "mean_token_accuracy": 0.8482908010482788, + "num_tokens": 22030096.0, + "step": 2469 + }, + { + "epoch": 1.8768996960486324, + "grad_norm": 2.0192270278930664, + "learning_rate": 1.6857495735464196e-06, + "loss": 0.31406813859939575, + "mean_token_accuracy": 0.889453649520874, + "num_tokens": 22036082.0, + "step": 2470 + }, + { + "epoch": 1.877659574468085, + "grad_norm": 2.159257173538208, + "learning_rate": 1.6837696652663244e-06, + "loss": 0.43942126631736755, + "mean_token_accuracy": 0.8518660068511963, + "num_tokens": 22043413.0, + "step": 2471 + }, + { + "epoch": 1.878419452887538, + "grad_norm": 1.9774882793426514, + "learning_rate": 1.681790329848097e-06, + "loss": 0.42464935779571533, + "mean_token_accuracy": 0.8545591831207275, + "num_tokens": 22050290.0, + "step": 2472 + }, + { + "epoch": 1.8791793313069909, + "grad_norm": 1.0219167470932007, + "learning_rate": 1.6798115686809125e-06, + "loss": 0.36917346715927124, + "mean_token_accuracy": 0.8650286197662354, + "num_tokens": 22070408.0, + "step": 2473 + }, + { + "epoch": 1.8799392097264438, + "grad_norm": 1.2943378686904907, + "learning_rate": 1.677833383153542e-06, + "loss": 0.3434808850288391, + "mean_token_accuracy": 0.878541111946106, + "num_tokens": 22083567.0, + "step": 2474 + }, + { + "epoch": 1.8806990881458967, + "grad_norm": 3.582855224609375, + "learning_rate": 1.6758557746543518e-06, + "loss": 0.39738911390304565, + "mean_token_accuracy": 0.8951535224914551, + "num_tokens": 22086886.0, + "step": 2475 + }, + { + "epoch": 1.8814589665653494, + "grad_norm": 1.680220365524292, + "learning_rate": 1.673878744571304e-06, + "loss": 0.38146206736564636, + "mean_token_accuracy": 0.8596681356430054, + "num_tokens": 22095564.0, + "step": 2476 + }, + { + "epoch": 1.8822188449848025, + "grad_norm": 1.448194146156311, + "learning_rate": 1.6719022942919527e-06, + "loss": 0.43309977650642395, + "mean_token_accuracy": 0.8669528961181641, + "num_tokens": 22109333.0, + "step": 2477 + }, + { + "epoch": 1.8829787234042552, + "grad_norm": 1.5353537797927856, + "learning_rate": 1.6699264252034498e-06, + "loss": 0.4479079842567444, + "mean_token_accuracy": 0.8379873037338257, + "num_tokens": 22124735.0, + "step": 2478 + }, + { + "epoch": 1.8837386018237083, + "grad_norm": 1.1744320392608643, + "learning_rate": 1.6679511386925337e-06, + "loss": 0.31951260566711426, + "mean_token_accuracy": 0.8792685270309448, + "num_tokens": 22140882.0, + "step": 2479 + }, + { + "epoch": 1.884498480243161, + "grad_norm": 2.1996841430664062, + "learning_rate": 1.6659764361455383e-06, + "loss": 0.39045992493629456, + "mean_token_accuracy": 0.8587675094604492, + "num_tokens": 22146843.0, + "step": 2480 + }, + { + "epoch": 1.885258358662614, + "grad_norm": 3.494931697845459, + "learning_rate": 1.6640023189483836e-06, + "loss": 0.44756871461868286, + "mean_token_accuracy": 0.8643628358840942, + "num_tokens": 22150504.0, + "step": 2481 + }, + { + "epoch": 1.8860182370820668, + "grad_norm": 2.2455973625183105, + "learning_rate": 1.6620287884865831e-06, + "loss": 0.3308878540992737, + "mean_token_accuracy": 0.8748078942298889, + "num_tokens": 22156537.0, + "step": 2482 + }, + { + "epoch": 1.8867781155015197, + "grad_norm": 2.31868314743042, + "learning_rate": 1.6600558461452368e-06, + "loss": 0.46583569049835205, + "mean_token_accuracy": 0.8438903093338013, + "num_tokens": 22163501.0, + "step": 2483 + }, + { + "epoch": 1.8875379939209727, + "grad_norm": 1.5695412158966064, + "learning_rate": 1.65808349330903e-06, + "loss": 0.351986825466156, + "mean_token_accuracy": 0.8707568645477295, + "num_tokens": 22173880.0, + "step": 2484 + }, + { + "epoch": 1.8882978723404256, + "grad_norm": 1.4109563827514648, + "learning_rate": 1.656111731362236e-06, + "loss": 0.36058586835861206, + "mean_token_accuracy": 0.8606001138687134, + "num_tokens": 22189000.0, + "step": 2485 + }, + { + "epoch": 1.8890577507598785, + "grad_norm": 1.0398776531219482, + "learning_rate": 1.6541405616887138e-06, + "loss": 0.36524999141693115, + "mean_token_accuracy": 0.8690586090087891, + "num_tokens": 22209187.0, + "step": 2486 + }, + { + "epoch": 1.8898176291793312, + "grad_norm": 2.1050004959106445, + "learning_rate": 1.6521699856719065e-06, + "loss": 0.2988269329071045, + "mean_token_accuracy": 0.8887280225753784, + "num_tokens": 22215539.0, + "step": 2487 + }, + { + "epoch": 1.8905775075987843, + "grad_norm": 2.5606791973114014, + "learning_rate": 1.650200004694839e-06, + "loss": 0.41077330708503723, + "mean_token_accuracy": 0.8436049818992615, + "num_tokens": 22221133.0, + "step": 2488 + }, + { + "epoch": 1.891337386018237, + "grad_norm": 1.5786094665527344, + "learning_rate": 1.6482306201401211e-06, + "loss": 0.4217292368412018, + "mean_token_accuracy": 0.859939455986023, + "num_tokens": 22231578.0, + "step": 2489 + }, + { + "epoch": 1.89209726443769, + "grad_norm": 1.7131884098052979, + "learning_rate": 1.6462618333899422e-06, + "loss": 0.3945464789867401, + "mean_token_accuracy": 0.8679244518280029, + "num_tokens": 22241252.0, + "step": 2490 + }, + { + "epoch": 1.8928571428571428, + "grad_norm": 2.8350300788879395, + "learning_rate": 1.6442936458260723e-06, + "loss": 0.3992699384689331, + "mean_token_accuracy": 0.8717275857925415, + "num_tokens": 22246226.0, + "step": 2491 + }, + { + "epoch": 1.8936170212765957, + "grad_norm": 2.2180120944976807, + "learning_rate": 1.6423260588298608e-06, + "loss": 0.3381099998950958, + "mean_token_accuracy": 0.8968075513839722, + "num_tokens": 22252355.0, + "step": 2492 + }, + { + "epoch": 1.8943768996960486, + "grad_norm": 2.6498866081237793, + "learning_rate": 1.6403590737822378e-06, + "loss": 0.36339250206947327, + "mean_token_accuracy": 0.8633373379707336, + "num_tokens": 22257407.0, + "step": 2493 + }, + { + "epoch": 1.8951367781155015, + "grad_norm": 2.634241819381714, + "learning_rate": 1.6383926920637077e-06, + "loss": 0.2562698721885681, + "mean_token_accuracy": 0.8999600410461426, + "num_tokens": 22261858.0, + "step": 2494 + }, + { + "epoch": 1.8958966565349544, + "grad_norm": 2.0163333415985107, + "learning_rate": 1.6364269150543533e-06, + "loss": 0.3413389027118683, + "mean_token_accuracy": 0.8718398809432983, + "num_tokens": 22268517.0, + "step": 2495 + }, + { + "epoch": 1.8966565349544073, + "grad_norm": 2.8333005905151367, + "learning_rate": 1.6344617441338311e-06, + "loss": 0.4354540705680847, + "mean_token_accuracy": 0.8491238355636597, + "num_tokens": 22273648.0, + "step": 2496 + }, + { + "epoch": 1.8974164133738602, + "grad_norm": 1.6280957460403442, + "learning_rate": 1.6324971806813766e-06, + "loss": 0.3015792965888977, + "mean_token_accuracy": 0.8937206268310547, + "num_tokens": 22282521.0, + "step": 2497 + }, + { + "epoch": 1.898176291793313, + "grad_norm": 1.2246302366256714, + "learning_rate": 1.6305332260757937e-06, + "loss": 0.26619502902030945, + "mean_token_accuracy": 0.8886681199073792, + "num_tokens": 22295179.0, + "step": 2498 + }, + { + "epoch": 1.898936170212766, + "grad_norm": 2.4014432430267334, + "learning_rate": 1.6285698816954626e-06, + "loss": 0.3735058903694153, + "mean_token_accuracy": 0.8693109750747681, + "num_tokens": 22300681.0, + "step": 2499 + }, + { + "epoch": 1.8996960486322187, + "grad_norm": 1.4447300434112549, + "learning_rate": 1.6266071489183327e-06, + "loss": 0.40768876671791077, + "mean_token_accuracy": 0.8556059002876282, + "num_tokens": 22312442.0, + "step": 2500 + }, + { + "epoch": 1.9004559270516719, + "grad_norm": 2.1339821815490723, + "learning_rate": 1.6246450291219268e-06, + "loss": 0.33442017436027527, + "mean_token_accuracy": 0.8837105631828308, + "num_tokens": 22318779.0, + "step": 2501 + }, + { + "epoch": 1.9012158054711246, + "grad_norm": 2.8564913272857666, + "learning_rate": 1.6226835236833356e-06, + "loss": 0.36013197898864746, + "mean_token_accuracy": 0.8810569047927856, + "num_tokens": 22323390.0, + "step": 2502 + }, + { + "epoch": 1.9019756838905775, + "grad_norm": 2.1201915740966797, + "learning_rate": 1.620722633979219e-06, + "loss": 0.4587489664554596, + "mean_token_accuracy": 0.8517274856567383, + "num_tokens": 22330275.0, + "step": 2503 + }, + { + "epoch": 1.9027355623100304, + "grad_norm": 2.211402177810669, + "learning_rate": 1.6187623613858038e-06, + "loss": 0.3698349595069885, + "mean_token_accuracy": 0.8768182992935181, + "num_tokens": 22336041.0, + "step": 2504 + }, + { + "epoch": 1.9034954407294833, + "grad_norm": 1.421604871749878, + "learning_rate": 1.6168027072788868e-06, + "loss": 0.38086453080177307, + "mean_token_accuracy": 0.8622198104858398, + "num_tokens": 22349310.0, + "step": 2505 + }, + { + "epoch": 1.9042553191489362, + "grad_norm": 2.4304113388061523, + "learning_rate": 1.6148436730338279e-06, + "loss": 0.34694477915763855, + "mean_token_accuracy": 0.8833136558532715, + "num_tokens": 22355069.0, + "step": 2506 + }, + { + "epoch": 1.905015197568389, + "grad_norm": 2.1076772212982178, + "learning_rate": 1.6128852600255518e-06, + "loss": 0.4973800778388977, + "mean_token_accuracy": 0.851190984249115, + "num_tokens": 22362402.0, + "step": 2507 + }, + { + "epoch": 1.905775075987842, + "grad_norm": 3.0934200286865234, + "learning_rate": 1.6109274696285496e-06, + "loss": 0.46498024463653564, + "mean_token_accuracy": 0.8436626195907593, + "num_tokens": 22367390.0, + "step": 2508 + }, + { + "epoch": 1.9065349544072947, + "grad_norm": 2.0114359855651855, + "learning_rate": 1.6089703032168736e-06, + "loss": 0.45143815875053406, + "mean_token_accuracy": 0.852748692035675, + "num_tokens": 22377032.0, + "step": 2509 + }, + { + "epoch": 1.9072948328267478, + "grad_norm": 1.8780893087387085, + "learning_rate": 1.6070137621641382e-06, + "loss": 0.3977179527282715, + "mean_token_accuracy": 0.8556262850761414, + "num_tokens": 22386880.0, + "step": 2510 + }, + { + "epoch": 1.9080547112462005, + "grad_norm": 1.6748069524765015, + "learning_rate": 1.6050578478435184e-06, + "loss": 0.35590440034866333, + "mean_token_accuracy": 0.8702141046524048, + "num_tokens": 22396616.0, + "step": 2511 + }, + { + "epoch": 1.9088145896656536, + "grad_norm": 0.9799401760101318, + "learning_rate": 1.6031025616277512e-06, + "loss": 0.3325427770614624, + "mean_token_accuracy": 0.8771291971206665, + "num_tokens": 22419580.0, + "step": 2512 + }, + { + "epoch": 1.9095744680851063, + "grad_norm": 1.5084866285324097, + "learning_rate": 1.6011479048891323e-06, + "loss": 0.44336390495300293, + "mean_token_accuracy": 0.8786209225654602, + "num_tokens": 22434235.0, + "step": 2513 + }, + { + "epoch": 1.9103343465045592, + "grad_norm": 1.8544305562973022, + "learning_rate": 1.5991938789995138e-06, + "loss": 0.3055306375026703, + "mean_token_accuracy": 0.9043174982070923, + "num_tokens": 22442003.0, + "step": 2514 + }, + { + "epoch": 1.9110942249240122, + "grad_norm": 4.29932165145874, + "learning_rate": 1.5972404853303061e-06, + "loss": 0.386760413646698, + "mean_token_accuracy": 0.8914207220077515, + "num_tokens": 22444787.0, + "step": 2515 + }, + { + "epoch": 1.911854103343465, + "grad_norm": 1.7560505867004395, + "learning_rate": 1.595287725252478e-06, + "loss": 0.4141422510147095, + "mean_token_accuracy": 0.862310528755188, + "num_tokens": 22453625.0, + "step": 2516 + }, + { + "epoch": 1.912613981762918, + "grad_norm": 2.685443878173828, + "learning_rate": 1.5933356001365502e-06, + "loss": 0.36217260360717773, + "mean_token_accuracy": 0.868883490562439, + "num_tokens": 22458597.0, + "step": 2517 + }, + { + "epoch": 1.9133738601823707, + "grad_norm": 2.2587239742279053, + "learning_rate": 1.591384111352599e-06, + "loss": 0.5298880934715271, + "mean_token_accuracy": 0.821168839931488, + "num_tokens": 22466091.0, + "step": 2518 + }, + { + "epoch": 1.9141337386018238, + "grad_norm": 2.273380756378174, + "learning_rate": 1.5894332602702545e-06, + "loss": 0.3194117546081543, + "mean_token_accuracy": 0.8849239945411682, + "num_tokens": 22471785.0, + "step": 2519 + }, + { + "epoch": 1.9148936170212765, + "grad_norm": 2.314634084701538, + "learning_rate": 1.5874830482587003e-06, + "loss": 0.457550585269928, + "mean_token_accuracy": 0.8367670774459839, + "num_tokens": 22479091.0, + "step": 2520 + }, + { + "epoch": 1.9156534954407296, + "grad_norm": 2.16206693649292, + "learning_rate": 1.585533476686669e-06, + "loss": 0.43055859208106995, + "mean_token_accuracy": 0.8659856915473938, + "num_tokens": 22487379.0, + "step": 2521 + }, + { + "epoch": 1.9164133738601823, + "grad_norm": 2.2091798782348633, + "learning_rate": 1.5835845469224447e-06, + "loss": 0.45421302318573, + "mean_token_accuracy": 0.8418087959289551, + "num_tokens": 22493755.0, + "step": 2522 + }, + { + "epoch": 1.9171732522796354, + "grad_norm": 1.6166985034942627, + "learning_rate": 1.5816362603338632e-06, + "loss": 0.5211667418479919, + "mean_token_accuracy": 0.809440016746521, + "num_tokens": 22506648.0, + "step": 2523 + }, + { + "epoch": 1.917933130699088, + "grad_norm": 2.4998703002929688, + "learning_rate": 1.5796886182883053e-06, + "loss": 0.45915648341178894, + "mean_token_accuracy": 0.833067774772644, + "num_tokens": 22513216.0, + "step": 2524 + }, + { + "epoch": 1.918693009118541, + "grad_norm": 1.492928147315979, + "learning_rate": 1.577741622152702e-06, + "loss": 0.45581498742103577, + "mean_token_accuracy": 0.8531479835510254, + "num_tokens": 22524908.0, + "step": 2525 + }, + { + "epoch": 1.919452887537994, + "grad_norm": 2.0502207279205322, + "learning_rate": 1.5757952732935288e-06, + "loss": 0.4156759977340698, + "mean_token_accuracy": 0.8677599430084229, + "num_tokens": 22532275.0, + "step": 2526 + }, + { + "epoch": 1.9202127659574468, + "grad_norm": 2.4572031497955322, + "learning_rate": 1.5738495730768104e-06, + "loss": 0.43373313546180725, + "mean_token_accuracy": 0.8435516357421875, + "num_tokens": 22538272.0, + "step": 2527 + }, + { + "epoch": 1.9209726443768997, + "grad_norm": 2.071903705596924, + "learning_rate": 1.5719045228681127e-06, + "loss": 0.3211413621902466, + "mean_token_accuracy": 0.87841796875, + "num_tokens": 22545487.0, + "step": 2528 + }, + { + "epoch": 1.9217325227963524, + "grad_norm": 1.6742064952850342, + "learning_rate": 1.5699601240325474e-06, + "loss": 0.3704240322113037, + "mean_token_accuracy": 0.8646563291549683, + "num_tokens": 22554840.0, + "step": 2529 + }, + { + "epoch": 1.9224924012158056, + "grad_norm": 1.0941399335861206, + "learning_rate": 1.5680163779347668e-06, + "loss": 0.3595704436302185, + "mean_token_accuracy": 0.8680597543716431, + "num_tokens": 22572627.0, + "step": 2530 + }, + { + "epoch": 1.9232522796352582, + "grad_norm": 2.9815237522125244, + "learning_rate": 1.5660732859389687e-06, + "loss": 0.2941335141658783, + "mean_token_accuracy": 0.8847303986549377, + "num_tokens": 22576851.0, + "step": 2531 + }, + { + "epoch": 1.9240121580547114, + "grad_norm": 2.898106813430786, + "learning_rate": 1.5641308494088903e-06, + "loss": 0.4066317081451416, + "mean_token_accuracy": 0.8469538688659668, + "num_tokens": 22581431.0, + "step": 2532 + }, + { + "epoch": 1.924772036474164, + "grad_norm": 1.6757515668869019, + "learning_rate": 1.5621890697078069e-06, + "loss": 0.33923569321632385, + "mean_token_accuracy": 0.8790708184242249, + "num_tokens": 22590648.0, + "step": 2533 + }, + { + "epoch": 1.925531914893617, + "grad_norm": 1.747314214706421, + "learning_rate": 1.5602479481985333e-06, + "loss": 0.4865703582763672, + "mean_token_accuracy": 0.8314566612243652, + "num_tokens": 22600153.0, + "step": 2534 + }, + { + "epoch": 1.9262917933130699, + "grad_norm": 2.7927849292755127, + "learning_rate": 1.5583074862434254e-06, + "loss": 0.335658460855484, + "mean_token_accuracy": 0.8769067525863647, + "num_tokens": 22604864.0, + "step": 2535 + }, + { + "epoch": 1.9270516717325228, + "grad_norm": 2.2553000450134277, + "learning_rate": 1.5563676852043738e-06, + "loss": 0.4442562460899353, + "mean_token_accuracy": 0.8381515145301819, + "num_tokens": 22611102.0, + "step": 2536 + }, + { + "epoch": 1.9278115501519757, + "grad_norm": 1.1937638521194458, + "learning_rate": 1.5544285464428044e-06, + "loss": 0.38608425855636597, + "mean_token_accuracy": 0.8589644432067871, + "num_tokens": 22627781.0, + "step": 2537 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 3.282639980316162, + "learning_rate": 1.55249007131968e-06, + "loss": 0.31231993436813354, + "mean_token_accuracy": 0.8917703032493591, + "num_tokens": 22632341.0, + "step": 2538 + }, + { + "epoch": 1.9293313069908815, + "grad_norm": 2.3212976455688477, + "learning_rate": 1.5505522611954977e-06, + "loss": 0.34952571988105774, + "mean_token_accuracy": 0.8752106428146362, + "num_tokens": 22638572.0, + "step": 2539 + }, + { + "epoch": 1.9300911854103342, + "grad_norm": 1.389098882675171, + "learning_rate": 1.548615117430286e-06, + "loss": 0.4298851788043976, + "mean_token_accuracy": 0.871698260307312, + "num_tokens": 22651875.0, + "step": 2540 + }, + { + "epoch": 1.9308510638297873, + "grad_norm": 1.5333977937698364, + "learning_rate": 1.5466786413836077e-06, + "loss": 0.45540744066238403, + "mean_token_accuracy": 0.8409075736999512, + "num_tokens": 22662903.0, + "step": 2541 + }, + { + "epoch": 1.93161094224924, + "grad_norm": 1.7833251953125, + "learning_rate": 1.5447428344145565e-06, + "loss": 0.333247572183609, + "mean_token_accuracy": 0.8796100616455078, + "num_tokens": 22671125.0, + "step": 2542 + }, + { + "epoch": 1.9323708206686931, + "grad_norm": 1.5165303945541382, + "learning_rate": 1.5428076978817564e-06, + "loss": 0.3085063099861145, + "mean_token_accuracy": 0.888705849647522, + "num_tokens": 22681482.0, + "step": 2543 + }, + { + "epoch": 1.9331306990881458, + "grad_norm": 2.3556196689605713, + "learning_rate": 1.5408732331433596e-06, + "loss": 0.44008776545524597, + "mean_token_accuracy": 0.8578170537948608, + "num_tokens": 22686952.0, + "step": 2544 + }, + { + "epoch": 1.9338905775075987, + "grad_norm": 2.9572882652282715, + "learning_rate": 1.538939441557048e-06, + "loss": 0.3779261112213135, + "mean_token_accuracy": 0.8657241463661194, + "num_tokens": 22691211.0, + "step": 2545 + }, + { + "epoch": 1.9346504559270516, + "grad_norm": 2.373473644256592, + "learning_rate": 1.5370063244800326e-06, + "loss": 0.4113072454929352, + "mean_token_accuracy": 0.872116208076477, + "num_tokens": 22697442.0, + "step": 2546 + }, + { + "epoch": 1.9354103343465046, + "grad_norm": 2.270207643508911, + "learning_rate": 1.5350738832690479e-06, + "loss": 0.4021070897579193, + "mean_token_accuracy": 0.8750372529029846, + "num_tokens": 22703693.0, + "step": 2547 + }, + { + "epoch": 1.9361702127659575, + "grad_norm": 2.429445266723633, + "learning_rate": 1.5331421192803565e-06, + "loss": 0.40210235118865967, + "mean_token_accuracy": 0.8593704104423523, + "num_tokens": 22709285.0, + "step": 2548 + }, + { + "epoch": 1.9369300911854104, + "grad_norm": 1.4576458930969238, + "learning_rate": 1.5312110338697427e-06, + "loss": 0.44822201132774353, + "mean_token_accuracy": 0.8737322688102722, + "num_tokens": 22723743.0, + "step": 2549 + }, + { + "epoch": 1.9376899696048633, + "grad_norm": 2.1008098125457764, + "learning_rate": 1.5292806283925192e-06, + "loss": 0.3514235019683838, + "mean_token_accuracy": 0.8689005374908447, + "num_tokens": 22730135.0, + "step": 2550 + }, + { + "epoch": 1.938449848024316, + "grad_norm": 1.9786806106567383, + "learning_rate": 1.5273509042035172e-06, + "loss": 0.4483771324157715, + "mean_token_accuracy": 0.8353633880615234, + "num_tokens": 22738717.0, + "step": 2551 + }, + { + "epoch": 1.939209726443769, + "grad_norm": 1.0649693012237549, + "learning_rate": 1.5254218626570927e-06, + "loss": 0.30712205171585083, + "mean_token_accuracy": 0.8802675008773804, + "num_tokens": 22757346.0, + "step": 2552 + }, + { + "epoch": 1.9399696048632218, + "grad_norm": 3.0401108264923096, + "learning_rate": 1.5234935051071193e-06, + "loss": 0.5213959217071533, + "mean_token_accuracy": 0.8249514102935791, + "num_tokens": 22762169.0, + "step": 2553 + }, + { + "epoch": 1.940729483282675, + "grad_norm": 2.892486572265625, + "learning_rate": 1.521565832906994e-06, + "loss": 0.5694394111633301, + "mean_token_accuracy": 0.8139263391494751, + "num_tokens": 22767824.0, + "step": 2554 + }, + { + "epoch": 1.9414893617021276, + "grad_norm": 1.6187207698822021, + "learning_rate": 1.519638847409632e-06, + "loss": 0.46748271584510803, + "mean_token_accuracy": 0.8541051149368286, + "num_tokens": 22778195.0, + "step": 2555 + }, + { + "epoch": 1.9422492401215805, + "grad_norm": 1.3857731819152832, + "learning_rate": 1.5177125499674639e-06, + "loss": 0.35661786794662476, + "mean_token_accuracy": 0.8711516857147217, + "num_tokens": 22792353.0, + "step": 2556 + }, + { + "epoch": 1.9430091185410334, + "grad_norm": 1.108441710472107, + "learning_rate": 1.515786941932441e-06, + "loss": 0.3537200391292572, + "mean_token_accuracy": 0.8739079833030701, + "num_tokens": 22813185.0, + "step": 2557 + }, + { + "epoch": 1.9437689969604863, + "grad_norm": 2.0528404712677, + "learning_rate": 1.5138620246560295e-06, + "loss": 0.4161028265953064, + "mean_token_accuracy": 0.8385938405990601, + "num_tokens": 22821227.0, + "step": 2558 + }, + { + "epoch": 1.9445288753799392, + "grad_norm": 1.5123628377914429, + "learning_rate": 1.5119377994892095e-06, + "loss": 0.4420986473560333, + "mean_token_accuracy": 0.8664361834526062, + "num_tokens": 22835064.0, + "step": 2559 + }, + { + "epoch": 1.9452887537993921, + "grad_norm": 2.5354838371276855, + "learning_rate": 1.5100142677824752e-06, + "loss": 0.3837323784828186, + "mean_token_accuracy": 0.8607655763626099, + "num_tokens": 22840455.0, + "step": 2560 + }, + { + "epoch": 1.946048632218845, + "grad_norm": 1.1354057788848877, + "learning_rate": 1.5080914308858375e-06, + "loss": 0.39776813983917236, + "mean_token_accuracy": 0.8586497902870178, + "num_tokens": 22858828.0, + "step": 2561 + }, + { + "epoch": 1.9468085106382977, + "grad_norm": 1.576740026473999, + "learning_rate": 1.5061692901488161e-06, + "loss": 0.3167848289012909, + "mean_token_accuracy": 0.8876185417175293, + "num_tokens": 22868674.0, + "step": 2562 + }, + { + "epoch": 1.9475683890577509, + "grad_norm": 1.4835401773452759, + "learning_rate": 1.5042478469204437e-06, + "loss": 0.44950318336486816, + "mean_token_accuracy": 0.8526639342308044, + "num_tokens": 22883019.0, + "step": 2563 + }, + { + "epoch": 1.9483282674772036, + "grad_norm": 1.617073655128479, + "learning_rate": 1.502327102549262e-06, + "loss": 0.45711010694503784, + "mean_token_accuracy": 0.834361732006073, + "num_tokens": 22896834.0, + "step": 2564 + }, + { + "epoch": 1.9490881458966567, + "grad_norm": 1.3348414897918701, + "learning_rate": 1.5004070583833252e-06, + "loss": 0.3691314458847046, + "mean_token_accuracy": 0.8779371380805969, + "num_tokens": 22912350.0, + "step": 2565 + }, + { + "epoch": 1.9498480243161094, + "grad_norm": 1.711234450340271, + "learning_rate": 1.4984877157701932e-06, + "loss": 0.38726937770843506, + "mean_token_accuracy": 0.8704015016555786, + "num_tokens": 22922575.0, + "step": 2566 + }, + { + "epoch": 1.9506079027355623, + "grad_norm": 2.4587950706481934, + "learning_rate": 1.4965690760569346e-06, + "loss": 0.4455464482307434, + "mean_token_accuracy": 0.8481032252311707, + "num_tokens": 22928717.0, + "step": 2567 + }, + { + "epoch": 1.9513677811550152, + "grad_norm": 2.4189560413360596, + "learning_rate": 1.4946511405901237e-06, + "loss": 0.4120418429374695, + "mean_token_accuracy": 0.8519487380981445, + "num_tokens": 22934977.0, + "step": 2568 + }, + { + "epoch": 1.952127659574468, + "grad_norm": 1.2503050565719604, + "learning_rate": 1.4927339107158437e-06, + "loss": 0.4434332251548767, + "mean_token_accuracy": 0.8448144793510437, + "num_tokens": 22950061.0, + "step": 2569 + }, + { + "epoch": 1.952887537993921, + "grad_norm": 1.788493275642395, + "learning_rate": 1.4908173877796784e-06, + "loss": 0.49203023314476013, + "mean_token_accuracy": 0.8601495623588562, + "num_tokens": 22961838.0, + "step": 2570 + }, + { + "epoch": 1.9536474164133737, + "grad_norm": 1.4260050058364868, + "learning_rate": 1.4889015731267186e-06, + "loss": 0.3286570906639099, + "mean_token_accuracy": 0.882429838180542, + "num_tokens": 22973192.0, + "step": 2571 + }, + { + "epoch": 1.9544072948328268, + "grad_norm": 1.6754822731018066, + "learning_rate": 1.486986468101555e-06, + "loss": 0.34655290842056274, + "mean_token_accuracy": 0.8807861804962158, + "num_tokens": 22983661.0, + "step": 2572 + }, + { + "epoch": 1.9551671732522795, + "grad_norm": 1.9064570665359497, + "learning_rate": 1.4850720740482842e-06, + "loss": 0.34020254015922546, + "mean_token_accuracy": 0.86677086353302, + "num_tokens": 22991231.0, + "step": 2573 + }, + { + "epoch": 1.9559270516717326, + "grad_norm": 1.977444052696228, + "learning_rate": 1.4831583923105e-06, + "loss": 0.21505260467529297, + "mean_token_accuracy": 0.921241819858551, + "num_tokens": 22996828.0, + "step": 2574 + }, + { + "epoch": 1.9566869300911853, + "grad_norm": 1.1019235849380493, + "learning_rate": 1.481245424231298e-06, + "loss": 0.3804295063018799, + "mean_token_accuracy": 0.8582668900489807, + "num_tokens": 23016018.0, + "step": 2575 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 1.7943179607391357, + "learning_rate": 1.4793331711532743e-06, + "loss": 0.38565245270729065, + "mean_token_accuracy": 0.8599048256874084, + "num_tokens": 23024461.0, + "step": 2576 + }, + { + "epoch": 1.9582066869300911, + "grad_norm": 2.273824453353882, + "learning_rate": 1.4774216344185204e-06, + "loss": 0.46297723054885864, + "mean_token_accuracy": 0.8294345140457153, + "num_tokens": 23031687.0, + "step": 2577 + }, + { + "epoch": 1.958966565349544, + "grad_norm": 2.308509111404419, + "learning_rate": 1.4755108153686275e-06, + "loss": 0.4366525411605835, + "mean_token_accuracy": 0.8515903949737549, + "num_tokens": 23037072.0, + "step": 2578 + }, + { + "epoch": 1.959726443768997, + "grad_norm": 2.069028377532959, + "learning_rate": 1.4736007153446803e-06, + "loss": 0.33900877833366394, + "mean_token_accuracy": 0.8937177658081055, + "num_tokens": 23043207.0, + "step": 2579 + }, + { + "epoch": 1.9604863221884499, + "grad_norm": 2.905163288116455, + "learning_rate": 1.4716913356872614e-06, + "loss": 0.3708382844924927, + "mean_token_accuracy": 0.8936747312545776, + "num_tokens": 23047020.0, + "step": 2580 + }, + { + "epoch": 1.9612462006079028, + "grad_norm": 2.4153175354003906, + "learning_rate": 1.4697826777364478e-06, + "loss": 0.473562091588974, + "mean_token_accuracy": 0.8350275158882141, + "num_tokens": 23053282.0, + "step": 2581 + }, + { + "epoch": 1.9620060790273555, + "grad_norm": 2.21589994430542, + "learning_rate": 1.467874742831808e-06, + "loss": 0.3812660276889801, + "mean_token_accuracy": 0.8623865842819214, + "num_tokens": 23059399.0, + "step": 2582 + }, + { + "epoch": 1.9627659574468086, + "grad_norm": 1.0847623348236084, + "learning_rate": 1.4659675323124037e-06, + "loss": 0.3846944570541382, + "mean_token_accuracy": 0.8633466958999634, + "num_tokens": 23081005.0, + "step": 2583 + }, + { + "epoch": 1.9635258358662613, + "grad_norm": 1.8754645586013794, + "learning_rate": 1.46406104751679e-06, + "loss": 0.3460300862789154, + "mean_token_accuracy": 0.8757443428039551, + "num_tokens": 23088710.0, + "step": 2584 + }, + { + "epoch": 1.9642857142857144, + "grad_norm": 2.13075852394104, + "learning_rate": 1.462155289783011e-06, + "loss": 0.3060935139656067, + "mean_token_accuracy": 0.9070644378662109, + "num_tokens": 23094862.0, + "step": 2585 + }, + { + "epoch": 1.965045592705167, + "grad_norm": 2.9674458503723145, + "learning_rate": 1.4602502604486e-06, + "loss": 0.4464406371116638, + "mean_token_accuracy": 0.8497441411018372, + "num_tokens": 23099821.0, + "step": 2586 + }, + { + "epoch": 1.96580547112462, + "grad_norm": 1.9171007871627808, + "learning_rate": 1.45834596085058e-06, + "loss": 0.3905114531517029, + "mean_token_accuracy": 0.8564352989196777, + "num_tokens": 23107804.0, + "step": 2587 + }, + { + "epoch": 1.966565349544073, + "grad_norm": 2.0817408561706543, + "learning_rate": 1.456442392325463e-06, + "loss": 0.3903818130493164, + "mean_token_accuracy": 0.8671162128448486, + "num_tokens": 23115224.0, + "step": 2588 + }, + { + "epoch": 1.9673252279635258, + "grad_norm": 2.6379549503326416, + "learning_rate": 1.4545395562092467e-06, + "loss": 0.22965987026691437, + "mean_token_accuracy": 0.9160916805267334, + "num_tokens": 23119184.0, + "step": 2589 + }, + { + "epoch": 1.9680851063829787, + "grad_norm": 2.525221824645996, + "learning_rate": 1.4526374538374133e-06, + "loss": 0.4132574498653412, + "mean_token_accuracy": 0.8486990332603455, + "num_tokens": 23124679.0, + "step": 2590 + }, + { + "epoch": 1.9688449848024316, + "grad_norm": 2.0362391471862793, + "learning_rate": 1.4507360865449318e-06, + "loss": 0.29624345898628235, + "mean_token_accuracy": 0.888127863407135, + "num_tokens": 23130756.0, + "step": 2591 + }, + { + "epoch": 1.9696048632218845, + "grad_norm": 1.5150481462478638, + "learning_rate": 1.4488354556662553e-06, + "loss": 0.3852264881134033, + "mean_token_accuracy": 0.8532775640487671, + "num_tokens": 23141597.0, + "step": 2592 + }, + { + "epoch": 1.9703647416413372, + "grad_norm": 1.5255193710327148, + "learning_rate": 1.4469355625353199e-06, + "loss": 0.37015780806541443, + "mean_token_accuracy": 0.8669752478599548, + "num_tokens": 23152487.0, + "step": 2593 + }, + { + "epoch": 1.9711246200607904, + "grad_norm": 1.1780041456222534, + "learning_rate": 1.4450364084855433e-06, + "loss": 0.34421291947364807, + "mean_token_accuracy": 0.8593694567680359, + "num_tokens": 23168769.0, + "step": 2594 + }, + { + "epoch": 1.971884498480243, + "grad_norm": 2.4549946784973145, + "learning_rate": 1.4431379948498254e-06, + "loss": 0.4000544548034668, + "mean_token_accuracy": 0.8551953434944153, + "num_tokens": 23175428.0, + "step": 2595 + }, + { + "epoch": 1.9726443768996962, + "grad_norm": 2.374192476272583, + "learning_rate": 1.4412403229605453e-06, + "loss": 0.31329840421676636, + "mean_token_accuracy": 0.8917277455329895, + "num_tokens": 23180678.0, + "step": 2596 + }, + { + "epoch": 1.9734042553191489, + "grad_norm": 1.268515706062317, + "learning_rate": 1.4393433941495638e-06, + "loss": 0.34808623790740967, + "mean_token_accuracy": 0.8726245164871216, + "num_tokens": 23194733.0, + "step": 2597 + }, + { + "epoch": 1.9741641337386018, + "grad_norm": 2.0898988246917725, + "learning_rate": 1.4374472097482156e-06, + "loss": 0.45849233865737915, + "mean_token_accuracy": 0.8414266109466553, + "num_tokens": 23202211.0, + "step": 2598 + }, + { + "epoch": 1.9749240121580547, + "grad_norm": 2.1497802734375, + "learning_rate": 1.4355517710873184e-06, + "loss": 0.4304521977901459, + "mean_token_accuracy": 0.8502874374389648, + "num_tokens": 23209623.0, + "step": 2599 + }, + { + "epoch": 1.9756838905775076, + "grad_norm": 1.821786880493164, + "learning_rate": 1.4336570794971643e-06, + "loss": 0.3910462558269501, + "mean_token_accuracy": 0.8962477445602417, + "num_tokens": 23218904.0, + "step": 2600 + }, + { + "epoch": 1.9764437689969605, + "grad_norm": 2.2523093223571777, + "learning_rate": 1.4317631363075186e-06, + "loss": 0.3456020951271057, + "mean_token_accuracy": 0.8703117370605469, + "num_tokens": 23225602.0, + "step": 2601 + }, + { + "epoch": 1.9772036474164134, + "grad_norm": 1.6920030117034912, + "learning_rate": 1.4298699428476236e-06, + "loss": 0.4629668593406677, + "mean_token_accuracy": 0.841956615447998, + "num_tokens": 23236812.0, + "step": 2602 + }, + { + "epoch": 1.9779635258358663, + "grad_norm": 1.8796344995498657, + "learning_rate": 1.427977500446199e-06, + "loss": 0.3302173316478729, + "mean_token_accuracy": 0.8769404888153076, + "num_tokens": 23245851.0, + "step": 2603 + }, + { + "epoch": 1.978723404255319, + "grad_norm": 2.4003775119781494, + "learning_rate": 1.4260858104314299e-06, + "loss": 0.48402607440948486, + "mean_token_accuracy": 0.8477497100830078, + "num_tokens": 23252429.0, + "step": 2604 + }, + { + "epoch": 1.9794832826747721, + "grad_norm": 3.576800584793091, + "learning_rate": 1.4241948741309783e-06, + "loss": 0.2943669259548187, + "mean_token_accuracy": 0.8933546543121338, + "num_tokens": 23255431.0, + "step": 2605 + }, + { + "epoch": 1.9802431610942248, + "grad_norm": 2.7589938640594482, + "learning_rate": 1.4223046928719764e-06, + "loss": 0.5138746500015259, + "mean_token_accuracy": 0.817468523979187, + "num_tokens": 23261351.0, + "step": 2606 + }, + { + "epoch": 1.981003039513678, + "grad_norm": 1.6950130462646484, + "learning_rate": 1.420415267981026e-06, + "loss": 0.2744991183280945, + "mean_token_accuracy": 0.9005721211433411, + "num_tokens": 23269482.0, + "step": 2607 + }, + { + "epoch": 1.9817629179331306, + "grad_norm": 1.5962934494018555, + "learning_rate": 1.418526600784198e-06, + "loss": 0.4629114270210266, + "mean_token_accuracy": 0.8337699174880981, + "num_tokens": 23279796.0, + "step": 2608 + }, + { + "epoch": 1.9825227963525835, + "grad_norm": 1.4962197542190552, + "learning_rate": 1.4166386926070322e-06, + "loss": 0.4217689633369446, + "mean_token_accuracy": 0.8445580005645752, + "num_tokens": 23293050.0, + "step": 2609 + }, + { + "epoch": 1.9832826747720365, + "grad_norm": 1.4243721961975098, + "learning_rate": 1.414751544774535e-06, + "loss": 0.4888152480125427, + "mean_token_accuracy": 0.8298524022102356, + "num_tokens": 23308501.0, + "step": 2610 + }, + { + "epoch": 1.9840425531914894, + "grad_norm": 1.5776121616363525, + "learning_rate": 1.412865158611179e-06, + "loss": 0.3156965970993042, + "mean_token_accuracy": 0.8773540258407593, + "num_tokens": 23317401.0, + "step": 2611 + }, + { + "epoch": 1.9848024316109423, + "grad_norm": 1.4690552949905396, + "learning_rate": 1.4109795354409045e-06, + "loss": 0.35854774713516235, + "mean_token_accuracy": 0.869156002998352, + "num_tokens": 23328891.0, + "step": 2612 + }, + { + "epoch": 1.9855623100303952, + "grad_norm": 1.5036180019378662, + "learning_rate": 1.4090946765871105e-06, + "loss": 0.3579009771347046, + "mean_token_accuracy": 0.8698509931564331, + "num_tokens": 23340473.0, + "step": 2613 + }, + { + "epoch": 1.986322188449848, + "grad_norm": 2.0811538696289062, + "learning_rate": 1.4072105833726685e-06, + "loss": 0.2905905246734619, + "mean_token_accuracy": 0.9131759405136108, + "num_tokens": 23346480.0, + "step": 2614 + }, + { + "epoch": 1.9870820668693008, + "grad_norm": 1.2866275310516357, + "learning_rate": 1.4053272571199037e-06, + "loss": 0.4091147184371948, + "mean_token_accuracy": 0.8537255525588989, + "num_tokens": 23361957.0, + "step": 2615 + }, + { + "epoch": 1.987841945288754, + "grad_norm": 1.439497470855713, + "learning_rate": 1.4034446991506084e-06, + "loss": 0.4888972342014313, + "mean_token_accuracy": 0.8451695442199707, + "num_tokens": 23374936.0, + "step": 2616 + }, + { + "epoch": 1.9886018237082066, + "grad_norm": 1.758204698562622, + "learning_rate": 1.401562910786034e-06, + "loss": 0.4976118803024292, + "mean_token_accuracy": 0.8346713781356812, + "num_tokens": 23386102.0, + "step": 2617 + }, + { + "epoch": 1.9893617021276597, + "grad_norm": 1.436486840248108, + "learning_rate": 1.3996818933468926e-06, + "loss": 0.42407113313674927, + "mean_token_accuracy": 0.8529444932937622, + "num_tokens": 23398645.0, + "step": 2618 + }, + { + "epoch": 1.9901215805471124, + "grad_norm": 2.1466588973999023, + "learning_rate": 1.397801648153354e-06, + "loss": 0.45519331097602844, + "mean_token_accuracy": 0.8460411429405212, + "num_tokens": 23406162.0, + "step": 2619 + }, + { + "epoch": 1.9908814589665653, + "grad_norm": 2.0492005348205566, + "learning_rate": 1.395922176525047e-06, + "loss": 0.31093084812164307, + "mean_token_accuracy": 0.8927264213562012, + "num_tokens": 23412051.0, + "step": 2620 + }, + { + "epoch": 1.9916413373860182, + "grad_norm": 2.2639048099517822, + "learning_rate": 1.3940434797810567e-06, + "loss": 0.3804079592227936, + "mean_token_accuracy": 0.8720212578773499, + "num_tokens": 23418252.0, + "step": 2621 + }, + { + "epoch": 1.9924012158054711, + "grad_norm": 1.9541687965393066, + "learning_rate": 1.3921655592399256e-06, + "loss": 0.38776344060897827, + "mean_token_accuracy": 0.858753502368927, + "num_tokens": 23425901.0, + "step": 2622 + }, + { + "epoch": 1.993161094224924, + "grad_norm": 1.5119032859802246, + "learning_rate": 1.3902884162196509e-06, + "loss": 0.39581215381622314, + "mean_token_accuracy": 0.8539663553237915, + "num_tokens": 23439390.0, + "step": 2623 + }, + { + "epoch": 1.993920972644377, + "grad_norm": 2.1608591079711914, + "learning_rate": 1.388412052037682e-06, + "loss": 0.41801220178604126, + "mean_token_accuracy": 0.8703387975692749, + "num_tokens": 23445725.0, + "step": 2624 + }, + { + "epoch": 1.9946808510638299, + "grad_norm": 2.463165521621704, + "learning_rate": 1.3865364680109239e-06, + "loss": 0.3252835273742676, + "mean_token_accuracy": 0.9031686186790466, + "num_tokens": 23451122.0, + "step": 2625 + }, + { + "epoch": 1.9954407294832825, + "grad_norm": 1.1901201009750366, + "learning_rate": 1.384661665455736e-06, + "loss": 0.3358447253704071, + "mean_token_accuracy": 0.8767676949501038, + "num_tokens": 23467381.0, + "step": 2626 + }, + { + "epoch": 1.9962006079027357, + "grad_norm": 1.3035757541656494, + "learning_rate": 1.3827876456879247e-06, + "loss": 0.3736562430858612, + "mean_token_accuracy": 0.849855899810791, + "num_tokens": 23482192.0, + "step": 2627 + }, + { + "epoch": 1.9969604863221884, + "grad_norm": 1.8807034492492676, + "learning_rate": 1.3809144100227483e-06, + "loss": 0.45943766832351685, + "mean_token_accuracy": 0.8456380367279053, + "num_tokens": 23495167.0, + "step": 2628 + }, + { + "epoch": 1.9977203647416415, + "grad_norm": 2.3645784854888916, + "learning_rate": 1.3790419597749198e-06, + "loss": 0.4271511435508728, + "mean_token_accuracy": 0.846099853515625, + "num_tokens": 23500790.0, + "step": 2629 + }, + { + "epoch": 1.9984802431610942, + "grad_norm": 1.8451792001724243, + "learning_rate": 1.3771702962585928e-06, + "loss": 0.38092344999313354, + "mean_token_accuracy": 0.8641276359558105, + "num_tokens": 23508845.0, + "step": 2630 + }, + { + "epoch": 1.999240121580547, + "grad_norm": 1.1115045547485352, + "learning_rate": 1.3752994207873743e-06, + "loss": 0.35954269766807556, + "mean_token_accuracy": 0.8642125129699707, + "num_tokens": 23527929.0, + "step": 2631 + }, + { + "epoch": 2.0, + "grad_norm": 1.406253457069397, + "learning_rate": 1.373429334674317e-06, + "loss": 0.33467042446136475, + "mean_token_accuracy": 0.8713197708129883, + "num_tokens": 23539356.0, + "step": 2632 + }, + { + "epoch": 2.0007598784194527, + "grad_norm": 2.8150978088378906, + "learning_rate": 1.3715600392319186e-06, + "loss": 0.22929656505584717, + "mean_token_accuracy": 0.9197485446929932, + "num_tokens": 23543746.0, + "step": 2633 + }, + { + "epoch": 2.001519756838906, + "grad_norm": 2.6291964054107666, + "learning_rate": 1.369691535772123e-06, + "loss": 0.290000855922699, + "mean_token_accuracy": 0.8979663848876953, + "num_tokens": 23548633.0, + "step": 2634 + }, + { + "epoch": 2.0022796352583585, + "grad_norm": 1.724357008934021, + "learning_rate": 1.3678238256063193e-06, + "loss": 0.3717018663883209, + "mean_token_accuracy": 0.8743406534194946, + "num_tokens": 23557187.0, + "step": 2635 + }, + { + "epoch": 2.0030395136778116, + "grad_norm": 2.3801965713500977, + "learning_rate": 1.3659569100453346e-06, + "loss": 0.3452329635620117, + "mean_token_accuracy": 0.8799462914466858, + "num_tokens": 23563321.0, + "step": 2636 + }, + { + "epoch": 2.0037993920972643, + "grad_norm": 1.8925955295562744, + "learning_rate": 1.3640907903994455e-06, + "loss": 0.32880955934524536, + "mean_token_accuracy": 0.888347864151001, + "num_tokens": 23570571.0, + "step": 2637 + }, + { + "epoch": 2.0045592705167175, + "grad_norm": 1.0761849880218506, + "learning_rate": 1.3622254679783665e-06, + "loss": 0.395224004983902, + "mean_token_accuracy": 0.8637001514434814, + "num_tokens": 23589504.0, + "step": 2638 + }, + { + "epoch": 2.00531914893617, + "grad_norm": 2.1172127723693848, + "learning_rate": 1.3603609440912508e-06, + "loss": 0.32195356488227844, + "mean_token_accuracy": 0.8984324932098389, + "num_tokens": 23595586.0, + "step": 2639 + }, + { + "epoch": 2.0060790273556233, + "grad_norm": 2.127723217010498, + "learning_rate": 1.3584972200466936e-06, + "loss": 0.4710606634616852, + "mean_token_accuracy": 0.8563182950019836, + "num_tokens": 23602747.0, + "step": 2640 + }, + { + "epoch": 2.006838905775076, + "grad_norm": 1.9752192497253418, + "learning_rate": 1.356634297152729e-06, + "loss": 0.24204617738723755, + "mean_token_accuracy": 0.9082983136177063, + "num_tokens": 23609005.0, + "step": 2641 + }, + { + "epoch": 2.007598784194529, + "grad_norm": 2.5435397624969482, + "learning_rate": 1.3547721767168273e-06, + "loss": 0.16702288389205933, + "mean_token_accuracy": 0.9353867769241333, + "num_tokens": 23612852.0, + "step": 2642 + }, + { + "epoch": 2.0083586626139818, + "grad_norm": 1.8113304376602173, + "learning_rate": 1.3529108600458967e-06, + "loss": 0.4245433509349823, + "mean_token_accuracy": 0.8446527719497681, + "num_tokens": 23621462.0, + "step": 2643 + }, + { + "epoch": 2.0091185410334345, + "grad_norm": 1.0438088178634644, + "learning_rate": 1.3510503484462807e-06, + "loss": 0.3710743188858032, + "mean_token_accuracy": 0.8731123208999634, + "num_tokens": 23642029.0, + "step": 2644 + }, + { + "epoch": 2.0098784194528876, + "grad_norm": 1.9650516510009766, + "learning_rate": 1.349190643223758e-06, + "loss": 0.32384324073791504, + "mean_token_accuracy": 0.8859044313430786, + "num_tokens": 23648970.0, + "step": 2645 + }, + { + "epoch": 2.0106382978723403, + "grad_norm": 1.4213180541992188, + "learning_rate": 1.347331745683542e-06, + "loss": 0.42391857504844666, + "mean_token_accuracy": 0.8568997383117676, + "num_tokens": 23663012.0, + "step": 2646 + }, + { + "epoch": 2.0113981762917934, + "grad_norm": 1.852386236190796, + "learning_rate": 1.3454736571302761e-06, + "loss": 0.37283188104629517, + "mean_token_accuracy": 0.9096506834030151, + "num_tokens": 23671632.0, + "step": 2647 + }, + { + "epoch": 2.012158054711246, + "grad_norm": 1.8350872993469238, + "learning_rate": 1.3436163788680411e-06, + "loss": 0.21148793399333954, + "mean_token_accuracy": 0.9306647181510925, + "num_tokens": 23678554.0, + "step": 2648 + }, + { + "epoch": 2.012917933130699, + "grad_norm": 1.8285188674926758, + "learning_rate": 1.3417599122003464e-06, + "loss": 0.2638583183288574, + "mean_token_accuracy": 0.904695987701416, + "num_tokens": 23686905.0, + "step": 2649 + }, + { + "epoch": 2.013677811550152, + "grad_norm": 1.1955424547195435, + "learning_rate": 1.3399042584301298e-06, + "loss": 0.30598434805870056, + "mean_token_accuracy": 0.8953701257705688, + "num_tokens": 23702734.0, + "step": 2650 + }, + { + "epoch": 2.014437689969605, + "grad_norm": 1.5378512144088745, + "learning_rate": 1.3380494188597603e-06, + "loss": 0.33754611015319824, + "mean_token_accuracy": 0.9063926935195923, + "num_tokens": 23715891.0, + "step": 2651 + }, + { + "epoch": 2.0151975683890577, + "grad_norm": 1.6957111358642578, + "learning_rate": 1.3361953947910394e-06, + "loss": 0.26302939653396606, + "mean_token_accuracy": 0.90192711353302, + "num_tokens": 23724034.0, + "step": 2652 + }, + { + "epoch": 2.015957446808511, + "grad_norm": 1.1756837368011475, + "learning_rate": 1.334342187525189e-06, + "loss": 0.3312695622444153, + "mean_token_accuracy": 0.870500385761261, + "num_tokens": 23741241.0, + "step": 2653 + }, + { + "epoch": 2.0167173252279635, + "grad_norm": 1.027145266532898, + "learning_rate": 1.3324897983628621e-06, + "loss": 0.2534530758857727, + "mean_token_accuracy": 0.894199550151825, + "num_tokens": 23758399.0, + "step": 2654 + }, + { + "epoch": 2.0174772036474162, + "grad_norm": 2.2585113048553467, + "learning_rate": 1.330638228604137e-06, + "loss": 0.4558389186859131, + "mean_token_accuracy": 0.8372241258621216, + "num_tokens": 23766871.0, + "step": 2655 + }, + { + "epoch": 2.0182370820668694, + "grad_norm": 1.886893630027771, + "learning_rate": 1.3287874795485168e-06, + "loss": 0.29894912242889404, + "mean_token_accuracy": 0.9086098670959473, + "num_tokens": 23774935.0, + "step": 2656 + }, + { + "epoch": 2.018996960486322, + "grad_norm": 2.082537889480591, + "learning_rate": 1.3269375524949286e-06, + "loss": 0.39323803782463074, + "mean_token_accuracy": 0.8598287105560303, + "num_tokens": 23781303.0, + "step": 2657 + }, + { + "epoch": 2.019756838905775, + "grad_norm": 1.7059803009033203, + "learning_rate": 1.3250884487417227e-06, + "loss": 0.17909850180149078, + "mean_token_accuracy": 0.9276094436645508, + "num_tokens": 23789148.0, + "step": 2658 + }, + { + "epoch": 2.020516717325228, + "grad_norm": 2.150275945663452, + "learning_rate": 1.3232401695866686e-06, + "loss": 0.3707781434059143, + "mean_token_accuracy": 0.8587700128555298, + "num_tokens": 23795484.0, + "step": 2659 + }, + { + "epoch": 2.021276595744681, + "grad_norm": 2.0554518699645996, + "learning_rate": 1.321392716326963e-06, + "loss": 0.33217954635620117, + "mean_token_accuracy": 0.874828577041626, + "num_tokens": 23802968.0, + "step": 2660 + }, + { + "epoch": 2.0220364741641337, + "grad_norm": 2.4556071758270264, + "learning_rate": 1.3195460902592193e-06, + "loss": 0.2790899872779846, + "mean_token_accuracy": 0.9071618914604187, + "num_tokens": 23807788.0, + "step": 2661 + }, + { + "epoch": 2.022796352583587, + "grad_norm": 1.7501509189605713, + "learning_rate": 1.3177002926794685e-06, + "loss": 0.3080750107765198, + "mean_token_accuracy": 0.8942672610282898, + "num_tokens": 23816023.0, + "step": 2662 + }, + { + "epoch": 2.0235562310030395, + "grad_norm": 1.3934804201126099, + "learning_rate": 1.3158553248831658e-06, + "loss": 0.286912202835083, + "mean_token_accuracy": 0.9284837245941162, + "num_tokens": 23827186.0, + "step": 2663 + }, + { + "epoch": 2.024316109422492, + "grad_norm": 1.2530465126037598, + "learning_rate": 1.3140111881651773e-06, + "loss": 0.2630627155303955, + "mean_token_accuracy": 0.9029854536056519, + "num_tokens": 23841399.0, + "step": 2664 + }, + { + "epoch": 2.0250759878419453, + "grad_norm": 1.3417384624481201, + "learning_rate": 1.312167883819791e-06, + "loss": 0.37794870138168335, + "mean_token_accuracy": 0.8722256422042847, + "num_tokens": 23856061.0, + "step": 2665 + }, + { + "epoch": 2.025835866261398, + "grad_norm": 2.234257698059082, + "learning_rate": 1.3103254131407082e-06, + "loss": 0.2739933133125305, + "mean_token_accuracy": 0.9055665135383606, + "num_tokens": 23861865.0, + "step": 2666 + }, + { + "epoch": 2.026595744680851, + "grad_norm": 1.4187006950378418, + "learning_rate": 1.308483777421046e-06, + "loss": 0.24370817840099335, + "mean_token_accuracy": 0.9145886301994324, + "num_tokens": 23873632.0, + "step": 2667 + }, + { + "epoch": 2.027355623100304, + "grad_norm": 2.3645882606506348, + "learning_rate": 1.3066429779533352e-06, + "loss": 0.23659822344779968, + "mean_token_accuracy": 0.9209753274917603, + "num_tokens": 23878866.0, + "step": 2668 + }, + { + "epoch": 2.028115501519757, + "grad_norm": 1.4782226085662842, + "learning_rate": 1.3048030160295196e-06, + "loss": 0.3353138267993927, + "mean_token_accuracy": 0.8747807741165161, + "num_tokens": 23891089.0, + "step": 2669 + }, + { + "epoch": 2.0288753799392096, + "grad_norm": 2.051754951477051, + "learning_rate": 1.3029638929409555e-06, + "loss": 0.2905973196029663, + "mean_token_accuracy": 0.887441873550415, + "num_tokens": 23897653.0, + "step": 2670 + }, + { + "epoch": 2.0296352583586628, + "grad_norm": 1.322279453277588, + "learning_rate": 1.3011256099784103e-06, + "loss": 0.3938416540622711, + "mean_token_accuracy": 0.8911079168319702, + "num_tokens": 23912525.0, + "step": 2671 + }, + { + "epoch": 2.0303951367781155, + "grad_norm": 1.87980318069458, + "learning_rate": 1.2992881684320627e-06, + "loss": 0.16637520492076874, + "mean_token_accuracy": 0.9472321271896362, + "num_tokens": 23918752.0, + "step": 2672 + }, + { + "epoch": 2.0311550151975686, + "grad_norm": 2.0867233276367188, + "learning_rate": 1.297451569591498e-06, + "loss": 0.37282776832580566, + "mean_token_accuracy": 0.8688399195671082, + "num_tokens": 23925918.0, + "step": 2673 + }, + { + "epoch": 2.0319148936170213, + "grad_norm": 1.129468560218811, + "learning_rate": 1.2956158147457116e-06, + "loss": 0.33072173595428467, + "mean_token_accuracy": 0.8788217306137085, + "num_tokens": 23944702.0, + "step": 2674 + }, + { + "epoch": 2.032674772036474, + "grad_norm": 3.6016290187835693, + "learning_rate": 1.2937809051831102e-06, + "loss": 0.28343498706817627, + "mean_token_accuracy": 0.911794900894165, + "num_tokens": 23948417.0, + "step": 2675 + }, + { + "epoch": 2.033434650455927, + "grad_norm": 1.4904811382293701, + "learning_rate": 1.2919468421915008e-06, + "loss": 0.4072638750076294, + "mean_token_accuracy": 0.8615934252738953, + "num_tokens": 23963654.0, + "step": 2676 + }, + { + "epoch": 2.0341945288753798, + "grad_norm": 2.90740704536438, + "learning_rate": 1.2901136270580994e-06, + "loss": 0.3685106635093689, + "mean_token_accuracy": 0.8923419713973999, + "num_tokens": 23968608.0, + "step": 2677 + }, + { + "epoch": 2.034954407294833, + "grad_norm": 1.8772104978561401, + "learning_rate": 1.2882812610695305e-06, + "loss": 0.2947828471660614, + "mean_token_accuracy": 0.9065762758255005, + "num_tokens": 23978298.0, + "step": 2678 + }, + { + "epoch": 2.0357142857142856, + "grad_norm": 1.2135536670684814, + "learning_rate": 1.2864497455118152e-06, + "loss": 0.36015012860298157, + "mean_token_accuracy": 0.8481813073158264, + "num_tokens": 23995784.0, + "step": 2679 + }, + { + "epoch": 2.0364741641337387, + "grad_norm": 1.941889762878418, + "learning_rate": 1.2846190816703836e-06, + "loss": 0.3004198670387268, + "mean_token_accuracy": 0.8843618631362915, + "num_tokens": 24002651.0, + "step": 2680 + }, + { + "epoch": 2.0372340425531914, + "grad_norm": 1.8905075788497925, + "learning_rate": 1.2827892708300648e-06, + "loss": 0.26640570163726807, + "mean_token_accuracy": 0.9079146385192871, + "num_tokens": 24010400.0, + "step": 2681 + }, + { + "epoch": 2.0379939209726445, + "grad_norm": 1.2975934743881226, + "learning_rate": 1.280960314275092e-06, + "loss": 0.19093887507915497, + "mean_token_accuracy": 0.9277223348617554, + "num_tokens": 24021528.0, + "step": 2682 + }, + { + "epoch": 2.038753799392097, + "grad_norm": 1.6483098268508911, + "learning_rate": 1.279132213289096e-06, + "loss": 0.29260069131851196, + "mean_token_accuracy": 0.892486572265625, + "num_tokens": 24030470.0, + "step": 2683 + }, + { + "epoch": 2.0395136778115504, + "grad_norm": 1.6875916719436646, + "learning_rate": 1.2773049691551103e-06, + "loss": 0.3784627914428711, + "mean_token_accuracy": 0.8682783842086792, + "num_tokens": 24041608.0, + "step": 2684 + }, + { + "epoch": 2.040273556231003, + "grad_norm": 2.1055848598480225, + "learning_rate": 1.2754785831555617e-06, + "loss": 0.14676237106323242, + "mean_token_accuracy": 0.9532995223999023, + "num_tokens": 24046687.0, + "step": 2685 + }, + { + "epoch": 2.0410334346504557, + "grad_norm": 1.3862961530685425, + "learning_rate": 1.273653056572282e-06, + "loss": 0.34408485889434814, + "mean_token_accuracy": 0.8748919367790222, + "num_tokens": 24059147.0, + "step": 2686 + }, + { + "epoch": 2.041793313069909, + "grad_norm": 2.936876058578491, + "learning_rate": 1.2718283906864939e-06, + "loss": 0.2471027672290802, + "mean_token_accuracy": 0.9177526235580444, + "num_tokens": 24062963.0, + "step": 2687 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 1.3992520570755005, + "learning_rate": 1.2700045867788184e-06, + "loss": 0.421109139919281, + "mean_token_accuracy": 0.8664785623550415, + "num_tokens": 24077912.0, + "step": 2688 + }, + { + "epoch": 2.0433130699088147, + "grad_norm": 3.0531985759735107, + "learning_rate": 1.2681816461292715e-06, + "loss": 0.292591392993927, + "mean_token_accuracy": 0.8992351293563843, + "num_tokens": 24082058.0, + "step": 2689 + }, + { + "epoch": 2.0440729483282674, + "grad_norm": 1.4562251567840576, + "learning_rate": 1.2663595700172631e-06, + "loss": 0.39367130398750305, + "mean_token_accuracy": 0.8894597887992859, + "num_tokens": 24093954.0, + "step": 2690 + }, + { + "epoch": 2.0448328267477205, + "grad_norm": 1.9354028701782227, + "learning_rate": 1.2645383597215965e-06, + "loss": 0.28203579783439636, + "mean_token_accuracy": 0.9011955261230469, + "num_tokens": 24100590.0, + "step": 2691 + }, + { + "epoch": 2.045592705167173, + "grad_norm": 1.5010690689086914, + "learning_rate": 1.2627180165204671e-06, + "loss": 0.3463609516620636, + "mean_token_accuracy": 0.8978298306465149, + "num_tokens": 24111104.0, + "step": 2692 + }, + { + "epoch": 2.0463525835866263, + "grad_norm": 2.585813045501709, + "learning_rate": 1.2608985416914616e-06, + "loss": 0.2142711877822876, + "mean_token_accuracy": 0.9260460138320923, + "num_tokens": 24115301.0, + "step": 2693 + }, + { + "epoch": 2.047112462006079, + "grad_norm": 2.317268133163452, + "learning_rate": 1.259079936511558e-06, + "loss": 0.14454546570777893, + "mean_token_accuracy": 0.9498077034950256, + "num_tokens": 24120295.0, + "step": 2694 + }, + { + "epoch": 2.047872340425532, + "grad_norm": 1.966550350189209, + "learning_rate": 1.257262202257124e-06, + "loss": 0.20745311677455902, + "mean_token_accuracy": 0.9157166481018066, + "num_tokens": 24127158.0, + "step": 2695 + }, + { + "epoch": 2.048632218844985, + "grad_norm": 1.6521401405334473, + "learning_rate": 1.2554453402039124e-06, + "loss": 0.2547406256198883, + "mean_token_accuracy": 0.9356101751327515, + "num_tokens": 24135620.0, + "step": 2696 + }, + { + "epoch": 2.0493920972644375, + "grad_norm": 2.341756582260132, + "learning_rate": 1.2536293516270704e-06, + "loss": 0.35540008544921875, + "mean_token_accuracy": 0.874363899230957, + "num_tokens": 24141766.0, + "step": 2697 + }, + { + "epoch": 2.0501519756838906, + "grad_norm": 1.7938716411590576, + "learning_rate": 1.251814237801128e-06, + "loss": 0.37250861525535583, + "mean_token_accuracy": 0.8644422292709351, + "num_tokens": 24149997.0, + "step": 2698 + }, + { + "epoch": 2.0509118541033433, + "grad_norm": 2.0868122577667236, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.44527092576026917, + "mean_token_accuracy": 0.8510264158248901, + "num_tokens": 24158208.0, + "step": 2699 + }, + { + "epoch": 2.0516717325227964, + "grad_norm": 2.412604808807373, + "learning_rate": 1.24818663949699e-06, + "loss": 0.19276219606399536, + "mean_token_accuracy": 0.9317681789398193, + "num_tokens": 24162905.0, + "step": 2700 + }, + { + "epoch": 2.052431610942249, + "grad_norm": 1.4488455057144165, + "learning_rate": 1.246374157564785e-06, + "loss": 0.3493705093860626, + "mean_token_accuracy": 0.9016396999359131, + "num_tokens": 24175852.0, + "step": 2701 + }, + { + "epoch": 2.0531914893617023, + "grad_norm": 2.1629185676574707, + "learning_rate": 1.2445625554754526e-06, + "loss": 0.30588388442993164, + "mean_token_accuracy": 0.8871392011642456, + "num_tokens": 24181507.0, + "step": 2702 + }, + { + "epoch": 2.053951367781155, + "grad_norm": 2.0489449501037598, + "learning_rate": 1.2427518345004459e-06, + "loss": 0.4578161835670471, + "mean_token_accuracy": 0.8498104214668274, + "num_tokens": 24191918.0, + "step": 2703 + }, + { + "epoch": 2.054711246200608, + "grad_norm": 2.063019037246704, + "learning_rate": 1.2409419959105981e-06, + "loss": 0.31680572032928467, + "mean_token_accuracy": 0.8809083700180054, + "num_tokens": 24199336.0, + "step": 2704 + }, + { + "epoch": 2.0554711246200608, + "grad_norm": 2.4594223499298096, + "learning_rate": 1.239133040976124e-06, + "loss": 0.3048282265663147, + "mean_token_accuracy": 0.8897095322608948, + "num_tokens": 24205118.0, + "step": 2705 + }, + { + "epoch": 2.056231003039514, + "grad_norm": 1.6359999179840088, + "learning_rate": 1.237324970966618e-06, + "loss": 0.4312370717525482, + "mean_token_accuracy": 0.8526142835617065, + "num_tokens": 24215792.0, + "step": 2706 + }, + { + "epoch": 2.0569908814589666, + "grad_norm": 1.5534536838531494, + "learning_rate": 1.2355177871510538e-06, + "loss": 0.3647908568382263, + "mean_token_accuracy": 0.8680631518363953, + "num_tokens": 24235325.0, + "step": 2707 + }, + { + "epoch": 2.0577507598784193, + "grad_norm": 2.4902515411376953, + "learning_rate": 1.2337114907977798e-06, + "loss": 0.3605276942253113, + "mean_token_accuracy": 0.8776376843452454, + "num_tokens": 24241502.0, + "step": 2708 + }, + { + "epoch": 2.0585106382978724, + "grad_norm": 1.7282993793487549, + "learning_rate": 1.2319060831745273e-06, + "loss": 0.38326722383499146, + "mean_token_accuracy": 0.8531644344329834, + "num_tokens": 24252665.0, + "step": 2709 + }, + { + "epoch": 2.059270516717325, + "grad_norm": 1.4213361740112305, + "learning_rate": 1.2301015655484006e-06, + "loss": 0.32221150398254395, + "mean_token_accuracy": 0.8890664577484131, + "num_tokens": 24266409.0, + "step": 2710 + }, + { + "epoch": 2.060030395136778, + "grad_norm": 2.6412453651428223, + "learning_rate": 1.2282979391858767e-06, + "loss": 0.20225220918655396, + "mean_token_accuracy": 0.9287782311439514, + "num_tokens": 24271069.0, + "step": 2711 + }, + { + "epoch": 2.060790273556231, + "grad_norm": 3.2601654529571533, + "learning_rate": 1.2264952053528145e-06, + "loss": 0.23259003460407257, + "mean_token_accuracy": 0.9290606379508972, + "num_tokens": 24274992.0, + "step": 2712 + }, + { + "epoch": 2.061550151975684, + "grad_norm": 1.6633410453796387, + "learning_rate": 1.2246933653144386e-06, + "loss": 0.355314165353775, + "mean_token_accuracy": 0.870380163192749, + "num_tokens": 24284917.0, + "step": 2713 + }, + { + "epoch": 2.0623100303951367, + "grad_norm": 2.9081318378448486, + "learning_rate": 1.2228924203353507e-06, + "loss": 0.38050833344459534, + "mean_token_accuracy": 0.8879997730255127, + "num_tokens": 24289694.0, + "step": 2714 + }, + { + "epoch": 2.06306990881459, + "grad_norm": 3.2404227256774902, + "learning_rate": 1.2210923716795233e-06, + "loss": 0.2502570152282715, + "mean_token_accuracy": 0.9150978922843933, + "num_tokens": 24293254.0, + "step": 2715 + }, + { + "epoch": 2.0638297872340425, + "grad_norm": 1.9262174367904663, + "learning_rate": 1.2192932206103e-06, + "loss": 0.26763200759887695, + "mean_token_accuracy": 0.9203122854232788, + "num_tokens": 24300881.0, + "step": 2716 + }, + { + "epoch": 2.0645896656534957, + "grad_norm": 1.6790109872817993, + "learning_rate": 1.2174949683903943e-06, + "loss": 0.22275440394878387, + "mean_token_accuracy": 0.9212621450424194, + "num_tokens": 24309288.0, + "step": 2717 + }, + { + "epoch": 2.0653495440729484, + "grad_norm": 1.8272414207458496, + "learning_rate": 1.2156976162818895e-06, + "loss": 0.3183424472808838, + "mean_token_accuracy": 0.8813169002532959, + "num_tokens": 24316980.0, + "step": 2718 + }, + { + "epoch": 2.066109422492401, + "grad_norm": 2.7388651371002197, + "learning_rate": 1.2139011655462338e-06, + "loss": 0.24794816970825195, + "mean_token_accuracy": 0.9109550714492798, + "num_tokens": 24321867.0, + "step": 2719 + }, + { + "epoch": 2.066869300911854, + "grad_norm": 1.4866925477981567, + "learning_rate": 1.2121056174442484e-06, + "loss": 0.24177205562591553, + "mean_token_accuracy": 0.9102780818939209, + "num_tokens": 24332874.0, + "step": 2720 + }, + { + "epoch": 2.067629179331307, + "grad_norm": 1.6006059646606445, + "learning_rate": 1.2103109732361178e-06, + "loss": 0.29220807552337646, + "mean_token_accuracy": 0.8947570323944092, + "num_tokens": 24342790.0, + "step": 2721 + }, + { + "epoch": 2.06838905775076, + "grad_norm": 2.2688677310943604, + "learning_rate": 1.208517234181391e-06, + "loss": 0.39247143268585205, + "mean_token_accuracy": 0.8514304161071777, + "num_tokens": 24349329.0, + "step": 2722 + }, + { + "epoch": 2.0691489361702127, + "grad_norm": 2.404534339904785, + "learning_rate": 1.2067244015389829e-06, + "loss": 0.4461793303489685, + "mean_token_accuracy": 0.8531662821769714, + "num_tokens": 24356287.0, + "step": 2723 + }, + { + "epoch": 2.069908814589666, + "grad_norm": 1.813341498374939, + "learning_rate": 1.204932476567175e-06, + "loss": 0.38300177454948425, + "mean_token_accuracy": 0.8597674369812012, + "num_tokens": 24366181.0, + "step": 2724 + }, + { + "epoch": 2.0706686930091185, + "grad_norm": 3.49125337600708, + "learning_rate": 1.2031414605236066e-06, + "loss": 0.33281540870666504, + "mean_token_accuracy": 0.8774969577789307, + "num_tokens": 24370362.0, + "step": 2725 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 1.7682114839553833, + "learning_rate": 1.2013513546652827e-06, + "loss": 0.3001813590526581, + "mean_token_accuracy": 0.8840254545211792, + "num_tokens": 24380469.0, + "step": 2726 + }, + { + "epoch": 2.0721884498480243, + "grad_norm": 2.3688952922821045, + "learning_rate": 1.1995621602485685e-06, + "loss": 0.20055249333381653, + "mean_token_accuracy": 0.9246129989624023, + "num_tokens": 24385474.0, + "step": 2727 + }, + { + "epoch": 2.072948328267477, + "grad_norm": 2.3368382453918457, + "learning_rate": 1.1977738785291894e-06, + "loss": 0.18379954993724823, + "mean_token_accuracy": 0.9385529160499573, + "num_tokens": 24390002.0, + "step": 2728 + }, + { + "epoch": 2.07370820668693, + "grad_norm": 1.857473373413086, + "learning_rate": 1.1959865107622306e-06, + "loss": 0.4606894552707672, + "mean_token_accuracy": 0.8437427282333374, + "num_tokens": 24400880.0, + "step": 2729 + }, + { + "epoch": 2.074468085106383, + "grad_norm": 1.2714136838912964, + "learning_rate": 1.1942000582021355e-06, + "loss": 0.21171459555625916, + "mean_token_accuracy": 0.9216019511222839, + "num_tokens": 24413113.0, + "step": 2730 + }, + { + "epoch": 2.075227963525836, + "grad_norm": 2.2025210857391357, + "learning_rate": 1.1924145221027048e-06, + "loss": 0.44211941957473755, + "mean_token_accuracy": 0.8538386821746826, + "num_tokens": 24420504.0, + "step": 2731 + }, + { + "epoch": 2.0759878419452886, + "grad_norm": 1.6706589460372925, + "learning_rate": 1.190629903717097e-06, + "loss": 0.35163265466690063, + "mean_token_accuracy": 0.8716240525245667, + "num_tokens": 24430203.0, + "step": 2732 + }, + { + "epoch": 2.0767477203647418, + "grad_norm": 2.299182176589966, + "learning_rate": 1.1888462042978268e-06, + "loss": 0.30983975529670715, + "mean_token_accuracy": 0.8859797716140747, + "num_tokens": 24437387.0, + "step": 2733 + }, + { + "epoch": 2.0775075987841944, + "grad_norm": 2.975123167037964, + "learning_rate": 1.1870634250967606e-06, + "loss": 0.23585952818393707, + "mean_token_accuracy": 0.9167368412017822, + "num_tokens": 24441176.0, + "step": 2734 + }, + { + "epoch": 2.0782674772036476, + "grad_norm": 1.1052464246749878, + "learning_rate": 1.1852815673651246e-06, + "loss": 0.24136316776275635, + "mean_token_accuracy": 0.8897353410720825, + "num_tokens": 24457092.0, + "step": 2735 + }, + { + "epoch": 2.0790273556231003, + "grad_norm": 1.5531870126724243, + "learning_rate": 1.1835006323534926e-06, + "loss": 0.302223265171051, + "mean_token_accuracy": 0.8940514326095581, + "num_tokens": 24467643.0, + "step": 2736 + }, + { + "epoch": 2.0797872340425534, + "grad_norm": 1.706140398979187, + "learning_rate": 1.1817206213117943e-06, + "loss": 0.39235255122184753, + "mean_token_accuracy": 0.8615218997001648, + "num_tokens": 24477715.0, + "step": 2737 + }, + { + "epoch": 2.080547112462006, + "grad_norm": 2.1109750270843506, + "learning_rate": 1.1799415354893103e-06, + "loss": 0.2526751756668091, + "mean_token_accuracy": 0.9108465909957886, + "num_tokens": 24484248.0, + "step": 2738 + }, + { + "epoch": 2.0813069908814588, + "grad_norm": 1.9943277835845947, + "learning_rate": 1.178163376134671e-06, + "loss": 0.3540172874927521, + "mean_token_accuracy": 0.9131139516830444, + "num_tokens": 24492207.0, + "step": 2739 + }, + { + "epoch": 2.082066869300912, + "grad_norm": 1.9536099433898926, + "learning_rate": 1.1763861444958573e-06, + "loss": 0.3902950584888458, + "mean_token_accuracy": 0.8611530065536499, + "num_tokens": 24501567.0, + "step": 2740 + }, + { + "epoch": 2.0828267477203646, + "grad_norm": 3.146925926208496, + "learning_rate": 1.1746098418201987e-06, + "loss": 0.43440669775009155, + "mean_token_accuracy": 0.8709320425987244, + "num_tokens": 24506684.0, + "step": 2741 + }, + { + "epoch": 2.0835866261398177, + "grad_norm": 2.763427495956421, + "learning_rate": 1.172834469354373e-06, + "loss": 0.3513452410697937, + "mean_token_accuracy": 0.8774256110191345, + "num_tokens": 24511509.0, + "step": 2742 + }, + { + "epoch": 2.0843465045592704, + "grad_norm": 2.773829221725464, + "learning_rate": 1.1710600283444048e-06, + "loss": 0.24668049812316895, + "mean_token_accuracy": 0.9146889448165894, + "num_tokens": 24516030.0, + "step": 2743 + }, + { + "epoch": 2.0851063829787235, + "grad_norm": 1.666471242904663, + "learning_rate": 1.169286520035666e-06, + "loss": 0.36206915974617004, + "mean_token_accuracy": 0.8711973428726196, + "num_tokens": 24526656.0, + "step": 2744 + }, + { + "epoch": 2.085866261398176, + "grad_norm": 2.818890333175659, + "learning_rate": 1.1675139456728702e-06, + "loss": 0.32967281341552734, + "mean_token_accuracy": 0.880983829498291, + "num_tokens": 24531625.0, + "step": 2745 + }, + { + "epoch": 2.0866261398176293, + "grad_norm": 1.09058678150177, + "learning_rate": 1.1657423065000811e-06, + "loss": 0.36224377155303955, + "mean_token_accuracy": 0.8708326816558838, + "num_tokens": 24557123.0, + "step": 2746 + }, + { + "epoch": 2.087386018237082, + "grad_norm": 1.1434987783432007, + "learning_rate": 1.1639716037607036e-06, + "loss": 0.26490458846092224, + "mean_token_accuracy": 0.9131897687911987, + "num_tokens": 24573223.0, + "step": 2747 + }, + { + "epoch": 2.088145896656535, + "grad_norm": 2.437505006790161, + "learning_rate": 1.1622018386974829e-06, + "loss": 0.18964408338069916, + "mean_token_accuracy": 0.9271818399429321, + "num_tokens": 24578306.0, + "step": 2748 + }, + { + "epoch": 2.088905775075988, + "grad_norm": 1.797308325767517, + "learning_rate": 1.160433012552508e-06, + "loss": 0.3090781569480896, + "mean_token_accuracy": 0.8960750102996826, + "num_tokens": 24587562.0, + "step": 2749 + }, + { + "epoch": 2.0896656534954405, + "grad_norm": 2.4050841331481934, + "learning_rate": 1.1586651265672122e-06, + "loss": 0.4001041054725647, + "mean_token_accuracy": 0.8588370084762573, + "num_tokens": 24594223.0, + "step": 2750 + }, + { + "epoch": 2.0904255319148937, + "grad_norm": 1.8757156133651733, + "learning_rate": 1.1568981819823636e-06, + "loss": 0.37845075130462646, + "mean_token_accuracy": 0.866146445274353, + "num_tokens": 24602556.0, + "step": 2751 + }, + { + "epoch": 2.0911854103343464, + "grad_norm": 1.8205114603042603, + "learning_rate": 1.1551321800380722e-06, + "loss": 0.24738016724586487, + "mean_token_accuracy": 0.923284113407135, + "num_tokens": 24611627.0, + "step": 2752 + }, + { + "epoch": 2.0919452887537995, + "grad_norm": 2.107512950897217, + "learning_rate": 1.153367121973786e-06, + "loss": 0.3062688410282135, + "mean_token_accuracy": 0.8909003734588623, + "num_tokens": 24619569.0, + "step": 2753 + }, + { + "epoch": 2.092705167173252, + "grad_norm": 1.93110191822052, + "learning_rate": 1.1516030090282915e-06, + "loss": 0.38658422231674194, + "mean_token_accuracy": 0.869437038898468, + "num_tokens": 24628869.0, + "step": 2754 + }, + { + "epoch": 2.0934650455927053, + "grad_norm": 2.3618004322052, + "learning_rate": 1.1498398424397106e-06, + "loss": 0.19193072617053986, + "mean_token_accuracy": 0.9329519271850586, + "num_tokens": 24633724.0, + "step": 2755 + }, + { + "epoch": 2.094224924012158, + "grad_norm": 2.274510622024536, + "learning_rate": 1.1480776234455024e-06, + "loss": 0.24939998984336853, + "mean_token_accuracy": 0.9104958772659302, + "num_tokens": 24642762.0, + "step": 2756 + }, + { + "epoch": 2.094984802431611, + "grad_norm": 1.7468934059143066, + "learning_rate": 1.1463163532824572e-06, + "loss": 0.3876607418060303, + "mean_token_accuracy": 0.8540539145469666, + "num_tokens": 24652138.0, + "step": 2757 + }, + { + "epoch": 2.095744680851064, + "grad_norm": 2.905381441116333, + "learning_rate": 1.1445560331867054e-06, + "loss": 0.33666878938674927, + "mean_token_accuracy": 0.8805598616600037, + "num_tokens": 24656612.0, + "step": 2758 + }, + { + "epoch": 2.096504559270517, + "grad_norm": 1.5513007640838623, + "learning_rate": 1.142796664393707e-06, + "loss": 0.25168463587760925, + "mean_token_accuracy": 0.925534725189209, + "num_tokens": 24667132.0, + "step": 2759 + }, + { + "epoch": 2.0972644376899696, + "grad_norm": 1.6804249286651611, + "learning_rate": 1.141038248138253e-06, + "loss": 0.3862859010696411, + "mean_token_accuracy": 0.8686253428459167, + "num_tokens": 24679274.0, + "step": 2760 + }, + { + "epoch": 2.0980243161094223, + "grad_norm": 1.7432880401611328, + "learning_rate": 1.1392807856544682e-06, + "loss": 0.3200700879096985, + "mean_token_accuracy": 0.9188123941421509, + "num_tokens": 24688628.0, + "step": 2761 + }, + { + "epoch": 2.0987841945288754, + "grad_norm": 1.8734468221664429, + "learning_rate": 1.1375242781758077e-06, + "loss": 0.34758424758911133, + "mean_token_accuracy": 0.8724187016487122, + "num_tokens": 24698159.0, + "step": 2762 + }, + { + "epoch": 2.099544072948328, + "grad_norm": 3.7156829833984375, + "learning_rate": 1.1357687269350564e-06, + "loss": 0.30014732480049133, + "mean_token_accuracy": 0.9021577835083008, + "num_tokens": 24701797.0, + "step": 2763 + }, + { + "epoch": 2.1003039513677813, + "grad_norm": 1.5196985006332397, + "learning_rate": 1.1340141331643276e-06, + "loss": 0.45747464895248413, + "mean_token_accuracy": 0.839891791343689, + "num_tokens": 24716468.0, + "step": 2764 + }, + { + "epoch": 2.101063829787234, + "grad_norm": 1.978009581565857, + "learning_rate": 1.132260498095062e-06, + "loss": 0.3130183815956116, + "mean_token_accuracy": 0.90610271692276, + "num_tokens": 24723211.0, + "step": 2765 + }, + { + "epoch": 2.101823708206687, + "grad_norm": 1.5883251428604126, + "learning_rate": 1.1305078229580294e-06, + "loss": 0.30493029952049255, + "mean_token_accuracy": 0.8889745473861694, + "num_tokens": 24733839.0, + "step": 2766 + }, + { + "epoch": 2.1025835866261398, + "grad_norm": 1.2397783994674683, + "learning_rate": 1.128756108983325e-06, + "loss": 0.2606407105922699, + "mean_token_accuracy": 0.9061247110366821, + "num_tokens": 24747488.0, + "step": 2767 + }, + { + "epoch": 2.103343465045593, + "grad_norm": 1.3046784400939941, + "learning_rate": 1.1270053574003658e-06, + "loss": 0.38750404119491577, + "mean_token_accuracy": 0.8777017593383789, + "num_tokens": 24763893.0, + "step": 2768 + }, + { + "epoch": 2.1041033434650456, + "grad_norm": 1.499266266822815, + "learning_rate": 1.1252555694379005e-06, + "loss": 0.4804937243461609, + "mean_token_accuracy": 0.8344086408615112, + "num_tokens": 24779323.0, + "step": 2769 + }, + { + "epoch": 2.1048632218844983, + "grad_norm": 1.211094856262207, + "learning_rate": 1.123506746323997e-06, + "loss": 0.3579246997833252, + "mean_token_accuracy": 0.8705919981002808, + "num_tokens": 24794965.0, + "step": 2770 + }, + { + "epoch": 2.1056231003039514, + "grad_norm": 2.490551471710205, + "learning_rate": 1.1217588892860446e-06, + "loss": 0.4084790349006653, + "mean_token_accuracy": 0.8553222417831421, + "num_tokens": 24800614.0, + "step": 2771 + }, + { + "epoch": 2.106382978723404, + "grad_norm": 1.5249632596969604, + "learning_rate": 1.1200119995507572e-06, + "loss": 0.36853182315826416, + "mean_token_accuracy": 0.8847414255142212, + "num_tokens": 24812886.0, + "step": 2772 + }, + { + "epoch": 2.107142857142857, + "grad_norm": 1.8510968685150146, + "learning_rate": 1.1182660783441719e-06, + "loss": 0.2918103337287903, + "mean_token_accuracy": 0.8898224830627441, + "num_tokens": 24821545.0, + "step": 2773 + }, + { + "epoch": 2.10790273556231, + "grad_norm": 1.7721803188323975, + "learning_rate": 1.11652112689164e-06, + "loss": 0.2920452654361725, + "mean_token_accuracy": 0.8879085779190063, + "num_tokens": 24831526.0, + "step": 2774 + }, + { + "epoch": 2.108662613981763, + "grad_norm": 1.3987336158752441, + "learning_rate": 1.1147771464178378e-06, + "loss": 0.4407062828540802, + "mean_token_accuracy": 0.8472493886947632, + "num_tokens": 24845847.0, + "step": 2775 + }, + { + "epoch": 2.1094224924012157, + "grad_norm": 1.8927375078201294, + "learning_rate": 1.1130341381467569e-06, + "loss": 0.36293038725852966, + "mean_token_accuracy": 0.8881135582923889, + "num_tokens": 24854760.0, + "step": 2776 + }, + { + "epoch": 2.110182370820669, + "grad_norm": 3.0480666160583496, + "learning_rate": 1.111292103301708e-06, + "loss": 0.30395108461380005, + "mean_token_accuracy": 0.9036306142807007, + "num_tokens": 24859051.0, + "step": 2777 + }, + { + "epoch": 2.1109422492401215, + "grad_norm": 1.5833618640899658, + "learning_rate": 1.1095510431053176e-06, + "loss": 0.26424330472946167, + "mean_token_accuracy": 0.9020674824714661, + "num_tokens": 24869853.0, + "step": 2778 + }, + { + "epoch": 2.1117021276595747, + "grad_norm": 1.645459532737732, + "learning_rate": 1.1078109587795311e-06, + "loss": 0.3563994765281677, + "mean_token_accuracy": 0.8732106685638428, + "num_tokens": 24880184.0, + "step": 2779 + }, + { + "epoch": 2.1124620060790273, + "grad_norm": 2.2964093685150146, + "learning_rate": 1.1060718515456022e-06, + "loss": 0.19739922881126404, + "mean_token_accuracy": 0.9273765087127686, + "num_tokens": 24885398.0, + "step": 2780 + }, + { + "epoch": 2.11322188449848, + "grad_norm": 2.094024181365967, + "learning_rate": 1.1043337226241075e-06, + "loss": 0.3321923315525055, + "mean_token_accuracy": 0.8865819573402405, + "num_tokens": 24893908.0, + "step": 2781 + }, + { + "epoch": 2.113981762917933, + "grad_norm": 1.9787025451660156, + "learning_rate": 1.1025965732349318e-06, + "loss": 0.37631168961524963, + "mean_token_accuracy": 0.8808693885803223, + "num_tokens": 24901270.0, + "step": 2782 + }, + { + "epoch": 2.114741641337386, + "grad_norm": 2.376060724258423, + "learning_rate": 1.100860404597271e-06, + "loss": 0.2591894268989563, + "mean_token_accuracy": 0.9174780249595642, + "num_tokens": 24906578.0, + "step": 2783 + }, + { + "epoch": 2.115501519756839, + "grad_norm": 1.0967903137207031, + "learning_rate": 1.0991252179296389e-06, + "loss": 0.26626938581466675, + "mean_token_accuracy": 0.9305505752563477, + "num_tokens": 24922329.0, + "step": 2784 + }, + { + "epoch": 2.1162613981762917, + "grad_norm": 3.3701183795928955, + "learning_rate": 1.0973910144498534e-06, + "loss": 0.2710079848766327, + "mean_token_accuracy": 0.9095271825790405, + "num_tokens": 24925777.0, + "step": 2785 + }, + { + "epoch": 2.117021276595745, + "grad_norm": 1.636264681816101, + "learning_rate": 1.0956577953750461e-06, + "loss": 0.2995981276035309, + "mean_token_accuracy": 0.8988568782806396, + "num_tokens": 24934230.0, + "step": 2786 + }, + { + "epoch": 2.1177811550151975, + "grad_norm": 2.3107731342315674, + "learning_rate": 1.093925561921657e-06, + "loss": 0.3424459397792816, + "mean_token_accuracy": 0.9100210070610046, + "num_tokens": 24939830.0, + "step": 2787 + }, + { + "epoch": 2.1185410334346506, + "grad_norm": 1.814764380455017, + "learning_rate": 1.0921943153054343e-06, + "loss": 0.3182154893875122, + "mean_token_accuracy": 0.883027195930481, + "num_tokens": 24947764.0, + "step": 2788 + }, + { + "epoch": 2.1193009118541033, + "grad_norm": 1.693555235862732, + "learning_rate": 1.0904640567414332e-06, + "loss": 0.3685447573661804, + "mean_token_accuracy": 0.8900846242904663, + "num_tokens": 24957680.0, + "step": 2789 + }, + { + "epoch": 2.1200607902735564, + "grad_norm": 1.0726022720336914, + "learning_rate": 1.088734787444017e-06, + "loss": 0.28461548686027527, + "mean_token_accuracy": 0.9026681184768677, + "num_tokens": 24975181.0, + "step": 2790 + }, + { + "epoch": 2.120820668693009, + "grad_norm": 1.3013874292373657, + "learning_rate": 1.0870065086268506e-06, + "loss": 0.28222548961639404, + "mean_token_accuracy": 0.9041857719421387, + "num_tokens": 24993211.0, + "step": 2791 + }, + { + "epoch": 2.121580547112462, + "grad_norm": 2.592106580734253, + "learning_rate": 1.085279221502909e-06, + "loss": 0.31733593344688416, + "mean_token_accuracy": 0.90151047706604, + "num_tokens": 24998151.0, + "step": 2792 + }, + { + "epoch": 2.122340425531915, + "grad_norm": 2.649210214614868, + "learning_rate": 1.0835529272844694e-06, + "loss": 0.341595321893692, + "mean_token_accuracy": 0.8989696502685547, + "num_tokens": 25003399.0, + "step": 2793 + }, + { + "epoch": 2.1231003039513676, + "grad_norm": 2.376619577407837, + "learning_rate": 1.0818276271831094e-06, + "loss": 0.2770065665245056, + "mean_token_accuracy": 0.8967875242233276, + "num_tokens": 25009686.0, + "step": 2794 + }, + { + "epoch": 2.1238601823708207, + "grad_norm": 2.1539604663848877, + "learning_rate": 1.080103322409711e-06, + "loss": 0.37501147389411926, + "mean_token_accuracy": 0.8768513202667236, + "num_tokens": 25016339.0, + "step": 2795 + }, + { + "epoch": 2.1246200607902734, + "grad_norm": 2.5727670192718506, + "learning_rate": 1.0783800141744607e-06, + "loss": 0.31852903962135315, + "mean_token_accuracy": 0.8897477388381958, + "num_tokens": 25021410.0, + "step": 2796 + }, + { + "epoch": 2.1253799392097266, + "grad_norm": 2.1428916454315186, + "learning_rate": 1.0766577036868395e-06, + "loss": 0.2348000407218933, + "mean_token_accuracy": 0.9012142419815063, + "num_tokens": 25027375.0, + "step": 2797 + }, + { + "epoch": 2.1261398176291793, + "grad_norm": 2.4231064319610596, + "learning_rate": 1.074936392155631e-06, + "loss": 0.30580806732177734, + "mean_token_accuracy": 0.8963108658790588, + "num_tokens": 25033211.0, + "step": 2798 + }, + { + "epoch": 2.1268996960486324, + "grad_norm": 2.1027259826660156, + "learning_rate": 1.073216080788921e-06, + "loss": 0.2508814334869385, + "mean_token_accuracy": 0.9095165729522705, + "num_tokens": 25040316.0, + "step": 2799 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 1.6513079404830933, + "learning_rate": 1.0714967707940876e-06, + "loss": 0.40694183111190796, + "mean_token_accuracy": 0.8895826935768127, + "num_tokens": 25054978.0, + "step": 2800 + }, + { + "epoch": 2.128419452887538, + "grad_norm": 2.0551133155822754, + "learning_rate": 1.0697784633778093e-06, + "loss": 0.3452662229537964, + "mean_token_accuracy": 0.8710684776306152, + "num_tokens": 25062755.0, + "step": 2801 + }, + { + "epoch": 2.129179331306991, + "grad_norm": 2.1780688762664795, + "learning_rate": 1.0680611597460607e-06, + "loss": 0.2918209135532379, + "mean_token_accuracy": 0.8689337968826294, + "num_tokens": 25069453.0, + "step": 2802 + }, + { + "epoch": 2.1299392097264436, + "grad_norm": 1.7905635833740234, + "learning_rate": 1.0663448611041114e-06, + "loss": 0.3535313308238983, + "mean_token_accuracy": 0.8762770295143127, + "num_tokens": 25080004.0, + "step": 2803 + }, + { + "epoch": 2.1306990881458967, + "grad_norm": 1.6187241077423096, + "learning_rate": 1.0646295686565258e-06, + "loss": 0.3042716681957245, + "mean_token_accuracy": 0.884156346321106, + "num_tokens": 25089652.0, + "step": 2804 + }, + { + "epoch": 2.1314589665653494, + "grad_norm": 2.667459011077881, + "learning_rate": 1.0629152836071633e-06, + "loss": 0.3904019892215729, + "mean_token_accuracy": 0.8603606224060059, + "num_tokens": 25095556.0, + "step": 2805 + }, + { + "epoch": 2.1322188449848025, + "grad_norm": 1.4227970838546753, + "learning_rate": 1.0612020071591722e-06, + "loss": 0.3765299320220947, + "mean_token_accuracy": 0.8655093908309937, + "num_tokens": 25108963.0, + "step": 2806 + }, + { + "epoch": 2.132978723404255, + "grad_norm": 2.262726068496704, + "learning_rate": 1.0594897405149994e-06, + "loss": 0.2727298140525818, + "mean_token_accuracy": 0.9005513191223145, + "num_tokens": 25115135.0, + "step": 2807 + }, + { + "epoch": 2.1337386018237083, + "grad_norm": 2.0810186862945557, + "learning_rate": 1.0577784848763773e-06, + "loss": 0.4001343250274658, + "mean_token_accuracy": 0.8537896871566772, + "num_tokens": 25123079.0, + "step": 2808 + }, + { + "epoch": 2.134498480243161, + "grad_norm": 1.6573376655578613, + "learning_rate": 1.0560682414443315e-06, + "loss": 0.4197486340999603, + "mean_token_accuracy": 0.8549862504005432, + "num_tokens": 25135398.0, + "step": 2809 + }, + { + "epoch": 2.135258358662614, + "grad_norm": 2.200150489807129, + "learning_rate": 1.0543590114191768e-06, + "loss": 0.32026296854019165, + "mean_token_accuracy": 0.8797904253005981, + "num_tokens": 25141382.0, + "step": 2810 + }, + { + "epoch": 2.136018237082067, + "grad_norm": 2.678558111190796, + "learning_rate": 1.0526507960005164e-06, + "loss": 0.30048054456710815, + "mean_token_accuracy": 0.8849201202392578, + "num_tokens": 25146235.0, + "step": 2811 + }, + { + "epoch": 2.13677811550152, + "grad_norm": 1.5207500457763672, + "learning_rate": 1.0509435963872422e-06, + "loss": 0.3706427216529846, + "mean_token_accuracy": 0.8740214109420776, + "num_tokens": 25157108.0, + "step": 2812 + }, + { + "epoch": 2.1375379939209727, + "grad_norm": 1.4632720947265625, + "learning_rate": 1.049237413777532e-06, + "loss": 0.27156776189804077, + "mean_token_accuracy": 0.8950715661048889, + "num_tokens": 25167937.0, + "step": 2813 + }, + { + "epoch": 2.1382978723404253, + "grad_norm": 2.101048469543457, + "learning_rate": 1.0475322493688506e-06, + "loss": 0.366736501455307, + "mean_token_accuracy": 0.8700850009918213, + "num_tokens": 25177043.0, + "step": 2814 + }, + { + "epoch": 2.1390577507598785, + "grad_norm": 2.54221248626709, + "learning_rate": 1.0458281043579482e-06, + "loss": 0.20383943617343903, + "mean_token_accuracy": 0.9226665496826172, + "num_tokens": 25182105.0, + "step": 2815 + }, + { + "epoch": 2.139817629179331, + "grad_norm": 1.7742674350738525, + "learning_rate": 1.04412497994086e-06, + "loss": 0.26852455735206604, + "mean_token_accuracy": 0.8987031579017639, + "num_tokens": 25190178.0, + "step": 2816 + }, + { + "epoch": 2.1405775075987843, + "grad_norm": 3.2856075763702393, + "learning_rate": 1.0424228773129019e-06, + "loss": 0.24643859267234802, + "mean_token_accuracy": 0.9189155101776123, + "num_tokens": 25194105.0, + "step": 2817 + }, + { + "epoch": 2.141337386018237, + "grad_norm": 3.374311923980713, + "learning_rate": 1.0407217976686777e-06, + "loss": 0.2575511336326599, + "mean_token_accuracy": 0.9143530130386353, + "num_tokens": 25197787.0, + "step": 2818 + }, + { + "epoch": 2.14209726443769, + "grad_norm": 1.4967217445373535, + "learning_rate": 1.03902174220207e-06, + "loss": 0.3054750859737396, + "mean_token_accuracy": 0.8989205360412598, + "num_tokens": 25209150.0, + "step": 2819 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 2.654459238052368, + "learning_rate": 1.0373227121062423e-06, + "loss": 0.27398061752319336, + "mean_token_accuracy": 0.9181102514266968, + "num_tokens": 25214015.0, + "step": 2820 + }, + { + "epoch": 2.143617021276596, + "grad_norm": 1.3205828666687012, + "learning_rate": 1.0356247085736388e-06, + "loss": 0.4085468053817749, + "mean_token_accuracy": 0.8745299577713013, + "num_tokens": 25230588.0, + "step": 2821 + }, + { + "epoch": 2.1443768996960486, + "grad_norm": 1.6965736150741577, + "learning_rate": 1.0339277327959863e-06, + "loss": 0.27269643545150757, + "mean_token_accuracy": 0.9001271724700928, + "num_tokens": 25239298.0, + "step": 2822 + }, + { + "epoch": 2.1451367781155017, + "grad_norm": 2.789114236831665, + "learning_rate": 1.0322317859642852e-06, + "loss": 0.2319176197052002, + "mean_token_accuracy": 0.9237110614776611, + "num_tokens": 25243286.0, + "step": 2823 + }, + { + "epoch": 2.1458966565349544, + "grad_norm": 1.8817718029022217, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.2917990982532501, + "mean_token_accuracy": 0.9211062788963318, + "num_tokens": 25250575.0, + "step": 2824 + }, + { + "epoch": 2.146656534954407, + "grad_norm": 2.1824984550476074, + "learning_rate": 1.0288429838991405e-06, + "loss": 0.39010798931121826, + "mean_token_accuracy": 0.8887852430343628, + "num_tokens": 25257947.0, + "step": 2825 + }, + { + "epoch": 2.1474164133738602, + "grad_norm": 1.302579641342163, + "learning_rate": 1.0271501310440882e-06, + "loss": 0.3511282503604889, + "mean_token_accuracy": 0.8728797435760498, + "num_tokens": 25272846.0, + "step": 2826 + }, + { + "epoch": 2.148176291793313, + "grad_norm": 1.691807746887207, + "learning_rate": 1.0254583118917699e-06, + "loss": 0.34246695041656494, + "mean_token_accuracy": 0.8743435144424438, + "num_tokens": 25283004.0, + "step": 2827 + }, + { + "epoch": 2.148936170212766, + "grad_norm": 1.2483569383621216, + "learning_rate": 1.0237675276295709e-06, + "loss": 0.3346659243106842, + "mean_token_accuracy": 0.8823951482772827, + "num_tokens": 25297786.0, + "step": 2828 + }, + { + "epoch": 2.1496960486322187, + "grad_norm": 3.7242841720581055, + "learning_rate": 1.022077779444145e-06, + "loss": 0.25516486167907715, + "mean_token_accuracy": 0.9189130663871765, + "num_tokens": 25301524.0, + "step": 2829 + }, + { + "epoch": 2.150455927051672, + "grad_norm": 2.5851144790649414, + "learning_rate": 1.020389068521426e-06, + "loss": 0.3543069362640381, + "mean_token_accuracy": 0.8942399621009827, + "num_tokens": 25307277.0, + "step": 2830 + }, + { + "epoch": 2.1512158054711246, + "grad_norm": 1.3453631401062012, + "learning_rate": 1.018701396046616e-06, + "loss": 0.2900702953338623, + "mean_token_accuracy": 0.8847548365592957, + "num_tokens": 25321366.0, + "step": 2831 + }, + { + "epoch": 2.1519756838905777, + "grad_norm": 1.6905686855316162, + "learning_rate": 1.0170147632041858e-06, + "loss": 0.24844832718372345, + "mean_token_accuracy": 0.9167388677597046, + "num_tokens": 25328916.0, + "step": 2832 + }, + { + "epoch": 2.1527355623100304, + "grad_norm": 2.6469411849975586, + "learning_rate": 1.0153291711778825e-06, + "loss": 0.18566903471946716, + "mean_token_accuracy": 0.9346771836280823, + "num_tokens": 25332871.0, + "step": 2833 + }, + { + "epoch": 2.1534954407294835, + "grad_norm": 1.3880906105041504, + "learning_rate": 1.0136446211507175e-06, + "loss": 0.37413570284843445, + "mean_token_accuracy": 0.8685535788536072, + "num_tokens": 25347447.0, + "step": 2834 + }, + { + "epoch": 2.154255319148936, + "grad_norm": 1.1376656293869019, + "learning_rate": 1.0119611143049731e-06, + "loss": 0.2844143509864807, + "mean_token_accuracy": 0.8910006284713745, + "num_tokens": 25365930.0, + "step": 2835 + }, + { + "epoch": 2.155015197568389, + "grad_norm": 2.259666919708252, + "learning_rate": 1.0102786518221997e-06, + "loss": 0.3148176074028015, + "mean_token_accuracy": 0.8851165175437927, + "num_tokens": 25373047.0, + "step": 2836 + }, + { + "epoch": 2.155775075987842, + "grad_norm": 3.304095506668091, + "learning_rate": 1.0085972348832138e-06, + "loss": 0.2042517364025116, + "mean_token_accuracy": 0.9247308969497681, + "num_tokens": 25376348.0, + "step": 2837 + }, + { + "epoch": 2.1565349544072947, + "grad_norm": 1.9856120347976685, + "learning_rate": 1.0069168646680985e-06, + "loss": 0.3547414541244507, + "mean_token_accuracy": 0.8941285610198975, + "num_tokens": 25384675.0, + "step": 2838 + }, + { + "epoch": 2.157294832826748, + "grad_norm": 2.8482213020324707, + "learning_rate": 1.0052375423562038e-06, + "loss": 0.3530133366584778, + "mean_token_accuracy": 0.8789700269699097, + "num_tokens": 25389631.0, + "step": 2839 + }, + { + "epoch": 2.1580547112462005, + "grad_norm": 1.4270408153533936, + "learning_rate": 1.0035592691261395e-06, + "loss": 0.34078776836395264, + "mean_token_accuracy": 0.8648165464401245, + "num_tokens": 25403746.0, + "step": 2840 + }, + { + "epoch": 2.1588145896656536, + "grad_norm": 0.9342723488807678, + "learning_rate": 1.0018820461557852e-06, + "loss": 0.2615935504436493, + "mean_token_accuracy": 0.9082236289978027, + "num_tokens": 25424695.0, + "step": 2841 + }, + { + "epoch": 2.1595744680851063, + "grad_norm": 2.695632219314575, + "learning_rate": 1.0002058746222807e-06, + "loss": 0.2202145904302597, + "mean_token_accuracy": 0.9221563339233398, + "num_tokens": 25428783.0, + "step": 2842 + }, + { + "epoch": 2.1603343465045595, + "grad_norm": 1.5679794549942017, + "learning_rate": 9.985307557020257e-07, + "loss": 0.24275024235248566, + "mean_token_accuracy": 0.9363338351249695, + "num_tokens": 25439104.0, + "step": 2843 + }, + { + "epoch": 2.161094224924012, + "grad_norm": 1.5985528230667114, + "learning_rate": 9.968566905706833e-07, + "loss": 0.2541901171207428, + "mean_token_accuracy": 0.9040743112564087, + "num_tokens": 25448829.0, + "step": 2844 + }, + { + "epoch": 2.161854103343465, + "grad_norm": 2.6022164821624756, + "learning_rate": 9.951836804031795e-07, + "loss": 0.24492180347442627, + "mean_token_accuracy": 0.9109418392181396, + "num_tokens": 25453902.0, + "step": 2845 + }, + { + "epoch": 2.162613981762918, + "grad_norm": 1.6719969511032104, + "learning_rate": 9.935117263736943e-07, + "loss": 0.43255117535591125, + "mean_token_accuracy": 0.868374228477478, + "num_tokens": 25465538.0, + "step": 2846 + }, + { + "epoch": 2.1633738601823707, + "grad_norm": 1.8284894227981567, + "learning_rate": 9.918408296556706e-07, + "loss": 0.32285982370376587, + "mean_token_accuracy": 0.9016412496566772, + "num_tokens": 25473721.0, + "step": 2847 + }, + { + "epoch": 2.164133738601824, + "grad_norm": 1.4488024711608887, + "learning_rate": 9.90170991421808e-07, + "loss": 0.35639309883117676, + "mean_token_accuracy": 0.8861881494522095, + "num_tokens": 25487535.0, + "step": 2848 + }, + { + "epoch": 2.1648936170212765, + "grad_norm": 2.089930534362793, + "learning_rate": 9.88502212844063e-07, + "loss": 0.2588546574115753, + "mean_token_accuracy": 0.9029642939567566, + "num_tokens": 25494567.0, + "step": 2849 + }, + { + "epoch": 2.1656534954407296, + "grad_norm": 1.1274315118789673, + "learning_rate": 9.86834495093649e-07, + "loss": 0.37268880009651184, + "mean_token_accuracy": 0.859347939491272, + "num_tokens": 25518278.0, + "step": 2850 + }, + { + "epoch": 2.1664133738601823, + "grad_norm": 2.3886640071868896, + "learning_rate": 9.851678393410343e-07, + "loss": 0.34938913583755493, + "mean_token_accuracy": 0.8724287748336792, + "num_tokens": 25524001.0, + "step": 2851 + }, + { + "epoch": 2.1671732522796354, + "grad_norm": 2.521230459213257, + "learning_rate": 9.83502246755942e-07, + "loss": 0.34781408309936523, + "mean_token_accuracy": 0.8970093131065369, + "num_tokens": 25529982.0, + "step": 2852 + }, + { + "epoch": 2.167933130699088, + "grad_norm": 2.467618942260742, + "learning_rate": 9.818377185073493e-07, + "loss": 0.29725387692451477, + "mean_token_accuracy": 0.8991899490356445, + "num_tokens": 25535356.0, + "step": 2853 + }, + { + "epoch": 2.1686930091185412, + "grad_norm": 2.335873603820801, + "learning_rate": 9.801742557634872e-07, + "loss": 0.39603036642074585, + "mean_token_accuracy": 0.8755916357040405, + "num_tokens": 25542526.0, + "step": 2854 + }, + { + "epoch": 2.169452887537994, + "grad_norm": 1.8388596773147583, + "learning_rate": 9.78511859691835e-07, + "loss": 0.3414672017097473, + "mean_token_accuracy": 0.8951467275619507, + "num_tokens": 25551904.0, + "step": 2855 + }, + { + "epoch": 2.1702127659574466, + "grad_norm": 1.86272394657135, + "learning_rate": 9.768505314591295e-07, + "loss": 0.45748448371887207, + "mean_token_accuracy": 0.8614354133605957, + "num_tokens": 25562197.0, + "step": 2856 + }, + { + "epoch": 2.1709726443768997, + "grad_norm": 1.9142264127731323, + "learning_rate": 9.751902722313527e-07, + "loss": 0.20877259969711304, + "mean_token_accuracy": 0.9316688179969788, + "num_tokens": 25569403.0, + "step": 2857 + }, + { + "epoch": 2.1717325227963524, + "grad_norm": 2.1138272285461426, + "learning_rate": 9.73531083173739e-07, + "loss": 0.37058722972869873, + "mean_token_accuracy": 0.8654135465621948, + "num_tokens": 25577200.0, + "step": 2858 + }, + { + "epoch": 2.1724924012158056, + "grad_norm": 1.973467469215393, + "learning_rate": 9.718729654507713e-07, + "loss": 0.4106993079185486, + "mean_token_accuracy": 0.8958662152290344, + "num_tokens": 25585694.0, + "step": 2859 + }, + { + "epoch": 2.1732522796352582, + "grad_norm": 1.957513451576233, + "learning_rate": 9.702159202261802e-07, + "loss": 0.2067333608865738, + "mean_token_accuracy": 0.9413473606109619, + "num_tokens": 25591604.0, + "step": 2860 + }, + { + "epoch": 2.1740121580547114, + "grad_norm": 2.7639806270599365, + "learning_rate": 9.685599486629444e-07, + "loss": 0.3446827232837677, + "mean_token_accuracy": 0.8837845325469971, + "num_tokens": 25596528.0, + "step": 2861 + }, + { + "epoch": 2.174772036474164, + "grad_norm": 2.483734607696533, + "learning_rate": 9.669050519232875e-07, + "loss": 0.21230249106884003, + "mean_token_accuracy": 0.9334918856620789, + "num_tokens": 25601182.0, + "step": 2862 + }, + { + "epoch": 2.175531914893617, + "grad_norm": 1.7194870710372925, + "learning_rate": 9.65251231168681e-07, + "loss": 0.2657586932182312, + "mean_token_accuracy": 0.9035707712173462, + "num_tokens": 25610561.0, + "step": 2863 + }, + { + "epoch": 2.17629179331307, + "grad_norm": 2.6709611415863037, + "learning_rate": 9.63598487559839e-07, + "loss": 0.3673030138015747, + "mean_token_accuracy": 0.8976202011108398, + "num_tokens": 25615822.0, + "step": 2864 + }, + { + "epoch": 2.1770516717325226, + "grad_norm": 1.6646889448165894, + "learning_rate": 9.619468222567216e-07, + "loss": 0.2796666622161865, + "mean_token_accuracy": 0.8698215484619141, + "num_tokens": 25626148.0, + "step": 2865 + }, + { + "epoch": 2.1778115501519757, + "grad_norm": 1.8341799974441528, + "learning_rate": 9.602962364185286e-07, + "loss": 0.44835132360458374, + "mean_token_accuracy": 0.84391850233078, + "num_tokens": 25636305.0, + "step": 2866 + }, + { + "epoch": 2.1785714285714284, + "grad_norm": 2.3579823970794678, + "learning_rate": 9.586467312037076e-07, + "loss": 0.2875673472881317, + "mean_token_accuracy": 0.889403223991394, + "num_tokens": 25642593.0, + "step": 2867 + }, + { + "epoch": 2.1793313069908815, + "grad_norm": 1.1284339427947998, + "learning_rate": 9.569983077699447e-07, + "loss": 0.3402171730995178, + "mean_token_accuracy": 0.8795222043991089, + "num_tokens": 25663734.0, + "step": 2868 + }, + { + "epoch": 2.180091185410334, + "grad_norm": 1.4705578088760376, + "learning_rate": 9.553509672741646e-07, + "loss": 0.4216107726097107, + "mean_token_accuracy": 0.845354437828064, + "num_tokens": 25678197.0, + "step": 2869 + }, + { + "epoch": 2.1808510638297873, + "grad_norm": 2.6181085109710693, + "learning_rate": 9.53704710872535e-07, + "loss": 0.2777765393257141, + "mean_token_accuracy": 0.8884872198104858, + "num_tokens": 25683808.0, + "step": 2870 + }, + { + "epoch": 2.18161094224924, + "grad_norm": 2.7285003662109375, + "learning_rate": 9.520595397204643e-07, + "loss": 0.33339786529541016, + "mean_token_accuracy": 0.8892828226089478, + "num_tokens": 25690125.0, + "step": 2871 + }, + { + "epoch": 2.182370820668693, + "grad_norm": 2.200571298599243, + "learning_rate": 9.504154549725944e-07, + "loss": 0.46546393632888794, + "mean_token_accuracy": 0.8389996290206909, + "num_tokens": 25697279.0, + "step": 2872 + }, + { + "epoch": 2.183130699088146, + "grad_norm": 3.491392135620117, + "learning_rate": 9.487724577828081e-07, + "loss": 0.17026299238204956, + "mean_token_accuracy": 0.9410334825515747, + "num_tokens": 25700263.0, + "step": 2873 + }, + { + "epoch": 2.183890577507599, + "grad_norm": 2.7800233364105225, + "learning_rate": 9.471305493042243e-07, + "loss": 0.2309894859790802, + "mean_token_accuracy": 0.9233936071395874, + "num_tokens": 25704486.0, + "step": 2874 + }, + { + "epoch": 2.1846504559270516, + "grad_norm": 2.6505582332611084, + "learning_rate": 9.454897306891972e-07, + "loss": 0.4378674328327179, + "mean_token_accuracy": 0.8846660852432251, + "num_tokens": 25710115.0, + "step": 2875 + }, + { + "epoch": 2.1854103343465043, + "grad_norm": 1.5393849611282349, + "learning_rate": 9.438500030893166e-07, + "loss": 0.42081019282341003, + "mean_token_accuracy": 0.8672939538955688, + "num_tokens": 25724598.0, + "step": 2876 + }, + { + "epoch": 2.1861702127659575, + "grad_norm": 1.911198377609253, + "learning_rate": 9.422113676554073e-07, + "loss": 0.19115394353866577, + "mean_token_accuracy": 0.9201297163963318, + "num_tokens": 25731040.0, + "step": 2877 + }, + { + "epoch": 2.18693009118541, + "grad_norm": 1.371443748474121, + "learning_rate": 9.405738255375243e-07, + "loss": 0.3639947772026062, + "mean_token_accuracy": 0.8653393983840942, + "num_tokens": 25745335.0, + "step": 2878 + }, + { + "epoch": 2.1876899696048633, + "grad_norm": 3.216238498687744, + "learning_rate": 9.389373778849612e-07, + "loss": 0.2623414397239685, + "mean_token_accuracy": 0.9046015739440918, + "num_tokens": 25749223.0, + "step": 2879 + }, + { + "epoch": 2.188449848024316, + "grad_norm": 2.7558846473693848, + "learning_rate": 9.37302025846237e-07, + "loss": 0.31921297311782837, + "mean_token_accuracy": 0.8903186321258545, + "num_tokens": 25754341.0, + "step": 2880 + }, + { + "epoch": 2.189209726443769, + "grad_norm": 2.06365704536438, + "learning_rate": 9.356677705691058e-07, + "loss": 0.357482373714447, + "mean_token_accuracy": 0.8661626577377319, + "num_tokens": 25761199.0, + "step": 2881 + }, + { + "epoch": 2.189969604863222, + "grad_norm": 3.240328550338745, + "learning_rate": 9.340346132005507e-07, + "loss": 0.3157888650894165, + "mean_token_accuracy": 0.8948285579681396, + "num_tokens": 25765099.0, + "step": 2882 + }, + { + "epoch": 2.190729483282675, + "grad_norm": 1.4671967029571533, + "learning_rate": 9.324025548867849e-07, + "loss": 0.32077109813690186, + "mean_token_accuracy": 0.8813248872756958, + "num_tokens": 25777636.0, + "step": 2883 + }, + { + "epoch": 2.1914893617021276, + "grad_norm": 2.6475353240966797, + "learning_rate": 9.307715967732492e-07, + "loss": 0.35567623376846313, + "mean_token_accuracy": 0.8738130331039429, + "num_tokens": 25783737.0, + "step": 2884 + }, + { + "epoch": 2.1922492401215807, + "grad_norm": 1.791491150856018, + "learning_rate": 9.29141740004613e-07, + "loss": 0.2556282877922058, + "mean_token_accuracy": 0.9223519563674927, + "num_tokens": 25792069.0, + "step": 2885 + }, + { + "epoch": 2.1930091185410334, + "grad_norm": 2.3944389820098877, + "learning_rate": 9.275129857247722e-07, + "loss": 0.3145869970321655, + "mean_token_accuracy": 0.8938079476356506, + "num_tokens": 25798400.0, + "step": 2886 + }, + { + "epoch": 2.193768996960486, + "grad_norm": 2.0802059173583984, + "learning_rate": 9.258853350768499e-07, + "loss": 0.37343069911003113, + "mean_token_accuracy": 0.8705670833587646, + "num_tokens": 25806567.0, + "step": 2887 + }, + { + "epoch": 2.1945288753799392, + "grad_norm": 2.10831880569458, + "learning_rate": 9.242587892031945e-07, + "loss": 0.1989251971244812, + "mean_token_accuracy": 0.931064248085022, + "num_tokens": 25812715.0, + "step": 2888 + }, + { + "epoch": 2.195288753799392, + "grad_norm": 2.1305530071258545, + "learning_rate": 9.226333492453759e-07, + "loss": 0.29377204179763794, + "mean_token_accuracy": 0.8942701816558838, + "num_tokens": 25819988.0, + "step": 2889 + }, + { + "epoch": 2.196048632218845, + "grad_norm": 2.179025411605835, + "learning_rate": 9.210090163441928e-07, + "loss": 0.37565115094184875, + "mean_token_accuracy": 0.8700202703475952, + "num_tokens": 25827777.0, + "step": 2890 + }, + { + "epoch": 2.1968085106382977, + "grad_norm": 3.177180290222168, + "learning_rate": 9.19385791639665e-07, + "loss": 0.16646479070186615, + "mean_token_accuracy": 0.9426749348640442, + "num_tokens": 25831724.0, + "step": 2891 + }, + { + "epoch": 2.197568389057751, + "grad_norm": 1.103196620941162, + "learning_rate": 9.177636762710321e-07, + "loss": 0.29140013456344604, + "mean_token_accuracy": 0.8789779543876648, + "num_tokens": 25854707.0, + "step": 2892 + }, + { + "epoch": 2.1983282674772036, + "grad_norm": 1.597692847251892, + "learning_rate": 9.161426713767574e-07, + "loss": 0.37799614667892456, + "mean_token_accuracy": 0.8623079061508179, + "num_tokens": 25868429.0, + "step": 2893 + }, + { + "epoch": 2.1990881458966567, + "grad_norm": 2.227132558822632, + "learning_rate": 9.145227780945265e-07, + "loss": 0.2683261036872864, + "mean_token_accuracy": 0.9092563390731812, + "num_tokens": 25875367.0, + "step": 2894 + }, + { + "epoch": 2.1998480243161094, + "grad_norm": 3.1229634284973145, + "learning_rate": 9.129039975612408e-07, + "loss": 0.21859994530677795, + "mean_token_accuracy": 0.9187530875205994, + "num_tokens": 25879456.0, + "step": 2895 + }, + { + "epoch": 2.2006079027355625, + "grad_norm": 2.3224828243255615, + "learning_rate": 9.112863309130235e-07, + "loss": 0.3557605743408203, + "mean_token_accuracy": 0.8735873103141785, + "num_tokens": 25886477.0, + "step": 2896 + }, + { + "epoch": 2.201367781155015, + "grad_norm": 1.7784863710403442, + "learning_rate": 9.096697792852155e-07, + "loss": 0.334577351808548, + "mean_token_accuracy": 0.8948780298233032, + "num_tokens": 25894977.0, + "step": 2897 + }, + { + "epoch": 2.202127659574468, + "grad_norm": 2.34066104888916, + "learning_rate": 9.080543438123746e-07, + "loss": 0.16479721665382385, + "mean_token_accuracy": 0.9405456781387329, + "num_tokens": 25900015.0, + "step": 2898 + }, + { + "epoch": 2.202887537993921, + "grad_norm": 1.944082498550415, + "learning_rate": 9.064400256282757e-07, + "loss": 0.40259572863578796, + "mean_token_accuracy": 0.8632713556289673, + "num_tokens": 25908749.0, + "step": 2899 + }, + { + "epoch": 2.2036474164133737, + "grad_norm": 1.2758828401565552, + "learning_rate": 9.048268258659098e-07, + "loss": 0.3939874470233917, + "mean_token_accuracy": 0.8652969598770142, + "num_tokens": 25924972.0, + "step": 2900 + }, + { + "epoch": 2.204407294832827, + "grad_norm": 1.4483891725540161, + "learning_rate": 9.032147456574822e-07, + "loss": 0.4132935404777527, + "mean_token_accuracy": 0.868486762046814, + "num_tokens": 25939785.0, + "step": 2901 + }, + { + "epoch": 2.2051671732522795, + "grad_norm": 1.4866713285446167, + "learning_rate": 9.01603786134413e-07, + "loss": 0.3644951581954956, + "mean_token_accuracy": 0.8750203847885132, + "num_tokens": 25952648.0, + "step": 2902 + }, + { + "epoch": 2.2059270516717326, + "grad_norm": 1.6555454730987549, + "learning_rate": 8.999939484273362e-07, + "loss": 0.48656779527664185, + "mean_token_accuracy": 0.8372372984886169, + "num_tokens": 25965062.0, + "step": 2903 + }, + { + "epoch": 2.2066869300911853, + "grad_norm": 2.3154168128967285, + "learning_rate": 8.983852336660959e-07, + "loss": 0.3768891990184784, + "mean_token_accuracy": 0.8614999055862427, + "num_tokens": 25972152.0, + "step": 2904 + }, + { + "epoch": 2.2074468085106385, + "grad_norm": 2.3618056774139404, + "learning_rate": 8.967776429797529e-07, + "loss": 0.24905793368816376, + "mean_token_accuracy": 0.9170958995819092, + "num_tokens": 25977808.0, + "step": 2905 + }, + { + "epoch": 2.208206686930091, + "grad_norm": 1.929051399230957, + "learning_rate": 8.951711774965741e-07, + "loss": 0.38099539279937744, + "mean_token_accuracy": 0.8812143802642822, + "num_tokens": 25987871.0, + "step": 2906 + }, + { + "epoch": 2.2089665653495443, + "grad_norm": 1.6529620885849, + "learning_rate": 8.93565838344039e-07, + "loss": 0.31784749031066895, + "mean_token_accuracy": 0.8929437398910522, + "num_tokens": 25997777.0, + "step": 2907 + }, + { + "epoch": 2.209726443768997, + "grad_norm": 2.1413469314575195, + "learning_rate": 8.919616266488373e-07, + "loss": 0.4043882191181183, + "mean_token_accuracy": 0.8937146663665771, + "num_tokens": 26005213.0, + "step": 2908 + }, + { + "epoch": 2.2104863221884496, + "grad_norm": 1.3838988542556763, + "learning_rate": 8.903585435368658e-07, + "loss": 0.2858969569206238, + "mean_token_accuracy": 0.9084860682487488, + "num_tokens": 26018371.0, + "step": 2909 + }, + { + "epoch": 2.211246200607903, + "grad_norm": 1.2853319644927979, + "learning_rate": 8.887565901332304e-07, + "loss": 0.3178713619709015, + "mean_token_accuracy": 0.872230589389801, + "num_tokens": 26034136.0, + "step": 2910 + }, + { + "epoch": 2.2120060790273555, + "grad_norm": 2.9032399654388428, + "learning_rate": 8.871557675622442e-07, + "loss": 0.20348960161209106, + "mean_token_accuracy": 0.9275314807891846, + "num_tokens": 26038299.0, + "step": 2911 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 2.4349892139434814, + "learning_rate": 8.855560769474237e-07, + "loss": 0.24282032251358032, + "mean_token_accuracy": 0.9103988409042358, + "num_tokens": 26043427.0, + "step": 2912 + }, + { + "epoch": 2.2135258358662613, + "grad_norm": 2.324664831161499, + "learning_rate": 8.839575194114958e-07, + "loss": 0.3808317184448242, + "mean_token_accuracy": 0.8598989844322205, + "num_tokens": 26049667.0, + "step": 2913 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 2.594947576522827, + "learning_rate": 8.823600960763901e-07, + "loss": 0.39623332023620605, + "mean_token_accuracy": 0.8738477230072021, + "num_tokens": 26055428.0, + "step": 2914 + }, + { + "epoch": 2.215045592705167, + "grad_norm": 1.674308180809021, + "learning_rate": 8.807638080632375e-07, + "loss": 0.2641369104385376, + "mean_token_accuracy": 0.9119734764099121, + "num_tokens": 26064355.0, + "step": 2915 + }, + { + "epoch": 2.2158054711246202, + "grad_norm": 2.9884912967681885, + "learning_rate": 8.791686564923746e-07, + "loss": 0.19229236245155334, + "mean_token_accuracy": 0.9388723969459534, + "num_tokens": 26067563.0, + "step": 2916 + }, + { + "epoch": 2.216565349544073, + "grad_norm": 1.8513846397399902, + "learning_rate": 8.775746424833428e-07, + "loss": 0.3076218366622925, + "mean_token_accuracy": 0.9165210723876953, + "num_tokens": 26075609.0, + "step": 2917 + }, + { + "epoch": 2.217325227963526, + "grad_norm": 1.229604721069336, + "learning_rate": 8.759817671548801e-07, + "loss": 0.2727023959159851, + "mean_token_accuracy": 0.8931418061256409, + "num_tokens": 26091183.0, + "step": 2918 + }, + { + "epoch": 2.2180851063829787, + "grad_norm": 2.384413957595825, + "learning_rate": 8.743900316249273e-07, + "loss": 0.27312609553337097, + "mean_token_accuracy": 0.8972288370132446, + "num_tokens": 26096677.0, + "step": 2919 + }, + { + "epoch": 2.2188449848024314, + "grad_norm": 2.186370611190796, + "learning_rate": 8.727994370106288e-07, + "loss": 0.36045557260513306, + "mean_token_accuracy": 0.8788503408432007, + "num_tokens": 26104464.0, + "step": 2920 + }, + { + "epoch": 2.2196048632218845, + "grad_norm": 2.769796848297119, + "learning_rate": 8.71209984428322e-07, + "loss": 0.3427591919898987, + "mean_token_accuracy": 0.892108678817749, + "num_tokens": 26109571.0, + "step": 2921 + }, + { + "epoch": 2.2203647416413372, + "grad_norm": 2.9888014793395996, + "learning_rate": 8.696216749935471e-07, + "loss": 0.20137615501880646, + "mean_token_accuracy": 0.9366025924682617, + "num_tokens": 26113165.0, + "step": 2922 + }, + { + "epoch": 2.2211246200607904, + "grad_norm": 1.484858751296997, + "learning_rate": 8.680345098210408e-07, + "loss": 0.2884698510169983, + "mean_token_accuracy": 0.8992507457733154, + "num_tokens": 26124385.0, + "step": 2923 + }, + { + "epoch": 2.221884498480243, + "grad_norm": 1.690119981765747, + "learning_rate": 8.664484900247363e-07, + "loss": 0.34275567531585693, + "mean_token_accuracy": 0.8682634234428406, + "num_tokens": 26134944.0, + "step": 2924 + }, + { + "epoch": 2.222644376899696, + "grad_norm": 1.6171982288360596, + "learning_rate": 8.64863616717764e-07, + "loss": 0.256338506937027, + "mean_token_accuracy": 0.9281957745552063, + "num_tokens": 26143586.0, + "step": 2925 + }, + { + "epoch": 2.223404255319149, + "grad_norm": 2.4853835105895996, + "learning_rate": 8.632798910124493e-07, + "loss": 0.26290056109428406, + "mean_token_accuracy": 0.9119559526443481, + "num_tokens": 26148931.0, + "step": 2926 + }, + { + "epoch": 2.224164133738602, + "grad_norm": 2.0014333724975586, + "learning_rate": 8.616973140203097e-07, + "loss": 0.33400261402130127, + "mean_token_accuracy": 0.8796782493591309, + "num_tokens": 26156246.0, + "step": 2927 + }, + { + "epoch": 2.2249240121580547, + "grad_norm": 1.4637027978897095, + "learning_rate": 8.601158868520617e-07, + "loss": 0.24374958872795105, + "mean_token_accuracy": 0.9116952419281006, + "num_tokens": 26166431.0, + "step": 2928 + }, + { + "epoch": 2.225683890577508, + "grad_norm": 2.2056987285614014, + "learning_rate": 8.585356106176093e-07, + "loss": 0.3419337570667267, + "mean_token_accuracy": 0.8703858852386475, + "num_tokens": 26173974.0, + "step": 2929 + }, + { + "epoch": 2.2264437689969605, + "grad_norm": 1.3687927722930908, + "learning_rate": 8.569564864260524e-07, + "loss": 0.43176111578941345, + "mean_token_accuracy": 0.8616900444030762, + "num_tokens": 26191632.0, + "step": 2930 + }, + { + "epoch": 2.227203647416413, + "grad_norm": 1.4975634813308716, + "learning_rate": 8.553785153856809e-07, + "loss": 0.38525745272636414, + "mean_token_accuracy": 0.8611687421798706, + "num_tokens": 26203300.0, + "step": 2931 + }, + { + "epoch": 2.2279635258358663, + "grad_norm": 1.970109462738037, + "learning_rate": 8.538016986039751e-07, + "loss": 0.31731468439102173, + "mean_token_accuracy": 0.884365975856781, + "num_tokens": 26210037.0, + "step": 2932 + }, + { + "epoch": 2.228723404255319, + "grad_norm": 2.681717872619629, + "learning_rate": 8.522260371876068e-07, + "loss": 0.2770140767097473, + "mean_token_accuracy": 0.9020107984542847, + "num_tokens": 26215460.0, + "step": 2933 + }, + { + "epoch": 2.229483282674772, + "grad_norm": 2.2324795722961426, + "learning_rate": 8.506515322424349e-07, + "loss": 0.30599141120910645, + "mean_token_accuracy": 0.8939633965492249, + "num_tokens": 26221260.0, + "step": 2934 + }, + { + "epoch": 2.230243161094225, + "grad_norm": 2.08915376663208, + "learning_rate": 8.49078184873508e-07, + "loss": 0.3609209954738617, + "mean_token_accuracy": 0.8776482343673706, + "num_tokens": 26228397.0, + "step": 2935 + }, + { + "epoch": 2.231003039513678, + "grad_norm": 1.641366958618164, + "learning_rate": 8.475059961850617e-07, + "loss": 0.2969125509262085, + "mean_token_accuracy": 0.8949217796325684, + "num_tokens": 26238533.0, + "step": 2936 + }, + { + "epoch": 2.2317629179331306, + "grad_norm": 1.082148551940918, + "learning_rate": 8.459349672805198e-07, + "loss": 0.23957109451293945, + "mean_token_accuracy": 0.9255712032318115, + "num_tokens": 26254154.0, + "step": 2937 + }, + { + "epoch": 2.2325227963525838, + "grad_norm": 2.495208740234375, + "learning_rate": 8.443650992624877e-07, + "loss": 0.2879767417907715, + "mean_token_accuracy": 0.8911515474319458, + "num_tokens": 26260812.0, + "step": 2938 + }, + { + "epoch": 2.2332826747720365, + "grad_norm": 3.566549062728882, + "learning_rate": 8.427963932327621e-07, + "loss": 0.31420570611953735, + "mean_token_accuracy": 0.8888009190559387, + "num_tokens": 26264592.0, + "step": 2939 + }, + { + "epoch": 2.2340425531914896, + "grad_norm": 2.217177391052246, + "learning_rate": 8.412288502923211e-07, + "loss": 0.30547618865966797, + "mean_token_accuracy": 0.9065294861793518, + "num_tokens": 26270729.0, + "step": 2940 + }, + { + "epoch": 2.2348024316109423, + "grad_norm": 1.404260277748108, + "learning_rate": 8.396624715413251e-07, + "loss": 0.32485032081604004, + "mean_token_accuracy": 0.8799532651901245, + "num_tokens": 26284280.0, + "step": 2941 + }, + { + "epoch": 2.235562310030395, + "grad_norm": 1.5519827604293823, + "learning_rate": 8.380972580791191e-07, + "loss": 0.3330575227737427, + "mean_token_accuracy": 0.8865892887115479, + "num_tokens": 26293635.0, + "step": 2942 + }, + { + "epoch": 2.236322188449848, + "grad_norm": 2.604766845703125, + "learning_rate": 8.365332110042323e-07, + "loss": 0.18986842036247253, + "mean_token_accuracy": 0.9276989102363586, + "num_tokens": 26298553.0, + "step": 2943 + }, + { + "epoch": 2.237082066869301, + "grad_norm": 2.1750004291534424, + "learning_rate": 8.349703314143712e-07, + "loss": 0.3661153018474579, + "mean_token_accuracy": 0.8879489302635193, + "num_tokens": 26305697.0, + "step": 2944 + }, + { + "epoch": 2.237841945288754, + "grad_norm": 2.247069835662842, + "learning_rate": 8.334086204064254e-07, + "loss": 0.3127560615539551, + "mean_token_accuracy": 0.8846344351768494, + "num_tokens": 26312347.0, + "step": 2945 + }, + { + "epoch": 2.2386018237082066, + "grad_norm": 1.905275821685791, + "learning_rate": 8.318480790764638e-07, + "loss": 0.44245776534080505, + "mean_token_accuracy": 0.87440425157547, + "num_tokens": 26322787.0, + "step": 2946 + }, + { + "epoch": 2.2393617021276597, + "grad_norm": 1.8596254587173462, + "learning_rate": 8.302887085197342e-07, + "loss": 0.30068373680114746, + "mean_token_accuracy": 0.8847110271453857, + "num_tokens": 26330437.0, + "step": 2947 + }, + { + "epoch": 2.2401215805471124, + "grad_norm": 2.0028860569000244, + "learning_rate": 8.28730509830663e-07, + "loss": 0.4276006817817688, + "mean_token_accuracy": 0.8406014442443848, + "num_tokens": 26340100.0, + "step": 2948 + }, + { + "epoch": 2.2408814589665655, + "grad_norm": 2.494434356689453, + "learning_rate": 8.271734841028553e-07, + "loss": 0.3874223232269287, + "mean_token_accuracy": 0.8782174587249756, + "num_tokens": 26345750.0, + "step": 2949 + }, + { + "epoch": 2.2416413373860182, + "grad_norm": 1.955613613128662, + "learning_rate": 8.256176324290885e-07, + "loss": 0.28770074248313904, + "mean_token_accuracy": 0.9004360437393188, + "num_tokens": 26353342.0, + "step": 2950 + }, + { + "epoch": 2.2424012158054714, + "grad_norm": 1.7579785585403442, + "learning_rate": 8.240629559013222e-07, + "loss": 0.2277943640947342, + "mean_token_accuracy": 0.9145861864089966, + "num_tokens": 26361348.0, + "step": 2951 + }, + { + "epoch": 2.243161094224924, + "grad_norm": 1.5848479270935059, + "learning_rate": 8.22509455610688e-07, + "loss": 0.32944542169570923, + "mean_token_accuracy": 0.8662827014923096, + "num_tokens": 26372006.0, + "step": 2952 + }, + { + "epoch": 2.2439209726443767, + "grad_norm": 2.6263222694396973, + "learning_rate": 8.209571326474897e-07, + "loss": 0.34646326303482056, + "mean_token_accuracy": 0.8817736506462097, + "num_tokens": 26377664.0, + "step": 2953 + }, + { + "epoch": 2.24468085106383, + "grad_norm": 2.407590627670288, + "learning_rate": 8.194059881012107e-07, + "loss": 0.41302192211151123, + "mean_token_accuracy": 0.8898757696151733, + "num_tokens": 26384225.0, + "step": 2954 + }, + { + "epoch": 2.2454407294832825, + "grad_norm": 2.5156402587890625, + "learning_rate": 8.178560230605012e-07, + "loss": 0.3468608558177948, + "mean_token_accuracy": 0.8879599571228027, + "num_tokens": 26389374.0, + "step": 2955 + }, + { + "epoch": 2.2462006079027357, + "grad_norm": 1.5076090097427368, + "learning_rate": 8.163072386131876e-07, + "loss": 0.3750625550746918, + "mean_token_accuracy": 0.8712738752365112, + "num_tokens": 26402674.0, + "step": 2956 + }, + { + "epoch": 2.2469604863221884, + "grad_norm": 1.5181068181991577, + "learning_rate": 8.147596358462662e-07, + "loss": 0.19113478064537048, + "mean_token_accuracy": 0.9323463439941406, + "num_tokens": 26411626.0, + "step": 2957 + }, + { + "epoch": 2.2477203647416415, + "grad_norm": 1.0806915760040283, + "learning_rate": 8.132132158459044e-07, + "loss": 0.3411233425140381, + "mean_token_accuracy": 0.8736830949783325, + "num_tokens": 26435891.0, + "step": 2958 + }, + { + "epoch": 2.248480243161094, + "grad_norm": 1.5527247190475464, + "learning_rate": 8.116679796974389e-07, + "loss": 0.425741970539093, + "mean_token_accuracy": 0.8448845148086548, + "num_tokens": 26448134.0, + "step": 2959 + }, + { + "epoch": 2.2492401215805473, + "grad_norm": 1.2390631437301636, + "learning_rate": 8.10123928485377e-07, + "loss": 0.38084933161735535, + "mean_token_accuracy": 0.8656617999076843, + "num_tokens": 26467213.0, + "step": 2960 + }, + { + "epoch": 2.25, + "grad_norm": 3.0672852993011475, + "learning_rate": 8.08581063293391e-07, + "loss": 0.29300111532211304, + "mean_token_accuracy": 0.8933638334274292, + "num_tokens": 26471599.0, + "step": 2961 + }, + { + "epoch": 2.250759878419453, + "grad_norm": 1.2359145879745483, + "learning_rate": 8.070393852043251e-07, + "loss": 0.41337621212005615, + "mean_token_accuracy": 0.854198694229126, + "num_tokens": 26488461.0, + "step": 2962 + }, + { + "epoch": 2.251519756838906, + "grad_norm": 1.8551225662231445, + "learning_rate": 8.054988953001889e-07, + "loss": 0.3036419153213501, + "mean_token_accuracy": 0.8883144855499268, + "num_tokens": 26496398.0, + "step": 2963 + }, + { + "epoch": 2.2522796352583585, + "grad_norm": 1.3691812753677368, + "learning_rate": 8.039595946621551e-07, + "loss": 0.3286219835281372, + "mean_token_accuracy": 0.892130434513092, + "num_tokens": 26510493.0, + "step": 2964 + }, + { + "epoch": 2.2530395136778116, + "grad_norm": 1.7371556758880615, + "learning_rate": 8.024214843705647e-07, + "loss": 0.4105026125907898, + "mean_token_accuracy": 0.8889180421829224, + "num_tokens": 26519148.0, + "step": 2965 + }, + { + "epoch": 2.2537993920972643, + "grad_norm": 2.211665630340576, + "learning_rate": 8.00884565504925e-07, + "loss": 0.3912196159362793, + "mean_token_accuracy": 0.8632891774177551, + "num_tokens": 26526314.0, + "step": 2966 + }, + { + "epoch": 2.2545592705167175, + "grad_norm": 2.476206064224243, + "learning_rate": 7.993488391439025e-07, + "loss": 0.20462508499622345, + "mean_token_accuracy": 0.9276266098022461, + "num_tokens": 26531781.0, + "step": 2967 + }, + { + "epoch": 2.25531914893617, + "grad_norm": 1.4944102764129639, + "learning_rate": 7.978143063653296e-07, + "loss": 0.2694895267486572, + "mean_token_accuracy": 0.9033881425857544, + "num_tokens": 26543780.0, + "step": 2968 + }, + { + "epoch": 2.2560790273556233, + "grad_norm": 1.7570104598999023, + "learning_rate": 7.962809682462008e-07, + "loss": 0.3060353100299835, + "mean_token_accuracy": 0.8908290863037109, + "num_tokens": 26551978.0, + "step": 2969 + }, + { + "epoch": 2.256838905775076, + "grad_norm": 2.215514898300171, + "learning_rate": 7.947488258626718e-07, + "loss": 0.2930528521537781, + "mean_token_accuracy": 0.8989757299423218, + "num_tokens": 26558267.0, + "step": 2970 + }, + { + "epoch": 2.2575987841945286, + "grad_norm": 2.3069000244140625, + "learning_rate": 7.93217880290059e-07, + "loss": 0.18501774966716766, + "mean_token_accuracy": 0.931271493434906, + "num_tokens": 26563286.0, + "step": 2971 + }, + { + "epoch": 2.2583586626139818, + "grad_norm": 1.6555116176605225, + "learning_rate": 7.916881326028387e-07, + "loss": 0.3178265392780304, + "mean_token_accuracy": 0.9016884565353394, + "num_tokens": 26572087.0, + "step": 2972 + }, + { + "epoch": 2.2591185410334345, + "grad_norm": 2.222161054611206, + "learning_rate": 7.901595838746471e-07, + "loss": 0.3013504445552826, + "mean_token_accuracy": 0.8942798376083374, + "num_tokens": 26578159.0, + "step": 2973 + }, + { + "epoch": 2.2598784194528876, + "grad_norm": 1.979411005973816, + "learning_rate": 7.886322351782782e-07, + "loss": 0.42746615409851074, + "mean_token_accuracy": 0.85303795337677, + "num_tokens": 26586252.0, + "step": 2974 + }, + { + "epoch": 2.2606382978723403, + "grad_norm": 1.4925786256790161, + "learning_rate": 7.871060875856854e-07, + "loss": 0.33495625853538513, + "mean_token_accuracy": 0.8911026120185852, + "num_tokens": 26599921.0, + "step": 2975 + }, + { + "epoch": 2.2613981762917934, + "grad_norm": 1.9037046432495117, + "learning_rate": 7.855811421679746e-07, + "loss": 0.31471866369247437, + "mean_token_accuracy": 0.9007552862167358, + "num_tokens": 26607954.0, + "step": 2976 + }, + { + "epoch": 2.262158054711246, + "grad_norm": 2.2751407623291016, + "learning_rate": 7.840573999954154e-07, + "loss": 0.26972368359565735, + "mean_token_accuracy": 0.8992317914962769, + "num_tokens": 26614036.0, + "step": 2977 + }, + { + "epoch": 2.262917933130699, + "grad_norm": 2.680572271347046, + "learning_rate": 7.825348621374257e-07, + "loss": 0.4264066219329834, + "mean_token_accuracy": 0.8547691106796265, + "num_tokens": 26619545.0, + "step": 2978 + }, + { + "epoch": 2.263677811550152, + "grad_norm": 2.3535876274108887, + "learning_rate": 7.810135296625817e-07, + "loss": 0.37871062755584717, + "mean_token_accuracy": 0.8621708750724792, + "num_tokens": 26626248.0, + "step": 2979 + }, + { + "epoch": 2.264437689969605, + "grad_norm": 1.2249537706375122, + "learning_rate": 7.794934036386139e-07, + "loss": 0.3877285122871399, + "mean_token_accuracy": 0.8593572378158569, + "num_tokens": 26648023.0, + "step": 2980 + }, + { + "epoch": 2.2651975683890577, + "grad_norm": 2.43371844291687, + "learning_rate": 7.779744851324048e-07, + "loss": 0.37463510036468506, + "mean_token_accuracy": 0.8646193742752075, + "num_tokens": 26654016.0, + "step": 2981 + }, + { + "epoch": 2.2659574468085104, + "grad_norm": 1.7429327964782715, + "learning_rate": 7.7645677520999e-07, + "loss": 0.4033060669898987, + "mean_token_accuracy": 0.8644014596939087, + "num_tokens": 26664447.0, + "step": 2982 + }, + { + "epoch": 2.2667173252279635, + "grad_norm": 2.4090006351470947, + "learning_rate": 7.749402749365573e-07, + "loss": 0.2981206774711609, + "mean_token_accuracy": 0.8886175751686096, + "num_tokens": 26670355.0, + "step": 2983 + }, + { + "epoch": 2.2674772036474162, + "grad_norm": 1.3855396509170532, + "learning_rate": 7.734249853764428e-07, + "loss": 0.35967472195625305, + "mean_token_accuracy": 0.8652631044387817, + "num_tokens": 26685385.0, + "step": 2984 + }, + { + "epoch": 2.2682370820668694, + "grad_norm": 1.328214168548584, + "learning_rate": 7.719109075931375e-07, + "loss": 0.3571951389312744, + "mean_token_accuracy": 0.8894522190093994, + "num_tokens": 26703265.0, + "step": 2985 + }, + { + "epoch": 2.268996960486322, + "grad_norm": 2.5001046657562256, + "learning_rate": 7.703980426492791e-07, + "loss": 0.3512844741344452, + "mean_token_accuracy": 0.887405514717102, + "num_tokens": 26709095.0, + "step": 2986 + }, + { + "epoch": 2.269756838905775, + "grad_norm": 1.8704569339752197, + "learning_rate": 7.688863916066524e-07, + "loss": 0.2746743857860565, + "mean_token_accuracy": 0.903412401676178, + "num_tokens": 26716815.0, + "step": 2987 + }, + { + "epoch": 2.270516717325228, + "grad_norm": 2.1134285926818848, + "learning_rate": 7.673759555261947e-07, + "loss": 0.38385504484176636, + "mean_token_accuracy": 0.8759124279022217, + "num_tokens": 26724046.0, + "step": 2988 + }, + { + "epoch": 2.271276595744681, + "grad_norm": 1.2651840448379517, + "learning_rate": 7.65866735467988e-07, + "loss": 0.3499506413936615, + "mean_token_accuracy": 0.8704953193664551, + "num_tokens": 26743024.0, + "step": 2989 + }, + { + "epoch": 2.2720364741641337, + "grad_norm": 1.7289817333221436, + "learning_rate": 7.643587324912597e-07, + "loss": 0.3768725097179413, + "mean_token_accuracy": 0.8623670339584351, + "num_tokens": 26754336.0, + "step": 2990 + }, + { + "epoch": 2.272796352583587, + "grad_norm": 1.6121667623519897, + "learning_rate": 7.628519476543839e-07, + "loss": 0.42746737599372864, + "mean_token_accuracy": 0.8425478935241699, + "num_tokens": 26766813.0, + "step": 2991 + }, + { + "epoch": 2.2735562310030395, + "grad_norm": 2.705442428588867, + "learning_rate": 7.613463820148831e-07, + "loss": 0.27137982845306396, + "mean_token_accuracy": 0.9014253616333008, + "num_tokens": 26772565.0, + "step": 2992 + }, + { + "epoch": 2.274316109422492, + "grad_norm": 1.3811960220336914, + "learning_rate": 7.598420366294185e-07, + "loss": 0.2957465350627899, + "mean_token_accuracy": 0.8935354351997375, + "num_tokens": 26787325.0, + "step": 2993 + }, + { + "epoch": 2.2750759878419453, + "grad_norm": 2.469336986541748, + "learning_rate": 7.583389125537982e-07, + "loss": 0.2811780273914337, + "mean_token_accuracy": 0.8956634998321533, + "num_tokens": 26793457.0, + "step": 2994 + }, + { + "epoch": 2.275835866261398, + "grad_norm": 2.945681571960449, + "learning_rate": 7.568370108429732e-07, + "loss": 0.3186708092689514, + "mean_token_accuracy": 0.8817545175552368, + "num_tokens": 26797867.0, + "step": 2995 + }, + { + "epoch": 2.276595744680851, + "grad_norm": 1.7748228311538696, + "learning_rate": 7.553363325510355e-07, + "loss": 0.3279818892478943, + "mean_token_accuracy": 0.884396493434906, + "num_tokens": 26806656.0, + "step": 2996 + }, + { + "epoch": 2.277355623100304, + "grad_norm": 1.312500238418579, + "learning_rate": 7.538368787312186e-07, + "loss": 0.3754822611808777, + "mean_token_accuracy": 0.8653179407119751, + "num_tokens": 26823126.0, + "step": 2997 + }, + { + "epoch": 2.278115501519757, + "grad_norm": 3.1305344104766846, + "learning_rate": 7.523386504358984e-07, + "loss": 0.3293214440345764, + "mean_token_accuracy": 0.8908799886703491, + "num_tokens": 26828250.0, + "step": 2998 + }, + { + "epoch": 2.2788753799392096, + "grad_norm": 2.6449344158172607, + "learning_rate": 7.508416487165862e-07, + "loss": 0.23732036352157593, + "mean_token_accuracy": 0.9029837846755981, + "num_tokens": 26833123.0, + "step": 2999 + }, + { + "epoch": 2.2796352583586628, + "grad_norm": 2.04388427734375, + "learning_rate": 7.49345874623939e-07, + "loss": 0.31240373849868774, + "mean_token_accuracy": 0.8860392570495605, + "num_tokens": 26840878.0, + "step": 3000 + } + ], + "logging_steps": 1.0, + "max_steps": 3948, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.925799536381133e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2fc4f538d721f958cdceda5408f2f4e1a35f4210 --- /dev/null +++ b/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021 +size 6225 diff --git a/checkpoint-3948/chat_template.jinja b/checkpoint-3948/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..70adff8a08fb31e0636f618564838d4bf3c05286 --- /dev/null +++ b/checkpoint-3948/chat_template.jinja @@ -0,0 +1,61 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} \ No newline at end of file diff --git a/checkpoint-3948/config.json b/checkpoint-3948/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c351e5fb52f50ea6e07b40981aef81c80f9df7e4 --- /dev/null +++ b/checkpoint-3948/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 262144, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151662, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 5000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.5.3", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/checkpoint-3948/generation_config.json b/checkpoint-3948/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2104b83493c2833855e8fe32a7a784805ab5c2ee --- /dev/null +++ b/checkpoint-3948/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151662, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "5.5.3" +} diff --git a/checkpoint-3948/model.safetensors b/checkpoint-3948/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f787ad62bc7ccc577c324b6d71689c0739123f0c --- /dev/null +++ b/checkpoint-3948/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7db19800bbcf792dcb25dea9b5ae39f4e934a0d56f64ed6f74d7d89e87ae928 +size 17645743048 diff --git a/checkpoint-3948/optimizer.bin b/checkpoint-3948/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..90ea9835df74c549d6f6b88c64f00fdc211af5fa --- /dev/null +++ b/checkpoint-3948/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:656d334c407ae1443fcaeda271d597e51249875fdde8e1a12a024812f6de73ab +size 32180124005 diff --git a/checkpoint-3948/pytorch_model_fsdp.bin b/checkpoint-3948/pytorch_model_fsdp.bin new file mode 100644 index 0000000000000000000000000000000000000000..a96db7a5fcab43218d82108cacd5f6fc2583929f --- /dev/null +++ b/checkpoint-3948/pytorch_model_fsdp.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51d19fbc90bb938bf3c747a8b9c2b23f00398029d4ab146ca0ca0a0ea7d8885c +size 17645897996 diff --git a/checkpoint-3948/rng_state_0.pth b/checkpoint-3948/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5379ca97bc0c62d226d0fc37920d4937a7bb8b43 --- /dev/null +++ b/checkpoint-3948/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61e957b4cd785256be4cb26eb03060ef689e1d58f1766d7f26ca36a62bec4994 +size 14917 diff --git a/checkpoint-3948/rng_state_1.pth b/checkpoint-3948/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..662ad0d5b30369c825f66c080779973608c5058e --- /dev/null +++ b/checkpoint-3948/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:550c54d430b44b77b0abe44c6e3ceba90a155305315c081b7616b35e2c18d1ce +size 14917 diff --git a/checkpoint-3948/scheduler.pt b/checkpoint-3948/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..51ed35f90326eb016d2a1c3993d7061549624ca8 --- /dev/null +++ b/checkpoint-3948/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deaab1725fa5d6abb332a09b31b7c4d93808c0289cb39a32cd5102547b98e285 +size 1465 diff --git a/checkpoint-3948/tokenizer.json b/checkpoint-3948/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/checkpoint-3948/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/checkpoint-3948/tokenizer_config.json b/checkpoint-3948/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e47e52c4e7f0b2bcf2103a878790216f3f6436d --- /dev/null +++ b/checkpoint-3948/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 1010000, + "pad_token": "<|fim_pad|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/checkpoint-3948/trainer_state.json b/checkpoint-3948/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5d447faf10413b9ec27585679ff7a32bdbe441fe --- /dev/null +++ b/checkpoint-3948/trainer_state.json @@ -0,0 +1,35566 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 3948, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007598784194528875, + "grad_norm": 11.767926216125488, + "learning_rate": 0.0, + "loss": 0.7937269806861877, + "mean_token_accuracy": 0.7822731137275696, + "num_tokens": 10507.0, + "step": 1 + }, + { + "epoch": 0.001519756838905775, + "grad_norm": 14.9199800491333, + "learning_rate": 2.5252525252525256e-08, + "loss": 0.7665389776229858, + "mean_token_accuracy": 0.8342233300209045, + "num_tokens": 14806.0, + "step": 2 + }, + { + "epoch": 0.0022796352583586625, + "grad_norm": 11.991217613220215, + "learning_rate": 5.050505050505051e-08, + "loss": 0.9597002267837524, + "mean_token_accuracy": 0.7054992318153381, + "num_tokens": 27170.0, + "step": 3 + }, + { + "epoch": 0.00303951367781155, + "grad_norm": 12.958333015441895, + "learning_rate": 7.575757575757576e-08, + "loss": 0.9971482753753662, + "mean_token_accuracy": 0.7261134386062622, + "num_tokens": 33729.0, + "step": 4 + }, + { + "epoch": 0.003799392097264438, + "grad_norm": 13.5665283203125, + "learning_rate": 1.0101010101010103e-07, + "loss": 0.9504883885383606, + "mean_token_accuracy": 0.745307445526123, + "num_tokens": 41174.0, + "step": 5 + }, + { + "epoch": 0.004559270516717325, + "grad_norm": 10.09444808959961, + "learning_rate": 1.2626262626262626e-07, + "loss": 0.759548008441925, + "mean_token_accuracy": 0.7842121124267578, + "num_tokens": 47943.0, + "step": 6 + }, + { + "epoch": 0.005319148936170213, + "grad_norm": 10.741650581359863, + "learning_rate": 1.5151515151515152e-07, + "loss": 0.8231598138809204, + "mean_token_accuracy": 0.7550969123840332, + "num_tokens": 56665.0, + "step": 7 + }, + { + "epoch": 0.0060790273556231, + "grad_norm": 12.250170707702637, + "learning_rate": 1.767676767676768e-07, + "loss": 0.8576581478118896, + "mean_token_accuracy": 0.7568671703338623, + "num_tokens": 67606.0, + "step": 8 + }, + { + "epoch": 0.006838905775075988, + "grad_norm": 12.828629493713379, + "learning_rate": 2.0202020202020205e-07, + "loss": 0.9886435866355896, + "mean_token_accuracy": 0.733400285243988, + "num_tokens": 74272.0, + "step": 9 + }, + { + "epoch": 0.007598784194528876, + "grad_norm": 15.966923713684082, + "learning_rate": 2.2727272727272729e-07, + "loss": 1.064985990524292, + "mean_token_accuracy": 0.7101132869720459, + "num_tokens": 80524.0, + "step": 10 + }, + { + "epoch": 0.008358662613981762, + "grad_norm": 10.864850044250488, + "learning_rate": 2.525252525252525e-07, + "loss": 0.8311550617218018, + "mean_token_accuracy": 0.7431639432907104, + "num_tokens": 96292.0, + "step": 11 + }, + { + "epoch": 0.00911854103343465, + "grad_norm": 16.438785552978516, + "learning_rate": 2.7777777777777776e-07, + "loss": 1.0579866170883179, + "mean_token_accuracy": 0.7222976684570312, + "num_tokens": 102992.0, + "step": 12 + }, + { + "epoch": 0.009878419452887538, + "grad_norm": 11.179214477539062, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.9816144704818726, + "mean_token_accuracy": 0.7206371426582336, + "num_tokens": 113571.0, + "step": 13 + }, + { + "epoch": 0.010638297872340425, + "grad_norm": 12.780299186706543, + "learning_rate": 3.2828282828282834e-07, + "loss": 0.847449004650116, + "mean_token_accuracy": 0.7826199531555176, + "num_tokens": 119568.0, + "step": 14 + }, + { + "epoch": 0.011398176291793313, + "grad_norm": 14.800421714782715, + "learning_rate": 3.535353535353536e-07, + "loss": 0.9275516271591187, + "mean_token_accuracy": 0.7655045986175537, + "num_tokens": 126258.0, + "step": 15 + }, + { + "epoch": 0.0121580547112462, + "grad_norm": 11.267602920532227, + "learning_rate": 3.787878787878788e-07, + "loss": 0.8464037179946899, + "mean_token_accuracy": 0.7606508731842041, + "num_tokens": 136831.0, + "step": 16 + }, + { + "epoch": 0.012917933130699088, + "grad_norm": 12.891013145446777, + "learning_rate": 4.040404040404041e-07, + "loss": 0.9903074502944946, + "mean_token_accuracy": 0.7247487306594849, + "num_tokens": 150434.0, + "step": 17 + }, + { + "epoch": 0.013677811550151976, + "grad_norm": 11.13957691192627, + "learning_rate": 4.2929292929292934e-07, + "loss": 0.8287211656570435, + "mean_token_accuracy": 0.7621913552284241, + "num_tokens": 158516.0, + "step": 18 + }, + { + "epoch": 0.014437689969604863, + "grad_norm": 18.39569664001465, + "learning_rate": 4.5454545454545457e-07, + "loss": 1.150015115737915, + "mean_token_accuracy": 0.7349498271942139, + "num_tokens": 162214.0, + "step": 19 + }, + { + "epoch": 0.015197568389057751, + "grad_norm": 9.353750228881836, + "learning_rate": 4.797979797979798e-07, + "loss": 0.7228299379348755, + "mean_token_accuracy": 0.7969573736190796, + "num_tokens": 173035.0, + "step": 20 + }, + { + "epoch": 0.015957446808510637, + "grad_norm": 8.267163276672363, + "learning_rate": 5.05050505050505e-07, + "loss": 0.7358136177062988, + "mean_token_accuracy": 0.7903937101364136, + "num_tokens": 183568.0, + "step": 21 + }, + { + "epoch": 0.016717325227963525, + "grad_norm": 11.137128829956055, + "learning_rate": 5.303030303030304e-07, + "loss": 1.0075397491455078, + "mean_token_accuracy": 0.702807605266571, + "num_tokens": 192759.0, + "step": 22 + }, + { + "epoch": 0.017477203647416412, + "grad_norm": 10.734103202819824, + "learning_rate": 5.555555555555555e-07, + "loss": 0.8925919532775879, + "mean_token_accuracy": 0.7475671768188477, + "num_tokens": 201280.0, + "step": 23 + }, + { + "epoch": 0.0182370820668693, + "grad_norm": 11.945566177368164, + "learning_rate": 5.808080808080809e-07, + "loss": 0.7260514497756958, + "mean_token_accuracy": 0.7859152555465698, + "num_tokens": 218053.0, + "step": 24 + }, + { + "epoch": 0.018996960486322188, + "grad_norm": 18.610652923583984, + "learning_rate": 6.060606060606061e-07, + "loss": 0.8995465636253357, + "mean_token_accuracy": 0.7931990623474121, + "num_tokens": 220953.0, + "step": 25 + }, + { + "epoch": 0.019756838905775075, + "grad_norm": 10.51898193359375, + "learning_rate": 6.313131313131314e-07, + "loss": 0.9532671570777893, + "mean_token_accuracy": 0.7257645726203918, + "num_tokens": 231200.0, + "step": 26 + }, + { + "epoch": 0.020516717325227963, + "grad_norm": 9.581812858581543, + "learning_rate": 6.565656565656567e-07, + "loss": 0.9038010239601135, + "mean_token_accuracy": 0.7390379905700684, + "num_tokens": 237711.0, + "step": 27 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 12.297484397888184, + "learning_rate": 6.818181818181818e-07, + "loss": 1.048936367034912, + "mean_token_accuracy": 0.7175670862197876, + "num_tokens": 242503.0, + "step": 28 + }, + { + "epoch": 0.022036474164133738, + "grad_norm": 7.437953472137451, + "learning_rate": 7.070707070707071e-07, + "loss": 0.8308826684951782, + "mean_token_accuracy": 0.7415335774421692, + "num_tokens": 250842.0, + "step": 29 + }, + { + "epoch": 0.022796352583586626, + "grad_norm": 6.134475231170654, + "learning_rate": 7.323232323232324e-07, + "loss": 0.647913932800293, + "mean_token_accuracy": 0.8124054670333862, + "num_tokens": 267453.0, + "step": 30 + }, + { + "epoch": 0.023556231003039513, + "grad_norm": 6.678966045379639, + "learning_rate": 7.575757575757576e-07, + "loss": 0.7052810192108154, + "mean_token_accuracy": 0.7908754348754883, + "num_tokens": 284416.0, + "step": 31 + }, + { + "epoch": 0.0243161094224924, + "grad_norm": 7.42232084274292, + "learning_rate": 7.82828282828283e-07, + "loss": 1.022383213043213, + "mean_token_accuracy": 0.7053230404853821, + "num_tokens": 292073.0, + "step": 32 + }, + { + "epoch": 0.02507598784194529, + "grad_norm": 6.463219165802002, + "learning_rate": 8.080808080808082e-07, + "loss": 0.7603012323379517, + "mean_token_accuracy": 0.7728140354156494, + "num_tokens": 298550.0, + "step": 33 + }, + { + "epoch": 0.025835866261398176, + "grad_norm": 5.668411731719971, + "learning_rate": 8.333333333333333e-07, + "loss": 0.7707852721214294, + "mean_token_accuracy": 0.7827773094177246, + "num_tokens": 306683.0, + "step": 34 + }, + { + "epoch": 0.026595744680851064, + "grad_norm": 4.984964847564697, + "learning_rate": 8.585858585858587e-07, + "loss": 0.6317349672317505, + "mean_token_accuracy": 0.8106861114501953, + "num_tokens": 318842.0, + "step": 35 + }, + { + "epoch": 0.02735562310030395, + "grad_norm": 4.421732425689697, + "learning_rate": 8.838383838383839e-07, + "loss": 0.6228617429733276, + "mean_token_accuracy": 0.8023355603218079, + "num_tokens": 329850.0, + "step": 36 + }, + { + "epoch": 0.02811550151975684, + "grad_norm": 5.970808029174805, + "learning_rate": 9.090909090909091e-07, + "loss": 0.8443238139152527, + "mean_token_accuracy": 0.7462409734725952, + "num_tokens": 335844.0, + "step": 37 + }, + { + "epoch": 0.028875379939209727, + "grad_norm": 4.5389084815979, + "learning_rate": 9.343434343434345e-07, + "loss": 0.6976436376571655, + "mean_token_accuracy": 0.790410041809082, + "num_tokens": 348768.0, + "step": 38 + }, + { + "epoch": 0.029635258358662615, + "grad_norm": 4.116631507873535, + "learning_rate": 9.595959595959596e-07, + "loss": 0.6698519587516785, + "mean_token_accuracy": 0.7818127870559692, + "num_tokens": 355460.0, + "step": 39 + }, + { + "epoch": 0.030395136778115502, + "grad_norm": 3.3714773654937744, + "learning_rate": 9.84848484848485e-07, + "loss": 0.5723201036453247, + "mean_token_accuracy": 0.8100086450576782, + "num_tokens": 368507.0, + "step": 40 + }, + { + "epoch": 0.03115501519756839, + "grad_norm": 4.4438347816467285, + "learning_rate": 1.01010101010101e-06, + "loss": 0.7508786916732788, + "mean_token_accuracy": 0.7711942791938782, + "num_tokens": 376467.0, + "step": 41 + }, + { + "epoch": 0.031914893617021274, + "grad_norm": 5.609974384307861, + "learning_rate": 1.0353535353535354e-06, + "loss": 0.566256046295166, + "mean_token_accuracy": 0.8319284319877625, + "num_tokens": 381399.0, + "step": 42 + }, + { + "epoch": 0.03267477203647416, + "grad_norm": 5.124386787414551, + "learning_rate": 1.0606060606060608e-06, + "loss": 0.8151067495346069, + "mean_token_accuracy": 0.7537785768508911, + "num_tokens": 387389.0, + "step": 43 + }, + { + "epoch": 0.03343465045592705, + "grad_norm": 3.6318116188049316, + "learning_rate": 1.085858585858586e-06, + "loss": 0.5989949107170105, + "mean_token_accuracy": 0.8129256963729858, + "num_tokens": 395302.0, + "step": 44 + }, + { + "epoch": 0.03419452887537994, + "grad_norm": 2.694424629211426, + "learning_rate": 1.111111111111111e-06, + "loss": 0.5831396579742432, + "mean_token_accuracy": 0.8056820631027222, + "num_tokens": 409920.0, + "step": 45 + }, + { + "epoch": 0.034954407294832825, + "grad_norm": 2.2949178218841553, + "learning_rate": 1.1363636363636364e-06, + "loss": 0.472550630569458, + "mean_token_accuracy": 0.8343006372451782, + "num_tokens": 428323.0, + "step": 46 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 3.3930575847625732, + "learning_rate": 1.1616161616161617e-06, + "loss": 0.6246505379676819, + "mean_token_accuracy": 0.783149003982544, + "num_tokens": 435889.0, + "step": 47 + }, + { + "epoch": 0.0364741641337386, + "grad_norm": 3.692598819732666, + "learning_rate": 1.186868686868687e-06, + "loss": 0.46132946014404297, + "mean_token_accuracy": 0.8583089113235474, + "num_tokens": 441192.0, + "step": 48 + }, + { + "epoch": 0.03723404255319149, + "grad_norm": 6.571533203125, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.9351121783256531, + "mean_token_accuracy": 0.7580878734588623, + "num_tokens": 444277.0, + "step": 49 + }, + { + "epoch": 0.037993920972644375, + "grad_norm": 5.029570579528809, + "learning_rate": 1.2373737373737375e-06, + "loss": 0.6921554803848267, + "mean_token_accuracy": 0.8131166100502014, + "num_tokens": 447646.0, + "step": 50 + }, + { + "epoch": 0.03875379939209726, + "grad_norm": 2.9174208641052246, + "learning_rate": 1.2626262626262629e-06, + "loss": 0.591706395149231, + "mean_token_accuracy": 0.8108617067337036, + "num_tokens": 461397.0, + "step": 51 + }, + { + "epoch": 0.03951367781155015, + "grad_norm": 4.315536022186279, + "learning_rate": 1.287878787878788e-06, + "loss": 0.6986310482025146, + "mean_token_accuracy": 0.7710754871368408, + "num_tokens": 472047.0, + "step": 52 + }, + { + "epoch": 0.04027355623100304, + "grad_norm": 2.6216275691986084, + "learning_rate": 1.3131313131313134e-06, + "loss": 0.5553690791130066, + "mean_token_accuracy": 0.8167896866798401, + "num_tokens": 482795.0, + "step": 53 + }, + { + "epoch": 0.041033434650455926, + "grad_norm": 3.0562477111816406, + "learning_rate": 1.3383838383838385e-06, + "loss": 0.6909202337265015, + "mean_token_accuracy": 0.7859863638877869, + "num_tokens": 494818.0, + "step": 54 + }, + { + "epoch": 0.04179331306990881, + "grad_norm": 2.1420412063598633, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.5415265560150146, + "mean_token_accuracy": 0.818886399269104, + "num_tokens": 513695.0, + "step": 55 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 2.9610488414764404, + "learning_rate": 1.3888888888888892e-06, + "loss": 0.6602212190628052, + "mean_token_accuracy": 0.7830734252929688, + "num_tokens": 523784.0, + "step": 56 + }, + { + "epoch": 0.04331306990881459, + "grad_norm": 2.511972665786743, + "learning_rate": 1.4141414141414143e-06, + "loss": 0.5717809796333313, + "mean_token_accuracy": 0.8053616285324097, + "num_tokens": 546308.0, + "step": 57 + }, + { + "epoch": 0.044072948328267476, + "grad_norm": 3.52642822265625, + "learning_rate": 1.4393939393939396e-06, + "loss": 0.6242594718933105, + "mean_token_accuracy": 0.8162082433700562, + "num_tokens": 552019.0, + "step": 58 + }, + { + "epoch": 0.044832826747720364, + "grad_norm": 3.02362322807312, + "learning_rate": 1.4646464646464648e-06, + "loss": 0.6634255647659302, + "mean_token_accuracy": 0.7682032585144043, + "num_tokens": 560009.0, + "step": 59 + }, + { + "epoch": 0.04559270516717325, + "grad_norm": 2.3910107612609863, + "learning_rate": 1.48989898989899e-06, + "loss": 0.5519146919250488, + "mean_token_accuracy": 0.8270269632339478, + "num_tokens": 571005.0, + "step": 60 + }, + { + "epoch": 0.04635258358662614, + "grad_norm": 4.28154993057251, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.7437789440155029, + "mean_token_accuracy": 0.7782418131828308, + "num_tokens": 574950.0, + "step": 61 + }, + { + "epoch": 0.04711246200607903, + "grad_norm": 3.4078686237335205, + "learning_rate": 1.5404040404040404e-06, + "loss": 0.6345915198326111, + "mean_token_accuracy": 0.7903392314910889, + "num_tokens": 581657.0, + "step": 62 + }, + { + "epoch": 0.047872340425531915, + "grad_norm": 2.6834158897399902, + "learning_rate": 1.565656565656566e-06, + "loss": 0.5981127023696899, + "mean_token_accuracy": 0.7911489605903625, + "num_tokens": 591267.0, + "step": 63 + }, + { + "epoch": 0.0486322188449848, + "grad_norm": 2.1054461002349854, + "learning_rate": 1.590909090909091e-06, + "loss": 0.5523523688316345, + "mean_token_accuracy": 0.8194501399993896, + "num_tokens": 606787.0, + "step": 64 + }, + { + "epoch": 0.04939209726443769, + "grad_norm": 3.322596788406372, + "learning_rate": 1.6161616161616164e-06, + "loss": 0.48417025804519653, + "mean_token_accuracy": 0.8293706178665161, + "num_tokens": 611068.0, + "step": 65 + }, + { + "epoch": 0.05015197568389058, + "grad_norm": 2.302450180053711, + "learning_rate": 1.6414141414141415e-06, + "loss": 0.6498389840126038, + "mean_token_accuracy": 0.7728497385978699, + "num_tokens": 624452.0, + "step": 66 + }, + { + "epoch": 0.050911854103343465, + "grad_norm": 2.680191993713379, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.6347037553787231, + "mean_token_accuracy": 0.8108306527137756, + "num_tokens": 638049.0, + "step": 67 + }, + { + "epoch": 0.05167173252279635, + "grad_norm": 3.0297021865844727, + "learning_rate": 1.6919191919191922e-06, + "loss": 0.5344363451004028, + "mean_token_accuracy": 0.8113535046577454, + "num_tokens": 643892.0, + "step": 68 + }, + { + "epoch": 0.05243161094224924, + "grad_norm": 2.9283676147460938, + "learning_rate": 1.7171717171717173e-06, + "loss": 0.6999260187149048, + "mean_token_accuracy": 0.7782022356987, + "num_tokens": 654418.0, + "step": 69 + }, + { + "epoch": 0.05319148936170213, + "grad_norm": 3.4098572731018066, + "learning_rate": 1.7424242424242427e-06, + "loss": 0.6508946418762207, + "mean_token_accuracy": 0.7942900657653809, + "num_tokens": 659837.0, + "step": 70 + }, + { + "epoch": 0.053951367781155016, + "grad_norm": 2.6756019592285156, + "learning_rate": 1.7676767676767678e-06, + "loss": 0.603486180305481, + "mean_token_accuracy": 0.8015457391738892, + "num_tokens": 668361.0, + "step": 71 + }, + { + "epoch": 0.0547112462006079, + "grad_norm": 2.2630293369293213, + "learning_rate": 1.792929292929293e-06, + "loss": 0.6608274579048157, + "mean_token_accuracy": 0.7753809690475464, + "num_tokens": 679025.0, + "step": 72 + }, + { + "epoch": 0.05547112462006079, + "grad_norm": 2.123962879180908, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.4525482654571533, + "mean_token_accuracy": 0.8425612449645996, + "num_tokens": 688574.0, + "step": 73 + }, + { + "epoch": 0.05623100303951368, + "grad_norm": 7.90519905090332, + "learning_rate": 1.8434343434343434e-06, + "loss": 0.6507195830345154, + "mean_token_accuracy": 0.7714964151382446, + "num_tokens": 694534.0, + "step": 74 + }, + { + "epoch": 0.056990881458966566, + "grad_norm": 2.372203826904297, + "learning_rate": 1.868686868686869e-06, + "loss": 0.4458143413066864, + "mean_token_accuracy": 0.7991449236869812, + "num_tokens": 703114.0, + "step": 75 + }, + { + "epoch": 0.057750759878419454, + "grad_norm": 2.918677568435669, + "learning_rate": 1.8939393939393941e-06, + "loss": 0.5614339113235474, + "mean_token_accuracy": 0.8211464881896973, + "num_tokens": 709038.0, + "step": 76 + }, + { + "epoch": 0.05851063829787234, + "grad_norm": 1.6106709241867065, + "learning_rate": 1.9191919191919192e-06, + "loss": 0.5802098512649536, + "mean_token_accuracy": 0.8055065870285034, + "num_tokens": 730482.0, + "step": 77 + }, + { + "epoch": 0.05927051671732523, + "grad_norm": 2.8069989681243896, + "learning_rate": 1.944444444444445e-06, + "loss": 0.5709059238433838, + "mean_token_accuracy": 0.8024872541427612, + "num_tokens": 751817.0, + "step": 78 + }, + { + "epoch": 0.06003039513677812, + "grad_norm": 2.641667127609253, + "learning_rate": 1.96969696969697e-06, + "loss": 0.6480152606964111, + "mean_token_accuracy": 0.7912271618843079, + "num_tokens": 759236.0, + "step": 79 + }, + { + "epoch": 0.060790273556231005, + "grad_norm": 2.6034350395202637, + "learning_rate": 1.994949494949495e-06, + "loss": 0.5535176396369934, + "mean_token_accuracy": 0.7980542778968811, + "num_tokens": 766496.0, + "step": 80 + }, + { + "epoch": 0.06155015197568389, + "grad_norm": 1.7095069885253906, + "learning_rate": 2.02020202020202e-06, + "loss": 0.4545496106147766, + "mean_token_accuracy": 0.8229660391807556, + "num_tokens": 780124.0, + "step": 81 + }, + { + "epoch": 0.06231003039513678, + "grad_norm": 3.788830518722534, + "learning_rate": 2.0454545454545457e-06, + "loss": 0.6679391264915466, + "mean_token_accuracy": 0.7942397594451904, + "num_tokens": 784555.0, + "step": 82 + }, + { + "epoch": 0.06306990881458967, + "grad_norm": 2.009831666946411, + "learning_rate": 2.070707070707071e-06, + "loss": 0.5067101120948792, + "mean_token_accuracy": 0.8276634216308594, + "num_tokens": 797459.0, + "step": 83 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 2.201627731323242, + "learning_rate": 2.095959595959596e-06, + "loss": 0.5012127161026001, + "mean_token_accuracy": 0.8432504534721375, + "num_tokens": 810817.0, + "step": 84 + }, + { + "epoch": 0.06458966565349544, + "grad_norm": 2.492568016052246, + "learning_rate": 2.1212121212121216e-06, + "loss": 0.6142797470092773, + "mean_token_accuracy": 0.8338661193847656, + "num_tokens": 818191.0, + "step": 85 + }, + { + "epoch": 0.06534954407294832, + "grad_norm": 2.8360862731933594, + "learning_rate": 2.1464646464646467e-06, + "loss": 0.5569300651550293, + "mean_token_accuracy": 0.8121030330657959, + "num_tokens": 825325.0, + "step": 86 + }, + { + "epoch": 0.06610942249240122, + "grad_norm": 2.407548427581787, + "learning_rate": 2.171717171717172e-06, + "loss": 0.6442930102348328, + "mean_token_accuracy": 0.792514443397522, + "num_tokens": 834439.0, + "step": 87 + }, + { + "epoch": 0.0668693009118541, + "grad_norm": 2.340728759765625, + "learning_rate": 2.196969696969697e-06, + "loss": 0.6494365930557251, + "mean_token_accuracy": 0.7746615409851074, + "num_tokens": 843078.0, + "step": 88 + }, + { + "epoch": 0.067629179331307, + "grad_norm": 1.7703697681427002, + "learning_rate": 2.222222222222222e-06, + "loss": 0.598991870880127, + "mean_token_accuracy": 0.7992157340049744, + "num_tokens": 860171.0, + "step": 89 + }, + { + "epoch": 0.06838905775075987, + "grad_norm": 2.5779271125793457, + "learning_rate": 2.2474747474747476e-06, + "loss": 0.5693082809448242, + "mean_token_accuracy": 0.8093700408935547, + "num_tokens": 866669.0, + "step": 90 + }, + { + "epoch": 0.06914893617021277, + "grad_norm": 2.014092206954956, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.5346695780754089, + "mean_token_accuracy": 0.8165590763092041, + "num_tokens": 876698.0, + "step": 91 + }, + { + "epoch": 0.06990881458966565, + "grad_norm": 1.7555919885635376, + "learning_rate": 2.2979797979797983e-06, + "loss": 0.5321458578109741, + "mean_token_accuracy": 0.8166656494140625, + "num_tokens": 889488.0, + "step": 92 + }, + { + "epoch": 0.07066869300911854, + "grad_norm": 1.8631824254989624, + "learning_rate": 2.3232323232323234e-06, + "loss": 0.5246532559394836, + "mean_token_accuracy": 0.8088107705116272, + "num_tokens": 901322.0, + "step": 93 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 3.2332139015197754, + "learning_rate": 2.348484848484849e-06, + "loss": 0.5141711235046387, + "mean_token_accuracy": 0.8382217884063721, + "num_tokens": 905792.0, + "step": 94 + }, + { + "epoch": 0.07218844984802432, + "grad_norm": 1.7806555032730103, + "learning_rate": 2.373737373737374e-06, + "loss": 0.5233149528503418, + "mean_token_accuracy": 0.8101529479026794, + "num_tokens": 917320.0, + "step": 95 + }, + { + "epoch": 0.0729483282674772, + "grad_norm": 1.8169859647750854, + "learning_rate": 2.3989898989898993e-06, + "loss": 0.578881561756134, + "mean_token_accuracy": 0.8044873476028442, + "num_tokens": 931062.0, + "step": 96 + }, + { + "epoch": 0.0737082066869301, + "grad_norm": 4.677402496337891, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.7842556238174438, + "mean_token_accuracy": 0.7579764127731323, + "num_tokens": 934712.0, + "step": 97 + }, + { + "epoch": 0.07446808510638298, + "grad_norm": 2.6987264156341553, + "learning_rate": 2.4494949494949495e-06, + "loss": 0.5669287443161011, + "mean_token_accuracy": 0.8186933994293213, + "num_tokens": 941058.0, + "step": 98 + }, + { + "epoch": 0.07522796352583587, + "grad_norm": 1.6906023025512695, + "learning_rate": 2.474747474747475e-06, + "loss": 0.4976363778114319, + "mean_token_accuracy": 0.8198553323745728, + "num_tokens": 956509.0, + "step": 99 + }, + { + "epoch": 0.07598784194528875, + "grad_norm": 2.7256152629852295, + "learning_rate": 2.5e-06, + "loss": 0.7138420343399048, + "mean_token_accuracy": 0.7752805948257446, + "num_tokens": 963920.0, + "step": 100 + }, + { + "epoch": 0.07674772036474165, + "grad_norm": 2.174870491027832, + "learning_rate": 2.5252525252525258e-06, + "loss": 0.6733541488647461, + "mean_token_accuracy": 0.7745175361633301, + "num_tokens": 975268.0, + "step": 101 + }, + { + "epoch": 0.07750759878419453, + "grad_norm": 1.5587213039398193, + "learning_rate": 2.5505050505050505e-06, + "loss": 0.44223445653915405, + "mean_token_accuracy": 0.8278359174728394, + "num_tokens": 991837.0, + "step": 102 + }, + { + "epoch": 0.07826747720364742, + "grad_norm": 2.181840658187866, + "learning_rate": 2.575757575757576e-06, + "loss": 0.625128448009491, + "mean_token_accuracy": 0.7941786050796509, + "num_tokens": 1004325.0, + "step": 103 + }, + { + "epoch": 0.0790273556231003, + "grad_norm": 1.4986687898635864, + "learning_rate": 2.601010101010101e-06, + "loss": 0.39262527227401733, + "mean_token_accuracy": 0.8412648439407349, + "num_tokens": 1018331.0, + "step": 104 + }, + { + "epoch": 0.0797872340425532, + "grad_norm": 2.3416061401367188, + "learning_rate": 2.6262626262626267e-06, + "loss": 0.5495132803916931, + "mean_token_accuracy": 0.8193322420120239, + "num_tokens": 1026090.0, + "step": 105 + }, + { + "epoch": 0.08054711246200608, + "grad_norm": 3.8168859481811523, + "learning_rate": 2.6515151515151514e-06, + "loss": 0.4898706376552582, + "mean_token_accuracy": 0.8467956185340881, + "num_tokens": 1029955.0, + "step": 106 + }, + { + "epoch": 0.08130699088145897, + "grad_norm": 4.113908767700195, + "learning_rate": 2.676767676767677e-06, + "loss": 0.6189584732055664, + "mean_token_accuracy": 0.8019394278526306, + "num_tokens": 1033598.0, + "step": 107 + }, + { + "epoch": 0.08206686930091185, + "grad_norm": 2.50003981590271, + "learning_rate": 2.7020202020202025e-06, + "loss": 0.6479471921920776, + "mean_token_accuracy": 0.7790026664733887, + "num_tokens": 1042533.0, + "step": 108 + }, + { + "epoch": 0.08282674772036475, + "grad_norm": 1.408934473991394, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.3909248113632202, + "mean_token_accuracy": 0.8477586507797241, + "num_tokens": 1061755.0, + "step": 109 + }, + { + "epoch": 0.08358662613981763, + "grad_norm": 3.360633611679077, + "learning_rate": 2.7525252525252528e-06, + "loss": 0.6952459812164307, + "mean_token_accuracy": 0.777535080909729, + "num_tokens": 1067316.0, + "step": 110 + }, + { + "epoch": 0.08434650455927052, + "grad_norm": 1.8631696701049805, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.5420593023300171, + "mean_token_accuracy": 0.8157662749290466, + "num_tokens": 1079930.0, + "step": 111 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.4308314323425293, + "learning_rate": 2.803030303030303e-06, + "loss": 0.5863882303237915, + "mean_token_accuracy": 0.8206346035003662, + "num_tokens": 1088069.0, + "step": 112 + }, + { + "epoch": 0.0858662613981763, + "grad_norm": 2.922808885574341, + "learning_rate": 2.8282828282828286e-06, + "loss": 0.5217319130897522, + "mean_token_accuracy": 0.8253234028816223, + "num_tokens": 1093607.0, + "step": 113 + }, + { + "epoch": 0.08662613981762918, + "grad_norm": 2.3596107959747314, + "learning_rate": 2.8535353535353537e-06, + "loss": 0.5070714950561523, + "mean_token_accuracy": 0.8258323669433594, + "num_tokens": 1100405.0, + "step": 114 + }, + { + "epoch": 0.08738601823708207, + "grad_norm": 3.0853066444396973, + "learning_rate": 2.8787878787878793e-06, + "loss": 0.591964840888977, + "mean_token_accuracy": 0.8047322630882263, + "num_tokens": 1107535.0, + "step": 115 + }, + { + "epoch": 0.08814589665653495, + "grad_norm": 1.9251092672348022, + "learning_rate": 2.904040404040404e-06, + "loss": 0.5226191878318787, + "mean_token_accuracy": 0.8022720217704773, + "num_tokens": 1118716.0, + "step": 116 + }, + { + "epoch": 0.08890577507598785, + "grad_norm": 1.9692988395690918, + "learning_rate": 2.9292929292929295e-06, + "loss": 0.5462069511413574, + "mean_token_accuracy": 0.8157015442848206, + "num_tokens": 1131917.0, + "step": 117 + }, + { + "epoch": 0.08966565349544073, + "grad_norm": 1.4738909006118774, + "learning_rate": 2.954545454545455e-06, + "loss": 0.4564219117164612, + "mean_token_accuracy": 0.849632978439331, + "num_tokens": 1148534.0, + "step": 118 + }, + { + "epoch": 0.09042553191489362, + "grad_norm": 2.72646164894104, + "learning_rate": 2.97979797979798e-06, + "loss": 0.6654808521270752, + "mean_token_accuracy": 0.7752684354782104, + "num_tokens": 1155438.0, + "step": 119 + }, + { + "epoch": 0.0911854103343465, + "grad_norm": 2.7843852043151855, + "learning_rate": 3.0050505050505054e-06, + "loss": 0.5354680418968201, + "mean_token_accuracy": 0.8196378946304321, + "num_tokens": 1161815.0, + "step": 120 + }, + { + "epoch": 0.0919452887537994, + "grad_norm": 2.8052573204040527, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.6366757154464722, + "mean_token_accuracy": 0.7967483997344971, + "num_tokens": 1168295.0, + "step": 121 + }, + { + "epoch": 0.09270516717325228, + "grad_norm": 2.7462735176086426, + "learning_rate": 3.055555555555556e-06, + "loss": 0.59470534324646, + "mean_token_accuracy": 0.8023771047592163, + "num_tokens": 1174502.0, + "step": 122 + }, + { + "epoch": 0.09346504559270517, + "grad_norm": 2.2743821144104004, + "learning_rate": 3.0808080808080807e-06, + "loss": 0.5720560550689697, + "mean_token_accuracy": 0.8162771463394165, + "num_tokens": 1183615.0, + "step": 123 + }, + { + "epoch": 0.09422492401215805, + "grad_norm": 1.8669533729553223, + "learning_rate": 3.1060606060606063e-06, + "loss": 0.4655378758907318, + "mean_token_accuracy": 0.8360732793807983, + "num_tokens": 1193761.0, + "step": 124 + }, + { + "epoch": 0.09498480243161095, + "grad_norm": 1.7666901350021362, + "learning_rate": 3.131313131313132e-06, + "loss": 0.5524153709411621, + "mean_token_accuracy": 0.8252713680267334, + "num_tokens": 1207870.0, + "step": 125 + }, + { + "epoch": 0.09574468085106383, + "grad_norm": 2.4720070362091064, + "learning_rate": 3.1565656565656566e-06, + "loss": 0.5003011226654053, + "mean_token_accuracy": 0.8491042852401733, + "num_tokens": 1214603.0, + "step": 126 + }, + { + "epoch": 0.09650455927051672, + "grad_norm": 1.6500422954559326, + "learning_rate": 3.181818181818182e-06, + "loss": 0.5137069225311279, + "mean_token_accuracy": 0.8273531198501587, + "num_tokens": 1228717.0, + "step": 127 + }, + { + "epoch": 0.0972644376899696, + "grad_norm": 3.402543067932129, + "learning_rate": 3.2070707070707072e-06, + "loss": 0.708167552947998, + "mean_token_accuracy": 0.7705385684967041, + "num_tokens": 1234361.0, + "step": 128 + }, + { + "epoch": 0.0980243161094225, + "grad_norm": 2.547285795211792, + "learning_rate": 3.232323232323233e-06, + "loss": 0.6020137071609497, + "mean_token_accuracy": 0.7981340289115906, + "num_tokens": 1244169.0, + "step": 129 + }, + { + "epoch": 0.09878419452887538, + "grad_norm": 2.0578792095184326, + "learning_rate": 3.257575757575758e-06, + "loss": 0.4425000250339508, + "mean_token_accuracy": 0.8567807674407959, + "num_tokens": 1252709.0, + "step": 130 + }, + { + "epoch": 0.09954407294832827, + "grad_norm": 1.672614336013794, + "learning_rate": 3.282828282828283e-06, + "loss": 0.4860966205596924, + "mean_token_accuracy": 0.8393139243125916, + "num_tokens": 1265766.0, + "step": 131 + }, + { + "epoch": 0.10030395136778116, + "grad_norm": 3.2560198307037354, + "learning_rate": 3.3080808080808086e-06, + "loss": 0.624736487865448, + "mean_token_accuracy": 0.7875322699546814, + "num_tokens": 1270779.0, + "step": 132 + }, + { + "epoch": 0.10106382978723404, + "grad_norm": 2.4468185901641846, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5062227249145508, + "mean_token_accuracy": 0.8217229843139648, + "num_tokens": 1277113.0, + "step": 133 + }, + { + "epoch": 0.10182370820668693, + "grad_norm": 2.6371328830718994, + "learning_rate": 3.358585858585859e-06, + "loss": 0.477113276720047, + "mean_token_accuracy": 0.8605583906173706, + "num_tokens": 1282514.0, + "step": 134 + }, + { + "epoch": 0.10258358662613981, + "grad_norm": 2.48421311378479, + "learning_rate": 3.3838383838383844e-06, + "loss": 0.40855684876441956, + "mean_token_accuracy": 0.864548921585083, + "num_tokens": 1287859.0, + "step": 135 + }, + { + "epoch": 0.1033434650455927, + "grad_norm": 1.993099331855774, + "learning_rate": 3.409090909090909e-06, + "loss": 0.5913145542144775, + "mean_token_accuracy": 0.8248485922813416, + "num_tokens": 1301074.0, + "step": 136 + }, + { + "epoch": 0.10410334346504559, + "grad_norm": 3.5947680473327637, + "learning_rate": 3.4343434343434347e-06, + "loss": 0.5028599500656128, + "mean_token_accuracy": 0.8367215394973755, + "num_tokens": 1305219.0, + "step": 137 + }, + { + "epoch": 0.10486322188449848, + "grad_norm": 2.5778582096099854, + "learning_rate": 3.45959595959596e-06, + "loss": 0.5297672748565674, + "mean_token_accuracy": 0.8232187032699585, + "num_tokens": 1312482.0, + "step": 138 + }, + { + "epoch": 0.10562310030395136, + "grad_norm": 1.8961588144302368, + "learning_rate": 3.4848484848484854e-06, + "loss": 0.39954107999801636, + "mean_token_accuracy": 0.8605833053588867, + "num_tokens": 1323404.0, + "step": 139 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 1.9687960147857666, + "learning_rate": 3.51010101010101e-06, + "loss": 0.48791587352752686, + "mean_token_accuracy": 0.8200347423553467, + "num_tokens": 1333027.0, + "step": 140 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 2.520242691040039, + "learning_rate": 3.5353535353535356e-06, + "loss": 0.6106002330780029, + "mean_token_accuracy": 0.790692150592804, + "num_tokens": 1340999.0, + "step": 141 + }, + { + "epoch": 0.10790273556231003, + "grad_norm": 3.751617431640625, + "learning_rate": 3.560606060606061e-06, + "loss": 0.48141729831695557, + "mean_token_accuracy": 0.8421382904052734, + "num_tokens": 1344687.0, + "step": 142 + }, + { + "epoch": 0.10866261398176291, + "grad_norm": 2.7101709842681885, + "learning_rate": 3.585858585858586e-06, + "loss": 0.5375241637229919, + "mean_token_accuracy": 0.8061438202857971, + "num_tokens": 1350192.0, + "step": 143 + }, + { + "epoch": 0.1094224924012158, + "grad_norm": 2.583484411239624, + "learning_rate": 3.6111111111111115e-06, + "loss": 0.6492470502853394, + "mean_token_accuracy": 0.7863001823425293, + "num_tokens": 1358148.0, + "step": 144 + }, + { + "epoch": 0.11018237082066869, + "grad_norm": 1.792561650276184, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.48480600118637085, + "mean_token_accuracy": 0.8358709812164307, + "num_tokens": 1369519.0, + "step": 145 + }, + { + "epoch": 0.11094224924012158, + "grad_norm": 2.6480472087860107, + "learning_rate": 3.661616161616162e-06, + "loss": 0.5268933176994324, + "mean_token_accuracy": 0.8214013576507568, + "num_tokens": 1375862.0, + "step": 146 + }, + { + "epoch": 0.11170212765957446, + "grad_norm": 2.3174469470977783, + "learning_rate": 3.686868686868687e-06, + "loss": 0.42517897486686707, + "mean_token_accuracy": 0.8523461222648621, + "num_tokens": 1381546.0, + "step": 147 + }, + { + "epoch": 0.11246200607902736, + "grad_norm": 3.0090949535369873, + "learning_rate": 3.7121212121212124e-06, + "loss": 0.4042336940765381, + "mean_token_accuracy": 0.8670448064804077, + "num_tokens": 1385896.0, + "step": 148 + }, + { + "epoch": 0.11322188449848024, + "grad_norm": 2.4928104877471924, + "learning_rate": 3.737373737373738e-06, + "loss": 0.6498878598213196, + "mean_token_accuracy": 0.7967068552970886, + "num_tokens": 1394169.0, + "step": 149 + }, + { + "epoch": 0.11398176291793313, + "grad_norm": 1.5984913110733032, + "learning_rate": 3.7626262626262627e-06, + "loss": 0.546096920967102, + "mean_token_accuracy": 0.8035850524902344, + "num_tokens": 1408785.0, + "step": 150 + }, + { + "epoch": 0.11474164133738601, + "grad_norm": 2.3663532733917236, + "learning_rate": 3.7878787878787882e-06, + "loss": 0.6111721992492676, + "mean_token_accuracy": 0.8015355467796326, + "num_tokens": 1417510.0, + "step": 151 + }, + { + "epoch": 0.11550151975683891, + "grad_norm": 2.518932819366455, + "learning_rate": 3.8131313131313138e-06, + "loss": 0.5274964570999146, + "mean_token_accuracy": 0.8155480623245239, + "num_tokens": 1424186.0, + "step": 152 + }, + { + "epoch": 0.11626139817629179, + "grad_norm": 2.14353609085083, + "learning_rate": 3.8383838383838385e-06, + "loss": 0.5283297896385193, + "mean_token_accuracy": 0.8275758028030396, + "num_tokens": 1432630.0, + "step": 153 + }, + { + "epoch": 0.11702127659574468, + "grad_norm": 1.8243604898452759, + "learning_rate": 3.863636363636364e-06, + "loss": 0.41854870319366455, + "mean_token_accuracy": 0.8222295045852661, + "num_tokens": 1442691.0, + "step": 154 + }, + { + "epoch": 0.11778115501519756, + "grad_norm": 2.088212251663208, + "learning_rate": 3.88888888888889e-06, + "loss": 0.6062943339347839, + "mean_token_accuracy": 0.8009427785873413, + "num_tokens": 1456890.0, + "step": 155 + }, + { + "epoch": 0.11854103343465046, + "grad_norm": 1.3469511270523071, + "learning_rate": 3.914141414141415e-06, + "loss": 0.4390433728694916, + "mean_token_accuracy": 0.8436295986175537, + "num_tokens": 1475349.0, + "step": 156 + }, + { + "epoch": 0.11930091185410334, + "grad_norm": 3.247023105621338, + "learning_rate": 3.93939393939394e-06, + "loss": 0.6490433216094971, + "mean_token_accuracy": 0.8037861585617065, + "num_tokens": 1479952.0, + "step": 157 + }, + { + "epoch": 0.12006079027355623, + "grad_norm": 2.6610445976257324, + "learning_rate": 3.964646464646465e-06, + "loss": 0.6221826076507568, + "mean_token_accuracy": 0.7848749160766602, + "num_tokens": 1487306.0, + "step": 158 + }, + { + "epoch": 0.12082066869300911, + "grad_norm": 2.3060810565948486, + "learning_rate": 3.98989898989899e-06, + "loss": 0.5052388310432434, + "mean_token_accuracy": 0.8281195759773254, + "num_tokens": 1495367.0, + "step": 159 + }, + { + "epoch": 0.12158054711246201, + "grad_norm": 2.504448652267456, + "learning_rate": 4.015151515151515e-06, + "loss": 0.5005477666854858, + "mean_token_accuracy": 0.8408058881759644, + "num_tokens": 1502069.0, + "step": 160 + }, + { + "epoch": 0.12234042553191489, + "grad_norm": 3.993938446044922, + "learning_rate": 4.04040404040404e-06, + "loss": 0.5569638013839722, + "mean_token_accuracy": 0.8095242977142334, + "num_tokens": 1510224.0, + "step": 161 + }, + { + "epoch": 0.12310030395136778, + "grad_norm": 2.2287683486938477, + "learning_rate": 4.065656565656566e-06, + "loss": 0.524042546749115, + "mean_token_accuracy": 0.8102203607559204, + "num_tokens": 1518364.0, + "step": 162 + }, + { + "epoch": 0.12386018237082067, + "grad_norm": 1.9531738758087158, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.45794573426246643, + "mean_token_accuracy": 0.8560376167297363, + "num_tokens": 1528097.0, + "step": 163 + }, + { + "epoch": 0.12462006079027356, + "grad_norm": 1.5841206312179565, + "learning_rate": 4.116161616161617e-06, + "loss": 0.5420972108840942, + "mean_token_accuracy": 0.8092726469039917, + "num_tokens": 1544119.0, + "step": 164 + }, + { + "epoch": 0.12537993920972645, + "grad_norm": 1.7536218166351318, + "learning_rate": 4.141414141414142e-06, + "loss": 0.554668664932251, + "mean_token_accuracy": 0.8193825483322144, + "num_tokens": 1559140.0, + "step": 165 + }, + { + "epoch": 0.12613981762917933, + "grad_norm": 3.545454740524292, + "learning_rate": 4.166666666666667e-06, + "loss": 0.580947995185852, + "mean_token_accuracy": 0.8286383152008057, + "num_tokens": 1563625.0, + "step": 166 + }, + { + "epoch": 0.12689969604863222, + "grad_norm": 1.6608915328979492, + "learning_rate": 4.191919191919192e-06, + "loss": 0.5523324012756348, + "mean_token_accuracy": 0.8155215978622437, + "num_tokens": 1574945.0, + "step": 167 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 1.4832708835601807, + "learning_rate": 4.217171717171717e-06, + "loss": 0.5133191347122192, + "mean_token_accuracy": 0.8367571830749512, + "num_tokens": 1595865.0, + "step": 168 + }, + { + "epoch": 0.128419452887538, + "grad_norm": 1.7807520627975464, + "learning_rate": 4.242424242424243e-06, + "loss": 0.5131410360336304, + "mean_token_accuracy": 0.8129367232322693, + "num_tokens": 1608723.0, + "step": 169 + }, + { + "epoch": 0.12917933130699089, + "grad_norm": 2.707569122314453, + "learning_rate": 4.267676767676767e-06, + "loss": 0.6129013299942017, + "mean_token_accuracy": 0.7926048040390015, + "num_tokens": 1616136.0, + "step": 170 + }, + { + "epoch": 0.12993920972644377, + "grad_norm": 2.5831644535064697, + "learning_rate": 4.292929292929293e-06, + "loss": 0.6264227628707886, + "mean_token_accuracy": 0.8074911236763, + "num_tokens": 1624228.0, + "step": 171 + }, + { + "epoch": 0.13069908814589665, + "grad_norm": 3.1124250888824463, + "learning_rate": 4.3181818181818185e-06, + "loss": 0.41763827204704285, + "mean_token_accuracy": 0.8565453290939331, + "num_tokens": 1628098.0, + "step": 172 + }, + { + "epoch": 0.13145896656534956, + "grad_norm": 2.3214211463928223, + "learning_rate": 4.343434343434344e-06, + "loss": 0.421974778175354, + "mean_token_accuracy": 0.8391546010971069, + "num_tokens": 1634950.0, + "step": 173 + }, + { + "epoch": 0.13221884498480244, + "grad_norm": 2.1010327339172363, + "learning_rate": 4.368686868686869e-06, + "loss": 0.5307331681251526, + "mean_token_accuracy": 0.8139588236808777, + "num_tokens": 1644132.0, + "step": 174 + }, + { + "epoch": 0.13297872340425532, + "grad_norm": 2.533612012863159, + "learning_rate": 4.393939393939394e-06, + "loss": 0.5626664161682129, + "mean_token_accuracy": 0.8029808402061462, + "num_tokens": 1651637.0, + "step": 175 + }, + { + "epoch": 0.1337386018237082, + "grad_norm": 1.669508457183838, + "learning_rate": 4.41919191919192e-06, + "loss": 0.5351508259773254, + "mean_token_accuracy": 0.8281655311584473, + "num_tokens": 1666776.0, + "step": 176 + }, + { + "epoch": 0.1344984802431611, + "grad_norm": 1.7579659223556519, + "learning_rate": 4.444444444444444e-06, + "loss": 0.5235031247138977, + "mean_token_accuracy": 0.8143284320831299, + "num_tokens": 1679241.0, + "step": 177 + }, + { + "epoch": 0.135258358662614, + "grad_norm": 3.123563528060913, + "learning_rate": 4.46969696969697e-06, + "loss": 0.43051332235336304, + "mean_token_accuracy": 0.8518186211585999, + "num_tokens": 1683317.0, + "step": 178 + }, + { + "epoch": 0.13601823708206687, + "grad_norm": 2.2411575317382812, + "learning_rate": 4.494949494949495e-06, + "loss": 0.5471380949020386, + "mean_token_accuracy": 0.8267596960067749, + "num_tokens": 1691366.0, + "step": 179 + }, + { + "epoch": 0.13677811550151975, + "grad_norm": 2.621973991394043, + "learning_rate": 4.520202020202021e-06, + "loss": 0.5685839653015137, + "mean_token_accuracy": 0.8260642290115356, + "num_tokens": 1698148.0, + "step": 180 + }, + { + "epoch": 0.13753799392097266, + "grad_norm": 2.1553852558135986, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.5703883171081543, + "mean_token_accuracy": 0.8219090700149536, + "num_tokens": 1707225.0, + "step": 181 + }, + { + "epoch": 0.13829787234042554, + "grad_norm": 5.1767897605896, + "learning_rate": 4.5707070707070715e-06, + "loss": 0.32704639434814453, + "mean_token_accuracy": 0.8754568099975586, + "num_tokens": 1712748.0, + "step": 182 + }, + { + "epoch": 0.13905775075987842, + "grad_norm": 2.609168291091919, + "learning_rate": 4.595959595959597e-06, + "loss": 0.5939987301826477, + "mean_token_accuracy": 0.8034975528717041, + "num_tokens": 1719932.0, + "step": 183 + }, + { + "epoch": 0.1398176291793313, + "grad_norm": 2.2059099674224854, + "learning_rate": 4.621212121212122e-06, + "loss": 0.5310720205307007, + "mean_token_accuracy": 0.8177368640899658, + "num_tokens": 1727640.0, + "step": 184 + }, + { + "epoch": 0.1405775075987842, + "grad_norm": 2.6367759704589844, + "learning_rate": 4.646464646464647e-06, + "loss": 0.522086501121521, + "mean_token_accuracy": 0.826233983039856, + "num_tokens": 1733609.0, + "step": 185 + }, + { + "epoch": 0.1413373860182371, + "grad_norm": 3.326732873916626, + "learning_rate": 4.671717171717172e-06, + "loss": 0.4127829074859619, + "mean_token_accuracy": 0.8551101684570312, + "num_tokens": 1737256.0, + "step": 186 + }, + { + "epoch": 0.14209726443768997, + "grad_norm": 1.828412413597107, + "learning_rate": 4.696969696969698e-06, + "loss": 0.5444269180297852, + "mean_token_accuracy": 0.8350818157196045, + "num_tokens": 1750196.0, + "step": 187 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 3.209203004837036, + "learning_rate": 4.722222222222222e-06, + "loss": 0.5087994933128357, + "mean_token_accuracy": 0.8349015712738037, + "num_tokens": 1754836.0, + "step": 188 + }, + { + "epoch": 0.14361702127659576, + "grad_norm": 1.7339166402816772, + "learning_rate": 4.747474747474748e-06, + "loss": 0.5151352286338806, + "mean_token_accuracy": 0.8321266174316406, + "num_tokens": 1766015.0, + "step": 189 + }, + { + "epoch": 0.14437689969604864, + "grad_norm": 2.699068069458008, + "learning_rate": 4.772727272727273e-06, + "loss": 0.4406203031539917, + "mean_token_accuracy": 0.8425000905990601, + "num_tokens": 1771684.0, + "step": 190 + }, + { + "epoch": 0.14513677811550152, + "grad_norm": 2.8117282390594482, + "learning_rate": 4.7979797979797985e-06, + "loss": 0.40428489446640015, + "mean_token_accuracy": 0.8654326796531677, + "num_tokens": 1776301.0, + "step": 191 + }, + { + "epoch": 0.1458966565349544, + "grad_norm": 2.9204647541046143, + "learning_rate": 4.823232323232324e-06, + "loss": 0.4191770553588867, + "mean_token_accuracy": 0.8574687242507935, + "num_tokens": 1781678.0, + "step": 192 + }, + { + "epoch": 0.1466565349544073, + "grad_norm": 2.1648988723754883, + "learning_rate": 4.848484848484849e-06, + "loss": 0.5839012861251831, + "mean_token_accuracy": 0.8053664565086365, + "num_tokens": 1792516.0, + "step": 193 + }, + { + "epoch": 0.1474164133738602, + "grad_norm": 2.3221631050109863, + "learning_rate": 4.873737373737374e-06, + "loss": 0.5037894248962402, + "mean_token_accuracy": 0.8427227139472961, + "num_tokens": 1800192.0, + "step": 194 + }, + { + "epoch": 0.14817629179331307, + "grad_norm": 2.4536430835723877, + "learning_rate": 4.898989898989899e-06, + "loss": 0.42326074838638306, + "mean_token_accuracy": 0.8510633111000061, + "num_tokens": 1806159.0, + "step": 195 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 2.4875805377960205, + "learning_rate": 4.924242424242425e-06, + "loss": 0.539531409740448, + "mean_token_accuracy": 0.8060250282287598, + "num_tokens": 1813392.0, + "step": 196 + }, + { + "epoch": 0.14969604863221886, + "grad_norm": 2.1664798259735107, + "learning_rate": 4.94949494949495e-06, + "loss": 0.42502015829086304, + "mean_token_accuracy": 0.8503251075744629, + "num_tokens": 1821424.0, + "step": 197 + }, + { + "epoch": 0.15045592705167174, + "grad_norm": 2.568808078765869, + "learning_rate": 4.974747474747475e-06, + "loss": 0.5025098323822021, + "mean_token_accuracy": 0.8182311058044434, + "num_tokens": 1827225.0, + "step": 198 + }, + { + "epoch": 0.15121580547112462, + "grad_norm": 1.9116802215576172, + "learning_rate": 5e-06, + "loss": 0.4907258450984955, + "mean_token_accuracy": 0.8310189843177795, + "num_tokens": 1836297.0, + "step": 199 + }, + { + "epoch": 0.1519756838905775, + "grad_norm": 3.150765895843506, + "learning_rate": 4.999999122701883e-06, + "loss": 0.390616774559021, + "mean_token_accuracy": 0.8626647591590881, + "num_tokens": 1839984.0, + "step": 200 + }, + { + "epoch": 0.15273556231003038, + "grad_norm": 3.2229044437408447, + "learning_rate": 4.999996490808146e-06, + "loss": 0.48009657859802246, + "mean_token_accuracy": 0.825214147567749, + "num_tokens": 1844610.0, + "step": 201 + }, + { + "epoch": 0.1534954407294833, + "grad_norm": 1.4473289251327515, + "learning_rate": 4.9999921043206356e-06, + "loss": 0.40135183930397034, + "mean_token_accuracy": 0.8537827730178833, + "num_tokens": 1859573.0, + "step": 202 + }, + { + "epoch": 0.15425531914893617, + "grad_norm": 4.072319507598877, + "learning_rate": 4.999985963242432e-06, + "loss": 0.6158689260482788, + "mean_token_accuracy": 0.8075432777404785, + "num_tokens": 1863147.0, + "step": 203 + }, + { + "epoch": 0.15501519756838905, + "grad_norm": 3.15741229057312, + "learning_rate": 4.999978067577844e-06, + "loss": 0.4603108763694763, + "mean_token_accuracy": 0.8418779373168945, + "num_tokens": 1867201.0, + "step": 204 + }, + { + "epoch": 0.15577507598784193, + "grad_norm": 2.1925418376922607, + "learning_rate": 4.999968417332415e-06, + "loss": 0.5552488565444946, + "mean_token_accuracy": 0.8216016292572021, + "num_tokens": 1874837.0, + "step": 205 + }, + { + "epoch": 0.15653495440729484, + "grad_norm": 2.2518117427825928, + "learning_rate": 4.999957012512916e-06, + "loss": 0.4912569522857666, + "mean_token_accuracy": 0.8284667730331421, + "num_tokens": 1881842.0, + "step": 206 + }, + { + "epoch": 0.15729483282674772, + "grad_norm": 1.8223762512207031, + "learning_rate": 4.999943853127351e-06, + "loss": 0.47709137201309204, + "mean_token_accuracy": 0.8311659097671509, + "num_tokens": 1890805.0, + "step": 207 + }, + { + "epoch": 0.1580547112462006, + "grad_norm": 2.066499948501587, + "learning_rate": 4.999928939184958e-06, + "loss": 0.44794657826423645, + "mean_token_accuracy": 0.8513424396514893, + "num_tokens": 1898264.0, + "step": 208 + }, + { + "epoch": 0.15881458966565348, + "grad_norm": 3.53865909576416, + "learning_rate": 4.999912270696202e-06, + "loss": 0.5978270769119263, + "mean_token_accuracy": 0.8080137968063354, + "num_tokens": 1902435.0, + "step": 209 + }, + { + "epoch": 0.1595744680851064, + "grad_norm": 2.0760679244995117, + "learning_rate": 4.999893847672783e-06, + "loss": 0.5930601358413696, + "mean_token_accuracy": 0.8028650283813477, + "num_tokens": 1912252.0, + "step": 210 + }, + { + "epoch": 0.16033434650455927, + "grad_norm": 2.21551513671875, + "learning_rate": 4.99987367012763e-06, + "loss": 0.6336753964424133, + "mean_token_accuracy": 0.7902286648750305, + "num_tokens": 1922095.0, + "step": 211 + }, + { + "epoch": 0.16109422492401215, + "grad_norm": 1.7654480934143066, + "learning_rate": 4.999851738074904e-06, + "loss": 0.6373403668403625, + "mean_token_accuracy": 0.7802424430847168, + "num_tokens": 1938962.0, + "step": 212 + }, + { + "epoch": 0.16185410334346503, + "grad_norm": 2.852834701538086, + "learning_rate": 4.9998280515300006e-06, + "loss": 0.6418683528900146, + "mean_token_accuracy": 0.7895716428756714, + "num_tokens": 1944668.0, + "step": 213 + }, + { + "epoch": 0.16261398176291794, + "grad_norm": 3.4737212657928467, + "learning_rate": 4.999802610509541e-06, + "loss": 0.6323273181915283, + "mean_token_accuracy": 0.7982614636421204, + "num_tokens": 1949142.0, + "step": 214 + }, + { + "epoch": 0.16337386018237082, + "grad_norm": 3.0802664756774902, + "learning_rate": 4.999775415031381e-06, + "loss": 0.5929068326950073, + "mean_token_accuracy": 0.8112219572067261, + "num_tokens": 1954141.0, + "step": 215 + }, + { + "epoch": 0.1641337386018237, + "grad_norm": 2.9808855056762695, + "learning_rate": 4.999746465114609e-06, + "loss": 0.5556406378746033, + "mean_token_accuracy": 0.8117628693580627, + "num_tokens": 1959406.0, + "step": 216 + }, + { + "epoch": 0.16489361702127658, + "grad_norm": 1.7346166372299194, + "learning_rate": 4.999715760779541e-06, + "loss": 0.5122925043106079, + "mean_token_accuracy": 0.8040724992752075, + "num_tokens": 1971921.0, + "step": 217 + }, + { + "epoch": 0.1656534954407295, + "grad_norm": 1.4183907508850098, + "learning_rate": 4.999683302047729e-06, + "loss": 0.46471893787384033, + "mean_token_accuracy": 0.8381330966949463, + "num_tokens": 1988863.0, + "step": 218 + }, + { + "epoch": 0.16641337386018237, + "grad_norm": 1.6797802448272705, + "learning_rate": 4.999649088941951e-06, + "loss": 0.38348832726478577, + "mean_token_accuracy": 0.8344278931617737, + "num_tokens": 2000003.0, + "step": 219 + }, + { + "epoch": 0.16717325227963525, + "grad_norm": 3.036963939666748, + "learning_rate": 4.999613121486222e-06, + "loss": 0.6062780618667603, + "mean_token_accuracy": 0.8217900991439819, + "num_tokens": 2004813.0, + "step": 220 + }, + { + "epoch": 0.16793313069908813, + "grad_norm": 2.0343217849731445, + "learning_rate": 4.999575399705782e-06, + "loss": 0.5052450895309448, + "mean_token_accuracy": 0.8368623852729797, + "num_tokens": 2013565.0, + "step": 221 + }, + { + "epoch": 0.16869300911854104, + "grad_norm": 2.1162009239196777, + "learning_rate": 4.9995359236271094e-06, + "loss": 0.5169756412506104, + "mean_token_accuracy": 0.8339958190917969, + "num_tokens": 2025763.0, + "step": 222 + }, + { + "epoch": 0.16945288753799392, + "grad_norm": 2.055333375930786, + "learning_rate": 4.9994946932779076e-06, + "loss": 0.6327048540115356, + "mean_token_accuracy": 0.8078711032867432, + "num_tokens": 2037005.0, + "step": 223 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 3.334620475769043, + "learning_rate": 4.999451708687114e-06, + "loss": 0.5688358545303345, + "mean_token_accuracy": 0.8015589714050293, + "num_tokens": 2041473.0, + "step": 224 + }, + { + "epoch": 0.17097264437689969, + "grad_norm": 2.3734676837921143, + "learning_rate": 4.999406969884897e-06, + "loss": 0.5673821568489075, + "mean_token_accuracy": 0.8054057359695435, + "num_tokens": 2049397.0, + "step": 225 + }, + { + "epoch": 0.1717325227963526, + "grad_norm": 1.807358980178833, + "learning_rate": 4.999360476902656e-06, + "loss": 0.4376158118247986, + "mean_token_accuracy": 0.8456039428710938, + "num_tokens": 2058721.0, + "step": 226 + }, + { + "epoch": 0.17249240121580547, + "grad_norm": 3.231638193130493, + "learning_rate": 4.999312229773022e-06, + "loss": 0.5592809915542603, + "mean_token_accuracy": 0.8170154094696045, + "num_tokens": 2063455.0, + "step": 227 + }, + { + "epoch": 0.17325227963525835, + "grad_norm": 2.2717151641845703, + "learning_rate": 4.999262228529855e-06, + "loss": 0.6144396066665649, + "mean_token_accuracy": 0.7948470115661621, + "num_tokens": 2071686.0, + "step": 228 + }, + { + "epoch": 0.17401215805471124, + "grad_norm": 1.4171342849731445, + "learning_rate": 4.99921047320825e-06, + "loss": 0.43680912256240845, + "mean_token_accuracy": 0.84850013256073, + "num_tokens": 2086999.0, + "step": 229 + }, + { + "epoch": 0.17477203647416414, + "grad_norm": 3.162736654281616, + "learning_rate": 4.99915696384453e-06, + "loss": 0.6025407910346985, + "mean_token_accuracy": 0.8042335510253906, + "num_tokens": 2092001.0, + "step": 230 + }, + { + "epoch": 0.17553191489361702, + "grad_norm": 1.8672804832458496, + "learning_rate": 4.99910170047625e-06, + "loss": 0.5843087434768677, + "mean_token_accuracy": 0.8016980886459351, + "num_tokens": 2103372.0, + "step": 231 + }, + { + "epoch": 0.1762917933130699, + "grad_norm": 2.967587471008301, + "learning_rate": 4.999044683142196e-06, + "loss": 0.5123642086982727, + "mean_token_accuracy": 0.8216149806976318, + "num_tokens": 2108008.0, + "step": 232 + }, + { + "epoch": 0.1770516717325228, + "grad_norm": 1.9651981592178345, + "learning_rate": 4.998985911882383e-06, + "loss": 0.5868178606033325, + "mean_token_accuracy": 0.7904198169708252, + "num_tokens": 2119009.0, + "step": 233 + }, + { + "epoch": 0.1778115501519757, + "grad_norm": 2.7785449028015137, + "learning_rate": 4.998925386738063e-06, + "loss": 0.5075510144233704, + "mean_token_accuracy": 0.8280210494995117, + "num_tokens": 2124915.0, + "step": 234 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 2.957470417022705, + "learning_rate": 4.998863107751711e-06, + "loss": 0.5351958274841309, + "mean_token_accuracy": 0.846825122833252, + "num_tokens": 2129905.0, + "step": 235 + }, + { + "epoch": 0.17933130699088146, + "grad_norm": 3.207671880722046, + "learning_rate": 4.99879907496704e-06, + "loss": 0.6209091544151306, + "mean_token_accuracy": 0.789960503578186, + "num_tokens": 2135027.0, + "step": 236 + }, + { + "epoch": 0.18009118541033434, + "grad_norm": 2.018953800201416, + "learning_rate": 4.998733288428987e-06, + "loss": 0.601510763168335, + "mean_token_accuracy": 0.8136930465698242, + "num_tokens": 2147016.0, + "step": 237 + }, + { + "epoch": 0.18085106382978725, + "grad_norm": 2.437281847000122, + "learning_rate": 4.998665748183727e-06, + "loss": 0.5813639163970947, + "mean_token_accuracy": 0.8116716146469116, + "num_tokens": 2155386.0, + "step": 238 + }, + { + "epoch": 0.18161094224924013, + "grad_norm": 1.5708180665969849, + "learning_rate": 4.998596454278661e-06, + "loss": 0.5252395272254944, + "mean_token_accuracy": 0.8193864822387695, + "num_tokens": 2170295.0, + "step": 239 + }, + { + "epoch": 0.182370820668693, + "grad_norm": 1.9921495914459229, + "learning_rate": 4.998525406762422e-06, + "loss": 0.5335029363632202, + "mean_token_accuracy": 0.8120872974395752, + "num_tokens": 2180012.0, + "step": 240 + }, + { + "epoch": 0.1831306990881459, + "grad_norm": 2.6562681198120117, + "learning_rate": 4.998452605684874e-06, + "loss": 0.48021435737609863, + "mean_token_accuracy": 0.8388714790344238, + "num_tokens": 2185607.0, + "step": 241 + }, + { + "epoch": 0.1838905775075988, + "grad_norm": 2.2535853385925293, + "learning_rate": 4.998378051097111e-06, + "loss": 0.5747300386428833, + "mean_token_accuracy": 0.8004639148712158, + "num_tokens": 2194105.0, + "step": 242 + }, + { + "epoch": 0.18465045592705168, + "grad_norm": 1.6151788234710693, + "learning_rate": 4.998301743051459e-06, + "loss": 0.6190565824508667, + "mean_token_accuracy": 0.7816627621650696, + "num_tokens": 2210629.0, + "step": 243 + }, + { + "epoch": 0.18541033434650456, + "grad_norm": 2.1088173389434814, + "learning_rate": 4.9982236816014735e-06, + "loss": 0.4715560972690582, + "mean_token_accuracy": 0.8485721349716187, + "num_tokens": 2218958.0, + "step": 244 + }, + { + "epoch": 0.18617021276595744, + "grad_norm": 2.6168735027313232, + "learning_rate": 4.998143866801941e-06, + "loss": 0.6077103018760681, + "mean_token_accuracy": 0.8057924509048462, + "num_tokens": 2226368.0, + "step": 245 + }, + { + "epoch": 0.18693009118541035, + "grad_norm": 2.5988616943359375, + "learning_rate": 4.99806229870888e-06, + "loss": 0.5021637678146362, + "mean_token_accuracy": 0.8361666202545166, + "num_tokens": 2232485.0, + "step": 246 + }, + { + "epoch": 0.18768996960486323, + "grad_norm": 2.015887498855591, + "learning_rate": 4.9979789773795365e-06, + "loss": 0.4309737980365753, + "mean_token_accuracy": 0.8508044481277466, + "num_tokens": 2240819.0, + "step": 247 + }, + { + "epoch": 0.1884498480243161, + "grad_norm": 2.3115265369415283, + "learning_rate": 4.997893902872389e-06, + "loss": 0.5776500701904297, + "mean_token_accuracy": 0.8079549074172974, + "num_tokens": 2249460.0, + "step": 248 + }, + { + "epoch": 0.189209726443769, + "grad_norm": 1.7387021780014038, + "learning_rate": 4.997807075247147e-06, + "loss": 0.430944561958313, + "mean_token_accuracy": 0.8483544588088989, + "num_tokens": 2259124.0, + "step": 249 + }, + { + "epoch": 0.1899696048632219, + "grad_norm": 1.6378381252288818, + "learning_rate": 4.997718494564747e-06, + "loss": 0.4123363792896271, + "mean_token_accuracy": 0.8557409644126892, + "num_tokens": 2269899.0, + "step": 250 + }, + { + "epoch": 0.19072948328267478, + "grad_norm": 1.336282730102539, + "learning_rate": 4.997628160887361e-06, + "loss": 0.502329409122467, + "mean_token_accuracy": 0.8186938166618347, + "num_tokens": 2292821.0, + "step": 251 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 3.3335583209991455, + "learning_rate": 4.997536074278388e-06, + "loss": 0.584446907043457, + "mean_token_accuracy": 0.8062717318534851, + "num_tokens": 2297175.0, + "step": 252 + }, + { + "epoch": 0.19224924012158054, + "grad_norm": 2.246727228164673, + "learning_rate": 4.9974422348024565e-06, + "loss": 0.5683060884475708, + "mean_token_accuracy": 0.8193703293800354, + "num_tokens": 2305456.0, + "step": 253 + }, + { + "epoch": 0.19300911854103345, + "grad_norm": 2.3520865440368652, + "learning_rate": 4.997346642525429e-06, + "loss": 0.4724946618080139, + "mean_token_accuracy": 0.8426719307899475, + "num_tokens": 2312241.0, + "step": 254 + }, + { + "epoch": 0.19376899696048633, + "grad_norm": 2.7115702629089355, + "learning_rate": 4.9972492975143936e-06, + "loss": 0.5019032955169678, + "mean_token_accuracy": 0.8253573179244995, + "num_tokens": 2318094.0, + "step": 255 + }, + { + "epoch": 0.1945288753799392, + "grad_norm": 1.705528974533081, + "learning_rate": 4.997150199837671e-06, + "loss": 0.45588475465774536, + "mean_token_accuracy": 0.836666464805603, + "num_tokens": 2329025.0, + "step": 256 + }, + { + "epoch": 0.1952887537993921, + "grad_norm": 2.161400318145752, + "learning_rate": 4.997049349564814e-06, + "loss": 0.5170183777809143, + "mean_token_accuracy": 0.8287534117698669, + "num_tokens": 2337448.0, + "step": 257 + }, + { + "epoch": 0.196048632218845, + "grad_norm": 2.629669189453125, + "learning_rate": 4.996946746766602e-06, + "loss": 0.44650501012802124, + "mean_token_accuracy": 0.850114107131958, + "num_tokens": 2343207.0, + "step": 258 + }, + { + "epoch": 0.19680851063829788, + "grad_norm": 1.6735503673553467, + "learning_rate": 4.996842391515045e-06, + "loss": 0.5247820019721985, + "mean_token_accuracy": 0.8285071849822998, + "num_tokens": 2356801.0, + "step": 259 + }, + { + "epoch": 0.19756838905775076, + "grad_norm": 1.2753115892410278, + "learning_rate": 4.996736283883382e-06, + "loss": 0.41870927810668945, + "mean_token_accuracy": 0.8448047637939453, + "num_tokens": 2377306.0, + "step": 260 + }, + { + "epoch": 0.19832826747720364, + "grad_norm": 2.6947314739227295, + "learning_rate": 4.9966284239460875e-06, + "loss": 0.5059205889701843, + "mean_token_accuracy": 0.8430814743041992, + "num_tokens": 2383352.0, + "step": 261 + }, + { + "epoch": 0.19908814589665655, + "grad_norm": 2.0509963035583496, + "learning_rate": 4.996518811778858e-06, + "loss": 0.4565388560295105, + "mean_token_accuracy": 0.8453130722045898, + "num_tokens": 2391149.0, + "step": 262 + }, + { + "epoch": 0.19984802431610943, + "grad_norm": 2.1856348514556885, + "learning_rate": 4.996407447458626e-06, + "loss": 0.531380832195282, + "mean_token_accuracy": 0.8387004137039185, + "num_tokens": 2399875.0, + "step": 263 + }, + { + "epoch": 0.2006079027355623, + "grad_norm": 2.7348573207855225, + "learning_rate": 4.99629433106355e-06, + "loss": 0.5242817401885986, + "mean_token_accuracy": 0.8177423477172852, + "num_tokens": 2406586.0, + "step": 264 + }, + { + "epoch": 0.2013677811550152, + "grad_norm": 1.76587975025177, + "learning_rate": 4.99617946267302e-06, + "loss": 0.49298471212387085, + "mean_token_accuracy": 0.8271149396896362, + "num_tokens": 2418683.0, + "step": 265 + }, + { + "epoch": 0.20212765957446807, + "grad_norm": 2.8129730224609375, + "learning_rate": 4.996062842367655e-06, + "loss": 0.46420302987098694, + "mean_token_accuracy": 0.8453244566917419, + "num_tokens": 2422929.0, + "step": 266 + }, + { + "epoch": 0.20288753799392098, + "grad_norm": 2.575744152069092, + "learning_rate": 4.9959444702293025e-06, + "loss": 0.43208545446395874, + "mean_token_accuracy": 0.8494843244552612, + "num_tokens": 2429567.0, + "step": 267 + }, + { + "epoch": 0.20364741641337386, + "grad_norm": 2.7586750984191895, + "learning_rate": 4.995824346341041e-06, + "loss": 0.4390473961830139, + "mean_token_accuracy": 0.8348895311355591, + "num_tokens": 2434700.0, + "step": 268 + }, + { + "epoch": 0.20440729483282674, + "grad_norm": 1.972145438194275, + "learning_rate": 4.99570247078718e-06, + "loss": 0.6219544410705566, + "mean_token_accuracy": 0.7939999103546143, + "num_tokens": 2447007.0, + "step": 269 + }, + { + "epoch": 0.20516717325227962, + "grad_norm": 2.2963485717773438, + "learning_rate": 4.995578843653255e-06, + "loss": 0.5008970499038696, + "mean_token_accuracy": 0.8255308866500854, + "num_tokens": 2453936.0, + "step": 270 + }, + { + "epoch": 0.20592705167173253, + "grad_norm": 1.8897721767425537, + "learning_rate": 4.995453465026033e-06, + "loss": 0.5436089038848877, + "mean_token_accuracy": 0.819086492061615, + "num_tokens": 2464494.0, + "step": 271 + }, + { + "epoch": 0.2066869300911854, + "grad_norm": 2.319728374481201, + "learning_rate": 4.995326334993508e-06, + "loss": 0.5136368870735168, + "mean_token_accuracy": 0.820817232131958, + "num_tokens": 2470938.0, + "step": 272 + }, + { + "epoch": 0.2074468085106383, + "grad_norm": 2.230414390563965, + "learning_rate": 4.9951974536449055e-06, + "loss": 0.5272846817970276, + "mean_token_accuracy": 0.8203279972076416, + "num_tokens": 2478629.0, + "step": 273 + }, + { + "epoch": 0.20820668693009117, + "grad_norm": 3.401937484741211, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.4389592111110687, + "mean_token_accuracy": 0.8647899031639099, + "num_tokens": 2482193.0, + "step": 274 + }, + { + "epoch": 0.20896656534954408, + "grad_norm": 2.1278507709503174, + "learning_rate": 4.994934437362513e-06, + "loss": 0.598863422870636, + "mean_token_accuracy": 0.7945119738578796, + "num_tokens": 2492465.0, + "step": 275 + }, + { + "epoch": 0.20972644376899696, + "grad_norm": 1.9259960651397705, + "learning_rate": 4.994800302613318e-06, + "loss": 0.49520939588546753, + "mean_token_accuracy": 0.8371536135673523, + "num_tokens": 2500825.0, + "step": 276 + }, + { + "epoch": 0.21048632218844984, + "grad_norm": 2.346418857574463, + "learning_rate": 4.994664416917236e-06, + "loss": 0.5412614345550537, + "mean_token_accuracy": 0.810661792755127, + "num_tokens": 2509513.0, + "step": 277 + }, + { + "epoch": 0.21124620060790272, + "grad_norm": 1.3092039823532104, + "learning_rate": 4.994526780369636e-06, + "loss": 0.46305379271507263, + "mean_token_accuracy": 0.8358527421951294, + "num_tokens": 2531405.0, + "step": 278 + }, + { + "epoch": 0.21200607902735563, + "grad_norm": 2.924611806869507, + "learning_rate": 4.9943873930671175e-06, + "loss": 0.6134544610977173, + "mean_token_accuracy": 0.7947378754615784, + "num_tokens": 2536744.0, + "step": 279 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 2.8290598392486572, + "learning_rate": 4.994246255107506e-06, + "loss": 0.465520441532135, + "mean_token_accuracy": 0.8440108299255371, + "num_tokens": 2541184.0, + "step": 280 + }, + { + "epoch": 0.2135258358662614, + "grad_norm": 3.8081259727478027, + "learning_rate": 4.994103366589859e-06, + "loss": 0.43394139409065247, + "mean_token_accuracy": 0.8579148054122925, + "num_tokens": 2545395.0, + "step": 281 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 1.7994529008865356, + "learning_rate": 4.993958727614462e-06, + "loss": 0.5076484680175781, + "mean_token_accuracy": 0.8270803093910217, + "num_tokens": 2556541.0, + "step": 282 + }, + { + "epoch": 0.21504559270516718, + "grad_norm": 2.5582659244537354, + "learning_rate": 4.993812338282826e-06, + "loss": 0.4453684389591217, + "mean_token_accuracy": 0.8488293886184692, + "num_tokens": 2562949.0, + "step": 283 + }, + { + "epoch": 0.21580547112462006, + "grad_norm": 1.6448938846588135, + "learning_rate": 4.993664198697694e-06, + "loss": 0.461971640586853, + "mean_token_accuracy": 0.824763298034668, + "num_tokens": 2576407.0, + "step": 284 + }, + { + "epoch": 0.21656534954407294, + "grad_norm": 2.1264469623565674, + "learning_rate": 4.993514308963037e-06, + "loss": 0.6241602897644043, + "mean_token_accuracy": 0.7916014790534973, + "num_tokens": 2585695.0, + "step": 285 + }, + { + "epoch": 0.21732522796352582, + "grad_norm": 3.629991292953491, + "learning_rate": 4.993362669184051e-06, + "loss": 0.610355019569397, + "mean_token_accuracy": 0.7847568988800049, + "num_tokens": 2589778.0, + "step": 286 + }, + { + "epoch": 0.21808510638297873, + "grad_norm": 1.9070756435394287, + "learning_rate": 4.993209279467164e-06, + "loss": 0.5513623952865601, + "mean_token_accuracy": 0.7911607027053833, + "num_tokens": 2600920.0, + "step": 287 + }, + { + "epoch": 0.2188449848024316, + "grad_norm": 1.761062741279602, + "learning_rate": 4.993054139920031e-06, + "loss": 0.4579957127571106, + "mean_token_accuracy": 0.8189530372619629, + "num_tokens": 2611856.0, + "step": 288 + }, + { + "epoch": 0.2196048632218845, + "grad_norm": 1.7264713048934937, + "learning_rate": 4.992897250651535e-06, + "loss": 0.5871305465698242, + "mean_token_accuracy": 0.7918527126312256, + "num_tokens": 2624730.0, + "step": 289 + }, + { + "epoch": 0.22036474164133737, + "grad_norm": 1.7455977201461792, + "learning_rate": 4.992738611771787e-06, + "loss": 0.5475119948387146, + "mean_token_accuracy": 0.8226917386054993, + "num_tokens": 2635705.0, + "step": 290 + }, + { + "epoch": 0.22112462006079028, + "grad_norm": 2.095095157623291, + "learning_rate": 4.992578223392124e-06, + "loss": 0.5952225923538208, + "mean_token_accuracy": 0.8078469038009644, + "num_tokens": 2643954.0, + "step": 291 + }, + { + "epoch": 0.22188449848024316, + "grad_norm": 2.994664192199707, + "learning_rate": 4.992416085625115e-06, + "loss": 0.5432442426681519, + "mean_token_accuracy": 0.8329008221626282, + "num_tokens": 2648800.0, + "step": 292 + }, + { + "epoch": 0.22264437689969604, + "grad_norm": 2.796790361404419, + "learning_rate": 4.992252198584554e-06, + "loss": 0.5168961882591248, + "mean_token_accuracy": 0.8393474817276001, + "num_tokens": 2653546.0, + "step": 293 + }, + { + "epoch": 0.22340425531914893, + "grad_norm": 1.8610522747039795, + "learning_rate": 4.992086562385462e-06, + "loss": 0.5728024244308472, + "mean_token_accuracy": 0.797406792640686, + "num_tokens": 2667483.0, + "step": 294 + }, + { + "epoch": 0.22416413373860183, + "grad_norm": 1.695472002029419, + "learning_rate": 4.9919191771440905e-06, + "loss": 0.5460028648376465, + "mean_token_accuracy": 0.8123016357421875, + "num_tokens": 2683574.0, + "step": 295 + }, + { + "epoch": 0.22492401215805471, + "grad_norm": 2.8627376556396484, + "learning_rate": 4.9917500429779165e-06, + "loss": 0.5566985011100769, + "mean_token_accuracy": 0.815531313419342, + "num_tokens": 2688985.0, + "step": 296 + }, + { + "epoch": 0.2256838905775076, + "grad_norm": 2.73323655128479, + "learning_rate": 4.991579160005644e-06, + "loss": 0.48197102546691895, + "mean_token_accuracy": 0.8471829295158386, + "num_tokens": 2694799.0, + "step": 297 + }, + { + "epoch": 0.22644376899696048, + "grad_norm": 1.8436161279678345, + "learning_rate": 4.991406528347206e-06, + "loss": 0.4528339207172394, + "mean_token_accuracy": 0.8603188395500183, + "num_tokens": 2707321.0, + "step": 298 + }, + { + "epoch": 0.22720364741641338, + "grad_norm": 2.6231515407562256, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.5916541814804077, + "mean_token_accuracy": 0.8050242066383362, + "num_tokens": 2714233.0, + "step": 299 + }, + { + "epoch": 0.22796352583586627, + "grad_norm": 3.08776593208313, + "learning_rate": 4.991056019457697e-06, + "loss": 0.4860580563545227, + "mean_token_accuracy": 0.8464088439941406, + "num_tokens": 2718443.0, + "step": 300 + }, + { + "epoch": 0.22872340425531915, + "grad_norm": 2.2537803649902344, + "learning_rate": 4.990878142472628e-06, + "loss": 0.5158311128616333, + "mean_token_accuracy": 0.824694812297821, + "num_tokens": 2726158.0, + "step": 301 + }, + { + "epoch": 0.22948328267477203, + "grad_norm": 2.1122705936431885, + "learning_rate": 4.990698517293394e-06, + "loss": 0.495265394449234, + "mean_token_accuracy": 0.8343238830566406, + "num_tokens": 2735022.0, + "step": 302 + }, + { + "epoch": 0.23024316109422494, + "grad_norm": 3.5503528118133545, + "learning_rate": 4.9905171440460645e-06, + "loss": 0.46063232421875, + "mean_token_accuracy": 0.8420047760009766, + "num_tokens": 2738550.0, + "step": 303 + }, + { + "epoch": 0.23100303951367782, + "grad_norm": 3.9858486652374268, + "learning_rate": 4.990334022857932e-06, + "loss": 0.5832710266113281, + "mean_token_accuracy": 0.8144199848175049, + "num_tokens": 2741720.0, + "step": 304 + }, + { + "epoch": 0.2317629179331307, + "grad_norm": 2.407231330871582, + "learning_rate": 4.990149153857519e-06, + "loss": 0.4692630171775818, + "mean_token_accuracy": 0.8429223299026489, + "num_tokens": 2748693.0, + "step": 305 + }, + { + "epoch": 0.23252279635258358, + "grad_norm": 1.6996397972106934, + "learning_rate": 4.989962537174573e-06, + "loss": 0.49143946170806885, + "mean_token_accuracy": 0.8340128064155579, + "num_tokens": 2761254.0, + "step": 306 + }, + { + "epoch": 0.23328267477203649, + "grad_norm": 3.746432065963745, + "learning_rate": 4.989774172940071e-06, + "loss": 0.6282026767730713, + "mean_token_accuracy": 0.775698184967041, + "num_tokens": 2765115.0, + "step": 307 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 2.212872266769409, + "learning_rate": 4.989584061286211e-06, + "loss": 0.5193763971328735, + "mean_token_accuracy": 0.8168246746063232, + "num_tokens": 2772345.0, + "step": 308 + }, + { + "epoch": 0.23480243161094225, + "grad_norm": 1.752297282218933, + "learning_rate": 4.989392202346423e-06, + "loss": 0.4437984824180603, + "mean_token_accuracy": 0.8451256155967712, + "num_tokens": 2783072.0, + "step": 309 + }, + { + "epoch": 0.23556231003039513, + "grad_norm": 2.386019706726074, + "learning_rate": 4.989198596255361e-06, + "loss": 0.4090752899646759, + "mean_token_accuracy": 0.8480085134506226, + "num_tokens": 2788757.0, + "step": 310 + }, + { + "epoch": 0.23632218844984804, + "grad_norm": 3.9981489181518555, + "learning_rate": 4.989003243148904e-06, + "loss": 0.5149132013320923, + "mean_token_accuracy": 0.8179056644439697, + "num_tokens": 2792096.0, + "step": 311 + }, + { + "epoch": 0.23708206686930092, + "grad_norm": 1.8723100423812866, + "learning_rate": 4.988806143164159e-06, + "loss": 0.4531487822532654, + "mean_token_accuracy": 0.8400167226791382, + "num_tokens": 2802210.0, + "step": 312 + }, + { + "epoch": 0.2378419452887538, + "grad_norm": 2.3415136337280273, + "learning_rate": 4.988607296439459e-06, + "loss": 0.5974439978599548, + "mean_token_accuracy": 0.8035976886749268, + "num_tokens": 2810088.0, + "step": 313 + }, + { + "epoch": 0.23860182370820668, + "grad_norm": 1.5317577123641968, + "learning_rate": 4.98840670311436e-06, + "loss": 0.49247145652770996, + "mean_token_accuracy": 0.8292540311813354, + "num_tokens": 2824005.0, + "step": 314 + }, + { + "epoch": 0.2393617021276596, + "grad_norm": 2.170772075653076, + "learning_rate": 4.988204363329648e-06, + "loss": 0.6359974145889282, + "mean_token_accuracy": 0.7785564661026001, + "num_tokens": 2834680.0, + "step": 315 + }, + { + "epoch": 0.24012158054711247, + "grad_norm": 3.2655932903289795, + "learning_rate": 4.988000277227334e-06, + "loss": 0.5080196857452393, + "mean_token_accuracy": 0.8295877575874329, + "num_tokens": 2838735.0, + "step": 316 + }, + { + "epoch": 0.24088145896656535, + "grad_norm": 3.406589984893799, + "learning_rate": 4.987794444950651e-06, + "loss": 0.3939085006713867, + "mean_token_accuracy": 0.8700719475746155, + "num_tokens": 2842127.0, + "step": 317 + }, + { + "epoch": 0.24164133738601823, + "grad_norm": 1.8211106061935425, + "learning_rate": 4.987586866644061e-06, + "loss": 0.5270540118217468, + "mean_token_accuracy": 0.826683521270752, + "num_tokens": 2853656.0, + "step": 318 + }, + { + "epoch": 0.24240121580547114, + "grad_norm": 1.8429969549179077, + "learning_rate": 4.9873775424532515e-06, + "loss": 0.4705049991607666, + "mean_token_accuracy": 0.8355701565742493, + "num_tokens": 2863513.0, + "step": 319 + }, + { + "epoch": 0.24316109422492402, + "grad_norm": 2.2425320148468018, + "learning_rate": 4.9871664725251314e-06, + "loss": 0.485736608505249, + "mean_token_accuracy": 0.835182785987854, + "num_tokens": 2871556.0, + "step": 320 + }, + { + "epoch": 0.2439209726443769, + "grad_norm": 1.6202056407928467, + "learning_rate": 4.986953657007841e-06, + "loss": 0.4437887370586395, + "mean_token_accuracy": 0.8282591700553894, + "num_tokens": 2884335.0, + "step": 321 + }, + { + "epoch": 0.24468085106382978, + "grad_norm": 1.1027268171310425, + "learning_rate": 4.98673909605074e-06, + "loss": 0.3770800828933716, + "mean_token_accuracy": 0.8325437307357788, + "num_tokens": 2904286.0, + "step": 322 + }, + { + "epoch": 0.2454407294832827, + "grad_norm": 2.3239076137542725, + "learning_rate": 4.986522789804417e-06, + "loss": 0.5387254953384399, + "mean_token_accuracy": 0.806242823600769, + "num_tokens": 2910975.0, + "step": 323 + }, + { + "epoch": 0.24620060790273557, + "grad_norm": 2.243482828140259, + "learning_rate": 4.986304738420684e-06, + "loss": 0.4396553039550781, + "mean_token_accuracy": 0.8561904430389404, + "num_tokens": 2917087.0, + "step": 324 + }, + { + "epoch": 0.24696048632218845, + "grad_norm": 2.537264347076416, + "learning_rate": 4.986084942052577e-06, + "loss": 0.395110160112381, + "mean_token_accuracy": 0.8636915683746338, + "num_tokens": 2921887.0, + "step": 325 + }, + { + "epoch": 0.24772036474164133, + "grad_norm": 2.319399118423462, + "learning_rate": 4.9858634008543574e-06, + "loss": 0.581517219543457, + "mean_token_accuracy": 0.8157487511634827, + "num_tokens": 2928996.0, + "step": 326 + }, + { + "epoch": 0.24848024316109424, + "grad_norm": 1.9787474870681763, + "learning_rate": 4.985640114981513e-06, + "loss": 0.5084106922149658, + "mean_token_accuracy": 0.835221529006958, + "num_tokens": 2940302.0, + "step": 327 + }, + { + "epoch": 0.24924012158054712, + "grad_norm": 2.4783265590667725, + "learning_rate": 4.985415084590752e-06, + "loss": 0.6062222719192505, + "mean_token_accuracy": 0.7885516285896301, + "num_tokens": 2946386.0, + "step": 328 + }, + { + "epoch": 0.25, + "grad_norm": 2.4081411361694336, + "learning_rate": 4.985188309840012e-06, + "loss": 0.5079880356788635, + "mean_token_accuracy": 0.8313904404640198, + "num_tokens": 2952323.0, + "step": 329 + }, + { + "epoch": 0.2507598784194529, + "grad_norm": 2.64993953704834, + "learning_rate": 4.984959790888451e-06, + "loss": 0.5461447834968567, + "mean_token_accuracy": 0.8125468492507935, + "num_tokens": 2958119.0, + "step": 330 + }, + { + "epoch": 0.25151975683890576, + "grad_norm": 2.549734115600586, + "learning_rate": 4.984729527896451e-06, + "loss": 0.5998573303222656, + "mean_token_accuracy": 0.8076666593551636, + "num_tokens": 2964947.0, + "step": 331 + }, + { + "epoch": 0.25227963525835867, + "grad_norm": 3.2185161113739014, + "learning_rate": 4.984497521025622e-06, + "loss": 0.4232945442199707, + "mean_token_accuracy": 0.8543803095817566, + "num_tokens": 2968598.0, + "step": 332 + }, + { + "epoch": 0.2530395136778115, + "grad_norm": 2.588994264602661, + "learning_rate": 4.984263770438793e-06, + "loss": 0.460967481136322, + "mean_token_accuracy": 0.8416207432746887, + "num_tokens": 2974510.0, + "step": 333 + }, + { + "epoch": 0.25379939209726443, + "grad_norm": 2.1373162269592285, + "learning_rate": 4.984028276300021e-06, + "loss": 0.49382102489471436, + "mean_token_accuracy": 0.8388048410415649, + "num_tokens": 2981632.0, + "step": 334 + }, + { + "epoch": 0.25455927051671734, + "grad_norm": 2.2524826526641846, + "learning_rate": 4.983791038774585e-06, + "loss": 0.4947671890258789, + "mean_token_accuracy": 0.8066365122795105, + "num_tokens": 2988736.0, + "step": 335 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 1.7244199514389038, + "learning_rate": 4.983552058028985e-06, + "loss": 0.48096776008605957, + "mean_token_accuracy": 0.830735445022583, + "num_tokens": 3003576.0, + "step": 336 + }, + { + "epoch": 0.2560790273556231, + "grad_norm": 3.0628933906555176, + "learning_rate": 4.9833113342309495e-06, + "loss": 0.6027032136917114, + "mean_token_accuracy": 0.8008694648742676, + "num_tokens": 3009549.0, + "step": 337 + }, + { + "epoch": 0.256838905775076, + "grad_norm": 2.438674211502075, + "learning_rate": 4.983068867549427e-06, + "loss": 0.517090916633606, + "mean_token_accuracy": 0.827893853187561, + "num_tokens": 3015236.0, + "step": 338 + }, + { + "epoch": 0.25759878419452886, + "grad_norm": 2.131535053253174, + "learning_rate": 4.982824658154589e-06, + "loss": 0.6656812429428101, + "mean_token_accuracy": 0.7772425413131714, + "num_tokens": 3028142.0, + "step": 339 + }, + { + "epoch": 0.25835866261398177, + "grad_norm": 2.3206584453582764, + "learning_rate": 4.9825787062178315e-06, + "loss": 0.5757625699043274, + "mean_token_accuracy": 0.8073873519897461, + "num_tokens": 3040996.0, + "step": 340 + }, + { + "epoch": 0.2591185410334346, + "grad_norm": 1.3905521631240845, + "learning_rate": 4.982331011911774e-06, + "loss": 0.4193805456161499, + "mean_token_accuracy": 0.8399466872215271, + "num_tokens": 3061931.0, + "step": 341 + }, + { + "epoch": 0.25987841945288753, + "grad_norm": 2.184173345565796, + "learning_rate": 4.982081575410256e-06, + "loss": 0.4751223921775818, + "mean_token_accuracy": 0.8409271240234375, + "num_tokens": 3069081.0, + "step": 342 + }, + { + "epoch": 0.26063829787234044, + "grad_norm": 3.538764238357544, + "learning_rate": 4.9818303968883445e-06, + "loss": 0.8119601011276245, + "mean_token_accuracy": 0.7442739009857178, + "num_tokens": 3073628.0, + "step": 343 + }, + { + "epoch": 0.2613981762917933, + "grad_norm": 1.8063762187957764, + "learning_rate": 4.981577476522323e-06, + "loss": 0.5615730881690979, + "mean_token_accuracy": 0.8207751512527466, + "num_tokens": 3086596.0, + "step": 344 + }, + { + "epoch": 0.2621580547112462, + "grad_norm": 2.4346961975097656, + "learning_rate": 4.981322814489703e-06, + "loss": 0.5266709327697754, + "mean_token_accuracy": 0.8211277723312378, + "num_tokens": 3092631.0, + "step": 345 + }, + { + "epoch": 0.2629179331306991, + "grad_norm": 1.91289484500885, + "learning_rate": 4.981066410969215e-06, + "loss": 0.5047177672386169, + "mean_token_accuracy": 0.8356877565383911, + "num_tokens": 3101102.0, + "step": 346 + }, + { + "epoch": 0.26367781155015196, + "grad_norm": 2.1495707035064697, + "learning_rate": 4.980808266140813e-06, + "loss": 0.47876280546188354, + "mean_token_accuracy": 0.8364313244819641, + "num_tokens": 3107998.0, + "step": 347 + }, + { + "epoch": 0.26443768996960487, + "grad_norm": 2.5961992740631104, + "learning_rate": 4.9805483801856744e-06, + "loss": 0.5512958765029907, + "mean_token_accuracy": 0.8181467652320862, + "num_tokens": 3113848.0, + "step": 348 + }, + { + "epoch": 0.2651975683890577, + "grad_norm": 3.2828900814056396, + "learning_rate": 4.980286753286196e-06, + "loss": 0.4217945635318756, + "mean_token_accuracy": 0.8617103099822998, + "num_tokens": 3117652.0, + "step": 349 + }, + { + "epoch": 0.26595744680851063, + "grad_norm": 1.425554871559143, + "learning_rate": 4.980023385625996e-06, + "loss": 0.4042487144470215, + "mean_token_accuracy": 0.8492785692214966, + "num_tokens": 3132336.0, + "step": 350 + }, + { + "epoch": 0.26671732522796354, + "grad_norm": 2.933504104614258, + "learning_rate": 4.979758277389919e-06, + "loss": 0.5406704545021057, + "mean_token_accuracy": 0.8035423755645752, + "num_tokens": 3137544.0, + "step": 351 + }, + { + "epoch": 0.2674772036474164, + "grad_norm": 1.9958966970443726, + "learning_rate": 4.9794914287640264e-06, + "loss": 0.5857555270195007, + "mean_token_accuracy": 0.7965140342712402, + "num_tokens": 3149705.0, + "step": 352 + }, + { + "epoch": 0.2682370820668693, + "grad_norm": 2.467694044113159, + "learning_rate": 4.979222839935602e-06, + "loss": 0.6404043436050415, + "mean_token_accuracy": 0.7823755741119385, + "num_tokens": 3158353.0, + "step": 353 + }, + { + "epoch": 0.2689969604863222, + "grad_norm": 2.0102720260620117, + "learning_rate": 4.9789525110931545e-06, + "loss": 0.5681496858596802, + "mean_token_accuracy": 0.8108169436454773, + "num_tokens": 3167121.0, + "step": 354 + }, + { + "epoch": 0.26975683890577506, + "grad_norm": 2.6017866134643555, + "learning_rate": 4.978680442426409e-06, + "loss": 0.6309828162193298, + "mean_token_accuracy": 0.7742617130279541, + "num_tokens": 3175012.0, + "step": 355 + }, + { + "epoch": 0.270516717325228, + "grad_norm": 1.8799268007278442, + "learning_rate": 4.978406634126315e-06, + "loss": 0.524029016494751, + "mean_token_accuracy": 0.8317689895629883, + "num_tokens": 3185331.0, + "step": 356 + }, + { + "epoch": 0.2712765957446808, + "grad_norm": 1.508332371711731, + "learning_rate": 4.978131086385041e-06, + "loss": 0.46656402945518494, + "mean_token_accuracy": 0.8339117765426636, + "num_tokens": 3198813.0, + "step": 357 + }, + { + "epoch": 0.27203647416413373, + "grad_norm": 3.595707654953003, + "learning_rate": 4.977853799395976e-06, + "loss": 0.5101234912872314, + "mean_token_accuracy": 0.8251723051071167, + "num_tokens": 3206557.0, + "step": 358 + }, + { + "epoch": 0.27279635258358664, + "grad_norm": 3.5317916870117188, + "learning_rate": 4.977574773353732e-06, + "loss": 0.5684665441513062, + "mean_token_accuracy": 0.8124493360519409, + "num_tokens": 3210912.0, + "step": 359 + }, + { + "epoch": 0.2735562310030395, + "grad_norm": 2.8606204986572266, + "learning_rate": 4.97729400845414e-06, + "loss": 0.4746384620666504, + "mean_token_accuracy": 0.8195606470108032, + "num_tokens": 3215365.0, + "step": 360 + }, + { + "epoch": 0.2743161094224924, + "grad_norm": 1.8214033842086792, + "learning_rate": 4.977011504894253e-06, + "loss": 0.4842769503593445, + "mean_token_accuracy": 0.82928866147995, + "num_tokens": 3224037.0, + "step": 361 + }, + { + "epoch": 0.2750759878419453, + "grad_norm": 1.628746509552002, + "learning_rate": 4.97672726287234e-06, + "loss": 0.4397493302822113, + "mean_token_accuracy": 0.8606528043746948, + "num_tokens": 3235589.0, + "step": 362 + }, + { + "epoch": 0.27583586626139817, + "grad_norm": 3.557973861694336, + "learning_rate": 4.976441282587894e-06, + "loss": 0.5732032060623169, + "mean_token_accuracy": 0.8041545748710632, + "num_tokens": 3239958.0, + "step": 363 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 1.3467901945114136, + "learning_rate": 4.9761535642416284e-06, + "loss": 0.4525323510169983, + "mean_token_accuracy": 0.8281061053276062, + "num_tokens": 3257703.0, + "step": 364 + }, + { + "epoch": 0.2773556231003039, + "grad_norm": 2.2649986743927, + "learning_rate": 4.9758641080354745e-06, + "loss": 0.5074734687805176, + "mean_token_accuracy": 0.8447474241256714, + "num_tokens": 3264334.0, + "step": 365 + }, + { + "epoch": 0.27811550151975684, + "grad_norm": 2.8667566776275635, + "learning_rate": 4.975572914172581e-06, + "loss": 0.5759559869766235, + "mean_token_accuracy": 0.7976793050765991, + "num_tokens": 3269314.0, + "step": 366 + }, + { + "epoch": 0.27887537993920974, + "grad_norm": 2.2514986991882324, + "learning_rate": 4.975279982857324e-06, + "loss": 0.5786465406417847, + "mean_token_accuracy": 0.8058781623840332, + "num_tokens": 3277324.0, + "step": 367 + }, + { + "epoch": 0.2796352583586626, + "grad_norm": 1.3826723098754883, + "learning_rate": 4.97498531429529e-06, + "loss": 0.40801727771759033, + "mean_token_accuracy": 0.8601310849189758, + "num_tokens": 3290530.0, + "step": 368 + }, + { + "epoch": 0.2803951367781155, + "grad_norm": 2.084092617034912, + "learning_rate": 4.97468890869329e-06, + "loss": 0.47076648473739624, + "mean_token_accuracy": 0.8310186862945557, + "num_tokens": 3298325.0, + "step": 369 + }, + { + "epoch": 0.2811550151975684, + "grad_norm": 1.3467998504638672, + "learning_rate": 4.974390766259353e-06, + "loss": 0.44668465852737427, + "mean_token_accuracy": 0.8275353908538818, + "num_tokens": 3314302.0, + "step": 370 + }, + { + "epoch": 0.28191489361702127, + "grad_norm": 2.5921075344085693, + "learning_rate": 4.974090887202726e-06, + "loss": 0.5343953967094421, + "mean_token_accuracy": 0.8110706806182861, + "num_tokens": 3320963.0, + "step": 371 + }, + { + "epoch": 0.2826747720364742, + "grad_norm": 2.042781352996826, + "learning_rate": 4.973789271733877e-06, + "loss": 0.6293343305587769, + "mean_token_accuracy": 0.7800243496894836, + "num_tokens": 3332742.0, + "step": 372 + }, + { + "epoch": 0.28343465045592703, + "grad_norm": 4.822193145751953, + "learning_rate": 4.973485920064491e-06, + "loss": 0.6256728768348694, + "mean_token_accuracy": 0.7962433099746704, + "num_tokens": 3335872.0, + "step": 373 + }, + { + "epoch": 0.28419452887537994, + "grad_norm": 1.260988473892212, + "learning_rate": 4.973180832407471e-06, + "loss": 0.38731223344802856, + "mean_token_accuracy": 0.8385066986083984, + "num_tokens": 3351884.0, + "step": 374 + }, + { + "epoch": 0.28495440729483285, + "grad_norm": 2.669966697692871, + "learning_rate": 4.97287400897694e-06, + "loss": 0.5594710111618042, + "mean_token_accuracy": 0.8097212314605713, + "num_tokens": 3358197.0, + "step": 375 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 3.0344486236572266, + "learning_rate": 4.972565449988238e-06, + "loss": 0.34449583292007446, + "mean_token_accuracy": 0.8813316822052002, + "num_tokens": 3362133.0, + "step": 376 + }, + { + "epoch": 0.2864741641337386, + "grad_norm": 2.562251091003418, + "learning_rate": 4.972255155657925e-06, + "loss": 0.5331522822380066, + "mean_token_accuracy": 0.8212941288948059, + "num_tokens": 3370346.0, + "step": 377 + }, + { + "epoch": 0.2872340425531915, + "grad_norm": 2.7083740234375, + "learning_rate": 4.9719431262037755e-06, + "loss": 0.5403046011924744, + "mean_token_accuracy": 0.8108335733413696, + "num_tokens": 3375588.0, + "step": 378 + }, + { + "epoch": 0.28799392097264437, + "grad_norm": 1.396430492401123, + "learning_rate": 4.971629361844785e-06, + "loss": 0.4041529893875122, + "mean_token_accuracy": 0.8588063716888428, + "num_tokens": 3390749.0, + "step": 379 + }, + { + "epoch": 0.2887537993920973, + "grad_norm": 1.9872784614562988, + "learning_rate": 4.971313862801166e-06, + "loss": 0.4336993098258972, + "mean_token_accuracy": 0.8511303663253784, + "num_tokens": 3399064.0, + "step": 380 + }, + { + "epoch": 0.28951367781155013, + "grad_norm": 1.9652575254440308, + "learning_rate": 4.9709966292943455e-06, + "loss": 0.4578358232975006, + "mean_token_accuracy": 0.8229440450668335, + "num_tokens": 3407229.0, + "step": 381 + }, + { + "epoch": 0.29027355623100304, + "grad_norm": 1.6626898050308228, + "learning_rate": 4.970677661546972e-06, + "loss": 0.5427594184875488, + "mean_token_accuracy": 0.815427303314209, + "num_tokens": 3422321.0, + "step": 382 + }, + { + "epoch": 0.29103343465045595, + "grad_norm": 3.5265562534332275, + "learning_rate": 4.970356959782909e-06, + "loss": 0.6661460995674133, + "mean_token_accuracy": 0.7856965065002441, + "num_tokens": 3427442.0, + "step": 383 + }, + { + "epoch": 0.2917933130699088, + "grad_norm": 1.667205572128296, + "learning_rate": 4.970034524227239e-06, + "loss": 0.36256325244903564, + "mean_token_accuracy": 0.8711205720901489, + "num_tokens": 3436662.0, + "step": 384 + }, + { + "epoch": 0.2925531914893617, + "grad_norm": 1.3389486074447632, + "learning_rate": 4.969710355106256e-06, + "loss": 0.4282698631286621, + "mean_token_accuracy": 0.838951587677002, + "num_tokens": 3450060.0, + "step": 385 + }, + { + "epoch": 0.2933130699088146, + "grad_norm": 2.5163397789001465, + "learning_rate": 4.969384452647477e-06, + "loss": 0.5176984071731567, + "mean_token_accuracy": 0.8235267996788025, + "num_tokens": 3456990.0, + "step": 386 + }, + { + "epoch": 0.29407294832826747, + "grad_norm": 1.7588495016098022, + "learning_rate": 4.969056817079633e-06, + "loss": 0.49710947275161743, + "mean_token_accuracy": 0.818520724773407, + "num_tokens": 3468098.0, + "step": 387 + }, + { + "epoch": 0.2948328267477204, + "grad_norm": 2.6381046772003174, + "learning_rate": 4.968727448632669e-06, + "loss": 0.4425308108329773, + "mean_token_accuracy": 0.8451643586158752, + "num_tokens": 3472899.0, + "step": 388 + }, + { + "epoch": 0.29559270516717323, + "grad_norm": 1.6345038414001465, + "learning_rate": 4.968396347537751e-06, + "loss": 0.4177059829235077, + "mean_token_accuracy": 0.8498886227607727, + "num_tokens": 3484826.0, + "step": 389 + }, + { + "epoch": 0.29635258358662614, + "grad_norm": 3.0466468334198, + "learning_rate": 4.968063514027258e-06, + "loss": 0.4274463951587677, + "mean_token_accuracy": 0.8387278318405151, + "num_tokens": 3488610.0, + "step": 390 + }, + { + "epoch": 0.29711246200607905, + "grad_norm": 2.6509406566619873, + "learning_rate": 4.967728948334784e-06, + "loss": 0.5401753783226013, + "mean_token_accuracy": 0.8252490162849426, + "num_tokens": 3493657.0, + "step": 391 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 1.6372219324111938, + "learning_rate": 4.967392650695141e-06, + "loss": 0.3862472176551819, + "mean_token_accuracy": 0.8555525541305542, + "num_tokens": 3505588.0, + "step": 392 + }, + { + "epoch": 0.2986322188449848, + "grad_norm": 2.1615452766418457, + "learning_rate": 4.967054621344356e-06, + "loss": 0.57850581407547, + "mean_token_accuracy": 0.8222678899765015, + "num_tokens": 3514396.0, + "step": 393 + }, + { + "epoch": 0.2993920972644377, + "grad_norm": 1.8610916137695312, + "learning_rate": 4.96671486051967e-06, + "loss": 0.5440595149993896, + "mean_token_accuracy": 0.8196715116500854, + "num_tokens": 3523604.0, + "step": 394 + }, + { + "epoch": 0.30015197568389057, + "grad_norm": 2.9585862159729004, + "learning_rate": 4.966373368459542e-06, + "loss": 0.6921588182449341, + "mean_token_accuracy": 0.7816659808158875, + "num_tokens": 3529849.0, + "step": 395 + }, + { + "epoch": 0.3009118541033435, + "grad_norm": 1.9374035596847534, + "learning_rate": 4.966030145403642e-06, + "loss": 0.5494055151939392, + "mean_token_accuracy": 0.8126792907714844, + "num_tokens": 3539529.0, + "step": 396 + }, + { + "epoch": 0.30167173252279633, + "grad_norm": 1.730530023574829, + "learning_rate": 4.965685191592859e-06, + "loss": 0.4271572232246399, + "mean_token_accuracy": 0.8383668661117554, + "num_tokens": 3550972.0, + "step": 397 + }, + { + "epoch": 0.30243161094224924, + "grad_norm": 3.9635560512542725, + "learning_rate": 4.9653385072692935e-06, + "loss": 0.5576210021972656, + "mean_token_accuracy": 0.799404501914978, + "num_tokens": 3554147.0, + "step": 398 + }, + { + "epoch": 0.30319148936170215, + "grad_norm": 2.5731968879699707, + "learning_rate": 4.964990092676263e-06, + "loss": 0.5478942394256592, + "mean_token_accuracy": 0.8220961093902588, + "num_tokens": 3559972.0, + "step": 399 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 2.2096588611602783, + "learning_rate": 4.964639948058297e-06, + "loss": 0.35461270809173584, + "mean_token_accuracy": 0.8640927076339722, + "num_tokens": 3565770.0, + "step": 400 + }, + { + "epoch": 0.3047112462006079, + "grad_norm": 1.7874189615249634, + "learning_rate": 4.964288073661142e-06, + "loss": 0.38849619030952454, + "mean_token_accuracy": 0.8443037271499634, + "num_tokens": 3574514.0, + "step": 401 + }, + { + "epoch": 0.30547112462006076, + "grad_norm": 1.5583146810531616, + "learning_rate": 4.963934469731756e-06, + "loss": 0.48909449577331543, + "mean_token_accuracy": 0.8429768681526184, + "num_tokens": 3585877.0, + "step": 402 + }, + { + "epoch": 0.30623100303951367, + "grad_norm": 3.026599645614624, + "learning_rate": 4.963579136518312e-06, + "loss": 0.5138992071151733, + "mean_token_accuracy": 0.8283728361129761, + "num_tokens": 3590412.0, + "step": 403 + }, + { + "epoch": 0.3069908814589666, + "grad_norm": 2.777505874633789, + "learning_rate": 4.963222074270197e-06, + "loss": 0.6241534948348999, + "mean_token_accuracy": 0.8130464553833008, + "num_tokens": 3596246.0, + "step": 404 + }, + { + "epoch": 0.30775075987841943, + "grad_norm": 2.4772839546203613, + "learning_rate": 4.962863283238011e-06, + "loss": 0.5930814146995544, + "mean_token_accuracy": 0.8036394715309143, + "num_tokens": 3602878.0, + "step": 405 + }, + { + "epoch": 0.30851063829787234, + "grad_norm": 1.5049982070922852, + "learning_rate": 4.962502763673566e-06, + "loss": 0.4903082549571991, + "mean_token_accuracy": 0.8184912204742432, + "num_tokens": 3617018.0, + "step": 406 + }, + { + "epoch": 0.30927051671732525, + "grad_norm": 2.453155040740967, + "learning_rate": 4.96214051582989e-06, + "loss": 0.5138067603111267, + "mean_token_accuracy": 0.8336835503578186, + "num_tokens": 3624188.0, + "step": 407 + }, + { + "epoch": 0.3100303951367781, + "grad_norm": 2.4038336277008057, + "learning_rate": 4.961776539961222e-06, + "loss": 0.5752760171890259, + "mean_token_accuracy": 0.8054730892181396, + "num_tokens": 3634152.0, + "step": 408 + }, + { + "epoch": 0.310790273556231, + "grad_norm": 2.629068374633789, + "learning_rate": 4.961410836323014e-06, + "loss": 0.5580606460571289, + "mean_token_accuracy": 0.8121089935302734, + "num_tokens": 3639528.0, + "step": 409 + }, + { + "epoch": 0.31155015197568386, + "grad_norm": 1.4245928525924683, + "learning_rate": 4.961043405171931e-06, + "loss": 0.5399882793426514, + "mean_token_accuracy": 0.812280535697937, + "num_tokens": 3655744.0, + "step": 410 + }, + { + "epoch": 0.3123100303951368, + "grad_norm": 1.5236459970474243, + "learning_rate": 4.9606742467658505e-06, + "loss": 0.5234690308570862, + "mean_token_accuracy": 0.8188928365707397, + "num_tokens": 3675010.0, + "step": 411 + }, + { + "epoch": 0.3130699088145897, + "grad_norm": 2.27961802482605, + "learning_rate": 4.960303361363863e-06, + "loss": 0.5502505898475647, + "mean_token_accuracy": 0.8161963224411011, + "num_tokens": 3682328.0, + "step": 412 + }, + { + "epoch": 0.31382978723404253, + "grad_norm": 1.554518222808838, + "learning_rate": 4.959930749226269e-06, + "loss": 0.420867919921875, + "mean_token_accuracy": 0.8499157428741455, + "num_tokens": 3694980.0, + "step": 413 + }, + { + "epoch": 0.31458966565349544, + "grad_norm": 2.609218120574951, + "learning_rate": 4.9595564106145825e-06, + "loss": 0.4706704318523407, + "mean_token_accuracy": 0.8412490487098694, + "num_tokens": 3700033.0, + "step": 414 + }, + { + "epoch": 0.31534954407294835, + "grad_norm": 1.5303231477737427, + "learning_rate": 4.959180345791528e-06, + "loss": 0.4668654799461365, + "mean_token_accuracy": 0.8125015497207642, + "num_tokens": 3715012.0, + "step": 415 + }, + { + "epoch": 0.3161094224924012, + "grad_norm": 1.2774665355682373, + "learning_rate": 4.958802555021042e-06, + "loss": 0.4339369237422943, + "mean_token_accuracy": 0.8442851901054382, + "num_tokens": 3733928.0, + "step": 416 + }, + { + "epoch": 0.3168693009118541, + "grad_norm": 2.1240181922912598, + "learning_rate": 4.958423038568274e-06, + "loss": 0.4029104709625244, + "mean_token_accuracy": 0.8627674579620361, + "num_tokens": 3740202.0, + "step": 417 + }, + { + "epoch": 0.31762917933130697, + "grad_norm": 2.00538969039917, + "learning_rate": 4.958041796699583e-06, + "loss": 0.5229607820510864, + "mean_token_accuracy": 0.8282366394996643, + "num_tokens": 3749308.0, + "step": 418 + }, + { + "epoch": 0.3183890577507599, + "grad_norm": 2.6555092334747314, + "learning_rate": 4.957658829682539e-06, + "loss": 0.5344101190567017, + "mean_token_accuracy": 0.8183202743530273, + "num_tokens": 3754595.0, + "step": 419 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 1.7468839883804321, + "learning_rate": 4.9572741377859225e-06, + "loss": 0.5667245984077454, + "mean_token_accuracy": 0.8080123662948608, + "num_tokens": 3765761.0, + "step": 420 + }, + { + "epoch": 0.31990881458966564, + "grad_norm": 2.9612457752227783, + "learning_rate": 4.956887721279726e-06, + "loss": 0.5389559864997864, + "mean_token_accuracy": 0.8019476532936096, + "num_tokens": 3770844.0, + "step": 421 + }, + { + "epoch": 0.32066869300911854, + "grad_norm": 1.842403769493103, + "learning_rate": 4.95649958043515e-06, + "loss": 0.38279837369918823, + "mean_token_accuracy": 0.858866810798645, + "num_tokens": 3778094.0, + "step": 422 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 2.3108131885528564, + "learning_rate": 4.956109715524609e-06, + "loss": 0.5453893542289734, + "mean_token_accuracy": 0.8085013031959534, + "num_tokens": 3785015.0, + "step": 423 + }, + { + "epoch": 0.3221884498480243, + "grad_norm": 3.0326945781707764, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.5550523400306702, + "mean_token_accuracy": 0.8125876188278198, + "num_tokens": 3789830.0, + "step": 424 + }, + { + "epoch": 0.3229483282674772, + "grad_norm": 1.8851977586746216, + "learning_rate": 4.955324814601324e-06, + "loss": 0.4902324974536896, + "mean_token_accuracy": 0.8205406665802002, + "num_tokens": 3799862.0, + "step": 425 + }, + { + "epoch": 0.32370820668693007, + "grad_norm": 2.6018171310424805, + "learning_rate": 4.954929779139455e-06, + "loss": 0.5920133590698242, + "mean_token_accuracy": 0.8340690732002258, + "num_tokens": 3806617.0, + "step": 426 + }, + { + "epoch": 0.324468085106383, + "grad_norm": 2.4283878803253174, + "learning_rate": 4.954533020713367e-06, + "loss": 0.5305854082107544, + "mean_token_accuracy": 0.8137468099594116, + "num_tokens": 3813843.0, + "step": 427 + }, + { + "epoch": 0.3252279635258359, + "grad_norm": 2.667978525161743, + "learning_rate": 4.954134539601519e-06, + "loss": 0.5333638787269592, + "mean_token_accuracy": 0.8402629494667053, + "num_tokens": 3819450.0, + "step": 428 + }, + { + "epoch": 0.32598784194528874, + "grad_norm": 1.7302523851394653, + "learning_rate": 4.953734336083582e-06, + "loss": 0.422895610332489, + "mean_token_accuracy": 0.8709704875946045, + "num_tokens": 3831027.0, + "step": 429 + }, + { + "epoch": 0.32674772036474165, + "grad_norm": 2.427192211151123, + "learning_rate": 4.953332410440434e-06, + "loss": 0.6334598064422607, + "mean_token_accuracy": 0.7817479968070984, + "num_tokens": 3841776.0, + "step": 430 + }, + { + "epoch": 0.32750759878419455, + "grad_norm": 1.460949182510376, + "learning_rate": 4.952928762954161e-06, + "loss": 0.3654777705669403, + "mean_token_accuracy": 0.8780122995376587, + "num_tokens": 3852213.0, + "step": 431 + }, + { + "epoch": 0.3282674772036474, + "grad_norm": 1.9855005741119385, + "learning_rate": 4.952523393908059e-06, + "loss": 0.5117089748382568, + "mean_token_accuracy": 0.811911404132843, + "num_tokens": 3861176.0, + "step": 432 + }, + { + "epoch": 0.3290273556231003, + "grad_norm": 2.2653207778930664, + "learning_rate": 4.952116303586631e-06, + "loss": 0.42514950037002563, + "mean_token_accuracy": 0.8448518514633179, + "num_tokens": 3867164.0, + "step": 433 + }, + { + "epoch": 0.32978723404255317, + "grad_norm": 1.9780964851379395, + "learning_rate": 4.951707492275589e-06, + "loss": 0.5095293521881104, + "mean_token_accuracy": 0.8262748718261719, + "num_tokens": 3876406.0, + "step": 434 + }, + { + "epoch": 0.3305471124620061, + "grad_norm": 2.9480233192443848, + "learning_rate": 4.951296960261853e-06, + "loss": 0.3494448959827423, + "mean_token_accuracy": 0.8781307935714722, + "num_tokens": 3880298.0, + "step": 435 + }, + { + "epoch": 0.331306990881459, + "grad_norm": 2.335571527481079, + "learning_rate": 4.95088470783355e-06, + "loss": 0.5456914901733398, + "mean_token_accuracy": 0.816297173500061, + "num_tokens": 3886487.0, + "step": 436 + }, + { + "epoch": 0.33206686930091184, + "grad_norm": 2.3046419620513916, + "learning_rate": 4.950470735280013e-06, + "loss": 0.4835948944091797, + "mean_token_accuracy": 0.8539175391197205, + "num_tokens": 3892706.0, + "step": 437 + }, + { + "epoch": 0.33282674772036475, + "grad_norm": 2.44047474861145, + "learning_rate": 4.950055042891786e-06, + "loss": 0.5154092907905579, + "mean_token_accuracy": 0.8579919338226318, + "num_tokens": 3899532.0, + "step": 438 + }, + { + "epoch": 0.33358662613981765, + "grad_norm": 4.826764106750488, + "learning_rate": 4.949637630960618e-06, + "loss": 0.5270259976387024, + "mean_token_accuracy": 0.8172192573547363, + "num_tokens": 3902260.0, + "step": 439 + }, + { + "epoch": 0.3343465045592705, + "grad_norm": 2.001574754714966, + "learning_rate": 4.949218499779462e-06, + "loss": 0.5413002967834473, + "mean_token_accuracy": 0.8162837028503418, + "num_tokens": 3911706.0, + "step": 440 + }, + { + "epoch": 0.3351063829787234, + "grad_norm": 1.7998944520950317, + "learning_rate": 4.948797649642484e-06, + "loss": 0.5131614208221436, + "mean_token_accuracy": 0.8367440700531006, + "num_tokens": 3923490.0, + "step": 441 + }, + { + "epoch": 0.33586626139817627, + "grad_norm": 3.4566173553466797, + "learning_rate": 4.94837508084505e-06, + "loss": 0.7258909940719604, + "mean_token_accuracy": 0.771377444267273, + "num_tokens": 3928099.0, + "step": 442 + }, + { + "epoch": 0.3366261398176292, + "grad_norm": 2.0040442943573, + "learning_rate": 4.9479507936837364e-06, + "loss": 0.482135534286499, + "mean_token_accuracy": 0.8339327573776245, + "num_tokens": 3937328.0, + "step": 443 + }, + { + "epoch": 0.3373860182370821, + "grad_norm": 2.949502944946289, + "learning_rate": 4.947524788456325e-06, + "loss": 0.6474795341491699, + "mean_token_accuracy": 0.7951677441596985, + "num_tokens": 3942529.0, + "step": 444 + }, + { + "epoch": 0.33814589665653494, + "grad_norm": 1.5528364181518555, + "learning_rate": 4.947097065461801e-06, + "loss": 0.48791584372520447, + "mean_token_accuracy": 0.8425545692443848, + "num_tokens": 3955200.0, + "step": 445 + }, + { + "epoch": 0.33890577507598785, + "grad_norm": 1.8813284635543823, + "learning_rate": 4.946667625000358e-06, + "loss": 0.45922309160232544, + "mean_token_accuracy": 0.8206527233123779, + "num_tokens": 3962975.0, + "step": 446 + }, + { + "epoch": 0.33966565349544076, + "grad_norm": 1.7157847881317139, + "learning_rate": 4.946236467373392e-06, + "loss": 0.5454182028770447, + "mean_token_accuracy": 0.8049604892730713, + "num_tokens": 3973956.0, + "step": 447 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 2.008857250213623, + "learning_rate": 4.945803592883509e-06, + "loss": 0.5151860117912292, + "mean_token_accuracy": 0.8262045383453369, + "num_tokens": 3982853.0, + "step": 448 + }, + { + "epoch": 0.3411854103343465, + "grad_norm": 1.6632496118545532, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.42710691690444946, + "mean_token_accuracy": 0.8521314859390259, + "num_tokens": 3993838.0, + "step": 449 + }, + { + "epoch": 0.34194528875379937, + "grad_norm": 1.365234375, + "learning_rate": 4.944932694531423e-06, + "loss": 0.5172526836395264, + "mean_token_accuracy": 0.8277045488357544, + "num_tokens": 4014179.0, + "step": 450 + }, + { + "epoch": 0.3427051671732523, + "grad_norm": 1.7610243558883667, + "learning_rate": 4.94449467128045e-06, + "loss": 0.42104798555374146, + "mean_token_accuracy": 0.8552065491676331, + "num_tokens": 4023663.0, + "step": 451 + }, + { + "epoch": 0.3434650455927052, + "grad_norm": 2.3732354640960693, + "learning_rate": 4.944054932389018e-06, + "loss": 0.5471175909042358, + "mean_token_accuracy": 0.8487317562103271, + "num_tokens": 4030100.0, + "step": 452 + }, + { + "epoch": 0.34422492401215804, + "grad_norm": 1.5973623991012573, + "learning_rate": 4.943613478165753e-06, + "loss": 0.419813871383667, + "mean_token_accuracy": 0.8484025001525879, + "num_tokens": 4041124.0, + "step": 453 + }, + { + "epoch": 0.34498480243161095, + "grad_norm": 2.966381549835205, + "learning_rate": 4.943170308920484e-06, + "loss": 0.5370652675628662, + "mean_token_accuracy": 0.8439491987228394, + "num_tokens": 4045675.0, + "step": 454 + }, + { + "epoch": 0.34574468085106386, + "grad_norm": 2.5097248554229736, + "learning_rate": 4.9427254249642445e-06, + "loss": 0.5776349306106567, + "mean_token_accuracy": 0.8060523867607117, + "num_tokens": 4053250.0, + "step": 455 + }, + { + "epoch": 0.3465045592705167, + "grad_norm": 1.6779125928878784, + "learning_rate": 4.942278826609272e-06, + "loss": 0.5245476961135864, + "mean_token_accuracy": 0.8168526887893677, + "num_tokens": 4064106.0, + "step": 456 + }, + { + "epoch": 0.3472644376899696, + "grad_norm": 1.5945546627044678, + "learning_rate": 4.9418305141690045e-06, + "loss": 0.4972047209739685, + "mean_token_accuracy": 0.8257735967636108, + "num_tokens": 4077687.0, + "step": 457 + }, + { + "epoch": 0.34802431610942247, + "grad_norm": 2.864778757095337, + "learning_rate": 4.9413804879580865e-06, + "loss": 0.5372499823570251, + "mean_token_accuracy": 0.8423776626586914, + "num_tokens": 4082632.0, + "step": 458 + }, + { + "epoch": 0.3487841945288754, + "grad_norm": 1.4797078371047974, + "learning_rate": 4.940928748292363e-06, + "loss": 0.5903409719467163, + "mean_token_accuracy": 0.8061295747756958, + "num_tokens": 4104218.0, + "step": 459 + }, + { + "epoch": 0.3495440729483283, + "grad_norm": 2.4376983642578125, + "learning_rate": 4.940475295488882e-06, + "loss": 0.4534894824028015, + "mean_token_accuracy": 0.8395825028419495, + "num_tokens": 4110530.0, + "step": 460 + }, + { + "epoch": 0.35030395136778114, + "grad_norm": 1.2955626249313354, + "learning_rate": 4.940020129865895e-06, + "loss": 0.47155818343162537, + "mean_token_accuracy": 0.8253582715988159, + "num_tokens": 4128398.0, + "step": 461 + }, + { + "epoch": 0.35106382978723405, + "grad_norm": 2.066575527191162, + "learning_rate": 4.9395632517428546e-06, + "loss": 0.5555641651153564, + "mean_token_accuracy": 0.814624547958374, + "num_tokens": 4137623.0, + "step": 462 + }, + { + "epoch": 0.3518237082066869, + "grad_norm": 1.6407525539398193, + "learning_rate": 4.939104661440415e-06, + "loss": 0.4361790418624878, + "mean_token_accuracy": 0.8544459342956543, + "num_tokens": 4152803.0, + "step": 463 + }, + { + "epoch": 0.3525835866261398, + "grad_norm": 2.1685116291046143, + "learning_rate": 4.938644359280433e-06, + "loss": 0.5347012877464294, + "mean_token_accuracy": 0.853853702545166, + "num_tokens": 4160778.0, + "step": 464 + }, + { + "epoch": 0.3533434650455927, + "grad_norm": 1.8824869394302368, + "learning_rate": 4.938182345585967e-06, + "loss": 0.5512481927871704, + "mean_token_accuracy": 0.7985891699790955, + "num_tokens": 4170380.0, + "step": 465 + }, + { + "epoch": 0.3541033434650456, + "grad_norm": 2.2229504585266113, + "learning_rate": 4.937718620681273e-06, + "loss": 0.516828179359436, + "mean_token_accuracy": 0.8265621066093445, + "num_tokens": 4178179.0, + "step": 466 + }, + { + "epoch": 0.3548632218844985, + "grad_norm": 1.955990195274353, + "learning_rate": 4.9372531848918145e-06, + "loss": 0.5586158037185669, + "mean_token_accuracy": 0.8367916345596313, + "num_tokens": 4188626.0, + "step": 467 + }, + { + "epoch": 0.3556231003039514, + "grad_norm": 1.9687023162841797, + "learning_rate": 4.936786038544251e-06, + "loss": 0.5517531633377075, + "mean_token_accuracy": 0.8134098052978516, + "num_tokens": 4198144.0, + "step": 468 + }, + { + "epoch": 0.35638297872340424, + "grad_norm": 1.405516505241394, + "learning_rate": 4.9363171819664434e-06, + "loss": 0.5305492877960205, + "mean_token_accuracy": 0.8014427423477173, + "num_tokens": 4222818.0, + "step": 469 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 2.6355695724487305, + "learning_rate": 4.9358466154874535e-06, + "loss": 0.5303391218185425, + "mean_token_accuracy": 0.8028861284255981, + "num_tokens": 4228318.0, + "step": 470 + }, + { + "epoch": 0.35790273556231, + "grad_norm": 1.5133824348449707, + "learning_rate": 4.935374339437543e-06, + "loss": 0.5329189300537109, + "mean_token_accuracy": 0.8479441404342651, + "num_tokens": 4244527.0, + "step": 471 + }, + { + "epoch": 0.3586626139817629, + "grad_norm": 3.4356725215911865, + "learning_rate": 4.934900354148173e-06, + "loss": 0.5431582927703857, + "mean_token_accuracy": 0.8328983783721924, + "num_tokens": 4248034.0, + "step": 472 + }, + { + "epoch": 0.3594224924012158, + "grad_norm": 2.5789499282836914, + "learning_rate": 4.934424659952006e-06, + "loss": 0.4141455292701721, + "mean_token_accuracy": 0.8658635020256042, + "num_tokens": 4252953.0, + "step": 473 + }, + { + "epoch": 0.3601823708206687, + "grad_norm": 1.145262598991394, + "learning_rate": 4.933947257182901e-06, + "loss": 0.40294092893600464, + "mean_token_accuracy": 0.8565847277641296, + "num_tokens": 4277813.0, + "step": 474 + }, + { + "epoch": 0.3609422492401216, + "grad_norm": 1.7242133617401123, + "learning_rate": 4.933468146175918e-06, + "loss": 0.6036738753318787, + "mean_token_accuracy": 0.8072597980499268, + "num_tokens": 4291088.0, + "step": 475 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 2.3490941524505615, + "learning_rate": 4.932987327267317e-06, + "loss": 0.49456146359443665, + "mean_token_accuracy": 0.8372673988342285, + "num_tokens": 4297376.0, + "step": 476 + }, + { + "epoch": 0.36246200607902734, + "grad_norm": 1.3605526685714722, + "learning_rate": 4.932504800794553e-06, + "loss": 0.43595948815345764, + "mean_token_accuracy": 0.8415953516960144, + "num_tokens": 4312054.0, + "step": 477 + }, + { + "epoch": 0.36322188449848025, + "grad_norm": 1.4525885581970215, + "learning_rate": 4.9320205670962815e-06, + "loss": 0.5390371680259705, + "mean_token_accuracy": 0.8101649284362793, + "num_tokens": 4328701.0, + "step": 478 + }, + { + "epoch": 0.3639817629179331, + "grad_norm": 1.9862419366836548, + "learning_rate": 4.931534626512359e-06, + "loss": 0.45436930656433105, + "mean_token_accuracy": 0.8352861404418945, + "num_tokens": 4338372.0, + "step": 479 + }, + { + "epoch": 0.364741641337386, + "grad_norm": 1.7804961204528809, + "learning_rate": 4.931046979383836e-06, + "loss": 0.4677754044532776, + "mean_token_accuracy": 0.840467095375061, + "num_tokens": 4347897.0, + "step": 480 + }, + { + "epoch": 0.3655015197568389, + "grad_norm": 2.066632032394409, + "learning_rate": 4.930557626052961e-06, + "loss": 0.42418140172958374, + "mean_token_accuracy": 0.8528275489807129, + "num_tokens": 4354061.0, + "step": 481 + }, + { + "epoch": 0.3662613981762918, + "grad_norm": 1.6155282258987427, + "learning_rate": 4.930066566863182e-06, + "loss": 0.5424284934997559, + "mean_token_accuracy": 0.825040876865387, + "num_tokens": 4370400.0, + "step": 482 + }, + { + "epoch": 0.3670212765957447, + "grad_norm": 2.1452953815460205, + "learning_rate": 4.929573802159143e-06, + "loss": 0.5105804204940796, + "mean_token_accuracy": 0.8284053802490234, + "num_tokens": 4377579.0, + "step": 483 + }, + { + "epoch": 0.3677811550151976, + "grad_norm": 1.8940945863723755, + "learning_rate": 4.929079332286685e-06, + "loss": 0.43478304147720337, + "mean_token_accuracy": 0.8505665063858032, + "num_tokens": 4385686.0, + "step": 484 + }, + { + "epoch": 0.36854103343465044, + "grad_norm": 1.6785860061645508, + "learning_rate": 4.928583157592846e-06, + "loss": 0.40227848291397095, + "mean_token_accuracy": 0.8623573780059814, + "num_tokens": 4396128.0, + "step": 485 + }, + { + "epoch": 0.36930091185410335, + "grad_norm": 1.6416733264923096, + "learning_rate": 4.928085278425862e-06, + "loss": 0.526267409324646, + "mean_token_accuracy": 0.8284667730331421, + "num_tokens": 4407963.0, + "step": 486 + }, + { + "epoch": 0.3700607902735562, + "grad_norm": 1.8882389068603516, + "learning_rate": 4.927585695135162e-06, + "loss": 0.5555213093757629, + "mean_token_accuracy": 0.8115293979644775, + "num_tokens": 4418057.0, + "step": 487 + }, + { + "epoch": 0.3708206686930091, + "grad_norm": 2.300248384475708, + "learning_rate": 4.9270844080713735e-06, + "loss": 0.5812339186668396, + "mean_token_accuracy": 0.800270676612854, + "num_tokens": 4425358.0, + "step": 488 + }, + { + "epoch": 0.371580547112462, + "grad_norm": 1.6802922487258911, + "learning_rate": 4.926581417586319e-06, + "loss": 0.5134941935539246, + "mean_token_accuracy": 0.8247408866882324, + "num_tokens": 4437702.0, + "step": 489 + }, + { + "epoch": 0.3723404255319149, + "grad_norm": 1.7620291709899902, + "learning_rate": 4.926076724033016e-06, + "loss": 0.5233973264694214, + "mean_token_accuracy": 0.8102161884307861, + "num_tokens": 4448584.0, + "step": 490 + }, + { + "epoch": 0.3731003039513678, + "grad_norm": 1.6911998987197876, + "learning_rate": 4.925570327765678e-06, + "loss": 0.5337274074554443, + "mean_token_accuracy": 0.845306396484375, + "num_tokens": 4462651.0, + "step": 491 + }, + { + "epoch": 0.3738601823708207, + "grad_norm": 1.7991242408752441, + "learning_rate": 4.9250622291397144e-06, + "loss": 0.31018948554992676, + "mean_token_accuracy": 0.8857606053352356, + "num_tokens": 4469971.0, + "step": 492 + }, + { + "epoch": 0.37462006079027355, + "grad_norm": 4.9776835441589355, + "learning_rate": 4.924552428511727e-06, + "loss": 0.44114983081817627, + "mean_token_accuracy": 0.8429906368255615, + "num_tokens": 4478275.0, + "step": 493 + }, + { + "epoch": 0.37537993920972645, + "grad_norm": 1.8007272481918335, + "learning_rate": 4.924040926239515e-06, + "loss": 0.574328601360321, + "mean_token_accuracy": 0.7669196128845215, + "num_tokens": 4491551.0, + "step": 494 + }, + { + "epoch": 0.3761398176291793, + "grad_norm": 2.021300792694092, + "learning_rate": 4.92352772268207e-06, + "loss": 0.45636120438575745, + "mean_token_accuracy": 0.840438723564148, + "num_tokens": 4498658.0, + "step": 495 + }, + { + "epoch": 0.3768996960486322, + "grad_norm": 2.369748592376709, + "learning_rate": 4.923012818199576e-06, + "loss": 0.5206376910209656, + "mean_token_accuracy": 0.8521823287010193, + "num_tokens": 4504648.0, + "step": 496 + }, + { + "epoch": 0.3776595744680851, + "grad_norm": 2.733485221862793, + "learning_rate": 4.922496213153416e-06, + "loss": 0.5067723989486694, + "mean_token_accuracy": 0.8168281316757202, + "num_tokens": 4509990.0, + "step": 497 + }, + { + "epoch": 0.378419452887538, + "grad_norm": 2.3751676082611084, + "learning_rate": 4.921977907906161e-06, + "loss": 0.49757206439971924, + "mean_token_accuracy": 0.8325017690658569, + "num_tokens": 4518373.0, + "step": 498 + }, + { + "epoch": 0.3791793313069909, + "grad_norm": 2.1672775745391846, + "learning_rate": 4.921457902821578e-06, + "loss": 0.4237566590309143, + "mean_token_accuracy": 0.8404698371887207, + "num_tokens": 4524338.0, + "step": 499 + }, + { + "epoch": 0.3799392097264438, + "grad_norm": 1.8374360799789429, + "learning_rate": 4.9209361982646275e-06, + "loss": 0.4995468854904175, + "mean_token_accuracy": 0.8299649953842163, + "num_tokens": 4533396.0, + "step": 500 + }, + { + "epoch": 0.38069908814589665, + "grad_norm": 2.083967924118042, + "learning_rate": 4.920412794601461e-06, + "loss": 0.489935040473938, + "mean_token_accuracy": 0.8315291404724121, + "num_tokens": 4540941.0, + "step": 501 + }, + { + "epoch": 0.38145896656534956, + "grad_norm": 2.2075610160827637, + "learning_rate": 4.919887692199423e-06, + "loss": 0.5233147740364075, + "mean_token_accuracy": 0.804171085357666, + "num_tokens": 4548215.0, + "step": 502 + }, + { + "epoch": 0.3822188449848024, + "grad_norm": 2.076775312423706, + "learning_rate": 4.9193608914270515e-06, + "loss": 0.5785550475120544, + "mean_token_accuracy": 0.7993186116218567, + "num_tokens": 4558204.0, + "step": 503 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.238546133041382, + "learning_rate": 4.918832392654075e-06, + "loss": 0.5287384390830994, + "mean_token_accuracy": 0.8214945793151855, + "num_tokens": 4565407.0, + "step": 504 + }, + { + "epoch": 0.3837386018237082, + "grad_norm": 1.6783074140548706, + "learning_rate": 4.9183021962514145e-06, + "loss": 0.6063359379768372, + "mean_token_accuracy": 0.7914625406265259, + "num_tokens": 4580991.0, + "step": 505 + }, + { + "epoch": 0.3844984802431611, + "grad_norm": 1.6287449598312378, + "learning_rate": 4.917770302591183e-06, + "loss": 0.3598247766494751, + "mean_token_accuracy": 0.8706809878349304, + "num_tokens": 4590579.0, + "step": 506 + }, + { + "epoch": 0.385258358662614, + "grad_norm": 1.5432041883468628, + "learning_rate": 4.917236712046682e-06, + "loss": 0.5267890095710754, + "mean_token_accuracy": 0.8032117486000061, + "num_tokens": 4608380.0, + "step": 507 + }, + { + "epoch": 0.3860182370820669, + "grad_norm": 1.7664037942886353, + "learning_rate": 4.9167014249924075e-06, + "loss": 0.3552354574203491, + "mean_token_accuracy": 0.8569793701171875, + "num_tokens": 4616426.0, + "step": 508 + }, + { + "epoch": 0.38677811550151975, + "grad_norm": 2.1147472858428955, + "learning_rate": 4.916164441804044e-06, + "loss": 0.5212404727935791, + "mean_token_accuracy": 0.8196578025817871, + "num_tokens": 4623908.0, + "step": 509 + }, + { + "epoch": 0.38753799392097266, + "grad_norm": 2.1092333793640137, + "learning_rate": 4.915625762858467e-06, + "loss": 0.5197038650512695, + "mean_token_accuracy": 0.8245604634284973, + "num_tokens": 4630956.0, + "step": 510 + }, + { + "epoch": 0.3882978723404255, + "grad_norm": 1.23331880569458, + "learning_rate": 4.915085388533743e-06, + "loss": 0.4759839177131653, + "mean_token_accuracy": 0.8192248344421387, + "num_tokens": 4651269.0, + "step": 511 + }, + { + "epoch": 0.3890577507598784, + "grad_norm": 2.424199104309082, + "learning_rate": 4.914543319209126e-06, + "loss": 0.5576270818710327, + "mean_token_accuracy": 0.8203302621841431, + "num_tokens": 4657296.0, + "step": 512 + }, + { + "epoch": 0.3898176291793313, + "grad_norm": 2.725156307220459, + "learning_rate": 4.913999555265062e-06, + "loss": 0.4337949752807617, + "mean_token_accuracy": 0.8382406234741211, + "num_tokens": 4661850.0, + "step": 513 + }, + { + "epoch": 0.3905775075987842, + "grad_norm": 2.3120534420013428, + "learning_rate": 4.913454097083185e-06, + "loss": 0.4941597580909729, + "mean_token_accuracy": 0.8302834033966064, + "num_tokens": 4667769.0, + "step": 514 + }, + { + "epoch": 0.3913373860182371, + "grad_norm": 2.3111207485198975, + "learning_rate": 4.912906945046319e-06, + "loss": 0.5253715515136719, + "mean_token_accuracy": 0.84515380859375, + "num_tokens": 4674537.0, + "step": 515 + }, + { + "epoch": 0.39209726443769, + "grad_norm": 1.4117841720581055, + "learning_rate": 4.912358099538476e-06, + "loss": 0.4521017074584961, + "mean_token_accuracy": 0.8208256959915161, + "num_tokens": 4690605.0, + "step": 516 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 2.3742799758911133, + "learning_rate": 4.911807560944858e-06, + "loss": 0.41572901606559753, + "mean_token_accuracy": 0.8550551533699036, + "num_tokens": 4706437.0, + "step": 517 + }, + { + "epoch": 0.39361702127659576, + "grad_norm": 2.4052202701568604, + "learning_rate": 4.911255329651852e-06, + "loss": 0.6003736257553101, + "mean_token_accuracy": 0.8247885704040527, + "num_tokens": 4712746.0, + "step": 518 + }, + { + "epoch": 0.3943768996960486, + "grad_norm": 1.9335490465164185, + "learning_rate": 4.910701406047037e-06, + "loss": 0.5457713603973389, + "mean_token_accuracy": 0.787429690361023, + "num_tokens": 4731937.0, + "step": 519 + }, + { + "epoch": 0.3951367781155015, + "grad_norm": 2.257706880569458, + "learning_rate": 4.910145790519177e-06, + "loss": 0.5300652980804443, + "mean_token_accuracy": 0.8192912936210632, + "num_tokens": 4739422.0, + "step": 520 + }, + { + "epoch": 0.3958966565349544, + "grad_norm": 1.2099462747573853, + "learning_rate": 4.9095884834582256e-06, + "loss": 0.45872747898101807, + "mean_token_accuracy": 0.8362667560577393, + "num_tokens": 4757113.0, + "step": 521 + }, + { + "epoch": 0.3966565349544073, + "grad_norm": 2.7991135120391846, + "learning_rate": 4.909029485255321e-06, + "loss": 0.49039560556411743, + "mean_token_accuracy": 0.8260016441345215, + "num_tokens": 4761709.0, + "step": 522 + }, + { + "epoch": 0.3974164133738602, + "grad_norm": 2.2360129356384277, + "learning_rate": 4.90846879630279e-06, + "loss": 0.49556830525398254, + "mean_token_accuracy": 0.827864408493042, + "num_tokens": 4769048.0, + "step": 523 + }, + { + "epoch": 0.3981762917933131, + "grad_norm": 2.5953688621520996, + "learning_rate": 4.907906416994146e-06, + "loss": 0.387208491563797, + "mean_token_accuracy": 0.8467001914978027, + "num_tokens": 4774637.0, + "step": 524 + }, + { + "epoch": 0.39893617021276595, + "grad_norm": 2.1046814918518066, + "learning_rate": 4.907342347724088e-06, + "loss": 0.5477259755134583, + "mean_token_accuracy": 0.8060322999954224, + "num_tokens": 4782774.0, + "step": 525 + }, + { + "epoch": 0.39969604863221886, + "grad_norm": 2.5622646808624268, + "learning_rate": 4.906776588888502e-06, + "loss": 0.5684159398078918, + "mean_token_accuracy": 0.8095303177833557, + "num_tokens": 4788766.0, + "step": 526 + }, + { + "epoch": 0.4004559270516717, + "grad_norm": 1.9027913808822632, + "learning_rate": 4.906209140884459e-06, + "loss": 0.535524845123291, + "mean_token_accuracy": 0.815237820148468, + "num_tokens": 4798492.0, + "step": 527 + }, + { + "epoch": 0.4012158054711246, + "grad_norm": 2.1447622776031494, + "learning_rate": 4.905640004110216e-06, + "loss": 0.5628632307052612, + "mean_token_accuracy": 0.8085395097732544, + "num_tokens": 4805737.0, + "step": 528 + }, + { + "epoch": 0.40197568389057753, + "grad_norm": 1.6754741668701172, + "learning_rate": 4.905069178965215e-06, + "loss": 0.5046736598014832, + "mean_token_accuracy": 0.8247535228729248, + "num_tokens": 4816912.0, + "step": 529 + }, + { + "epoch": 0.4027355623100304, + "grad_norm": 2.271230459213257, + "learning_rate": 4.904496665850083e-06, + "loss": 0.6086187958717346, + "mean_token_accuracy": 0.7935276627540588, + "num_tokens": 4824577.0, + "step": 530 + }, + { + "epoch": 0.4034954407294833, + "grad_norm": 2.107595205307007, + "learning_rate": 4.903922465166633e-06, + "loss": 0.5431341528892517, + "mean_token_accuracy": 0.8129537105560303, + "num_tokens": 4831772.0, + "step": 531 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 1.3860732316970825, + "learning_rate": 4.903346577317859e-06, + "loss": 0.45816320180892944, + "mean_token_accuracy": 0.8328287601470947, + "num_tokens": 4850302.0, + "step": 532 + }, + { + "epoch": 0.40501519756838905, + "grad_norm": 1.9186837673187256, + "learning_rate": 4.902769002707942e-06, + "loss": 0.3294633626937866, + "mean_token_accuracy": 0.8853933811187744, + "num_tokens": 4856624.0, + "step": 533 + }, + { + "epoch": 0.40577507598784196, + "grad_norm": 1.516194462776184, + "learning_rate": 4.902189741742247e-06, + "loss": 0.45482105016708374, + "mean_token_accuracy": 0.8370342254638672, + "num_tokens": 4870395.0, + "step": 534 + }, + { + "epoch": 0.4065349544072948, + "grad_norm": 2.3235628604888916, + "learning_rate": 4.901608794827321e-06, + "loss": 0.40688639879226685, + "mean_token_accuracy": 0.8643521666526794, + "num_tokens": 4875645.0, + "step": 535 + }, + { + "epoch": 0.4072948328267477, + "grad_norm": 2.29286527633667, + "learning_rate": 4.9010261623708945e-06, + "loss": 0.45482826232910156, + "mean_token_accuracy": 0.8429383039474487, + "num_tokens": 4881772.0, + "step": 536 + }, + { + "epoch": 0.40805471124620063, + "grad_norm": 1.5907070636749268, + "learning_rate": 4.900441844781882e-06, + "loss": 0.5266948342323303, + "mean_token_accuracy": 0.8348641395568848, + "num_tokens": 4894289.0, + "step": 537 + }, + { + "epoch": 0.4088145896656535, + "grad_norm": 2.1816294193267822, + "learning_rate": 4.89985584247038e-06, + "loss": 0.4797617793083191, + "mean_token_accuracy": 0.8549500703811646, + "num_tokens": 4901106.0, + "step": 538 + }, + { + "epoch": 0.4095744680851064, + "grad_norm": 1.7347146272659302, + "learning_rate": 4.899268155847667e-06, + "loss": 0.4754739999771118, + "mean_token_accuracy": 0.8278418183326721, + "num_tokens": 4912131.0, + "step": 539 + }, + { + "epoch": 0.41033434650455924, + "grad_norm": 2.0694527626037598, + "learning_rate": 4.898678785326205e-06, + "loss": 0.5071008801460266, + "mean_token_accuracy": 0.8157946467399597, + "num_tokens": 4921141.0, + "step": 540 + }, + { + "epoch": 0.41109422492401215, + "grad_norm": 2.570047616958618, + "learning_rate": 4.898087731319637e-06, + "loss": 0.43639278411865234, + "mean_token_accuracy": 0.8682913780212402, + "num_tokens": 4926182.0, + "step": 541 + }, + { + "epoch": 0.41185410334346506, + "grad_norm": 4.064006805419922, + "learning_rate": 4.8974949942427854e-06, + "loss": 0.539260745048523, + "mean_token_accuracy": 0.8225528001785278, + "num_tokens": 4929449.0, + "step": 542 + }, + { + "epoch": 0.4126139817629179, + "grad_norm": 1.7644332647323608, + "learning_rate": 4.896900574511657e-06, + "loss": 0.472618043422699, + "mean_token_accuracy": 0.8332902193069458, + "num_tokens": 4939443.0, + "step": 543 + }, + { + "epoch": 0.4133738601823708, + "grad_norm": 2.879918336868286, + "learning_rate": 4.89630447254344e-06, + "loss": 0.6360667943954468, + "mean_token_accuracy": 0.8215296268463135, + "num_tokens": 4950838.0, + "step": 544 + }, + { + "epoch": 0.41413373860182373, + "grad_norm": 1.4575570821762085, + "learning_rate": 4.8957066887565005e-06, + "loss": 0.45617997646331787, + "mean_token_accuracy": 0.8373187184333801, + "num_tokens": 4965222.0, + "step": 545 + }, + { + "epoch": 0.4148936170212766, + "grad_norm": 2.4829535484313965, + "learning_rate": 4.895107223570386e-06, + "loss": 0.42285341024398804, + "mean_token_accuracy": 0.8686380386352539, + "num_tokens": 4970724.0, + "step": 546 + }, + { + "epoch": 0.4156534954407295, + "grad_norm": 2.639474630355835, + "learning_rate": 4.894506077405824e-06, + "loss": 0.5906289219856262, + "mean_token_accuracy": 0.8174435496330261, + "num_tokens": 4976766.0, + "step": 547 + }, + { + "epoch": 0.41641337386018235, + "grad_norm": 2.7960562705993652, + "learning_rate": 4.893903250684723e-06, + "loss": 0.4518949091434479, + "mean_token_accuracy": 0.8387585282325745, + "num_tokens": 4980991.0, + "step": 548 + }, + { + "epoch": 0.41717325227963525, + "grad_norm": 2.184176206588745, + "learning_rate": 4.893298743830168e-06, + "loss": 0.5223842859268188, + "mean_token_accuracy": 0.8170937299728394, + "num_tokens": 4987781.0, + "step": 549 + }, + { + "epoch": 0.41793313069908816, + "grad_norm": 2.2393438816070557, + "learning_rate": 4.892692557266429e-06, + "loss": 0.5238431692123413, + "mean_token_accuracy": 0.8217905759811401, + "num_tokens": 4994321.0, + "step": 550 + }, + { + "epoch": 0.418693009118541, + "grad_norm": 3.579047441482544, + "learning_rate": 4.8920846914189465e-06, + "loss": 0.5367584228515625, + "mean_token_accuracy": 0.8312011361122131, + "num_tokens": 4997951.0, + "step": 551 + }, + { + "epoch": 0.4194528875379939, + "grad_norm": 1.6330240964889526, + "learning_rate": 4.891475146714348e-06, + "loss": 0.6054705381393433, + "mean_token_accuracy": 0.7938206791877747, + "num_tokens": 5012726.0, + "step": 552 + }, + { + "epoch": 0.42021276595744683, + "grad_norm": 1.5775716304779053, + "learning_rate": 4.8908639235804324e-06, + "loss": 0.4774656891822815, + "mean_token_accuracy": 0.828762948513031, + "num_tokens": 5026751.0, + "step": 553 + }, + { + "epoch": 0.4209726443768997, + "grad_norm": 1.5719101428985596, + "learning_rate": 4.890251022446181e-06, + "loss": 0.549429178237915, + "mean_token_accuracy": 0.8110791444778442, + "num_tokens": 5041861.0, + "step": 554 + }, + { + "epoch": 0.4217325227963526, + "grad_norm": 1.8585275411605835, + "learning_rate": 4.889636443741752e-06, + "loss": 0.4448118805885315, + "mean_token_accuracy": 0.8462690711021423, + "num_tokens": 5052690.0, + "step": 555 + }, + { + "epoch": 0.42249240121580545, + "grad_norm": 2.189202070236206, + "learning_rate": 4.88902018789848e-06, + "loss": 0.4296762943267822, + "mean_token_accuracy": 0.8488791584968567, + "num_tokens": 5058964.0, + "step": 556 + }, + { + "epoch": 0.42325227963525835, + "grad_norm": 1.9328460693359375, + "learning_rate": 4.888402255348877e-06, + "loss": 0.5369474291801453, + "mean_token_accuracy": 0.8184729814529419, + "num_tokens": 5068465.0, + "step": 557 + }, + { + "epoch": 0.42401215805471126, + "grad_norm": 1.6233323812484741, + "learning_rate": 4.887782646526631e-06, + "loss": 0.5284391641616821, + "mean_token_accuracy": 0.8276044726371765, + "num_tokens": 5081052.0, + "step": 558 + }, + { + "epoch": 0.4247720364741641, + "grad_norm": 2.222813844680786, + "learning_rate": 4.887161361866608e-06, + "loss": 0.5679137706756592, + "mean_token_accuracy": 0.8012375831604004, + "num_tokens": 5090001.0, + "step": 559 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.1062207221984863, + "learning_rate": 4.8865384018048494e-06, + "loss": 0.5554201602935791, + "mean_token_accuracy": 0.8128066062927246, + "num_tokens": 5097644.0, + "step": 560 + }, + { + "epoch": 0.42629179331306993, + "grad_norm": 1.5380984544754028, + "learning_rate": 4.8859137667785735e-06, + "loss": 0.4948265850543976, + "mean_token_accuracy": 0.8258291482925415, + "num_tokens": 5110069.0, + "step": 561 + }, + { + "epoch": 0.4270516717325228, + "grad_norm": 2.0290257930755615, + "learning_rate": 4.8852874572261715e-06, + "loss": 0.4969530403614044, + "mean_token_accuracy": 0.8297134637832642, + "num_tokens": 5117452.0, + "step": 562 + }, + { + "epoch": 0.4278115501519757, + "grad_norm": 1.5651452541351318, + "learning_rate": 4.884659473587213e-06, + "loss": 0.5353102087974548, + "mean_token_accuracy": 0.8161719441413879, + "num_tokens": 5133756.0, + "step": 563 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 2.2470998764038086, + "learning_rate": 4.884029816302441e-06, + "loss": 0.5104288458824158, + "mean_token_accuracy": 0.8081635236740112, + "num_tokens": 5140278.0, + "step": 564 + }, + { + "epoch": 0.42933130699088146, + "grad_norm": 1.726891279220581, + "learning_rate": 4.883398485813772e-06, + "loss": 0.4508771002292633, + "mean_token_accuracy": 0.8548800349235535, + "num_tokens": 5150115.0, + "step": 565 + }, + { + "epoch": 0.43009118541033436, + "grad_norm": 1.4779289960861206, + "learning_rate": 4.8827654825642984e-06, + "loss": 0.46861088275909424, + "mean_token_accuracy": 0.8209476470947266, + "num_tokens": 5163225.0, + "step": 566 + }, + { + "epoch": 0.4308510638297872, + "grad_norm": 1.2361034154891968, + "learning_rate": 4.882130806998287e-06, + "loss": 0.4591076672077179, + "mean_token_accuracy": 0.803041934967041, + "num_tokens": 5180342.0, + "step": 567 + }, + { + "epoch": 0.4316109422492401, + "grad_norm": 1.882467269897461, + "learning_rate": 4.881494459561177e-06, + "loss": 0.579258143901825, + "mean_token_accuracy": 0.8007112741470337, + "num_tokens": 5189595.0, + "step": 568 + }, + { + "epoch": 0.43237082066869303, + "grad_norm": 1.095462441444397, + "learning_rate": 4.880856440699582e-06, + "loss": 0.3806574046611786, + "mean_token_accuracy": 0.8650111556053162, + "num_tokens": 5211642.0, + "step": 569 + }, + { + "epoch": 0.4331306990881459, + "grad_norm": 1.6469846963882446, + "learning_rate": 4.880216750861288e-06, + "loss": 0.544589638710022, + "mean_token_accuracy": 0.8060122728347778, + "num_tokens": 5224137.0, + "step": 570 + }, + { + "epoch": 0.4338905775075988, + "grad_norm": 1.8561251163482666, + "learning_rate": 4.879575390495254e-06, + "loss": 0.4094924330711365, + "mean_token_accuracy": 0.8591406345367432, + "num_tokens": 5231588.0, + "step": 571 + }, + { + "epoch": 0.43465045592705165, + "grad_norm": 3.01326847076416, + "learning_rate": 4.878932360051611e-06, + "loss": 0.6139192581176758, + "mean_token_accuracy": 0.8108739852905273, + "num_tokens": 5236853.0, + "step": 572 + }, + { + "epoch": 0.43541033434650456, + "grad_norm": 2.1753034591674805, + "learning_rate": 4.878287659981663e-06, + "loss": 0.49082931876182556, + "mean_token_accuracy": 0.862828254699707, + "num_tokens": 5243264.0, + "step": 573 + }, + { + "epoch": 0.43617021276595747, + "grad_norm": 1.4437755346298218, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.5608728528022766, + "mean_token_accuracy": 0.8271626234054565, + "num_tokens": 5261757.0, + "step": 574 + }, + { + "epoch": 0.4369300911854103, + "grad_norm": 1.786683440208435, + "learning_rate": 4.876993252773923e-06, + "loss": 0.4377627968788147, + "mean_token_accuracy": 0.844936192035675, + "num_tokens": 5271038.0, + "step": 575 + }, + { + "epoch": 0.4376899696048632, + "grad_norm": 1.3425915241241455, + "learning_rate": 4.876343546544596e-06, + "loss": 0.44762521982192993, + "mean_token_accuracy": 0.8397793769836426, + "num_tokens": 5285555.0, + "step": 576 + }, + { + "epoch": 0.43844984802431614, + "grad_norm": 2.1549675464630127, + "learning_rate": 4.8756921725058935e-06, + "loss": 0.5332942008972168, + "mean_token_accuracy": 0.820149302482605, + "num_tokens": 5294595.0, + "step": 577 + }, + { + "epoch": 0.439209726443769, + "grad_norm": 1.5254042148590088, + "learning_rate": 4.875039131114975e-06, + "loss": 0.3646543622016907, + "mean_token_accuracy": 0.8442583084106445, + "num_tokens": 5304955.0, + "step": 578 + }, + { + "epoch": 0.4399696048632219, + "grad_norm": 1.5751557350158691, + "learning_rate": 4.8743844228301676e-06, + "loss": 0.4854734539985657, + "mean_token_accuracy": 0.8317523002624512, + "num_tokens": 5317351.0, + "step": 579 + }, + { + "epoch": 0.44072948328267475, + "grad_norm": 1.6950466632843018, + "learning_rate": 4.873728048110973e-06, + "loss": 0.5907570719718933, + "mean_token_accuracy": 0.7946986556053162, + "num_tokens": 5332542.0, + "step": 580 + }, + { + "epoch": 0.44148936170212766, + "grad_norm": 2.1180708408355713, + "learning_rate": 4.873070007418059e-06, + "loss": 0.5220296382904053, + "mean_token_accuracy": 0.8037363290786743, + "num_tokens": 5341722.0, + "step": 581 + }, + { + "epoch": 0.44224924012158057, + "grad_norm": 1.3643816709518433, + "learning_rate": 4.872410301213265e-06, + "loss": 0.4865502417087555, + "mean_token_accuracy": 0.8377852439880371, + "num_tokens": 5359359.0, + "step": 582 + }, + { + "epoch": 0.4430091185410334, + "grad_norm": 1.483280897140503, + "learning_rate": 4.871748929959598e-06, + "loss": 0.36856764554977417, + "mean_token_accuracy": 0.8709549903869629, + "num_tokens": 5369749.0, + "step": 583 + }, + { + "epoch": 0.44376899696048633, + "grad_norm": 1.6891541481018066, + "learning_rate": 4.871085894121234e-06, + "loss": 0.5768930912017822, + "mean_token_accuracy": 0.8030461668968201, + "num_tokens": 5383912.0, + "step": 584 + }, + { + "epoch": 0.44452887537993924, + "grad_norm": 2.1318740844726562, + "learning_rate": 4.870421194163515e-06, + "loss": 0.4337100386619568, + "mean_token_accuracy": 0.8562518358230591, + "num_tokens": 5389412.0, + "step": 585 + }, + { + "epoch": 0.4452887537993921, + "grad_norm": 2.540255546569824, + "learning_rate": 4.869754830552956e-06, + "loss": 0.4708256125450134, + "mean_token_accuracy": 0.8446552753448486, + "num_tokens": 5394762.0, + "step": 586 + }, + { + "epoch": 0.446048632218845, + "grad_norm": 2.048015594482422, + "learning_rate": 4.869086803757235e-06, + "loss": 0.5265800952911377, + "mean_token_accuracy": 0.8181137442588806, + "num_tokens": 5402379.0, + "step": 587 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 2.9821012020111084, + "learning_rate": 4.868417114245199e-06, + "loss": 0.6299797296524048, + "mean_token_accuracy": 0.8237329125404358, + "num_tokens": 5408229.0, + "step": 588 + }, + { + "epoch": 0.44756838905775076, + "grad_norm": 1.7807202339172363, + "learning_rate": 4.867745762486862e-06, + "loss": 0.5176759958267212, + "mean_token_accuracy": 0.8184244632720947, + "num_tokens": 5418383.0, + "step": 589 + }, + { + "epoch": 0.44832826747720367, + "grad_norm": 1.5466399192810059, + "learning_rate": 4.8670727489534035e-06, + "loss": 0.5137228965759277, + "mean_token_accuracy": 0.8365053534507751, + "num_tokens": 5432127.0, + "step": 590 + }, + { + "epoch": 0.4490881458966565, + "grad_norm": 2.9521141052246094, + "learning_rate": 4.866398074117173e-06, + "loss": 0.4056887924671173, + "mean_token_accuracy": 0.8561501502990723, + "num_tokens": 5436062.0, + "step": 591 + }, + { + "epoch": 0.44984802431610943, + "grad_norm": 2.058743953704834, + "learning_rate": 4.86572173845168e-06, + "loss": 0.6124799251556396, + "mean_token_accuracy": 0.8007957339286804, + "num_tokens": 5444989.0, + "step": 592 + }, + { + "epoch": 0.4506079027355623, + "grad_norm": 2.1243767738342285, + "learning_rate": 4.865043742431605e-06, + "loss": 0.5659694671630859, + "mean_token_accuracy": 0.8084750175476074, + "num_tokens": 5453865.0, + "step": 593 + }, + { + "epoch": 0.4513677811550152, + "grad_norm": 1.6732314825057983, + "learning_rate": 4.864364086532792e-06, + "loss": 0.47879064083099365, + "mean_token_accuracy": 0.8346436023712158, + "num_tokens": 5466398.0, + "step": 594 + }, + { + "epoch": 0.4521276595744681, + "grad_norm": 1.3793858289718628, + "learning_rate": 4.863682771232249e-06, + "loss": 0.45989373326301575, + "mean_token_accuracy": 0.8254791498184204, + "num_tokens": 5482121.0, + "step": 595 + }, + { + "epoch": 0.45288753799392095, + "grad_norm": 1.9812315702438354, + "learning_rate": 4.862999797008149e-06, + "loss": 0.5778874754905701, + "mean_token_accuracy": 0.8041508197784424, + "num_tokens": 5493000.0, + "step": 596 + }, + { + "epoch": 0.45364741641337386, + "grad_norm": 3.3065083026885986, + "learning_rate": 4.862315164339829e-06, + "loss": 0.4623975157737732, + "mean_token_accuracy": 0.8426318168640137, + "num_tokens": 5496723.0, + "step": 597 + }, + { + "epoch": 0.45440729483282677, + "grad_norm": 3.167119026184082, + "learning_rate": 4.861628873707792e-06, + "loss": 0.6984533667564392, + "mean_token_accuracy": 0.772136926651001, + "num_tokens": 5501161.0, + "step": 598 + }, + { + "epoch": 0.4551671732522796, + "grad_norm": 2.2130985260009766, + "learning_rate": 4.860940925593703e-06, + "loss": 0.4823192059993744, + "mean_token_accuracy": 0.8462972640991211, + "num_tokens": 5509544.0, + "step": 599 + }, + { + "epoch": 0.45592705167173253, + "grad_norm": 3.029191732406616, + "learning_rate": 4.86025132048039e-06, + "loss": 0.523664116859436, + "mean_token_accuracy": 0.8229140043258667, + "num_tokens": 5514586.0, + "step": 600 + }, + { + "epoch": 0.4566869300911854, + "grad_norm": 1.6983962059020996, + "learning_rate": 4.859560058851844e-06, + "loss": 0.4832698106765747, + "mean_token_accuracy": 0.8403248190879822, + "num_tokens": 5525773.0, + "step": 601 + }, + { + "epoch": 0.4574468085106383, + "grad_norm": 3.0504038333892822, + "learning_rate": 4.8588671411932195e-06, + "loss": 0.5158926248550415, + "mean_token_accuracy": 0.8098392486572266, + "num_tokens": 5529739.0, + "step": 602 + }, + { + "epoch": 0.4582066869300912, + "grad_norm": 2.584836483001709, + "learning_rate": 4.858172567990832e-06, + "loss": 0.5724587440490723, + "mean_token_accuracy": 0.8128519058227539, + "num_tokens": 5535763.0, + "step": 603 + }, + { + "epoch": 0.45896656534954405, + "grad_norm": 2.0514042377471924, + "learning_rate": 4.857476339732162e-06, + "loss": 0.4337679445743561, + "mean_token_accuracy": 0.8405929207801819, + "num_tokens": 5543075.0, + "step": 604 + }, + { + "epoch": 0.45972644376899696, + "grad_norm": 2.2949347496032715, + "learning_rate": 4.856778456905846e-06, + "loss": 0.46532145142555237, + "mean_token_accuracy": 0.8345137238502502, + "num_tokens": 5549035.0, + "step": 605 + }, + { + "epoch": 0.46048632218844987, + "grad_norm": 2.2067551612854004, + "learning_rate": 4.856078920001689e-06, + "loss": 0.5855136513710022, + "mean_token_accuracy": 0.8043795228004456, + "num_tokens": 5555545.0, + "step": 606 + }, + { + "epoch": 0.4612462006079027, + "grad_norm": 2.101945161819458, + "learning_rate": 4.855377729510648e-06, + "loss": 0.6071814298629761, + "mean_token_accuracy": 0.7973253130912781, + "num_tokens": 5563615.0, + "step": 607 + }, + { + "epoch": 0.46200607902735563, + "grad_norm": 2.5958821773529053, + "learning_rate": 4.8546748859248504e-06, + "loss": 0.6278061866760254, + "mean_token_accuracy": 0.7864972352981567, + "num_tokens": 5570078.0, + "step": 608 + }, + { + "epoch": 0.4627659574468085, + "grad_norm": 2.778101921081543, + "learning_rate": 4.853970389737576e-06, + "loss": 0.35521194338798523, + "mean_token_accuracy": 0.8752605319023132, + "num_tokens": 5573995.0, + "step": 609 + }, + { + "epoch": 0.4635258358662614, + "grad_norm": 2.600534677505493, + "learning_rate": 4.8532642414432675e-06, + "loss": 0.6541563868522644, + "mean_token_accuracy": 0.7843613028526306, + "num_tokens": 5580333.0, + "step": 610 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 1.778337836265564, + "learning_rate": 4.852556441537528e-06, + "loss": 0.3561405837535858, + "mean_token_accuracy": 0.8579353094100952, + "num_tokens": 5588430.0, + "step": 611 + }, + { + "epoch": 0.46504559270516715, + "grad_norm": 1.5653862953186035, + "learning_rate": 4.851846990517118e-06, + "loss": 0.6067906618118286, + "mean_token_accuracy": 0.7919317483901978, + "num_tokens": 5601700.0, + "step": 612 + }, + { + "epoch": 0.46580547112462006, + "grad_norm": 1.6097723245620728, + "learning_rate": 4.851135888879958e-06, + "loss": 0.446664422750473, + "mean_token_accuracy": 0.8441969156265259, + "num_tokens": 5612063.0, + "step": 613 + }, + { + "epoch": 0.46656534954407297, + "grad_norm": 1.961207389831543, + "learning_rate": 4.850423137125126e-06, + "loss": 0.5508605241775513, + "mean_token_accuracy": 0.8240450024604797, + "num_tokens": 5620245.0, + "step": 614 + }, + { + "epoch": 0.4673252279635258, + "grad_norm": 2.2189085483551025, + "learning_rate": 4.8497087357528585e-06, + "loss": 0.6805076599121094, + "mean_token_accuracy": 0.771978497505188, + "num_tokens": 5629590.0, + "step": 615 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 2.5176279544830322, + "learning_rate": 4.8489926852645505e-06, + "loss": 0.4512156844139099, + "mean_token_accuracy": 0.836459755897522, + "num_tokens": 5635259.0, + "step": 616 + }, + { + "epoch": 0.4688449848024316, + "grad_norm": 1.5327287912368774, + "learning_rate": 4.848274986162754e-06, + "loss": 0.4884302616119385, + "mean_token_accuracy": 0.8194037079811096, + "num_tokens": 5649993.0, + "step": 617 + }, + { + "epoch": 0.4696048632218845, + "grad_norm": 2.184554100036621, + "learning_rate": 4.847555638951177e-06, + "loss": 0.5141451358795166, + "mean_token_accuracy": 0.8245922327041626, + "num_tokens": 5657375.0, + "step": 618 + }, + { + "epoch": 0.4703647416413374, + "grad_norm": 1.6143407821655273, + "learning_rate": 4.846834644134686e-06, + "loss": 0.4276641607284546, + "mean_token_accuracy": 0.8481845855712891, + "num_tokens": 5667941.0, + "step": 619 + }, + { + "epoch": 0.47112462006079026, + "grad_norm": 2.3747270107269287, + "learning_rate": 4.846112002219301e-06, + "loss": 0.5608246922492981, + "mean_token_accuracy": 0.8073011040687561, + "num_tokens": 5675042.0, + "step": 620 + }, + { + "epoch": 0.47188449848024316, + "grad_norm": 2.390404224395752, + "learning_rate": 4.845387713712203e-06, + "loss": 0.46616724133491516, + "mean_token_accuracy": 0.8468319177627563, + "num_tokens": 5680207.0, + "step": 621 + }, + { + "epoch": 0.4726443768996961, + "grad_norm": 1.7245099544525146, + "learning_rate": 4.844661779121723e-06, + "loss": 0.5652435421943665, + "mean_token_accuracy": 0.8010749816894531, + "num_tokens": 5693759.0, + "step": 622 + }, + { + "epoch": 0.4734042553191489, + "grad_norm": 2.6923108100891113, + "learning_rate": 4.843934198957351e-06, + "loss": 0.6254661679267883, + "mean_token_accuracy": 0.8236024975776672, + "num_tokens": 5699916.0, + "step": 623 + }, + { + "epoch": 0.47416413373860183, + "grad_norm": 2.516901969909668, + "learning_rate": 4.84320497372973e-06, + "loss": 0.6334252953529358, + "mean_token_accuracy": 0.7803834676742554, + "num_tokens": 5706554.0, + "step": 624 + }, + { + "epoch": 0.4749240121580547, + "grad_norm": 2.3744447231292725, + "learning_rate": 4.842474103950658e-06, + "loss": 0.4221811890602112, + "mean_token_accuracy": 0.8639545440673828, + "num_tokens": 5711756.0, + "step": 625 + }, + { + "epoch": 0.4756838905775076, + "grad_norm": 3.2373476028442383, + "learning_rate": 4.841741590133089e-06, + "loss": 0.6637828946113586, + "mean_token_accuracy": 0.7968347072601318, + "num_tokens": 5716458.0, + "step": 626 + }, + { + "epoch": 0.4764437689969605, + "grad_norm": 2.153888463973999, + "learning_rate": 4.841007432791129e-06, + "loss": 0.4877486228942871, + "mean_token_accuracy": 0.8345249891281128, + "num_tokens": 5723155.0, + "step": 627 + }, + { + "epoch": 0.47720364741641336, + "grad_norm": 2.120497703552246, + "learning_rate": 4.8402716324400375e-06, + "loss": 0.37323033809661865, + "mean_token_accuracy": 0.8734050393104553, + "num_tokens": 5729171.0, + "step": 628 + }, + { + "epoch": 0.47796352583586627, + "grad_norm": 1.5294172763824463, + "learning_rate": 4.839534189596228e-06, + "loss": 0.4057067334651947, + "mean_token_accuracy": 0.8523319959640503, + "num_tokens": 5740112.0, + "step": 629 + }, + { + "epoch": 0.4787234042553192, + "grad_norm": 2.1913886070251465, + "learning_rate": 4.8387951047772656e-06, + "loss": 0.4835960865020752, + "mean_token_accuracy": 0.8438145518302917, + "num_tokens": 5746838.0, + "step": 630 + }, + { + "epoch": 0.479483282674772, + "grad_norm": 1.482897162437439, + "learning_rate": 4.838054378501868e-06, + "loss": 0.46967992186546326, + "mean_token_accuracy": 0.8315759897232056, + "num_tokens": 5760428.0, + "step": 631 + }, + { + "epoch": 0.48024316109422494, + "grad_norm": 1.38850998878479, + "learning_rate": 4.837312011289907e-06, + "loss": 0.41845446825027466, + "mean_token_accuracy": 0.8557186126708984, + "num_tokens": 5773437.0, + "step": 632 + }, + { + "epoch": 0.4810030395136778, + "grad_norm": 3.8337457180023193, + "learning_rate": 4.836568003662403e-06, + "loss": 0.5102912187576294, + "mean_token_accuracy": 0.830644965171814, + "num_tokens": 5776367.0, + "step": 633 + }, + { + "epoch": 0.4817629179331307, + "grad_norm": 1.2084007263183594, + "learning_rate": 4.8358223561415304e-06, + "loss": 0.3835333585739136, + "mean_token_accuracy": 0.8639016151428223, + "num_tokens": 5792246.0, + "step": 634 + }, + { + "epoch": 0.4825227963525836, + "grad_norm": 1.939408540725708, + "learning_rate": 4.835075069250613e-06, + "loss": 0.4044850468635559, + "mean_token_accuracy": 0.8488376140594482, + "num_tokens": 5799853.0, + "step": 635 + }, + { + "epoch": 0.48328267477203646, + "grad_norm": 1.345870852470398, + "learning_rate": 4.8343261435141245e-06, + "loss": 0.46660199761390686, + "mean_token_accuracy": 0.8371681571006775, + "num_tokens": 5817478.0, + "step": 636 + }, + { + "epoch": 0.48404255319148937, + "grad_norm": 1.6531339883804321, + "learning_rate": 4.833575579457691e-06, + "loss": 0.3886989951133728, + "mean_token_accuracy": 0.8763507008552551, + "num_tokens": 5825739.0, + "step": 637 + }, + { + "epoch": 0.4848024316109423, + "grad_norm": 1.6443969011306763, + "learning_rate": 4.832823377608088e-06, + "loss": 0.4070289731025696, + "mean_token_accuracy": 0.8586630821228027, + "num_tokens": 5837917.0, + "step": 638 + }, + { + "epoch": 0.48556231003039513, + "grad_norm": 2.005136013031006, + "learning_rate": 4.832069538493237e-06, + "loss": 0.40616685152053833, + "mean_token_accuracy": 0.8571510314941406, + "num_tokens": 5845250.0, + "step": 639 + }, + { + "epoch": 0.48632218844984804, + "grad_norm": 1.5244266986846924, + "learning_rate": 4.831314062642213e-06, + "loss": 0.49530288577079773, + "mean_token_accuracy": 0.8328841924667358, + "num_tokens": 5857407.0, + "step": 640 + }, + { + "epoch": 0.4870820668693009, + "grad_norm": 1.9876971244812012, + "learning_rate": 4.830556950585239e-06, + "loss": 0.4583776593208313, + "mean_token_accuracy": 0.8427221179008484, + "num_tokens": 5865391.0, + "step": 641 + }, + { + "epoch": 0.4878419452887538, + "grad_norm": 3.023336172103882, + "learning_rate": 4.829798202853683e-06, + "loss": 0.6134771108627319, + "mean_token_accuracy": 0.7981935739517212, + "num_tokens": 5870729.0, + "step": 642 + }, + { + "epoch": 0.4886018237082067, + "grad_norm": 1.8889515399932861, + "learning_rate": 4.829037819980065e-06, + "loss": 0.4420135021209717, + "mean_token_accuracy": 0.8480775356292725, + "num_tokens": 5878982.0, + "step": 643 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.2408435344696045, + "learning_rate": 4.828275802498051e-06, + "loss": 0.525706946849823, + "mean_token_accuracy": 0.8271557092666626, + "num_tokens": 5885097.0, + "step": 644 + }, + { + "epoch": 0.49012158054711247, + "grad_norm": 1.9734224081039429, + "learning_rate": 4.827512150942454e-06, + "loss": 0.44246578216552734, + "mean_token_accuracy": 0.8456668257713318, + "num_tokens": 5893941.0, + "step": 645 + }, + { + "epoch": 0.4908814589665654, + "grad_norm": 1.9618173837661743, + "learning_rate": 4.8267468658492335e-06, + "loss": 0.5119768381118774, + "mean_token_accuracy": 0.8355510830879211, + "num_tokens": 5902829.0, + "step": 646 + }, + { + "epoch": 0.49164133738601823, + "grad_norm": 1.7181587219238281, + "learning_rate": 4.825979947755496e-06, + "loss": 0.5666520595550537, + "mean_token_accuracy": 0.7951971888542175, + "num_tokens": 5915212.0, + "step": 647 + }, + { + "epoch": 0.49240121580547114, + "grad_norm": 3.0121164321899414, + "learning_rate": 4.8252113971994955e-06, + "loss": 0.628632128238678, + "mean_token_accuracy": 0.8041050434112549, + "num_tokens": 5921410.0, + "step": 648 + }, + { + "epoch": 0.493161094224924, + "grad_norm": 2.9980475902557373, + "learning_rate": 4.824441214720629e-06, + "loss": 0.4507424831390381, + "mean_token_accuracy": 0.8636263608932495, + "num_tokens": 5925179.0, + "step": 649 + }, + { + "epoch": 0.4939209726443769, + "grad_norm": 2.0096445083618164, + "learning_rate": 4.823669400859441e-06, + "loss": 0.602759838104248, + "mean_token_accuracy": 0.8104915618896484, + "num_tokens": 5934160.0, + "step": 650 + }, + { + "epoch": 0.4946808510638298, + "grad_norm": 1.1186442375183105, + "learning_rate": 4.8228959561576195e-06, + "loss": 0.41168469190597534, + "mean_token_accuracy": 0.8461419939994812, + "num_tokens": 5954163.0, + "step": 651 + }, + { + "epoch": 0.49544072948328266, + "grad_norm": 1.855465054512024, + "learning_rate": 4.822120881157998e-06, + "loss": 0.5049735307693481, + "mean_token_accuracy": 0.8225747346878052, + "num_tokens": 5963840.0, + "step": 652 + }, + { + "epoch": 0.49620060790273557, + "grad_norm": 3.550563335418701, + "learning_rate": 4.821344176404554e-06, + "loss": 0.49025264382362366, + "mean_token_accuracy": 0.8265978693962097, + "num_tokens": 5967358.0, + "step": 653 + }, + { + "epoch": 0.4969604863221885, + "grad_norm": 3.063910484313965, + "learning_rate": 4.820565842442408e-06, + "loss": 0.5652767419815063, + "mean_token_accuracy": 0.811700701713562, + "num_tokens": 5971858.0, + "step": 654 + }, + { + "epoch": 0.49772036474164133, + "grad_norm": 2.4613308906555176, + "learning_rate": 4.819785879817827e-06, + "loss": 0.5296125411987305, + "mean_token_accuracy": 0.8336488008499146, + "num_tokens": 5977442.0, + "step": 655 + }, + { + "epoch": 0.49848024316109424, + "grad_norm": 2.342519760131836, + "learning_rate": 4.819004289078217e-06, + "loss": 0.5753380060195923, + "mean_token_accuracy": 0.7922406792640686, + "num_tokens": 5984531.0, + "step": 656 + }, + { + "epoch": 0.4992401215805471, + "grad_norm": 2.0410680770874023, + "learning_rate": 4.818221070772129e-06, + "loss": 0.5433275699615479, + "mean_token_accuracy": 0.8043830990791321, + "num_tokens": 5992642.0, + "step": 657 + }, + { + "epoch": 0.5, + "grad_norm": 1.4999698400497437, + "learning_rate": 4.8174362254492555e-06, + "loss": 0.5248899459838867, + "mean_token_accuracy": 0.8107168674468994, + "num_tokens": 6005543.0, + "step": 658 + }, + { + "epoch": 0.5007598784194529, + "grad_norm": 1.9494401216506958, + "learning_rate": 4.816649753660431e-06, + "loss": 0.41291385889053345, + "mean_token_accuracy": 0.8650569915771484, + "num_tokens": 6012185.0, + "step": 659 + }, + { + "epoch": 0.5015197568389058, + "grad_norm": 2.7514095306396484, + "learning_rate": 4.815861655957632e-06, + "loss": 0.4244142770767212, + "mean_token_accuracy": 0.8485112190246582, + "num_tokens": 6016809.0, + "step": 660 + }, + { + "epoch": 0.5022796352583586, + "grad_norm": 1.4354928731918335, + "learning_rate": 4.815071932893976e-06, + "loss": 0.4332060217857361, + "mean_token_accuracy": 0.8386815786361694, + "num_tokens": 6034795.0, + "step": 661 + }, + { + "epoch": 0.5030395136778115, + "grad_norm": 1.3113417625427246, + "learning_rate": 4.81428058502372e-06, + "loss": 0.5415540933609009, + "mean_token_accuracy": 0.8115285038948059, + "num_tokens": 6053624.0, + "step": 662 + }, + { + "epoch": 0.5037993920972644, + "grad_norm": 1.820868730545044, + "learning_rate": 4.813487612902265e-06, + "loss": 0.5360245108604431, + "mean_token_accuracy": 0.8313555717468262, + "num_tokens": 6063399.0, + "step": 663 + }, + { + "epoch": 0.5045592705167173, + "grad_norm": 2.347001552581787, + "learning_rate": 4.812693017086145e-06, + "loss": 0.4926982820034027, + "mean_token_accuracy": 0.8137006759643555, + "num_tokens": 6070111.0, + "step": 664 + }, + { + "epoch": 0.5053191489361702, + "grad_norm": 1.8830888271331787, + "learning_rate": 4.811896798133042e-06, + "loss": 0.5419014692306519, + "mean_token_accuracy": 0.8027454614639282, + "num_tokens": 6081090.0, + "step": 665 + }, + { + "epoch": 0.506079027355623, + "grad_norm": 2.3258056640625, + "learning_rate": 4.811098956601772e-06, + "loss": 0.4629337787628174, + "mean_token_accuracy": 0.8416580557823181, + "num_tokens": 6087921.0, + "step": 666 + }, + { + "epoch": 0.506838905775076, + "grad_norm": 1.9578291177749634, + "learning_rate": 4.810299493052289e-06, + "loss": 0.40305402874946594, + "mean_token_accuracy": 0.8529061079025269, + "num_tokens": 6100034.0, + "step": 667 + }, + { + "epoch": 0.5075987841945289, + "grad_norm": 2.800635576248169, + "learning_rate": 4.809498408045691e-06, + "loss": 0.5087342262268066, + "mean_token_accuracy": 0.8214689493179321, + "num_tokens": 6104742.0, + "step": 668 + }, + { + "epoch": 0.5083586626139818, + "grad_norm": 1.5318149328231812, + "learning_rate": 4.808695702144206e-06, + "loss": 0.4733222723007202, + "mean_token_accuracy": 0.837577223777771, + "num_tokens": 6117242.0, + "step": 669 + }, + { + "epoch": 0.5091185410334347, + "grad_norm": 1.2368661165237427, + "learning_rate": 4.807891375911207e-06, + "loss": 0.3929097056388855, + "mean_token_accuracy": 0.8331400752067566, + "num_tokens": 6133509.0, + "step": 670 + }, + { + "epoch": 0.5098784194528876, + "grad_norm": 2.4711415767669678, + "learning_rate": 4.8070854299112e-06, + "loss": 0.6294851303100586, + "mean_token_accuracy": 0.7956781983375549, + "num_tokens": 6140294.0, + "step": 671 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.590961217880249, + "learning_rate": 4.806277864709828e-06, + "loss": 0.580160915851593, + "mean_token_accuracy": 0.809589684009552, + "num_tokens": 6145803.0, + "step": 672 + }, + { + "epoch": 0.5113981762917933, + "grad_norm": 2.4653842449188232, + "learning_rate": 4.805468680873874e-06, + "loss": 0.5262120366096497, + "mean_token_accuracy": 0.822458803653717, + "num_tokens": 6151236.0, + "step": 673 + }, + { + "epoch": 0.5121580547112462, + "grad_norm": 2.860720157623291, + "learning_rate": 4.804657878971252e-06, + "loss": 0.4007391035556793, + "mean_token_accuracy": 0.8637382984161377, + "num_tokens": 6155310.0, + "step": 674 + }, + { + "epoch": 0.5129179331306991, + "grad_norm": 2.520282030105591, + "learning_rate": 4.803845459571014e-06, + "loss": 0.45798182487487793, + "mean_token_accuracy": 0.8270114660263062, + "num_tokens": 6160326.0, + "step": 675 + }, + { + "epoch": 0.513677811550152, + "grad_norm": 2.7290921211242676, + "learning_rate": 4.803031423243349e-06, + "loss": 0.5745848417282104, + "mean_token_accuracy": 0.8401234745979309, + "num_tokens": 6165709.0, + "step": 676 + }, + { + "epoch": 0.5144376899696048, + "grad_norm": 1.6678650379180908, + "learning_rate": 4.802215770559578e-06, + "loss": 0.5257721543312073, + "mean_token_accuracy": 0.8241991996765137, + "num_tokens": 6177875.0, + "step": 677 + }, + { + "epoch": 0.5151975683890577, + "grad_norm": 2.1720468997955322, + "learning_rate": 4.801398502092156e-06, + "loss": 0.45342206954956055, + "mean_token_accuracy": 0.8463799953460693, + "num_tokens": 6185415.0, + "step": 678 + }, + { + "epoch": 0.5159574468085106, + "grad_norm": 2.282259702682495, + "learning_rate": 4.800579618414677e-06, + "loss": 0.4864169955253601, + "mean_token_accuracy": 0.8300632238388062, + "num_tokens": 6191832.0, + "step": 679 + }, + { + "epoch": 0.5167173252279635, + "grad_norm": 2.0092248916625977, + "learning_rate": 4.799759120101861e-06, + "loss": 0.5781463980674744, + "mean_token_accuracy": 0.8267031908035278, + "num_tokens": 6199440.0, + "step": 680 + }, + { + "epoch": 0.5174772036474165, + "grad_norm": 1.396580696105957, + "learning_rate": 4.798937007729568e-06, + "loss": 0.49689239263534546, + "mean_token_accuracy": 0.8257499933242798, + "num_tokens": 6213840.0, + "step": 681 + }, + { + "epoch": 0.5182370820668692, + "grad_norm": 1.9060769081115723, + "learning_rate": 4.798113281874788e-06, + "loss": 0.48969539999961853, + "mean_token_accuracy": 0.8171790838241577, + "num_tokens": 6223006.0, + "step": 682 + }, + { + "epoch": 0.5189969604863222, + "grad_norm": 1.6255282163619995, + "learning_rate": 4.797287943115642e-06, + "loss": 0.5532330870628357, + "mean_token_accuracy": 0.8173393607139587, + "num_tokens": 6234857.0, + "step": 683 + }, + { + "epoch": 0.5197568389057751, + "grad_norm": 1.6923905611038208, + "learning_rate": 4.796460992031386e-06, + "loss": 0.4880887269973755, + "mean_token_accuracy": 0.834983229637146, + "num_tokens": 6245252.0, + "step": 684 + }, + { + "epoch": 0.520516717325228, + "grad_norm": 2.13161301612854, + "learning_rate": 4.7956324292024045e-06, + "loss": 0.5687593817710876, + "mean_token_accuracy": 0.7996571063995361, + "num_tokens": 6253726.0, + "step": 685 + }, + { + "epoch": 0.5212765957446809, + "grad_norm": 2.509375810623169, + "learning_rate": 4.794802255210217e-06, + "loss": 0.5396929979324341, + "mean_token_accuracy": 0.8007107973098755, + "num_tokens": 6259238.0, + "step": 686 + }, + { + "epoch": 0.5220364741641338, + "grad_norm": 2.393710136413574, + "learning_rate": 4.793970470637469e-06, + "loss": 0.6165191531181335, + "mean_token_accuracy": 0.7891418933868408, + "num_tokens": 6266325.0, + "step": 687 + }, + { + "epoch": 0.5227963525835866, + "grad_norm": 1.511647343635559, + "learning_rate": 4.7931370760679415e-06, + "loss": 0.4773876965045929, + "mean_token_accuracy": 0.8381044864654541, + "num_tokens": 6277447.0, + "step": 688 + }, + { + "epoch": 0.5235562310030395, + "grad_norm": 2.206587314605713, + "learning_rate": 4.792302072086542e-06, + "loss": 0.5482058525085449, + "mean_token_accuracy": 0.8239108920097351, + "num_tokens": 6285163.0, + "step": 689 + }, + { + "epoch": 0.5243161094224924, + "grad_norm": 3.018146514892578, + "learning_rate": 4.7914654592793065e-06, + "loss": 0.4880615472793579, + "mean_token_accuracy": 0.8361308574676514, + "num_tokens": 6289386.0, + "step": 690 + }, + { + "epoch": 0.5250759878419453, + "grad_norm": 1.6469231843948364, + "learning_rate": 4.790627238233405e-06, + "loss": 0.4164774715900421, + "mean_token_accuracy": 0.8496290445327759, + "num_tokens": 6298915.0, + "step": 691 + }, + { + "epoch": 0.5258358662613982, + "grad_norm": 2.352505922317505, + "learning_rate": 4.789787409537131e-06, + "loss": 0.5366303324699402, + "mean_token_accuracy": 0.8350417613983154, + "num_tokens": 6306130.0, + "step": 692 + }, + { + "epoch": 0.526595744680851, + "grad_norm": 1.7463021278381348, + "learning_rate": 4.7889459737799105e-06, + "loss": 0.4389137923717499, + "mean_token_accuracy": 0.8463300466537476, + "num_tokens": 6315503.0, + "step": 693 + }, + { + "epoch": 0.5273556231003039, + "grad_norm": 2.257706642150879, + "learning_rate": 4.788102931552294e-06, + "loss": 0.5309344530105591, + "mean_token_accuracy": 0.8164352178573608, + "num_tokens": 6321852.0, + "step": 694 + }, + { + "epoch": 0.5281155015197568, + "grad_norm": 2.392732620239258, + "learning_rate": 4.787258283445962e-06, + "loss": 0.3956204056739807, + "mean_token_accuracy": 0.8671456575393677, + "num_tokens": 6327380.0, + "step": 695 + }, + { + "epoch": 0.5288753799392097, + "grad_norm": 2.210514545440674, + "learning_rate": 4.786412030053721e-06, + "loss": 0.4842875003814697, + "mean_token_accuracy": 0.8508446216583252, + "num_tokens": 6334898.0, + "step": 696 + }, + { + "epoch": 0.5296352583586627, + "grad_norm": 1.8678946495056152, + "learning_rate": 4.785564171969503e-06, + "loss": 0.47399595379829407, + "mean_token_accuracy": 0.8514996767044067, + "num_tokens": 6346374.0, + "step": 697 + }, + { + "epoch": 0.5303951367781155, + "grad_norm": 2.604079484939575, + "learning_rate": 4.784714709788368e-06, + "loss": 0.5950228571891785, + "mean_token_accuracy": 0.7983481884002686, + "num_tokens": 6351648.0, + "step": 698 + }, + { + "epoch": 0.5311550151975684, + "grad_norm": 1.662381649017334, + "learning_rate": 4.783863644106502e-06, + "loss": 0.41616758704185486, + "mean_token_accuracy": 0.8554803133010864, + "num_tokens": 6360506.0, + "step": 699 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 1.6300342082977295, + "learning_rate": 4.783010975521216e-06, + "loss": 0.43029269576072693, + "mean_token_accuracy": 0.8443028926849365, + "num_tokens": 6370675.0, + "step": 700 + }, + { + "epoch": 0.5326747720364742, + "grad_norm": 1.731873869895935, + "learning_rate": 4.782156704630944e-06, + "loss": 0.4383814334869385, + "mean_token_accuracy": 0.8443183898925781, + "num_tokens": 6381803.0, + "step": 701 + }, + { + "epoch": 0.5334346504559271, + "grad_norm": 3.1788413524627686, + "learning_rate": 4.7813008320352475e-06, + "loss": 0.32194480299949646, + "mean_token_accuracy": 0.8870962858200073, + "num_tokens": 6389263.0, + "step": 702 + }, + { + "epoch": 0.53419452887538, + "grad_norm": 2.099513530731201, + "learning_rate": 4.78044335833481e-06, + "loss": 0.36962923407554626, + "mean_token_accuracy": 0.8661133646965027, + "num_tokens": 6395589.0, + "step": 703 + }, + { + "epoch": 0.5349544072948328, + "grad_norm": 1.4859435558319092, + "learning_rate": 4.77958428413144e-06, + "loss": 0.4619954824447632, + "mean_token_accuracy": 0.8438555002212524, + "num_tokens": 6407470.0, + "step": 704 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 1.2561073303222656, + "learning_rate": 4.7787236100280685e-06, + "loss": 0.3770977258682251, + "mean_token_accuracy": 0.8515733480453491, + "num_tokens": 6422888.0, + "step": 705 + }, + { + "epoch": 0.5364741641337386, + "grad_norm": 1.4455817937850952, + "learning_rate": 4.777861336628751e-06, + "loss": 0.46481069922447205, + "mean_token_accuracy": 0.8502002954483032, + "num_tokens": 6441266.0, + "step": 706 + }, + { + "epoch": 0.5372340425531915, + "grad_norm": 1.1387295722961426, + "learning_rate": 4.7769974645386616e-06, + "loss": 0.36964765191078186, + "mean_token_accuracy": 0.8719524145126343, + "num_tokens": 6463686.0, + "step": 707 + }, + { + "epoch": 0.5379939209726444, + "grad_norm": 1.7179663181304932, + "learning_rate": 4.776131994364102e-06, + "loss": 0.4231719970703125, + "mean_token_accuracy": 0.8416585922241211, + "num_tokens": 6472956.0, + "step": 708 + }, + { + "epoch": 0.5387537993920972, + "grad_norm": 1.6328502893447876, + "learning_rate": 4.775264926712489e-06, + "loss": 0.5836569666862488, + "mean_token_accuracy": 0.8039724230766296, + "num_tokens": 6485773.0, + "step": 709 + }, + { + "epoch": 0.5395136778115501, + "grad_norm": 1.8515360355377197, + "learning_rate": 4.774396262192368e-06, + "loss": 0.5477553009986877, + "mean_token_accuracy": 0.8136521577835083, + "num_tokens": 6496379.0, + "step": 710 + }, + { + "epoch": 0.540273556231003, + "grad_norm": 1.741858959197998, + "learning_rate": 4.7735260014133986e-06, + "loss": 0.4663267731666565, + "mean_token_accuracy": 0.8473691940307617, + "num_tokens": 6507652.0, + "step": 711 + }, + { + "epoch": 0.541033434650456, + "grad_norm": 1.7516659498214722, + "learning_rate": 4.772654144986364e-06, + "loss": 0.374914288520813, + "mean_token_accuracy": 0.8600220680236816, + "num_tokens": 6519030.0, + "step": 712 + }, + { + "epoch": 0.5417933130699089, + "grad_norm": 2.662343978881836, + "learning_rate": 4.7717806935231665e-06, + "loss": 0.4206875264644623, + "mean_token_accuracy": 0.8544126749038696, + "num_tokens": 6523669.0, + "step": 713 + }, + { + "epoch": 0.5425531914893617, + "grad_norm": 1.4088834524154663, + "learning_rate": 4.770905647636828e-06, + "loss": 0.5824331045150757, + "mean_token_accuracy": 0.7857901453971863, + "num_tokens": 6540560.0, + "step": 714 + }, + { + "epoch": 0.5433130699088146, + "grad_norm": 2.173656940460205, + "learning_rate": 4.77002900794149e-06, + "loss": 0.555023729801178, + "mean_token_accuracy": 0.8067290782928467, + "num_tokens": 6548946.0, + "step": 715 + }, + { + "epoch": 0.5440729483282675, + "grad_norm": 2.121018648147583, + "learning_rate": 4.769150775052411e-06, + "loss": 0.559730052947998, + "mean_token_accuracy": 0.8166372776031494, + "num_tokens": 6556065.0, + "step": 716 + }, + { + "epoch": 0.5448328267477204, + "grad_norm": 3.335866928100586, + "learning_rate": 4.768270949585968e-06, + "loss": 0.6442267894744873, + "mean_token_accuracy": 0.7858607769012451, + "num_tokens": 6560615.0, + "step": 717 + }, + { + "epoch": 0.5455927051671733, + "grad_norm": 2.3813695907592773, + "learning_rate": 4.767389532159659e-06, + "loss": 0.4027421474456787, + "mean_token_accuracy": 0.8635619282722473, + "num_tokens": 6565841.0, + "step": 718 + }, + { + "epoch": 0.5463525835866262, + "grad_norm": 2.0657708644866943, + "learning_rate": 4.766506523392095e-06, + "loss": 0.38899827003479004, + "mean_token_accuracy": 0.8660480380058289, + "num_tokens": 6572362.0, + "step": 719 + }, + { + "epoch": 0.547112462006079, + "grad_norm": 1.093705415725708, + "learning_rate": 4.765621923903005e-06, + "loss": 0.45967352390289307, + "mean_token_accuracy": 0.8338102102279663, + "num_tokens": 6595998.0, + "step": 720 + }, + { + "epoch": 0.5478723404255319, + "grad_norm": 2.942065954208374, + "learning_rate": 4.764735734313236e-06, + "loss": 0.42910510301589966, + "mean_token_accuracy": 0.8406122922897339, + "num_tokens": 6601075.0, + "step": 721 + }, + { + "epoch": 0.5486322188449848, + "grad_norm": 2.049011707305908, + "learning_rate": 4.763847955244749e-06, + "loss": 0.5584231615066528, + "mean_token_accuracy": 0.8171684741973877, + "num_tokens": 6609310.0, + "step": 722 + }, + { + "epoch": 0.5493920972644377, + "grad_norm": 2.485543966293335, + "learning_rate": 4.762958587320623e-06, + "loss": 0.5396170020103455, + "mean_token_accuracy": 0.8158525824546814, + "num_tokens": 6616185.0, + "step": 723 + }, + { + "epoch": 0.5501519756838906, + "grad_norm": 1.87015962600708, + "learning_rate": 4.762067631165049e-06, + "loss": 0.49739527702331543, + "mean_token_accuracy": 0.8303765654563904, + "num_tokens": 6625629.0, + "step": 724 + }, + { + "epoch": 0.5509118541033434, + "grad_norm": 4.239654541015625, + "learning_rate": 4.761175087403336e-06, + "loss": 0.6029239296913147, + "mean_token_accuracy": 0.8123486042022705, + "num_tokens": 6629194.0, + "step": 725 + }, + { + "epoch": 0.5516717325227963, + "grad_norm": 2.0134730339050293, + "learning_rate": 4.760280956661904e-06, + "loss": 0.4777873754501343, + "mean_token_accuracy": 0.8283513784408569, + "num_tokens": 6636929.0, + "step": 726 + }, + { + "epoch": 0.5524316109422492, + "grad_norm": 1.991780400276184, + "learning_rate": 4.75938523956829e-06, + "loss": 0.4631248116493225, + "mean_token_accuracy": 0.8275107741355896, + "num_tokens": 6645135.0, + "step": 727 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 1.423792839050293, + "learning_rate": 4.75848793675114e-06, + "loss": 0.49630722403526306, + "mean_token_accuracy": 0.8388000130653381, + "num_tokens": 6662690.0, + "step": 728 + }, + { + "epoch": 0.5539513677811551, + "grad_norm": 2.345294952392578, + "learning_rate": 4.757589048840219e-06, + "loss": 0.37830638885498047, + "mean_token_accuracy": 0.8782080411911011, + "num_tokens": 6667285.0, + "step": 729 + }, + { + "epoch": 0.5547112462006079, + "grad_norm": 2.7452144622802734, + "learning_rate": 4.756688576466398e-06, + "loss": 0.51595538854599, + "mean_token_accuracy": 0.8441770672798157, + "num_tokens": 6672324.0, + "step": 730 + }, + { + "epoch": 0.5554711246200608, + "grad_norm": 1.5247859954833984, + "learning_rate": 4.755786520261666e-06, + "loss": 0.48365193605422974, + "mean_token_accuracy": 0.8276445269584656, + "num_tokens": 6685296.0, + "step": 731 + }, + { + "epoch": 0.5562310030395137, + "grad_norm": 1.4018276929855347, + "learning_rate": 4.75488288085912e-06, + "loss": 0.3876481354236603, + "mean_token_accuracy": 0.8612343072891235, + "num_tokens": 6697515.0, + "step": 732 + }, + { + "epoch": 0.5569908814589666, + "grad_norm": 2.9570324420928955, + "learning_rate": 4.753977658892967e-06, + "loss": 0.5468149185180664, + "mean_token_accuracy": 0.8054271340370178, + "num_tokens": 6702194.0, + "step": 733 + }, + { + "epoch": 0.5577507598784195, + "grad_norm": 1.9282715320587158, + "learning_rate": 4.753070854998529e-06, + "loss": 0.4758574962615967, + "mean_token_accuracy": 0.8379775285720825, + "num_tokens": 6709938.0, + "step": 734 + }, + { + "epoch": 0.5585106382978723, + "grad_norm": 1.981264591217041, + "learning_rate": 4.752162469812234e-06, + "loss": 0.48461222648620605, + "mean_token_accuracy": 0.833509087562561, + "num_tokens": 6718125.0, + "step": 735 + }, + { + "epoch": 0.5592705167173252, + "grad_norm": 1.1643427610397339, + "learning_rate": 4.751252503971624e-06, + "loss": 0.410121887922287, + "mean_token_accuracy": 0.8221402764320374, + "num_tokens": 6735125.0, + "step": 736 + }, + { + "epoch": 0.5600303951367781, + "grad_norm": 1.786566972732544, + "learning_rate": 4.750340958115346e-06, + "loss": 0.5964341163635254, + "mean_token_accuracy": 0.8038164377212524, + "num_tokens": 6747369.0, + "step": 737 + }, + { + "epoch": 0.560790273556231, + "grad_norm": 1.7256991863250732, + "learning_rate": 4.749427832883158e-06, + "loss": 0.48737066984176636, + "mean_token_accuracy": 0.830894947052002, + "num_tokens": 6758115.0, + "step": 738 + }, + { + "epoch": 0.5615501519756839, + "grad_norm": 1.997747540473938, + "learning_rate": 4.748513128915928e-06, + "loss": 0.5238886475563049, + "mean_token_accuracy": 0.8066858053207397, + "num_tokens": 6766111.0, + "step": 739 + }, + { + "epoch": 0.5623100303951368, + "grad_norm": 2.127016305923462, + "learning_rate": 4.747596846855629e-06, + "loss": 0.5045586228370667, + "mean_token_accuracy": 0.821424126625061, + "num_tokens": 6772893.0, + "step": 740 + }, + { + "epoch": 0.5630699088145896, + "grad_norm": 1.7664796113967896, + "learning_rate": 4.7466789873453446e-06, + "loss": 0.42954835295677185, + "mean_token_accuracy": 0.8533384799957275, + "num_tokens": 6785133.0, + "step": 741 + }, + { + "epoch": 0.5638297872340425, + "grad_norm": 1.4987404346466064, + "learning_rate": 4.7457595510292615e-06, + "loss": 0.5378558039665222, + "mean_token_accuracy": 0.8184819221496582, + "num_tokens": 6799563.0, + "step": 742 + }, + { + "epoch": 0.5645896656534954, + "grad_norm": 1.4444655179977417, + "learning_rate": 4.744838538552678e-06, + "loss": 0.42193782329559326, + "mean_token_accuracy": 0.837514340877533, + "num_tokens": 6812470.0, + "step": 743 + }, + { + "epoch": 0.5653495440729484, + "grad_norm": 3.867751121520996, + "learning_rate": 4.7439159505619946e-06, + "loss": 0.4457814693450928, + "mean_token_accuracy": 0.8630104660987854, + "num_tokens": 6815652.0, + "step": 744 + }, + { + "epoch": 0.5661094224924013, + "grad_norm": 2.1250710487365723, + "learning_rate": 4.74299178770472e-06, + "loss": 0.5638922452926636, + "mean_token_accuracy": 0.7969781160354614, + "num_tokens": 6824566.0, + "step": 745 + }, + { + "epoch": 0.5668693009118541, + "grad_norm": 2.547072410583496, + "learning_rate": 4.742066050629465e-06, + "loss": 0.5516207814216614, + "mean_token_accuracy": 0.8160669803619385, + "num_tokens": 6830589.0, + "step": 746 + }, + { + "epoch": 0.567629179331307, + "grad_norm": 1.2975233793258667, + "learning_rate": 4.741138739985951e-06, + "loss": 0.3823344111442566, + "mean_token_accuracy": 0.8668368458747864, + "num_tokens": 6842707.0, + "step": 747 + }, + { + "epoch": 0.5683890577507599, + "grad_norm": 1.3410450220108032, + "learning_rate": 4.740209856424998e-06, + "loss": 0.5148671269416809, + "mean_token_accuracy": 0.8188045024871826, + "num_tokens": 6857624.0, + "step": 748 + }, + { + "epoch": 0.5691489361702128, + "grad_norm": 1.219467282295227, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.3998957872390747, + "mean_token_accuracy": 0.855175256729126, + "num_tokens": 6875064.0, + "step": 749 + }, + { + "epoch": 0.5699088145896657, + "grad_norm": 1.3530343770980835, + "learning_rate": 4.738347373159585e-06, + "loss": 0.5359633564949036, + "mean_token_accuracy": 0.8178457021713257, + "num_tokens": 6890911.0, + "step": 750 + }, + { + "epoch": 0.5706686930091185, + "grad_norm": 2.146988868713379, + "learning_rate": 4.737413774762287e-06, + "loss": 0.4460008144378662, + "mean_token_accuracy": 0.8172903060913086, + "num_tokens": 6896959.0, + "step": 751 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.456023097038269, + "learning_rate": 4.736478606061876e-06, + "loss": 0.43616920709609985, + "mean_token_accuracy": 0.8465108871459961, + "num_tokens": 6908904.0, + "step": 752 + }, + { + "epoch": 0.5721884498480243, + "grad_norm": 2.9696967601776123, + "learning_rate": 4.735541867714687e-06, + "loss": 0.43464532494544983, + "mean_token_accuracy": 0.8608652353286743, + "num_tokens": 6913026.0, + "step": 753 + }, + { + "epoch": 0.5729483282674772, + "grad_norm": 2.2990667819976807, + "learning_rate": 4.73460356037816e-06, + "loss": 0.6619116067886353, + "mean_token_accuracy": 0.7821142673492432, + "num_tokens": 6920588.0, + "step": 754 + }, + { + "epoch": 0.5737082066869301, + "grad_norm": 2.054746389389038, + "learning_rate": 4.733663684710835e-06, + "loss": 0.5304250717163086, + "mean_token_accuracy": 0.8265531063079834, + "num_tokens": 6928910.0, + "step": 755 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.0050594806671143, + "learning_rate": 4.732722241372354e-06, + "loss": 0.6393026113510132, + "mean_token_accuracy": 0.796819806098938, + "num_tokens": 6940217.0, + "step": 756 + }, + { + "epoch": 0.5752279635258358, + "grad_norm": 1.4285320043563843, + "learning_rate": 4.731779231023456e-06, + "loss": 0.5432837009429932, + "mean_token_accuracy": 0.8104778528213501, + "num_tokens": 6959101.0, + "step": 757 + }, + { + "epoch": 0.5759878419452887, + "grad_norm": 2.3941943645477295, + "learning_rate": 4.730834654325984e-06, + "loss": 0.46550673246383667, + "mean_token_accuracy": 0.8444503545761108, + "num_tokens": 6965036.0, + "step": 758 + }, + { + "epoch": 0.5767477203647416, + "grad_norm": 2.3850574493408203, + "learning_rate": 4.729888511942877e-06, + "loss": 0.4916389584541321, + "mean_token_accuracy": 0.8228527307510376, + "num_tokens": 6971184.0, + "step": 759 + }, + { + "epoch": 0.5775075987841946, + "grad_norm": 1.627480149269104, + "learning_rate": 4.728940804538176e-06, + "loss": 0.5863215923309326, + "mean_token_accuracy": 0.7995302677154541, + "num_tokens": 6982569.0, + "step": 760 + }, + { + "epoch": 0.5782674772036475, + "grad_norm": 1.1723195314407349, + "learning_rate": 4.727991532777016e-06, + "loss": 0.36908864974975586, + "mean_token_accuracy": 0.8355655670166016, + "num_tokens": 6998659.0, + "step": 761 + }, + { + "epoch": 0.5790273556231003, + "grad_norm": 1.5324925184249878, + "learning_rate": 4.727040697325634e-06, + "loss": 0.557658851146698, + "mean_token_accuracy": 0.8141458034515381, + "num_tokens": 7012969.0, + "step": 762 + }, + { + "epoch": 0.5797872340425532, + "grad_norm": 2.4106390476226807, + "learning_rate": 4.726088298851362e-06, + "loss": 0.5004243850708008, + "mean_token_accuracy": 0.8376860618591309, + "num_tokens": 7018301.0, + "step": 763 + }, + { + "epoch": 0.5805471124620061, + "grad_norm": 2.2594921588897705, + "learning_rate": 4.725134338022631e-06, + "loss": 0.6067016124725342, + "mean_token_accuracy": 0.8100241422653198, + "num_tokens": 7025201.0, + "step": 764 + }, + { + "epoch": 0.581306990881459, + "grad_norm": 1.4649826288223267, + "learning_rate": 4.724178815508967e-06, + "loss": 0.36200693249702454, + "mean_token_accuracy": 0.8621826171875, + "num_tokens": 7035112.0, + "step": 765 + }, + { + "epoch": 0.5820668693009119, + "grad_norm": 2.3634560108184814, + "learning_rate": 4.723221731980993e-06, + "loss": 0.41862213611602783, + "mean_token_accuracy": 0.8541463613510132, + "num_tokens": 7040339.0, + "step": 766 + }, + { + "epoch": 0.5828267477203647, + "grad_norm": 2.7798104286193848, + "learning_rate": 4.722263088110426e-06, + "loss": 0.4647108018398285, + "mean_token_accuracy": 0.8505672216415405, + "num_tokens": 7044880.0, + "step": 767 + }, + { + "epoch": 0.5835866261398176, + "grad_norm": 2.070528507232666, + "learning_rate": 4.721302884570079e-06, + "loss": 0.5147565007209778, + "mean_token_accuracy": 0.8113877773284912, + "num_tokens": 7052433.0, + "step": 768 + }, + { + "epoch": 0.5843465045592705, + "grad_norm": 2.1953284740448, + "learning_rate": 4.720341122033862e-06, + "loss": 0.5075466632843018, + "mean_token_accuracy": 0.8474211096763611, + "num_tokens": 7058686.0, + "step": 769 + }, + { + "epoch": 0.5851063829787234, + "grad_norm": 1.9287755489349365, + "learning_rate": 4.719377801176774e-06, + "loss": 0.5382202863693237, + "mean_token_accuracy": 0.8148090243339539, + "num_tokens": 7067538.0, + "step": 770 + }, + { + "epoch": 0.5858662613981763, + "grad_norm": 1.5574456453323364, + "learning_rate": 4.718412922674913e-06, + "loss": 0.43406790494918823, + "mean_token_accuracy": 0.8477081060409546, + "num_tokens": 7077853.0, + "step": 771 + }, + { + "epoch": 0.5866261398176292, + "grad_norm": 1.5490336418151855, + "learning_rate": 4.717446487205466e-06, + "loss": 0.43164271116256714, + "mean_token_accuracy": 0.8504570126533508, + "num_tokens": 7091728.0, + "step": 772 + }, + { + "epoch": 0.587386018237082, + "grad_norm": 1.6945984363555908, + "learning_rate": 4.716478495446717e-06, + "loss": 0.5153743624687195, + "mean_token_accuracy": 0.8213579058647156, + "num_tokens": 7108680.0, + "step": 773 + }, + { + "epoch": 0.5881458966565349, + "grad_norm": 2.2633883953094482, + "learning_rate": 4.715508948078037e-06, + "loss": 0.45254790782928467, + "mean_token_accuracy": 0.8392219543457031, + "num_tokens": 7115546.0, + "step": 774 + }, + { + "epoch": 0.5889057750759878, + "grad_norm": 1.5731090307235718, + "learning_rate": 4.714537845779894e-06, + "loss": 0.38678881525993347, + "mean_token_accuracy": 0.8800252676010132, + "num_tokens": 7126360.0, + "step": 775 + }, + { + "epoch": 0.5896656534954408, + "grad_norm": 2.4873392581939697, + "learning_rate": 4.7135651892338445e-06, + "loss": 0.5190927386283875, + "mean_token_accuracy": 0.8145407438278198, + "num_tokens": 7135705.0, + "step": 776 + }, + { + "epoch": 0.5904255319148937, + "grad_norm": 1.2931004762649536, + "learning_rate": 4.712590979122534e-06, + "loss": 0.3686544895172119, + "mean_token_accuracy": 0.8720537424087524, + "num_tokens": 7150688.0, + "step": 777 + }, + { + "epoch": 0.5911854103343465, + "grad_norm": 1.6353671550750732, + "learning_rate": 4.7116152161297045e-06, + "loss": 0.49065062403678894, + "mean_token_accuracy": 0.8203760385513306, + "num_tokens": 7161040.0, + "step": 778 + }, + { + "epoch": 0.5919452887537994, + "grad_norm": 1.2345483303070068, + "learning_rate": 4.710637900940181e-06, + "loss": 0.4004976451396942, + "mean_token_accuracy": 0.8302007913589478, + "num_tokens": 7178074.0, + "step": 779 + }, + { + "epoch": 0.5927051671732523, + "grad_norm": 2.2506837844848633, + "learning_rate": 4.7096590342398825e-06, + "loss": 0.45142874121665955, + "mean_token_accuracy": 0.8481036424636841, + "num_tokens": 7184153.0, + "step": 780 + }, + { + "epoch": 0.5934650455927052, + "grad_norm": 1.420479416847229, + "learning_rate": 4.708678616715815e-06, + "loss": 0.4802100360393524, + "mean_token_accuracy": 0.8586992025375366, + "num_tokens": 7202810.0, + "step": 781 + }, + { + "epoch": 0.5942249240121581, + "grad_norm": 3.457632303237915, + "learning_rate": 4.707696649056073e-06, + "loss": 0.5265094041824341, + "mean_token_accuracy": 0.8260114192962646, + "num_tokens": 7206396.0, + "step": 782 + }, + { + "epoch": 0.5949848024316109, + "grad_norm": 1.1592093706130981, + "learning_rate": 4.706713131949839e-06, + "loss": 0.3708173632621765, + "mean_token_accuracy": 0.8476542234420776, + "num_tokens": 7225034.0, + "step": 783 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 1.6761400699615479, + "learning_rate": 4.705728066087384e-06, + "loss": 0.4137252867221832, + "mean_token_accuracy": 0.8462049961090088, + "num_tokens": 7237101.0, + "step": 784 + }, + { + "epoch": 0.5965045592705167, + "grad_norm": 2.320185422897339, + "learning_rate": 4.704741452160064e-06, + "loss": 0.5157154202461243, + "mean_token_accuracy": 0.8391785621643066, + "num_tokens": 7243826.0, + "step": 785 + }, + { + "epoch": 0.5972644376899696, + "grad_norm": 2.079423427581787, + "learning_rate": 4.703753290860323e-06, + "loss": 0.4734993278980255, + "mean_token_accuracy": 0.8353281021118164, + "num_tokens": 7250175.0, + "step": 786 + }, + { + "epoch": 0.5980243161094225, + "grad_norm": 1.8215159177780151, + "learning_rate": 4.702763582881692e-06, + "loss": 0.520193338394165, + "mean_token_accuracy": 0.844062864780426, + "num_tokens": 7258868.0, + "step": 787 + }, + { + "epoch": 0.5987841945288754, + "grad_norm": 1.3823071718215942, + "learning_rate": 4.701772328918784e-06, + "loss": 0.4177844822406769, + "mean_token_accuracy": 0.8363165259361267, + "num_tokens": 7271744.0, + "step": 788 + }, + { + "epoch": 0.5995440729483282, + "grad_norm": 2.4749298095703125, + "learning_rate": 4.700779529667301e-06, + "loss": 0.5115069150924683, + "mean_token_accuracy": 0.8473520278930664, + "num_tokens": 7277040.0, + "step": 789 + }, + { + "epoch": 0.6003039513677811, + "grad_norm": 1.7072296142578125, + "learning_rate": 4.699785185824026e-06, + "loss": 0.5265800952911377, + "mean_token_accuracy": 0.8161447048187256, + "num_tokens": 7288288.0, + "step": 790 + }, + { + "epoch": 0.601063829787234, + "grad_norm": 1.6479384899139404, + "learning_rate": 4.69878929808683e-06, + "loss": 0.4445168972015381, + "mean_token_accuracy": 0.8381255865097046, + "num_tokens": 7298640.0, + "step": 791 + }, + { + "epoch": 0.601823708206687, + "grad_norm": 1.9095896482467651, + "learning_rate": 4.6977918671546635e-06, + "loss": 0.5841238498687744, + "mean_token_accuracy": 0.7971454858779907, + "num_tokens": 7307220.0, + "step": 792 + }, + { + "epoch": 0.6025835866261399, + "grad_norm": 1.9614146947860718, + "learning_rate": 4.696792893727562e-06, + "loss": 0.34684082865715027, + "mean_token_accuracy": 0.8739526271820068, + "num_tokens": 7313875.0, + "step": 793 + }, + { + "epoch": 0.6033434650455927, + "grad_norm": 2.015570640563965, + "learning_rate": 4.695792378506645e-06, + "loss": 0.42779117822647095, + "mean_token_accuracy": 0.8625012636184692, + "num_tokens": 7321439.0, + "step": 794 + }, + { + "epoch": 0.6041033434650456, + "grad_norm": 2.8581228256225586, + "learning_rate": 4.694790322194111e-06, + "loss": 0.6519991159439087, + "mean_token_accuracy": 0.7629562616348267, + "num_tokens": 7326916.0, + "step": 795 + }, + { + "epoch": 0.6048632218844985, + "grad_norm": 2.482715368270874, + "learning_rate": 4.693786725493242e-06, + "loss": 0.532963216304779, + "mean_token_accuracy": 0.832184910774231, + "num_tokens": 7333311.0, + "step": 796 + }, + { + "epoch": 0.6056231003039514, + "grad_norm": 1.6076741218566895, + "learning_rate": 4.692781589108402e-06, + "loss": 0.43381205201148987, + "mean_token_accuracy": 0.8402494192123413, + "num_tokens": 7343731.0, + "step": 797 + }, + { + "epoch": 0.6063829787234043, + "grad_norm": 2.2133216857910156, + "learning_rate": 4.691774913745033e-06, + "loss": 0.4380851089954376, + "mean_token_accuracy": 0.8600908517837524, + "num_tokens": 7350224.0, + "step": 798 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 2.046280860900879, + "learning_rate": 4.690766700109659e-06, + "loss": 0.3821919560432434, + "mean_token_accuracy": 0.8691814541816711, + "num_tokens": 7356717.0, + "step": 799 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 1.8482693433761597, + "learning_rate": 4.689756948909884e-06, + "loss": 0.5217651128768921, + "mean_token_accuracy": 0.803473711013794, + "num_tokens": 7365806.0, + "step": 800 + }, + { + "epoch": 0.6086626139817629, + "grad_norm": 2.192134141921997, + "learning_rate": 4.688745660854388e-06, + "loss": 0.573980987071991, + "mean_token_accuracy": 0.8198676109313965, + "num_tokens": 7380281.0, + "step": 801 + }, + { + "epoch": 0.6094224924012158, + "grad_norm": 2.363626718521118, + "learning_rate": 4.687732836652935e-06, + "loss": 0.5204599499702454, + "mean_token_accuracy": 0.8373252153396606, + "num_tokens": 7386938.0, + "step": 802 + }, + { + "epoch": 0.6101823708206687, + "grad_norm": 1.9320523738861084, + "learning_rate": 4.686718477016361e-06, + "loss": 0.47316622734069824, + "mean_token_accuracy": 0.830596923828125, + "num_tokens": 7395069.0, + "step": 803 + }, + { + "epoch": 0.6109422492401215, + "grad_norm": 2.6573057174682617, + "learning_rate": 4.6857025826565845e-06, + "loss": 0.5495861768722534, + "mean_token_accuracy": 0.8187421560287476, + "num_tokens": 7400563.0, + "step": 804 + }, + { + "epoch": 0.6117021276595744, + "grad_norm": 2.0893123149871826, + "learning_rate": 4.684685154286599e-06, + "loss": 0.5362675786018372, + "mean_token_accuracy": 0.8394701480865479, + "num_tokens": 7406973.0, + "step": 805 + }, + { + "epoch": 0.6124620060790273, + "grad_norm": 2.455130100250244, + "learning_rate": 4.683666192620474e-06, + "loss": 0.5405995845794678, + "mean_token_accuracy": 0.8079100847244263, + "num_tokens": 7412931.0, + "step": 806 + }, + { + "epoch": 0.6132218844984803, + "grad_norm": 2.311915636062622, + "learning_rate": 4.682645698373357e-06, + "loss": 0.5395106077194214, + "mean_token_accuracy": 0.8156260251998901, + "num_tokens": 7419699.0, + "step": 807 + }, + { + "epoch": 0.6139817629179332, + "grad_norm": 1.686838984489441, + "learning_rate": 4.6816236722614694e-06, + "loss": 0.6034521460533142, + "mean_token_accuracy": 0.7855954170227051, + "num_tokens": 7431899.0, + "step": 808 + }, + { + "epoch": 0.6147416413373861, + "grad_norm": 1.682759165763855, + "learning_rate": 4.680600115002109e-06, + "loss": 0.48593831062316895, + "mean_token_accuracy": 0.8229435682296753, + "num_tokens": 7443187.0, + "step": 809 + }, + { + "epoch": 0.6155015197568389, + "grad_norm": 2.064589738845825, + "learning_rate": 4.679575027313649e-06, + "loss": 0.5098468661308289, + "mean_token_accuracy": 0.8234638571739197, + "num_tokens": 7450868.0, + "step": 810 + }, + { + "epoch": 0.6162613981762918, + "grad_norm": 2.2063486576080322, + "learning_rate": 4.6785484099155324e-06, + "loss": 0.5138497352600098, + "mean_token_accuracy": 0.8152111172676086, + "num_tokens": 7457176.0, + "step": 811 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 1.6258726119995117, + "learning_rate": 4.67752026352828e-06, + "loss": 0.4064181447029114, + "mean_token_accuracy": 0.8720619678497314, + "num_tokens": 7466557.0, + "step": 812 + }, + { + "epoch": 0.6177811550151976, + "grad_norm": 2.3309383392333984, + "learning_rate": 4.676490588873486e-06, + "loss": 0.5180112719535828, + "mean_token_accuracy": 0.8233879804611206, + "num_tokens": 7472650.0, + "step": 813 + }, + { + "epoch": 0.6185410334346505, + "grad_norm": 1.4545246362686157, + "learning_rate": 4.675459386673815e-06, + "loss": 0.37917959690093994, + "mean_token_accuracy": 0.8598103523254395, + "num_tokens": 7485171.0, + "step": 814 + }, + { + "epoch": 0.6193009118541033, + "grad_norm": 2.654231071472168, + "learning_rate": 4.674426657653003e-06, + "loss": 0.554074227809906, + "mean_token_accuracy": 0.8026446104049683, + "num_tokens": 7490787.0, + "step": 815 + }, + { + "epoch": 0.6200607902735562, + "grad_norm": 1.5543994903564453, + "learning_rate": 4.67339240253586e-06, + "loss": 0.6335440278053284, + "mean_token_accuracy": 0.783241868019104, + "num_tokens": 7505975.0, + "step": 816 + }, + { + "epoch": 0.6208206686930091, + "grad_norm": 2.079998016357422, + "learning_rate": 4.672356622048266e-06, + "loss": 0.5169394016265869, + "mean_token_accuracy": 0.8088761568069458, + "num_tokens": 7513470.0, + "step": 817 + }, + { + "epoch": 0.621580547112462, + "grad_norm": 1.5971896648406982, + "learning_rate": 4.671319316917172e-06, + "loss": 0.44588586688041687, + "mean_token_accuracy": 0.8518649339675903, + "num_tokens": 7524352.0, + "step": 818 + }, + { + "epoch": 0.6223404255319149, + "grad_norm": 2.477579116821289, + "learning_rate": 4.670280487870599e-06, + "loss": 0.5713893175125122, + "mean_token_accuracy": 0.8116940259933472, + "num_tokens": 7530359.0, + "step": 819 + }, + { + "epoch": 0.6231003039513677, + "grad_norm": 2.066211700439453, + "learning_rate": 4.669240135637635e-06, + "loss": 0.5295331478118896, + "mean_token_accuracy": 0.819536566734314, + "num_tokens": 7536963.0, + "step": 820 + }, + { + "epoch": 0.6238601823708206, + "grad_norm": 2.1217997074127197, + "learning_rate": 4.668198260948442e-06, + "loss": 0.6146406531333923, + "mean_token_accuracy": 0.7932635545730591, + "num_tokens": 7545800.0, + "step": 821 + }, + { + "epoch": 0.6246200607902735, + "grad_norm": 2.0173542499542236, + "learning_rate": 4.667154864534245e-06, + "loss": 0.6240535974502563, + "mean_token_accuracy": 0.7883644104003906, + "num_tokens": 7556165.0, + "step": 822 + }, + { + "epoch": 0.6253799392097265, + "grad_norm": 2.014526128768921, + "learning_rate": 4.666109947127343e-06, + "loss": 0.40367332100868225, + "mean_token_accuracy": 0.8653522729873657, + "num_tokens": 7562665.0, + "step": 823 + }, + { + "epoch": 0.6261398176291794, + "grad_norm": 2.5078861713409424, + "learning_rate": 4.665063509461098e-06, + "loss": 0.5903617739677429, + "mean_token_accuracy": 0.7902897596359253, + "num_tokens": 7568922.0, + "step": 824 + }, + { + "epoch": 0.6268996960486323, + "grad_norm": 2.454622745513916, + "learning_rate": 4.664015552269938e-06, + "loss": 0.5238361358642578, + "mean_token_accuracy": 0.838546872138977, + "num_tokens": 7575965.0, + "step": 825 + }, + { + "epoch": 0.6276595744680851, + "grad_norm": 2.920919418334961, + "learning_rate": 4.662966076289363e-06, + "loss": 0.5028782486915588, + "mean_token_accuracy": 0.8311152458190918, + "num_tokens": 7580193.0, + "step": 826 + }, + { + "epoch": 0.628419452887538, + "grad_norm": 1.545382022857666, + "learning_rate": 4.661915082255932e-06, + "loss": 0.4817378520965576, + "mean_token_accuracy": 0.8373227119445801, + "num_tokens": 7593024.0, + "step": 827 + }, + { + "epoch": 0.6291793313069909, + "grad_norm": 1.5152469873428345, + "learning_rate": 4.6608625709072766e-06, + "loss": 0.4693033695220947, + "mean_token_accuracy": 0.8150848150253296, + "num_tokens": 7606459.0, + "step": 828 + }, + { + "epoch": 0.6299392097264438, + "grad_norm": 2.1310224533081055, + "learning_rate": 4.659808542982089e-06, + "loss": 0.4653395414352417, + "mean_token_accuracy": 0.8286294341087341, + "num_tokens": 7613036.0, + "step": 829 + }, + { + "epoch": 0.6306990881458967, + "grad_norm": 2.1949679851531982, + "learning_rate": 4.658752999220125e-06, + "loss": 0.3698633909225464, + "mean_token_accuracy": 0.871590793132782, + "num_tokens": 7618527.0, + "step": 830 + }, + { + "epoch": 0.6314589665653495, + "grad_norm": 2.2770416736602783, + "learning_rate": 4.657695940362207e-06, + "loss": 0.5202419757843018, + "mean_token_accuracy": 0.817577600479126, + "num_tokens": 7624459.0, + "step": 831 + }, + { + "epoch": 0.6322188449848024, + "grad_norm": 1.402042269706726, + "learning_rate": 4.65663736715022e-06, + "loss": 0.51531583070755, + "mean_token_accuracy": 0.8228116631507874, + "num_tokens": 7639371.0, + "step": 832 + }, + { + "epoch": 0.6329787234042553, + "grad_norm": 3.3554883003234863, + "learning_rate": 4.65557728032711e-06, + "loss": 0.6771188378334045, + "mean_token_accuracy": 0.7880028486251831, + "num_tokens": 7643924.0, + "step": 833 + }, + { + "epoch": 0.6337386018237082, + "grad_norm": 2.081040143966675, + "learning_rate": 4.654515680636888e-06, + "loss": 0.5712796449661255, + "mean_token_accuracy": 0.8177868127822876, + "num_tokens": 7651881.0, + "step": 834 + }, + { + "epoch": 0.6344984802431611, + "grad_norm": 0.9128716588020325, + "learning_rate": 4.653452568824625e-06, + "loss": 0.3423936069011688, + "mean_token_accuracy": 0.8782886266708374, + "num_tokens": 7677829.0, + "step": 835 + }, + { + "epoch": 0.6352583586626139, + "grad_norm": 3.49015736579895, + "learning_rate": 4.652387945636454e-06, + "loss": 0.34657734632492065, + "mean_token_accuracy": 0.8770567178726196, + "num_tokens": 7680796.0, + "step": 836 + }, + { + "epoch": 0.6360182370820668, + "grad_norm": 2.026247501373291, + "learning_rate": 4.651321811819568e-06, + "loss": 0.5098431706428528, + "mean_token_accuracy": 0.8216961622238159, + "num_tokens": 7688746.0, + "step": 837 + }, + { + "epoch": 0.6367781155015197, + "grad_norm": 2.444343090057373, + "learning_rate": 4.650254168122222e-06, + "loss": 0.5490090250968933, + "mean_token_accuracy": 0.8092857599258423, + "num_tokens": 7695220.0, + "step": 838 + }, + { + "epoch": 0.6375379939209727, + "grad_norm": 2.0171122550964355, + "learning_rate": 4.649185015293728e-06, + "loss": 0.47221142053604126, + "mean_token_accuracy": 0.8514408469200134, + "num_tokens": 7702759.0, + "step": 839 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 1.9800984859466553, + "learning_rate": 4.64811435408446e-06, + "loss": 0.5238803625106812, + "mean_token_accuracy": 0.8479194641113281, + "num_tokens": 7714017.0, + "step": 840 + }, + { + "epoch": 0.6390577507598785, + "grad_norm": 3.0674357414245605, + "learning_rate": 4.647042185245848e-06, + "loss": 0.4668245315551758, + "mean_token_accuracy": 0.8381714820861816, + "num_tokens": 7717801.0, + "step": 841 + }, + { + "epoch": 0.6398176291793313, + "grad_norm": 1.5672820806503296, + "learning_rate": 4.645968509530381e-06, + "loss": 0.4428741931915283, + "mean_token_accuracy": 0.8416479825973511, + "num_tokens": 7728342.0, + "step": 842 + }, + { + "epoch": 0.6405775075987842, + "grad_norm": 2.3042354583740234, + "learning_rate": 4.644893327691608e-06, + "loss": 0.49937760829925537, + "mean_token_accuracy": 0.827070951461792, + "num_tokens": 7734576.0, + "step": 843 + }, + { + "epoch": 0.6413373860182371, + "grad_norm": 2.057772159576416, + "learning_rate": 4.6438166404841316e-06, + "loss": 0.5912986993789673, + "mean_token_accuracy": 0.805509090423584, + "num_tokens": 7742481.0, + "step": 844 + }, + { + "epoch": 0.64209726443769, + "grad_norm": 1.9688186645507812, + "learning_rate": 4.6427384486636115e-06, + "loss": 0.482401967048645, + "mean_token_accuracy": 0.8358086347579956, + "num_tokens": 7750002.0, + "step": 845 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 2.6852948665618896, + "learning_rate": 4.6416587529867665e-06, + "loss": 0.5479315519332886, + "mean_token_accuracy": 0.8091106414794922, + "num_tokens": 7755578.0, + "step": 846 + }, + { + "epoch": 0.6436170212765957, + "grad_norm": 2.0547337532043457, + "learning_rate": 4.640577554211366e-06, + "loss": 0.5327274203300476, + "mean_token_accuracy": 0.8280376195907593, + "num_tokens": 7763513.0, + "step": 847 + }, + { + "epoch": 0.6443768996960486, + "grad_norm": 2.0328633785247803, + "learning_rate": 4.63949485309624e-06, + "loss": 0.4814409613609314, + "mean_token_accuracy": 0.8527672290802002, + "num_tokens": 7771131.0, + "step": 848 + }, + { + "epoch": 0.6451367781155015, + "grad_norm": 1.5892863273620605, + "learning_rate": 4.638410650401267e-06, + "loss": 0.4492785334587097, + "mean_token_accuracy": 0.846997857093811, + "num_tokens": 7781572.0, + "step": 849 + }, + { + "epoch": 0.6458966565349544, + "grad_norm": 1.8295910358428955, + "learning_rate": 4.637324946887384e-06, + "loss": 0.37088239192962646, + "mean_token_accuracy": 0.8616628646850586, + "num_tokens": 7788604.0, + "step": 850 + }, + { + "epoch": 0.6466565349544073, + "grad_norm": 3.380040168762207, + "learning_rate": 4.636237743316578e-06, + "loss": 0.4737280607223511, + "mean_token_accuracy": 0.855940580368042, + "num_tokens": 7792504.0, + "step": 851 + }, + { + "epoch": 0.6474164133738601, + "grad_norm": 2.8790009021759033, + "learning_rate": 4.635149040451891e-06, + "loss": 0.39790448546409607, + "mean_token_accuracy": 0.8710698485374451, + "num_tokens": 7796333.0, + "step": 852 + }, + { + "epoch": 0.648176291793313, + "grad_norm": 1.914914608001709, + "learning_rate": 4.634058839057417e-06, + "loss": 0.2954312562942505, + "mean_token_accuracy": 0.8880234956741333, + "num_tokens": 7802456.0, + "step": 853 + }, + { + "epoch": 0.648936170212766, + "grad_norm": 1.3709120750427246, + "learning_rate": 4.632967139898301e-06, + "loss": 0.43224576115608215, + "mean_token_accuracy": 0.8446190357208252, + "num_tokens": 7816770.0, + "step": 854 + }, + { + "epoch": 0.6496960486322189, + "grad_norm": 1.6579312086105347, + "learning_rate": 4.63187394374074e-06, + "loss": 0.3535553514957428, + "mean_token_accuracy": 0.8738704919815063, + "num_tokens": 7824963.0, + "step": 855 + }, + { + "epoch": 0.6504559270516718, + "grad_norm": 2.4055678844451904, + "learning_rate": 4.63077925135198e-06, + "loss": 0.5078744292259216, + "mean_token_accuracy": 0.8430874347686768, + "num_tokens": 7830962.0, + "step": 856 + }, + { + "epoch": 0.6512158054711246, + "grad_norm": 2.5171499252319336, + "learning_rate": 4.629683063500319e-06, + "loss": 0.5172419548034668, + "mean_token_accuracy": 0.8087141513824463, + "num_tokens": 7836638.0, + "step": 857 + }, + { + "epoch": 0.6519756838905775, + "grad_norm": 1.7588486671447754, + "learning_rate": 4.628585380955104e-06, + "loss": 0.5759496092796326, + "mean_token_accuracy": 0.8043236136436462, + "num_tokens": 7844654.0, + "step": 858 + }, + { + "epoch": 0.6527355623100304, + "grad_norm": 1.5887070894241333, + "learning_rate": 4.62748620448673e-06, + "loss": 0.41849038004875183, + "mean_token_accuracy": 0.8556643724441528, + "num_tokens": 7855642.0, + "step": 859 + }, + { + "epoch": 0.6534954407294833, + "grad_norm": 3.227942705154419, + "learning_rate": 4.626385534866642e-06, + "loss": 0.5279449224472046, + "mean_token_accuracy": 0.8250958323478699, + "num_tokens": 7859890.0, + "step": 860 + }, + { + "epoch": 0.6542553191489362, + "grad_norm": 2.440467119216919, + "learning_rate": 4.625283372867333e-06, + "loss": 0.5294933319091797, + "mean_token_accuracy": 0.8235013484954834, + "num_tokens": 7866766.0, + "step": 861 + }, + { + "epoch": 0.6550151975683891, + "grad_norm": 2.4106903076171875, + "learning_rate": 4.624179719262342e-06, + "loss": 0.5662813186645508, + "mean_token_accuracy": 0.8061668872833252, + "num_tokens": 7872809.0, + "step": 862 + }, + { + "epoch": 0.6557750759878419, + "grad_norm": 3.5151145458221436, + "learning_rate": 4.623074574826254e-06, + "loss": 0.5471097230911255, + "mean_token_accuracy": 0.8220691084861755, + "num_tokens": 7876136.0, + "step": 863 + }, + { + "epoch": 0.6565349544072948, + "grad_norm": 1.5319840908050537, + "learning_rate": 4.621967940334705e-06, + "loss": 0.4178982377052307, + "mean_token_accuracy": 0.8517135977745056, + "num_tokens": 7886113.0, + "step": 864 + }, + { + "epoch": 0.6572948328267477, + "grad_norm": 1.63701331615448, + "learning_rate": 4.620859816564371e-06, + "loss": 0.4666512608528137, + "mean_token_accuracy": 0.8223508596420288, + "num_tokens": 7897982.0, + "step": 865 + }, + { + "epoch": 0.6580547112462006, + "grad_norm": 2.1515414714813232, + "learning_rate": 4.619750204292978e-06, + "loss": 0.5359305143356323, + "mean_token_accuracy": 0.8192868232727051, + "num_tokens": 7904947.0, + "step": 866 + }, + { + "epoch": 0.6588145896656535, + "grad_norm": 2.2140955924987793, + "learning_rate": 4.618639104299294e-06, + "loss": 0.5275633931159973, + "mean_token_accuracy": 0.8120715618133545, + "num_tokens": 7913913.0, + "step": 867 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 1.3956893682479858, + "learning_rate": 4.6175265173631304e-06, + "loss": 0.4378768503665924, + "mean_token_accuracy": 0.8479125499725342, + "num_tokens": 7927979.0, + "step": 868 + }, + { + "epoch": 0.6603343465045592, + "grad_norm": 2.98103928565979, + "learning_rate": 4.616412444265344e-06, + "loss": 0.42614591121673584, + "mean_token_accuracy": 0.8595094680786133, + "num_tokens": 7934293.0, + "step": 869 + }, + { + "epoch": 0.6610942249240122, + "grad_norm": 2.554845094680786, + "learning_rate": 4.6152968857878365e-06, + "loss": 0.3698030412197113, + "mean_token_accuracy": 0.8717041015625, + "num_tokens": 7938547.0, + "step": 870 + }, + { + "epoch": 0.6618541033434651, + "grad_norm": 3.0901825428009033, + "learning_rate": 4.6141798427135475e-06, + "loss": 0.5037497282028198, + "mean_token_accuracy": 0.8354041576385498, + "num_tokens": 7942829.0, + "step": 871 + }, + { + "epoch": 0.662613981762918, + "grad_norm": 2.8692073822021484, + "learning_rate": 4.6130613158264605e-06, + "loss": 0.5418164134025574, + "mean_token_accuracy": 0.8298909664154053, + "num_tokens": 7949303.0, + "step": 872 + }, + { + "epoch": 0.6633738601823708, + "grad_norm": 3.960404396057129, + "learning_rate": 4.611941305911602e-06, + "loss": 0.6284480094909668, + "mean_token_accuracy": 0.837495744228363, + "num_tokens": 7952486.0, + "step": 873 + }, + { + "epoch": 0.6641337386018237, + "grad_norm": 2.6690115928649902, + "learning_rate": 4.610819813755038e-06, + "loss": 0.5214360952377319, + "mean_token_accuracy": 0.8213508129119873, + "num_tokens": 7957559.0, + "step": 874 + }, + { + "epoch": 0.6648936170212766, + "grad_norm": 2.3376171588897705, + "learning_rate": 4.609696840143875e-06, + "loss": 0.46887528896331787, + "mean_token_accuracy": 0.8438819646835327, + "num_tokens": 7962826.0, + "step": 875 + }, + { + "epoch": 0.6656534954407295, + "grad_norm": 2.2222683429718018, + "learning_rate": 4.6085723858662575e-06, + "loss": 0.5607719421386719, + "mean_token_accuracy": 0.8128405809402466, + "num_tokens": 7970131.0, + "step": 876 + }, + { + "epoch": 0.6664133738601824, + "grad_norm": 2.069091558456421, + "learning_rate": 4.607446451711372e-06, + "loss": 0.506301760673523, + "mean_token_accuracy": 0.8256827592849731, + "num_tokens": 7977524.0, + "step": 877 + }, + { + "epoch": 0.6671732522796353, + "grad_norm": 1.3724967241287231, + "learning_rate": 4.606319038469443e-06, + "loss": 0.43285101652145386, + "mean_token_accuracy": 0.8525032997131348, + "num_tokens": 7989174.0, + "step": 878 + }, + { + "epoch": 0.6679331306990881, + "grad_norm": 2.278205156326294, + "learning_rate": 4.605190146931731e-06, + "loss": 0.4845905303955078, + "mean_token_accuracy": 0.8284652829170227, + "num_tokens": 7998524.0, + "step": 879 + }, + { + "epoch": 0.668693009118541, + "grad_norm": 1.3871766328811646, + "learning_rate": 4.604059777890537e-06, + "loss": 0.5736679434776306, + "mean_token_accuracy": 0.8223285675048828, + "num_tokens": 8015776.0, + "step": 880 + }, + { + "epoch": 0.6694528875379939, + "grad_norm": 1.926164984703064, + "learning_rate": 4.602927932139197e-06, + "loss": 0.4133230447769165, + "mean_token_accuracy": 0.8653768301010132, + "num_tokens": 8022979.0, + "step": 881 + }, + { + "epoch": 0.6702127659574468, + "grad_norm": 2.109272003173828, + "learning_rate": 4.601794610472083e-06, + "loss": 0.7005600929260254, + "mean_token_accuracy": 0.7777010202407837, + "num_tokens": 8032618.0, + "step": 882 + }, + { + "epoch": 0.6709726443768997, + "grad_norm": 2.077977418899536, + "learning_rate": 4.6006598136846056e-06, + "loss": 0.5278208255767822, + "mean_token_accuracy": 0.8230358958244324, + "num_tokens": 8040534.0, + "step": 883 + }, + { + "epoch": 0.6717325227963525, + "grad_norm": 1.678581714630127, + "learning_rate": 4.599523542573207e-06, + "loss": 0.4955351650714874, + "mean_token_accuracy": 0.8270003795623779, + "num_tokens": 8052249.0, + "step": 884 + }, + { + "epoch": 0.6724924012158054, + "grad_norm": 2.0751662254333496, + "learning_rate": 4.598385797935368e-06, + "loss": 0.5266247987747192, + "mean_token_accuracy": 0.8263581991195679, + "num_tokens": 8060600.0, + "step": 885 + }, + { + "epoch": 0.6732522796352584, + "grad_norm": 2.418405771255493, + "learning_rate": 4.5972465805696e-06, + "loss": 0.4481425881385803, + "mean_token_accuracy": 0.846164345741272, + "num_tokens": 8066025.0, + "step": 886 + }, + { + "epoch": 0.6740121580547113, + "grad_norm": 2.3936474323272705, + "learning_rate": 4.596105891275449e-06, + "loss": 0.4553404450416565, + "mean_token_accuracy": 0.8412896394729614, + "num_tokens": 8071544.0, + "step": 887 + }, + { + "epoch": 0.6747720364741642, + "grad_norm": 2.2024407386779785, + "learning_rate": 4.594963730853497e-06, + "loss": 0.6218541860580444, + "mean_token_accuracy": 0.7890232801437378, + "num_tokens": 8079061.0, + "step": 888 + }, + { + "epoch": 0.675531914893617, + "grad_norm": 2.51015567779541, + "learning_rate": 4.593820100105355e-06, + "loss": 0.5149124264717102, + "mean_token_accuracy": 0.8241918087005615, + "num_tokens": 8084293.0, + "step": 889 + }, + { + "epoch": 0.6762917933130699, + "grad_norm": 1.8748939037322998, + "learning_rate": 4.5926749998336665e-06, + "loss": 0.50836181640625, + "mean_token_accuracy": 0.8067223429679871, + "num_tokens": 8092511.0, + "step": 890 + }, + { + "epoch": 0.6770516717325228, + "grad_norm": 1.801193118095398, + "learning_rate": 4.5915284308421075e-06, + "loss": 0.4372861683368683, + "mean_token_accuracy": 0.8510604500770569, + "num_tokens": 8101174.0, + "step": 891 + }, + { + "epoch": 0.6778115501519757, + "grad_norm": 2.6476457118988037, + "learning_rate": 4.590380393935383e-06, + "loss": 0.38700711727142334, + "mean_token_accuracy": 0.8659796714782715, + "num_tokens": 8105398.0, + "step": 892 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 1.1147183179855347, + "learning_rate": 4.589230889919232e-06, + "loss": 0.38546115159988403, + "mean_token_accuracy": 0.8570581674575806, + "num_tokens": 8127394.0, + "step": 893 + }, + { + "epoch": 0.6793313069908815, + "grad_norm": 2.908905506134033, + "learning_rate": 4.588079919600419e-06, + "loss": 0.5108504295349121, + "mean_token_accuracy": 0.8121406435966492, + "num_tokens": 8131801.0, + "step": 894 + }, + { + "epoch": 0.6800911854103343, + "grad_norm": 3.1522326469421387, + "learning_rate": 4.586927483786739e-06, + "loss": 0.44059112668037415, + "mean_token_accuracy": 0.8448011875152588, + "num_tokens": 8154416.0, + "step": 895 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 1.5142440795898438, + "learning_rate": 4.585773583287017e-06, + "loss": 0.513217568397522, + "mean_token_accuracy": 0.8386049270629883, + "num_tokens": 8171156.0, + "step": 896 + }, + { + "epoch": 0.6816109422492401, + "grad_norm": 2.597881317138672, + "learning_rate": 4.584618218911104e-06, + "loss": 0.4937712550163269, + "mean_token_accuracy": 0.8223681449890137, + "num_tokens": 8176124.0, + "step": 897 + }, + { + "epoch": 0.682370820668693, + "grad_norm": 1.8185619115829468, + "learning_rate": 4.583461391469879e-06, + "loss": 0.519811749458313, + "mean_token_accuracy": 0.8169777393341064, + "num_tokens": 8185136.0, + "step": 898 + }, + { + "epoch": 0.6831306990881459, + "grad_norm": 3.2061994075775146, + "learning_rate": 4.582303101775249e-06, + "loss": 0.4655115008354187, + "mean_token_accuracy": 0.8425977230072021, + "num_tokens": 8188864.0, + "step": 899 + }, + { + "epoch": 0.6838905775075987, + "grad_norm": 1.3485229015350342, + "learning_rate": 4.581143350640146e-06, + "loss": 0.5014470815658569, + "mean_token_accuracy": 0.8273109197616577, + "num_tokens": 8203460.0, + "step": 900 + }, + { + "epoch": 0.6846504559270516, + "grad_norm": 1.3264713287353516, + "learning_rate": 4.579982138878527e-06, + "loss": 0.5073703527450562, + "mean_token_accuracy": 0.8259357213973999, + "num_tokens": 8219348.0, + "step": 901 + }, + { + "epoch": 0.6854103343465046, + "grad_norm": 2.4436347484588623, + "learning_rate": 4.578819467305375e-06, + "loss": 0.47020310163497925, + "mean_token_accuracy": 0.8567265272140503, + "num_tokens": 8224427.0, + "step": 902 + }, + { + "epoch": 0.6861702127659575, + "grad_norm": 1.921749234199524, + "learning_rate": 4.5776553367367e-06, + "loss": 0.622514009475708, + "mean_token_accuracy": 0.7863982319831848, + "num_tokens": 8233151.0, + "step": 903 + }, + { + "epoch": 0.6869300911854104, + "grad_norm": 1.8815616369247437, + "learning_rate": 4.576489747989532e-06, + "loss": 0.4910545349121094, + "mean_token_accuracy": 0.8147122859954834, + "num_tokens": 8240762.0, + "step": 904 + }, + { + "epoch": 0.6876899696048632, + "grad_norm": 1.2366989850997925, + "learning_rate": 4.575322701881926e-06, + "loss": 0.3947566747665405, + "mean_token_accuracy": 0.873993992805481, + "num_tokens": 8259381.0, + "step": 905 + }, + { + "epoch": 0.6884498480243161, + "grad_norm": 1.5767735242843628, + "learning_rate": 4.57415419923296e-06, + "loss": 0.57136070728302, + "mean_token_accuracy": 0.8028088808059692, + "num_tokens": 8273296.0, + "step": 906 + }, + { + "epoch": 0.689209726443769, + "grad_norm": 2.378675699234009, + "learning_rate": 4.572984240862733e-06, + "loss": 0.5894849896430969, + "mean_token_accuracy": 0.7977708578109741, + "num_tokens": 8280083.0, + "step": 907 + }, + { + "epoch": 0.6899696048632219, + "grad_norm": 2.0401132106781006, + "learning_rate": 4.57181282759237e-06, + "loss": 0.5524613261222839, + "mean_token_accuracy": 0.8138598203659058, + "num_tokens": 8288236.0, + "step": 908 + }, + { + "epoch": 0.6907294832826748, + "grad_norm": 2.293701648712158, + "learning_rate": 4.570639960244011e-06, + "loss": 0.5154546499252319, + "mean_token_accuracy": 0.8234660625457764, + "num_tokens": 8294493.0, + "step": 909 + }, + { + "epoch": 0.6914893617021277, + "grad_norm": 1.9286527633666992, + "learning_rate": 4.56946563964082e-06, + "loss": 0.5364264845848083, + "mean_token_accuracy": 0.8147368431091309, + "num_tokens": 8303441.0, + "step": 910 + }, + { + "epoch": 0.6922492401215805, + "grad_norm": 1.2571251392364502, + "learning_rate": 4.5682898666069815e-06, + "loss": 0.43535223603248596, + "mean_token_accuracy": 0.859239935874939, + "num_tokens": 8321548.0, + "step": 911 + }, + { + "epoch": 0.6930091185410334, + "grad_norm": 1.2224860191345215, + "learning_rate": 4.567112641967697e-06, + "loss": 0.40205076336860657, + "mean_token_accuracy": 0.8724711537361145, + "num_tokens": 8335205.0, + "step": 912 + }, + { + "epoch": 0.6937689969604863, + "grad_norm": 1.2064491510391235, + "learning_rate": 4.5659339665491894e-06, + "loss": 0.37790587544441223, + "mean_token_accuracy": 0.8464339971542358, + "num_tokens": 8350926.0, + "step": 913 + }, + { + "epoch": 0.6945288753799392, + "grad_norm": 2.1755270957946777, + "learning_rate": 4.5647538411786965e-06, + "loss": 0.42034298181533813, + "mean_token_accuracy": 0.84148108959198, + "num_tokens": 8356739.0, + "step": 914 + }, + { + "epoch": 0.6952887537993921, + "grad_norm": 1.234864592552185, + "learning_rate": 4.563572266684478e-06, + "loss": 0.5062938332557678, + "mean_token_accuracy": 0.8132052421569824, + "num_tokens": 8373660.0, + "step": 915 + }, + { + "epoch": 0.6960486322188449, + "grad_norm": 2.4250621795654297, + "learning_rate": 4.562389243895807e-06, + "loss": 0.4907791018486023, + "mean_token_accuracy": 0.8337979912757874, + "num_tokens": 8378661.0, + "step": 916 + }, + { + "epoch": 0.6968085106382979, + "grad_norm": 1.5018314123153687, + "learning_rate": 4.561204773642974e-06, + "loss": 0.41041281819343567, + "mean_token_accuracy": 0.8569784164428711, + "num_tokens": 8390322.0, + "step": 917 + }, + { + "epoch": 0.6975683890577508, + "grad_norm": 2.797269344329834, + "learning_rate": 4.5600188567572874e-06, + "loss": 0.3146931529045105, + "mean_token_accuracy": 0.8913302421569824, + "num_tokens": 8393567.0, + "step": 918 + }, + { + "epoch": 0.6983282674772037, + "grad_norm": 1.4002827405929565, + "learning_rate": 4.558831494071069e-06, + "loss": 0.4275597333908081, + "mean_token_accuracy": 0.8504893779754639, + "num_tokens": 8407119.0, + "step": 919 + }, + { + "epoch": 0.6990881458966566, + "grad_norm": 1.7045831680297852, + "learning_rate": 4.557642686417654e-06, + "loss": 0.49593430757522583, + "mean_token_accuracy": 0.8185091018676758, + "num_tokens": 8417408.0, + "step": 920 + }, + { + "epoch": 0.6998480243161094, + "grad_norm": 2.8818066120147705, + "learning_rate": 4.556452434631396e-06, + "loss": 0.637908935546875, + "mean_token_accuracy": 0.7883946895599365, + "num_tokens": 8422319.0, + "step": 921 + }, + { + "epoch": 0.7006079027355623, + "grad_norm": 2.3587265014648438, + "learning_rate": 4.555260739547657e-06, + "loss": 0.38749319314956665, + "mean_token_accuracy": 0.8774704933166504, + "num_tokens": 8427315.0, + "step": 922 + }, + { + "epoch": 0.7013677811550152, + "grad_norm": 1.6648749113082886, + "learning_rate": 4.554067602002815e-06, + "loss": 0.4044865369796753, + "mean_token_accuracy": 0.8524141311645508, + "num_tokens": 8438662.0, + "step": 923 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 3.467787742614746, + "learning_rate": 4.55287302283426e-06, + "loss": 0.591016411781311, + "mean_token_accuracy": 0.81184983253479, + "num_tokens": 8442237.0, + "step": 924 + }, + { + "epoch": 0.702887537993921, + "grad_norm": 2.1458635330200195, + "learning_rate": 4.551677002880395e-06, + "loss": 0.5017476677894592, + "mean_token_accuracy": 0.822914183139801, + "num_tokens": 8449494.0, + "step": 925 + }, + { + "epoch": 0.7036474164133738, + "grad_norm": 2.521714448928833, + "learning_rate": 4.550479542980632e-06, + "loss": 0.531912088394165, + "mean_token_accuracy": 0.8225687742233276, + "num_tokens": 8454983.0, + "step": 926 + }, + { + "epoch": 0.7044072948328267, + "grad_norm": 3.5248100757598877, + "learning_rate": 4.549280643975394e-06, + "loss": 0.4631815254688263, + "mean_token_accuracy": 0.8443771600723267, + "num_tokens": 8458504.0, + "step": 927 + }, + { + "epoch": 0.7051671732522796, + "grad_norm": 2.5105819702148438, + "learning_rate": 4.548080306706114e-06, + "loss": 0.30487123131752014, + "mean_token_accuracy": 0.9018767476081848, + "num_tokens": 8462589.0, + "step": 928 + }, + { + "epoch": 0.7059270516717325, + "grad_norm": 1.3367713689804077, + "learning_rate": 4.5468785320152365e-06, + "loss": 0.4355026185512543, + "mean_token_accuracy": 0.8323584794998169, + "num_tokens": 8478450.0, + "step": 929 + }, + { + "epoch": 0.7066869300911854, + "grad_norm": 2.2506282329559326, + "learning_rate": 4.545675320746212e-06, + "loss": 0.5082957744598389, + "mean_token_accuracy": 0.823430597782135, + "num_tokens": 8485991.0, + "step": 930 + }, + { + "epoch": 0.7074468085106383, + "grad_norm": 1.7164632081985474, + "learning_rate": 4.544470673743502e-06, + "loss": 0.3960164785385132, + "mean_token_accuracy": 0.8592486381530762, + "num_tokens": 8495217.0, + "step": 931 + }, + { + "epoch": 0.7082066869300911, + "grad_norm": 1.5864969491958618, + "learning_rate": 4.543264591852572e-06, + "loss": 0.49114471673965454, + "mean_token_accuracy": 0.8330780267715454, + "num_tokens": 8508904.0, + "step": 932 + }, + { + "epoch": 0.708966565349544, + "grad_norm": 2.1707003116607666, + "learning_rate": 4.542057075919898e-06, + "loss": 0.49895772337913513, + "mean_token_accuracy": 0.8327431082725525, + "num_tokens": 8515792.0, + "step": 933 + }, + { + "epoch": 0.709726443768997, + "grad_norm": 1.9002083539962769, + "learning_rate": 4.54084812679296e-06, + "loss": 0.4548531472682953, + "mean_token_accuracy": 0.834532618522644, + "num_tokens": 8524006.0, + "step": 934 + }, + { + "epoch": 0.7104863221884499, + "grad_norm": 1.8505141735076904, + "learning_rate": 4.539637745320247e-06, + "loss": 0.35716521739959717, + "mean_token_accuracy": 0.872222900390625, + "num_tokens": 8533647.0, + "step": 935 + }, + { + "epoch": 0.7112462006079028, + "grad_norm": 2.092620849609375, + "learning_rate": 4.53842593235125e-06, + "loss": 0.4673694372177124, + "mean_token_accuracy": 0.8460999131202698, + "num_tokens": 8540734.0, + "step": 936 + }, + { + "epoch": 0.7120060790273556, + "grad_norm": 2.689514636993408, + "learning_rate": 4.537212688736466e-06, + "loss": 0.45461273193359375, + "mean_token_accuracy": 0.8450704216957092, + "num_tokens": 8544948.0, + "step": 937 + }, + { + "epoch": 0.7127659574468085, + "grad_norm": 2.4507734775543213, + "learning_rate": 4.535998015327396e-06, + "loss": 0.4571906626224518, + "mean_token_accuracy": 0.8429360389709473, + "num_tokens": 8550445.0, + "step": 938 + }, + { + "epoch": 0.7135258358662614, + "grad_norm": 1.8960013389587402, + "learning_rate": 4.534781912976546e-06, + "loss": 0.4461391568183899, + "mean_token_accuracy": 0.8487973213195801, + "num_tokens": 8557630.0, + "step": 939 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.602611780166626, + "learning_rate": 4.533564382537421e-06, + "loss": 0.5277102589607239, + "mean_token_accuracy": 0.8330916166305542, + "num_tokens": 8570397.0, + "step": 940 + }, + { + "epoch": 0.7150455927051672, + "grad_norm": 1.8936395645141602, + "learning_rate": 4.532345424864533e-06, + "loss": 0.38619571924209595, + "mean_token_accuracy": 0.8514572381973267, + "num_tokens": 8582673.0, + "step": 941 + }, + { + "epoch": 0.71580547112462, + "grad_norm": 1.3898619413375854, + "learning_rate": 4.531125040813392e-06, + "loss": 0.4825032949447632, + "mean_token_accuracy": 0.833012580871582, + "num_tokens": 8597239.0, + "step": 942 + }, + { + "epoch": 0.7165653495440729, + "grad_norm": 2.128230571746826, + "learning_rate": 4.529903231240511e-06, + "loss": 0.4862118065357208, + "mean_token_accuracy": 0.8210917711257935, + "num_tokens": 8605877.0, + "step": 943 + }, + { + "epoch": 0.7173252279635258, + "grad_norm": 1.6552259922027588, + "learning_rate": 4.528679997003403e-06, + "loss": 0.5092059373855591, + "mean_token_accuracy": 0.8247389793395996, + "num_tokens": 8617060.0, + "step": 944 + }, + { + "epoch": 0.7180851063829787, + "grad_norm": 2.1174771785736084, + "learning_rate": 4.52745533896058e-06, + "loss": 0.39110174775123596, + "mean_token_accuracy": 0.8672944903373718, + "num_tokens": 8623306.0, + "step": 945 + }, + { + "epoch": 0.7188449848024316, + "grad_norm": 2.8648383617401123, + "learning_rate": 4.526229257971556e-06, + "loss": 0.49864327907562256, + "mean_token_accuracy": 0.8305130004882812, + "num_tokens": 8627466.0, + "step": 946 + }, + { + "epoch": 0.7196048632218845, + "grad_norm": 2.155514717102051, + "learning_rate": 4.52500175489684e-06, + "loss": 0.5070191025733948, + "mean_token_accuracy": 0.8311188817024231, + "num_tokens": 8634759.0, + "step": 947 + }, + { + "epoch": 0.7203647416413373, + "grad_norm": 1.8432683944702148, + "learning_rate": 4.523772830597942e-06, + "loss": 0.5569252371788025, + "mean_token_accuracy": 0.8070821762084961, + "num_tokens": 8644160.0, + "step": 948 + }, + { + "epoch": 0.7211246200607903, + "grad_norm": 2.8912241458892822, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4799427390098572, + "mean_token_accuracy": 0.8443552851676941, + "num_tokens": 8648377.0, + "step": 949 + }, + { + "epoch": 0.7218844984802432, + "grad_norm": 3.3449625968933105, + "learning_rate": 4.521310721778622e-06, + "loss": 0.44043463468551636, + "mean_token_accuracy": 0.8521315455436707, + "num_tokens": 8651846.0, + "step": 950 + }, + { + "epoch": 0.7226443768996961, + "grad_norm": 1.4127917289733887, + "learning_rate": 4.520077538986203e-06, + "loss": 0.4700999855995178, + "mean_token_accuracy": 0.8377952575683594, + "num_tokens": 8665199.0, + "step": 951 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.1607301235198975, + "learning_rate": 4.518842938425606e-06, + "loss": 0.4374256730079651, + "mean_token_accuracy": 0.8448896408081055, + "num_tokens": 8672158.0, + "step": 952 + }, + { + "epoch": 0.7241641337386018, + "grad_norm": 1.3442779779434204, + "learning_rate": 4.51760692096332e-06, + "loss": 0.38948923349380493, + "mean_token_accuracy": 0.8598923683166504, + "num_tokens": 8684532.0, + "step": 953 + }, + { + "epoch": 0.7249240121580547, + "grad_norm": 2.0003178119659424, + "learning_rate": 4.516369487466832e-06, + "loss": 0.3797217011451721, + "mean_token_accuracy": 0.8652102947235107, + "num_tokens": 8691460.0, + "step": 954 + }, + { + "epoch": 0.7256838905775076, + "grad_norm": 1.8196535110473633, + "learning_rate": 4.5151306388046175e-06, + "loss": 0.5676811933517456, + "mean_token_accuracy": 0.818500816822052, + "num_tokens": 8701624.0, + "step": 955 + }, + { + "epoch": 0.7264437689969605, + "grad_norm": 2.1962296962738037, + "learning_rate": 4.513890375846152e-06, + "loss": 0.45399484038352966, + "mean_token_accuracy": 0.8463879227638245, + "num_tokens": 8707410.0, + "step": 956 + }, + { + "epoch": 0.7272036474164134, + "grad_norm": 1.8798872232437134, + "learning_rate": 4.512648699461897e-06, + "loss": 0.5679811239242554, + "mean_token_accuracy": 0.8089900016784668, + "num_tokens": 8715630.0, + "step": 957 + }, + { + "epoch": 0.7279635258358662, + "grad_norm": 2.3540258407592773, + "learning_rate": 4.511405610523309e-06, + "loss": 0.5282865762710571, + "mean_token_accuracy": 0.8196114301681519, + "num_tokens": 8721934.0, + "step": 958 + }, + { + "epoch": 0.7287234042553191, + "grad_norm": 2.5630908012390137, + "learning_rate": 4.510161109902837e-06, + "loss": 0.39442378282546997, + "mean_token_accuracy": 0.8400980830192566, + "num_tokens": 8726511.0, + "step": 959 + }, + { + "epoch": 0.729483282674772, + "grad_norm": 1.9829226732254028, + "learning_rate": 4.508915198473919e-06, + "loss": 0.4611976742744446, + "mean_token_accuracy": 0.8439624309539795, + "num_tokens": 8733460.0, + "step": 960 + }, + { + "epoch": 0.7302431610942249, + "grad_norm": 3.0291950702667236, + "learning_rate": 4.507667877110982e-06, + "loss": 0.5158340930938721, + "mean_token_accuracy": 0.8300060033798218, + "num_tokens": 8737629.0, + "step": 961 + }, + { + "epoch": 0.7310030395136778, + "grad_norm": 1.9208252429962158, + "learning_rate": 4.506419146689445e-06, + "loss": 0.3807099163532257, + "mean_token_accuracy": 0.871469259262085, + "num_tokens": 8744615.0, + "step": 962 + }, + { + "epoch": 0.7317629179331308, + "grad_norm": 3.051565408706665, + "learning_rate": 4.505169008085717e-06, + "loss": 0.38461726903915405, + "mean_token_accuracy": 0.874465823173523, + "num_tokens": 8748154.0, + "step": 963 + }, + { + "epoch": 0.7325227963525835, + "grad_norm": 1.375466227531433, + "learning_rate": 4.503917462177192e-06, + "loss": 0.42490679025650024, + "mean_token_accuracy": 0.8457326889038086, + "num_tokens": 8760965.0, + "step": 964 + }, + { + "epoch": 0.7332826747720365, + "grad_norm": 2.216681957244873, + "learning_rate": 4.5026645098422515e-06, + "loss": 0.43149900436401367, + "mean_token_accuracy": 0.8527278900146484, + "num_tokens": 8766996.0, + "step": 965 + }, + { + "epoch": 0.7340425531914894, + "grad_norm": 1.9422595500946045, + "learning_rate": 4.5014101519602684e-06, + "loss": 0.4964504539966583, + "mean_token_accuracy": 0.8137556314468384, + "num_tokens": 8774411.0, + "step": 966 + }, + { + "epoch": 0.7348024316109423, + "grad_norm": 2.058887004852295, + "learning_rate": 4.500154389411598e-06, + "loss": 0.4977570176124573, + "mean_token_accuracy": 0.8254626989364624, + "num_tokens": 8782220.0, + "step": 967 + }, + { + "epoch": 0.7355623100303952, + "grad_norm": 2.9977786540985107, + "learning_rate": 4.498897223077582e-06, + "loss": 0.4061415195465088, + "mean_token_accuracy": 0.8752427101135254, + "num_tokens": 8786120.0, + "step": 968 + }, + { + "epoch": 0.736322188449848, + "grad_norm": 2.2636303901672363, + "learning_rate": 4.49763865384055e-06, + "loss": 0.5062161087989807, + "mean_token_accuracy": 0.8171653747558594, + "num_tokens": 8792459.0, + "step": 969 + }, + { + "epoch": 0.7370820668693009, + "grad_norm": 1.8850842714309692, + "learning_rate": 4.496378682583813e-06, + "loss": 0.5014280676841736, + "mean_token_accuracy": 0.8547511100769043, + "num_tokens": 8800675.0, + "step": 970 + }, + { + "epoch": 0.7378419452887538, + "grad_norm": 1.191985011100769, + "learning_rate": 4.495117310191667e-06, + "loss": 0.4713883101940155, + "mean_token_accuracy": 0.8213596343994141, + "num_tokens": 8820740.0, + "step": 971 + }, + { + "epoch": 0.7386018237082067, + "grad_norm": 1.823000192642212, + "learning_rate": 4.493854537549393e-06, + "loss": 0.46332645416259766, + "mean_token_accuracy": 0.8359860777854919, + "num_tokens": 8828884.0, + "step": 972 + }, + { + "epoch": 0.7393617021276596, + "grad_norm": 2.590446949005127, + "learning_rate": 4.492590365543253e-06, + "loss": 0.49074703454971313, + "mean_token_accuracy": 0.8433758020401001, + "num_tokens": 8833859.0, + "step": 973 + }, + { + "epoch": 0.7401215805471124, + "grad_norm": 2.2762670516967773, + "learning_rate": 4.491324795060491e-06, + "loss": 0.39465656876564026, + "mean_token_accuracy": 0.8734766244888306, + "num_tokens": 8839350.0, + "step": 974 + }, + { + "epoch": 0.7408814589665653, + "grad_norm": 2.698725461959839, + "learning_rate": 4.490057826989333e-06, + "loss": 0.5552085041999817, + "mean_token_accuracy": 0.8132266998291016, + "num_tokens": 8844373.0, + "step": 975 + }, + { + "epoch": 0.7416413373860182, + "grad_norm": 2.704606294631958, + "learning_rate": 4.488789462218988e-06, + "loss": 0.3447791635990143, + "mean_token_accuracy": 0.8736170530319214, + "num_tokens": 8848236.0, + "step": 976 + }, + { + "epoch": 0.7424012158054711, + "grad_norm": 3.1260716915130615, + "learning_rate": 4.487519701639641e-06, + "loss": 0.5945233702659607, + "mean_token_accuracy": 0.7997599840164185, + "num_tokens": 8852935.0, + "step": 977 + }, + { + "epoch": 0.743161094224924, + "grad_norm": 1.6895452737808228, + "learning_rate": 4.486248546142459e-06, + "loss": 0.4823892116546631, + "mean_token_accuracy": 0.8279662132263184, + "num_tokens": 8861743.0, + "step": 978 + }, + { + "epoch": 0.743920972644377, + "grad_norm": 1.9161452054977417, + "learning_rate": 4.4849759966195885e-06, + "loss": 0.5266581773757935, + "mean_token_accuracy": 0.8218623399734497, + "num_tokens": 8870601.0, + "step": 979 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 1.6894301176071167, + "learning_rate": 4.483702053964154e-06, + "loss": 0.4186219573020935, + "mean_token_accuracy": 0.8471781015396118, + "num_tokens": 8885617.0, + "step": 980 + }, + { + "epoch": 0.7454407294832827, + "grad_norm": 1.6319992542266846, + "learning_rate": 4.482426719070258e-06, + "loss": 0.541317880153656, + "mean_token_accuracy": 0.8216162323951721, + "num_tokens": 8897595.0, + "step": 981 + }, + { + "epoch": 0.7462006079027356, + "grad_norm": 5.102413177490234, + "learning_rate": 4.4811499928329775e-06, + "loss": 0.3928517699241638, + "mean_token_accuracy": 0.858033299446106, + "num_tokens": 8901682.0, + "step": 982 + }, + { + "epoch": 0.7469604863221885, + "grad_norm": 2.213860273361206, + "learning_rate": 4.479871876148368e-06, + "loss": 0.4276347756385803, + "mean_token_accuracy": 0.8529798984527588, + "num_tokens": 8908088.0, + "step": 983 + }, + { + "epoch": 0.7477203647416414, + "grad_norm": 1.2180038690567017, + "learning_rate": 4.478592369913464e-06, + "loss": 0.3941590189933777, + "mean_token_accuracy": 0.8608149290084839, + "num_tokens": 8925876.0, + "step": 984 + }, + { + "epoch": 0.7484802431610942, + "grad_norm": 2.849802255630493, + "learning_rate": 4.477311475026271e-06, + "loss": 0.42190325260162354, + "mean_token_accuracy": 0.860505223274231, + "num_tokens": 8930190.0, + "step": 985 + }, + { + "epoch": 0.7492401215805471, + "grad_norm": 1.704128384590149, + "learning_rate": 4.476029192385769e-06, + "loss": 0.4786282777786255, + "mean_token_accuracy": 0.8302322626113892, + "num_tokens": 8938340.0, + "step": 986 + }, + { + "epoch": 0.75, + "grad_norm": 2.06322979927063, + "learning_rate": 4.474745522891915e-06, + "loss": 0.4648786187171936, + "mean_token_accuracy": 0.8366481065750122, + "num_tokens": 8944633.0, + "step": 987 + }, + { + "epoch": 0.7507598784194529, + "grad_norm": 2.0745396614074707, + "learning_rate": 4.473460467445637e-06, + "loss": 0.5744885206222534, + "mean_token_accuracy": 0.8357284069061279, + "num_tokens": 8954457.0, + "step": 988 + }, + { + "epoch": 0.7515197568389058, + "grad_norm": 1.9281407594680786, + "learning_rate": 4.472174026948836e-06, + "loss": 0.528974175453186, + "mean_token_accuracy": 0.8083580732345581, + "num_tokens": 8962701.0, + "step": 989 + }, + { + "epoch": 0.7522796352583586, + "grad_norm": 3.012381076812744, + "learning_rate": 4.470886202304385e-06, + "loss": 0.48754751682281494, + "mean_token_accuracy": 0.8368391990661621, + "num_tokens": 8967272.0, + "step": 990 + }, + { + "epoch": 0.7530395136778115, + "grad_norm": 1.691826581954956, + "learning_rate": 4.469596994416131e-06, + "loss": 0.484740674495697, + "mean_token_accuracy": 0.8500643968582153, + "num_tokens": 8976615.0, + "step": 991 + }, + { + "epoch": 0.7537993920972644, + "grad_norm": 2.4961965084075928, + "learning_rate": 4.468306404188887e-06, + "loss": 0.50777268409729, + "mean_token_accuracy": 0.8168395757675171, + "num_tokens": 8983235.0, + "step": 992 + }, + { + "epoch": 0.7545592705167173, + "grad_norm": 1.512007713317871, + "learning_rate": 4.467014432528441e-06, + "loss": 0.4583340287208557, + "mean_token_accuracy": 0.8465162515640259, + "num_tokens": 8993815.0, + "step": 993 + }, + { + "epoch": 0.7553191489361702, + "grad_norm": 1.9362257719039917, + "learning_rate": 4.465721080341547e-06, + "loss": 0.6027892827987671, + "mean_token_accuracy": 0.8052380084991455, + "num_tokens": 9002697.0, + "step": 994 + }, + { + "epoch": 0.756079027355623, + "grad_norm": 2.473632335662842, + "learning_rate": 4.4644263485359316e-06, + "loss": 0.5394320487976074, + "mean_token_accuracy": 0.834665834903717, + "num_tokens": 9007428.0, + "step": 995 + }, + { + "epoch": 0.756838905775076, + "grad_norm": 2.2527434825897217, + "learning_rate": 4.463130238020284e-06, + "loss": 0.5485198497772217, + "mean_token_accuracy": 0.8090173006057739, + "num_tokens": 9013570.0, + "step": 996 + }, + { + "epoch": 0.7575987841945289, + "grad_norm": 1.4130940437316895, + "learning_rate": 4.4618327497042676e-06, + "loss": 0.37994423508644104, + "mean_token_accuracy": 0.8625167012214661, + "num_tokens": 9025485.0, + "step": 997 + }, + { + "epoch": 0.7583586626139818, + "grad_norm": 2.685115098953247, + "learning_rate": 4.460533884498509e-06, + "loss": 0.447973370552063, + "mean_token_accuracy": 0.8564165234565735, + "num_tokens": 9030355.0, + "step": 998 + }, + { + "epoch": 0.7591185410334347, + "grad_norm": 3.2743139266967773, + "learning_rate": 4.4592336433146e-06, + "loss": 0.45275989174842834, + "mean_token_accuracy": 0.8462578058242798, + "num_tokens": 9034406.0, + "step": 999 + }, + { + "epoch": 0.7598784194528876, + "grad_norm": 1.9383049011230469, + "learning_rate": 4.457932027065102e-06, + "loss": 0.5387729406356812, + "mean_token_accuracy": 0.8357330560684204, + "num_tokens": 9041502.0, + "step": 1000 + }, + { + "epoch": 0.7606382978723404, + "grad_norm": 2.7348275184631348, + "learning_rate": 4.456629036663537e-06, + "loss": 0.4448447823524475, + "mean_token_accuracy": 0.8453642129898071, + "num_tokens": 9046088.0, + "step": 1001 + }, + { + "epoch": 0.7613981762917933, + "grad_norm": 1.8477401733398438, + "learning_rate": 4.455324673024396e-06, + "loss": 0.5766505002975464, + "mean_token_accuracy": 0.8074213862419128, + "num_tokens": 9055678.0, + "step": 1002 + }, + { + "epoch": 0.7621580547112462, + "grad_norm": 3.134481430053711, + "learning_rate": 4.4540189370631315e-06, + "loss": 0.5690872669219971, + "mean_token_accuracy": 0.8414670825004578, + "num_tokens": 9062006.0, + "step": 1003 + }, + { + "epoch": 0.7629179331306991, + "grad_norm": 1.7933398485183716, + "learning_rate": 4.452711829696158e-06, + "loss": 0.4898291826248169, + "mean_token_accuracy": 0.8259007930755615, + "num_tokens": 9070754.0, + "step": 1004 + }, + { + "epoch": 0.763677811550152, + "grad_norm": 1.2552275657653809, + "learning_rate": 4.451403351840855e-06, + "loss": 0.4280198812484741, + "mean_token_accuracy": 0.8409112691879272, + "num_tokens": 9085306.0, + "step": 1005 + }, + { + "epoch": 0.7644376899696048, + "grad_norm": 1.6749331951141357, + "learning_rate": 4.450093504415562e-06, + "loss": 0.3723178505897522, + "mean_token_accuracy": 0.8545734882354736, + "num_tokens": 9102453.0, + "step": 1006 + }, + { + "epoch": 0.7651975683890577, + "grad_norm": 2.7514500617980957, + "learning_rate": 4.44878228833958e-06, + "loss": 0.5463190674781799, + "mean_token_accuracy": 0.8121639490127563, + "num_tokens": 9108342.0, + "step": 1007 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 1.3322733640670776, + "learning_rate": 4.447469704533172e-06, + "loss": 0.573723316192627, + "mean_token_accuracy": 0.8065711259841919, + "num_tokens": 9123712.0, + "step": 1008 + }, + { + "epoch": 0.7667173252279635, + "grad_norm": 2.6893765926361084, + "learning_rate": 4.446155753917559e-06, + "loss": 0.6856257915496826, + "mean_token_accuracy": 0.7718256711959839, + "num_tokens": 9130728.0, + "step": 1009 + }, + { + "epoch": 0.7674772036474165, + "grad_norm": 1.792765498161316, + "learning_rate": 4.444840437414923e-06, + "loss": 0.48203110694885254, + "mean_token_accuracy": 0.8419194221496582, + "num_tokens": 9137983.0, + "step": 1010 + }, + { + "epoch": 0.7682370820668692, + "grad_norm": 1.4957399368286133, + "learning_rate": 4.443523755948401e-06, + "loss": 0.4372181296348572, + "mean_token_accuracy": 0.8491764664649963, + "num_tokens": 9148081.0, + "step": 1011 + }, + { + "epoch": 0.7689969604863222, + "grad_norm": 1.7294867038726807, + "learning_rate": 4.442205710442095e-06, + "loss": 0.54277503490448, + "mean_token_accuracy": 0.8196806907653809, + "num_tokens": 9158407.0, + "step": 1012 + }, + { + "epoch": 0.7697568389057751, + "grad_norm": 2.2091221809387207, + "learning_rate": 4.4408863018210564e-06, + "loss": 0.4888187646865845, + "mean_token_accuracy": 0.8384175300598145, + "num_tokens": 9164754.0, + "step": 1013 + }, + { + "epoch": 0.770516717325228, + "grad_norm": 1.7615830898284912, + "learning_rate": 4.439565531011299e-06, + "loss": 0.4640008211135864, + "mean_token_accuracy": 0.8424701690673828, + "num_tokens": 9172715.0, + "step": 1014 + }, + { + "epoch": 0.7712765957446809, + "grad_norm": 1.6796128749847412, + "learning_rate": 4.43824339893979e-06, + "loss": 0.5227609276771545, + "mean_token_accuracy": 0.8135923743247986, + "num_tokens": 9183214.0, + "step": 1015 + }, + { + "epoch": 0.7720364741641338, + "grad_norm": 2.1485698223114014, + "learning_rate": 4.436919906534452e-06, + "loss": 0.4857056140899658, + "mean_token_accuracy": 0.8323013782501221, + "num_tokens": 9190360.0, + "step": 1016 + }, + { + "epoch": 0.7727963525835866, + "grad_norm": 2.7842206954956055, + "learning_rate": 4.4355950547241645e-06, + "loss": 0.46406883001327515, + "mean_token_accuracy": 0.859869122505188, + "num_tokens": 9194523.0, + "step": 1017 + }, + { + "epoch": 0.7735562310030395, + "grad_norm": 2.3774640560150146, + "learning_rate": 4.434268844438758e-06, + "loss": 0.5625549554824829, + "mean_token_accuracy": 0.8188897371292114, + "num_tokens": 9201155.0, + "step": 1018 + }, + { + "epoch": 0.7743161094224924, + "grad_norm": 2.004427909851074, + "learning_rate": 4.432941276609018e-06, + "loss": 0.5164387226104736, + "mean_token_accuracy": 0.829569935798645, + "num_tokens": 9209269.0, + "step": 1019 + }, + { + "epoch": 0.7750759878419453, + "grad_norm": 1.7218989133834839, + "learning_rate": 4.431612352166684e-06, + "loss": 0.481005996465683, + "mean_token_accuracy": 0.8359906673431396, + "num_tokens": 9220860.0, + "step": 1020 + }, + { + "epoch": 0.7758358662613982, + "grad_norm": 2.197108507156372, + "learning_rate": 4.4302820720444454e-06, + "loss": 0.440413236618042, + "mean_token_accuracy": 0.8412867784500122, + "num_tokens": 9226414.0, + "step": 1021 + }, + { + "epoch": 0.776595744680851, + "grad_norm": 2.6995162963867188, + "learning_rate": 4.428950437175944e-06, + "loss": 0.3884299397468567, + "mean_token_accuracy": 0.8696021437644958, + "num_tokens": 9230898.0, + "step": 1022 + }, + { + "epoch": 0.7773556231003039, + "grad_norm": 2.1671667098999023, + "learning_rate": 4.427617448495772e-06, + "loss": 0.5747478008270264, + "mean_token_accuracy": 0.7842930555343628, + "num_tokens": 9238479.0, + "step": 1023 + }, + { + "epoch": 0.7781155015197568, + "grad_norm": 1.6299028396606445, + "learning_rate": 4.426283106939474e-06, + "loss": 0.39478403329849243, + "mean_token_accuracy": 0.8685503602027893, + "num_tokens": 9248263.0, + "step": 1024 + }, + { + "epoch": 0.7788753799392097, + "grad_norm": 2.2621798515319824, + "learning_rate": 4.424947413443539e-06, + "loss": 0.4582178592681885, + "mean_token_accuracy": 0.8312377333641052, + "num_tokens": 9254168.0, + "step": 1025 + }, + { + "epoch": 0.7796352583586627, + "grad_norm": 2.121091365814209, + "learning_rate": 4.423610368945411e-06, + "loss": 0.5315121412277222, + "mean_token_accuracy": 0.8121483325958252, + "num_tokens": 9261808.0, + "step": 1026 + }, + { + "epoch": 0.7803951367781155, + "grad_norm": 1.8558297157287598, + "learning_rate": 4.422271974383479e-06, + "loss": 0.4299176037311554, + "mean_token_accuracy": 0.8452648520469666, + "num_tokens": 9269264.0, + "step": 1027 + }, + { + "epoch": 0.7811550151975684, + "grad_norm": 1.9089949131011963, + "learning_rate": 4.420932230697079e-06, + "loss": 0.43876272439956665, + "mean_token_accuracy": 0.8434094190597534, + "num_tokens": 9277381.0, + "step": 1028 + }, + { + "epoch": 0.7819148936170213, + "grad_norm": 1.8619649410247803, + "learning_rate": 4.419591138826495e-06, + "loss": 0.48798668384552, + "mean_token_accuracy": 0.8281317353248596, + "num_tokens": 9285413.0, + "step": 1029 + }, + { + "epoch": 0.7826747720364742, + "grad_norm": 1.3273087739944458, + "learning_rate": 4.418248699712955e-06, + "loss": 0.4611460864543915, + "mean_token_accuracy": 0.8233213424682617, + "num_tokens": 9300805.0, + "step": 1030 + }, + { + "epoch": 0.7834346504559271, + "grad_norm": 1.0473746061325073, + "learning_rate": 4.416904914298637e-06, + "loss": 0.36537665128707886, + "mean_token_accuracy": 0.8671857118606567, + "num_tokens": 9320035.0, + "step": 1031 + }, + { + "epoch": 0.78419452887538, + "grad_norm": 1.9130918979644775, + "learning_rate": 4.415559783526661e-06, + "loss": 0.4916655123233795, + "mean_token_accuracy": 0.8266351222991943, + "num_tokens": 9326795.0, + "step": 1032 + }, + { + "epoch": 0.7849544072948328, + "grad_norm": 2.0001816749572754, + "learning_rate": 4.414213308341092e-06, + "loss": 0.5711008310317993, + "mean_token_accuracy": 0.8093076348304749, + "num_tokens": 9335625.0, + "step": 1033 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 3.933542251586914, + "learning_rate": 4.412865489686936e-06, + "loss": 0.621616542339325, + "mean_token_accuracy": 0.7938898801803589, + "num_tokens": 9339080.0, + "step": 1034 + }, + { + "epoch": 0.7864741641337386, + "grad_norm": 2.061558961868286, + "learning_rate": 4.411516328510145e-06, + "loss": 0.583686113357544, + "mean_token_accuracy": 0.8216883540153503, + "num_tokens": 9348581.0, + "step": 1035 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 1.9401264190673828, + "learning_rate": 4.410165825757613e-06, + "loss": 0.4905240535736084, + "mean_token_accuracy": 0.8229951858520508, + "num_tokens": 9356032.0, + "step": 1036 + }, + { + "epoch": 0.7879939209726444, + "grad_norm": 3.620547294616699, + "learning_rate": 4.408813982377175e-06, + "loss": 0.4269888997077942, + "mean_token_accuracy": 0.8713940978050232, + "num_tokens": 9359061.0, + "step": 1037 + }, + { + "epoch": 0.7887537993920972, + "grad_norm": 1.2027851343154907, + "learning_rate": 4.407460799317605e-06, + "loss": 0.39972418546676636, + "mean_token_accuracy": 0.8610097765922546, + "num_tokens": 9377068.0, + "step": 1038 + }, + { + "epoch": 0.7895136778115501, + "grad_norm": 2.566753387451172, + "learning_rate": 4.40610627752862e-06, + "loss": 0.45267152786254883, + "mean_token_accuracy": 0.83243328332901, + "num_tokens": 9383604.0, + "step": 1039 + }, + { + "epoch": 0.790273556231003, + "grad_norm": 2.940094470977783, + "learning_rate": 4.404750417960876e-06, + "loss": 0.42862242460250854, + "mean_token_accuracy": 0.8582849502563477, + "num_tokens": 9387541.0, + "step": 1040 + }, + { + "epoch": 0.791033434650456, + "grad_norm": 2.0223944187164307, + "learning_rate": 4.403393221565966e-06, + "loss": 0.4349963665008545, + "mean_token_accuracy": 0.8453047871589661, + "num_tokens": 9394382.0, + "step": 1041 + }, + { + "epoch": 0.7917933130699089, + "grad_norm": 2.9399030208587646, + "learning_rate": 4.402034689296425e-06, + "loss": 0.32197174429893494, + "mean_token_accuracy": 0.8953392505645752, + "num_tokens": 9397741.0, + "step": 1042 + }, + { + "epoch": 0.7925531914893617, + "grad_norm": 2.819016456604004, + "learning_rate": 4.400674822105721e-06, + "loss": 0.6790289878845215, + "mean_token_accuracy": 0.8135063648223877, + "num_tokens": 9403509.0, + "step": 1043 + }, + { + "epoch": 0.7933130699088146, + "grad_norm": 1.3225977420806885, + "learning_rate": 4.399313620948262e-06, + "loss": 0.42203834652900696, + "mean_token_accuracy": 0.8399381637573242, + "num_tokens": 9418870.0, + "step": 1044 + }, + { + "epoch": 0.7940729483282675, + "grad_norm": 1.7822176218032837, + "learning_rate": 4.397951086779392e-06, + "loss": 0.4666554927825928, + "mean_token_accuracy": 0.8364764451980591, + "num_tokens": 9427640.0, + "step": 1045 + }, + { + "epoch": 0.7948328267477204, + "grad_norm": 3.186439037322998, + "learning_rate": 4.396587220555389e-06, + "loss": 0.6048363447189331, + "mean_token_accuracy": 0.7806557416915894, + "num_tokens": 9431927.0, + "step": 1046 + }, + { + "epoch": 0.7955927051671733, + "grad_norm": 3.0804805755615234, + "learning_rate": 4.395222023233467e-06, + "loss": 0.445969820022583, + "mean_token_accuracy": 0.850671112537384, + "num_tokens": 9436136.0, + "step": 1047 + }, + { + "epoch": 0.7963525835866262, + "grad_norm": 1.675968885421753, + "learning_rate": 4.393855495771774e-06, + "loss": 0.4311422109603882, + "mean_token_accuracy": 0.8449079990386963, + "num_tokens": 9445189.0, + "step": 1048 + }, + { + "epoch": 0.797112462006079, + "grad_norm": 2.342410087585449, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.5733606219291687, + "mean_token_accuracy": 0.8156592845916748, + "num_tokens": 9451939.0, + "step": 1049 + }, + { + "epoch": 0.7978723404255319, + "grad_norm": 1.5967470407485962, + "learning_rate": 4.391118454266335e-06, + "loss": 0.46664729714393616, + "mean_token_accuracy": 0.8091695308685303, + "num_tokens": 9463968.0, + "step": 1050 + }, + { + "epoch": 0.7986322188449848, + "grad_norm": 1.5777863264083862, + "learning_rate": 4.389747942143549e-06, + "loss": 0.46028903126716614, + "mean_token_accuracy": 0.8347330093383789, + "num_tokens": 9475561.0, + "step": 1051 + }, + { + "epoch": 0.7993920972644377, + "grad_norm": 2.7630488872528076, + "learning_rate": 4.388376103722914e-06, + "loss": 0.5618188977241516, + "mean_token_accuracy": 0.8273467421531677, + "num_tokens": 9480661.0, + "step": 1052 + }, + { + "epoch": 0.8001519756838906, + "grad_norm": 2.093397378921509, + "learning_rate": 4.387002939967237e-06, + "loss": 0.2998353838920593, + "mean_token_accuracy": 0.8905231952667236, + "num_tokens": 9485924.0, + "step": 1053 + }, + { + "epoch": 0.8009118541033434, + "grad_norm": 1.4385871887207031, + "learning_rate": 4.38562845184026e-06, + "loss": 0.4944111704826355, + "mean_token_accuracy": 0.8403056263923645, + "num_tokens": 9500128.0, + "step": 1054 + }, + { + "epoch": 0.8016717325227963, + "grad_norm": 1.6393156051635742, + "learning_rate": 4.384252640306649e-06, + "loss": 0.5727907419204712, + "mean_token_accuracy": 0.7849414348602295, + "num_tokens": 9511569.0, + "step": 1055 + }, + { + "epoch": 0.8024316109422492, + "grad_norm": 2.3909664154052734, + "learning_rate": 4.382875506332002e-06, + "loss": 0.4760419726371765, + "mean_token_accuracy": 0.8408266305923462, + "num_tokens": 9517244.0, + "step": 1056 + }, + { + "epoch": 0.8031914893617021, + "grad_norm": 1.7288594245910645, + "learning_rate": 4.381497050882845e-06, + "loss": 0.5375926494598389, + "mean_token_accuracy": 0.8138614892959595, + "num_tokens": 9528736.0, + "step": 1057 + }, + { + "epoch": 0.8039513677811551, + "grad_norm": 2.093407392501831, + "learning_rate": 4.380117274926632e-06, + "loss": 0.46659404039382935, + "mean_token_accuracy": 0.8450702428817749, + "num_tokens": 9536200.0, + "step": 1058 + }, + { + "epoch": 0.8047112462006079, + "grad_norm": 1.6835898160934448, + "learning_rate": 4.3787361794317405e-06, + "loss": 0.43157699704170227, + "mean_token_accuracy": 0.8279973268508911, + "num_tokens": 9546314.0, + "step": 1059 + }, + { + "epoch": 0.8054711246200608, + "grad_norm": 1.983067512512207, + "learning_rate": 4.377353765367479e-06, + "loss": 0.5021739602088928, + "mean_token_accuracy": 0.8274815082550049, + "num_tokens": 9554375.0, + "step": 1060 + }, + { + "epoch": 0.8062310030395137, + "grad_norm": 2.0472030639648438, + "learning_rate": 4.375970033704078e-06, + "loss": 0.34298190474510193, + "mean_token_accuracy": 0.8900876045227051, + "num_tokens": 9560230.0, + "step": 1061 + }, + { + "epoch": 0.8069908814589666, + "grad_norm": 1.9613717794418335, + "learning_rate": 4.374584985412692e-06, + "loss": 0.3826758861541748, + "mean_token_accuracy": 0.839923620223999, + "num_tokens": 9566809.0, + "step": 1062 + }, + { + "epoch": 0.8077507598784195, + "grad_norm": 1.991289496421814, + "learning_rate": 4.373198621465405e-06, + "loss": 0.5492525100708008, + "mean_token_accuracy": 0.8153272867202759, + "num_tokens": 9576810.0, + "step": 1063 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.421370506286621, + "learning_rate": 4.3718109428352155e-06, + "loss": 0.5240297317504883, + "mean_token_accuracy": 0.8087242245674133, + "num_tokens": 9582906.0, + "step": 1064 + }, + { + "epoch": 0.8092705167173252, + "grad_norm": 3.697765588760376, + "learning_rate": 4.370421950496055e-06, + "loss": 0.6096476912498474, + "mean_token_accuracy": 0.787585973739624, + "num_tokens": 9586920.0, + "step": 1065 + }, + { + "epoch": 0.8100303951367781, + "grad_norm": 2.0767786502838135, + "learning_rate": 4.369031645422768e-06, + "loss": 0.41120079159736633, + "mean_token_accuracy": 0.8513731956481934, + "num_tokens": 9593902.0, + "step": 1066 + }, + { + "epoch": 0.810790273556231, + "grad_norm": 2.5968732833862305, + "learning_rate": 4.367640028591126e-06, + "loss": 0.3364982008934021, + "mean_token_accuracy": 0.8786963224411011, + "num_tokens": 9597745.0, + "step": 1067 + }, + { + "epoch": 0.8115501519756839, + "grad_norm": 2.165742874145508, + "learning_rate": 4.366247100977818e-06, + "loss": 0.406129390001297, + "mean_token_accuracy": 0.868243932723999, + "num_tokens": 9603496.0, + "step": 1068 + }, + { + "epoch": 0.8123100303951368, + "grad_norm": 2.0493404865264893, + "learning_rate": 4.364852863560456e-06, + "loss": 0.5356296300888062, + "mean_token_accuracy": 0.8191947340965271, + "num_tokens": 9610898.0, + "step": 1069 + }, + { + "epoch": 0.8130699088145896, + "grad_norm": 2.3224308490753174, + "learning_rate": 4.363457317317568e-06, + "loss": 0.41461923718452454, + "mean_token_accuracy": 0.8537945747375488, + "num_tokens": 9616626.0, + "step": 1070 + }, + { + "epoch": 0.8138297872340425, + "grad_norm": 1.7387986183166504, + "learning_rate": 4.362060463228603e-06, + "loss": 0.5134786367416382, + "mean_token_accuracy": 0.8511737585067749, + "num_tokens": 9626223.0, + "step": 1071 + }, + { + "epoch": 0.8145896656534954, + "grad_norm": 3.0270655155181885, + "learning_rate": 4.360662302273926e-06, + "loss": 0.3410695791244507, + "mean_token_accuracy": 0.8746449947357178, + "num_tokens": 9629455.0, + "step": 1072 + }, + { + "epoch": 0.8153495440729484, + "grad_norm": 1.7727062702178955, + "learning_rate": 4.35926283543482e-06, + "loss": 0.4610968828201294, + "mean_token_accuracy": 0.8444793224334717, + "num_tokens": 9638070.0, + "step": 1073 + }, + { + "epoch": 0.8161094224924013, + "grad_norm": 3.6333565711975098, + "learning_rate": 4.357862063693486e-06, + "loss": 0.3881273865699768, + "mean_token_accuracy": 0.8757344484329224, + "num_tokens": 9641028.0, + "step": 1074 + }, + { + "epoch": 0.8168693009118541, + "grad_norm": 3.024042844772339, + "learning_rate": 4.356459988033039e-06, + "loss": 0.3853808641433716, + "mean_token_accuracy": 0.8602254390716553, + "num_tokens": 9645730.0, + "step": 1075 + }, + { + "epoch": 0.817629179331307, + "grad_norm": 2.3359482288360596, + "learning_rate": 4.355056609437509e-06, + "loss": 0.4852045476436615, + "mean_token_accuracy": 0.8502728343009949, + "num_tokens": 9650975.0, + "step": 1076 + }, + { + "epoch": 0.8183890577507599, + "grad_norm": 2.2390685081481934, + "learning_rate": 4.353651928891842e-06, + "loss": 0.5287341475486755, + "mean_token_accuracy": 0.8247801065444946, + "num_tokens": 9657471.0, + "step": 1077 + }, + { + "epoch": 0.8191489361702128, + "grad_norm": 2.3809144496917725, + "learning_rate": 4.352245947381897e-06, + "loss": 0.5218510627746582, + "mean_token_accuracy": 0.8149170875549316, + "num_tokens": 9664108.0, + "step": 1078 + }, + { + "epoch": 0.8199088145896657, + "grad_norm": 1.7072309255599976, + "learning_rate": 4.3508386658944455e-06, + "loss": 0.46481168270111084, + "mean_token_accuracy": 0.834963321685791, + "num_tokens": 9673175.0, + "step": 1079 + }, + { + "epoch": 0.8206686930091185, + "grad_norm": 1.7383702993392944, + "learning_rate": 4.349430085417171e-06, + "loss": 0.4505952000617981, + "mean_token_accuracy": 0.8507769107818604, + "num_tokens": 9682800.0, + "step": 1080 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 2.4308547973632812, + "learning_rate": 4.348020206938672e-06, + "loss": 0.4832455515861511, + "mean_token_accuracy": 0.8538393974304199, + "num_tokens": 9688123.0, + "step": 1081 + }, + { + "epoch": 0.8221884498480243, + "grad_norm": 2.2686192989349365, + "learning_rate": 4.3466090314484526e-06, + "loss": 0.5112563371658325, + "mean_token_accuracy": 0.8308460712432861, + "num_tokens": 9694299.0, + "step": 1082 + }, + { + "epoch": 0.8229483282674772, + "grad_norm": 2.806093454360962, + "learning_rate": 4.345196559936931e-06, + "loss": 0.4818246364593506, + "mean_token_accuracy": 0.86617112159729, + "num_tokens": 9698471.0, + "step": 1083 + }, + { + "epoch": 0.8237082066869301, + "grad_norm": 1.7340706586837769, + "learning_rate": 4.343782793395435e-06, + "loss": 0.38246971368789673, + "mean_token_accuracy": 0.8675198554992676, + "num_tokens": 9706444.0, + "step": 1084 + }, + { + "epoch": 0.824468085106383, + "grad_norm": 1.664942741394043, + "learning_rate": 4.3423677328162e-06, + "loss": 0.498797208070755, + "mean_token_accuracy": 0.8447319865226746, + "num_tokens": 9716765.0, + "step": 1085 + }, + { + "epoch": 0.8252279635258358, + "grad_norm": 1.3608235120773315, + "learning_rate": 4.340951379192369e-06, + "loss": 0.41961491107940674, + "mean_token_accuracy": 0.8339346647262573, + "num_tokens": 9729564.0, + "step": 1086 + }, + { + "epoch": 0.8259878419452887, + "grad_norm": 1.642503261566162, + "learning_rate": 4.3395337335179945e-06, + "loss": 0.5477945804595947, + "mean_token_accuracy": 0.8117889761924744, + "num_tokens": 9741217.0, + "step": 1087 + }, + { + "epoch": 0.8267477203647416, + "grad_norm": 3.0345044136047363, + "learning_rate": 4.338114796788035e-06, + "loss": 0.5024623870849609, + "mean_token_accuracy": 0.8333141207695007, + "num_tokens": 9744941.0, + "step": 1088 + }, + { + "epoch": 0.8275075987841946, + "grad_norm": 1.3096630573272705, + "learning_rate": 4.336694569998354e-06, + "loss": 0.44169723987579346, + "mean_token_accuracy": 0.859926700592041, + "num_tokens": 9757854.0, + "step": 1089 + }, + { + "epoch": 0.8282674772036475, + "grad_norm": 2.203279495239258, + "learning_rate": 4.3352730541457215e-06, + "loss": 0.5283265113830566, + "mean_token_accuracy": 0.8053759932518005, + "num_tokens": 9764096.0, + "step": 1090 + }, + { + "epoch": 0.8290273556231003, + "grad_norm": 1.3774312734603882, + "learning_rate": 4.333850250227814e-06, + "loss": 0.4584103226661682, + "mean_token_accuracy": 0.8342611193656921, + "num_tokens": 9777768.0, + "step": 1091 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 1.822637915611267, + "learning_rate": 4.332426159243206e-06, + "loss": 0.5432791709899902, + "mean_token_accuracy": 0.8136210441589355, + "num_tokens": 9791276.0, + "step": 1092 + }, + { + "epoch": 0.8305471124620061, + "grad_norm": 3.0190067291259766, + "learning_rate": 4.331000782191384e-06, + "loss": 0.5018150806427002, + "mean_token_accuracy": 0.8234807252883911, + "num_tokens": 9794902.0, + "step": 1093 + }, + { + "epoch": 0.831306990881459, + "grad_norm": 2.09987735748291, + "learning_rate": 4.329574120072728e-06, + "loss": 0.4270891547203064, + "mean_token_accuracy": 0.8544977903366089, + "num_tokens": 9800903.0, + "step": 1094 + }, + { + "epoch": 0.8320668693009119, + "grad_norm": 1.969549536705017, + "learning_rate": 4.328146173888528e-06, + "loss": 0.45801427960395813, + "mean_token_accuracy": 0.8334714770317078, + "num_tokens": 9808719.0, + "step": 1095 + }, + { + "epoch": 0.8328267477203647, + "grad_norm": 1.4565571546554565, + "learning_rate": 4.32671694464097e-06, + "loss": 0.34864288568496704, + "mean_token_accuracy": 0.8689061999320984, + "num_tokens": 9818262.0, + "step": 1096 + }, + { + "epoch": 0.8335866261398176, + "grad_norm": 1.2163832187652588, + "learning_rate": 4.3252864333331424e-06, + "loss": 0.37953704595565796, + "mean_token_accuracy": 0.866554856300354, + "num_tokens": 9833942.0, + "step": 1097 + }, + { + "epoch": 0.8343465045592705, + "grad_norm": 1.6112010478973389, + "learning_rate": 4.323854640969033e-06, + "loss": 0.5442801713943481, + "mean_token_accuracy": 0.8190416097640991, + "num_tokens": 9844765.0, + "step": 1098 + }, + { + "epoch": 0.8351063829787234, + "grad_norm": 1.8190315961837769, + "learning_rate": 4.322421568553529e-06, + "loss": 0.48271381855010986, + "mean_token_accuracy": 0.8203652501106262, + "num_tokens": 9852625.0, + "step": 1099 + }, + { + "epoch": 0.8358662613981763, + "grad_norm": 2.7897756099700928, + "learning_rate": 4.320987217092416e-06, + "loss": 0.4086323380470276, + "mean_token_accuracy": 0.8504934310913086, + "num_tokens": 9856888.0, + "step": 1100 + }, + { + "epoch": 0.8366261398176292, + "grad_norm": 1.7035977840423584, + "learning_rate": 4.319551587592377e-06, + "loss": 0.6325064301490784, + "mean_token_accuracy": 0.788190484046936, + "num_tokens": 9869419.0, + "step": 1101 + }, + { + "epoch": 0.837386018237082, + "grad_norm": 2.609731912612915, + "learning_rate": 4.318114681060989e-06, + "loss": 0.519314706325531, + "mean_token_accuracy": 0.8469992280006409, + "num_tokens": 9874553.0, + "step": 1102 + }, + { + "epoch": 0.8381458966565349, + "grad_norm": 1.2519766092300415, + "learning_rate": 4.316676498506735e-06, + "loss": 0.3566005825996399, + "mean_token_accuracy": 0.8588439226150513, + "num_tokens": 9886498.0, + "step": 1103 + }, + { + "epoch": 0.8389057750759878, + "grad_norm": 1.430892825126648, + "learning_rate": 4.3152370409389795e-06, + "loss": 0.5250182747840881, + "mean_token_accuracy": 0.8164948225021362, + "num_tokens": 9900256.0, + "step": 1104 + }, + { + "epoch": 0.8396656534954408, + "grad_norm": 3.1245436668395996, + "learning_rate": 4.3137963093679945e-06, + "loss": 0.3173971176147461, + "mean_token_accuracy": 0.8835347890853882, + "num_tokens": 9903899.0, + "step": 1105 + }, + { + "epoch": 0.8404255319148937, + "grad_norm": 3.131812572479248, + "learning_rate": 4.3123543048049395e-06, + "loss": 0.6567763090133667, + "mean_token_accuracy": 0.8233605027198792, + "num_tokens": 9908798.0, + "step": 1106 + }, + { + "epoch": 0.8411854103343465, + "grad_norm": 1.3551725149154663, + "learning_rate": 4.310911028261867e-06, + "loss": 0.3993729054927826, + "mean_token_accuracy": 0.8529655933380127, + "num_tokens": 9922577.0, + "step": 1107 + }, + { + "epoch": 0.8419452887537994, + "grad_norm": 2.572533130645752, + "learning_rate": 4.309466480751726e-06, + "loss": 0.40906503796577454, + "mean_token_accuracy": 0.8630726933479309, + "num_tokens": 9926890.0, + "step": 1108 + }, + { + "epoch": 0.8427051671732523, + "grad_norm": 1.9146469831466675, + "learning_rate": 4.308020663288356e-06, + "loss": 0.48423194885253906, + "mean_token_accuracy": 0.8370280861854553, + "num_tokens": 9934293.0, + "step": 1109 + }, + { + "epoch": 0.8434650455927052, + "grad_norm": 1.6178001165390015, + "learning_rate": 4.306573576886485e-06, + "loss": 0.4262213408946991, + "mean_token_accuracy": 0.839401125907898, + "num_tokens": 9944513.0, + "step": 1110 + }, + { + "epoch": 0.8442249240121581, + "grad_norm": 2.4444572925567627, + "learning_rate": 4.305125222561736e-06, + "loss": 0.5199950933456421, + "mean_token_accuracy": 0.8507720232009888, + "num_tokens": 9949512.0, + "step": 1111 + }, + { + "epoch": 0.8449848024316109, + "grad_norm": 1.7983134984970093, + "learning_rate": 4.303675601330618e-06, + "loss": 0.36155956983566284, + "mean_token_accuracy": 0.8568712472915649, + "num_tokens": 9956402.0, + "step": 1112 + }, + { + "epoch": 0.8457446808510638, + "grad_norm": 2.391096353530884, + "learning_rate": 4.302224714210532e-06, + "loss": 0.5391949415206909, + "mean_token_accuracy": 0.8183057308197021, + "num_tokens": 9961606.0, + "step": 1113 + }, + { + "epoch": 0.8465045592705167, + "grad_norm": 1.8520214557647705, + "learning_rate": 4.3007725622197675e-06, + "loss": 0.5758882761001587, + "mean_token_accuracy": 0.7924330234527588, + "num_tokens": 9971473.0, + "step": 1114 + }, + { + "epoch": 0.8472644376899696, + "grad_norm": 2.436640739440918, + "learning_rate": 4.2993191463775e-06, + "loss": 0.3837985396385193, + "mean_token_accuracy": 0.8620110750198364, + "num_tokens": 9976333.0, + "step": 1115 + }, + { + "epoch": 0.8480243161094225, + "grad_norm": 1.7287120819091797, + "learning_rate": 4.29786446770379e-06, + "loss": 0.40066856145858765, + "mean_token_accuracy": 0.8618333339691162, + "num_tokens": 9985617.0, + "step": 1116 + }, + { + "epoch": 0.8487841945288754, + "grad_norm": 2.0310518741607666, + "learning_rate": 4.296408527219592e-06, + "loss": 0.5465943217277527, + "mean_token_accuracy": 0.812044620513916, + "num_tokens": 9995363.0, + "step": 1117 + }, + { + "epoch": 0.8495440729483282, + "grad_norm": 1.4858589172363281, + "learning_rate": 4.294951325946737e-06, + "loss": 0.45840176939964294, + "mean_token_accuracy": 0.8432979583740234, + "num_tokens": 10006400.0, + "step": 1118 + }, + { + "epoch": 0.8503039513677811, + "grad_norm": 1.6153514385223389, + "learning_rate": 4.293492864907947e-06, + "loss": 0.5225611925125122, + "mean_token_accuracy": 0.8180211186408997, + "num_tokens": 10018352.0, + "step": 1119 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.1178412437438965, + "learning_rate": 4.2920331451268246e-06, + "loss": 0.5580621361732483, + "mean_token_accuracy": 0.8211709260940552, + "num_tokens": 10025614.0, + "step": 1120 + }, + { + "epoch": 0.851823708206687, + "grad_norm": 2.036839246749878, + "learning_rate": 4.2905721676278585e-06, + "loss": 0.4658433198928833, + "mean_token_accuracy": 0.8380423784255981, + "num_tokens": 10032489.0, + "step": 1121 + }, + { + "epoch": 0.8525835866261399, + "grad_norm": 2.0056262016296387, + "learning_rate": 4.28910993343642e-06, + "loss": 0.47023308277130127, + "mean_token_accuracy": 0.8340359926223755, + "num_tokens": 10040050.0, + "step": 1122 + }, + { + "epoch": 0.8533434650455927, + "grad_norm": 2.540024518966675, + "learning_rate": 4.2876464435787576e-06, + "loss": 0.502303957939148, + "mean_token_accuracy": 0.8288739919662476, + "num_tokens": 10045042.0, + "step": 1123 + }, + { + "epoch": 0.8541033434650456, + "grad_norm": 1.7894693613052368, + "learning_rate": 4.286181699082008e-06, + "loss": 0.4732973575592041, + "mean_token_accuracy": 0.8340568542480469, + "num_tokens": 10054424.0, + "step": 1124 + }, + { + "epoch": 0.8548632218844985, + "grad_norm": 1.5601223707199097, + "learning_rate": 4.284715700974186e-06, + "loss": 0.472471684217453, + "mean_token_accuracy": 0.8274722695350647, + "num_tokens": 10065523.0, + "step": 1125 + }, + { + "epoch": 0.8556231003039514, + "grad_norm": 1.7326055765151978, + "learning_rate": 4.283248450284182e-06, + "loss": 0.5924872159957886, + "mean_token_accuracy": 0.7943467497825623, + "num_tokens": 10076839.0, + "step": 1126 + }, + { + "epoch": 0.8563829787234043, + "grad_norm": 1.5165479183197021, + "learning_rate": 4.281779948041772e-06, + "loss": 0.44768425822257996, + "mean_token_accuracy": 0.8394696712493896, + "num_tokens": 10088168.0, + "step": 1127 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.5448920726776123, + "learning_rate": 4.280310195277606e-06, + "loss": 0.4458175003528595, + "mean_token_accuracy": 0.835773229598999, + "num_tokens": 10100306.0, + "step": 1128 + }, + { + "epoch": 0.85790273556231, + "grad_norm": 1.6311609745025635, + "learning_rate": 4.278839193023214e-06, + "loss": 0.4158072769641876, + "mean_token_accuracy": 0.8482539653778076, + "num_tokens": 10110581.0, + "step": 1129 + }, + { + "epoch": 0.8586626139817629, + "grad_norm": 1.6714754104614258, + "learning_rate": 4.277366942311001e-06, + "loss": 0.3686875104904175, + "mean_token_accuracy": 0.8681533336639404, + "num_tokens": 10118799.0, + "step": 1130 + }, + { + "epoch": 0.8594224924012158, + "grad_norm": 2.1604413986206055, + "learning_rate": 4.2758934441742494e-06, + "loss": 0.37267982959747314, + "mean_token_accuracy": 0.8520427346229553, + "num_tokens": 10124734.0, + "step": 1131 + }, + { + "epoch": 0.8601823708206687, + "grad_norm": 2.123013973236084, + "learning_rate": 4.274418699647117e-06, + "loss": 0.49963313341140747, + "mean_token_accuracy": 0.8248758912086487, + "num_tokens": 10131965.0, + "step": 1132 + }, + { + "epoch": 0.8609422492401215, + "grad_norm": 1.4308786392211914, + "learning_rate": 4.272942709764638e-06, + "loss": 0.48666873574256897, + "mean_token_accuracy": 0.8304717540740967, + "num_tokens": 10145164.0, + "step": 1133 + }, + { + "epoch": 0.8617021276595744, + "grad_norm": 1.7952618598937988, + "learning_rate": 4.271465475562716e-06, + "loss": 0.5536223649978638, + "mean_token_accuracy": 0.8093959093093872, + "num_tokens": 10154083.0, + "step": 1134 + }, + { + "epoch": 0.8624620060790273, + "grad_norm": 2.0622456073760986, + "learning_rate": 4.269986998078132e-06, + "loss": 0.5173629522323608, + "mean_token_accuracy": 0.8285619020462036, + "num_tokens": 10161889.0, + "step": 1135 + }, + { + "epoch": 0.8632218844984803, + "grad_norm": 2.0707509517669678, + "learning_rate": 4.268507278348539e-06, + "loss": 0.5871608257293701, + "mean_token_accuracy": 0.7827386856079102, + "num_tokens": 10170726.0, + "step": 1136 + }, + { + "epoch": 0.8639817629179332, + "grad_norm": 2.054368257522583, + "learning_rate": 4.2670263174124615e-06, + "loss": 0.5788969993591309, + "mean_token_accuracy": 0.7967237234115601, + "num_tokens": 10178474.0, + "step": 1137 + }, + { + "epoch": 0.8647416413373861, + "grad_norm": 1.901846170425415, + "learning_rate": 4.265544116309294e-06, + "loss": 0.5405587553977966, + "mean_token_accuracy": 0.8151819705963135, + "num_tokens": 10187013.0, + "step": 1138 + }, + { + "epoch": 0.8655015197568389, + "grad_norm": 2.901285409927368, + "learning_rate": 4.264060676079302e-06, + "loss": 0.44101861119270325, + "mean_token_accuracy": 0.8433429002761841, + "num_tokens": 10191517.0, + "step": 1139 + }, + { + "epoch": 0.8662613981762918, + "grad_norm": 2.4168388843536377, + "learning_rate": 4.262575997763622e-06, + "loss": 0.4686204195022583, + "mean_token_accuracy": 0.8505309820175171, + "num_tokens": 10196948.0, + "step": 1140 + }, + { + "epoch": 0.8670212765957447, + "grad_norm": 1.9588396549224854, + "learning_rate": 4.2610900824042575e-06, + "loss": 0.47056013345718384, + "mean_token_accuracy": 0.8280024528503418, + "num_tokens": 10204292.0, + "step": 1141 + }, + { + "epoch": 0.8677811550151976, + "grad_norm": 2.569150924682617, + "learning_rate": 4.2596029310440826e-06, + "loss": 0.573108434677124, + "mean_token_accuracy": 0.8108246326446533, + "num_tokens": 10209571.0, + "step": 1142 + }, + { + "epoch": 0.8685410334346505, + "grad_norm": 2.038032293319702, + "learning_rate": 4.258114544726835e-06, + "loss": 0.40545332431793213, + "mean_token_accuracy": 0.8611703515052795, + "num_tokens": 10215716.0, + "step": 1143 + }, + { + "epoch": 0.8693009118541033, + "grad_norm": 1.9884231090545654, + "learning_rate": 4.256624924497124e-06, + "loss": 0.40085992217063904, + "mean_token_accuracy": 0.8615031242370605, + "num_tokens": 10222775.0, + "step": 1144 + }, + { + "epoch": 0.8700607902735562, + "grad_norm": 1.912842035293579, + "learning_rate": 4.25513407140042e-06, + "loss": 0.41022324562072754, + "mean_token_accuracy": 0.8459607362747192, + "num_tokens": 10229589.0, + "step": 1145 + }, + { + "epoch": 0.8708206686930091, + "grad_norm": 1.9190576076507568, + "learning_rate": 4.253641986483063e-06, + "loss": 0.5541447401046753, + "mean_token_accuracy": 0.8256468772888184, + "num_tokens": 10240633.0, + "step": 1146 + }, + { + "epoch": 0.871580547112462, + "grad_norm": 1.3742294311523438, + "learning_rate": 4.2521486707922545e-06, + "loss": 0.3680543899536133, + "mean_token_accuracy": 0.8654477596282959, + "num_tokens": 10251252.0, + "step": 1147 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 1.4438525438308716, + "learning_rate": 4.250654125376062e-06, + "loss": 0.45830875635147095, + "mean_token_accuracy": 0.8433834314346313, + "num_tokens": 10263980.0, + "step": 1148 + }, + { + "epoch": 0.8731003039513677, + "grad_norm": 2.1273653507232666, + "learning_rate": 4.249158351283414e-06, + "loss": 0.4129376709461212, + "mean_token_accuracy": 0.861556351184845, + "num_tokens": 10270426.0, + "step": 1149 + }, + { + "epoch": 0.8738601823708206, + "grad_norm": 2.598440647125244, + "learning_rate": 4.247661349564103e-06, + "loss": 0.418030709028244, + "mean_token_accuracy": 0.86553955078125, + "num_tokens": 10275493.0, + "step": 1150 + }, + { + "epoch": 0.8746200607902735, + "grad_norm": 1.6852490901947021, + "learning_rate": 4.246163121268782e-06, + "loss": 0.6403408050537109, + "mean_token_accuracy": 0.7966094017028809, + "num_tokens": 10287989.0, + "step": 1151 + }, + { + "epoch": 0.8753799392097265, + "grad_norm": 2.5013794898986816, + "learning_rate": 4.244663667448965e-06, + "loss": 0.49922505021095276, + "mean_token_accuracy": 0.8318735361099243, + "num_tokens": 10293360.0, + "step": 1152 + }, + { + "epoch": 0.8761398176291794, + "grad_norm": 1.2022709846496582, + "learning_rate": 4.243162989157027e-06, + "loss": 0.4414965510368347, + "mean_token_accuracy": 0.8338693380355835, + "num_tokens": 10310558.0, + "step": 1153 + }, + { + "epoch": 0.8768996960486323, + "grad_norm": 1.9903281927108765, + "learning_rate": 4.241661087446202e-06, + "loss": 0.4277610778808594, + "mean_token_accuracy": 0.8560749292373657, + "num_tokens": 10316983.0, + "step": 1154 + }, + { + "epoch": 0.8776595744680851, + "grad_norm": 2.104923725128174, + "learning_rate": 4.240157963370583e-06, + "loss": 0.44431713223457336, + "mean_token_accuracy": 0.8785282969474792, + "num_tokens": 10323294.0, + "step": 1155 + }, + { + "epoch": 0.878419452887538, + "grad_norm": 2.8364813327789307, + "learning_rate": 4.2386536179851175e-06, + "loss": 0.49948397278785706, + "mean_token_accuracy": 0.8305255174636841, + "num_tokens": 10327662.0, + "step": 1156 + }, + { + "epoch": 0.8791793313069909, + "grad_norm": 1.9493682384490967, + "learning_rate": 4.2371480523456156e-06, + "loss": 0.45867404341697693, + "mean_token_accuracy": 0.8373264074325562, + "num_tokens": 10335699.0, + "step": 1157 + }, + { + "epoch": 0.8799392097264438, + "grad_norm": 2.268616199493408, + "learning_rate": 4.235641267508741e-06, + "loss": 0.4547857940196991, + "mean_token_accuracy": 0.8252766132354736, + "num_tokens": 10342464.0, + "step": 1158 + }, + { + "epoch": 0.8806990881458967, + "grad_norm": 2.1334283351898193, + "learning_rate": 4.234133264532012e-06, + "loss": 0.39503124356269836, + "mean_token_accuracy": 0.8648351430892944, + "num_tokens": 10347514.0, + "step": 1159 + }, + { + "epoch": 0.8814589665653495, + "grad_norm": 1.2775357961654663, + "learning_rate": 4.232624044473805e-06, + "loss": 0.39945733547210693, + "mean_token_accuracy": 0.8369829654693604, + "num_tokens": 10363316.0, + "step": 1160 + }, + { + "epoch": 0.8822188449848024, + "grad_norm": 2.458413600921631, + "learning_rate": 4.231113608393348e-06, + "loss": 0.5020045638084412, + "mean_token_accuracy": 0.8295938968658447, + "num_tokens": 10368401.0, + "step": 1161 + }, + { + "epoch": 0.8829787234042553, + "grad_norm": 1.7464948892593384, + "learning_rate": 4.229601957350722e-06, + "loss": 0.5335392951965332, + "mean_token_accuracy": 0.8134858012199402, + "num_tokens": 10378337.0, + "step": 1162 + }, + { + "epoch": 0.8837386018237082, + "grad_norm": 3.1152119636535645, + "learning_rate": 4.228089092406863e-06, + "loss": 0.4811682105064392, + "mean_token_accuracy": 0.8460187315940857, + "num_tokens": 10382362.0, + "step": 1163 + }, + { + "epoch": 0.8844984802431611, + "grad_norm": 2.190847158432007, + "learning_rate": 4.226575014623557e-06, + "loss": 0.4428049921989441, + "mean_token_accuracy": 0.8382467031478882, + "num_tokens": 10388211.0, + "step": 1164 + }, + { + "epoch": 0.8852583586626139, + "grad_norm": 1.860153079032898, + "learning_rate": 4.225059725063444e-06, + "loss": 0.5265918970108032, + "mean_token_accuracy": 0.8181334733963013, + "num_tokens": 10398873.0, + "step": 1165 + }, + { + "epoch": 0.8860182370820668, + "grad_norm": 1.3372713327407837, + "learning_rate": 4.22354322479001e-06, + "loss": 0.43202850222587585, + "mean_token_accuracy": 0.8432420492172241, + "num_tokens": 10413158.0, + "step": 1166 + }, + { + "epoch": 0.8867781155015197, + "grad_norm": 1.3653379678726196, + "learning_rate": 4.222025514867596e-06, + "loss": 0.43780991435050964, + "mean_token_accuracy": 0.8441485166549683, + "num_tokens": 10428137.0, + "step": 1167 + }, + { + "epoch": 0.8875379939209727, + "grad_norm": 3.0230672359466553, + "learning_rate": 4.220506596361387e-06, + "loss": 0.6039337515830994, + "mean_token_accuracy": 0.8274872303009033, + "num_tokens": 10432586.0, + "step": 1168 + }, + { + "epoch": 0.8882978723404256, + "grad_norm": 2.2180392742156982, + "learning_rate": 4.218986470337419e-06, + "loss": 0.5453792810440063, + "mean_token_accuracy": 0.8127184510231018, + "num_tokens": 10439471.0, + "step": 1169 + }, + { + "epoch": 0.8890577507598785, + "grad_norm": 1.8519103527069092, + "learning_rate": 4.217465137862575e-06, + "loss": 0.5145469903945923, + "mean_token_accuracy": 0.8178654909133911, + "num_tokens": 10450471.0, + "step": 1170 + }, + { + "epoch": 0.8898176291793313, + "grad_norm": 2.034008026123047, + "learning_rate": 4.215942600004586e-06, + "loss": 0.44061461091041565, + "mean_token_accuracy": 0.8572084307670593, + "num_tokens": 10457382.0, + "step": 1171 + }, + { + "epoch": 0.8905775075987842, + "grad_norm": 3.4304304122924805, + "learning_rate": 4.214418857832025e-06, + "loss": 0.44397830963134766, + "mean_token_accuracy": 0.842149019241333, + "num_tokens": 10460650.0, + "step": 1172 + }, + { + "epoch": 0.8913373860182371, + "grad_norm": 1.9021750688552856, + "learning_rate": 4.212893912414316e-06, + "loss": 0.3769867420196533, + "mean_token_accuracy": 0.8806171417236328, + "num_tokens": 10468214.0, + "step": 1173 + }, + { + "epoch": 0.89209726443769, + "grad_norm": 1.9704062938690186, + "learning_rate": 4.211367764821722e-06, + "loss": 0.5501819849014282, + "mean_token_accuracy": 0.8176811337471008, + "num_tokens": 10476739.0, + "step": 1174 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 1.4350415468215942, + "learning_rate": 4.209840416125353e-06, + "loss": 0.41897401213645935, + "mean_token_accuracy": 0.8498011827468872, + "num_tokens": 10491769.0, + "step": 1175 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 3.8237783908843994, + "learning_rate": 4.208311867397162e-06, + "loss": 0.5296977162361145, + "mean_token_accuracy": 0.8168715834617615, + "num_tokens": 10494958.0, + "step": 1176 + }, + { + "epoch": 0.8943768996960486, + "grad_norm": 2.04784893989563, + "learning_rate": 4.206782119709942e-06, + "loss": 0.476105272769928, + "mean_token_accuracy": 0.834011435508728, + "num_tokens": 10502077.0, + "step": 1177 + }, + { + "epoch": 0.8951367781155015, + "grad_norm": 1.8839610815048218, + "learning_rate": 4.205251174137329e-06, + "loss": 0.49628815054893494, + "mean_token_accuracy": 0.8212119936943054, + "num_tokens": 10510077.0, + "step": 1178 + }, + { + "epoch": 0.8958966565349544, + "grad_norm": 1.2100634574890137, + "learning_rate": 4.2037190317538e-06, + "loss": 0.4931519329547882, + "mean_token_accuracy": 0.8170043230056763, + "num_tokens": 10528373.0, + "step": 1179 + }, + { + "epoch": 0.8966565349544073, + "grad_norm": 1.884637713432312, + "learning_rate": 4.202185693634671e-06, + "loss": 0.4913347363471985, + "mean_token_accuracy": 0.8234949707984924, + "num_tokens": 10537108.0, + "step": 1180 + }, + { + "epoch": 0.8974164133738601, + "grad_norm": 1.5062434673309326, + "learning_rate": 4.200651160856099e-06, + "loss": 0.4160492420196533, + "mean_token_accuracy": 0.845937192440033, + "num_tokens": 10547577.0, + "step": 1181 + }, + { + "epoch": 0.898176291793313, + "grad_norm": 2.331169605255127, + "learning_rate": 4.1991154344950755e-06, + "loss": 0.6532632112503052, + "mean_token_accuracy": 0.7743191123008728, + "num_tokens": 10556328.0, + "step": 1182 + }, + { + "epoch": 0.898936170212766, + "grad_norm": 1.3538362979888916, + "learning_rate": 4.197578515629435e-06, + "loss": 0.4437566101551056, + "mean_token_accuracy": 0.8427901268005371, + "num_tokens": 10570026.0, + "step": 1183 + }, + { + "epoch": 0.8996960486322189, + "grad_norm": 2.3828957080841064, + "learning_rate": 4.196040405337846e-06, + "loss": 0.6185290217399597, + "mean_token_accuracy": 0.7969824075698853, + "num_tokens": 10576465.0, + "step": 1184 + }, + { + "epoch": 0.9004559270516718, + "grad_norm": 2.4759042263031006, + "learning_rate": 4.194501104699813e-06, + "loss": 0.46489226818084717, + "mean_token_accuracy": 0.8472316265106201, + "num_tokens": 10582034.0, + "step": 1185 + }, + { + "epoch": 0.9012158054711246, + "grad_norm": 1.9215164184570312, + "learning_rate": 4.192960614795676e-06, + "loss": 0.48001551628112793, + "mean_token_accuracy": 0.8371596336364746, + "num_tokens": 10590556.0, + "step": 1186 + }, + { + "epoch": 0.9019756838905775, + "grad_norm": 2.2717080116271973, + "learning_rate": 4.19141893670661e-06, + "loss": 0.40083563327789307, + "mean_token_accuracy": 0.8464195728302002, + "num_tokens": 10595661.0, + "step": 1187 + }, + { + "epoch": 0.9027355623100304, + "grad_norm": 2.187122344970703, + "learning_rate": 4.189876071514624e-06, + "loss": 0.4942901134490967, + "mean_token_accuracy": 0.8186990022659302, + "num_tokens": 10603366.0, + "step": 1188 + }, + { + "epoch": 0.9034954407294833, + "grad_norm": 1.542414665222168, + "learning_rate": 4.188332020302561e-06, + "loss": 0.4731982946395874, + "mean_token_accuracy": 0.8487229347229004, + "num_tokens": 10616203.0, + "step": 1189 + }, + { + "epoch": 0.9042553191489362, + "grad_norm": 0.9957579970359802, + "learning_rate": 4.186786784154096e-06, + "loss": 0.33211836218833923, + "mean_token_accuracy": 0.870644748210907, + "num_tokens": 10633294.0, + "step": 1190 + }, + { + "epoch": 0.9050151975683891, + "grad_norm": 2.593867540359497, + "learning_rate": 4.1852403641537344e-06, + "loss": 0.6825464963912964, + "mean_token_accuracy": 0.7716869115829468, + "num_tokens": 10640615.0, + "step": 1191 + }, + { + "epoch": 0.9057750759878419, + "grad_norm": 2.0424516201019287, + "learning_rate": 4.183692761386813e-06, + "loss": 0.5672709941864014, + "mean_token_accuracy": 0.7973801493644714, + "num_tokens": 10649845.0, + "step": 1192 + }, + { + "epoch": 0.9065349544072948, + "grad_norm": 1.429018259048462, + "learning_rate": 4.1821439769395e-06, + "loss": 0.5427846908569336, + "mean_token_accuracy": 0.8200292587280273, + "num_tokens": 10665898.0, + "step": 1193 + }, + { + "epoch": 0.9072948328267477, + "grad_norm": 1.9764264822006226, + "learning_rate": 4.180594011898791e-06, + "loss": 0.4784567356109619, + "mean_token_accuracy": 0.82924485206604, + "num_tokens": 10673595.0, + "step": 1194 + }, + { + "epoch": 0.9080547112462006, + "grad_norm": 1.4004309177398682, + "learning_rate": 4.1790428673525104e-06, + "loss": 0.4791432023048401, + "mean_token_accuracy": 0.8334879875183105, + "num_tokens": 10687892.0, + "step": 1195 + }, + { + "epoch": 0.9088145896656535, + "grad_norm": 2.2207727432250977, + "learning_rate": 4.177490544389313e-06, + "loss": 0.5089365243911743, + "mean_token_accuracy": 0.8270776271820068, + "num_tokens": 10694911.0, + "step": 1196 + }, + { + "epoch": 0.9095744680851063, + "grad_norm": 2.2890450954437256, + "learning_rate": 4.175937044098678e-06, + "loss": 0.5152267813682556, + "mean_token_accuracy": 0.8527299165725708, + "num_tokens": 10700512.0, + "step": 1197 + }, + { + "epoch": 0.9103343465045592, + "grad_norm": 1.7938050031661987, + "learning_rate": 4.1743823675709115e-06, + "loss": 0.3507300615310669, + "mean_token_accuracy": 0.8694599866867065, + "num_tokens": 10707953.0, + "step": 1198 + }, + { + "epoch": 0.9110942249240122, + "grad_norm": 1.4368808269500732, + "learning_rate": 4.172826515897146e-06, + "loss": 0.407418429851532, + "mean_token_accuracy": 0.8432893753051758, + "num_tokens": 10717485.0, + "step": 1199 + }, + { + "epoch": 0.9118541033434651, + "grad_norm": 1.735339879989624, + "learning_rate": 4.171269490169337e-06, + "loss": 0.46996885538101196, + "mean_token_accuracy": 0.8331948518753052, + "num_tokens": 10726160.0, + "step": 1200 + }, + { + "epoch": 0.912613981762918, + "grad_norm": 1.7859221696853638, + "learning_rate": 4.1697112914802665e-06, + "loss": 0.5325199365615845, + "mean_token_accuracy": 0.8179605007171631, + "num_tokens": 10736284.0, + "step": 1201 + }, + { + "epoch": 0.9133738601823708, + "grad_norm": 2.6394896507263184, + "learning_rate": 4.168151920923536e-06, + "loss": 0.4039744734764099, + "mean_token_accuracy": 0.8545527458190918, + "num_tokens": 10740673.0, + "step": 1202 + }, + { + "epoch": 0.9141337386018237, + "grad_norm": 1.910988211631775, + "learning_rate": 4.1665913795935755e-06, + "loss": 0.5190291404724121, + "mean_token_accuracy": 0.8203921318054199, + "num_tokens": 10751946.0, + "step": 1203 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 3.0006964206695557, + "learning_rate": 4.16502966858563e-06, + "loss": 0.5856777429580688, + "mean_token_accuracy": 0.8061224222183228, + "num_tokens": 10756795.0, + "step": 1204 + }, + { + "epoch": 0.9156534954407295, + "grad_norm": 1.7396167516708374, + "learning_rate": 4.163466788995768e-06, + "loss": 0.54935222864151, + "mean_token_accuracy": 0.8052443265914917, + "num_tokens": 10767202.0, + "step": 1205 + }, + { + "epoch": 0.9164133738601824, + "grad_norm": 2.143735885620117, + "learning_rate": 4.161902741920881e-06, + "loss": 0.5020298361778259, + "mean_token_accuracy": 0.8249630928039551, + "num_tokens": 10774329.0, + "step": 1206 + }, + { + "epoch": 0.9171732522796353, + "grad_norm": 2.8871893882751465, + "learning_rate": 4.160337528458676e-06, + "loss": 0.5154489278793335, + "mean_token_accuracy": 0.8276848793029785, + "num_tokens": 10778929.0, + "step": 1207 + }, + { + "epoch": 0.9179331306990881, + "grad_norm": 1.4642788171768188, + "learning_rate": 4.15877114970768e-06, + "loss": 0.5033774375915527, + "mean_token_accuracy": 0.8296241164207458, + "num_tokens": 10790928.0, + "step": 1208 + }, + { + "epoch": 0.918693009118541, + "grad_norm": 1.8313497304916382, + "learning_rate": 4.1572036067672386e-06, + "loss": 0.5674909353256226, + "mean_token_accuracy": 0.7975562214851379, + "num_tokens": 10801372.0, + "step": 1209 + }, + { + "epoch": 0.9194528875379939, + "grad_norm": 2.005958080291748, + "learning_rate": 4.155634900737513e-06, + "loss": 0.5557019114494324, + "mean_token_accuracy": 0.8141391277313232, + "num_tokens": 10809150.0, + "step": 1210 + }, + { + "epoch": 0.9202127659574468, + "grad_norm": 2.333519697189331, + "learning_rate": 4.154065032719482e-06, + "loss": 0.6990420818328857, + "mean_token_accuracy": 0.7565394043922424, + "num_tokens": 10816612.0, + "step": 1211 + }, + { + "epoch": 0.9209726443768997, + "grad_norm": 1.4472655057907104, + "learning_rate": 4.152494003814939e-06, + "loss": 0.541398286819458, + "mean_token_accuracy": 0.8027358055114746, + "num_tokens": 10833840.0, + "step": 1212 + }, + { + "epoch": 0.9217325227963525, + "grad_norm": 1.6183619499206543, + "learning_rate": 4.150921815126493e-06, + "loss": 0.6096762418746948, + "mean_token_accuracy": 0.7994354963302612, + "num_tokens": 10846367.0, + "step": 1213 + }, + { + "epoch": 0.9224924012158054, + "grad_norm": 2.614919900894165, + "learning_rate": 4.149348467757566e-06, + "loss": 0.41846764087677, + "mean_token_accuracy": 0.8555068969726562, + "num_tokens": 10850836.0, + "step": 1214 + }, + { + "epoch": 0.9232522796352584, + "grad_norm": 1.4419831037521362, + "learning_rate": 4.147773962812393e-06, + "loss": 0.4139535427093506, + "mean_token_accuracy": 0.845671534538269, + "num_tokens": 10864228.0, + "step": 1215 + }, + { + "epoch": 0.9240121580547113, + "grad_norm": 2.3868865966796875, + "learning_rate": 4.146198301396025e-06, + "loss": 0.3357275128364563, + "mean_token_accuracy": 0.8829520344734192, + "num_tokens": 10868920.0, + "step": 1216 + }, + { + "epoch": 0.9247720364741642, + "grad_norm": 1.7685474157333374, + "learning_rate": 4.14462148461432e-06, + "loss": 0.45333072543144226, + "mean_token_accuracy": 0.8505891561508179, + "num_tokens": 10877286.0, + "step": 1217 + }, + { + "epoch": 0.925531914893617, + "grad_norm": 1.7627625465393066, + "learning_rate": 4.143043513573949e-06, + "loss": 0.5028705596923828, + "mean_token_accuracy": 0.825471043586731, + "num_tokens": 10887047.0, + "step": 1218 + }, + { + "epoch": 0.9262917933130699, + "grad_norm": 1.3168725967407227, + "learning_rate": 4.141464389382392e-06, + "loss": 0.5494637489318848, + "mean_token_accuracy": 0.8121747970581055, + "num_tokens": 10903599.0, + "step": 1219 + }, + { + "epoch": 0.9270516717325228, + "grad_norm": 2.5180399417877197, + "learning_rate": 4.13988411314794e-06, + "loss": 0.6134277582168579, + "mean_token_accuracy": 0.7983006834983826, + "num_tokens": 10909791.0, + "step": 1220 + }, + { + "epoch": 0.9278115501519757, + "grad_norm": 1.1889166831970215, + "learning_rate": 4.13830268597969e-06, + "loss": 0.36713096499443054, + "mean_token_accuracy": 0.8416121006011963, + "num_tokens": 10925794.0, + "step": 1221 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 2.142422676086426, + "learning_rate": 4.136720108987552e-06, + "loss": 0.4427933096885681, + "mean_token_accuracy": 0.8427745699882507, + "num_tokens": 10931622.0, + "step": 1222 + }, + { + "epoch": 0.9293313069908815, + "grad_norm": 1.908564567565918, + "learning_rate": 4.1351363832822364e-06, + "loss": 0.5088109374046326, + "mean_token_accuracy": 0.8309272527694702, + "num_tokens": 10940843.0, + "step": 1223 + }, + { + "epoch": 0.9300911854103343, + "grad_norm": 1.2862322330474854, + "learning_rate": 4.133551509975264e-06, + "loss": 0.3963761329650879, + "mean_token_accuracy": 0.8602159023284912, + "num_tokens": 10954481.0, + "step": 1224 + }, + { + "epoch": 0.9308510638297872, + "grad_norm": 1.5876200199127197, + "learning_rate": 4.13196549017896e-06, + "loss": 0.4311184287071228, + "mean_token_accuracy": 0.8460899591445923, + "num_tokens": 10963501.0, + "step": 1225 + }, + { + "epoch": 0.9316109422492401, + "grad_norm": 2.459878444671631, + "learning_rate": 4.130378325006453e-06, + "loss": 0.5016295313835144, + "mean_token_accuracy": 0.8125218152999878, + "num_tokens": 10968850.0, + "step": 1226 + }, + { + "epoch": 0.932370820668693, + "grad_norm": 2.059718370437622, + "learning_rate": 4.128790015571679e-06, + "loss": 0.48982277512550354, + "mean_token_accuracy": 0.8327049016952515, + "num_tokens": 10976642.0, + "step": 1227 + }, + { + "epoch": 0.9331306990881459, + "grad_norm": 1.3719185590744019, + "learning_rate": 4.127200562989372e-06, + "loss": 0.38778752088546753, + "mean_token_accuracy": 0.8623501062393188, + "num_tokens": 10988703.0, + "step": 1228 + }, + { + "epoch": 0.9338905775075987, + "grad_norm": 1.302140712738037, + "learning_rate": 4.125609968375073e-06, + "loss": 0.4887842535972595, + "mean_token_accuracy": 0.8322232961654663, + "num_tokens": 11005981.0, + "step": 1229 + }, + { + "epoch": 0.9346504559270516, + "grad_norm": 1.819624423980713, + "learning_rate": 4.12401823284512e-06, + "loss": 0.49825209379196167, + "mean_token_accuracy": 0.8278916478157043, + "num_tokens": 11014145.0, + "step": 1230 + }, + { + "epoch": 0.9354103343465046, + "grad_norm": 1.2762807607650757, + "learning_rate": 4.122425357516658e-06, + "loss": 0.433994323015213, + "mean_token_accuracy": 0.853028416633606, + "num_tokens": 11029232.0, + "step": 1231 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.2171671390533447, + "learning_rate": 4.1208313435076255e-06, + "loss": 0.38436949253082275, + "mean_token_accuracy": 0.8616260290145874, + "num_tokens": 11034743.0, + "step": 1232 + }, + { + "epoch": 0.9369300911854104, + "grad_norm": 1.355879545211792, + "learning_rate": 4.119236191936764e-06, + "loss": 0.5378084182739258, + "mean_token_accuracy": 0.8256701231002808, + "num_tokens": 11048149.0, + "step": 1233 + }, + { + "epoch": 0.9376899696048632, + "grad_norm": 2.66812801361084, + "learning_rate": 4.117639903923611e-06, + "loss": 0.5236451625823975, + "mean_token_accuracy": 0.8431973457336426, + "num_tokens": 11052295.0, + "step": 1234 + }, + { + "epoch": 0.9384498480243161, + "grad_norm": 1.5740545988082886, + "learning_rate": 4.116042480588505e-06, + "loss": 0.44322824478149414, + "mean_token_accuracy": 0.8436908721923828, + "num_tokens": 11062066.0, + "step": 1235 + }, + { + "epoch": 0.939209726443769, + "grad_norm": 1.230706810951233, + "learning_rate": 4.114443923052577e-06, + "loss": 0.3325323462486267, + "mean_token_accuracy": 0.8674666881561279, + "num_tokens": 11074300.0, + "step": 1236 + }, + { + "epoch": 0.9399696048632219, + "grad_norm": 1.9870070219039917, + "learning_rate": 4.112844232437757e-06, + "loss": 0.5711548328399658, + "mean_token_accuracy": 0.8081738948822021, + "num_tokens": 11082297.0, + "step": 1237 + }, + { + "epoch": 0.9407294832826748, + "grad_norm": 1.3020970821380615, + "learning_rate": 4.11124340986677e-06, + "loss": 0.4187922477722168, + "mean_token_accuracy": 0.8566171526908875, + "num_tokens": 11096810.0, + "step": 1238 + }, + { + "epoch": 0.9414893617021277, + "grad_norm": 2.1399197578430176, + "learning_rate": 4.109641456463135e-06, + "loss": 0.5293116569519043, + "mean_token_accuracy": 0.8176157474517822, + "num_tokens": 11102761.0, + "step": 1239 + }, + { + "epoch": 0.9422492401215805, + "grad_norm": 1.3503763675689697, + "learning_rate": 4.108038373351163e-06, + "loss": 0.4907652735710144, + "mean_token_accuracy": 0.8204987049102783, + "num_tokens": 11118480.0, + "step": 1240 + }, + { + "epoch": 0.9430091185410334, + "grad_norm": 1.9571399688720703, + "learning_rate": 4.106434161655962e-06, + "loss": 0.4709656536579132, + "mean_token_accuracy": 0.8371885418891907, + "num_tokens": 11126265.0, + "step": 1241 + }, + { + "epoch": 0.9437689969604863, + "grad_norm": 2.1277313232421875, + "learning_rate": 4.104828822503427e-06, + "loss": 0.4010283350944519, + "mean_token_accuracy": 0.8586333990097046, + "num_tokens": 11133022.0, + "step": 1242 + }, + { + "epoch": 0.9445288753799392, + "grad_norm": 1.6745036840438843, + "learning_rate": 4.103222357020248e-06, + "loss": 0.562545657157898, + "mean_token_accuracy": 0.8052060604095459, + "num_tokens": 11145255.0, + "step": 1243 + }, + { + "epoch": 0.9452887537993921, + "grad_norm": 2.3616299629211426, + "learning_rate": 4.101614766333904e-06, + "loss": 0.5878340601921082, + "mean_token_accuracy": 0.796745777130127, + "num_tokens": 11152020.0, + "step": 1244 + }, + { + "epoch": 0.9460486322188449, + "grad_norm": 1.6182078123092651, + "learning_rate": 4.100006051572664e-06, + "loss": 0.5357589721679688, + "mean_token_accuracy": 0.8089962005615234, + "num_tokens": 11163112.0, + "step": 1245 + }, + { + "epoch": 0.9468085106382979, + "grad_norm": 1.911770224571228, + "learning_rate": 4.098396213865587e-06, + "loss": 0.49805426597595215, + "mean_token_accuracy": 0.8289647102355957, + "num_tokens": 11171768.0, + "step": 1246 + }, + { + "epoch": 0.9475683890577508, + "grad_norm": 1.649155616760254, + "learning_rate": 4.096785254342518e-06, + "loss": 0.5756166577339172, + "mean_token_accuracy": 0.807680606842041, + "num_tokens": 11183527.0, + "step": 1247 + }, + { + "epoch": 0.9483282674772037, + "grad_norm": 1.8922761678695679, + "learning_rate": 4.095173174134091e-06, + "loss": 0.44688963890075684, + "mean_token_accuracy": 0.8375608921051025, + "num_tokens": 11191494.0, + "step": 1248 + }, + { + "epoch": 0.9490881458966566, + "grad_norm": 2.9044547080993652, + "learning_rate": 4.093559974371725e-06, + "loss": 0.48609739542007446, + "mean_token_accuracy": 0.8404892086982727, + "num_tokens": 11195837.0, + "step": 1249 + }, + { + "epoch": 0.9498480243161094, + "grad_norm": 2.287506580352783, + "learning_rate": 4.091945656187626e-06, + "loss": 0.5260225534439087, + "mean_token_accuracy": 0.8181945085525513, + "num_tokens": 11202174.0, + "step": 1250 + }, + { + "epoch": 0.9506079027355623, + "grad_norm": 1.7908886671066284, + "learning_rate": 4.090330220714785e-06, + "loss": 0.4207724928855896, + "mean_token_accuracy": 0.8616912364959717, + "num_tokens": 11209995.0, + "step": 1251 + }, + { + "epoch": 0.9513677811550152, + "grad_norm": 2.905418634414673, + "learning_rate": 4.0887136690869774e-06, + "loss": 0.4209241271018982, + "mean_token_accuracy": 0.8561323285102844, + "num_tokens": 11213799.0, + "step": 1252 + }, + { + "epoch": 0.9521276595744681, + "grad_norm": 2.814150333404541, + "learning_rate": 4.08709600243876e-06, + "loss": 0.36855608224868774, + "mean_token_accuracy": 0.8764539361000061, + "num_tokens": 11217643.0, + "step": 1253 + }, + { + "epoch": 0.952887537993921, + "grad_norm": 1.9385707378387451, + "learning_rate": 4.0854772219054735e-06, + "loss": 0.531031608581543, + "mean_token_accuracy": 0.80600905418396, + "num_tokens": 11225871.0, + "step": 1254 + }, + { + "epoch": 0.9536474164133738, + "grad_norm": 2.103058099746704, + "learning_rate": 4.083857328623243e-06, + "loss": 0.4576364755630493, + "mean_token_accuracy": 0.8447524905204773, + "num_tokens": 11231829.0, + "step": 1255 + }, + { + "epoch": 0.9544072948328267, + "grad_norm": 1.7518818378448486, + "learning_rate": 4.082236323728969e-06, + "loss": 0.5386767983436584, + "mean_token_accuracy": 0.8055596351623535, + "num_tokens": 11240977.0, + "step": 1256 + }, + { + "epoch": 0.9551671732522796, + "grad_norm": 1.8434966802597046, + "learning_rate": 4.0806142083603365e-06, + "loss": 0.5415925979614258, + "mean_token_accuracy": 0.809962272644043, + "num_tokens": 11249616.0, + "step": 1257 + }, + { + "epoch": 0.9559270516717325, + "grad_norm": 1.7341015338897705, + "learning_rate": 4.078990983655807e-06, + "loss": 0.4621101915836334, + "mean_token_accuracy": 0.8330386877059937, + "num_tokens": 11258616.0, + "step": 1258 + }, + { + "epoch": 0.9566869300911854, + "grad_norm": 1.8589727878570557, + "learning_rate": 4.077366650754624e-06, + "loss": 0.4031238555908203, + "mean_token_accuracy": 0.842434287071228, + "num_tokens": 11266006.0, + "step": 1259 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 1.657175898551941, + "learning_rate": 4.075741210796806e-06, + "loss": 0.41686388850212097, + "mean_token_accuracy": 0.8443650007247925, + "num_tokens": 11275601.0, + "step": 1260 + }, + { + "epoch": 0.9582066869300911, + "grad_norm": 2.4303717613220215, + "learning_rate": 4.07411466492315e-06, + "loss": 0.4554435610771179, + "mean_token_accuracy": 0.853043794631958, + "num_tokens": 11280650.0, + "step": 1261 + }, + { + "epoch": 0.958966565349544, + "grad_norm": 2.3653745651245117, + "learning_rate": 4.072487014275228e-06, + "loss": 0.4304995536804199, + "mean_token_accuracy": 0.8462260961532593, + "num_tokens": 11285637.0, + "step": 1262 + }, + { + "epoch": 0.959726443768997, + "grad_norm": 1.6689718961715698, + "learning_rate": 4.070858259995388e-06, + "loss": 0.5290807485580444, + "mean_token_accuracy": 0.8176917433738708, + "num_tokens": 11299110.0, + "step": 1263 + }, + { + "epoch": 0.9604863221884499, + "grad_norm": 2.103879451751709, + "learning_rate": 4.069228403226751e-06, + "loss": 0.4620879888534546, + "mean_token_accuracy": 0.835270345211029, + "num_tokens": 11305564.0, + "step": 1264 + }, + { + "epoch": 0.9612462006079028, + "grad_norm": 2.139012575149536, + "learning_rate": 4.067597445113216e-06, + "loss": 0.5143396258354187, + "mean_token_accuracy": 0.8191739320755005, + "num_tokens": 11311870.0, + "step": 1265 + }, + { + "epoch": 0.9620060790273556, + "grad_norm": 1.3971210718154907, + "learning_rate": 4.06596538679945e-06, + "loss": 0.472080260515213, + "mean_token_accuracy": 0.8321092128753662, + "num_tokens": 11323970.0, + "step": 1266 + }, + { + "epoch": 0.9627659574468085, + "grad_norm": 1.4965174198150635, + "learning_rate": 4.064332229430895e-06, + "loss": 0.359701007604599, + "mean_token_accuracy": 0.8903120160102844, + "num_tokens": 11333412.0, + "step": 1267 + }, + { + "epoch": 0.9635258358662614, + "grad_norm": 1.1898726224899292, + "learning_rate": 4.062697974153764e-06, + "loss": 0.3423798084259033, + "mean_token_accuracy": 0.8661491870880127, + "num_tokens": 11347657.0, + "step": 1268 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 1.4952168464660645, + "learning_rate": 4.06106262211504e-06, + "loss": 0.4214417338371277, + "mean_token_accuracy": 0.8362159729003906, + "num_tokens": 11357786.0, + "step": 1269 + }, + { + "epoch": 0.9650455927051672, + "grad_norm": 1.7949583530426025, + "learning_rate": 4.059426174462476e-06, + "loss": 0.59087735414505, + "mean_token_accuracy": 0.7965556979179382, + "num_tokens": 11370561.0, + "step": 1270 + }, + { + "epoch": 0.96580547112462, + "grad_norm": 1.8973214626312256, + "learning_rate": 4.057788632344594e-06, + "loss": 0.47525322437286377, + "mean_token_accuracy": 0.8317365050315857, + "num_tokens": 11378507.0, + "step": 1271 + }, + { + "epoch": 0.9665653495440729, + "grad_norm": 1.8665250539779663, + "learning_rate": 4.056149996910683e-06, + "loss": 0.3537125587463379, + "mean_token_accuracy": 0.8921569585800171, + "num_tokens": 11385186.0, + "step": 1272 + }, + { + "epoch": 0.9673252279635258, + "grad_norm": 1.5072317123413086, + "learning_rate": 4.054510269310803e-06, + "loss": 0.5145624876022339, + "mean_token_accuracy": 0.8265488147735596, + "num_tokens": 11397125.0, + "step": 1273 + }, + { + "epoch": 0.9680851063829787, + "grad_norm": 1.520525574684143, + "learning_rate": 4.052869450695776e-06, + "loss": 0.44322293996810913, + "mean_token_accuracy": 0.8403642177581787, + "num_tokens": 11409919.0, + "step": 1274 + }, + { + "epoch": 0.9688449848024316, + "grad_norm": 1.3764475584030151, + "learning_rate": 4.051227542217192e-06, + "loss": 0.5774400234222412, + "mean_token_accuracy": 0.804118275642395, + "num_tokens": 11425900.0, + "step": 1275 + }, + { + "epoch": 0.9696048632218845, + "grad_norm": 1.3922648429870605, + "learning_rate": 4.049584545027406e-06, + "loss": 0.42727944254875183, + "mean_token_accuracy": 0.8654505014419556, + "num_tokens": 11438787.0, + "step": 1276 + }, + { + "epoch": 0.9703647416413373, + "grad_norm": 1.8505840301513672, + "learning_rate": 4.047940460279537e-06, + "loss": 0.490803062915802, + "mean_token_accuracy": 0.8340574502944946, + "num_tokens": 11447997.0, + "step": 1277 + }, + { + "epoch": 0.9711246200607903, + "grad_norm": 2.28271222114563, + "learning_rate": 4.046295289127466e-06, + "loss": 0.588828444480896, + "mean_token_accuracy": 0.833497166633606, + "num_tokens": 11454072.0, + "step": 1278 + }, + { + "epoch": 0.9718844984802432, + "grad_norm": 2.4242560863494873, + "learning_rate": 4.044649032725836e-06, + "loss": 0.5128831267356873, + "mean_token_accuracy": 0.8225122690200806, + "num_tokens": 11460211.0, + "step": 1279 + }, + { + "epoch": 0.9726443768996961, + "grad_norm": 2.1738455295562744, + "learning_rate": 4.0430016922300566e-06, + "loss": 0.441631942987442, + "mean_token_accuracy": 0.841723620891571, + "num_tokens": 11466814.0, + "step": 1280 + }, + { + "epoch": 0.973404255319149, + "grad_norm": 2.541599988937378, + "learning_rate": 4.0413532687962926e-06, + "loss": 0.5062629580497742, + "mean_token_accuracy": 0.8013502359390259, + "num_tokens": 11472371.0, + "step": 1281 + }, + { + "epoch": 0.9741641337386018, + "grad_norm": 2.8011014461517334, + "learning_rate": 4.039703763581472e-06, + "loss": 0.5061966776847839, + "mean_token_accuracy": 0.829810380935669, + "num_tokens": 11476672.0, + "step": 1282 + }, + { + "epoch": 0.9749240121580547, + "grad_norm": 2.4505462646484375, + "learning_rate": 4.038053177743279e-06, + "loss": 0.43407535552978516, + "mean_token_accuracy": 0.8428469896316528, + "num_tokens": 11481297.0, + "step": 1283 + }, + { + "epoch": 0.9756838905775076, + "grad_norm": 2.1618378162384033, + "learning_rate": 4.036401512440161e-06, + "loss": 0.6056663393974304, + "mean_token_accuracy": 0.7977457642555237, + "num_tokens": 11488657.0, + "step": 1284 + }, + { + "epoch": 0.9764437689969605, + "grad_norm": 1.9192147254943848, + "learning_rate": 4.034748768831319e-06, + "loss": 0.524390697479248, + "mean_token_accuracy": 0.8120636940002441, + "num_tokens": 11496485.0, + "step": 1285 + }, + { + "epoch": 0.9772036474164134, + "grad_norm": 2.766435384750366, + "learning_rate": 4.033094948076713e-06, + "loss": 0.5494908690452576, + "mean_token_accuracy": 0.8141890168190002, + "num_tokens": 11501341.0, + "step": 1286 + }, + { + "epoch": 0.9779635258358662, + "grad_norm": 1.3519539833068848, + "learning_rate": 4.031440051337056e-06, + "loss": 0.4339691400527954, + "mean_token_accuracy": 0.8400131464004517, + "num_tokens": 11512843.0, + "step": 1287 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 1.2492141723632812, + "learning_rate": 4.02978407977382e-06, + "loss": 0.4433518052101135, + "mean_token_accuracy": 0.8432940244674683, + "num_tokens": 11530227.0, + "step": 1288 + }, + { + "epoch": 0.979483282674772, + "grad_norm": 1.6597715616226196, + "learning_rate": 4.02812703454923e-06, + "loss": 0.602222204208374, + "mean_token_accuracy": 0.786965548992157, + "num_tokens": 11543955.0, + "step": 1289 + }, + { + "epoch": 0.9802431610942249, + "grad_norm": 1.6621816158294678, + "learning_rate": 4.026468916826262e-06, + "loss": 0.35662174224853516, + "mean_token_accuracy": 0.8716133832931519, + "num_tokens": 11552064.0, + "step": 1290 + }, + { + "epoch": 0.9810030395136778, + "grad_norm": 4.539844989776611, + "learning_rate": 4.024809727768648e-06, + "loss": 0.543423593044281, + "mean_token_accuracy": 0.8293194770812988, + "num_tokens": 11555595.0, + "step": 1291 + }, + { + "epoch": 0.9817629179331308, + "grad_norm": 1.4026556015014648, + "learning_rate": 4.023149468540871e-06, + "loss": 0.4301237165927887, + "mean_token_accuracy": 0.8358224630355835, + "num_tokens": 11572275.0, + "step": 1292 + }, + { + "epoch": 0.9825227963525835, + "grad_norm": 1.611262321472168, + "learning_rate": 4.021488140308165e-06, + "loss": 0.5378580689430237, + "mean_token_accuracy": 0.8173760771751404, + "num_tokens": 11584299.0, + "step": 1293 + }, + { + "epoch": 0.9832826747720365, + "grad_norm": 4.138631820678711, + "learning_rate": 4.019825744236514e-06, + "loss": 0.40272149443626404, + "mean_token_accuracy": 0.8648844957351685, + "num_tokens": 11586705.0, + "step": 1294 + }, + { + "epoch": 0.9840425531914894, + "grad_norm": 3.177703619003296, + "learning_rate": 4.018162281492651e-06, + "loss": 0.5320103168487549, + "mean_token_accuracy": 0.8250276446342468, + "num_tokens": 11590689.0, + "step": 1295 + }, + { + "epoch": 0.9848024316109423, + "grad_norm": 2.727597713470459, + "learning_rate": 4.016497753244058e-06, + "loss": 0.5662774443626404, + "mean_token_accuracy": 0.8074625730514526, + "num_tokens": 11596092.0, + "step": 1296 + }, + { + "epoch": 0.9855623100303952, + "grad_norm": 1.485139012336731, + "learning_rate": 4.014832160658966e-06, + "loss": 0.5414972305297852, + "mean_token_accuracy": 0.8082696199417114, + "num_tokens": 11613785.0, + "step": 1297 + }, + { + "epoch": 0.986322188449848, + "grad_norm": 2.4025990962982178, + "learning_rate": 4.013165504906352e-06, + "loss": 0.6556503772735596, + "mean_token_accuracy": 0.7785214781761169, + "num_tokens": 11620421.0, + "step": 1298 + }, + { + "epoch": 0.9870820668693009, + "grad_norm": 1.878273606300354, + "learning_rate": 4.011497787155938e-06, + "loss": 0.4221133887767792, + "mean_token_accuracy": 0.850035548210144, + "num_tokens": 11627998.0, + "step": 1299 + }, + { + "epoch": 0.9878419452887538, + "grad_norm": 2.0430715084075928, + "learning_rate": 4.009829008578192e-06, + "loss": 0.5205984711647034, + "mean_token_accuracy": 0.819183349609375, + "num_tokens": 11636279.0, + "step": 1300 + }, + { + "epoch": 0.9886018237082067, + "grad_norm": 3.4769439697265625, + "learning_rate": 4.00815917034433e-06, + "loss": 0.5449948310852051, + "mean_token_accuracy": 0.8240023851394653, + "num_tokens": 11639638.0, + "step": 1301 + }, + { + "epoch": 0.9893617021276596, + "grad_norm": 2.4783987998962402, + "learning_rate": 4.006488273626307e-06, + "loss": 0.4316832423210144, + "mean_token_accuracy": 0.8474695086479187, + "num_tokens": 11645463.0, + "step": 1302 + }, + { + "epoch": 0.9901215805471124, + "grad_norm": 1.881475567817688, + "learning_rate": 4.004816319596822e-06, + "loss": 0.5157331824302673, + "mean_token_accuracy": 0.826042652130127, + "num_tokens": 11653955.0, + "step": 1303 + }, + { + "epoch": 0.9908814589665653, + "grad_norm": 2.6569254398345947, + "learning_rate": 4.003143309429317e-06, + "loss": 0.46492767333984375, + "mean_token_accuracy": 0.8320850133895874, + "num_tokens": 11659357.0, + "step": 1304 + }, + { + "epoch": 0.9916413373860182, + "grad_norm": 2.4917593002319336, + "learning_rate": 4.0014692442979756e-06, + "loss": 0.459585040807724, + "mean_token_accuracy": 0.8457611799240112, + "num_tokens": 11664207.0, + "step": 1305 + }, + { + "epoch": 0.9924012158054711, + "grad_norm": 2.6885526180267334, + "learning_rate": 3.999794125377721e-06, + "loss": 0.4677402973175049, + "mean_token_accuracy": 0.8307361602783203, + "num_tokens": 11668879.0, + "step": 1306 + }, + { + "epoch": 0.993161094224924, + "grad_norm": 1.9737319946289062, + "learning_rate": 3.998117953844215e-06, + "loss": 0.44684839248657227, + "mean_token_accuracy": 0.8367687463760376, + "num_tokens": 11676081.0, + "step": 1307 + }, + { + "epoch": 0.993920972644377, + "grad_norm": 1.4333021640777588, + "learning_rate": 3.996440730873861e-06, + "loss": 0.526146650314331, + "mean_token_accuracy": 0.816251814365387, + "num_tokens": 11689333.0, + "step": 1308 + }, + { + "epoch": 0.9946808510638298, + "grad_norm": 1.3689230680465698, + "learning_rate": 3.9947624576437975e-06, + "loss": 0.40214329957962036, + "mean_token_accuracy": 0.8610327839851379, + "num_tokens": 11701540.0, + "step": 1309 + }, + { + "epoch": 0.9954407294832827, + "grad_norm": 1.2435375452041626, + "learning_rate": 3.9930831353319025e-06, + "loss": 0.4532913267612457, + "mean_token_accuracy": 0.8415389060974121, + "num_tokens": 11717920.0, + "step": 1310 + }, + { + "epoch": 0.9962006079027356, + "grad_norm": 1.9968011379241943, + "learning_rate": 3.9914027651167866e-06, + "loss": 0.46954160928726196, + "mean_token_accuracy": 0.8351103663444519, + "num_tokens": 11724999.0, + "step": 1311 + }, + { + "epoch": 0.9969604863221885, + "grad_norm": 1.9521311521530151, + "learning_rate": 3.989721348177801e-06, + "loss": 0.5068016052246094, + "mean_token_accuracy": 0.8220845460891724, + "num_tokens": 11732569.0, + "step": 1312 + }, + { + "epoch": 0.9977203647416414, + "grad_norm": 2.7332582473754883, + "learning_rate": 3.988038885695028e-06, + "loss": 0.4154692590236664, + "mean_token_accuracy": 0.8493857383728027, + "num_tokens": 11736759.0, + "step": 1313 + }, + { + "epoch": 0.9984802431610942, + "grad_norm": 1.8656952381134033, + "learning_rate": 3.986355378849284e-06, + "loss": 0.4151354134082794, + "mean_token_accuracy": 0.83440101146698, + "num_tokens": 11743827.0, + "step": 1314 + }, + { + "epoch": 0.9992401215805471, + "grad_norm": 1.304006576538086, + "learning_rate": 3.984670828822118e-06, + "loss": 0.4926128089427948, + "mean_token_accuracy": 0.8603005409240723, + "num_tokens": 11757707.0, + "step": 1315 + }, + { + "epoch": 1.0, + "grad_norm": 1.497079610824585, + "learning_rate": 3.982985236795815e-06, + "loss": 0.43342477083206177, + "mean_token_accuracy": 0.8550825119018555, + "num_tokens": 11769678.0, + "step": 1316 + }, + { + "epoch": 1.000759878419453, + "grad_norm": 2.870274543762207, + "learning_rate": 3.981298603953385e-06, + "loss": 0.3723528981208801, + "mean_token_accuracy": 0.8745899796485901, + "num_tokens": 11773290.0, + "step": 1317 + }, + { + "epoch": 1.0015197568389058, + "grad_norm": 1.3442503213882446, + "learning_rate": 3.979610931478574e-06, + "loss": 0.34688329696655273, + "mean_token_accuracy": 0.8749074935913086, + "num_tokens": 11786400.0, + "step": 1318 + }, + { + "epoch": 1.0022796352583587, + "grad_norm": 1.7272238731384277, + "learning_rate": 3.977922220555855e-06, + "loss": 0.28274932503700256, + "mean_token_accuracy": 0.896713137626648, + "num_tokens": 11793059.0, + "step": 1319 + }, + { + "epoch": 1.0030395136778116, + "grad_norm": 1.7362451553344727, + "learning_rate": 3.976232472370431e-06, + "loss": 0.5494794845581055, + "mean_token_accuracy": 0.8341718912124634, + "num_tokens": 11802593.0, + "step": 1320 + }, + { + "epoch": 1.0037993920972645, + "grad_norm": 1.3316494226455688, + "learning_rate": 3.97454168810823e-06, + "loss": 0.41505366563796997, + "mean_token_accuracy": 0.8581969738006592, + "num_tokens": 11813925.0, + "step": 1321 + }, + { + "epoch": 1.0045592705167172, + "grad_norm": 1.6152615547180176, + "learning_rate": 3.972849868955913e-06, + "loss": 0.44761013984680176, + "mean_token_accuracy": 0.8413045406341553, + "num_tokens": 11825709.0, + "step": 1322 + }, + { + "epoch": 1.0053191489361701, + "grad_norm": 2.1172471046447754, + "learning_rate": 3.97115701610086e-06, + "loss": 0.3903353810310364, + "mean_token_accuracy": 0.8662760257720947, + "num_tokens": 11832070.0, + "step": 1323 + }, + { + "epoch": 1.006079027355623, + "grad_norm": 1.5923868417739868, + "learning_rate": 3.969463130731183e-06, + "loss": 0.4491051137447357, + "mean_token_accuracy": 0.8677828311920166, + "num_tokens": 11843154.0, + "step": 1324 + }, + { + "epoch": 1.006838905775076, + "grad_norm": 1.6848995685577393, + "learning_rate": 3.967768214035716e-06, + "loss": 0.45765817165374756, + "mean_token_accuracy": 0.8401060104370117, + "num_tokens": 11854826.0, + "step": 1325 + }, + { + "epoch": 1.0075987841945289, + "grad_norm": 2.3739020824432373, + "learning_rate": 3.966072267204014e-06, + "loss": 0.4482722580432892, + "mean_token_accuracy": 0.8368916511535645, + "num_tokens": 11860559.0, + "step": 1326 + }, + { + "epoch": 1.0083586626139818, + "grad_norm": 1.5403034687042236, + "learning_rate": 3.964375291426361e-06, + "loss": 0.35589972138404846, + "mean_token_accuracy": 0.8728118538856506, + "num_tokens": 11871959.0, + "step": 1327 + }, + { + "epoch": 1.0091185410334347, + "grad_norm": 1.6750119924545288, + "learning_rate": 3.962677287893758e-06, + "loss": 0.35873427987098694, + "mean_token_accuracy": 0.9027186632156372, + "num_tokens": 11881818.0, + "step": 1328 + }, + { + "epoch": 1.0098784194528876, + "grad_norm": 1.5489170551300049, + "learning_rate": 3.9609782577979305e-06, + "loss": 0.3634672462940216, + "mean_token_accuracy": 0.8582607507705688, + "num_tokens": 11891084.0, + "step": 1329 + }, + { + "epoch": 1.0106382978723405, + "grad_norm": 2.43859601020813, + "learning_rate": 3.959278202331323e-06, + "loss": 0.3640799820423126, + "mean_token_accuracy": 0.88062584400177, + "num_tokens": 11896032.0, + "step": 1330 + }, + { + "epoch": 1.0113981762917934, + "grad_norm": 3.612184524536133, + "learning_rate": 3.9575771226870986e-06, + "loss": 0.3733130097389221, + "mean_token_accuracy": 0.8946067094802856, + "num_tokens": 11899479.0, + "step": 1331 + }, + { + "epoch": 1.012158054711246, + "grad_norm": 1.541355848312378, + "learning_rate": 3.955875020059141e-06, + "loss": 0.320593923330307, + "mean_token_accuracy": 0.9057406783103943, + "num_tokens": 11910179.0, + "step": 1332 + }, + { + "epoch": 1.012917933130699, + "grad_norm": 2.0565030574798584, + "learning_rate": 3.954171895642052e-06, + "loss": 0.3341682553291321, + "mean_token_accuracy": 0.8829344511032104, + "num_tokens": 11916489.0, + "step": 1333 + }, + { + "epoch": 1.013677811550152, + "grad_norm": 2.9732539653778076, + "learning_rate": 3.9524677506311505e-06, + "loss": 0.38488566875457764, + "mean_token_accuracy": 0.8752974271774292, + "num_tokens": 11920682.0, + "step": 1334 + }, + { + "epoch": 1.0144376899696048, + "grad_norm": 2.7697458267211914, + "learning_rate": 3.950762586222469e-06, + "loss": 0.39864760637283325, + "mean_token_accuracy": 0.8593167662620544, + "num_tokens": 11925233.0, + "step": 1335 + }, + { + "epoch": 1.0151975683890577, + "grad_norm": 2.2302119731903076, + "learning_rate": 3.949056403612758e-06, + "loss": 0.3985682725906372, + "mean_token_accuracy": 0.8677899837493896, + "num_tokens": 11932000.0, + "step": 1336 + }, + { + "epoch": 1.0159574468085106, + "grad_norm": 2.360572576522827, + "learning_rate": 3.947349203999485e-06, + "loss": 0.36940714716911316, + "mean_token_accuracy": 0.8760676383972168, + "num_tokens": 11937569.0, + "step": 1337 + }, + { + "epoch": 1.0167173252279635, + "grad_norm": 1.3383921384811401, + "learning_rate": 3.945640988580824e-06, + "loss": 0.40628793835639954, + "mean_token_accuracy": 0.866442084312439, + "num_tokens": 11955679.0, + "step": 1338 + }, + { + "epoch": 1.0174772036474165, + "grad_norm": 2.1502623558044434, + "learning_rate": 3.943931758555669e-06, + "loss": 0.4493565559387207, + "mean_token_accuracy": 0.8307522535324097, + "num_tokens": 11962734.0, + "step": 1339 + }, + { + "epoch": 1.0182370820668694, + "grad_norm": 2.4737331867218018, + "learning_rate": 3.942221515123624e-06, + "loss": 0.28508758544921875, + "mean_token_accuracy": 0.8967142105102539, + "num_tokens": 11967783.0, + "step": 1340 + }, + { + "epoch": 1.0189969604863223, + "grad_norm": 2.4525370597839355, + "learning_rate": 3.940510259485002e-06, + "loss": 0.40227818489074707, + "mean_token_accuracy": 0.8618967533111572, + "num_tokens": 11972918.0, + "step": 1341 + }, + { + "epoch": 1.0197568389057752, + "grad_norm": 1.7299731969833374, + "learning_rate": 3.938797992840828e-06, + "loss": 0.26339593529701233, + "mean_token_accuracy": 0.9004406929016113, + "num_tokens": 11981250.0, + "step": 1342 + }, + { + "epoch": 1.0205167173252279, + "grad_norm": 2.8756747245788574, + "learning_rate": 3.937084716392839e-06, + "loss": 0.47792482376098633, + "mean_token_accuracy": 0.8440839052200317, + "num_tokens": 11986356.0, + "step": 1343 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 2.104473114013672, + "learning_rate": 3.935370431343475e-06, + "loss": 0.36723971366882324, + "mean_token_accuracy": 0.8831232786178589, + "num_tokens": 11994495.0, + "step": 1344 + }, + { + "epoch": 1.0220364741641337, + "grad_norm": 1.9173074960708618, + "learning_rate": 3.933655138895889e-06, + "loss": 0.409319669008255, + "mean_token_accuracy": 0.8632645606994629, + "num_tokens": 12002060.0, + "step": 1345 + }, + { + "epoch": 1.0227963525835866, + "grad_norm": 2.958311080932617, + "learning_rate": 3.9319388402539395e-06, + "loss": 0.5390093922615051, + "mean_token_accuracy": 0.8204828500747681, + "num_tokens": 12007588.0, + "step": 1346 + }, + { + "epoch": 1.0235562310030395, + "grad_norm": 1.6470831632614136, + "learning_rate": 3.930221536622192e-06, + "loss": 0.4524633288383484, + "mean_token_accuracy": 0.8516575694084167, + "num_tokens": 12018831.0, + "step": 1347 + }, + { + "epoch": 1.0243161094224924, + "grad_norm": 1.3160780668258667, + "learning_rate": 3.928503229205913e-06, + "loss": 0.4180558919906616, + "mean_token_accuracy": 0.8495022058486938, + "num_tokens": 12033947.0, + "step": 1348 + }, + { + "epoch": 1.0250759878419453, + "grad_norm": 1.9686089754104614, + "learning_rate": 3.92678391921108e-06, + "loss": 0.41927334666252136, + "mean_token_accuracy": 0.8462997674942017, + "num_tokens": 12042005.0, + "step": 1349 + }, + { + "epoch": 1.0258358662613982, + "grad_norm": 2.351778507232666, + "learning_rate": 3.92506360784437e-06, + "loss": 0.2946245074272156, + "mean_token_accuracy": 0.9170923233032227, + "num_tokens": 12046579.0, + "step": 1350 + }, + { + "epoch": 1.0265957446808511, + "grad_norm": 2.0636913776397705, + "learning_rate": 3.923342296313162e-06, + "loss": 0.3422774076461792, + "mean_token_accuracy": 0.8809213638305664, + "num_tokens": 12053214.0, + "step": 1351 + }, + { + "epoch": 1.027355623100304, + "grad_norm": 1.7272592782974243, + "learning_rate": 3.92161998582554e-06, + "loss": 0.5864541530609131, + "mean_token_accuracy": 0.7986117601394653, + "num_tokens": 12068522.0, + "step": 1352 + }, + { + "epoch": 1.028115501519757, + "grad_norm": 0.8980231881141663, + "learning_rate": 3.919896677590289e-06, + "loss": 0.2964550256729126, + "mean_token_accuracy": 0.8911845088005066, + "num_tokens": 12093834.0, + "step": 1353 + }, + { + "epoch": 1.0288753799392096, + "grad_norm": 1.6031712293624878, + "learning_rate": 3.918172372816892e-06, + "loss": 0.37254488468170166, + "mean_token_accuracy": 0.8615843057632446, + "num_tokens": 12104393.0, + "step": 1354 + }, + { + "epoch": 1.0296352583586625, + "grad_norm": 1.282134771347046, + "learning_rate": 3.916447072715531e-06, + "loss": 0.3522927761077881, + "mean_token_accuracy": 0.8713657259941101, + "num_tokens": 12118671.0, + "step": 1355 + }, + { + "epoch": 1.0303951367781155, + "grad_norm": 2.1986680030822754, + "learning_rate": 3.914720778497091e-06, + "loss": 0.3716316223144531, + "mean_token_accuracy": 0.8661249279975891, + "num_tokens": 12125178.0, + "step": 1356 + }, + { + "epoch": 1.0311550151975684, + "grad_norm": 1.5937882661819458, + "learning_rate": 3.91299349137315e-06, + "loss": 0.48067355155944824, + "mean_token_accuracy": 0.8284252882003784, + "num_tokens": 12136785.0, + "step": 1357 + }, + { + "epoch": 1.0319148936170213, + "grad_norm": 1.6743099689483643, + "learning_rate": 3.9112652125559845e-06, + "loss": 0.4461551308631897, + "mean_token_accuracy": 0.8381845355033875, + "num_tokens": 12150066.0, + "step": 1358 + }, + { + "epoch": 1.0326747720364742, + "grad_norm": 2.2346715927124023, + "learning_rate": 3.909535943258567e-06, + "loss": 0.3148220181465149, + "mean_token_accuracy": 0.8797591924667358, + "num_tokens": 12155506.0, + "step": 1359 + }, + { + "epoch": 1.033434650455927, + "grad_norm": 1.9608992338180542, + "learning_rate": 3.907805684694567e-06, + "loss": 0.32598960399627686, + "mean_token_accuracy": 0.8819410800933838, + "num_tokens": 12163261.0, + "step": 1360 + }, + { + "epoch": 1.03419452887538, + "grad_norm": 2.413477897644043, + "learning_rate": 3.906074438078343e-06, + "loss": 0.38179588317871094, + "mean_token_accuracy": 0.8739585876464844, + "num_tokens": 12169254.0, + "step": 1361 + }, + { + "epoch": 1.034954407294833, + "grad_norm": 2.0258278846740723, + "learning_rate": 3.904342204624955e-06, + "loss": 0.33240315318107605, + "mean_token_accuracy": 0.8808181285858154, + "num_tokens": 12175379.0, + "step": 1362 + }, + { + "epoch": 1.0357142857142858, + "grad_norm": 2.4111437797546387, + "learning_rate": 3.9026089855501475e-06, + "loss": 0.412802517414093, + "mean_token_accuracy": 0.8504396677017212, + "num_tokens": 12182007.0, + "step": 1363 + }, + { + "epoch": 1.0364741641337385, + "grad_norm": 2.0424840450286865, + "learning_rate": 3.900874782070362e-06, + "loss": 0.2914797067642212, + "mean_token_accuracy": 0.8731886148452759, + "num_tokens": 12187743.0, + "step": 1364 + }, + { + "epoch": 1.0372340425531914, + "grad_norm": 2.9248716831207275, + "learning_rate": 3.899139595402729e-06, + "loss": 0.34071338176727295, + "mean_token_accuracy": 0.8736443519592285, + "num_tokens": 12191830.0, + "step": 1365 + }, + { + "epoch": 1.0379939209726443, + "grad_norm": 2.240220785140991, + "learning_rate": 3.8974034267650695e-06, + "loss": 0.23049014806747437, + "mean_token_accuracy": 0.9000070691108704, + "num_tokens": 12196460.0, + "step": 1366 + }, + { + "epoch": 1.0387537993920972, + "grad_norm": 1.5038460493087769, + "learning_rate": 3.895666277375892e-06, + "loss": 0.32255327701568604, + "mean_token_accuracy": 0.873004674911499, + "num_tokens": 12206230.0, + "step": 1367 + }, + { + "epoch": 1.0395136778115501, + "grad_norm": 1.2339142560958862, + "learning_rate": 3.893928148454398e-06, + "loss": 0.4069131314754486, + "mean_token_accuracy": 0.8461740016937256, + "num_tokens": 12226502.0, + "step": 1368 + }, + { + "epoch": 1.040273556231003, + "grad_norm": 2.531553268432617, + "learning_rate": 3.89218904122047e-06, + "loss": 0.43681037425994873, + "mean_token_accuracy": 0.8497104048728943, + "num_tokens": 12232241.0, + "step": 1369 + }, + { + "epoch": 1.041033434650456, + "grad_norm": 3.8404815196990967, + "learning_rate": 3.890448956894682e-06, + "loss": 0.3241814970970154, + "mean_token_accuracy": 0.884732723236084, + "num_tokens": 12235126.0, + "step": 1370 + }, + { + "epoch": 1.0417933130699089, + "grad_norm": 2.9608030319213867, + "learning_rate": 3.888707896698293e-06, + "loss": 0.4641021490097046, + "mean_token_accuracy": 0.8496800661087036, + "num_tokens": 12240630.0, + "step": 1371 + }, + { + "epoch": 1.0425531914893618, + "grad_norm": 2.1166417598724365, + "learning_rate": 3.886965861853243e-06, + "loss": 0.42038479447364807, + "mean_token_accuracy": 0.8512747287750244, + "num_tokens": 12247969.0, + "step": 1372 + }, + { + "epoch": 1.0433130699088147, + "grad_norm": 2.5918161869049072, + "learning_rate": 3.885222853582163e-06, + "loss": 0.2871917188167572, + "mean_token_accuracy": 0.9129709601402283, + "num_tokens": 12252161.0, + "step": 1373 + }, + { + "epoch": 1.0440729483282676, + "grad_norm": 2.4261348247528076, + "learning_rate": 3.88347887310836e-06, + "loss": 0.4003123342990875, + "mean_token_accuracy": 0.8570356369018555, + "num_tokens": 12258135.0, + "step": 1374 + }, + { + "epoch": 1.0448328267477203, + "grad_norm": 1.3439548015594482, + "learning_rate": 3.881733921655829e-06, + "loss": 0.3278140425682068, + "mean_token_accuracy": 0.8831373453140259, + "num_tokens": 12272849.0, + "step": 1375 + }, + { + "epoch": 1.0455927051671732, + "grad_norm": 1.527989387512207, + "learning_rate": 3.879988000449243e-06, + "loss": 0.33789363503456116, + "mean_token_accuracy": 0.8825669884681702, + "num_tokens": 12283281.0, + "step": 1376 + }, + { + "epoch": 1.046352583586626, + "grad_norm": 1.6755503416061401, + "learning_rate": 3.878241110713957e-06, + "loss": 0.4816160798072815, + "mean_token_accuracy": 0.8193758726119995, + "num_tokens": 12295422.0, + "step": 1377 + }, + { + "epoch": 1.047112462006079, + "grad_norm": 2.8110361099243164, + "learning_rate": 3.876493253676004e-06, + "loss": 0.38662949204444885, + "mean_token_accuracy": 0.8611986637115479, + "num_tokens": 12299806.0, + "step": 1378 + }, + { + "epoch": 1.047872340425532, + "grad_norm": 1.86097252368927, + "learning_rate": 3.8747444305621e-06, + "loss": 0.27612629532814026, + "mean_token_accuracy": 0.8984048366546631, + "num_tokens": 12306599.0, + "step": 1379 + }, + { + "epoch": 1.0486322188449848, + "grad_norm": 2.361828565597534, + "learning_rate": 3.872994642599635e-06, + "loss": 0.469953715801239, + "mean_token_accuracy": 0.8464452028274536, + "num_tokens": 12314249.0, + "step": 1380 + }, + { + "epoch": 1.0493920972644377, + "grad_norm": 1.9524794816970825, + "learning_rate": 3.871243891016676e-06, + "loss": 0.5419625043869019, + "mean_token_accuracy": 0.8468329906463623, + "num_tokens": 12324987.0, + "step": 1381 + }, + { + "epoch": 1.0501519756838906, + "grad_norm": 1.6931511163711548, + "learning_rate": 3.869492177041971e-06, + "loss": 0.3791416883468628, + "mean_token_accuracy": 0.8692882061004639, + "num_tokens": 12336864.0, + "step": 1382 + }, + { + "epoch": 1.0509118541033435, + "grad_norm": 1.909692406654358, + "learning_rate": 3.867739501904938e-06, + "loss": 0.27974557876586914, + "mean_token_accuracy": 0.9004636406898499, + "num_tokens": 12343093.0, + "step": 1383 + }, + { + "epoch": 1.0516717325227964, + "grad_norm": 1.415162205696106, + "learning_rate": 3.8659858668356735e-06, + "loss": 0.38928335905075073, + "mean_token_accuracy": 0.8491984009742737, + "num_tokens": 12356613.0, + "step": 1384 + }, + { + "epoch": 1.0524316109422491, + "grad_norm": 1.8195741176605225, + "learning_rate": 3.864231273064944e-06, + "loss": 0.3798758089542389, + "mean_token_accuracy": 0.8728072047233582, + "num_tokens": 12364860.0, + "step": 1385 + }, + { + "epoch": 1.053191489361702, + "grad_norm": 1.8481454849243164, + "learning_rate": 3.862475721824193e-06, + "loss": 0.269635945558548, + "mean_token_accuracy": 0.899247407913208, + "num_tokens": 12371841.0, + "step": 1386 + }, + { + "epoch": 1.053951367781155, + "grad_norm": 1.7838784456253052, + "learning_rate": 3.8607192143455325e-06, + "loss": 0.36971768736839294, + "mean_token_accuracy": 0.8833638429641724, + "num_tokens": 12380685.0, + "step": 1387 + }, + { + "epoch": 1.0547112462006079, + "grad_norm": 1.333358645439148, + "learning_rate": 3.858961751861748e-06, + "loss": 0.4039418399333954, + "mean_token_accuracy": 0.8541078567504883, + "num_tokens": 12394072.0, + "step": 1388 + }, + { + "epoch": 1.0554711246200608, + "grad_norm": 2.1600265502929688, + "learning_rate": 3.857203335606294e-06, + "loss": 0.38211894035339355, + "mean_token_accuracy": 0.8549972772598267, + "num_tokens": 12400449.0, + "step": 1389 + }, + { + "epoch": 1.0562310030395137, + "grad_norm": 2.914902687072754, + "learning_rate": 3.855443966813295e-06, + "loss": 0.2237374186515808, + "mean_token_accuracy": 0.9253600835800171, + "num_tokens": 12403758.0, + "step": 1390 + }, + { + "epoch": 1.0569908814589666, + "grad_norm": 2.2361080646514893, + "learning_rate": 3.853683646717543e-06, + "loss": 0.3359566926956177, + "mean_token_accuracy": 0.898173451423645, + "num_tokens": 12410374.0, + "step": 1391 + }, + { + "epoch": 1.0577507598784195, + "grad_norm": 2.3639304637908936, + "learning_rate": 3.8519223765544985e-06, + "loss": 0.3844943046569824, + "mean_token_accuracy": 0.863599419593811, + "num_tokens": 12416016.0, + "step": 1392 + }, + { + "epoch": 1.0585106382978724, + "grad_norm": 2.202971935272217, + "learning_rate": 3.85016015756029e-06, + "loss": 0.3546281158924103, + "mean_token_accuracy": 0.8907540440559387, + "num_tokens": 12422026.0, + "step": 1393 + }, + { + "epoch": 1.0592705167173253, + "grad_norm": 1.1279661655426025, + "learning_rate": 3.848396990971709e-06, + "loss": 0.31522464752197266, + "mean_token_accuracy": 0.8662257194519043, + "num_tokens": 12439964.0, + "step": 1394 + }, + { + "epoch": 1.0600303951367782, + "grad_norm": 2.4731740951538086, + "learning_rate": 3.846632878026214e-06, + "loss": 0.456442266702652, + "mean_token_accuracy": 0.8516958951950073, + "num_tokens": 12446231.0, + "step": 1395 + }, + { + "epoch": 1.060790273556231, + "grad_norm": 1.7631878852844238, + "learning_rate": 3.844867819961928e-06, + "loss": 0.487227201461792, + "mean_token_accuracy": 0.8466947078704834, + "num_tokens": 12459989.0, + "step": 1396 + }, + { + "epoch": 1.0615501519756838, + "grad_norm": 2.4468278884887695, + "learning_rate": 3.843101818017637e-06, + "loss": 0.3367291986942291, + "mean_token_accuracy": 0.8734689950942993, + "num_tokens": 12465741.0, + "step": 1397 + }, + { + "epoch": 1.0623100303951367, + "grad_norm": 1.9045145511627197, + "learning_rate": 3.841334873432789e-06, + "loss": 0.4652615487575531, + "mean_token_accuracy": 0.8333107233047485, + "num_tokens": 12474963.0, + "step": 1398 + }, + { + "epoch": 1.0630699088145896, + "grad_norm": 1.6816917657852173, + "learning_rate": 3.839566987447492e-06, + "loss": 0.4144279956817627, + "mean_token_accuracy": 0.8472539186477661, + "num_tokens": 12485521.0, + "step": 1399 + }, + { + "epoch": 1.0638297872340425, + "grad_norm": 1.8990092277526855, + "learning_rate": 3.837798161302518e-06, + "loss": 0.4040985405445099, + "mean_token_accuracy": 0.8514704704284668, + "num_tokens": 12493495.0, + "step": 1400 + }, + { + "epoch": 1.0645896656534954, + "grad_norm": 2.27785325050354, + "learning_rate": 3.836028396239297e-06, + "loss": 0.43425723910331726, + "mean_token_accuracy": 0.8795069456100464, + "num_tokens": 12499789.0, + "step": 1401 + }, + { + "epoch": 1.0653495440729484, + "grad_norm": 2.5130882263183594, + "learning_rate": 3.8342576934999184e-06, + "loss": 0.33892524242401123, + "mean_token_accuracy": 0.8717449903488159, + "num_tokens": 12504885.0, + "step": 1402 + }, + { + "epoch": 1.0661094224924013, + "grad_norm": 2.650040864944458, + "learning_rate": 3.832486054327131e-06, + "loss": 0.4200317859649658, + "mean_token_accuracy": 0.8616159558296204, + "num_tokens": 12509783.0, + "step": 1403 + }, + { + "epoch": 1.0668693009118542, + "grad_norm": 2.9176881313323975, + "learning_rate": 3.830713479964335e-06, + "loss": 0.37018489837646484, + "mean_token_accuracy": 0.8676021695137024, + "num_tokens": 12514441.0, + "step": 1404 + }, + { + "epoch": 1.067629179331307, + "grad_norm": 1.6430318355560303, + "learning_rate": 3.828939971655595e-06, + "loss": 0.27539193630218506, + "mean_token_accuracy": 0.9077831506729126, + "num_tokens": 12523677.0, + "step": 1405 + }, + { + "epoch": 1.06838905775076, + "grad_norm": 1.3683708906173706, + "learning_rate": 3.827165530645627e-06, + "loss": 0.4085099697113037, + "mean_token_accuracy": 0.8579255938529968, + "num_tokens": 12540104.0, + "step": 1406 + }, + { + "epoch": 1.0691489361702127, + "grad_norm": 2.528465747833252, + "learning_rate": 3.825390158179802e-06, + "loss": 0.42462456226348877, + "mean_token_accuracy": 0.852813720703125, + "num_tokens": 12548239.0, + "step": 1407 + }, + { + "epoch": 1.0699088145896656, + "grad_norm": 1.8288795948028564, + "learning_rate": 3.823613855504144e-06, + "loss": 0.412417471408844, + "mean_token_accuracy": 0.8622130751609802, + "num_tokens": 12557316.0, + "step": 1408 + }, + { + "epoch": 1.0706686930091185, + "grad_norm": 2.341794490814209, + "learning_rate": 3.82183662386533e-06, + "loss": 0.2996668815612793, + "mean_token_accuracy": 0.8964041471481323, + "num_tokens": 12562377.0, + "step": 1409 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 2.555877208709717, + "learning_rate": 3.82005846451069e-06, + "loss": 0.4184221625328064, + "mean_token_accuracy": 0.8678828477859497, + "num_tokens": 12568516.0, + "step": 1410 + }, + { + "epoch": 1.0721884498480243, + "grad_norm": 2.081308126449585, + "learning_rate": 3.8182793786882065e-06, + "loss": 0.4376835823059082, + "mean_token_accuracy": 0.8409077525138855, + "num_tokens": 12576598.0, + "step": 1411 + }, + { + "epoch": 1.0729483282674772, + "grad_norm": 2.0272316932678223, + "learning_rate": 3.816499367646508e-06, + "loss": 0.3630060851573944, + "mean_token_accuracy": 0.8762413263320923, + "num_tokens": 12584587.0, + "step": 1412 + }, + { + "epoch": 1.0737082066869301, + "grad_norm": 2.6382484436035156, + "learning_rate": 3.814718432634877e-06, + "loss": 0.4244990348815918, + "mean_token_accuracy": 0.8509312272071838, + "num_tokens": 12590028.0, + "step": 1413 + }, + { + "epoch": 1.074468085106383, + "grad_norm": 2.429800271987915, + "learning_rate": 3.8129365749032398e-06, + "loss": 0.36990004777908325, + "mean_token_accuracy": 0.8749774098396301, + "num_tokens": 12594984.0, + "step": 1414 + }, + { + "epoch": 1.075227963525836, + "grad_norm": 3.5939090251922607, + "learning_rate": 3.8111537957021736e-06, + "loss": 0.4245661199092865, + "mean_token_accuracy": 0.8481623530387878, + "num_tokens": 12598494.0, + "step": 1415 + }, + { + "epoch": 1.0759878419452888, + "grad_norm": 2.705955982208252, + "learning_rate": 3.809370096282903e-06, + "loss": 0.41851678490638733, + "mean_token_accuracy": 0.8548051714897156, + "num_tokens": 12603876.0, + "step": 1416 + }, + { + "epoch": 1.0767477203647418, + "grad_norm": 1.7812079191207886, + "learning_rate": 3.807585477897296e-06, + "loss": 0.47113919258117676, + "mean_token_accuracy": 0.8346904516220093, + "num_tokens": 12613402.0, + "step": 1417 + }, + { + "epoch": 1.0775075987841944, + "grad_norm": 1.4335212707519531, + "learning_rate": 3.8057999417978654e-06, + "loss": 0.3802063465118408, + "mean_token_accuracy": 0.8563423156738281, + "num_tokens": 12626865.0, + "step": 1418 + }, + { + "epoch": 1.0782674772036474, + "grad_norm": 1.9171305894851685, + "learning_rate": 3.8040134892377702e-06, + "loss": 0.20898357033729553, + "mean_token_accuracy": 0.9189738035202026, + "num_tokens": 12632593.0, + "step": 1419 + }, + { + "epoch": 1.0790273556231003, + "grad_norm": 1.4996821880340576, + "learning_rate": 3.802226121470811e-06, + "loss": 0.4203261137008667, + "mean_token_accuracy": 0.8479211330413818, + "num_tokens": 12646395.0, + "step": 1420 + }, + { + "epoch": 1.0797872340425532, + "grad_norm": 2.2007253170013428, + "learning_rate": 3.800437839751432e-06, + "loss": 0.40370577573776245, + "mean_token_accuracy": 0.8427679538726807, + "num_tokens": 12653508.0, + "step": 1421 + }, + { + "epoch": 1.080547112462006, + "grad_norm": 1.7266581058502197, + "learning_rate": 3.7986486453347183e-06, + "loss": 0.46750491857528687, + "mean_token_accuracy": 0.8429205417633057, + "num_tokens": 12666329.0, + "step": 1422 + }, + { + "epoch": 1.081306990881459, + "grad_norm": 1.4716318845748901, + "learning_rate": 3.796858539476394e-06, + "loss": 0.3330317735671997, + "mean_token_accuracy": 0.879012942314148, + "num_tokens": 12676741.0, + "step": 1423 + }, + { + "epoch": 1.082066869300912, + "grad_norm": 2.652127265930176, + "learning_rate": 3.795067523432826e-06, + "loss": 0.35365715622901917, + "mean_token_accuracy": 0.8796792030334473, + "num_tokens": 12681479.0, + "step": 1424 + }, + { + "epoch": 1.0828267477203648, + "grad_norm": 1.2937829494476318, + "learning_rate": 3.793275598461017e-06, + "loss": 0.25272446870803833, + "mean_token_accuracy": 0.9231734275817871, + "num_tokens": 12694238.0, + "step": 1425 + }, + { + "epoch": 1.0835866261398177, + "grad_norm": 1.3831220865249634, + "learning_rate": 3.7914827658186104e-06, + "loss": 0.4935331344604492, + "mean_token_accuracy": 0.8417420387268066, + "num_tokens": 12712857.0, + "step": 1426 + }, + { + "epoch": 1.0843465045592706, + "grad_norm": 3.059525728225708, + "learning_rate": 3.7896890267638832e-06, + "loss": 0.2592190206050873, + "mean_token_accuracy": 0.9040263295173645, + "num_tokens": 12716766.0, + "step": 1427 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 2.8399202823638916, + "learning_rate": 3.787894382555752e-06, + "loss": 0.32098138332366943, + "mean_token_accuracy": 0.8838302493095398, + "num_tokens": 12720774.0, + "step": 1428 + }, + { + "epoch": 1.0858662613981762, + "grad_norm": 2.618479013442993, + "learning_rate": 3.7860988344537664e-06, + "loss": 0.425255686044693, + "mean_token_accuracy": 0.8564130067825317, + "num_tokens": 12726506.0, + "step": 1429 + }, + { + "epoch": 1.0866261398176291, + "grad_norm": 1.3108669519424438, + "learning_rate": 3.7843023837181126e-06, + "loss": 0.40220165252685547, + "mean_token_accuracy": 0.8588873147964478, + "num_tokens": 12742814.0, + "step": 1430 + }, + { + "epoch": 1.087386018237082, + "grad_norm": 2.2083566188812256, + "learning_rate": 3.782505031609607e-06, + "loss": 0.318379282951355, + "mean_token_accuracy": 0.8887606859207153, + "num_tokens": 12748388.0, + "step": 1431 + }, + { + "epoch": 1.088145896656535, + "grad_norm": 1.922358751296997, + "learning_rate": 3.7807067793897006e-06, + "loss": 0.2519589364528656, + "mean_token_accuracy": 0.8936764001846313, + "num_tokens": 12754761.0, + "step": 1432 + }, + { + "epoch": 1.0889057750759878, + "grad_norm": 1.7367439270019531, + "learning_rate": 3.778907628320477e-06, + "loss": 0.3970367908477783, + "mean_token_accuracy": 0.858735203742981, + "num_tokens": 12764016.0, + "step": 1433 + }, + { + "epoch": 1.0896656534954408, + "grad_norm": 2.1931066513061523, + "learning_rate": 3.77710757966465e-06, + "loss": 0.5250554084777832, + "mean_token_accuracy": 0.8356746435165405, + "num_tokens": 12772272.0, + "step": 1434 + }, + { + "epoch": 1.0904255319148937, + "grad_norm": 1.718337893486023, + "learning_rate": 3.775306634685562e-06, + "loss": 0.283231645822525, + "mean_token_accuracy": 0.9009919166564941, + "num_tokens": 12780706.0, + "step": 1435 + }, + { + "epoch": 1.0911854103343466, + "grad_norm": 2.1985926628112793, + "learning_rate": 3.773504794647187e-06, + "loss": 0.3913170397281647, + "mean_token_accuracy": 0.8909255266189575, + "num_tokens": 12787052.0, + "step": 1436 + }, + { + "epoch": 1.0919452887537995, + "grad_norm": 2.8687937259674072, + "learning_rate": 3.771702060814123e-06, + "loss": 0.3135771155357361, + "mean_token_accuracy": 0.9016125202178955, + "num_tokens": 12791854.0, + "step": 1437 + }, + { + "epoch": 1.0927051671732522, + "grad_norm": 4.203946590423584, + "learning_rate": 3.7698984344516e-06, + "loss": 0.3642737865447998, + "mean_token_accuracy": 0.8842349052429199, + "num_tokens": 12794969.0, + "step": 1438 + }, + { + "epoch": 1.093465045592705, + "grad_norm": 1.5134642124176025, + "learning_rate": 3.7680939168254733e-06, + "loss": 0.3732057213783264, + "mean_token_accuracy": 0.8671083450317383, + "num_tokens": 12808480.0, + "step": 1439 + }, + { + "epoch": 1.094224924012158, + "grad_norm": 3.2103970050811768, + "learning_rate": 3.7662885092022206e-06, + "loss": 0.3556194603443146, + "mean_token_accuracy": 0.8786529302597046, + "num_tokens": 12812654.0, + "step": 1440 + }, + { + "epoch": 1.094984802431611, + "grad_norm": 2.2774064540863037, + "learning_rate": 3.7644822128489476e-06, + "loss": 0.38409674167633057, + "mean_token_accuracy": 0.866563081741333, + "num_tokens": 12819854.0, + "step": 1441 + }, + { + "epoch": 1.0957446808510638, + "grad_norm": 1.8250885009765625, + "learning_rate": 3.7626750290333824e-06, + "loss": 0.3812350034713745, + "mean_token_accuracy": 0.8676212430000305, + "num_tokens": 12830338.0, + "step": 1442 + }, + { + "epoch": 1.0965045592705167, + "grad_norm": 1.8337891101837158, + "learning_rate": 3.7608669590238765e-06, + "loss": 0.3892471194267273, + "mean_token_accuracy": 0.8616238832473755, + "num_tokens": 12840340.0, + "step": 1443 + }, + { + "epoch": 1.0972644376899696, + "grad_norm": 1.5300254821777344, + "learning_rate": 3.7590580040894025e-06, + "loss": 0.35288217663764954, + "mean_token_accuracy": 0.8625509738922119, + "num_tokens": 12853144.0, + "step": 1444 + }, + { + "epoch": 1.0980243161094225, + "grad_norm": 2.152683734893799, + "learning_rate": 3.7572481654995554e-06, + "loss": 0.4004772901535034, + "mean_token_accuracy": 0.858427107334137, + "num_tokens": 12859970.0, + "step": 1445 + }, + { + "epoch": 1.0987841945288754, + "grad_norm": 1.532832145690918, + "learning_rate": 3.755437444524548e-06, + "loss": 0.46820127964019775, + "mean_token_accuracy": 0.8585472106933594, + "num_tokens": 12875243.0, + "step": 1446 + }, + { + "epoch": 1.0995440729483283, + "grad_norm": 1.6485342979431152, + "learning_rate": 3.7536258424352164e-06, + "loss": 0.46329325437545776, + "mean_token_accuracy": 0.8376060724258423, + "num_tokens": 12886383.0, + "step": 1447 + }, + { + "epoch": 1.1003039513677813, + "grad_norm": 2.402256488800049, + "learning_rate": 3.75181336050301e-06, + "loss": 0.43916207551956177, + "mean_token_accuracy": 0.8448786735534668, + "num_tokens": 12892613.0, + "step": 1448 + }, + { + "epoch": 1.101063829787234, + "grad_norm": 1.3893651962280273, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.3919021785259247, + "mean_token_accuracy": 0.8495820760726929, + "num_tokens": 12905523.0, + "step": 1449 + }, + { + "epoch": 1.1018237082066868, + "grad_norm": 1.5519827604293823, + "learning_rate": 3.7481857621988734e-06, + "loss": 0.4710700809955597, + "mean_token_accuracy": 0.8387632369995117, + "num_tokens": 12918236.0, + "step": 1450 + }, + { + "epoch": 1.1025835866261398, + "grad_norm": 2.0141353607177734, + "learning_rate": 3.74637064837293e-06, + "loss": 0.30866751074790955, + "mean_token_accuracy": 0.9059321880340576, + "num_tokens": 12924391.0, + "step": 1451 + }, + { + "epoch": 1.1033434650455927, + "grad_norm": 1.2201496362686157, + "learning_rate": 3.7445546597960882e-06, + "loss": 0.3938257396221161, + "mean_token_accuracy": 0.8726630210876465, + "num_tokens": 12943338.0, + "step": 1452 + }, + { + "epoch": 1.1041033434650456, + "grad_norm": 2.29434871673584, + "learning_rate": 3.742737797742878e-06, + "loss": 0.4347776174545288, + "mean_token_accuracy": 0.840569257736206, + "num_tokens": 12950636.0, + "step": 1453 + }, + { + "epoch": 1.1048632218844985, + "grad_norm": 2.3875105381011963, + "learning_rate": 3.7409200634884425e-06, + "loss": 0.48353564739227295, + "mean_token_accuracy": 0.8207056522369385, + "num_tokens": 12957635.0, + "step": 1454 + }, + { + "epoch": 1.1056231003039514, + "grad_norm": 2.3539648056030273, + "learning_rate": 3.7391014583085384e-06, + "loss": 0.3532431721687317, + "mean_token_accuracy": 0.8903788924217224, + "num_tokens": 12963032.0, + "step": 1455 + }, + { + "epoch": 1.1063829787234043, + "grad_norm": 1.5611135959625244, + "learning_rate": 3.737281983479534e-06, + "loss": 0.4734863042831421, + "mean_token_accuracy": 0.8413879871368408, + "num_tokens": 12977170.0, + "step": 1456 + }, + { + "epoch": 1.1071428571428572, + "grad_norm": 1.474320411682129, + "learning_rate": 3.735461640278404e-06, + "loss": 0.41854286193847656, + "mean_token_accuracy": 0.8499876856803894, + "num_tokens": 12993750.0, + "step": 1457 + }, + { + "epoch": 1.1079027355623101, + "grad_norm": 2.6873273849487305, + "learning_rate": 3.733640429982738e-06, + "loss": 0.47637903690338135, + "mean_token_accuracy": 0.83599853515625, + "num_tokens": 12999058.0, + "step": 1458 + }, + { + "epoch": 1.108662613981763, + "grad_norm": 1.4575026035308838, + "learning_rate": 3.731818353870729e-06, + "loss": 0.38441652059555054, + "mean_token_accuracy": 0.8582364320755005, + "num_tokens": 13013864.0, + "step": 1459 + }, + { + "epoch": 1.1094224924012157, + "grad_norm": 1.7722690105438232, + "learning_rate": 3.729995413221183e-06, + "loss": 0.4224998950958252, + "mean_token_accuracy": 0.8511888384819031, + "num_tokens": 13023714.0, + "step": 1460 + }, + { + "epoch": 1.1101823708206686, + "grad_norm": 2.625760555267334, + "learning_rate": 3.7281716093135068e-06, + "loss": 0.3487582802772522, + "mean_token_accuracy": 0.8834779262542725, + "num_tokens": 13028608.0, + "step": 1461 + }, + { + "epoch": 1.1109422492401215, + "grad_norm": 1.2554056644439697, + "learning_rate": 3.726346943427719e-06, + "loss": 0.33312469720840454, + "mean_token_accuracy": 0.8704153299331665, + "num_tokens": 13044901.0, + "step": 1462 + }, + { + "epoch": 1.1117021276595744, + "grad_norm": 2.1109910011291504, + "learning_rate": 3.7245214168444388e-06, + "loss": 0.387290894985199, + "mean_token_accuracy": 0.860816240310669, + "num_tokens": 13051452.0, + "step": 1463 + }, + { + "epoch": 1.1124620060790273, + "grad_norm": 3.159201145172119, + "learning_rate": 3.722695030844891e-06, + "loss": 0.37690871953964233, + "mean_token_accuracy": 0.8717561960220337, + "num_tokens": 13055131.0, + "step": 1464 + }, + { + "epoch": 1.1132218844984803, + "grad_norm": 1.3810011148452759, + "learning_rate": 3.7208677867109042e-06, + "loss": 0.36598485708236694, + "mean_token_accuracy": 0.8683375120162964, + "num_tokens": 13069798.0, + "step": 1465 + }, + { + "epoch": 1.1139817629179332, + "grad_norm": 2.500849485397339, + "learning_rate": 3.7190396857249087e-06, + "loss": 0.2781746983528137, + "mean_token_accuracy": 0.9026005268096924, + "num_tokens": 13075127.0, + "step": 1466 + }, + { + "epoch": 1.114741641337386, + "grad_norm": 1.7445712089538574, + "learning_rate": 3.7172107291699356e-06, + "loss": 0.5055314302444458, + "mean_token_accuracy": 0.8252174258232117, + "num_tokens": 13084843.0, + "step": 1467 + }, + { + "epoch": 1.115501519756839, + "grad_norm": 1.6386256217956543, + "learning_rate": 3.7153809183296174e-06, + "loss": 0.38478314876556396, + "mean_token_accuracy": 0.8600847721099854, + "num_tokens": 13096517.0, + "step": 1468 + }, + { + "epoch": 1.1162613981762919, + "grad_norm": 2.3818395137786865, + "learning_rate": 3.713550254488185e-06, + "loss": 0.40308547019958496, + "mean_token_accuracy": 0.8628184795379639, + "num_tokens": 13102324.0, + "step": 1469 + }, + { + "epoch": 1.1170212765957448, + "grad_norm": 1.73163640499115, + "learning_rate": 3.7117187389304703e-06, + "loss": 0.5035421848297119, + "mean_token_accuracy": 0.8229597210884094, + "num_tokens": 13113763.0, + "step": 1470 + }, + { + "epoch": 1.1177811550151975, + "grad_norm": 3.147177219390869, + "learning_rate": 3.7098863729418997e-06, + "loss": 0.557449221611023, + "mean_token_accuracy": 0.8266849517822266, + "num_tokens": 13118849.0, + "step": 1471 + }, + { + "epoch": 1.1185410334346504, + "grad_norm": 1.5061391592025757, + "learning_rate": 3.7080531578085e-06, + "loss": 0.3759554922580719, + "mean_token_accuracy": 0.8541903495788574, + "num_tokens": 13131337.0, + "step": 1472 + }, + { + "epoch": 1.1193009118541033, + "grad_norm": 2.172346353530884, + "learning_rate": 3.7062190948168906e-06, + "loss": 0.41491609811782837, + "mean_token_accuracy": 0.8531454801559448, + "num_tokens": 13139767.0, + "step": 1473 + }, + { + "epoch": 1.1200607902735562, + "grad_norm": 2.1527154445648193, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.4309239387512207, + "mean_token_accuracy": 0.8327745199203491, + "num_tokens": 13147210.0, + "step": 1474 + }, + { + "epoch": 1.1208206686930091, + "grad_norm": 1.8342832326889038, + "learning_rate": 3.7025484304085035e-06, + "loss": 0.34393298625946045, + "mean_token_accuracy": 0.8948153257369995, + "num_tokens": 13154831.0, + "step": 1475 + }, + { + "epoch": 1.121580547112462, + "grad_norm": 2.509291172027588, + "learning_rate": 3.7007118315679384e-06, + "loss": 0.4479471445083618, + "mean_token_accuracy": 0.8280234336853027, + "num_tokens": 13161040.0, + "step": 1476 + }, + { + "epoch": 1.122340425531915, + "grad_norm": 2.914710521697998, + "learning_rate": 3.6988743900215895e-06, + "loss": 0.3724832832813263, + "mean_token_accuracy": 0.863893985748291, + "num_tokens": 13164975.0, + "step": 1477 + }, + { + "epoch": 1.1231003039513678, + "grad_norm": 3.274808645248413, + "learning_rate": 3.6970361070590443e-06, + "loss": 0.4088161885738373, + "mean_token_accuracy": 0.8474822044372559, + "num_tokens": 13168826.0, + "step": 1478 + }, + { + "epoch": 1.1238601823708207, + "grad_norm": 2.861546277999878, + "learning_rate": 3.695196983970481e-06, + "loss": 0.45837992429733276, + "mean_token_accuracy": 0.8579759001731873, + "num_tokens": 13173794.0, + "step": 1479 + }, + { + "epoch": 1.1246200607902737, + "grad_norm": 1.9491597414016724, + "learning_rate": 3.6933570220466654e-06, + "loss": 0.4333910346031189, + "mean_token_accuracy": 0.8444236516952515, + "num_tokens": 13181598.0, + "step": 1480 + }, + { + "epoch": 1.1253799392097266, + "grad_norm": 1.329848051071167, + "learning_rate": 3.6915162225789546e-06, + "loss": 0.36404621601104736, + "mean_token_accuracy": 0.8694117069244385, + "num_tokens": 13196381.0, + "step": 1481 + }, + { + "epoch": 1.1261398176291793, + "grad_norm": 1.8854197263717651, + "learning_rate": 3.6896745868592924e-06, + "loss": 0.4085756838321686, + "mean_token_accuracy": 0.855188250541687, + "num_tokens": 13205236.0, + "step": 1482 + }, + { + "epoch": 1.1268996960486322, + "grad_norm": 3.01684832572937, + "learning_rate": 3.6878321161802106e-06, + "loss": 0.28105655312538147, + "mean_token_accuracy": 0.9009426236152649, + "num_tokens": 13209380.0, + "step": 1483 + }, + { + "epoch": 1.127659574468085, + "grad_norm": 1.8051308393478394, + "learning_rate": 3.685988811834823e-06, + "loss": 0.3314531147480011, + "mean_token_accuracy": 0.8805814385414124, + "num_tokens": 13217714.0, + "step": 1484 + }, + { + "epoch": 1.128419452887538, + "grad_norm": 1.61757493019104, + "learning_rate": 3.684144675116836e-06, + "loss": 0.4543863534927368, + "mean_token_accuracy": 0.8400536775588989, + "num_tokens": 13229330.0, + "step": 1485 + }, + { + "epoch": 1.1291793313069909, + "grad_norm": 1.602686882019043, + "learning_rate": 3.682299707320532e-06, + "loss": 0.3653204143047333, + "mean_token_accuracy": 0.8655825853347778, + "num_tokens": 13242872.0, + "step": 1486 + }, + { + "epoch": 1.1299392097264438, + "grad_norm": 2.3093113899230957, + "learning_rate": 3.680453909740782e-06, + "loss": 0.4383693039417267, + "mean_token_accuracy": 0.839782178401947, + "num_tokens": 13248976.0, + "step": 1487 + }, + { + "epoch": 1.1306990881458967, + "grad_norm": 1.180559754371643, + "learning_rate": 3.6786072836730376e-06, + "loss": 0.5354755520820618, + "mean_token_accuracy": 0.8151205778121948, + "num_tokens": 13272896.0, + "step": 1488 + }, + { + "epoch": 1.1314589665653496, + "grad_norm": 1.9554040431976318, + "learning_rate": 3.6767598304133325e-06, + "loss": 0.4485316872596741, + "mean_token_accuracy": 0.8399936556816101, + "num_tokens": 13280757.0, + "step": 1489 + }, + { + "epoch": 1.1322188449848025, + "grad_norm": 2.236471176147461, + "learning_rate": 3.674911551258279e-06, + "loss": 0.45594364404678345, + "mean_token_accuracy": 0.8552400469779968, + "num_tokens": 13287328.0, + "step": 1490 + }, + { + "epoch": 1.1329787234042552, + "grad_norm": 2.5228686332702637, + "learning_rate": 3.673062447505072e-06, + "loss": 0.4048641622066498, + "mean_token_accuracy": 0.8617376685142517, + "num_tokens": 13292716.0, + "step": 1491 + }, + { + "epoch": 1.1337386018237081, + "grad_norm": 1.1274473667144775, + "learning_rate": 3.6712125204514836e-06, + "loss": 0.3848876357078552, + "mean_token_accuracy": 0.8672975301742554, + "num_tokens": 13313403.0, + "step": 1492 + }, + { + "epoch": 1.134498480243161, + "grad_norm": 2.349541425704956, + "learning_rate": 3.6693617713958633e-06, + "loss": 0.3166058361530304, + "mean_token_accuracy": 0.8896721601486206, + "num_tokens": 13318720.0, + "step": 1493 + }, + { + "epoch": 1.135258358662614, + "grad_norm": 2.2438278198242188, + "learning_rate": 3.6675102016371387e-06, + "loss": 0.5418218970298767, + "mean_token_accuracy": 0.8256527185440063, + "num_tokens": 13325360.0, + "step": 1494 + }, + { + "epoch": 1.1360182370820668, + "grad_norm": 2.21268892288208, + "learning_rate": 3.665657812474812e-06, + "loss": 0.48603951930999756, + "mean_token_accuracy": 0.8273470401763916, + "num_tokens": 13333217.0, + "step": 1495 + }, + { + "epoch": 1.1367781155015197, + "grad_norm": 2.6105997562408447, + "learning_rate": 3.6638046052089614e-06, + "loss": 0.31221291422843933, + "mean_token_accuracy": 0.888375997543335, + "num_tokens": 13338413.0, + "step": 1496 + }, + { + "epoch": 1.1375379939209727, + "grad_norm": 3.655658483505249, + "learning_rate": 3.661950581140239e-06, + "loss": 0.3609023988246918, + "mean_token_accuracy": 0.8838576078414917, + "num_tokens": 13341499.0, + "step": 1497 + }, + { + "epoch": 1.1382978723404256, + "grad_norm": 2.242009162902832, + "learning_rate": 3.660095741569871e-06, + "loss": 0.40022802352905273, + "mean_token_accuracy": 0.8559960722923279, + "num_tokens": 13347917.0, + "step": 1498 + }, + { + "epoch": 1.1390577507598785, + "grad_norm": 1.7958979606628418, + "learning_rate": 3.658240087799655e-06, + "loss": 0.499157190322876, + "mean_token_accuracy": 0.8423802256584167, + "num_tokens": 13361570.0, + "step": 1499 + }, + { + "epoch": 1.1398176291793314, + "grad_norm": 2.5406908988952637, + "learning_rate": 3.6563836211319593e-06, + "loss": 0.4090137481689453, + "mean_token_accuracy": 0.8769663572311401, + "num_tokens": 13367183.0, + "step": 1500 + }, + { + "epoch": 1.1405775075987843, + "grad_norm": 1.9861716032028198, + "learning_rate": 3.654526342869724e-06, + "loss": 0.5125207304954529, + "mean_token_accuracy": 0.8315266370773315, + "num_tokens": 13376767.0, + "step": 1501 + }, + { + "epoch": 1.141337386018237, + "grad_norm": 1.731188178062439, + "learning_rate": 3.65266825431646e-06, + "loss": 0.39452576637268066, + "mean_token_accuracy": 0.8585706353187561, + "num_tokens": 13388437.0, + "step": 1502 + }, + { + "epoch": 1.1420972644376899, + "grad_norm": 1.5203773975372314, + "learning_rate": 3.6508093567762425e-06, + "loss": 0.39466819167137146, + "mean_token_accuracy": 0.8584027886390686, + "num_tokens": 13399727.0, + "step": 1503 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 2.606462001800537, + "learning_rate": 3.6489496515537204e-06, + "loss": 0.4521079361438751, + "mean_token_accuracy": 0.8413360118865967, + "num_tokens": 13408426.0, + "step": 1504 + }, + { + "epoch": 1.1436170212765957, + "grad_norm": 2.6207993030548096, + "learning_rate": 3.647089139954104e-06, + "loss": 0.4709353446960449, + "mean_token_accuracy": 0.8397113084793091, + "num_tokens": 13413506.0, + "step": 1505 + }, + { + "epoch": 1.1443768996960486, + "grad_norm": 1.7214165925979614, + "learning_rate": 3.6452278232831734e-06, + "loss": 0.45506367087364197, + "mean_token_accuracy": 0.8466023206710815, + "num_tokens": 13424592.0, + "step": 1506 + }, + { + "epoch": 1.1451367781155015, + "grad_norm": 1.7111759185791016, + "learning_rate": 3.643365702847272e-06, + "loss": 0.5016278624534607, + "mean_token_accuracy": 0.8196234703063965, + "num_tokens": 13434421.0, + "step": 1507 + }, + { + "epoch": 1.1458966565349544, + "grad_norm": 1.7528148889541626, + "learning_rate": 3.641502779953307e-06, + "loss": 0.5020896196365356, + "mean_token_accuracy": 0.826249361038208, + "num_tokens": 13445286.0, + "step": 1508 + }, + { + "epoch": 1.1466565349544073, + "grad_norm": 1.3470909595489502, + "learning_rate": 3.639639055908751e-06, + "loss": 0.45765724778175354, + "mean_token_accuracy": 0.8380560278892517, + "num_tokens": 13465030.0, + "step": 1509 + }, + { + "epoch": 1.1474164133738602, + "grad_norm": 2.4846835136413574, + "learning_rate": 3.6377745320216346e-06, + "loss": 0.46488267183303833, + "mean_token_accuracy": 0.8393925428390503, + "num_tokens": 13470883.0, + "step": 1510 + }, + { + "epoch": 1.1481762917933132, + "grad_norm": 1.770201563835144, + "learning_rate": 3.635909209600555e-06, + "loss": 0.5262179374694824, + "mean_token_accuracy": 0.8201162815093994, + "num_tokens": 13482558.0, + "step": 1511 + }, + { + "epoch": 1.148936170212766, + "grad_norm": 1.5955098867416382, + "learning_rate": 3.6340430899546656e-06, + "loss": 0.430621862411499, + "mean_token_accuracy": 0.8488553762435913, + "num_tokens": 13493003.0, + "step": 1512 + }, + { + "epoch": 1.1496960486322187, + "grad_norm": 2.846176862716675, + "learning_rate": 3.632176174393682e-06, + "loss": 0.23461638391017914, + "mean_token_accuracy": 0.9218817353248596, + "num_tokens": 13496566.0, + "step": 1513 + }, + { + "epoch": 1.1504559270516717, + "grad_norm": 1.9606610536575317, + "learning_rate": 3.630308464227877e-06, + "loss": 0.4940161108970642, + "mean_token_accuracy": 0.8474864959716797, + "num_tokens": 13504843.0, + "step": 1514 + }, + { + "epoch": 1.1512158054711246, + "grad_norm": 1.1588608026504517, + "learning_rate": 3.628439960768082e-06, + "loss": 0.32650992274284363, + "mean_token_accuracy": 0.8797246217727661, + "num_tokens": 13521513.0, + "step": 1515 + }, + { + "epoch": 1.1519756838905775, + "grad_norm": 1.3566495180130005, + "learning_rate": 3.6265706653256837e-06, + "loss": 0.4359064996242523, + "mean_token_accuracy": 0.8379859328269958, + "num_tokens": 13540608.0, + "step": 1516 + }, + { + "epoch": 1.1527355623100304, + "grad_norm": 1.4728609323501587, + "learning_rate": 3.624700579212626e-06, + "loss": 0.29939693212509155, + "mean_token_accuracy": 0.8831408023834229, + "num_tokens": 13550641.0, + "step": 1517 + }, + { + "epoch": 1.1534954407294833, + "grad_norm": 2.162325382232666, + "learning_rate": 3.6228297037414077e-06, + "loss": 0.4097636938095093, + "mean_token_accuracy": 0.8575425148010254, + "num_tokens": 13556931.0, + "step": 1518 + }, + { + "epoch": 1.1542553191489362, + "grad_norm": 1.754439353942871, + "learning_rate": 3.6209580402250816e-06, + "loss": 0.400202214717865, + "mean_token_accuracy": 0.8569821119308472, + "num_tokens": 13565491.0, + "step": 1519 + }, + { + "epoch": 1.155015197568389, + "grad_norm": 1.5250083208084106, + "learning_rate": 3.619085589977251e-06, + "loss": 0.43330419063568115, + "mean_token_accuracy": 0.8492985963821411, + "num_tokens": 13577147.0, + "step": 1520 + }, + { + "epoch": 1.155775075987842, + "grad_norm": 1.9108905792236328, + "learning_rate": 3.617212354312076e-06, + "loss": 0.30567464232444763, + "mean_token_accuracy": 0.8850164413452148, + "num_tokens": 13584366.0, + "step": 1521 + }, + { + "epoch": 1.156534954407295, + "grad_norm": 2.2574243545532227, + "learning_rate": 3.615338334544265e-06, + "loss": 0.4391738772392273, + "mean_token_accuracy": 0.839765727519989, + "num_tokens": 13591816.0, + "step": 1522 + }, + { + "epoch": 1.1572948328267478, + "grad_norm": 2.1235218048095703, + "learning_rate": 3.6134635319890763e-06, + "loss": 0.45043107867240906, + "mean_token_accuracy": 0.8385299444198608, + "num_tokens": 13599736.0, + "step": 1523 + }, + { + "epoch": 1.1580547112462005, + "grad_norm": 2.2274110317230225, + "learning_rate": 3.611587947962319e-06, + "loss": 0.3623226284980774, + "mean_token_accuracy": 0.8724044561386108, + "num_tokens": 13605354.0, + "step": 1524 + }, + { + "epoch": 1.1588145896656534, + "grad_norm": 3.414236545562744, + "learning_rate": 3.6097115837803504e-06, + "loss": 0.30060696601867676, + "mean_token_accuracy": 0.8971061706542969, + "num_tokens": 13608851.0, + "step": 1525 + }, + { + "epoch": 1.1595744680851063, + "grad_norm": 2.496264696121216, + "learning_rate": 3.6078344407600744e-06, + "loss": 0.3567180037498474, + "mean_token_accuracy": 0.8596180081367493, + "num_tokens": 13614339.0, + "step": 1526 + }, + { + "epoch": 1.1603343465045592, + "grad_norm": 2.0191843509674072, + "learning_rate": 3.6059565202189433e-06, + "loss": 0.43206095695495605, + "mean_token_accuracy": 0.8464000821113586, + "num_tokens": 13622395.0, + "step": 1527 + }, + { + "epoch": 1.1610942249240122, + "grad_norm": 1.5475906133651733, + "learning_rate": 3.604077823474954e-06, + "loss": 0.4535648226737976, + "mean_token_accuracy": 0.8391586542129517, + "num_tokens": 13635356.0, + "step": 1528 + }, + { + "epoch": 1.161854103343465, + "grad_norm": 2.1348211765289307, + "learning_rate": 3.6021983518466468e-06, + "loss": 0.2733963429927826, + "mean_token_accuracy": 0.9007417559623718, + "num_tokens": 13640641.0, + "step": 1529 + }, + { + "epoch": 1.162613981762918, + "grad_norm": 2.8452792167663574, + "learning_rate": 3.600318106653108e-06, + "loss": 0.29591235518455505, + "mean_token_accuracy": 0.8934413194656372, + "num_tokens": 13644995.0, + "step": 1530 + }, + { + "epoch": 1.1633738601823709, + "grad_norm": 2.342907190322876, + "learning_rate": 3.5984370892139663e-06, + "loss": 0.4675130248069763, + "mean_token_accuracy": 0.8352028131484985, + "num_tokens": 13652695.0, + "step": 1531 + }, + { + "epoch": 1.1641337386018238, + "grad_norm": 2.3480238914489746, + "learning_rate": 3.5965553008493924e-06, + "loss": 0.3114515542984009, + "mean_token_accuracy": 0.8845353126525879, + "num_tokens": 13658101.0, + "step": 1532 + }, + { + "epoch": 1.1648936170212765, + "grad_norm": 1.8608155250549316, + "learning_rate": 3.594672742880097e-06, + "loss": 0.3864145278930664, + "mean_token_accuracy": 0.867354154586792, + "num_tokens": 13666042.0, + "step": 1533 + }, + { + "epoch": 1.1656534954407296, + "grad_norm": 1.4756088256835938, + "learning_rate": 3.5927894166273324e-06, + "loss": 0.3671600818634033, + "mean_token_accuracy": 0.8695988655090332, + "num_tokens": 13678253.0, + "step": 1534 + }, + { + "epoch": 1.1664133738601823, + "grad_norm": 2.8831355571746826, + "learning_rate": 3.5909053234128893e-06, + "loss": 0.267184317111969, + "mean_token_accuracy": 0.9008115530014038, + "num_tokens": 13681790.0, + "step": 1535 + }, + { + "epoch": 1.1671732522796352, + "grad_norm": 2.1984763145446777, + "learning_rate": 3.5890204645590964e-06, + "loss": 0.4431505799293518, + "mean_token_accuracy": 0.8623673915863037, + "num_tokens": 13688444.0, + "step": 1536 + }, + { + "epoch": 1.167933130699088, + "grad_norm": 1.8271523714065552, + "learning_rate": 3.5871348413888207e-06, + "loss": 0.3861040771007538, + "mean_token_accuracy": 0.8624277114868164, + "num_tokens": 13696872.0, + "step": 1537 + }, + { + "epoch": 1.168693009118541, + "grad_norm": 1.6313756704330444, + "learning_rate": 3.585248455225466e-06, + "loss": 0.3775154948234558, + "mean_token_accuracy": 0.8624461889266968, + "num_tokens": 13706167.0, + "step": 1538 + }, + { + "epoch": 1.169452887537994, + "grad_norm": 2.4377901554107666, + "learning_rate": 3.5833613073929684e-06, + "loss": 0.2308957427740097, + "mean_token_accuracy": 0.920600175857544, + "num_tokens": 13710367.0, + "step": 1539 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 2.2621750831604004, + "learning_rate": 3.5814733992158025e-06, + "loss": 0.33167219161987305, + "mean_token_accuracy": 0.8963261842727661, + "num_tokens": 13716384.0, + "step": 1540 + }, + { + "epoch": 1.1709726443768997, + "grad_norm": 1.3178150653839111, + "learning_rate": 3.579584732018975e-06, + "loss": 0.3276631832122803, + "mean_token_accuracy": 0.8853521347045898, + "num_tokens": 13731031.0, + "step": 1541 + }, + { + "epoch": 1.1717325227963526, + "grad_norm": 2.177750587463379, + "learning_rate": 3.577695307128024e-06, + "loss": 0.48177266120910645, + "mean_token_accuracy": 0.830329418182373, + "num_tokens": 13737925.0, + "step": 1542 + }, + { + "epoch": 1.1724924012158056, + "grad_norm": 2.2268829345703125, + "learning_rate": 3.5758051258690223e-06, + "loss": 0.48843517899513245, + "mean_token_accuracy": 0.8310644030570984, + "num_tokens": 13746039.0, + "step": 1543 + }, + { + "epoch": 1.1732522796352582, + "grad_norm": 1.498701572418213, + "learning_rate": 3.5739141895685708e-06, + "loss": 0.4542962312698364, + "mean_token_accuracy": 0.8500330448150635, + "num_tokens": 13765002.0, + "step": 1544 + }, + { + "epoch": 1.1740121580547112, + "grad_norm": 1.786670446395874, + "learning_rate": 3.5720224995538023e-06, + "loss": 0.27367928624153137, + "mean_token_accuracy": 0.8916142582893372, + "num_tokens": 13774113.0, + "step": 1545 + }, + { + "epoch": 1.174772036474164, + "grad_norm": 2.0311272144317627, + "learning_rate": 3.5701300571523757e-06, + "loss": 0.559987485408783, + "mean_token_accuracy": 0.8266973495483398, + "num_tokens": 13783912.0, + "step": 1546 + }, + { + "epoch": 1.175531914893617, + "grad_norm": 1.8732186555862427, + "learning_rate": 3.5682368636924825e-06, + "loss": 0.5184751152992249, + "mean_token_accuracy": 0.8450918197631836, + "num_tokens": 13792728.0, + "step": 1547 + }, + { + "epoch": 1.1762917933130699, + "grad_norm": 1.4410661458969116, + "learning_rate": 3.566342920502837e-06, + "loss": 0.383536696434021, + "mean_token_accuracy": 0.8672217726707458, + "num_tokens": 13813590.0, + "step": 1548 + }, + { + "epoch": 1.1770516717325228, + "grad_norm": 3.06056547164917, + "learning_rate": 3.564448228912682e-06, + "loss": 0.3941686153411865, + "mean_token_accuracy": 0.8696402311325073, + "num_tokens": 13817704.0, + "step": 1549 + }, + { + "epoch": 1.1778115501519757, + "grad_norm": 1.6150329113006592, + "learning_rate": 3.562552790251785e-06, + "loss": 0.41606605052948, + "mean_token_accuracy": 0.8488572835922241, + "num_tokens": 13831303.0, + "step": 1550 + }, + { + "epoch": 1.1785714285714286, + "grad_norm": 2.1199934482574463, + "learning_rate": 3.5606566058504377e-06, + "loss": 0.3974752426147461, + "mean_token_accuracy": 0.8686345219612122, + "num_tokens": 13837613.0, + "step": 1551 + }, + { + "epoch": 1.1793313069908815, + "grad_norm": 1.5683876276016235, + "learning_rate": 3.558759677039455e-06, + "loss": 0.35225993394851685, + "mean_token_accuracy": 0.8710784316062927, + "num_tokens": 13846779.0, + "step": 1552 + }, + { + "epoch": 1.1800911854103344, + "grad_norm": 1.4644675254821777, + "learning_rate": 3.5568620051501755e-06, + "loss": 0.38400042057037354, + "mean_token_accuracy": 0.8548328876495361, + "num_tokens": 13860713.0, + "step": 1553 + }, + { + "epoch": 1.1808510638297873, + "grad_norm": 1.461491346359253, + "learning_rate": 3.5549635915144578e-06, + "loss": 0.4572640061378479, + "mean_token_accuracy": 0.8506045937538147, + "num_tokens": 13877289.0, + "step": 1554 + }, + { + "epoch": 1.18161094224924, + "grad_norm": 2.6364715099334717, + "learning_rate": 3.553064437464682e-06, + "loss": 0.3954341411590576, + "mean_token_accuracy": 0.8561649322509766, + "num_tokens": 13882064.0, + "step": 1555 + }, + { + "epoch": 1.182370820668693, + "grad_norm": 2.027273654937744, + "learning_rate": 3.551164544333745e-06, + "loss": 0.47625732421875, + "mean_token_accuracy": 0.8349384069442749, + "num_tokens": 13890306.0, + "step": 1556 + }, + { + "epoch": 1.1831306990881458, + "grad_norm": 2.8427743911743164, + "learning_rate": 3.549263913455069e-06, + "loss": 0.4273033142089844, + "mean_token_accuracy": 0.8541387319564819, + "num_tokens": 13894882.0, + "step": 1557 + }, + { + "epoch": 1.1838905775075987, + "grad_norm": 1.6298975944519043, + "learning_rate": 3.5473625461625884e-06, + "loss": 0.4378639757633209, + "mean_token_accuracy": 0.8634963631629944, + "num_tokens": 13906152.0, + "step": 1558 + }, + { + "epoch": 1.1846504559270516, + "grad_norm": 2.4098947048187256, + "learning_rate": 3.5454604437907535e-06, + "loss": 0.47236716747283936, + "mean_token_accuracy": 0.8646864891052246, + "num_tokens": 13911803.0, + "step": 1559 + }, + { + "epoch": 1.1854103343465046, + "grad_norm": 1.5972497463226318, + "learning_rate": 3.543557607674537e-06, + "loss": 0.3001407980918884, + "mean_token_accuracy": 0.8927055597305298, + "num_tokens": 13921304.0, + "step": 1560 + }, + { + "epoch": 1.1861702127659575, + "grad_norm": 2.1140005588531494, + "learning_rate": 3.54165403914942e-06, + "loss": 0.41898271441459656, + "mean_token_accuracy": 0.8542245626449585, + "num_tokens": 13929434.0, + "step": 1561 + }, + { + "epoch": 1.1869300911854104, + "grad_norm": 1.8733803033828735, + "learning_rate": 3.539749739551401e-06, + "loss": 0.35469961166381836, + "mean_token_accuracy": 0.8805290460586548, + "num_tokens": 13937781.0, + "step": 1562 + }, + { + "epoch": 1.1876899696048633, + "grad_norm": 2.2805802822113037, + "learning_rate": 3.53784471021699e-06, + "loss": 0.44496792554855347, + "mean_token_accuracy": 0.8454172611236572, + "num_tokens": 13944394.0, + "step": 1563 + }, + { + "epoch": 1.1884498480243162, + "grad_norm": 0.9728449583053589, + "learning_rate": 3.535938952483211e-06, + "loss": 0.3156968355178833, + "mean_token_accuracy": 0.8739837408065796, + "num_tokens": 13966712.0, + "step": 1564 + }, + { + "epoch": 1.189209726443769, + "grad_norm": 3.025338888168335, + "learning_rate": 3.534032467687597e-06, + "loss": 0.30036938190460205, + "mean_token_accuracy": 0.9058252573013306, + "num_tokens": 13970183.0, + "step": 1565 + }, + { + "epoch": 1.1899696048632218, + "grad_norm": 2.0659425258636475, + "learning_rate": 3.532125257168193e-06, + "loss": 0.30619731545448303, + "mean_token_accuracy": 0.9041587710380554, + "num_tokens": 13976657.0, + "step": 1566 + }, + { + "epoch": 1.1907294832826747, + "grad_norm": 3.2036776542663574, + "learning_rate": 3.5302173222635526e-06, + "loss": 0.4145944118499756, + "mean_token_accuracy": 0.8502328395843506, + "num_tokens": 13981198.0, + "step": 1567 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 1.7767539024353027, + "learning_rate": 3.5283086643127396e-06, + "loss": 0.437128484249115, + "mean_token_accuracy": 0.8965631723403931, + "num_tokens": 13990259.0, + "step": 1568 + }, + { + "epoch": 1.1922492401215805, + "grad_norm": 1.7777384519577026, + "learning_rate": 3.5263992846553203e-06, + "loss": 0.33831220865249634, + "mean_token_accuracy": 0.8734279870986938, + "num_tokens": 13999363.0, + "step": 1569 + }, + { + "epoch": 1.1930091185410334, + "grad_norm": 1.6710708141326904, + "learning_rate": 3.5244891846313733e-06, + "loss": 0.4005590081214905, + "mean_token_accuracy": 0.8820298314094543, + "num_tokens": 14008719.0, + "step": 1570 + }, + { + "epoch": 1.1937689969604863, + "grad_norm": 1.0378777980804443, + "learning_rate": 3.5225783655814798e-06, + "loss": 0.3174915313720703, + "mean_token_accuracy": 0.8894162774085999, + "num_tokens": 14025806.0, + "step": 1571 + }, + { + "epoch": 1.1945288753799392, + "grad_norm": 1.2647521495819092, + "learning_rate": 3.520666828846726e-06, + "loss": 0.4173050820827484, + "mean_token_accuracy": 0.8437265157699585, + "num_tokens": 14046445.0, + "step": 1572 + }, + { + "epoch": 1.1952887537993921, + "grad_norm": 2.8625528812408447, + "learning_rate": 3.518754575768702e-06, + "loss": 0.37182557582855225, + "mean_token_accuracy": 0.8660947680473328, + "num_tokens": 14051197.0, + "step": 1573 + }, + { + "epoch": 1.196048632218845, + "grad_norm": 1.1213171482086182, + "learning_rate": 3.516841607689501e-06, + "loss": 0.332731157541275, + "mean_token_accuracy": 0.8573278784751892, + "num_tokens": 14070817.0, + "step": 1574 + }, + { + "epoch": 1.196808510638298, + "grad_norm": 1.197508692741394, + "learning_rate": 3.5149279259517165e-06, + "loss": 0.34058472514152527, + "mean_token_accuracy": 0.8603571653366089, + "num_tokens": 14085301.0, + "step": 1575 + }, + { + "epoch": 1.1975683890577509, + "grad_norm": 4.019949913024902, + "learning_rate": 3.5130135318984454e-06, + "loss": 0.3094622492790222, + "mean_token_accuracy": 0.8905094861984253, + "num_tokens": 14088107.0, + "step": 1576 + }, + { + "epoch": 1.1983282674772036, + "grad_norm": 2.591181755065918, + "learning_rate": 3.5110984268732827e-06, + "loss": 0.3407078981399536, + "mean_token_accuracy": 0.880385160446167, + "num_tokens": 14092887.0, + "step": 1577 + }, + { + "epoch": 1.1990881458966565, + "grad_norm": 1.3069331645965576, + "learning_rate": 3.509182612220322e-06, + "loss": 0.3761988878250122, + "mean_token_accuracy": 0.862013041973114, + "num_tokens": 14109216.0, + "step": 1578 + }, + { + "epoch": 1.1998480243161094, + "grad_norm": 1.7802022695541382, + "learning_rate": 3.507266089284157e-06, + "loss": 0.3824652135372162, + "mean_token_accuracy": 0.8707721829414368, + "num_tokens": 14119645.0, + "step": 1579 + }, + { + "epoch": 1.2006079027355623, + "grad_norm": 2.7937185764312744, + "learning_rate": 3.5053488594098763e-06, + "loss": 0.33828890323638916, + "mean_token_accuracy": 0.8765541315078735, + "num_tokens": 14124628.0, + "step": 1580 + }, + { + "epoch": 1.2013677811550152, + "grad_norm": 1.892671823501587, + "learning_rate": 3.5034309239430664e-06, + "loss": 0.3476094603538513, + "mean_token_accuracy": 0.9053795337677002, + "num_tokens": 14131756.0, + "step": 1581 + }, + { + "epoch": 1.202127659574468, + "grad_norm": 1.6857695579528809, + "learning_rate": 3.501512284229807e-06, + "loss": 0.5397108793258667, + "mean_token_accuracy": 0.8173421025276184, + "num_tokens": 14143024.0, + "step": 1582 + }, + { + "epoch": 1.202887537993921, + "grad_norm": 2.501737117767334, + "learning_rate": 3.4995929416166756e-06, + "loss": 0.4192458391189575, + "mean_token_accuracy": 0.8558136224746704, + "num_tokens": 14149499.0, + "step": 1583 + }, + { + "epoch": 1.203647416413374, + "grad_norm": 2.0133907794952393, + "learning_rate": 3.4976728974507387e-06, + "loss": 0.4791576564311981, + "mean_token_accuracy": 0.8253597021102905, + "num_tokens": 14158381.0, + "step": 1584 + }, + { + "epoch": 1.2044072948328268, + "grad_norm": 2.984611988067627, + "learning_rate": 3.4957521530795576e-06, + "loss": 0.3040750026702881, + "mean_token_accuracy": 0.8902391791343689, + "num_tokens": 14162419.0, + "step": 1585 + }, + { + "epoch": 1.2051671732522795, + "grad_norm": 1.518591284751892, + "learning_rate": 3.493830709851185e-06, + "loss": 0.35539618134498596, + "mean_token_accuracy": 0.8737183809280396, + "num_tokens": 14173048.0, + "step": 1586 + }, + { + "epoch": 1.2059270516717326, + "grad_norm": 2.628758192062378, + "learning_rate": 3.4919085691141636e-06, + "loss": 0.33340200781822205, + "mean_token_accuracy": 0.8705098628997803, + "num_tokens": 14178255.0, + "step": 1587 + }, + { + "epoch": 1.2066869300911853, + "grad_norm": 2.5565974712371826, + "learning_rate": 3.4899857322175252e-06, + "loss": 0.44939476251602173, + "mean_token_accuracy": 0.8315504193305969, + "num_tokens": 14183808.0, + "step": 1588 + }, + { + "epoch": 1.2074468085106382, + "grad_norm": 1.7521045207977295, + "learning_rate": 3.4880622005107916e-06, + "loss": 0.3168621063232422, + "mean_token_accuracy": 0.8824669122695923, + "num_tokens": 14192186.0, + "step": 1589 + }, + { + "epoch": 1.2082066869300911, + "grad_norm": 1.9816104173660278, + "learning_rate": 3.486137975343971e-06, + "loss": 0.3892582058906555, + "mean_token_accuracy": 0.8524188995361328, + "num_tokens": 14200512.0, + "step": 1590 + }, + { + "epoch": 1.208966565349544, + "grad_norm": 1.459800124168396, + "learning_rate": 3.484213058067559e-06, + "loss": 0.45930033922195435, + "mean_token_accuracy": 0.8408471345901489, + "num_tokens": 14215232.0, + "step": 1591 + }, + { + "epoch": 1.209726443768997, + "grad_norm": 2.015493154525757, + "learning_rate": 3.482287450032536e-06, + "loss": 0.5514016151428223, + "mean_token_accuracy": 0.8456779718399048, + "num_tokens": 14225402.0, + "step": 1592 + }, + { + "epoch": 1.2104863221884499, + "grad_norm": 3.4511911869049072, + "learning_rate": 3.4803611525903687e-06, + "loss": 0.4772771894931793, + "mean_token_accuracy": 0.8558698892593384, + "num_tokens": 14229038.0, + "step": 1593 + }, + { + "epoch": 1.2112462006079028, + "grad_norm": 2.2247982025146484, + "learning_rate": 3.4784341670930067e-06, + "loss": 0.4042825996875763, + "mean_token_accuracy": 0.8635870218276978, + "num_tokens": 14237057.0, + "step": 1594 + }, + { + "epoch": 1.2120060790273557, + "grad_norm": 2.0534820556640625, + "learning_rate": 3.4765064948928813e-06, + "loss": 0.34057414531707764, + "mean_token_accuracy": 0.8800770044326782, + "num_tokens": 14243013.0, + "step": 1595 + }, + { + "epoch": 1.2127659574468086, + "grad_norm": 2.594703197479248, + "learning_rate": 3.474578137342909e-06, + "loss": 0.4997410774230957, + "mean_token_accuracy": 0.8302106261253357, + "num_tokens": 14251210.0, + "step": 1596 + }, + { + "epoch": 1.2135258358662613, + "grad_norm": 2.517833948135376, + "learning_rate": 3.4726490957964836e-06, + "loss": 0.3630390465259552, + "mean_token_accuracy": 0.8679884672164917, + "num_tokens": 14255893.0, + "step": 1597 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 1.5177065134048462, + "learning_rate": 3.4707193716074816e-06, + "loss": 0.36218544840812683, + "mean_token_accuracy": 0.879178524017334, + "num_tokens": 14268143.0, + "step": 1598 + }, + { + "epoch": 1.215045592705167, + "grad_norm": 2.215291738510132, + "learning_rate": 3.4687889661302577e-06, + "loss": 0.4166645407676697, + "mean_token_accuracy": 0.8495793342590332, + "num_tokens": 14276794.0, + "step": 1599 + }, + { + "epoch": 1.21580547112462, + "grad_norm": 1.534294843673706, + "learning_rate": 3.466857880719645e-06, + "loss": 0.2635883092880249, + "mean_token_accuracy": 0.8971712589263916, + "num_tokens": 14287000.0, + "step": 1600 + }, + { + "epoch": 1.216565349544073, + "grad_norm": 1.2338658571243286, + "learning_rate": 3.464926116730953e-06, + "loss": 0.339110404253006, + "mean_token_accuracy": 0.895592987537384, + "num_tokens": 14303217.0, + "step": 1601 + }, + { + "epoch": 1.2173252279635258, + "grad_norm": 1.8717178106307983, + "learning_rate": 3.462993675519968e-06, + "loss": 0.41204726696014404, + "mean_token_accuracy": 0.8560728430747986, + "num_tokens": 14311372.0, + "step": 1602 + }, + { + "epoch": 1.2180851063829787, + "grad_norm": 2.844160795211792, + "learning_rate": 3.4610605584429526e-06, + "loss": 0.4129520058631897, + "mean_token_accuracy": 0.8555002212524414, + "num_tokens": 14316244.0, + "step": 1603 + }, + { + "epoch": 1.2188449848024316, + "grad_norm": 1.099926471710205, + "learning_rate": 3.4591267668566412e-06, + "loss": 0.35783132910728455, + "mean_token_accuracy": 0.8693175315856934, + "num_tokens": 14338414.0, + "step": 1604 + }, + { + "epoch": 1.2196048632218845, + "grad_norm": 1.6448384523391724, + "learning_rate": 3.457192302118244e-06, + "loss": 0.42060258984565735, + "mean_token_accuracy": 0.8557323217391968, + "num_tokens": 14349143.0, + "step": 1605 + }, + { + "epoch": 1.2203647416413375, + "grad_norm": 2.097529888153076, + "learning_rate": 3.455257165585444e-06, + "loss": 0.5227499008178711, + "mean_token_accuracy": 0.828961968421936, + "num_tokens": 14360032.0, + "step": 1606 + }, + { + "epoch": 1.2211246200607904, + "grad_norm": 1.602988600730896, + "learning_rate": 3.453321358616393e-06, + "loss": 0.3537187874317169, + "mean_token_accuracy": 0.8776708841323853, + "num_tokens": 14370005.0, + "step": 1607 + }, + { + "epoch": 1.221884498480243, + "grad_norm": 2.358971357345581, + "learning_rate": 3.4513848825697145e-06, + "loss": 0.3448919653892517, + "mean_token_accuracy": 0.8887944221496582, + "num_tokens": 14375718.0, + "step": 1608 + }, + { + "epoch": 1.222644376899696, + "grad_norm": 1.72306227684021, + "learning_rate": 3.4494477388045035e-06, + "loss": 0.36985084414482117, + "mean_token_accuracy": 0.859595537185669, + "num_tokens": 14385016.0, + "step": 1609 + }, + { + "epoch": 1.2234042553191489, + "grad_norm": 1.5494085550308228, + "learning_rate": 3.4475099286803204e-06, + "loss": 0.49003708362579346, + "mean_token_accuracy": 0.8701964616775513, + "num_tokens": 14399277.0, + "step": 1610 + }, + { + "epoch": 1.2241641337386018, + "grad_norm": 2.6874046325683594, + "learning_rate": 3.445571453557196e-06, + "loss": 0.3424490690231323, + "mean_token_accuracy": 0.8835943937301636, + "num_tokens": 14404182.0, + "step": 1611 + }, + { + "epoch": 1.2249240121580547, + "grad_norm": 2.2163190841674805, + "learning_rate": 3.443632314795627e-06, + "loss": 0.40944457054138184, + "mean_token_accuracy": 0.8649888038635254, + "num_tokens": 14410158.0, + "step": 1612 + }, + { + "epoch": 1.2256838905775076, + "grad_norm": 2.7961158752441406, + "learning_rate": 3.4416925137565756e-06, + "loss": 0.17890746891498566, + "mean_token_accuracy": 0.9439430832862854, + "num_tokens": 14413285.0, + "step": 1613 + }, + { + "epoch": 1.2264437689969605, + "grad_norm": 1.421451210975647, + "learning_rate": 3.439752051801467e-06, + "loss": 0.33948683738708496, + "mean_token_accuracy": 0.8754585981369019, + "num_tokens": 14424674.0, + "step": 1614 + }, + { + "epoch": 1.2272036474164134, + "grad_norm": 2.105196237564087, + "learning_rate": 3.4378109302921946e-06, + "loss": 0.40009379386901855, + "mean_token_accuracy": 0.8600341081619263, + "num_tokens": 14432400.0, + "step": 1615 + }, + { + "epoch": 1.2279635258358663, + "grad_norm": 2.004122734069824, + "learning_rate": 3.4358691505911105e-06, + "loss": 0.46013444662094116, + "mean_token_accuracy": 0.8400925993919373, + "num_tokens": 14440741.0, + "step": 1616 + }, + { + "epoch": 1.2287234042553192, + "grad_norm": 1.8407535552978516, + "learning_rate": 3.4339267140610317e-06, + "loss": 0.38828906416893005, + "mean_token_accuracy": 0.8582802414894104, + "num_tokens": 14448698.0, + "step": 1617 + }, + { + "epoch": 1.2294832826747721, + "grad_norm": 2.4285924434661865, + "learning_rate": 3.4319836220652334e-06, + "loss": 0.3109283447265625, + "mean_token_accuracy": 0.8888344764709473, + "num_tokens": 14453674.0, + "step": 1618 + }, + { + "epoch": 1.2302431610942248, + "grad_norm": 1.6322550773620605, + "learning_rate": 3.430039875967454e-06, + "loss": 0.5222204327583313, + "mean_token_accuracy": 0.825019121170044, + "num_tokens": 14465736.0, + "step": 1619 + }, + { + "epoch": 1.2310030395136777, + "grad_norm": 2.307573080062866, + "learning_rate": 3.428095477131888e-06, + "loss": 0.29477375745773315, + "mean_token_accuracy": 0.8899064660072327, + "num_tokens": 14471266.0, + "step": 1620 + }, + { + "epoch": 1.2317629179331306, + "grad_norm": 1.8044531345367432, + "learning_rate": 3.4261504269231904e-06, + "loss": 0.4883342981338501, + "mean_token_accuracy": 0.8310165405273438, + "num_tokens": 14481679.0, + "step": 1621 + }, + { + "epoch": 1.2325227963525835, + "grad_norm": 2.7585411071777344, + "learning_rate": 3.4242047267064714e-06, + "loss": 0.45369645953178406, + "mean_token_accuracy": 0.8432134985923767, + "num_tokens": 14487299.0, + "step": 1622 + }, + { + "epoch": 1.2332826747720365, + "grad_norm": 2.687490701675415, + "learning_rate": 3.4222583778472997e-06, + "loss": 0.5627540349960327, + "mean_token_accuracy": 0.8186438083648682, + "num_tokens": 14494254.0, + "step": 1623 + }, + { + "epoch": 1.2340425531914894, + "grad_norm": 2.622443199157715, + "learning_rate": 3.4203113817116955e-06, + "loss": 0.28697147965431213, + "mean_token_accuracy": 0.8861737847328186, + "num_tokens": 14498632.0, + "step": 1624 + }, + { + "epoch": 1.2348024316109423, + "grad_norm": 2.6943359375, + "learning_rate": 3.4183637396661372e-06, + "loss": 0.25273287296295166, + "mean_token_accuracy": 0.9104914665222168, + "num_tokens": 14502797.0, + "step": 1625 + }, + { + "epoch": 1.2355623100303952, + "grad_norm": 2.428189992904663, + "learning_rate": 3.4164154530775552e-06, + "loss": 0.4213451147079468, + "mean_token_accuracy": 0.851524293422699, + "num_tokens": 14508503.0, + "step": 1626 + }, + { + "epoch": 1.236322188449848, + "grad_norm": 2.1722824573516846, + "learning_rate": 3.4144665233133318e-06, + "loss": 0.35238856077194214, + "mean_token_accuracy": 0.8730837106704712, + "num_tokens": 14516126.0, + "step": 1627 + }, + { + "epoch": 1.237082066869301, + "grad_norm": 2.291365146636963, + "learning_rate": 3.4125169517413005e-06, + "loss": 0.43963465094566345, + "mean_token_accuracy": 0.8525444865226746, + "num_tokens": 14522507.0, + "step": 1628 + }, + { + "epoch": 1.237841945288754, + "grad_norm": 1.6181648969650269, + "learning_rate": 3.410566739729746e-06, + "loss": 0.2799680233001709, + "mean_token_accuracy": 0.8915654420852661, + "num_tokens": 14531025.0, + "step": 1629 + }, + { + "epoch": 1.2386018237082066, + "grad_norm": 1.4039218425750732, + "learning_rate": 3.408615888647402e-06, + "loss": 0.29756587743759155, + "mean_token_accuracy": 0.8951715230941772, + "num_tokens": 14543770.0, + "step": 1630 + }, + { + "epoch": 1.2393617021276595, + "grad_norm": 2.148325204849243, + "learning_rate": 3.4066643998634506e-06, + "loss": 0.3983418345451355, + "mean_token_accuracy": 0.8635951280593872, + "num_tokens": 14550896.0, + "step": 1631 + }, + { + "epoch": 1.2401215805471124, + "grad_norm": 1.5225859880447388, + "learning_rate": 3.4047122747475227e-06, + "loss": 0.3247569799423218, + "mean_token_accuracy": 0.8727027177810669, + "num_tokens": 14562181.0, + "step": 1632 + }, + { + "epoch": 1.2408814589665653, + "grad_norm": 3.99835467338562, + "learning_rate": 3.402759514669694e-06, + "loss": 0.4317352771759033, + "mean_token_accuracy": 0.8488142490386963, + "num_tokens": 14565521.0, + "step": 1633 + }, + { + "epoch": 1.2416413373860182, + "grad_norm": 1.7306902408599854, + "learning_rate": 3.4008061210004872e-06, + "loss": 0.389854371547699, + "mean_token_accuracy": 0.8553084135055542, + "num_tokens": 14574633.0, + "step": 1634 + }, + { + "epoch": 1.2424012158054711, + "grad_norm": 2.3614673614501953, + "learning_rate": 3.3988520951108683e-06, + "loss": 0.3150152564048767, + "mean_token_accuracy": 0.8865959644317627, + "num_tokens": 14580240.0, + "step": 1635 + }, + { + "epoch": 1.243161094224924, + "grad_norm": 1.5625747442245483, + "learning_rate": 3.3968974383722497e-06, + "loss": 0.43160033226013184, + "mean_token_accuracy": 0.840155839920044, + "num_tokens": 14594255.0, + "step": 1636 + }, + { + "epoch": 1.243920972644377, + "grad_norm": 1.871620535850525, + "learning_rate": 3.3949421521564825e-06, + "loss": 0.49550193548202515, + "mean_token_accuracy": 0.8315126299858093, + "num_tokens": 14605416.0, + "step": 1637 + }, + { + "epoch": 1.2446808510638299, + "grad_norm": 2.111304759979248, + "learning_rate": 3.392986237835863e-06, + "loss": 0.2794899046421051, + "mean_token_accuracy": 0.9049773216247559, + "num_tokens": 14611711.0, + "step": 1638 + }, + { + "epoch": 1.2454407294832828, + "grad_norm": 3.7479894161224365, + "learning_rate": 3.391029696783127e-06, + "loss": 0.469397634267807, + "mean_token_accuracy": 0.8352956771850586, + "num_tokens": 14615536.0, + "step": 1639 + }, + { + "epoch": 1.2462006079027357, + "grad_norm": 3.277726650238037, + "learning_rate": 3.389072530371451e-06, + "loss": 0.35431790351867676, + "mean_token_accuracy": 0.8822286128997803, + "num_tokens": 14619390.0, + "step": 1640 + }, + { + "epoch": 1.2469604863221884, + "grad_norm": 1.9583072662353516, + "learning_rate": 3.3871147399744482e-06, + "loss": 0.3708694577217102, + "mean_token_accuracy": 0.8720351457595825, + "num_tokens": 14626573.0, + "step": 1641 + }, + { + "epoch": 1.2477203647416413, + "grad_norm": 1.8734042644500732, + "learning_rate": 3.385156326966173e-06, + "loss": 0.48163774609565735, + "mean_token_accuracy": 0.8479621410369873, + "num_tokens": 14636382.0, + "step": 1642 + }, + { + "epoch": 1.2484802431610942, + "grad_norm": 2.0085532665252686, + "learning_rate": 3.383197292721114e-06, + "loss": 0.4893198311328888, + "mean_token_accuracy": 0.838238000869751, + "num_tokens": 14645083.0, + "step": 1643 + }, + { + "epoch": 1.249240121580547, + "grad_norm": 2.0874593257904053, + "learning_rate": 3.3812376386141966e-06, + "loss": 0.4610505700111389, + "mean_token_accuracy": 0.8441368341445923, + "num_tokens": 14654048.0, + "step": 1644 + }, + { + "epoch": 1.25, + "grad_norm": 1.6887420415878296, + "learning_rate": 3.379277366020782e-06, + "loss": 0.3628596067428589, + "mean_token_accuracy": 0.8838590383529663, + "num_tokens": 14662317.0, + "step": 1645 + }, + { + "epoch": 1.250759878419453, + "grad_norm": 2.389002561569214, + "learning_rate": 3.3773164763166653e-06, + "loss": 0.21903495490550995, + "mean_token_accuracy": 0.9249413013458252, + "num_tokens": 14666394.0, + "step": 1646 + }, + { + "epoch": 1.2515197568389058, + "grad_norm": 1.7091087102890015, + "learning_rate": 3.3753549708780736e-06, + "loss": 0.37802332639694214, + "mean_token_accuracy": 0.8644627332687378, + "num_tokens": 14676214.0, + "step": 1647 + }, + { + "epoch": 1.2522796352583587, + "grad_norm": 2.5717999935150146, + "learning_rate": 3.3733928510816677e-06, + "loss": 0.4236462116241455, + "mean_token_accuracy": 0.8519910573959351, + "num_tokens": 14681681.0, + "step": 1648 + }, + { + "epoch": 1.2530395136778116, + "grad_norm": 1.958856463432312, + "learning_rate": 3.3714301183045382e-06, + "loss": 0.3923419415950775, + "mean_token_accuracy": 0.8720202445983887, + "num_tokens": 14690419.0, + "step": 1649 + }, + { + "epoch": 1.2537993920972643, + "grad_norm": 1.5900038480758667, + "learning_rate": 3.369466773924207e-06, + "loss": 0.4182325601577759, + "mean_token_accuracy": 0.8515387177467346, + "num_tokens": 14699790.0, + "step": 1650 + }, + { + "epoch": 1.2545592705167175, + "grad_norm": 1.260547161102295, + "learning_rate": 3.3675028193186243e-06, + "loss": 0.3915718197822571, + "mean_token_accuracy": 0.8536830544471741, + "num_tokens": 14717502.0, + "step": 1651 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 1.8152283430099487, + "learning_rate": 3.365538255866169e-06, + "loss": 0.424524188041687, + "mean_token_accuracy": 0.8434420824050903, + "num_tokens": 14726591.0, + "step": 1652 + }, + { + "epoch": 1.256079027355623, + "grad_norm": 1.3357285261154175, + "learning_rate": 3.3635730849456484e-06, + "loss": 0.2949739396572113, + "mean_token_accuracy": 0.8868321180343628, + "num_tokens": 14739911.0, + "step": 1653 + }, + { + "epoch": 1.256838905775076, + "grad_norm": 1.1770358085632324, + "learning_rate": 3.3616073079362925e-06, + "loss": 0.29939576983451843, + "mean_token_accuracy": 0.8923654556274414, + "num_tokens": 14755521.0, + "step": 1654 + }, + { + "epoch": 1.2575987841945289, + "grad_norm": 2.059162139892578, + "learning_rate": 3.3596409262177633e-06, + "loss": 0.4562555253505707, + "mean_token_accuracy": 0.8585271239280701, + "num_tokens": 14764173.0, + "step": 1655 + }, + { + "epoch": 1.2583586626139818, + "grad_norm": 1.430752158164978, + "learning_rate": 3.357673941170139e-06, + "loss": 0.35301265120506287, + "mean_token_accuracy": 0.8920517563819885, + "num_tokens": 14775596.0, + "step": 1656 + }, + { + "epoch": 1.2591185410334347, + "grad_norm": 1.6066302061080933, + "learning_rate": 3.3557063541739283e-06, + "loss": 0.41129636764526367, + "mean_token_accuracy": 0.8512256145477295, + "num_tokens": 14786289.0, + "step": 1657 + }, + { + "epoch": 1.2598784194528876, + "grad_norm": 1.5471590757369995, + "learning_rate": 3.353738166610058e-06, + "loss": 0.3935067057609558, + "mean_token_accuracy": 0.8514131903648376, + "num_tokens": 14798672.0, + "step": 1658 + }, + { + "epoch": 1.2606382978723405, + "grad_norm": 1.3455181121826172, + "learning_rate": 3.35176937985988e-06, + "loss": 0.3486790657043457, + "mean_token_accuracy": 0.8644362688064575, + "num_tokens": 14811603.0, + "step": 1659 + }, + { + "epoch": 1.2613981762917934, + "grad_norm": 1.891432762145996, + "learning_rate": 3.349799995305162e-06, + "loss": 0.3325638175010681, + "mean_token_accuracy": 0.8844645023345947, + "num_tokens": 14819256.0, + "step": 1660 + }, + { + "epoch": 1.262158054711246, + "grad_norm": 2.600614309310913, + "learning_rate": 3.3478300143280946e-06, + "loss": 0.30310919880867004, + "mean_token_accuracy": 0.9103429317474365, + "num_tokens": 14823706.0, + "step": 1661 + }, + { + "epoch": 1.2629179331306992, + "grad_norm": 3.8636202812194824, + "learning_rate": 3.3458594383112868e-06, + "loss": 0.28377676010131836, + "mean_token_accuracy": 0.9047091007232666, + "num_tokens": 14826688.0, + "step": 1662 + }, + { + "epoch": 1.263677811550152, + "grad_norm": 2.3100268840789795, + "learning_rate": 3.343888268637765e-06, + "loss": 0.4723394513130188, + "mean_token_accuracy": 0.8306777477264404, + "num_tokens": 14835471.0, + "step": 1663 + }, + { + "epoch": 1.2644376899696048, + "grad_norm": 1.7582160234451294, + "learning_rate": 3.341916506690971e-06, + "loss": 0.48168784379959106, + "mean_token_accuracy": 0.8281306028366089, + "num_tokens": 14846513.0, + "step": 1664 + }, + { + "epoch": 1.2651975683890577, + "grad_norm": 2.166055917739868, + "learning_rate": 3.3399441538547638e-06, + "loss": 0.4626024067401886, + "mean_token_accuracy": 0.8377980589866638, + "num_tokens": 14853408.0, + "step": 1665 + }, + { + "epoch": 1.2659574468085106, + "grad_norm": 2.23038911819458, + "learning_rate": 3.337971211513417e-06, + "loss": 0.38434159755706787, + "mean_token_accuracy": 0.8708412647247314, + "num_tokens": 14859919.0, + "step": 1666 + }, + { + "epoch": 1.2667173252279635, + "grad_norm": 2.092505693435669, + "learning_rate": 3.3359976810516164e-06, + "loss": 0.35072219371795654, + "mean_token_accuracy": 0.8761640191078186, + "num_tokens": 14865624.0, + "step": 1667 + }, + { + "epoch": 1.2674772036474165, + "grad_norm": 1.8255130052566528, + "learning_rate": 3.3340235638544633e-06, + "loss": 0.4404270648956299, + "mean_token_accuracy": 0.836356520652771, + "num_tokens": 14874181.0, + "step": 1668 + }, + { + "epoch": 1.2682370820668694, + "grad_norm": 1.9889036417007446, + "learning_rate": 3.332048861307467e-06, + "loss": 0.4199368357658386, + "mean_token_accuracy": 0.8508217334747314, + "num_tokens": 14882275.0, + "step": 1669 + }, + { + "epoch": 1.2689969604863223, + "grad_norm": 4.050281047821045, + "learning_rate": 3.330073574796551e-06, + "loss": 0.4271625280380249, + "mean_token_accuracy": 0.8471108675003052, + "num_tokens": 14893633.0, + "step": 1670 + }, + { + "epoch": 1.2697568389057752, + "grad_norm": 1.998838186264038, + "learning_rate": 3.328097705708047e-06, + "loss": 0.34743767976760864, + "mean_token_accuracy": 0.8771528005599976, + "num_tokens": 14899859.0, + "step": 1671 + }, + { + "epoch": 1.2705167173252279, + "grad_norm": 1.7989062070846558, + "learning_rate": 3.3261212554286977e-06, + "loss": 0.5267184376716614, + "mean_token_accuracy": 0.8323302268981934, + "num_tokens": 14911131.0, + "step": 1672 + }, + { + "epoch": 1.2712765957446808, + "grad_norm": 1.312070369720459, + "learning_rate": 3.324144225345649e-06, + "loss": 0.4675425887107849, + "mean_token_accuracy": 0.8157106637954712, + "num_tokens": 14928955.0, + "step": 1673 + }, + { + "epoch": 1.2720364741641337, + "grad_norm": 2.0547919273376465, + "learning_rate": 3.3221666168464584e-06, + "loss": 0.33704331517219543, + "mean_token_accuracy": 0.8621441125869751, + "num_tokens": 14935536.0, + "step": 1674 + }, + { + "epoch": 1.2727963525835866, + "grad_norm": 2.810413122177124, + "learning_rate": 3.320188431319088e-06, + "loss": 0.4007563292980194, + "mean_token_accuracy": 0.8649672269821167, + "num_tokens": 14940219.0, + "step": 1675 + }, + { + "epoch": 1.2735562310030395, + "grad_norm": 1.3516674041748047, + "learning_rate": 3.318209670151904e-06, + "loss": 0.3457040786743164, + "mean_token_accuracy": 0.8698287010192871, + "num_tokens": 14952904.0, + "step": 1676 + }, + { + "epoch": 1.2743161094224924, + "grad_norm": 2.440643310546875, + "learning_rate": 3.3162303347336765e-06, + "loss": 0.5195086002349854, + "mean_token_accuracy": 0.8348199129104614, + "num_tokens": 14958623.0, + "step": 1677 + }, + { + "epoch": 1.2750759878419453, + "grad_norm": 1.3264343738555908, + "learning_rate": 3.3142504264535808e-06, + "loss": 0.2990425229072571, + "mean_token_accuracy": 0.8961933851242065, + "num_tokens": 14971494.0, + "step": 1678 + }, + { + "epoch": 1.2758358662613982, + "grad_norm": 1.3106894493103027, + "learning_rate": 3.3122699467011913e-06, + "loss": 0.291853666305542, + "mean_token_accuracy": 0.893449068069458, + "num_tokens": 14985239.0, + "step": 1679 + }, + { + "epoch": 1.2765957446808511, + "grad_norm": 2.5387396812438965, + "learning_rate": 3.3102888968664857e-06, + "loss": 0.4336916208267212, + "mean_token_accuracy": 0.8447890877723694, + "num_tokens": 14991453.0, + "step": 1680 + }, + { + "epoch": 1.2773556231003038, + "grad_norm": 2.7052135467529297, + "learning_rate": 3.308307278339842e-06, + "loss": 0.3279378116130829, + "mean_token_accuracy": 0.8935879468917847, + "num_tokens": 14995428.0, + "step": 1681 + }, + { + "epoch": 1.278115501519757, + "grad_norm": 1.6251261234283447, + "learning_rate": 3.306325092512034e-06, + "loss": 0.32066458463668823, + "mean_token_accuracy": 0.8909799456596375, + "num_tokens": 15004841.0, + "step": 1682 + }, + { + "epoch": 1.2788753799392096, + "grad_norm": 2.3014605045318604, + "learning_rate": 3.3043423407742374e-06, + "loss": 0.3523373603820801, + "mean_token_accuracy": 0.8810735940933228, + "num_tokens": 15010742.0, + "step": 1683 + }, + { + "epoch": 1.2796352583586625, + "grad_norm": 2.9563019275665283, + "learning_rate": 3.3023590245180237e-06, + "loss": 0.39715707302093506, + "mean_token_accuracy": 0.8779881000518799, + "num_tokens": 15015357.0, + "step": 1684 + }, + { + "epoch": 1.2803951367781155, + "grad_norm": 1.5787957906723022, + "learning_rate": 3.300375145135361e-06, + "loss": 0.44630166888237, + "mean_token_accuracy": 0.8400174975395203, + "num_tokens": 15031360.0, + "step": 1685 + }, + { + "epoch": 1.2811550151975684, + "grad_norm": 1.6753438711166382, + "learning_rate": 3.2983907040186112e-06, + "loss": 0.3235800862312317, + "mean_token_accuracy": 0.8938044309616089, + "num_tokens": 15040276.0, + "step": 1686 + }, + { + "epoch": 1.2819148936170213, + "grad_norm": 1.7331148386001587, + "learning_rate": 3.296405702560532e-06, + "loss": 0.39061424136161804, + "mean_token_accuracy": 0.8599754571914673, + "num_tokens": 15049725.0, + "step": 1687 + }, + { + "epoch": 1.2826747720364742, + "grad_norm": 2.2029430866241455, + "learning_rate": 3.294420142154274e-06, + "loss": 0.43598297238349915, + "mean_token_accuracy": 0.8663698434829712, + "num_tokens": 15058182.0, + "step": 1688 + }, + { + "epoch": 1.283434650455927, + "grad_norm": 2.943964958190918, + "learning_rate": 3.29243402419338e-06, + "loss": 0.405210942029953, + "mean_token_accuracy": 0.854996919631958, + "num_tokens": 15062920.0, + "step": 1689 + }, + { + "epoch": 1.28419452887538, + "grad_norm": 1.9343379735946655, + "learning_rate": 3.2904473500717826e-06, + "loss": 0.35011449456214905, + "mean_token_accuracy": 0.8745867013931274, + "num_tokens": 15070298.0, + "step": 1690 + }, + { + "epoch": 1.284954407294833, + "grad_norm": 2.559859037399292, + "learning_rate": 3.2884601211838087e-06, + "loss": 0.38816407322883606, + "mean_token_accuracy": 0.854763388633728, + "num_tokens": 15075667.0, + "step": 1691 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.4357839822769165, + "learning_rate": 3.2864723389241697e-06, + "loss": 0.4512745141983032, + "mean_token_accuracy": 0.8398592472076416, + "num_tokens": 15090291.0, + "step": 1692 + }, + { + "epoch": 1.2864741641337387, + "grad_norm": 1.7643728256225586, + "learning_rate": 3.284484004687969e-06, + "loss": 0.3536742627620697, + "mean_token_accuracy": 0.8726381063461304, + "num_tokens": 15099325.0, + "step": 1693 + }, + { + "epoch": 1.2872340425531914, + "grad_norm": 1.853173017501831, + "learning_rate": 3.2824951198706958e-06, + "loss": 0.36579740047454834, + "mean_token_accuracy": 0.8988048434257507, + "num_tokens": 15107090.0, + "step": 1694 + }, + { + "epoch": 1.2879939209726443, + "grad_norm": 1.6526862382888794, + "learning_rate": 3.280505685868226e-06, + "loss": 0.3853636682033539, + "mean_token_accuracy": 0.8743607997894287, + "num_tokens": 15117818.0, + "step": 1695 + }, + { + "epoch": 1.2887537993920972, + "grad_norm": 2.790398597717285, + "learning_rate": 3.278515704076821e-06, + "loss": 0.2707311511039734, + "mean_token_accuracy": 0.9034668803215027, + "num_tokens": 15121641.0, + "step": 1696 + }, + { + "epoch": 1.2895136778115501, + "grad_norm": 1.69557523727417, + "learning_rate": 3.276525175893126e-06, + "loss": 0.3707970082759857, + "mean_token_accuracy": 0.8617855906486511, + "num_tokens": 15130414.0, + "step": 1697 + }, + { + "epoch": 1.290273556231003, + "grad_norm": 1.1360478401184082, + "learning_rate": 3.274534102714172e-06, + "loss": 0.3368082344532013, + "mean_token_accuracy": 0.8781654834747314, + "num_tokens": 15148307.0, + "step": 1698 + }, + { + "epoch": 1.291033434650456, + "grad_norm": 1.5894653797149658, + "learning_rate": 3.272542485937369e-06, + "loss": 0.3870658278465271, + "mean_token_accuracy": 0.8830926418304443, + "num_tokens": 15161841.0, + "step": 1699 + }, + { + "epoch": 1.2917933130699089, + "grad_norm": 2.3735709190368652, + "learning_rate": 3.270550326960511e-06, + "loss": 0.3873991370201111, + "mean_token_accuracy": 0.8729057908058167, + "num_tokens": 15167733.0, + "step": 1700 + }, + { + "epoch": 1.2925531914893618, + "grad_norm": 1.3739598989486694, + "learning_rate": 3.268557627181772e-06, + "loss": 0.30831626057624817, + "mean_token_accuracy": 0.8695719242095947, + "num_tokens": 15180861.0, + "step": 1701 + }, + { + "epoch": 1.2933130699088147, + "grad_norm": 1.7526969909667969, + "learning_rate": 3.2665643879997054e-06, + "loss": 0.4716024398803711, + "mean_token_accuracy": 0.8303275108337402, + "num_tokens": 15191642.0, + "step": 1702 + }, + { + "epoch": 1.2940729483282674, + "grad_norm": 2.7866084575653076, + "learning_rate": 3.2645706108132426e-06, + "loss": 0.33337634801864624, + "mean_token_accuracy": 0.8790726065635681, + "num_tokens": 15196038.0, + "step": 1703 + }, + { + "epoch": 1.2948328267477205, + "grad_norm": 2.319765090942383, + "learning_rate": 3.2625762970216944e-06, + "loss": 0.3999716639518738, + "mean_token_accuracy": 0.8693568706512451, + "num_tokens": 15202075.0, + "step": 1704 + }, + { + "epoch": 1.2955927051671732, + "grad_norm": 3.18292498588562, + "learning_rate": 3.2605814480247454e-06, + "loss": 0.4579541087150574, + "mean_token_accuracy": 0.8516187071800232, + "num_tokens": 15206886.0, + "step": 1705 + }, + { + "epoch": 1.296352583586626, + "grad_norm": 2.1816933155059814, + "learning_rate": 3.258586065222459e-06, + "loss": 0.5198885202407837, + "mean_token_accuracy": 0.8170592784881592, + "num_tokens": 15214088.0, + "step": 1706 + }, + { + "epoch": 1.297112462006079, + "grad_norm": 1.9076340198516846, + "learning_rate": 3.2565901500152702e-06, + "loss": 0.49752360582351685, + "mean_token_accuracy": 0.8681992292404175, + "num_tokens": 15226046.0, + "step": 1707 + }, + { + "epoch": 1.297872340425532, + "grad_norm": 2.0223331451416016, + "learning_rate": 3.2545937038039904e-06, + "loss": 0.4515793025493622, + "mean_token_accuracy": 0.8429619073867798, + "num_tokens": 15234993.0, + "step": 1708 + }, + { + "epoch": 1.2986322188449848, + "grad_norm": 2.5089669227600098, + "learning_rate": 3.2525967279898017e-06, + "loss": 0.43628376722335815, + "mean_token_accuracy": 0.8493682146072388, + "num_tokens": 15240575.0, + "step": 1709 + }, + { + "epoch": 1.2993920972644377, + "grad_norm": 2.8347091674804688, + "learning_rate": 3.2505992239742582e-06, + "loss": 0.25112441182136536, + "mean_token_accuracy": 0.908825159072876, + "num_tokens": 15244085.0, + "step": 1710 + }, + { + "epoch": 1.3001519756838906, + "grad_norm": 2.3157572746276855, + "learning_rate": 3.2486011931592863e-06, + "loss": 0.482818067073822, + "mean_token_accuracy": 0.8305923938751221, + "num_tokens": 15250377.0, + "step": 1711 + }, + { + "epoch": 1.3009118541033435, + "grad_norm": 3.169052839279175, + "learning_rate": 3.2466026369471804e-06, + "loss": 0.3493242561817169, + "mean_token_accuracy": 0.86913001537323, + "num_tokens": 15255041.0, + "step": 1712 + }, + { + "epoch": 1.3016717325227964, + "grad_norm": 1.4475083351135254, + "learning_rate": 3.2446035567406033e-06, + "loss": 0.4177290201187134, + "mean_token_accuracy": 0.8497589826583862, + "num_tokens": 15266946.0, + "step": 1713 + }, + { + "epoch": 1.3024316109422491, + "grad_norm": 1.6473008394241333, + "learning_rate": 3.2426039539425875e-06, + "loss": 0.5272886753082275, + "mean_token_accuracy": 0.8440133333206177, + "num_tokens": 15279263.0, + "step": 1714 + }, + { + "epoch": 1.3031914893617023, + "grad_norm": 2.3996543884277344, + "learning_rate": 3.240603829956531e-06, + "loss": 0.4272066652774811, + "mean_token_accuracy": 0.8495640754699707, + "num_tokens": 15285213.0, + "step": 1715 + }, + { + "epoch": 1.303951367781155, + "grad_norm": 1.63034987449646, + "learning_rate": 3.238603186186198e-06, + "loss": 0.4034635126590729, + "mean_token_accuracy": 0.8638584613800049, + "num_tokens": 15295974.0, + "step": 1716 + }, + { + "epoch": 1.3047112462006079, + "grad_norm": 2.153608798980713, + "learning_rate": 3.2366020240357166e-06, + "loss": 0.30712565779685974, + "mean_token_accuracy": 0.8863866329193115, + "num_tokens": 15302220.0, + "step": 1717 + }, + { + "epoch": 1.3054711246200608, + "grad_norm": 2.9814558029174805, + "learning_rate": 3.2346003449095803e-06, + "loss": 0.3922840356826782, + "mean_token_accuracy": 0.868030309677124, + "num_tokens": 15306747.0, + "step": 1718 + }, + { + "epoch": 1.3062310030395137, + "grad_norm": 3.3417985439300537, + "learning_rate": 3.2325981502126434e-06, + "loss": 0.30750396847724915, + "mean_token_accuracy": 0.9065356850624084, + "num_tokens": 15310309.0, + "step": 1719 + }, + { + "epoch": 1.3069908814589666, + "grad_norm": 2.237682819366455, + "learning_rate": 3.2305954413501252e-06, + "loss": 0.35068294405937195, + "mean_token_accuracy": 0.8887614011764526, + "num_tokens": 15316463.0, + "step": 1720 + }, + { + "epoch": 1.3077507598784195, + "grad_norm": 1.9526605606079102, + "learning_rate": 3.228592219727602e-06, + "loss": 0.42061835527420044, + "mean_token_accuracy": 0.8456839323043823, + "num_tokens": 15323984.0, + "step": 1721 + }, + { + "epoch": 1.3085106382978724, + "grad_norm": 1.6454212665557861, + "learning_rate": 3.226588486751012e-06, + "loss": 0.5189976692199707, + "mean_token_accuracy": 0.8187375068664551, + "num_tokens": 15338807.0, + "step": 1722 + }, + { + "epoch": 1.3092705167173253, + "grad_norm": 1.4521609544754028, + "learning_rate": 3.2245842438266526e-06, + "loss": 0.329673171043396, + "mean_token_accuracy": 0.853867769241333, + "num_tokens": 15350400.0, + "step": 1723 + }, + { + "epoch": 1.3100303951367782, + "grad_norm": 1.8750989437103271, + "learning_rate": 3.222579492361179e-06, + "loss": 0.4635341167449951, + "mean_token_accuracy": 0.8393422365188599, + "num_tokens": 15360557.0, + "step": 1724 + }, + { + "epoch": 1.310790273556231, + "grad_norm": 1.2728849649429321, + "learning_rate": 3.220574233761603e-06, + "loss": 0.3255572021007538, + "mean_token_accuracy": 0.8989741802215576, + "num_tokens": 15376548.0, + "step": 1725 + }, + { + "epoch": 1.3115501519756838, + "grad_norm": 3.5155694484710693, + "learning_rate": 3.2185684694352913e-06, + "loss": 0.34204089641571045, + "mean_token_accuracy": 0.8781906366348267, + "num_tokens": 15380304.0, + "step": 1726 + }, + { + "epoch": 1.3123100303951367, + "grad_norm": 2.059800148010254, + "learning_rate": 3.216562200789968e-06, + "loss": 0.36288338899612427, + "mean_token_accuracy": 0.8595278263092041, + "num_tokens": 15387653.0, + "step": 1727 + }, + { + "epoch": 1.3130699088145896, + "grad_norm": 3.5388240814208984, + "learning_rate": 3.214555429233707e-06, + "loss": 0.5434849858283997, + "mean_token_accuracy": 0.8074631690979004, + "num_tokens": 15391662.0, + "step": 1728 + }, + { + "epoch": 1.3138297872340425, + "grad_norm": 2.8595592975616455, + "learning_rate": 3.2125481561749406e-06, + "loss": 0.5113687515258789, + "mean_token_accuracy": 0.8448649644851685, + "num_tokens": 15397536.0, + "step": 1729 + }, + { + "epoch": 1.3145896656534954, + "grad_norm": 2.50386905670166, + "learning_rate": 3.210540383022449e-06, + "loss": 0.5293697118759155, + "mean_token_accuracy": 0.8096445798873901, + "num_tokens": 15403478.0, + "step": 1730 + }, + { + "epoch": 1.3153495440729484, + "grad_norm": 1.880035400390625, + "learning_rate": 3.208532111185365e-06, + "loss": 0.5344835519790649, + "mean_token_accuracy": 0.8172965049743652, + "num_tokens": 15413812.0, + "step": 1731 + }, + { + "epoch": 1.3161094224924013, + "grad_norm": 1.3688768148422241, + "learning_rate": 3.2065233420731717e-06, + "loss": 0.2577427327632904, + "mean_token_accuracy": 0.9142681360244751, + "num_tokens": 15423583.0, + "step": 1732 + }, + { + "epoch": 1.3168693009118542, + "grad_norm": 1.7945705652236938, + "learning_rate": 3.2045140770956987e-06, + "loss": 0.3983926773071289, + "mean_token_accuracy": 0.8652000427246094, + "num_tokens": 15432473.0, + "step": 1733 + }, + { + "epoch": 1.3176291793313069, + "grad_norm": 1.8243350982666016, + "learning_rate": 3.2025043176631283e-06, + "loss": 0.48644185066223145, + "mean_token_accuracy": 0.8319193124771118, + "num_tokens": 15445463.0, + "step": 1734 + }, + { + "epoch": 1.31838905775076, + "grad_norm": 2.000094175338745, + "learning_rate": 3.2004940651859844e-06, + "loss": 0.43567317724227905, + "mean_token_accuracy": 0.8857482671737671, + "num_tokens": 15452382.0, + "step": 1735 + }, + { + "epoch": 1.3191489361702127, + "grad_norm": 2.379974365234375, + "learning_rate": 3.198483321075141e-06, + "loss": 0.5153506398200989, + "mean_token_accuracy": 0.8295865654945374, + "num_tokens": 15458740.0, + "step": 1736 + }, + { + "epoch": 1.3199088145896656, + "grad_norm": 1.6564184427261353, + "learning_rate": 3.196472086741815e-06, + "loss": 0.508430540561676, + "mean_token_accuracy": 0.8181540369987488, + "num_tokens": 15471844.0, + "step": 1737 + }, + { + "epoch": 1.3206686930091185, + "grad_norm": 2.006925344467163, + "learning_rate": 3.194460363597569e-06, + "loss": 0.34542378783226013, + "mean_token_accuracy": 0.8827437162399292, + "num_tokens": 15478414.0, + "step": 1738 + }, + { + "epoch": 1.3214285714285714, + "grad_norm": 3.589045763015747, + "learning_rate": 3.192448153054306e-06, + "loss": 0.4385780096054077, + "mean_token_accuracy": 0.8480287790298462, + "num_tokens": 15482063.0, + "step": 1739 + }, + { + "epoch": 1.3221884498480243, + "grad_norm": 1.9797427654266357, + "learning_rate": 3.190435456524275e-06, + "loss": 0.4330386519432068, + "mean_token_accuracy": 0.8458058834075928, + "num_tokens": 15489803.0, + "step": 1740 + }, + { + "epoch": 1.3229483282674772, + "grad_norm": 1.4777411222457886, + "learning_rate": 3.188422275420063e-06, + "loss": 0.3997895419597626, + "mean_token_accuracy": 0.8639512062072754, + "num_tokens": 15501103.0, + "step": 1741 + }, + { + "epoch": 1.3237082066869301, + "grad_norm": 2.882338523864746, + "learning_rate": 3.186408611154597e-06, + "loss": 0.2336438149213791, + "mean_token_accuracy": 0.9176726937294006, + "num_tokens": 15504854.0, + "step": 1742 + }, + { + "epoch": 1.324468085106383, + "grad_norm": 2.353503704071045, + "learning_rate": 3.184394465141146e-06, + "loss": 0.4107069671154022, + "mean_token_accuracy": 0.8677014112472534, + "num_tokens": 15510662.0, + "step": 1743 + }, + { + "epoch": 1.325227963525836, + "grad_norm": 2.6551976203918457, + "learning_rate": 3.1823798387933134e-06, + "loss": 0.3862302899360657, + "mean_token_accuracy": 0.8819445371627808, + "num_tokens": 15515681.0, + "step": 1744 + }, + { + "epoch": 1.3259878419452886, + "grad_norm": 1.478572964668274, + "learning_rate": 3.180364733525043e-06, + "loss": 0.43972986936569214, + "mean_token_accuracy": 0.832388162612915, + "num_tokens": 15529542.0, + "step": 1745 + }, + { + "epoch": 1.3267477203647418, + "grad_norm": 1.6003550291061401, + "learning_rate": 3.178349150750612e-06, + "loss": 0.3404902219772339, + "mean_token_accuracy": 0.8764007091522217, + "num_tokens": 15538865.0, + "step": 1746 + }, + { + "epoch": 1.3275075987841944, + "grad_norm": 2.130689859390259, + "learning_rate": 3.1763330918846347e-06, + "loss": 0.383136510848999, + "mean_token_accuracy": 0.8652247190475464, + "num_tokens": 15545567.0, + "step": 1747 + }, + { + "epoch": 1.3282674772036474, + "grad_norm": 2.395937442779541, + "learning_rate": 3.1743165583420586e-06, + "loss": 0.3870319128036499, + "mean_token_accuracy": 0.8618065118789673, + "num_tokens": 15551090.0, + "step": 1748 + }, + { + "epoch": 1.3290273556231003, + "grad_norm": 2.0841057300567627, + "learning_rate": 3.1722995515381644e-06, + "loss": 0.4838739335536957, + "mean_token_accuracy": 0.8548711538314819, + "num_tokens": 15558913.0, + "step": 1749 + }, + { + "epoch": 1.3297872340425532, + "grad_norm": 1.4237847328186035, + "learning_rate": 3.1702820728885657e-06, + "loss": 0.40350261330604553, + "mean_token_accuracy": 0.858984649181366, + "num_tokens": 15572045.0, + "step": 1750 + }, + { + "epoch": 1.330547112462006, + "grad_norm": 2.2641282081604004, + "learning_rate": 3.1682641238092064e-06, + "loss": 0.5117636919021606, + "mean_token_accuracy": 0.8078924417495728, + "num_tokens": 15579753.0, + "step": 1751 + }, + { + "epoch": 1.331306990881459, + "grad_norm": 1.0010309219360352, + "learning_rate": 3.1662457057163603e-06, + "loss": 0.3220978379249573, + "mean_token_accuracy": 0.8786559104919434, + "num_tokens": 15602823.0, + "step": 1752 + }, + { + "epoch": 1.332066869300912, + "grad_norm": 2.441230535507202, + "learning_rate": 3.164226820026632e-06, + "loss": 0.37529727816581726, + "mean_token_accuracy": 0.8886898756027222, + "num_tokens": 15608473.0, + "step": 1753 + }, + { + "epoch": 1.3328267477203648, + "grad_norm": 1.2960991859436035, + "learning_rate": 3.162207468156952e-06, + "loss": 0.3393767476081848, + "mean_token_accuracy": 0.8766993284225464, + "num_tokens": 15620893.0, + "step": 1754 + }, + { + "epoch": 1.3335866261398177, + "grad_norm": 2.0806996822357178, + "learning_rate": 3.16018765152458e-06, + "loss": 0.38034507632255554, + "mean_token_accuracy": 0.8854838609695435, + "num_tokens": 15627068.0, + "step": 1755 + }, + { + "epoch": 1.3343465045592704, + "grad_norm": 1.4316699504852295, + "learning_rate": 3.1581673715471007e-06, + "loss": 0.3665890693664551, + "mean_token_accuracy": 0.870919406414032, + "num_tokens": 15641070.0, + "step": 1756 + }, + { + "epoch": 1.3351063829787235, + "grad_norm": 1.3466622829437256, + "learning_rate": 3.1561466296424247e-06, + "loss": 0.37387198209762573, + "mean_token_accuracy": 0.8633951544761658, + "num_tokens": 15653777.0, + "step": 1757 + }, + { + "epoch": 1.3358662613981762, + "grad_norm": 1.8108628988265991, + "learning_rate": 3.154125427228786e-06, + "loss": 0.38428938388824463, + "mean_token_accuracy": 0.85402512550354, + "num_tokens": 15662494.0, + "step": 1758 + }, + { + "epoch": 1.3366261398176291, + "grad_norm": 1.3221700191497803, + "learning_rate": 3.152103765724743e-06, + "loss": 0.42825520038604736, + "mean_token_accuracy": 0.8435465097427368, + "num_tokens": 15677552.0, + "step": 1759 + }, + { + "epoch": 1.337386018237082, + "grad_norm": 2.6247692108154297, + "learning_rate": 3.150081646549174e-06, + "loss": 0.36186715960502625, + "mean_token_accuracy": 0.8767328262329102, + "num_tokens": 15682103.0, + "step": 1760 + }, + { + "epoch": 1.338145896656535, + "grad_norm": 2.1469814777374268, + "learning_rate": 3.1480590711212823e-06, + "loss": 0.3734385669231415, + "mean_token_accuracy": 0.8711104393005371, + "num_tokens": 15689182.0, + "step": 1761 + }, + { + "epoch": 1.3389057750759878, + "grad_norm": 2.1702585220336914, + "learning_rate": 3.1460360408605866e-06, + "loss": 0.2795315086841583, + "mean_token_accuracy": 0.8892190456390381, + "num_tokens": 15694272.0, + "step": 1762 + }, + { + "epoch": 1.3396656534954408, + "grad_norm": 1.918797254562378, + "learning_rate": 3.144012557186931e-06, + "loss": 0.4363473057746887, + "mean_token_accuracy": 0.8573931455612183, + "num_tokens": 15703532.0, + "step": 1763 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 2.5579960346221924, + "learning_rate": 3.14198862152047e-06, + "loss": 0.406247079372406, + "mean_token_accuracy": 0.8617593050003052, + "num_tokens": 15708652.0, + "step": 1764 + }, + { + "epoch": 1.3411854103343466, + "grad_norm": 2.3617870807647705, + "learning_rate": 3.1399642352816825e-06, + "loss": 0.2839522659778595, + "mean_token_accuracy": 0.8996064066886902, + "num_tokens": 15713598.0, + "step": 1765 + }, + { + "epoch": 1.3419452887537995, + "grad_norm": 1.248302936553955, + "learning_rate": 3.1379393998913594e-06, + "loss": 0.2922290861606598, + "mean_token_accuracy": 0.8948773145675659, + "num_tokens": 15726693.0, + "step": 1766 + }, + { + "epoch": 1.3427051671732522, + "grad_norm": 2.143599510192871, + "learning_rate": 3.135914116770609e-06, + "loss": 0.32176223397254944, + "mean_token_accuracy": 0.8808754682540894, + "num_tokens": 15731901.0, + "step": 1767 + }, + { + "epoch": 1.3434650455927053, + "grad_norm": 4.226369857788086, + "learning_rate": 3.1338883873408517e-06, + "loss": 0.4682556390762329, + "mean_token_accuracy": 0.8566025495529175, + "num_tokens": 15735029.0, + "step": 1768 + }, + { + "epoch": 1.344224924012158, + "grad_norm": 1.8695988655090332, + "learning_rate": 3.1318622130238237e-06, + "loss": 0.4297192394733429, + "mean_token_accuracy": 0.8419148921966553, + "num_tokens": 15744310.0, + "step": 1769 + }, + { + "epoch": 1.344984802431611, + "grad_norm": 2.4321305751800537, + "learning_rate": 3.1298355952415714e-06, + "loss": 0.36076444387435913, + "mean_token_accuracy": 0.8826035261154175, + "num_tokens": 15749337.0, + "step": 1770 + }, + { + "epoch": 1.3457446808510638, + "grad_norm": 1.5500011444091797, + "learning_rate": 3.127808535416454e-06, + "loss": 0.48664039373397827, + "mean_token_accuracy": 0.844344437122345, + "num_tokens": 15761096.0, + "step": 1771 + }, + { + "epoch": 1.3465045592705167, + "grad_norm": 2.1498289108276367, + "learning_rate": 3.1257810349711388e-06, + "loss": 0.4841752052307129, + "mean_token_accuracy": 0.8324567079544067, + "num_tokens": 15768646.0, + "step": 1772 + }, + { + "epoch": 1.3472644376899696, + "grad_norm": 1.2995187044143677, + "learning_rate": 3.1237530953286046e-06, + "loss": 0.492019385099411, + "mean_token_accuracy": 0.8285316228866577, + "num_tokens": 15788401.0, + "step": 1773 + }, + { + "epoch": 1.3480243161094225, + "grad_norm": 2.324819803237915, + "learning_rate": 3.121724717912138e-06, + "loss": 0.33166298270225525, + "mean_token_accuracy": 0.8856451511383057, + "num_tokens": 15794097.0, + "step": 1774 + }, + { + "epoch": 1.3487841945288754, + "grad_norm": 1.9611430168151855, + "learning_rate": 3.11969590414533e-06, + "loss": 0.3974284827709198, + "mean_token_accuracy": 0.8751305937767029, + "num_tokens": 15801065.0, + "step": 1775 + }, + { + "epoch": 1.3495440729483283, + "grad_norm": 1.7084417343139648, + "learning_rate": 3.1176666554520827e-06, + "loss": 0.38729435205459595, + "mean_token_accuracy": 0.8680770397186279, + "num_tokens": 15810353.0, + "step": 1776 + }, + { + "epoch": 1.3503039513677813, + "grad_norm": 1.7616240978240967, + "learning_rate": 3.1156369732566006e-06, + "loss": 0.4271578788757324, + "mean_token_accuracy": 0.843730092048645, + "num_tokens": 15821889.0, + "step": 1777 + }, + { + "epoch": 1.351063829787234, + "grad_norm": 2.030747413635254, + "learning_rate": 3.113606858983391e-06, + "loss": 0.361891508102417, + "mean_token_accuracy": 0.8522407412528992, + "num_tokens": 15830800.0, + "step": 1778 + }, + { + "epoch": 1.3518237082066868, + "grad_norm": 1.4842649698257446, + "learning_rate": 3.1115763140572686e-06, + "loss": 0.466334730386734, + "mean_token_accuracy": 0.8433995246887207, + "num_tokens": 15849422.0, + "step": 1779 + }, + { + "epoch": 1.3525835866261398, + "grad_norm": 1.6595379114151, + "learning_rate": 3.109545339903347e-06, + "loss": 0.4622533321380615, + "mean_token_accuracy": 0.8526314496994019, + "num_tokens": 15860431.0, + "step": 1780 + }, + { + "epoch": 1.3533434650455927, + "grad_norm": 2.1235809326171875, + "learning_rate": 3.107513937947041e-06, + "loss": 0.42694270610809326, + "mean_token_accuracy": 0.854864239692688, + "num_tokens": 15869044.0, + "step": 1781 + }, + { + "epoch": 1.3541033434650456, + "grad_norm": 1.5889263153076172, + "learning_rate": 3.1054821096140675e-06, + "loss": 0.41838499903678894, + "mean_token_accuracy": 0.8671513795852661, + "num_tokens": 15878598.0, + "step": 1782 + }, + { + "epoch": 1.3548632218844985, + "grad_norm": 2.2261741161346436, + "learning_rate": 3.1034498563304435e-06, + "loss": 0.4045066237449646, + "mean_token_accuracy": 0.843826949596405, + "num_tokens": 15885167.0, + "step": 1783 + }, + { + "epoch": 1.3556231003039514, + "grad_norm": 2.2569329738616943, + "learning_rate": 3.1014171795224794e-06, + "loss": 0.36677104234695435, + "mean_token_accuracy": 0.8747833967208862, + "num_tokens": 15891308.0, + "step": 1784 + }, + { + "epoch": 1.3563829787234043, + "grad_norm": 2.1027088165283203, + "learning_rate": 3.0993840806167884e-06, + "loss": 0.437946081161499, + "mean_token_accuracy": 0.8370785117149353, + "num_tokens": 15898952.0, + "step": 1785 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 1.8768929243087769, + "learning_rate": 3.0973505610402767e-06, + "loss": 0.4201734662055969, + "mean_token_accuracy": 0.8474810123443604, + "num_tokens": 15907340.0, + "step": 1786 + }, + { + "epoch": 1.35790273556231, + "grad_norm": 1.7216229438781738, + "learning_rate": 3.0953166222201474e-06, + "loss": 0.4225231409072876, + "mean_token_accuracy": 0.8437749147415161, + "num_tokens": 15917852.0, + "step": 1787 + }, + { + "epoch": 1.358662613981763, + "grad_norm": 2.6256966590881348, + "learning_rate": 3.093282265583895e-06, + "loss": 0.435439795255661, + "mean_token_accuracy": 0.8452040553092957, + "num_tokens": 15923739.0, + "step": 1788 + }, + { + "epoch": 1.3594224924012157, + "grad_norm": 2.90028977394104, + "learning_rate": 3.0912474925593124e-06, + "loss": 0.3730456829071045, + "mean_token_accuracy": 0.8766646385192871, + "num_tokens": 15927943.0, + "step": 1789 + }, + { + "epoch": 1.3601823708206686, + "grad_norm": 1.5966626405715942, + "learning_rate": 3.0892123045744787e-06, + "loss": 0.42150455713272095, + "mean_token_accuracy": 0.854656457901001, + "num_tokens": 15939922.0, + "step": 1790 + }, + { + "epoch": 1.3609422492401215, + "grad_norm": 1.8069748878479004, + "learning_rate": 3.0871767030577686e-06, + "loss": 0.4954872131347656, + "mean_token_accuracy": 0.8289790153503418, + "num_tokens": 15950095.0, + "step": 1791 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 2.0855250358581543, + "learning_rate": 3.085140689437846e-06, + "loss": 0.41999945044517517, + "mean_token_accuracy": 0.8517382144927979, + "num_tokens": 15957972.0, + "step": 1792 + }, + { + "epoch": 1.3624620060790273, + "grad_norm": 2.108659267425537, + "learning_rate": 3.0831042651436634e-06, + "loss": 0.3668023645877838, + "mean_token_accuracy": 0.8710855841636658, + "num_tokens": 15965614.0, + "step": 1793 + }, + { + "epoch": 1.3632218844984803, + "grad_norm": 1.3799632787704468, + "learning_rate": 3.0810674316044602e-06, + "loss": 0.351409375667572, + "mean_token_accuracy": 0.870837390422821, + "num_tokens": 15978854.0, + "step": 1794 + }, + { + "epoch": 1.3639817629179332, + "grad_norm": 1.540397047996521, + "learning_rate": 3.0790301902497664e-06, + "loss": 0.403600811958313, + "mean_token_accuracy": 0.8485002517700195, + "num_tokens": 15993324.0, + "step": 1795 + }, + { + "epoch": 1.364741641337386, + "grad_norm": 1.946882963180542, + "learning_rate": 3.076992542509396e-06, + "loss": 0.40118327736854553, + "mean_token_accuracy": 0.8607497811317444, + "num_tokens": 16001937.0, + "step": 1796 + }, + { + "epoch": 1.365501519756839, + "grad_norm": 2.0464305877685547, + "learning_rate": 3.0749544898134487e-06, + "loss": 0.31742292642593384, + "mean_token_accuracy": 0.8878391981124878, + "num_tokens": 16009277.0, + "step": 1797 + }, + { + "epoch": 1.3662613981762917, + "grad_norm": 2.091754913330078, + "learning_rate": 3.072916033592307e-06, + "loss": 0.31580421328544617, + "mean_token_accuracy": 0.8875244855880737, + "num_tokens": 16015756.0, + "step": 1798 + }, + { + "epoch": 1.3670212765957448, + "grad_norm": 3.4449212551116943, + "learning_rate": 3.0708771752766397e-06, + "loss": 0.4692591726779938, + "mean_token_accuracy": 0.8456202149391174, + "num_tokens": 16019912.0, + "step": 1799 + }, + { + "epoch": 1.3677811550151975, + "grad_norm": 1.600419521331787, + "learning_rate": 3.068837916297396e-06, + "loss": 0.40389442443847656, + "mean_token_accuracy": 0.8378961086273193, + "num_tokens": 16032637.0, + "step": 1800 + }, + { + "epoch": 1.3685410334346504, + "grad_norm": 1.5282686948776245, + "learning_rate": 3.0667982580858047e-06, + "loss": 0.379841685295105, + "mean_token_accuracy": 0.8752143383026123, + "num_tokens": 16045205.0, + "step": 1801 + }, + { + "epoch": 1.3693009118541033, + "grad_norm": 2.486079454421997, + "learning_rate": 3.0647582020733773e-06, + "loss": 0.41060030460357666, + "mean_token_accuracy": 0.8575131893157959, + "num_tokens": 16051189.0, + "step": 1802 + }, + { + "epoch": 1.3700607902735562, + "grad_norm": 1.9458621740341187, + "learning_rate": 3.062717749691904e-06, + "loss": 0.4442213773727417, + "mean_token_accuracy": 0.8451495170593262, + "num_tokens": 16059700.0, + "step": 1803 + }, + { + "epoch": 1.3708206686930091, + "grad_norm": 1.4333001375198364, + "learning_rate": 3.0606769023734535e-06, + "loss": 0.39132001996040344, + "mean_token_accuracy": 0.8609901666641235, + "num_tokens": 16072458.0, + "step": 1804 + }, + { + "epoch": 1.371580547112462, + "grad_norm": 1.490355372428894, + "learning_rate": 3.0586356615503693e-06, + "loss": 0.4108564257621765, + "mean_token_accuracy": 0.8871046304702759, + "num_tokens": 16083142.0, + "step": 1805 + }, + { + "epoch": 1.372340425531915, + "grad_norm": 1.7765129804611206, + "learning_rate": 3.056594028655274e-06, + "loss": 0.3850266635417938, + "mean_token_accuracy": 0.8923365473747253, + "num_tokens": 16092519.0, + "step": 1806 + }, + { + "epoch": 1.3731003039513678, + "grad_norm": 1.955661416053772, + "learning_rate": 3.0545520051210637e-06, + "loss": 0.4665378928184509, + "mean_token_accuracy": 0.837419867515564, + "num_tokens": 16100618.0, + "step": 1807 + }, + { + "epoch": 1.3738601823708207, + "grad_norm": 3.259265422821045, + "learning_rate": 3.052509592380909e-06, + "loss": 0.24722981452941895, + "mean_token_accuracy": 0.9106054306030273, + "num_tokens": 16103836.0, + "step": 1808 + }, + { + "epoch": 1.3746200607902734, + "grad_norm": 1.7995736598968506, + "learning_rate": 3.050466791868254e-06, + "loss": 0.4982220530509949, + "mean_token_accuracy": 0.8298169374465942, + "num_tokens": 16114727.0, + "step": 1809 + }, + { + "epoch": 1.3753799392097266, + "grad_norm": 1.9643093347549438, + "learning_rate": 3.048423605016815e-06, + "loss": 0.5076829195022583, + "mean_token_accuracy": 0.8303098678588867, + "num_tokens": 16129491.0, + "step": 1810 + }, + { + "epoch": 1.3761398176291793, + "grad_norm": 3.505594491958618, + "learning_rate": 3.0463800332605787e-06, + "loss": 0.27466052770614624, + "mean_token_accuracy": 0.9018045663833618, + "num_tokens": 16132640.0, + "step": 1811 + }, + { + "epoch": 1.3768996960486322, + "grad_norm": 1.798437237739563, + "learning_rate": 3.0443360780338034e-06, + "loss": 0.4004853069782257, + "mean_token_accuracy": 0.8569544553756714, + "num_tokens": 16143317.0, + "step": 1812 + }, + { + "epoch": 1.377659574468085, + "grad_norm": 2.276740789413452, + "learning_rate": 3.042291740771014e-06, + "loss": 0.3823797106742859, + "mean_token_accuracy": 0.8764113783836365, + "num_tokens": 16148898.0, + "step": 1813 + }, + { + "epoch": 1.378419452887538, + "grad_norm": 2.5051357746124268, + "learning_rate": 3.0402470229070057e-06, + "loss": 0.40365856885910034, + "mean_token_accuracy": 0.8809891939163208, + "num_tokens": 16153815.0, + "step": 1814 + }, + { + "epoch": 1.3791793313069909, + "grad_norm": 1.2379236221313477, + "learning_rate": 3.03820192587684e-06, + "loss": 0.3955119848251343, + "mean_token_accuracy": 0.8536627292633057, + "num_tokens": 16167783.0, + "step": 1815 + }, + { + "epoch": 1.3799392097264438, + "grad_norm": 2.2286343574523926, + "learning_rate": 3.036156451115846e-06, + "loss": 0.39647501707077026, + "mean_token_accuracy": 0.8621993064880371, + "num_tokens": 16174707.0, + "step": 1816 + }, + { + "epoch": 1.3806990881458967, + "grad_norm": 1.884639024734497, + "learning_rate": 3.034110600059616e-06, + "loss": 0.31612110137939453, + "mean_token_accuracy": 0.8942475318908691, + "num_tokens": 16181919.0, + "step": 1817 + }, + { + "epoch": 1.3814589665653496, + "grad_norm": 1.891312599182129, + "learning_rate": 3.0320643741440052e-06, + "loss": 0.46209126710891724, + "mean_token_accuracy": 0.8374713659286499, + "num_tokens": 16189276.0, + "step": 1818 + }, + { + "epoch": 1.3822188449848025, + "grad_norm": 2.507478713989258, + "learning_rate": 3.0300177748051375e-06, + "loss": 0.37601593136787415, + "mean_token_accuracy": 0.8633589148521423, + "num_tokens": 16194346.0, + "step": 1819 + }, + { + "epoch": 1.3829787234042552, + "grad_norm": 1.5046696662902832, + "learning_rate": 3.0279708034793907e-06, + "loss": 0.3284982144832611, + "mean_token_accuracy": 0.8792630434036255, + "num_tokens": 16205457.0, + "step": 1820 + }, + { + "epoch": 1.3837386018237083, + "grad_norm": 2.4244449138641357, + "learning_rate": 3.025923461603412e-06, + "loss": 0.40939009189605713, + "mean_token_accuracy": 0.8596426248550415, + "num_tokens": 16211866.0, + "step": 1821 + }, + { + "epoch": 1.384498480243161, + "grad_norm": 2.8656933307647705, + "learning_rate": 3.0238757506141013e-06, + "loss": 0.4397110044956207, + "mean_token_accuracy": 0.8597331047058105, + "num_tokens": 16216607.0, + "step": 1822 + }, + { + "epoch": 1.385258358662614, + "grad_norm": 2.0718610286712646, + "learning_rate": 3.0218276719486245e-06, + "loss": 0.49057573080062866, + "mean_token_accuracy": 0.8325331211090088, + "num_tokens": 16224014.0, + "step": 1823 + }, + { + "epoch": 1.3860182370820668, + "grad_norm": 1.054450273513794, + "learning_rate": 3.019779227044398e-06, + "loss": 0.3758106827735901, + "mean_token_accuracy": 0.8689473867416382, + "num_tokens": 16248627.0, + "step": 1824 + }, + { + "epoch": 1.3867781155015197, + "grad_norm": 2.1115148067474365, + "learning_rate": 3.0177304173391038e-06, + "loss": 0.502967119216919, + "mean_token_accuracy": 0.823198676109314, + "num_tokens": 16256255.0, + "step": 1825 + }, + { + "epoch": 1.3875379939209727, + "grad_norm": 2.207277297973633, + "learning_rate": 3.015681244270672e-06, + "loss": 0.3458971083164215, + "mean_token_accuracy": 0.8930196762084961, + "num_tokens": 16261823.0, + "step": 1826 + }, + { + "epoch": 1.3882978723404256, + "grad_norm": 1.289669156074524, + "learning_rate": 3.0136317092772923e-06, + "loss": 0.4422765374183655, + "mean_token_accuracy": 0.8358346819877625, + "num_tokens": 16280659.0, + "step": 1827 + }, + { + "epoch": 1.3890577507598785, + "grad_norm": 2.233865737915039, + "learning_rate": 3.0115818137974066e-06, + "loss": 0.3643006384372711, + "mean_token_accuracy": 0.8682862520217896, + "num_tokens": 16286356.0, + "step": 1828 + }, + { + "epoch": 1.3898176291793314, + "grad_norm": 1.0950042009353638, + "learning_rate": 3.0095315592697126e-06, + "loss": 0.34712421894073486, + "mean_token_accuracy": 0.8578766584396362, + "num_tokens": 16307298.0, + "step": 1829 + }, + { + "epoch": 1.3905775075987843, + "grad_norm": 1.1708037853240967, + "learning_rate": 3.007480947133155e-06, + "loss": 0.33152541518211365, + "mean_token_accuracy": 0.894973874092102, + "num_tokens": 16323232.0, + "step": 1830 + }, + { + "epoch": 1.391337386018237, + "grad_norm": 1.2226970195770264, + "learning_rate": 3.0054299788269343e-06, + "loss": 0.3915635943412781, + "mean_token_accuracy": 0.8575779795646667, + "num_tokens": 16339273.0, + "step": 1831 + }, + { + "epoch": 1.39209726443769, + "grad_norm": 1.2226042747497559, + "learning_rate": 3.0033786557904982e-06, + "loss": 0.45846253633499146, + "mean_token_accuracy": 0.8290432691574097, + "num_tokens": 16360145.0, + "step": 1832 + }, + { + "epoch": 1.3928571428571428, + "grad_norm": 2.0117406845092773, + "learning_rate": 3.001326979463545e-06, + "loss": 0.3837882876396179, + "mean_token_accuracy": 0.8941739797592163, + "num_tokens": 16366602.0, + "step": 1833 + }, + { + "epoch": 1.3936170212765957, + "grad_norm": 1.8419997692108154, + "learning_rate": 2.9992749512860177e-06, + "loss": 0.40777021646499634, + "mean_token_accuracy": 0.854655385017395, + "num_tokens": 16375611.0, + "step": 1834 + }, + { + "epoch": 1.3943768996960486, + "grad_norm": 1.9405122995376587, + "learning_rate": 2.9972225726981114e-06, + "loss": 0.46685922145843506, + "mean_token_accuracy": 0.8493201732635498, + "num_tokens": 16384878.0, + "step": 1835 + }, + { + "epoch": 1.3951367781155015, + "grad_norm": 1.2425674200057983, + "learning_rate": 2.995169845140264e-06, + "loss": 0.394692063331604, + "mean_token_accuracy": 0.851348876953125, + "num_tokens": 16404452.0, + "step": 1836 + }, + { + "epoch": 1.3958966565349544, + "grad_norm": 1.2215365171432495, + "learning_rate": 2.9931167700531575e-06, + "loss": 0.31412452459335327, + "mean_token_accuracy": 0.882760763168335, + "num_tokens": 16419358.0, + "step": 1837 + }, + { + "epoch": 1.3966565349544073, + "grad_norm": 1.912168025970459, + "learning_rate": 2.9910633488777198e-06, + "loss": 0.5065487623214722, + "mean_token_accuracy": 0.8524355292320251, + "num_tokens": 16430418.0, + "step": 1838 + }, + { + "epoch": 1.3974164133738602, + "grad_norm": 2.2173948287963867, + "learning_rate": 2.989009583055121e-06, + "loss": 0.4290938377380371, + "mean_token_accuracy": 0.8381836414337158, + "num_tokens": 16438267.0, + "step": 1839 + }, + { + "epoch": 1.3981762917933132, + "grad_norm": 1.8293484449386597, + "learning_rate": 2.9869554740267726e-06, + "loss": 0.41683733463287354, + "mean_token_accuracy": 0.8548779487609863, + "num_tokens": 16447382.0, + "step": 1840 + }, + { + "epoch": 1.398936170212766, + "grad_norm": 1.835015892982483, + "learning_rate": 2.9849010232343274e-06, + "loss": 0.5080599784851074, + "mean_token_accuracy": 0.8193596601486206, + "num_tokens": 16458541.0, + "step": 1841 + }, + { + "epoch": 1.3996960486322187, + "grad_norm": 2.031339645385742, + "learning_rate": 2.982846232119679e-06, + "loss": 0.5168882012367249, + "mean_token_accuracy": 0.8525956869125366, + "num_tokens": 16467747.0, + "step": 1842 + }, + { + "epoch": 1.4004559270516717, + "grad_norm": 1.5554167032241821, + "learning_rate": 2.9807911021249573e-06, + "loss": 0.35098958015441895, + "mean_token_accuracy": 0.888373851776123, + "num_tokens": 16479319.0, + "step": 1843 + }, + { + "epoch": 1.4012158054711246, + "grad_norm": 1.7183740139007568, + "learning_rate": 2.9787356346925327e-06, + "loss": 0.41263148188591003, + "mean_token_accuracy": 0.8478364944458008, + "num_tokens": 16489952.0, + "step": 1844 + }, + { + "epoch": 1.4019756838905775, + "grad_norm": 1.7743209600448608, + "learning_rate": 2.9766798312650112e-06, + "loss": 0.4211183190345764, + "mean_token_accuracy": 0.8641136884689331, + "num_tokens": 16498655.0, + "step": 1845 + }, + { + "epoch": 1.4027355623100304, + "grad_norm": 2.141300916671753, + "learning_rate": 2.9746236932852355e-06, + "loss": 0.49548980593681335, + "mean_token_accuracy": 0.8304252028465271, + "num_tokens": 16506348.0, + "step": 1846 + }, + { + "epoch": 1.4034954407294833, + "grad_norm": 2.341571807861328, + "learning_rate": 2.9725672221962804e-06, + "loss": 0.40804803371429443, + "mean_token_accuracy": 0.8545800447463989, + "num_tokens": 16513091.0, + "step": 1847 + }, + { + "epoch": 1.4042553191489362, + "grad_norm": 1.934428095817566, + "learning_rate": 2.9705104194414587e-06, + "loss": 0.30029812455177307, + "mean_token_accuracy": 0.9032052755355835, + "num_tokens": 16519455.0, + "step": 1848 + }, + { + "epoch": 1.405015197568389, + "grad_norm": 1.420804500579834, + "learning_rate": 2.9684532864643123e-06, + "loss": 0.4384060502052307, + "mean_token_accuracy": 0.8465110063552856, + "num_tokens": 16533222.0, + "step": 1849 + }, + { + "epoch": 1.405775075987842, + "grad_norm": 2.1180737018585205, + "learning_rate": 2.9663958247086165e-06, + "loss": 0.3915565609931946, + "mean_token_accuracy": 0.8633890748023987, + "num_tokens": 16539489.0, + "step": 1850 + }, + { + "epoch": 1.4065349544072947, + "grad_norm": 1.408048152923584, + "learning_rate": 2.964338035618378e-06, + "loss": 0.46166157722473145, + "mean_token_accuracy": 0.8305013179779053, + "num_tokens": 16555785.0, + "step": 1851 + }, + { + "epoch": 1.4072948328267478, + "grad_norm": 1.3418530225753784, + "learning_rate": 2.9622799206378306e-06, + "loss": 0.5314373970031738, + "mean_token_accuracy": 0.81779944896698, + "num_tokens": 16578111.0, + "step": 1852 + }, + { + "epoch": 1.4080547112462005, + "grad_norm": 1.4634262323379517, + "learning_rate": 2.9602214812114414e-06, + "loss": 0.4859408140182495, + "mean_token_accuracy": 0.8261818885803223, + "num_tokens": 16591976.0, + "step": 1853 + }, + { + "epoch": 1.4088145896656534, + "grad_norm": 1.4840295314788818, + "learning_rate": 2.9581627187838997e-06, + "loss": 0.4079628586769104, + "mean_token_accuracy": 0.8549603223800659, + "num_tokens": 16603631.0, + "step": 1854 + }, + { + "epoch": 1.4095744680851063, + "grad_norm": 2.1474642753601074, + "learning_rate": 2.956103634800126e-06, + "loss": 0.32997995615005493, + "mean_token_accuracy": 0.8836915493011475, + "num_tokens": 16609875.0, + "step": 1855 + }, + { + "epoch": 1.4103343465045592, + "grad_norm": 2.627460241317749, + "learning_rate": 2.9540442307052643e-06, + "loss": 0.3229186236858368, + "mean_token_accuracy": 0.8852157592773438, + "num_tokens": 16614113.0, + "step": 1856 + }, + { + "epoch": 1.4110942249240122, + "grad_norm": 1.9569811820983887, + "learning_rate": 2.9519845079446824e-06, + "loss": 0.5057883858680725, + "mean_token_accuracy": 0.8585711717605591, + "num_tokens": 16624611.0, + "step": 1857 + }, + { + "epoch": 1.411854103343465, + "grad_norm": 2.0604090690612793, + "learning_rate": 2.949924467963975e-06, + "loss": 0.4681510329246521, + "mean_token_accuracy": 0.8390560150146484, + "num_tokens": 16632938.0, + "step": 1858 + }, + { + "epoch": 1.412613981762918, + "grad_norm": 2.5430450439453125, + "learning_rate": 2.9478641122089563e-06, + "loss": 0.3090999126434326, + "mean_token_accuracy": 0.8943990468978882, + "num_tokens": 16637135.0, + "step": 1859 + }, + { + "epoch": 1.4133738601823709, + "grad_norm": 1.3275387287139893, + "learning_rate": 2.945803442125663e-06, + "loss": 0.3592180013656616, + "mean_token_accuracy": 0.8678265810012817, + "num_tokens": 16650322.0, + "step": 1860 + }, + { + "epoch": 1.4141337386018238, + "grad_norm": 1.9070929288864136, + "learning_rate": 2.943742459160354e-06, + "loss": 0.5332518815994263, + "mean_token_accuracy": 0.8475706577301025, + "num_tokens": 16660240.0, + "step": 1861 + }, + { + "epoch": 1.4148936170212765, + "grad_norm": 2.8724546432495117, + "learning_rate": 2.9416811647595052e-06, + "loss": 0.5052884817123413, + "mean_token_accuracy": 0.8363175392150879, + "num_tokens": 16665481.0, + "step": 1862 + }, + { + "epoch": 1.4156534954407296, + "grad_norm": 4.203817844390869, + "learning_rate": 2.939619560369813e-06, + "loss": 0.546925961971283, + "mean_token_accuracy": 0.834044337272644, + "num_tokens": 16669615.0, + "step": 1863 + }, + { + "epoch": 1.4164133738601823, + "grad_norm": 1.6466281414031982, + "learning_rate": 2.9375576474381907e-06, + "loss": 0.3474533259868622, + "mean_token_accuracy": 0.8571163415908813, + "num_tokens": 16678893.0, + "step": 1864 + }, + { + "epoch": 1.4171732522796352, + "grad_norm": 1.8885842561721802, + "learning_rate": 2.9354954274117683e-06, + "loss": 0.3726021349430084, + "mean_token_accuracy": 0.8629094958305359, + "num_tokens": 16685939.0, + "step": 1865 + }, + { + "epoch": 1.417933130699088, + "grad_norm": 2.830599784851074, + "learning_rate": 2.9334329017378898e-06, + "loss": 0.4138668477535248, + "mean_token_accuracy": 0.8670746088027954, + "num_tokens": 16690012.0, + "step": 1866 + }, + { + "epoch": 1.418693009118541, + "grad_norm": 1.6838961839675903, + "learning_rate": 2.9313700718641167e-06, + "loss": 0.33954259753227234, + "mean_token_accuracy": 0.8660278916358948, + "num_tokens": 16700061.0, + "step": 1867 + }, + { + "epoch": 1.419452887537994, + "grad_norm": 2.8767011165618896, + "learning_rate": 2.9293069392382224e-06, + "loss": 0.4650302827358246, + "mean_token_accuracy": 0.8448452949523926, + "num_tokens": 16705072.0, + "step": 1868 + }, + { + "epoch": 1.4202127659574468, + "grad_norm": 1.5901305675506592, + "learning_rate": 2.927243505308192e-06, + "loss": 0.40838998556137085, + "mean_token_accuracy": 0.8560664653778076, + "num_tokens": 16714763.0, + "step": 1869 + }, + { + "epoch": 1.4209726443768997, + "grad_norm": 1.3293657302856445, + "learning_rate": 2.925179771522223e-06, + "loss": 0.34712862968444824, + "mean_token_accuracy": 0.8633697032928467, + "num_tokens": 16729575.0, + "step": 1870 + }, + { + "epoch": 1.4217325227963526, + "grad_norm": 1.7465964555740356, + "learning_rate": 2.9231157393287234e-06, + "loss": 0.48190903663635254, + "mean_token_accuracy": 0.8255834579467773, + "num_tokens": 16742529.0, + "step": 1871 + }, + { + "epoch": 1.4224924012158056, + "grad_norm": 1.865749716758728, + "learning_rate": 2.9210514101763116e-06, + "loss": 0.4912028908729553, + "mean_token_accuracy": 0.8309572339057922, + "num_tokens": 16753989.0, + "step": 1872 + }, + { + "epoch": 1.4232522796352582, + "grad_norm": 2.55780291557312, + "learning_rate": 2.9189867855138103e-06, + "loss": 0.4550635814666748, + "mean_token_accuracy": 0.8584091067314148, + "num_tokens": 16758906.0, + "step": 1873 + }, + { + "epoch": 1.4240121580547114, + "grad_norm": 1.867530107498169, + "learning_rate": 2.9169218667902562e-06, + "loss": 0.3524911105632782, + "mean_token_accuracy": 0.8715004920959473, + "num_tokens": 16765969.0, + "step": 1874 + }, + { + "epoch": 1.424772036474164, + "grad_norm": 1.8886862993240356, + "learning_rate": 2.9148566554548857e-06, + "loss": 0.37144535779953003, + "mean_token_accuracy": 0.8640961050987244, + "num_tokens": 16773935.0, + "step": 1875 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 1.266065239906311, + "learning_rate": 2.912791152957145e-06, + "loss": 0.3341747522354126, + "mean_token_accuracy": 0.8929134607315063, + "num_tokens": 16787780.0, + "step": 1876 + }, + { + "epoch": 1.4262917933130699, + "grad_norm": 2.524888753890991, + "learning_rate": 2.9107253607466833e-06, + "loss": 0.33709171414375305, + "mean_token_accuracy": 0.8857531547546387, + "num_tokens": 16792753.0, + "step": 1877 + }, + { + "epoch": 1.4270516717325228, + "grad_norm": 1.9269018173217773, + "learning_rate": 2.908659280273354e-06, + "loss": 0.32599249482154846, + "mean_token_accuracy": 0.8777773380279541, + "num_tokens": 16799904.0, + "step": 1878 + }, + { + "epoch": 1.4278115501519757, + "grad_norm": 1.9844375848770142, + "learning_rate": 2.9065929129872097e-06, + "loss": 0.4086732268333435, + "mean_token_accuracy": 0.8505409955978394, + "num_tokens": 16807774.0, + "step": 1879 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 4.0958662033081055, + "learning_rate": 2.9045262603385073e-06, + "loss": 0.3838827610015869, + "mean_token_accuracy": 0.877601146697998, + "num_tokens": 16810908.0, + "step": 1880 + }, + { + "epoch": 1.4293313069908815, + "grad_norm": 1.7323768138885498, + "learning_rate": 2.902459323777704e-06, + "loss": 0.37459003925323486, + "mean_token_accuracy": 0.8655836582183838, + "num_tokens": 16819494.0, + "step": 1881 + }, + { + "epoch": 1.4300911854103344, + "grad_norm": 2.608043670654297, + "learning_rate": 2.900392104755455e-06, + "loss": 0.5798726677894592, + "mean_token_accuracy": 0.8382592797279358, + "num_tokens": 16827745.0, + "step": 1882 + }, + { + "epoch": 1.4308510638297873, + "grad_norm": 1.3262078762054443, + "learning_rate": 2.8983246047226137e-06, + "loss": 0.3724595904350281, + "mean_token_accuracy": 0.8651963472366333, + "num_tokens": 16844171.0, + "step": 1883 + }, + { + "epoch": 1.43161094224924, + "grad_norm": 1.7250545024871826, + "learning_rate": 2.8962568251302327e-06, + "loss": 0.3478979468345642, + "mean_token_accuracy": 0.8807886242866516, + "num_tokens": 16852838.0, + "step": 1884 + }, + { + "epoch": 1.4323708206686931, + "grad_norm": 2.114525318145752, + "learning_rate": 2.8941887674295573e-06, + "loss": 0.5156140327453613, + "mean_token_accuracy": 0.825178861618042, + "num_tokens": 16861087.0, + "step": 1885 + }, + { + "epoch": 1.4331306990881458, + "grad_norm": 2.400829792022705, + "learning_rate": 2.892120433072031e-06, + "loss": 0.2807392477989197, + "mean_token_accuracy": 0.8907361030578613, + "num_tokens": 16866557.0, + "step": 1886 + }, + { + "epoch": 1.4338905775075987, + "grad_norm": 2.490880012512207, + "learning_rate": 2.8900518235092908e-06, + "loss": 0.2615952491760254, + "mean_token_accuracy": 0.9152894020080566, + "num_tokens": 16871357.0, + "step": 1887 + }, + { + "epoch": 1.4346504559270516, + "grad_norm": 1.9058431386947632, + "learning_rate": 2.887982940193165e-06, + "loss": 0.43623363971710205, + "mean_token_accuracy": 0.84696364402771, + "num_tokens": 16879016.0, + "step": 1888 + }, + { + "epoch": 1.4354103343465046, + "grad_norm": 1.4520210027694702, + "learning_rate": 2.8859137845756785e-06, + "loss": 0.3961856961250305, + "mean_token_accuracy": 0.8518897294998169, + "num_tokens": 16892254.0, + "step": 1889 + }, + { + "epoch": 1.4361702127659575, + "grad_norm": 2.500274896621704, + "learning_rate": 2.8838443581090415e-06, + "loss": 0.41457289457321167, + "mean_token_accuracy": 0.8751448392868042, + "num_tokens": 16897156.0, + "step": 1890 + }, + { + "epoch": 1.4369300911854104, + "grad_norm": 2.9312057495117188, + "learning_rate": 2.8817746622456585e-06, + "loss": 0.45875269174575806, + "mean_token_accuracy": 0.8411039113998413, + "num_tokens": 16902291.0, + "step": 1891 + }, + { + "epoch": 1.4376899696048633, + "grad_norm": 2.367419481277466, + "learning_rate": 2.879704698438121e-06, + "loss": 0.3643629848957062, + "mean_token_accuracy": 0.8771071434020996, + "num_tokens": 16908128.0, + "step": 1892 + }, + { + "epoch": 1.4384498480243162, + "grad_norm": 1.9907705783843994, + "learning_rate": 2.8776344681392106e-06, + "loss": 0.3206835389137268, + "mean_token_accuracy": 0.879996657371521, + "num_tokens": 16914918.0, + "step": 1893 + }, + { + "epoch": 1.439209726443769, + "grad_norm": 3.536956310272217, + "learning_rate": 2.875563972801893e-06, + "loss": 0.3640141785144806, + "mean_token_accuracy": 0.8814959526062012, + "num_tokens": 16918187.0, + "step": 1894 + }, + { + "epoch": 1.4399696048632218, + "grad_norm": 1.3451156616210938, + "learning_rate": 2.8734932138793226e-06, + "loss": 0.3427346348762512, + "mean_token_accuracy": 0.8835382461547852, + "num_tokens": 16931135.0, + "step": 1895 + }, + { + "epoch": 1.4407294832826747, + "grad_norm": 2.0735955238342285, + "learning_rate": 2.871422192824837e-06, + "loss": 0.4265315532684326, + "mean_token_accuracy": 0.8452677726745605, + "num_tokens": 16937995.0, + "step": 1896 + }, + { + "epoch": 1.4414893617021276, + "grad_norm": 1.5124932527542114, + "learning_rate": 2.8693509110919597e-06, + "loss": 0.497121661901474, + "mean_token_accuracy": 0.815092921257019, + "num_tokens": 16952743.0, + "step": 1897 + }, + { + "epoch": 1.4422492401215805, + "grad_norm": 3.716669797897339, + "learning_rate": 2.867279370134395e-06, + "loss": 0.5452651381492615, + "mean_token_accuracy": 0.8150380849838257, + "num_tokens": 16956797.0, + "step": 1898 + }, + { + "epoch": 1.4430091185410334, + "grad_norm": 1.3571398258209229, + "learning_rate": 2.8652075714060296e-06, + "loss": 0.4249724745750427, + "mean_token_accuracy": 0.8675867915153503, + "num_tokens": 16974494.0, + "step": 1899 + }, + { + "epoch": 1.4437689969604863, + "grad_norm": 2.310673475265503, + "learning_rate": 2.863135516360932e-06, + "loss": 0.39368677139282227, + "mean_token_accuracy": 0.878392219543457, + "num_tokens": 16980612.0, + "step": 1900 + }, + { + "epoch": 1.4445288753799392, + "grad_norm": 1.9025533199310303, + "learning_rate": 2.8610632064533517e-06, + "loss": 0.4786127805709839, + "mean_token_accuracy": 0.8720556497573853, + "num_tokens": 16992262.0, + "step": 1901 + }, + { + "epoch": 1.4452887537993921, + "grad_norm": 2.528564453125, + "learning_rate": 2.8589906431377133e-06, + "loss": 0.4223094582557678, + "mean_token_accuracy": 0.8513246178627014, + "num_tokens": 16997717.0, + "step": 1902 + }, + { + "epoch": 1.446048632218845, + "grad_norm": 1.010425329208374, + "learning_rate": 2.8569178278686222e-06, + "loss": 0.3908255696296692, + "mean_token_accuracy": 0.8620463609695435, + "num_tokens": 17020903.0, + "step": 1903 + }, + { + "epoch": 1.4468085106382977, + "grad_norm": 1.5760232210159302, + "learning_rate": 2.8548447621008614e-06, + "loss": 0.4134044051170349, + "mean_token_accuracy": 0.8472093343734741, + "num_tokens": 17035250.0, + "step": 1904 + }, + { + "epoch": 1.4475683890577509, + "grad_norm": 2.0668535232543945, + "learning_rate": 2.8527714472893866e-06, + "loss": 0.44095730781555176, + "mean_token_accuracy": 0.881983757019043, + "num_tokens": 17042170.0, + "step": 1905 + }, + { + "epoch": 1.4483282674772036, + "grad_norm": 1.1620599031448364, + "learning_rate": 2.85069788488933e-06, + "loss": 0.3607163429260254, + "mean_token_accuracy": 0.8684282898902893, + "num_tokens": 17061937.0, + "step": 1906 + }, + { + "epoch": 1.4490881458966565, + "grad_norm": 2.1316568851470947, + "learning_rate": 2.8486240763559984e-06, + "loss": 0.3478124141693115, + "mean_token_accuracy": 0.8772403001785278, + "num_tokens": 17068628.0, + "step": 1907 + }, + { + "epoch": 1.4498480243161094, + "grad_norm": 2.4756391048431396, + "learning_rate": 2.8465500231448707e-06, + "loss": 0.46441152691841125, + "mean_token_accuracy": 0.8436450958251953, + "num_tokens": 17075495.0, + "step": 1908 + }, + { + "epoch": 1.4506079027355623, + "grad_norm": 2.249720573425293, + "learning_rate": 2.844475726711595e-06, + "loss": 0.41565513610839844, + "mean_token_accuracy": 0.8525094985961914, + "num_tokens": 17080940.0, + "step": 1909 + }, + { + "epoch": 1.4513677811550152, + "grad_norm": 2.3081841468811035, + "learning_rate": 2.8424011885119956e-06, + "loss": 0.49903199076652527, + "mean_token_accuracy": 0.8212426900863647, + "num_tokens": 17092024.0, + "step": 1910 + }, + { + "epoch": 1.452127659574468, + "grad_norm": 1.2929959297180176, + "learning_rate": 2.8403264100020613e-06, + "loss": 0.47038257122039795, + "mean_token_accuracy": 0.8319816589355469, + "num_tokens": 17108840.0, + "step": 1911 + }, + { + "epoch": 1.452887537993921, + "grad_norm": 1.6476463079452515, + "learning_rate": 2.8382513926379508e-06, + "loss": 0.42287829518318176, + "mean_token_accuracy": 0.8555682897567749, + "num_tokens": 17119704.0, + "step": 1912 + }, + { + "epoch": 1.453647416413374, + "grad_norm": 1.759998083114624, + "learning_rate": 2.836176137875993e-06, + "loss": 0.40904951095581055, + "mean_token_accuracy": 0.8698266744613647, + "num_tokens": 17130676.0, + "step": 1913 + }, + { + "epoch": 1.4544072948328268, + "grad_norm": 1.510909914970398, + "learning_rate": 2.8341006471726817e-06, + "loss": 0.47834792733192444, + "mean_token_accuracy": 0.8335825204849243, + "num_tokens": 17146304.0, + "step": 1914 + }, + { + "epoch": 1.4551671732522795, + "grad_norm": 3.538071632385254, + "learning_rate": 2.832024921984674e-06, + "loss": 0.34059035778045654, + "mean_token_accuracy": 0.8769031763076782, + "num_tokens": 17150458.0, + "step": 1915 + }, + { + "epoch": 1.4559270516717326, + "grad_norm": 2.3368659019470215, + "learning_rate": 2.8299489637687955e-06, + "loss": 0.43068382143974304, + "mean_token_accuracy": 0.845360517501831, + "num_tokens": 17157368.0, + "step": 1916 + }, + { + "epoch": 1.4566869300911853, + "grad_norm": 1.8720396757125854, + "learning_rate": 2.8278727739820334e-06, + "loss": 0.37013399600982666, + "mean_token_accuracy": 0.854241132736206, + "num_tokens": 17166325.0, + "step": 1917 + }, + { + "epoch": 1.4574468085106382, + "grad_norm": 1.6706892251968384, + "learning_rate": 2.825796354081537e-06, + "loss": 0.5397020578384399, + "mean_token_accuracy": 0.8309713006019592, + "num_tokens": 17178920.0, + "step": 1918 + }, + { + "epoch": 1.4582066869300911, + "grad_norm": 2.729210376739502, + "learning_rate": 2.8237197055246175e-06, + "loss": 0.25137859582901, + "mean_token_accuracy": 0.9148792028427124, + "num_tokens": 17183107.0, + "step": 1919 + }, + { + "epoch": 1.458966565349544, + "grad_norm": 3.023500680923462, + "learning_rate": 2.821642829768748e-06, + "loss": 0.43312495946884155, + "mean_token_accuracy": 0.8481811285018921, + "num_tokens": 17187853.0, + "step": 1920 + }, + { + "epoch": 1.459726443768997, + "grad_norm": 1.8108519315719604, + "learning_rate": 2.8195657282715595e-06, + "loss": 0.5101792216300964, + "mean_token_accuracy": 0.8315553069114685, + "num_tokens": 17199247.0, + "step": 1921 + }, + { + "epoch": 1.4604863221884499, + "grad_norm": 2.0262672901153564, + "learning_rate": 2.817488402490841e-06, + "loss": 0.4449934959411621, + "mean_token_accuracy": 0.8634527325630188, + "num_tokens": 17206348.0, + "step": 1922 + }, + { + "epoch": 1.4612462006079028, + "grad_norm": 2.6163926124572754, + "learning_rate": 2.8154108538845405e-06, + "loss": 0.43052345514297485, + "mean_token_accuracy": 0.8375401496887207, + "num_tokens": 17211702.0, + "step": 1923 + }, + { + "epoch": 1.4620060790273557, + "grad_norm": 2.0854408740997314, + "learning_rate": 2.813333083910761e-06, + "loss": 0.5011380910873413, + "mean_token_accuracy": 0.8359915018081665, + "num_tokens": 17219096.0, + "step": 1924 + }, + { + "epoch": 1.4627659574468086, + "grad_norm": 2.2081687450408936, + "learning_rate": 2.8112550940277615e-06, + "loss": 0.5239193439483643, + "mean_token_accuracy": 0.8499593734741211, + "num_tokens": 17229266.0, + "step": 1925 + }, + { + "epoch": 1.4635258358662613, + "grad_norm": 1.798343539237976, + "learning_rate": 2.809176885693956e-06, + "loss": 0.4515029191970825, + "mean_token_accuracy": 0.8400485515594482, + "num_tokens": 17239280.0, + "step": 1926 + }, + { + "epoch": 1.4642857142857144, + "grad_norm": 1.897887945175171, + "learning_rate": 2.807098460367911e-06, + "loss": 0.35935714840888977, + "mean_token_accuracy": 0.8776072263717651, + "num_tokens": 17247132.0, + "step": 1927 + }, + { + "epoch": 1.465045592705167, + "grad_norm": 2.705836296081543, + "learning_rate": 2.8050198195083445e-06, + "loss": 0.3728443682193756, + "mean_token_accuracy": 0.8649885654449463, + "num_tokens": 17251865.0, + "step": 1928 + }, + { + "epoch": 1.46580547112462, + "grad_norm": 1.841178059577942, + "learning_rate": 2.802940964574127e-06, + "loss": 0.40604841709136963, + "mean_token_accuracy": 0.8537783622741699, + "num_tokens": 17260163.0, + "step": 1929 + }, + { + "epoch": 1.466565349544073, + "grad_norm": 2.7393605709075928, + "learning_rate": 2.800861897024279e-06, + "loss": 0.39346879720687866, + "mean_token_accuracy": 0.8628787994384766, + "num_tokens": 17264876.0, + "step": 1930 + }, + { + "epoch": 1.4673252279635258, + "grad_norm": 1.84367835521698, + "learning_rate": 2.798782618317971e-06, + "loss": 0.37411895394325256, + "mean_token_accuracy": 0.8605265617370605, + "num_tokens": 17273049.0, + "step": 1931 + }, + { + "epoch": 1.4680851063829787, + "grad_norm": 1.6546733379364014, + "learning_rate": 2.796703129914519e-06, + "loss": 0.4997844099998474, + "mean_token_accuracy": 0.8267433643341064, + "num_tokens": 17285074.0, + "step": 1932 + }, + { + "epoch": 1.4688449848024316, + "grad_norm": 2.2749221324920654, + "learning_rate": 2.79462343327339e-06, + "loss": 0.35453367233276367, + "mean_token_accuracy": 0.8746850490570068, + "num_tokens": 17290273.0, + "step": 1933 + }, + { + "epoch": 1.4696048632218845, + "grad_norm": 1.7142518758773804, + "learning_rate": 2.7925435298541944e-06, + "loss": 0.345878541469574, + "mean_token_accuracy": 0.8600981831550598, + "num_tokens": 17301045.0, + "step": 1934 + }, + { + "epoch": 1.4703647416413375, + "grad_norm": 3.163342237472534, + "learning_rate": 2.7904634211166877e-06, + "loss": 0.4356975853443146, + "mean_token_accuracy": 0.8460350036621094, + "num_tokens": 17305108.0, + "step": 1935 + }, + { + "epoch": 1.4711246200607904, + "grad_norm": 1.6377612352371216, + "learning_rate": 2.7883831085207707e-06, + "loss": 0.4459729790687561, + "mean_token_accuracy": 0.8463394641876221, + "num_tokens": 17315479.0, + "step": 1936 + }, + { + "epoch": 1.471884498480243, + "grad_norm": 1.865268588066101, + "learning_rate": 2.7863025935264876e-06, + "loss": 0.394723117351532, + "mean_token_accuracy": 0.864177942276001, + "num_tokens": 17324795.0, + "step": 1937 + }, + { + "epoch": 1.4726443768996962, + "grad_norm": 1.241937518119812, + "learning_rate": 2.784221877594024e-06, + "loss": 0.2752220630645752, + "mean_token_accuracy": 0.8998259902000427, + "num_tokens": 17338000.0, + "step": 1938 + }, + { + "epoch": 1.4734042553191489, + "grad_norm": 1.8013651371002197, + "learning_rate": 2.7821409621837042e-06, + "loss": 0.4251005947589874, + "mean_token_accuracy": 0.8518919348716736, + "num_tokens": 17347351.0, + "step": 1939 + }, + { + "epoch": 1.4741641337386018, + "grad_norm": 1.2902207374572754, + "learning_rate": 2.7800598487559976e-06, + "loss": 0.3640727400779724, + "mean_token_accuracy": 0.8592870235443115, + "num_tokens": 17362335.0, + "step": 1940 + }, + { + "epoch": 1.4749240121580547, + "grad_norm": 2.5427513122558594, + "learning_rate": 2.777978538771508e-06, + "loss": 0.38166797161102295, + "mean_token_accuracy": 0.8653234839439392, + "num_tokens": 17367733.0, + "step": 1941 + }, + { + "epoch": 1.4756838905775076, + "grad_norm": 1.7793641090393066, + "learning_rate": 2.7758970336909795e-06, + "loss": 0.3113783895969391, + "mean_token_accuracy": 0.8812868595123291, + "num_tokens": 17375267.0, + "step": 1942 + }, + { + "epoch": 1.4764437689969605, + "grad_norm": 3.4031741619110107, + "learning_rate": 2.7738153349752923e-06, + "loss": 0.4800986647605896, + "mean_token_accuracy": 0.8336698412895203, + "num_tokens": 17379549.0, + "step": 1943 + }, + { + "epoch": 1.4772036474164134, + "grad_norm": 1.3451651334762573, + "learning_rate": 2.7717334440854634e-06, + "loss": 0.3115345239639282, + "mean_token_accuracy": 0.908623218536377, + "num_tokens": 17394455.0, + "step": 1944 + }, + { + "epoch": 1.4779635258358663, + "grad_norm": 1.980919599533081, + "learning_rate": 2.7696513624826422e-06, + "loss": 0.391154944896698, + "mean_token_accuracy": 0.8650267720222473, + "num_tokens": 17401931.0, + "step": 1945 + }, + { + "epoch": 1.4787234042553192, + "grad_norm": 1.0118765830993652, + "learning_rate": 2.7675690916281158e-06, + "loss": 0.3157956600189209, + "mean_token_accuracy": 0.8827471733093262, + "num_tokens": 17424144.0, + "step": 1946 + }, + { + "epoch": 1.4794832826747721, + "grad_norm": 1.579654335975647, + "learning_rate": 2.7654866329833e-06, + "loss": 0.4578486382961273, + "mean_token_accuracy": 0.8361750245094299, + "num_tokens": 17435769.0, + "step": 1947 + }, + { + "epoch": 1.4802431610942248, + "grad_norm": 1.7706717252731323, + "learning_rate": 2.763403988009746e-06, + "loss": 0.3564416170120239, + "mean_token_accuracy": 0.8689201474189758, + "num_tokens": 17444088.0, + "step": 1948 + }, + { + "epoch": 1.4810030395136777, + "grad_norm": 1.2264244556427002, + "learning_rate": 2.761321158169134e-06, + "loss": 0.30763837695121765, + "mean_token_accuracy": 0.8960219621658325, + "num_tokens": 17458096.0, + "step": 1949 + }, + { + "epoch": 1.4817629179331306, + "grad_norm": 1.214431881904602, + "learning_rate": 2.759238144923274e-06, + "loss": 0.49099457263946533, + "mean_token_accuracy": 0.8279136419296265, + "num_tokens": 17481062.0, + "step": 1950 + }, + { + "epoch": 1.4825227963525835, + "grad_norm": 1.593892216682434, + "learning_rate": 2.7571549497341044e-06, + "loss": 0.3745320737361908, + "mean_token_accuracy": 0.8690779209136963, + "num_tokens": 17490874.0, + "step": 1951 + }, + { + "epoch": 1.4832826747720365, + "grad_norm": 2.409924268722534, + "learning_rate": 2.755071574063692e-06, + "loss": 0.4310247600078583, + "mean_token_accuracy": 0.8521159291267395, + "num_tokens": 17496942.0, + "step": 1952 + }, + { + "epoch": 1.4840425531914894, + "grad_norm": 1.2557463645935059, + "learning_rate": 2.7529880193742297e-06, + "loss": 0.34304720163345337, + "mean_token_accuracy": 0.8748183250427246, + "num_tokens": 17514391.0, + "step": 1953 + }, + { + "epoch": 1.4848024316109423, + "grad_norm": 1.17310631275177, + "learning_rate": 2.7509042871280373e-06, + "loss": 0.3835817277431488, + "mean_token_accuracy": 0.8853274583816528, + "num_tokens": 17533289.0, + "step": 1954 + }, + { + "epoch": 1.4855623100303952, + "grad_norm": 1.5261479616165161, + "learning_rate": 2.748820378787558e-06, + "loss": 0.4799988865852356, + "mean_token_accuracy": 0.8252149820327759, + "num_tokens": 17544118.0, + "step": 1955 + }, + { + "epoch": 1.486322188449848, + "grad_norm": 2.030930757522583, + "learning_rate": 2.7467362958153585e-06, + "loss": 0.35690805315971375, + "mean_token_accuracy": 0.8959587216377258, + "num_tokens": 17550431.0, + "step": 1956 + }, + { + "epoch": 1.4870820668693008, + "grad_norm": 2.376520872116089, + "learning_rate": 2.7446520396741293e-06, + "loss": 0.262234091758728, + "mean_token_accuracy": 0.9054547548294067, + "num_tokens": 17554853.0, + "step": 1957 + }, + { + "epoch": 1.487841945288754, + "grad_norm": 1.6944479942321777, + "learning_rate": 2.742567611826681e-06, + "loss": 0.529259979724884, + "mean_token_accuracy": 0.8195339441299438, + "num_tokens": 17568016.0, + "step": 1958 + }, + { + "epoch": 1.4886018237082066, + "grad_norm": 2.833029270172119, + "learning_rate": 2.7404830137359445e-06, + "loss": 0.30229634046554565, + "mean_token_accuracy": 0.8933001756668091, + "num_tokens": 17572587.0, + "step": 1959 + }, + { + "epoch": 1.4893617021276595, + "grad_norm": 1.7040144205093384, + "learning_rate": 2.7383982468649715e-06, + "loss": 0.3166356682777405, + "mean_token_accuracy": 0.8871906399726868, + "num_tokens": 17580966.0, + "step": 1960 + }, + { + "epoch": 1.4901215805471124, + "grad_norm": 1.7539052963256836, + "learning_rate": 2.7363133126769326e-06, + "loss": 0.4231064021587372, + "mean_token_accuracy": 0.8708304166793823, + "num_tokens": 17590907.0, + "step": 1961 + }, + { + "epoch": 1.4908814589665653, + "grad_norm": 1.6198650598526, + "learning_rate": 2.7342282126351145e-06, + "loss": 0.4198967218399048, + "mean_token_accuracy": 0.8723280429840088, + "num_tokens": 17604291.0, + "step": 1962 + }, + { + "epoch": 1.4916413373860182, + "grad_norm": 1.8437711000442505, + "learning_rate": 2.73214294820292e-06, + "loss": 0.38923323154449463, + "mean_token_accuracy": 0.8697006106376648, + "num_tokens": 17612291.0, + "step": 1963 + }, + { + "epoch": 1.4924012158054711, + "grad_norm": 1.1129369735717773, + "learning_rate": 2.7300575208438684e-06, + "loss": 0.3107512593269348, + "mean_token_accuracy": 0.878618597984314, + "num_tokens": 17630073.0, + "step": 1964 + }, + { + "epoch": 1.493161094224924, + "grad_norm": 3.0210442543029785, + "learning_rate": 2.7279719320215924e-06, + "loss": 0.4630751609802246, + "mean_token_accuracy": 0.8567075729370117, + "num_tokens": 17634758.0, + "step": 1965 + }, + { + "epoch": 1.493920972644377, + "grad_norm": 2.8825972080230713, + "learning_rate": 2.725886183199839e-06, + "loss": 0.35351765155792236, + "mean_token_accuracy": 0.8711981773376465, + "num_tokens": 17639613.0, + "step": 1966 + }, + { + "epoch": 1.4946808510638299, + "grad_norm": 2.111238718032837, + "learning_rate": 2.723800275842468e-06, + "loss": 0.3529569208621979, + "mean_token_accuracy": 0.8679244518280029, + "num_tokens": 17645308.0, + "step": 1967 + }, + { + "epoch": 1.4954407294832825, + "grad_norm": 2.080509901046753, + "learning_rate": 2.7217142114134466e-06, + "loss": 0.43321219086647034, + "mean_token_accuracy": 0.8848220109939575, + "num_tokens": 17652292.0, + "step": 1968 + }, + { + "epoch": 1.4962006079027357, + "grad_norm": 2.8686363697052, + "learning_rate": 2.7196279913768587e-06, + "loss": 0.417035311460495, + "mean_token_accuracy": 0.8724601864814758, + "num_tokens": 17656908.0, + "step": 1969 + }, + { + "epoch": 1.4969604863221884, + "grad_norm": 3.294193744659424, + "learning_rate": 2.717541617196891e-06, + "loss": 0.3551934063434601, + "mean_token_accuracy": 0.8838565349578857, + "num_tokens": 17660590.0, + "step": 1970 + }, + { + "epoch": 1.4977203647416413, + "grad_norm": 1.766292929649353, + "learning_rate": 2.7154550903378425e-06, + "loss": 0.36521971225738525, + "mean_token_accuracy": 0.8810199499130249, + "num_tokens": 17668214.0, + "step": 1971 + }, + { + "epoch": 1.4984802431610942, + "grad_norm": 1.2127676010131836, + "learning_rate": 2.713368412264118e-06, + "loss": 0.35184425115585327, + "mean_token_accuracy": 0.8672580718994141, + "num_tokens": 17684736.0, + "step": 1972 + }, + { + "epoch": 1.499240121580547, + "grad_norm": 2.268256664276123, + "learning_rate": 2.711281584440228e-06, + "loss": 0.40115267038345337, + "mean_token_accuracy": 0.8517841100692749, + "num_tokens": 17691510.0, + "step": 1973 + }, + { + "epoch": 1.5, + "grad_norm": 2.7196054458618164, + "learning_rate": 2.70919460833079e-06, + "loss": 0.3819037675857544, + "mean_token_accuracy": 0.8765411376953125, + "num_tokens": 17696179.0, + "step": 1974 + }, + { + "epoch": 1.500759878419453, + "grad_norm": 2.969406843185425, + "learning_rate": 2.7071074854005206e-06, + "loss": 0.3922455608844757, + "mean_token_accuracy": 0.8796037435531616, + "num_tokens": 17700597.0, + "step": 1975 + }, + { + "epoch": 1.5015197568389058, + "grad_norm": 2.2965853214263916, + "learning_rate": 2.705020217114248e-06, + "loss": 0.5433666110038757, + "mean_token_accuracy": 0.809639036655426, + "num_tokens": 17708895.0, + "step": 1976 + }, + { + "epoch": 1.5022796352583585, + "grad_norm": 1.5584394931793213, + "learning_rate": 2.7029328049368942e-06, + "loss": 0.4736343324184418, + "mean_token_accuracy": 0.8197190761566162, + "num_tokens": 17725202.0, + "step": 1977 + }, + { + "epoch": 1.5030395136778116, + "grad_norm": 1.3903142213821411, + "learning_rate": 2.700845250333486e-06, + "loss": 0.4471571445465088, + "mean_token_accuracy": 0.839043140411377, + "num_tokens": 17742835.0, + "step": 1978 + }, + { + "epoch": 1.5037993920972643, + "grad_norm": 3.080716609954834, + "learning_rate": 2.69875755476915e-06, + "loss": 0.45760005712509155, + "mean_token_accuracy": 0.8366328477859497, + "num_tokens": 17747324.0, + "step": 1979 + }, + { + "epoch": 1.5045592705167175, + "grad_norm": 1.0150405168533325, + "learning_rate": 2.696669719709111e-06, + "loss": 0.33638954162597656, + "mean_token_accuracy": 0.8591676354408264, + "num_tokens": 17765565.0, + "step": 1980 + }, + { + "epoch": 1.5053191489361701, + "grad_norm": 2.402927875518799, + "learning_rate": 2.694581746618691e-06, + "loss": 0.4086601436138153, + "mean_token_accuracy": 0.8769911527633667, + "num_tokens": 17771275.0, + "step": 1981 + }, + { + "epoch": 1.506079027355623, + "grad_norm": 2.030583381652832, + "learning_rate": 2.6924936369633126e-06, + "loss": 0.5115457773208618, + "mean_token_accuracy": 0.8054746389389038, + "num_tokens": 17779999.0, + "step": 1982 + }, + { + "epoch": 1.506838905775076, + "grad_norm": 2.575199604034424, + "learning_rate": 2.6904053922084893e-06, + "loss": 0.363183856010437, + "mean_token_accuracy": 0.8716042637825012, + "num_tokens": 17785473.0, + "step": 1983 + }, + { + "epoch": 1.5075987841945289, + "grad_norm": 1.8497480154037476, + "learning_rate": 2.688317013819832e-06, + "loss": 0.4254384934902191, + "mean_token_accuracy": 0.8549597263336182, + "num_tokens": 17793812.0, + "step": 1984 + }, + { + "epoch": 1.5083586626139818, + "grad_norm": 1.7786511182785034, + "learning_rate": 2.686228503263045e-06, + "loss": 0.33400774002075195, + "mean_token_accuracy": 0.9027615189552307, + "num_tokens": 17801783.0, + "step": 1985 + }, + { + "epoch": 1.5091185410334347, + "grad_norm": 1.8365367650985718, + "learning_rate": 2.684139862003927e-06, + "loss": 0.35765063762664795, + "mean_token_accuracy": 0.8663736581802368, + "num_tokens": 17809562.0, + "step": 1986 + }, + { + "epoch": 1.5098784194528876, + "grad_norm": 1.8817477226257324, + "learning_rate": 2.682051091508365e-06, + "loss": 0.4627506732940674, + "mean_token_accuracy": 0.8358862400054932, + "num_tokens": 17819094.0, + "step": 1987 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 2.221547842025757, + "learning_rate": 2.679962193242338e-06, + "loss": 0.577020525932312, + "mean_token_accuracy": 0.80013108253479, + "num_tokens": 17826666.0, + "step": 1988 + }, + { + "epoch": 1.5113981762917934, + "grad_norm": 2.6618270874023438, + "learning_rate": 2.6778731686719177e-06, + "loss": 0.44632256031036377, + "mean_token_accuracy": 0.8611289262771606, + "num_tokens": 17833172.0, + "step": 1989 + }, + { + "epoch": 1.512158054711246, + "grad_norm": 2.9495689868927, + "learning_rate": 2.67578401926326e-06, + "loss": 0.3482511043548584, + "mean_token_accuracy": 0.8703314661979675, + "num_tokens": 17837220.0, + "step": 1990 + }, + { + "epoch": 1.5129179331306992, + "grad_norm": 2.0943644046783447, + "learning_rate": 2.6736947464826107e-06, + "loss": 0.2354314625263214, + "mean_token_accuracy": 0.9137634038925171, + "num_tokens": 17842712.0, + "step": 1991 + }, + { + "epoch": 1.513677811550152, + "grad_norm": 1.1303033828735352, + "learning_rate": 2.671605351796302e-06, + "loss": 0.3624761700630188, + "mean_token_accuracy": 0.8769594430923462, + "num_tokens": 17860902.0, + "step": 1992 + }, + { + "epoch": 1.5144376899696048, + "grad_norm": 2.8921146392822266, + "learning_rate": 2.6695158366707526e-06, + "loss": 0.2517220973968506, + "mean_token_accuracy": 0.8974182605743408, + "num_tokens": 17865160.0, + "step": 1993 + }, + { + "epoch": 1.5151975683890577, + "grad_norm": 2.320587158203125, + "learning_rate": 2.667426202572463e-06, + "loss": 0.4589889943599701, + "mean_token_accuracy": 0.8379613161087036, + "num_tokens": 17871994.0, + "step": 1994 + }, + { + "epoch": 1.5159574468085106, + "grad_norm": 1.1407674551010132, + "learning_rate": 2.665336450968019e-06, + "loss": 0.34412115812301636, + "mean_token_accuracy": 0.8776306509971619, + "num_tokens": 17889941.0, + "step": 1995 + }, + { + "epoch": 1.5167173252279635, + "grad_norm": 2.069814920425415, + "learning_rate": 2.6632465833240895e-06, + "loss": 0.47524404525756836, + "mean_token_accuracy": 0.830310046672821, + "num_tokens": 17898447.0, + "step": 1996 + }, + { + "epoch": 1.5174772036474165, + "grad_norm": 1.822415828704834, + "learning_rate": 2.661156601107424e-06, + "loss": 0.4541318416595459, + "mean_token_accuracy": 0.8856616020202637, + "num_tokens": 17908729.0, + "step": 1997 + }, + { + "epoch": 1.5182370820668694, + "grad_norm": 2.851428985595703, + "learning_rate": 2.659066505784852e-06, + "loss": 0.41761666536331177, + "mean_token_accuracy": 0.8710572719573975, + "num_tokens": 17913860.0, + "step": 1998 + }, + { + "epoch": 1.518996960486322, + "grad_norm": 1.8483710289001465, + "learning_rate": 2.6569762988232838e-06, + "loss": 0.45517268776893616, + "mean_token_accuracy": 0.8411115407943726, + "num_tokens": 17923497.0, + "step": 1999 + }, + { + "epoch": 1.5197568389057752, + "grad_norm": 1.9044219255447388, + "learning_rate": 2.654885981689706e-06, + "loss": 0.42533189058303833, + "mean_token_accuracy": 0.8597894906997681, + "num_tokens": 17932670.0, + "step": 2000 + }, + { + "epoch": 1.5205167173252279, + "grad_norm": 1.8170348405838013, + "learning_rate": 2.652795555851184e-06, + "loss": 0.4009692072868347, + "mean_token_accuracy": 0.8553036451339722, + "num_tokens": 17941616.0, + "step": 2001 + }, + { + "epoch": 1.521276595744681, + "grad_norm": 1.4704090356826782, + "learning_rate": 2.6507050227748595e-06, + "loss": 0.3732764720916748, + "mean_token_accuracy": 0.8788566589355469, + "num_tokens": 17957187.0, + "step": 2002 + }, + { + "epoch": 1.5220364741641337, + "grad_norm": 1.6681534051895142, + "learning_rate": 2.648614383927949e-06, + "loss": 0.341326504945755, + "mean_token_accuracy": 0.874875545501709, + "num_tokens": 17966668.0, + "step": 2003 + }, + { + "epoch": 1.5227963525835866, + "grad_norm": 1.8578619956970215, + "learning_rate": 2.646523640777741e-06, + "loss": 0.3937399983406067, + "mean_token_accuracy": 0.8656851053237915, + "num_tokens": 17976194.0, + "step": 2004 + }, + { + "epoch": 1.5235562310030395, + "grad_norm": 1.7520431280136108, + "learning_rate": 2.6444327947916037e-06, + "loss": 0.3392767906188965, + "mean_token_accuracy": 0.8799679279327393, + "num_tokens": 17984492.0, + "step": 2005 + }, + { + "epoch": 1.5243161094224924, + "grad_norm": 3.4649906158447266, + "learning_rate": 2.6423418474369707e-06, + "loss": 0.3451516032218933, + "mean_token_accuracy": 0.8753262758255005, + "num_tokens": 17988240.0, + "step": 2006 + }, + { + "epoch": 1.5250759878419453, + "grad_norm": 1.8037052154541016, + "learning_rate": 2.64025080018135e-06, + "loss": 0.34428173303604126, + "mean_token_accuracy": 0.8719067573547363, + "num_tokens": 17996644.0, + "step": 2007 + }, + { + "epoch": 1.5258358662613982, + "grad_norm": 1.743722677230835, + "learning_rate": 2.6381596544923184e-06, + "loss": 0.4446655213832855, + "mean_token_accuracy": 0.8612518906593323, + "num_tokens": 18005109.0, + "step": 2008 + }, + { + "epoch": 1.5265957446808511, + "grad_norm": 1.3357981443405151, + "learning_rate": 2.636068411837523e-06, + "loss": 0.38647788763046265, + "mean_token_accuracy": 0.858294665813446, + "num_tokens": 18018193.0, + "step": 2009 + }, + { + "epoch": 1.5273556231003038, + "grad_norm": 1.4848440885543823, + "learning_rate": 2.6339770736846794e-06, + "loss": 0.3597261607646942, + "mean_token_accuracy": 0.8760983943939209, + "num_tokens": 18028959.0, + "step": 2010 + }, + { + "epoch": 1.528115501519757, + "grad_norm": 2.356933832168579, + "learning_rate": 2.6318856415015664e-06, + "loss": 0.2697138488292694, + "mean_token_accuracy": 0.9078473448753357, + "num_tokens": 18033946.0, + "step": 2011 + }, + { + "epoch": 1.5288753799392096, + "grad_norm": 1.964368224143982, + "learning_rate": 2.629794116756035e-06, + "loss": 0.41349685192108154, + "mean_token_accuracy": 0.8567900657653809, + "num_tokens": 18042724.0, + "step": 2012 + }, + { + "epoch": 1.5296352583586628, + "grad_norm": 1.5630402565002441, + "learning_rate": 2.627702500915995e-06, + "loss": 0.49310681223869324, + "mean_token_accuracy": 0.8229681253433228, + "num_tokens": 18054396.0, + "step": 2013 + }, + { + "epoch": 1.5303951367781155, + "grad_norm": 1.6657718420028687, + "learning_rate": 2.625610795449424e-06, + "loss": 0.4263935387134552, + "mean_token_accuracy": 0.8634918332099915, + "num_tokens": 18064347.0, + "step": 2014 + }, + { + "epoch": 1.5311550151975684, + "grad_norm": 1.3684180974960327, + "learning_rate": 2.6235190018243623e-06, + "loss": 0.2903984487056732, + "mean_token_accuracy": 0.8930408358573914, + "num_tokens": 18076826.0, + "step": 2015 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 1.635044813156128, + "learning_rate": 2.6214271215089106e-06, + "loss": 0.3066539168357849, + "mean_token_accuracy": 0.8912158012390137, + "num_tokens": 18085761.0, + "step": 2016 + }, + { + "epoch": 1.5326747720364742, + "grad_norm": 2.431518316268921, + "learning_rate": 2.6193351559712294e-06, + "loss": 0.31123271584510803, + "mean_token_accuracy": 0.8865828514099121, + "num_tokens": 18091715.0, + "step": 2017 + }, + { + "epoch": 1.533434650455927, + "grad_norm": 1.8317419290542603, + "learning_rate": 2.6172431066795428e-06, + "loss": 0.5042020082473755, + "mean_token_accuracy": 0.8245081901550293, + "num_tokens": 18102095.0, + "step": 2018 + }, + { + "epoch": 1.53419452887538, + "grad_norm": 3.4221980571746826, + "learning_rate": 2.6151509751021307e-06, + "loss": 0.2885819971561432, + "mean_token_accuracy": 0.8997149467468262, + "num_tokens": 18105456.0, + "step": 2019 + }, + { + "epoch": 1.534954407294833, + "grad_norm": 1.4435855150222778, + "learning_rate": 2.6130587627073315e-06, + "loss": 0.45573529601097107, + "mean_token_accuracy": 0.837191104888916, + "num_tokens": 18119039.0, + "step": 2020 + }, + { + "epoch": 1.5357142857142856, + "grad_norm": 1.5748237371444702, + "learning_rate": 2.6109664709635413e-06, + "loss": 0.4561889171600342, + "mean_token_accuracy": 0.8334558010101318, + "num_tokens": 18132150.0, + "step": 2021 + }, + { + "epoch": 1.5364741641337387, + "grad_norm": 2.8278751373291016, + "learning_rate": 2.60887410133921e-06, + "loss": 0.3495104908943176, + "mean_token_accuracy": 0.8926796913146973, + "num_tokens": 18136528.0, + "step": 2022 + }, + { + "epoch": 1.5372340425531914, + "grad_norm": 2.5045573711395264, + "learning_rate": 2.606781655302843e-06, + "loss": 0.45362481474876404, + "mean_token_accuracy": 0.8379551768302917, + "num_tokens": 18142581.0, + "step": 2023 + }, + { + "epoch": 1.5379939209726445, + "grad_norm": 2.5984106063842773, + "learning_rate": 2.604689134322999e-06, + "loss": 0.4210243821144104, + "mean_token_accuracy": 0.8571645021438599, + "num_tokens": 18148152.0, + "step": 2024 + }, + { + "epoch": 1.5387537993920972, + "grad_norm": 1.7180702686309814, + "learning_rate": 2.602596539868292e-06, + "loss": 0.2478562295436859, + "mean_token_accuracy": 0.9227135181427002, + "num_tokens": 18155435.0, + "step": 2025 + }, + { + "epoch": 1.5395136778115501, + "grad_norm": 2.3721933364868164, + "learning_rate": 2.6005038734073833e-06, + "loss": 0.3820664584636688, + "mean_token_accuracy": 0.8788443803787231, + "num_tokens": 18161403.0, + "step": 2026 + }, + { + "epoch": 1.540273556231003, + "grad_norm": 1.4967509508132935, + "learning_rate": 2.5984111364089875e-06, + "loss": 0.34247124195098877, + "mean_token_accuracy": 0.8809049129486084, + "num_tokens": 18173724.0, + "step": 2027 + }, + { + "epoch": 1.541033434650456, + "grad_norm": 2.5226845741271973, + "learning_rate": 2.5963183303418682e-06, + "loss": 0.2647642493247986, + "mean_token_accuracy": 0.8988642692565918, + "num_tokens": 18178927.0, + "step": 2028 + }, + { + "epoch": 1.5417933130699089, + "grad_norm": 2.217228412628174, + "learning_rate": 2.594225456674837e-06, + "loss": 0.37754058837890625, + "mean_token_accuracy": 0.8660204410552979, + "num_tokens": 18185268.0, + "step": 2029 + }, + { + "epoch": 1.5425531914893615, + "grad_norm": 2.336409091949463, + "learning_rate": 2.592132516876753e-06, + "loss": 0.45098528265953064, + "mean_token_accuracy": 0.842115044593811, + "num_tokens": 18192372.0, + "step": 2030 + }, + { + "epoch": 1.5433130699088147, + "grad_norm": 3.5437142848968506, + "learning_rate": 2.5900395124165216e-06, + "loss": 0.5326460003852844, + "mean_token_accuracy": 0.8125103712081909, + "num_tokens": 18199182.0, + "step": 2031 + }, + { + "epoch": 1.5440729483282674, + "grad_norm": 1.5785651206970215, + "learning_rate": 2.5879464447630947e-06, + "loss": 0.3714991509914398, + "mean_token_accuracy": 0.8711390495300293, + "num_tokens": 18209045.0, + "step": 2032 + }, + { + "epoch": 1.5448328267477205, + "grad_norm": 2.3616182804107666, + "learning_rate": 2.5858533153854676e-06, + "loss": 0.4548399746417999, + "mean_token_accuracy": 0.8411449193954468, + "num_tokens": 18215487.0, + "step": 2033 + }, + { + "epoch": 1.5455927051671732, + "grad_norm": 2.0750479698181152, + "learning_rate": 2.583760125752679e-06, + "loss": 0.3980535566806793, + "mean_token_accuracy": 0.8603327870368958, + "num_tokens": 18222606.0, + "step": 2034 + }, + { + "epoch": 1.5463525835866263, + "grad_norm": 2.609295129776001, + "learning_rate": 2.58166687733381e-06, + "loss": 0.40177756547927856, + "mean_token_accuracy": 0.8652099370956421, + "num_tokens": 18227341.0, + "step": 2035 + }, + { + "epoch": 1.547112462006079, + "grad_norm": 2.1621339321136475, + "learning_rate": 2.5795735715979826e-06, + "loss": 0.45104342699050903, + "mean_token_accuracy": 0.8481369018554688, + "num_tokens": 18235820.0, + "step": 2036 + }, + { + "epoch": 1.547872340425532, + "grad_norm": 1.0381370782852173, + "learning_rate": 2.577480210014359e-06, + "loss": 0.32621103525161743, + "mean_token_accuracy": 0.8867391347885132, + "num_tokens": 18258307.0, + "step": 2037 + }, + { + "epoch": 1.5486322188449848, + "grad_norm": 1.7634375095367432, + "learning_rate": 2.575386794052142e-06, + "loss": 0.5115169882774353, + "mean_token_accuracy": 0.818779468536377, + "num_tokens": 18272782.0, + "step": 2038 + }, + { + "epoch": 1.5493920972644377, + "grad_norm": 1.874875545501709, + "learning_rate": 2.5732933251805716e-06, + "loss": 0.4381459951400757, + "mean_token_accuracy": 0.8594684600830078, + "num_tokens": 18282618.0, + "step": 2039 + }, + { + "epoch": 1.5501519756838906, + "grad_norm": 2.1316351890563965, + "learning_rate": 2.571199804868923e-06, + "loss": 0.5410124063491821, + "mean_token_accuracy": 0.8247587084770203, + "num_tokens": 18289750.0, + "step": 2040 + }, + { + "epoch": 1.5509118541033433, + "grad_norm": 1.7574573755264282, + "learning_rate": 2.569106234586511e-06, + "loss": 0.29967373609542847, + "mean_token_accuracy": 0.8913218975067139, + "num_tokens": 18298110.0, + "step": 2041 + }, + { + "epoch": 1.5516717325227964, + "grad_norm": 1.929626703262329, + "learning_rate": 2.5670126158026843e-06, + "loss": 0.3287760019302368, + "mean_token_accuracy": 0.8870488405227661, + "num_tokens": 18305702.0, + "step": 2042 + }, + { + "epoch": 1.5524316109422491, + "grad_norm": 3.020153284072876, + "learning_rate": 2.5649189499868233e-06, + "loss": 0.38523542881011963, + "mean_token_accuracy": 0.854824960231781, + "num_tokens": 18309830.0, + "step": 2043 + }, + { + "epoch": 1.5531914893617023, + "grad_norm": 1.6378421783447266, + "learning_rate": 2.5628252386083443e-06, + "loss": 0.47371378540992737, + "mean_token_accuracy": 0.8627713918685913, + "num_tokens": 18322820.0, + "step": 2044 + }, + { + "epoch": 1.553951367781155, + "grad_norm": 1.3711130619049072, + "learning_rate": 2.560731483136694e-06, + "loss": 0.3319293260574341, + "mean_token_accuracy": 0.8704103231430054, + "num_tokens": 18335074.0, + "step": 2045 + }, + { + "epoch": 1.5547112462006079, + "grad_norm": 1.7589185237884521, + "learning_rate": 2.558637685041352e-06, + "loss": 0.4446021020412445, + "mean_token_accuracy": 0.8446722626686096, + "num_tokens": 18344115.0, + "step": 2046 + }, + { + "epoch": 1.5554711246200608, + "grad_norm": 2.5249195098876953, + "learning_rate": 2.5565438457918247e-06, + "loss": 0.4625541865825653, + "mean_token_accuracy": 0.8451195359230042, + "num_tokens": 18349235.0, + "step": 2047 + }, + { + "epoch": 1.5562310030395137, + "grad_norm": 1.0562543869018555, + "learning_rate": 2.5544499668576508e-06, + "loss": 0.33747735619544983, + "mean_token_accuracy": 0.8503615856170654, + "num_tokens": 18368253.0, + "step": 2048 + }, + { + "epoch": 1.5569908814589666, + "grad_norm": 2.9451215267181396, + "learning_rate": 2.5523560497083927e-06, + "loss": 0.3958815932273865, + "mean_token_accuracy": 0.8393744826316833, + "num_tokens": 18372887.0, + "step": 2049 + }, + { + "epoch": 1.5577507598784195, + "grad_norm": 1.3597660064697266, + "learning_rate": 2.5502620958136444e-06, + "loss": 0.46281275153160095, + "mean_token_accuracy": 0.8269470930099487, + "num_tokens": 18388074.0, + "step": 2050 + }, + { + "epoch": 1.5585106382978724, + "grad_norm": 3.269068717956543, + "learning_rate": 2.548168106643022e-06, + "loss": 0.2309008538722992, + "mean_token_accuracy": 0.9178205728530884, + "num_tokens": 18391406.0, + "step": 2051 + }, + { + "epoch": 1.559270516717325, + "grad_norm": 2.1459391117095947, + "learning_rate": 2.546074083666169e-06, + "loss": 0.4006733298301697, + "mean_token_accuracy": 0.8631902933120728, + "num_tokens": 18397497.0, + "step": 2052 + }, + { + "epoch": 1.5600303951367782, + "grad_norm": 1.4614566564559937, + "learning_rate": 2.5439800283527495e-06, + "loss": 0.40810418128967285, + "mean_token_accuracy": 0.8473483920097351, + "num_tokens": 18409474.0, + "step": 2053 + }, + { + "epoch": 1.560790273556231, + "grad_norm": 2.084808826446533, + "learning_rate": 2.541885942172454e-06, + "loss": 0.34967708587646484, + "mean_token_accuracy": 0.8707003593444824, + "num_tokens": 18416400.0, + "step": 2054 + }, + { + "epoch": 1.561550151975684, + "grad_norm": 1.90664541721344, + "learning_rate": 2.539791826594991e-06, + "loss": 0.37694251537323, + "mean_token_accuracy": 0.8704941272735596, + "num_tokens": 18424206.0, + "step": 2055 + }, + { + "epoch": 1.5623100303951367, + "grad_norm": 1.880176305770874, + "learning_rate": 2.537697683090093e-06, + "loss": 0.32510411739349365, + "mean_token_accuracy": 0.8848961591720581, + "num_tokens": 18431676.0, + "step": 2056 + }, + { + "epoch": 1.5630699088145896, + "grad_norm": 2.133375406265259, + "learning_rate": 2.5356035131275096e-06, + "loss": 0.30538493394851685, + "mean_token_accuracy": 0.8890067338943481, + "num_tokens": 18438014.0, + "step": 2057 + }, + { + "epoch": 1.5638297872340425, + "grad_norm": 2.3495655059814453, + "learning_rate": 2.5335093181770105e-06, + "loss": 0.3126775324344635, + "mean_token_accuracy": 0.8865689039230347, + "num_tokens": 18443604.0, + "step": 2058 + }, + { + "epoch": 1.5645896656534954, + "grad_norm": 2.37949538230896, + "learning_rate": 2.531415099708382e-06, + "loss": 0.3257793188095093, + "mean_token_accuracy": 0.8809669017791748, + "num_tokens": 18448654.0, + "step": 2059 + }, + { + "epoch": 1.5653495440729484, + "grad_norm": 1.8285472393035889, + "learning_rate": 2.5293208591914265e-06, + "loss": 0.32376936078071594, + "mean_token_accuracy": 0.8816431760787964, + "num_tokens": 18456619.0, + "step": 2060 + }, + { + "epoch": 1.5661094224924013, + "grad_norm": 2.3238534927368164, + "learning_rate": 2.5272265980959644e-06, + "loss": 0.40366506576538086, + "mean_token_accuracy": 0.8496750593185425, + "num_tokens": 18462788.0, + "step": 2061 + }, + { + "epoch": 1.5668693009118542, + "grad_norm": 1.8954942226409912, + "learning_rate": 2.525132317891827e-06, + "loss": 0.3405473828315735, + "mean_token_accuracy": 0.8849360942840576, + "num_tokens": 18470719.0, + "step": 2062 + }, + { + "epoch": 1.5676291793313069, + "grad_norm": 1.6268190145492554, + "learning_rate": 2.523038020048861e-06, + "loss": 0.3662685751914978, + "mean_token_accuracy": 0.8865662813186646, + "num_tokens": 18482095.0, + "step": 2063 + }, + { + "epoch": 1.56838905775076, + "grad_norm": 2.5198733806610107, + "learning_rate": 2.5209437060369266e-06, + "loss": 0.3968311548233032, + "mean_token_accuracy": 0.8643308281898499, + "num_tokens": 18488069.0, + "step": 2064 + }, + { + "epoch": 1.5691489361702127, + "grad_norm": 2.9197335243225098, + "learning_rate": 2.518849377325893e-06, + "loss": 0.24738386273384094, + "mean_token_accuracy": 0.91959547996521, + "num_tokens": 18491762.0, + "step": 2065 + }, + { + "epoch": 1.5699088145896658, + "grad_norm": 1.5914254188537598, + "learning_rate": 2.51675503538564e-06, + "loss": 0.33473581075668335, + "mean_token_accuracy": 0.8794662952423096, + "num_tokens": 18501316.0, + "step": 2066 + }, + { + "epoch": 1.5706686930091185, + "grad_norm": 2.5130460262298584, + "learning_rate": 2.5146606816860597e-06, + "loss": 0.4067240357398987, + "mean_token_accuracy": 0.8564209342002869, + "num_tokens": 18507169.0, + "step": 2067 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 2.093353509902954, + "learning_rate": 2.5125663176970475e-06, + "loss": 0.4312136769294739, + "mean_token_accuracy": 0.8540225028991699, + "num_tokens": 18514536.0, + "step": 2068 + }, + { + "epoch": 1.5721884498480243, + "grad_norm": 1.284495234489441, + "learning_rate": 2.5104719448885103e-06, + "loss": 0.3813856542110443, + "mean_token_accuracy": 0.8435653448104858, + "num_tokens": 18529947.0, + "step": 2069 + }, + { + "epoch": 1.5729483282674772, + "grad_norm": 2.0383973121643066, + "learning_rate": 2.5083775647303583e-06, + "loss": 0.4428079426288605, + "mean_token_accuracy": 0.8841741681098938, + "num_tokens": 18537109.0, + "step": 2070 + }, + { + "epoch": 1.5737082066869301, + "grad_norm": 1.7991697788238525, + "learning_rate": 2.5062831786925102e-06, + "loss": 0.460052490234375, + "mean_token_accuracy": 0.8459943532943726, + "num_tokens": 18547108.0, + "step": 2071 + }, + { + "epoch": 1.574468085106383, + "grad_norm": 2.2168822288513184, + "learning_rate": 2.5041887882448845e-06, + "loss": 0.2863885462284088, + "mean_token_accuracy": 0.906816840171814, + "num_tokens": 18552357.0, + "step": 2072 + }, + { + "epoch": 1.575227963525836, + "grad_norm": 3.918499708175659, + "learning_rate": 2.5020943948574056e-06, + "loss": 0.3439999222755432, + "mean_token_accuracy": 0.8742123246192932, + "num_tokens": 18555272.0, + "step": 2073 + }, + { + "epoch": 1.5759878419452886, + "grad_norm": 1.773869514465332, + "learning_rate": 2.5e-06, + "loss": 0.2815646827220917, + "mean_token_accuracy": 0.8939872980117798, + "num_tokens": 18562989.0, + "step": 2074 + }, + { + "epoch": 1.5767477203647418, + "grad_norm": 1.8675572872161865, + "learning_rate": 2.497905605142595e-06, + "loss": 0.5005829930305481, + "mean_token_accuracy": 0.8242729902267456, + "num_tokens": 18575587.0, + "step": 2075 + }, + { + "epoch": 1.5775075987841944, + "grad_norm": 2.3143508434295654, + "learning_rate": 2.4958112117551163e-06, + "loss": 0.42472895979881287, + "mean_token_accuracy": 0.8540043830871582, + "num_tokens": 18581666.0, + "step": 2076 + }, + { + "epoch": 1.5782674772036476, + "grad_norm": 2.529740333557129, + "learning_rate": 2.4937168213074906e-06, + "loss": 0.24539905786514282, + "mean_token_accuracy": 0.9041235446929932, + "num_tokens": 18585773.0, + "step": 2077 + }, + { + "epoch": 1.5790273556231003, + "grad_norm": 2.5188395977020264, + "learning_rate": 2.491622435269642e-06, + "loss": 0.23059265315532684, + "mean_token_accuracy": 0.9204603433609009, + "num_tokens": 18589915.0, + "step": 2078 + }, + { + "epoch": 1.5797872340425532, + "grad_norm": 2.7752444744110107, + "learning_rate": 2.489528055111491e-06, + "loss": 0.452225923538208, + "mean_token_accuracy": 0.8444918990135193, + "num_tokens": 18595488.0, + "step": 2079 + }, + { + "epoch": 1.580547112462006, + "grad_norm": 1.174774408340454, + "learning_rate": 2.487433682302953e-06, + "loss": 0.3399246633052826, + "mean_token_accuracy": 0.8608446717262268, + "num_tokens": 18613756.0, + "step": 2080 + }, + { + "epoch": 1.581306990881459, + "grad_norm": 1.515575647354126, + "learning_rate": 2.485339318313941e-06, + "loss": 0.45886170864105225, + "mean_token_accuracy": 0.8479131460189819, + "num_tokens": 18629610.0, + "step": 2081 + }, + { + "epoch": 1.582066869300912, + "grad_norm": 1.7039403915405273, + "learning_rate": 2.4832449646143605e-06, + "loss": 0.349803626537323, + "mean_token_accuracy": 0.8721815347671509, + "num_tokens": 18637523.0, + "step": 2082 + }, + { + "epoch": 1.5828267477203646, + "grad_norm": 3.2289421558380127, + "learning_rate": 2.4811506226741077e-06, + "loss": 0.4967171549797058, + "mean_token_accuracy": 0.8303675651550293, + "num_tokens": 18641826.0, + "step": 2083 + }, + { + "epoch": 1.5835866261398177, + "grad_norm": 1.71235990524292, + "learning_rate": 2.4790562939630738e-06, + "loss": 0.4202485680580139, + "mean_token_accuracy": 0.8581224679946899, + "num_tokens": 18653146.0, + "step": 2084 + }, + { + "epoch": 1.5843465045592704, + "grad_norm": 1.710036277770996, + "learning_rate": 2.4769619799511392e-06, + "loss": 0.3942421078681946, + "mean_token_accuracy": 0.8553562164306641, + "num_tokens": 18663826.0, + "step": 2085 + }, + { + "epoch": 1.5851063829787235, + "grad_norm": 1.464859127998352, + "learning_rate": 2.474867682108174e-06, + "loss": 0.4093329906463623, + "mean_token_accuracy": 0.8598780632019043, + "num_tokens": 18675325.0, + "step": 2086 + }, + { + "epoch": 1.5858662613981762, + "grad_norm": 2.083707809448242, + "learning_rate": 2.472773401904037e-06, + "loss": 0.4252093434333801, + "mean_token_accuracy": 0.8433356881141663, + "num_tokens": 18682416.0, + "step": 2087 + }, + { + "epoch": 1.5866261398176293, + "grad_norm": 1.5577973127365112, + "learning_rate": 2.470679140808574e-06, + "loss": 0.3680085241794586, + "mean_token_accuracy": 0.8609116077423096, + "num_tokens": 18694445.0, + "step": 2088 + }, + { + "epoch": 1.587386018237082, + "grad_norm": 2.1617276668548584, + "learning_rate": 2.4685849002916184e-06, + "loss": 0.40488749742507935, + "mean_token_accuracy": 0.8429721593856812, + "num_tokens": 18701204.0, + "step": 2089 + }, + { + "epoch": 1.588145896656535, + "grad_norm": 2.046678304672241, + "learning_rate": 2.4664906818229903e-06, + "loss": 0.329141229391098, + "mean_token_accuracy": 0.8830771446228027, + "num_tokens": 18708354.0, + "step": 2090 + }, + { + "epoch": 1.5889057750759878, + "grad_norm": 2.7741200923919678, + "learning_rate": 2.4643964868724916e-06, + "loss": 0.42294493317604065, + "mean_token_accuracy": 0.8612706065177917, + "num_tokens": 18713017.0, + "step": 2091 + }, + { + "epoch": 1.5896656534954408, + "grad_norm": 2.085151433944702, + "learning_rate": 2.4623023169099074e-06, + "loss": 0.39038220047950745, + "mean_token_accuracy": 0.861169695854187, + "num_tokens": 18721423.0, + "step": 2092 + }, + { + "epoch": 1.5904255319148937, + "grad_norm": 2.8721165657043457, + "learning_rate": 2.4602081734050093e-06, + "loss": 0.27753859758377075, + "mean_token_accuracy": 0.8959167003631592, + "num_tokens": 18725044.0, + "step": 2093 + }, + { + "epoch": 1.5911854103343464, + "grad_norm": 1.7388207912445068, + "learning_rate": 2.4581140578275473e-06, + "loss": 0.3570033311843872, + "mean_token_accuracy": 0.8715590238571167, + "num_tokens": 18733891.0, + "step": 2094 + }, + { + "epoch": 1.5919452887537995, + "grad_norm": 2.3645241260528564, + "learning_rate": 2.456019971647251e-06, + "loss": 0.38982006907463074, + "mean_token_accuracy": 0.8734139800071716, + "num_tokens": 18740464.0, + "step": 2095 + }, + { + "epoch": 1.5927051671732522, + "grad_norm": 3.674072027206421, + "learning_rate": 2.4539259163338317e-06, + "loss": 0.4068281650543213, + "mean_token_accuracy": 0.8397839069366455, + "num_tokens": 18744857.0, + "step": 2096 + }, + { + "epoch": 1.5934650455927053, + "grad_norm": 1.8209186792373657, + "learning_rate": 2.4518318933569786e-06, + "loss": 0.3471015691757202, + "mean_token_accuracy": 0.8709044456481934, + "num_tokens": 18752414.0, + "step": 2097 + }, + { + "epoch": 1.594224924012158, + "grad_norm": 1.8138704299926758, + "learning_rate": 2.449737904186357e-06, + "loss": 0.3438487648963928, + "mean_token_accuracy": 0.8766711950302124, + "num_tokens": 18760587.0, + "step": 2098 + }, + { + "epoch": 1.594984802431611, + "grad_norm": 1.7893842458724976, + "learning_rate": 2.447643950291608e-06, + "loss": 0.43519508838653564, + "mean_token_accuracy": 0.8682907819747925, + "num_tokens": 18770293.0, + "step": 2099 + }, + { + "epoch": 1.5957446808510638, + "grad_norm": 1.4305094480514526, + "learning_rate": 2.4455500331423505e-06, + "loss": 0.37106508016586304, + "mean_token_accuracy": 0.8611354827880859, + "num_tokens": 18782456.0, + "step": 2100 + }, + { + "epoch": 1.5965045592705167, + "grad_norm": 2.0797057151794434, + "learning_rate": 2.4434561542081765e-06, + "loss": 0.43942689895629883, + "mean_token_accuracy": 0.8477288484573364, + "num_tokens": 18789547.0, + "step": 2101 + }, + { + "epoch": 1.5972644376899696, + "grad_norm": 1.2983288764953613, + "learning_rate": 2.441362314958649e-06, + "loss": 0.46385765075683594, + "mean_token_accuracy": 0.8340978622436523, + "num_tokens": 18809456.0, + "step": 2102 + }, + { + "epoch": 1.5980243161094225, + "grad_norm": 2.60866641998291, + "learning_rate": 2.439268516863306e-06, + "loss": 0.3106239140033722, + "mean_token_accuracy": 0.8859497308731079, + "num_tokens": 18813781.0, + "step": 2103 + }, + { + "epoch": 1.5987841945288754, + "grad_norm": 3.389376163482666, + "learning_rate": 2.4371747613916566e-06, + "loss": 0.44926169514656067, + "mean_token_accuracy": 0.8664819002151489, + "num_tokens": 18817666.0, + "step": 2104 + }, + { + "epoch": 1.5995440729483281, + "grad_norm": 3.3417351245880127, + "learning_rate": 2.4350810500131776e-06, + "loss": 0.4786076545715332, + "mean_token_accuracy": 0.8357523679733276, + "num_tokens": 18823717.0, + "step": 2105 + }, + { + "epoch": 1.6003039513677813, + "grad_norm": 1.5215197801589966, + "learning_rate": 2.4329873841973174e-06, + "loss": 0.4123923182487488, + "mean_token_accuracy": 0.853337287902832, + "num_tokens": 18835163.0, + "step": 2106 + }, + { + "epoch": 1.601063829787234, + "grad_norm": 1.8798415660858154, + "learning_rate": 2.4308937654134893e-06, + "loss": 0.45594000816345215, + "mean_token_accuracy": 0.8553717732429504, + "num_tokens": 18843923.0, + "step": 2107 + }, + { + "epoch": 1.601823708206687, + "grad_norm": 2.1012487411499023, + "learning_rate": 2.428800195131078e-06, + "loss": 0.4340161085128784, + "mean_token_accuracy": 0.8448120355606079, + "num_tokens": 18851852.0, + "step": 2108 + }, + { + "epoch": 1.6025835866261398, + "grad_norm": 2.827080726623535, + "learning_rate": 2.4267066748194297e-06, + "loss": 0.25922513008117676, + "mean_token_accuracy": 0.9024698734283447, + "num_tokens": 18856113.0, + "step": 2109 + }, + { + "epoch": 1.6033434650455927, + "grad_norm": 1.641032338142395, + "learning_rate": 2.4246132059478582e-06, + "loss": 0.591558575630188, + "mean_token_accuracy": 0.7960667610168457, + "num_tokens": 18870618.0, + "step": 2110 + }, + { + "epoch": 1.6041033434650456, + "grad_norm": 2.600771188735962, + "learning_rate": 2.4225197899856416e-06, + "loss": 0.382815957069397, + "mean_token_accuracy": 0.8654585480690002, + "num_tokens": 18875456.0, + "step": 2111 + }, + { + "epoch": 1.6048632218844985, + "grad_norm": 1.5125449895858765, + "learning_rate": 2.4204264284020182e-06, + "loss": 0.4643454849720001, + "mean_token_accuracy": 0.837038516998291, + "num_tokens": 18887979.0, + "step": 2112 + }, + { + "epoch": 1.6056231003039514, + "grad_norm": 1.7571941614151, + "learning_rate": 2.4183331226661913e-06, + "loss": 0.30713701248168945, + "mean_token_accuracy": 0.8856921195983887, + "num_tokens": 18896143.0, + "step": 2113 + }, + { + "epoch": 1.6063829787234043, + "grad_norm": 2.124593496322632, + "learning_rate": 2.4162398742473216e-06, + "loss": 0.2873607575893402, + "mean_token_accuracy": 0.8986717462539673, + "num_tokens": 18902364.0, + "step": 2114 + }, + { + "epoch": 1.6071428571428572, + "grad_norm": 2.3496272563934326, + "learning_rate": 2.4141466846145332e-06, + "loss": 0.33715200424194336, + "mean_token_accuracy": 0.8816461563110352, + "num_tokens": 18908038.0, + "step": 2115 + }, + { + "epoch": 1.60790273556231, + "grad_norm": 1.2783573865890503, + "learning_rate": 2.4120535552369057e-06, + "loss": 0.45153388381004333, + "mean_token_accuracy": 0.8345640897750854, + "num_tokens": 18926687.0, + "step": 2116 + }, + { + "epoch": 1.608662613981763, + "grad_norm": 2.1481080055236816, + "learning_rate": 2.4099604875834796e-06, + "loss": 0.43976694345474243, + "mean_token_accuracy": 0.847899317741394, + "num_tokens": 18932974.0, + "step": 2117 + }, + { + "epoch": 1.6094224924012157, + "grad_norm": 1.8669065237045288, + "learning_rate": 2.407867483123248e-06, + "loss": 0.4649358093738556, + "mean_token_accuracy": 0.8310785293579102, + "num_tokens": 18942551.0, + "step": 2118 + }, + { + "epoch": 1.6101823708206688, + "grad_norm": 2.7667746543884277, + "learning_rate": 2.4057745433251637e-06, + "loss": 0.4542210102081299, + "mean_token_accuracy": 0.8450086116790771, + "num_tokens": 18947525.0, + "step": 2119 + }, + { + "epoch": 1.6109422492401215, + "grad_norm": 2.2865076065063477, + "learning_rate": 2.4036816696581326e-06, + "loss": 0.34291431307792664, + "mean_token_accuracy": 0.8741394281387329, + "num_tokens": 18952967.0, + "step": 2120 + }, + { + "epoch": 1.6117021276595744, + "grad_norm": 3.055197238922119, + "learning_rate": 2.401588863591013e-06, + "loss": 0.4686807692050934, + "mean_token_accuracy": 0.8440030217170715, + "num_tokens": 18958257.0, + "step": 2121 + }, + { + "epoch": 1.6124620060790273, + "grad_norm": 2.268456220626831, + "learning_rate": 2.3994961265926166e-06, + "loss": 0.440069317817688, + "mean_token_accuracy": 0.8534891605377197, + "num_tokens": 18964745.0, + "step": 2122 + }, + { + "epoch": 1.6132218844984803, + "grad_norm": 2.061185359954834, + "learning_rate": 2.3974034601317085e-06, + "loss": 0.4383159279823303, + "mean_token_accuracy": 0.8484808802604675, + "num_tokens": 18972136.0, + "step": 2123 + }, + { + "epoch": 1.6139817629179332, + "grad_norm": 1.5121275186538696, + "learning_rate": 2.3953108656770018e-06, + "loss": 0.42403632402420044, + "mean_token_accuracy": 0.8467602133750916, + "num_tokens": 18985353.0, + "step": 2124 + }, + { + "epoch": 1.614741641337386, + "grad_norm": 1.9965397119522095, + "learning_rate": 2.3932183446971584e-06, + "loss": 0.3915751576423645, + "mean_token_accuracy": 0.8622956275939941, + "num_tokens": 18992017.0, + "step": 2125 + }, + { + "epoch": 1.615501519756839, + "grad_norm": 1.6688618659973145, + "learning_rate": 2.3911258986607907e-06, + "loss": 0.468288391828537, + "mean_token_accuracy": 0.8372251987457275, + "num_tokens": 19001930.0, + "step": 2126 + }, + { + "epoch": 1.6162613981762917, + "grad_norm": 1.8984699249267578, + "learning_rate": 2.3890335290364596e-06, + "loss": 0.3082895278930664, + "mean_token_accuracy": 0.8815990686416626, + "num_tokens": 19009712.0, + "step": 2127 + }, + { + "epoch": 1.6170212765957448, + "grad_norm": 2.6934773921966553, + "learning_rate": 2.386941237292669e-06, + "loss": 0.48406022787094116, + "mean_token_accuracy": 0.8300775289535522, + "num_tokens": 19015212.0, + "step": 2128 + }, + { + "epoch": 1.6177811550151975, + "grad_norm": 1.6615487337112427, + "learning_rate": 2.3848490248978693e-06, + "loss": 0.45227736234664917, + "mean_token_accuracy": 0.8421006798744202, + "num_tokens": 19027115.0, + "step": 2129 + }, + { + "epoch": 1.6185410334346506, + "grad_norm": 1.4625248908996582, + "learning_rate": 2.3827568933204576e-06, + "loss": 0.4141014814376831, + "mean_token_accuracy": 0.8479453325271606, + "num_tokens": 19041103.0, + "step": 2130 + }, + { + "epoch": 1.6193009118541033, + "grad_norm": 1.856701135635376, + "learning_rate": 2.3806648440287715e-06, + "loss": 0.3440483808517456, + "mean_token_accuracy": 0.8978210687637329, + "num_tokens": 19048124.0, + "step": 2131 + }, + { + "epoch": 1.6200607902735562, + "grad_norm": 1.7056550979614258, + "learning_rate": 2.378572878491091e-06, + "loss": 0.4136195182800293, + "mean_token_accuracy": 0.8579289317131042, + "num_tokens": 19057113.0, + "step": 2132 + }, + { + "epoch": 1.6208206686930091, + "grad_norm": 1.4673033952713013, + "learning_rate": 2.376480998175638e-06, + "loss": 0.40176504850387573, + "mean_token_accuracy": 0.8677150011062622, + "num_tokens": 19068258.0, + "step": 2133 + }, + { + "epoch": 1.621580547112462, + "grad_norm": 2.12859845161438, + "learning_rate": 2.3743892045505764e-06, + "loss": 0.39754825830459595, + "mean_token_accuracy": 0.8486959934234619, + "num_tokens": 19075469.0, + "step": 2134 + }, + { + "epoch": 1.622340425531915, + "grad_norm": 1.474247694015503, + "learning_rate": 2.372297499084006e-06, + "loss": 0.3546760678291321, + "mean_token_accuracy": 0.8767229318618774, + "num_tokens": 19086744.0, + "step": 2135 + }, + { + "epoch": 1.6231003039513676, + "grad_norm": 1.9945709705352783, + "learning_rate": 2.3702058832439667e-06, + "loss": 0.4200798273086548, + "mean_token_accuracy": 0.8435655832290649, + "num_tokens": 19095903.0, + "step": 2136 + }, + { + "epoch": 1.6238601823708207, + "grad_norm": 2.71991229057312, + "learning_rate": 2.368114358498434e-06, + "loss": 0.44925457239151, + "mean_token_accuracy": 0.8348450660705566, + "num_tokens": 19100864.0, + "step": 2137 + }, + { + "epoch": 1.6246200607902734, + "grad_norm": 2.817664623260498, + "learning_rate": 2.366022926315322e-06, + "loss": 0.44386279582977295, + "mean_token_accuracy": 0.8739628791809082, + "num_tokens": 19105355.0, + "step": 2138 + }, + { + "epoch": 1.6253799392097266, + "grad_norm": 1.3673229217529297, + "learning_rate": 2.3639315881624776e-06, + "loss": 0.3693230152130127, + "mean_token_accuracy": 0.8698620796203613, + "num_tokens": 19116748.0, + "step": 2139 + }, + { + "epoch": 1.6261398176291793, + "grad_norm": 2.712531805038452, + "learning_rate": 2.361840345507683e-06, + "loss": 0.4442938268184662, + "mean_token_accuracy": 0.8433241844177246, + "num_tokens": 19121437.0, + "step": 2140 + }, + { + "epoch": 1.6268996960486324, + "grad_norm": 2.2885231971740723, + "learning_rate": 2.359749199818651e-06, + "loss": 0.4021872878074646, + "mean_token_accuracy": 0.8605252504348755, + "num_tokens": 19127633.0, + "step": 2141 + }, + { + "epoch": 1.627659574468085, + "grad_norm": 1.9257299900054932, + "learning_rate": 2.3576581525630297e-06, + "loss": 0.3577788472175598, + "mean_token_accuracy": 0.8691596388816833, + "num_tokens": 19134450.0, + "step": 2142 + }, + { + "epoch": 1.628419452887538, + "grad_norm": 1.5035467147827148, + "learning_rate": 2.355567205208397e-06, + "loss": 0.3800235986709595, + "mean_token_accuracy": 0.867794394493103, + "num_tokens": 19146149.0, + "step": 2143 + }, + { + "epoch": 1.6291793313069909, + "grad_norm": 2.110445737838745, + "learning_rate": 2.353476359222259e-06, + "loss": 0.34394145011901855, + "mean_token_accuracy": 0.8777303695678711, + "num_tokens": 19152017.0, + "step": 2144 + }, + { + "epoch": 1.6299392097264438, + "grad_norm": 1.1713787317276, + "learning_rate": 2.351385616072052e-06, + "loss": 0.4060516357421875, + "mean_token_accuracy": 0.8411345481872559, + "num_tokens": 19172089.0, + "step": 2145 + }, + { + "epoch": 1.6306990881458967, + "grad_norm": 1.7600529193878174, + "learning_rate": 2.3492949772251418e-06, + "loss": 0.5299694538116455, + "mean_token_accuracy": 0.8218191862106323, + "num_tokens": 19184041.0, + "step": 2146 + }, + { + "epoch": 1.6314589665653494, + "grad_norm": 1.7126617431640625, + "learning_rate": 2.3472044441488175e-06, + "loss": 0.38628721237182617, + "mean_token_accuracy": 0.8526935577392578, + "num_tokens": 19193101.0, + "step": 2147 + }, + { + "epoch": 1.6322188449848025, + "grad_norm": 1.210344672203064, + "learning_rate": 2.345114018310295e-06, + "loss": 0.2732373774051666, + "mean_token_accuracy": 0.8903822898864746, + "num_tokens": 19206697.0, + "step": 2148 + }, + { + "epoch": 1.6329787234042552, + "grad_norm": 1.6693075895309448, + "learning_rate": 2.3430237011767166e-06, + "loss": 0.3472709655761719, + "mean_token_accuracy": 0.8767187595367432, + "num_tokens": 19217008.0, + "step": 2149 + }, + { + "epoch": 1.6337386018237083, + "grad_norm": 1.5242515802383423, + "learning_rate": 2.3409334942151485e-06, + "loss": 0.4345507025718689, + "mean_token_accuracy": 0.8481311202049255, + "num_tokens": 19231573.0, + "step": 2150 + }, + { + "epoch": 1.634498480243161, + "grad_norm": 2.470122814178467, + "learning_rate": 2.3388433988925767e-06, + "loss": 0.4453052878379822, + "mean_token_accuracy": 0.8411355018615723, + "num_tokens": 19237076.0, + "step": 2151 + }, + { + "epoch": 1.635258358662614, + "grad_norm": 2.4177467823028564, + "learning_rate": 2.3367534166759105e-06, + "loss": 0.454534113407135, + "mean_token_accuracy": 0.8635509014129639, + "num_tokens": 19242890.0, + "step": 2152 + }, + { + "epoch": 1.6360182370820668, + "grad_norm": 2.8036744594573975, + "learning_rate": 2.3346635490319815e-06, + "loss": 0.4396413564682007, + "mean_token_accuracy": 0.8491836786270142, + "num_tokens": 19247492.0, + "step": 2153 + }, + { + "epoch": 1.6367781155015197, + "grad_norm": 1.9286335706710815, + "learning_rate": 2.3325737974275382e-06, + "loss": 0.34988659620285034, + "mean_token_accuracy": 0.8704243898391724, + "num_tokens": 19254966.0, + "step": 2154 + }, + { + "epoch": 1.6375379939209727, + "grad_norm": 1.8929904699325562, + "learning_rate": 2.3304841633292487e-06, + "loss": 0.4195491671562195, + "mean_token_accuracy": 0.857181966304779, + "num_tokens": 19263324.0, + "step": 2155 + }, + { + "epoch": 1.6382978723404256, + "grad_norm": 2.2598466873168945, + "learning_rate": 2.328394648203698e-06, + "loss": 0.37977826595306396, + "mean_token_accuracy": 0.8626722097396851, + "num_tokens": 19269363.0, + "step": 2156 + }, + { + "epoch": 1.6390577507598785, + "grad_norm": 1.8118126392364502, + "learning_rate": 2.32630525351739e-06, + "loss": 0.3532063364982605, + "mean_token_accuracy": 0.8677854537963867, + "num_tokens": 19277360.0, + "step": 2157 + }, + { + "epoch": 1.6398176291793312, + "grad_norm": 1.5216798782348633, + "learning_rate": 2.324215980736741e-06, + "loss": 0.38609349727630615, + "mean_token_accuracy": 0.8685325980186462, + "num_tokens": 19292159.0, + "step": 2158 + }, + { + "epoch": 1.6405775075987843, + "grad_norm": 3.0511462688446045, + "learning_rate": 2.3221268313280836e-06, + "loss": 0.21988365054130554, + "mean_token_accuracy": 0.9172534942626953, + "num_tokens": 19295735.0, + "step": 2159 + }, + { + "epoch": 1.641337386018237, + "grad_norm": 1.957828164100647, + "learning_rate": 2.320037806757662e-06, + "loss": 0.3868909478187561, + "mean_token_accuracy": 0.8605331182479858, + "num_tokens": 19303287.0, + "step": 2160 + }, + { + "epoch": 1.64209726443769, + "grad_norm": 2.590040922164917, + "learning_rate": 2.317948908491636e-06, + "loss": 0.3940129578113556, + "mean_token_accuracy": 0.8814224004745483, + "num_tokens": 19308101.0, + "step": 2161 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 2.859248161315918, + "learning_rate": 2.315860137996074e-06, + "loss": 0.3437344431877136, + "mean_token_accuracy": 0.8789017200469971, + "num_tokens": 19313026.0, + "step": 2162 + }, + { + "epoch": 1.6436170212765957, + "grad_norm": 1.1788666248321533, + "learning_rate": 2.3137714967369544e-06, + "loss": 0.3976179361343384, + "mean_token_accuracy": 0.8383771181106567, + "num_tokens": 19331103.0, + "step": 2163 + }, + { + "epoch": 1.6443768996960486, + "grad_norm": 1.8409802913665771, + "learning_rate": 2.3116829861801687e-06, + "loss": 0.41898879408836365, + "mean_token_accuracy": 0.8575010299682617, + "num_tokens": 19340866.0, + "step": 2164 + }, + { + "epoch": 1.6451367781155015, + "grad_norm": 1.4124691486358643, + "learning_rate": 2.3095946077915115e-06, + "loss": 0.333813339471817, + "mean_token_accuracy": 0.8766071796417236, + "num_tokens": 19353673.0, + "step": 2165 + }, + { + "epoch": 1.6458966565349544, + "grad_norm": 1.76325261592865, + "learning_rate": 2.307506363036688e-06, + "loss": 0.4158991575241089, + "mean_token_accuracy": 0.8522704839706421, + "num_tokens": 19363635.0, + "step": 2166 + }, + { + "epoch": 1.6466565349544073, + "grad_norm": 1.758833885192871, + "learning_rate": 2.305418253381309e-06, + "loss": 0.298480749130249, + "mean_token_accuracy": 0.888424277305603, + "num_tokens": 19372291.0, + "step": 2167 + }, + { + "epoch": 1.6474164133738602, + "grad_norm": 1.6387488842010498, + "learning_rate": 2.3033302802908895e-06, + "loss": 0.4309447109699249, + "mean_token_accuracy": 0.8672212362289429, + "num_tokens": 19383480.0, + "step": 2168 + }, + { + "epoch": 1.648176291793313, + "grad_norm": 1.5251084566116333, + "learning_rate": 2.301242445230851e-06, + "loss": 0.44890880584716797, + "mean_token_accuracy": 0.847392737865448, + "num_tokens": 19394810.0, + "step": 2169 + }, + { + "epoch": 1.648936170212766, + "grad_norm": 1.6106950044631958, + "learning_rate": 2.299154749666515e-06, + "loss": 0.4403916597366333, + "mean_token_accuracy": 0.8379756212234497, + "num_tokens": 19405551.0, + "step": 2170 + }, + { + "epoch": 1.6496960486322187, + "grad_norm": 1.4238437414169312, + "learning_rate": 2.2970671950631066e-06, + "loss": 0.4015567898750305, + "mean_token_accuracy": 0.851482629776001, + "num_tokens": 19418621.0, + "step": 2171 + }, + { + "epoch": 1.6504559270516719, + "grad_norm": 1.3026156425476074, + "learning_rate": 2.2949797828857527e-06, + "loss": 0.3680947422981262, + "mean_token_accuracy": 0.8641397953033447, + "num_tokens": 19432118.0, + "step": 2172 + }, + { + "epoch": 1.6512158054711246, + "grad_norm": 2.1265358924865723, + "learning_rate": 2.2928925145994798e-06, + "loss": 0.43980664014816284, + "mean_token_accuracy": 0.8358430862426758, + "num_tokens": 19439069.0, + "step": 2173 + }, + { + "epoch": 1.6519756838905775, + "grad_norm": 1.8399443626403809, + "learning_rate": 2.290805391669212e-06, + "loss": 0.29801061749458313, + "mean_token_accuracy": 0.8773187398910522, + "num_tokens": 19446745.0, + "step": 2174 + }, + { + "epoch": 1.6527355623100304, + "grad_norm": 1.8680047988891602, + "learning_rate": 2.2887184155597725e-06, + "loss": 0.3235543966293335, + "mean_token_accuracy": 0.8754611015319824, + "num_tokens": 19455266.0, + "step": 2175 + }, + { + "epoch": 1.6534954407294833, + "grad_norm": 2.3048481941223145, + "learning_rate": 2.286631587735883e-06, + "loss": 0.4011988043785095, + "mean_token_accuracy": 0.8531811237335205, + "num_tokens": 19461049.0, + "step": 2176 + }, + { + "epoch": 1.6542553191489362, + "grad_norm": 2.6067066192626953, + "learning_rate": 2.2845449096621583e-06, + "loss": 0.4957500696182251, + "mean_token_accuracy": 0.8255549073219299, + "num_tokens": 19466884.0, + "step": 2177 + }, + { + "epoch": 1.655015197568389, + "grad_norm": 1.5211488008499146, + "learning_rate": 2.282458382803109e-06, + "loss": 0.32245099544525146, + "mean_token_accuracy": 0.8865629434585571, + "num_tokens": 19477294.0, + "step": 2178 + }, + { + "epoch": 1.655775075987842, + "grad_norm": 2.245542526245117, + "learning_rate": 2.280372008623142e-06, + "loss": 0.3790864944458008, + "mean_token_accuracy": 0.8766552209854126, + "num_tokens": 19483385.0, + "step": 2179 + }, + { + "epoch": 1.6565349544072947, + "grad_norm": 2.1158151626586914, + "learning_rate": 2.2782857885865538e-06, + "loss": 0.4726812243461609, + "mean_token_accuracy": 0.8384029865264893, + "num_tokens": 19491367.0, + "step": 2180 + }, + { + "epoch": 1.6572948328267478, + "grad_norm": 3.301389694213867, + "learning_rate": 2.2761997241575335e-06, + "loss": 0.37664809823036194, + "mean_token_accuracy": 0.8913813829421997, + "num_tokens": 19494876.0, + "step": 2181 + }, + { + "epoch": 1.6580547112462005, + "grad_norm": 2.2964162826538086, + "learning_rate": 2.274113816800161e-06, + "loss": 0.4110721945762634, + "mean_token_accuracy": 0.8551756143569946, + "num_tokens": 19500546.0, + "step": 2182 + }, + { + "epoch": 1.6588145896656536, + "grad_norm": 3.368161916732788, + "learning_rate": 2.272028067978408e-06, + "loss": 0.39089250564575195, + "mean_token_accuracy": 0.8786845207214355, + "num_tokens": 19504142.0, + "step": 2183 + }, + { + "epoch": 1.6595744680851063, + "grad_norm": 1.7299834489822388, + "learning_rate": 2.2699424791561324e-06, + "loss": 0.5205090641975403, + "mean_token_accuracy": 0.8394201993942261, + "num_tokens": 19514523.0, + "step": 2184 + }, + { + "epoch": 1.6603343465045592, + "grad_norm": 2.045919418334961, + "learning_rate": 2.267857051797081e-06, + "loss": 0.49093255400657654, + "mean_token_accuracy": 0.8338311910629272, + "num_tokens": 19522439.0, + "step": 2185 + }, + { + "epoch": 1.6610942249240122, + "grad_norm": 1.2035714387893677, + "learning_rate": 2.265771787364886e-06, + "loss": 0.37247753143310547, + "mean_token_accuracy": 0.8873692750930786, + "num_tokens": 19536717.0, + "step": 2186 + }, + { + "epoch": 1.661854103343465, + "grad_norm": 2.6186633110046387, + "learning_rate": 2.263686687323068e-06, + "loss": 0.3318040370941162, + "mean_token_accuracy": 0.8720577955245972, + "num_tokens": 19541966.0, + "step": 2187 + }, + { + "epoch": 1.662613981762918, + "grad_norm": 2.6845929622650146, + "learning_rate": 2.261601753135029e-06, + "loss": 0.32441991567611694, + "mean_token_accuracy": 0.8700553178787231, + "num_tokens": 19546644.0, + "step": 2188 + }, + { + "epoch": 1.6633738601823707, + "grad_norm": 2.078998327255249, + "learning_rate": 2.259516986264057e-06, + "loss": 0.3424156904220581, + "mean_token_accuracy": 0.8707810044288635, + "num_tokens": 19553472.0, + "step": 2189 + }, + { + "epoch": 1.6641337386018238, + "grad_norm": 2.380747079849243, + "learning_rate": 2.2574323881733202e-06, + "loss": 0.4994799494743347, + "mean_token_accuracy": 0.817003607749939, + "num_tokens": 19560502.0, + "step": 2190 + }, + { + "epoch": 1.6648936170212765, + "grad_norm": 1.2984378337860107, + "learning_rate": 2.255347960325871e-06, + "loss": 0.33139657974243164, + "mean_token_accuracy": 0.8763977289199829, + "num_tokens": 19575624.0, + "step": 2191 + }, + { + "epoch": 1.6656534954407296, + "grad_norm": 1.3232799768447876, + "learning_rate": 2.2532637041846423e-06, + "loss": 0.32994017004966736, + "mean_token_accuracy": 0.8790634274482727, + "num_tokens": 19588636.0, + "step": 2192 + }, + { + "epoch": 1.6664133738601823, + "grad_norm": 2.11212158203125, + "learning_rate": 2.2511796212124424e-06, + "loss": 0.3140082359313965, + "mean_token_accuracy": 0.8946622014045715, + "num_tokens": 19594917.0, + "step": 2193 + }, + { + "epoch": 1.6671732522796354, + "grad_norm": 2.7206521034240723, + "learning_rate": 2.2490957128719627e-06, + "loss": 0.3723612427711487, + "mean_token_accuracy": 0.8781955242156982, + "num_tokens": 19599310.0, + "step": 2194 + }, + { + "epoch": 1.667933130699088, + "grad_norm": 2.6681952476501465, + "learning_rate": 2.247011980625771e-06, + "loss": 0.3740317225456238, + "mean_token_accuracy": 0.8780536651611328, + "num_tokens": 19604172.0, + "step": 2195 + }, + { + "epoch": 1.668693009118541, + "grad_norm": 1.8933384418487549, + "learning_rate": 2.2449284259363093e-06, + "loss": 0.3359421491622925, + "mean_token_accuracy": 0.8785334825515747, + "num_tokens": 19612030.0, + "step": 2196 + }, + { + "epoch": 1.669452887537994, + "grad_norm": 2.4779889583587646, + "learning_rate": 2.2428450502658964e-06, + "loss": 0.3724144399166107, + "mean_token_accuracy": 0.8739810585975647, + "num_tokens": 19617800.0, + "step": 2197 + }, + { + "epoch": 1.6702127659574468, + "grad_norm": 3.0661120414733887, + "learning_rate": 2.240761855076727e-06, + "loss": 0.3627531826496124, + "mean_token_accuracy": 0.865296483039856, + "num_tokens": 19621885.0, + "step": 2198 + }, + { + "epoch": 1.6709726443768997, + "grad_norm": 2.431708574295044, + "learning_rate": 2.238678841830867e-06, + "loss": 0.31396129727363586, + "mean_token_accuracy": 0.9026765823364258, + "num_tokens": 19627122.0, + "step": 2199 + }, + { + "epoch": 1.6717325227963524, + "grad_norm": 2.5498745441436768, + "learning_rate": 2.2365960119902543e-06, + "loss": 0.3193191885948181, + "mean_token_accuracy": 0.8750600218772888, + "num_tokens": 19631771.0, + "step": 2200 + }, + { + "epoch": 1.6724924012158056, + "grad_norm": 2.0419046878814697, + "learning_rate": 2.2345133670167e-06, + "loss": 0.32747960090637207, + "mean_token_accuracy": 0.8603148460388184, + "num_tokens": 19638972.0, + "step": 2201 + }, + { + "epoch": 1.6732522796352582, + "grad_norm": 2.0412306785583496, + "learning_rate": 2.232430908371885e-06, + "loss": 0.4701780676841736, + "mean_token_accuracy": 0.8318476676940918, + "num_tokens": 19647968.0, + "step": 2202 + }, + { + "epoch": 1.6740121580547114, + "grad_norm": 2.054070472717285, + "learning_rate": 2.2303486375173586e-06, + "loss": 0.33284813165664673, + "mean_token_accuracy": 0.8760920763015747, + "num_tokens": 19654032.0, + "step": 2203 + }, + { + "epoch": 1.674772036474164, + "grad_norm": 1.6053217649459839, + "learning_rate": 2.228266555914538e-06, + "loss": 0.34431374073028564, + "mean_token_accuracy": 0.8764770030975342, + "num_tokens": 19663785.0, + "step": 2204 + }, + { + "epoch": 1.675531914893617, + "grad_norm": 1.474494457244873, + "learning_rate": 2.2261846650247077e-06, + "loss": 0.3541037440299988, + "mean_token_accuracy": 0.8782497644424438, + "num_tokens": 19675498.0, + "step": 2205 + }, + { + "epoch": 1.6762917933130699, + "grad_norm": 1.9318026304244995, + "learning_rate": 2.224102966309021e-06, + "loss": 0.4291660189628601, + "mean_token_accuracy": 0.8424201607704163, + "num_tokens": 19684576.0, + "step": 2206 + }, + { + "epoch": 1.6770516717325228, + "grad_norm": 2.2150020599365234, + "learning_rate": 2.2220214612284925e-06, + "loss": 0.46187907457351685, + "mean_token_accuracy": 0.840459942817688, + "num_tokens": 19690412.0, + "step": 2207 + }, + { + "epoch": 1.6778115501519757, + "grad_norm": 1.667281150817871, + "learning_rate": 2.2199401512440037e-06, + "loss": 0.37440744042396545, + "mean_token_accuracy": 0.8694081902503967, + "num_tokens": 19699600.0, + "step": 2208 + }, + { + "epoch": 1.6785714285714286, + "grad_norm": 2.6446619033813477, + "learning_rate": 2.2178590378162957e-06, + "loss": 0.3301953077316284, + "mean_token_accuracy": 0.8992182016372681, + "num_tokens": 19704162.0, + "step": 2209 + }, + { + "epoch": 1.6793313069908815, + "grad_norm": 1.4266780614852905, + "learning_rate": 2.215778122405977e-06, + "loss": 0.3811204135417938, + "mean_token_accuracy": 0.861638069152832, + "num_tokens": 19716511.0, + "step": 2210 + }, + { + "epoch": 1.6800911854103342, + "grad_norm": 1.826087474822998, + "learning_rate": 2.2136974064735132e-06, + "loss": 0.4790012836456299, + "mean_token_accuracy": 0.8404909372329712, + "num_tokens": 19726645.0, + "step": 2211 + }, + { + "epoch": 1.6808510638297873, + "grad_norm": 1.8551808595657349, + "learning_rate": 2.2116168914792293e-06, + "loss": 0.40999075770378113, + "mean_token_accuracy": 0.8419463634490967, + "num_tokens": 19735601.0, + "step": 2212 + }, + { + "epoch": 1.68161094224924, + "grad_norm": 2.560124158859253, + "learning_rate": 2.209536578883313e-06, + "loss": 0.43428558111190796, + "mean_token_accuracy": 0.8689159750938416, + "num_tokens": 19741138.0, + "step": 2213 + }, + { + "epoch": 1.6823708206686931, + "grad_norm": 2.0154869556427, + "learning_rate": 2.207456470145807e-06, + "loss": 0.43633338809013367, + "mean_token_accuracy": 0.8646916151046753, + "num_tokens": 19751929.0, + "step": 2214 + }, + { + "epoch": 1.6831306990881458, + "grad_norm": 1.3583155870437622, + "learning_rate": 2.205376566726611e-06, + "loss": 0.3050280511379242, + "mean_token_accuracy": 0.8998798727989197, + "num_tokens": 19764012.0, + "step": 2215 + }, + { + "epoch": 1.6838905775075987, + "grad_norm": 1.266262173652649, + "learning_rate": 2.2032968700854813e-06, + "loss": 0.4039713144302368, + "mean_token_accuracy": 0.8571382164955139, + "num_tokens": 19780683.0, + "step": 2216 + }, + { + "epoch": 1.6846504559270516, + "grad_norm": 1.864356517791748, + "learning_rate": 2.2012173816820297e-06, + "loss": 0.361503541469574, + "mean_token_accuracy": 0.868161678314209, + "num_tokens": 19788907.0, + "step": 2217 + }, + { + "epoch": 1.6854103343465046, + "grad_norm": 1.320155382156372, + "learning_rate": 2.1991381029757216e-06, + "loss": 0.28228244185447693, + "mean_token_accuracy": 0.8945217132568359, + "num_tokens": 19800354.0, + "step": 2218 + }, + { + "epoch": 1.6861702127659575, + "grad_norm": 1.9706367254257202, + "learning_rate": 2.1970590354258745e-06, + "loss": 0.2849377989768982, + "mean_token_accuracy": 0.9065699577331543, + "num_tokens": 19806735.0, + "step": 2219 + }, + { + "epoch": 1.6869300911854104, + "grad_norm": 1.9150370359420776, + "learning_rate": 2.1949801804916563e-06, + "loss": 0.4125257730484009, + "mean_token_accuracy": 0.8642163872718811, + "num_tokens": 19814056.0, + "step": 2220 + }, + { + "epoch": 1.6876899696048633, + "grad_norm": 2.062589645385742, + "learning_rate": 2.19290153963209e-06, + "loss": 0.451707124710083, + "mean_token_accuracy": 0.8311163187026978, + "num_tokens": 19821263.0, + "step": 2221 + }, + { + "epoch": 1.688449848024316, + "grad_norm": 1.3959208726882935, + "learning_rate": 2.190823114306045e-06, + "loss": 0.3326707184314728, + "mean_token_accuracy": 0.9037837982177734, + "num_tokens": 19835163.0, + "step": 2222 + }, + { + "epoch": 1.689209726443769, + "grad_norm": 2.09995698928833, + "learning_rate": 2.188744905972239e-06, + "loss": 0.4144105315208435, + "mean_token_accuracy": 0.8512029051780701, + "num_tokens": 19843164.0, + "step": 2223 + }, + { + "epoch": 1.6899696048632218, + "grad_norm": 1.4759427309036255, + "learning_rate": 2.186666916089239e-06, + "loss": 0.4707002639770508, + "mean_token_accuracy": 0.8371601104736328, + "num_tokens": 19858551.0, + "step": 2224 + }, + { + "epoch": 1.690729483282675, + "grad_norm": 2.3398702144622803, + "learning_rate": 2.1845891461154604e-06, + "loss": 0.34672820568084717, + "mean_token_accuracy": 0.879936695098877, + "num_tokens": 19864348.0, + "step": 2225 + }, + { + "epoch": 1.6914893617021276, + "grad_norm": 1.6283963918685913, + "learning_rate": 2.1825115975091594e-06, + "loss": 0.31835079193115234, + "mean_token_accuracy": 0.8695961833000183, + "num_tokens": 19873560.0, + "step": 2226 + }, + { + "epoch": 1.6922492401215805, + "grad_norm": 2.035759687423706, + "learning_rate": 2.1804342717284414e-06, + "loss": 0.43110257387161255, + "mean_token_accuracy": 0.8593922853469849, + "num_tokens": 19880796.0, + "step": 2227 + }, + { + "epoch": 1.6930091185410334, + "grad_norm": 2.1340725421905518, + "learning_rate": 2.1783571702312523e-06, + "loss": 0.46967440843582153, + "mean_token_accuracy": 0.8839266300201416, + "num_tokens": 19887911.0, + "step": 2228 + }, + { + "epoch": 1.6937689969604863, + "grad_norm": 1.710340142250061, + "learning_rate": 2.176280294475383e-06, + "loss": 0.4167519807815552, + "mean_token_accuracy": 0.8526116609573364, + "num_tokens": 19896674.0, + "step": 2229 + }, + { + "epoch": 1.6945288753799392, + "grad_norm": 1.7793304920196533, + "learning_rate": 2.174203645918464e-06, + "loss": 0.3875434994697571, + "mean_token_accuracy": 0.8637192249298096, + "num_tokens": 19904825.0, + "step": 2230 + }, + { + "epoch": 1.6952887537993921, + "grad_norm": 1.7908778190612793, + "learning_rate": 2.172127226017967e-06, + "loss": 0.42065349221229553, + "mean_token_accuracy": 0.850834846496582, + "num_tokens": 19914377.0, + "step": 2231 + }, + { + "epoch": 1.696048632218845, + "grad_norm": 3.0943970680236816, + "learning_rate": 2.1700510362312053e-06, + "loss": 0.44845050573349, + "mean_token_accuracy": 0.8460367918014526, + "num_tokens": 19918929.0, + "step": 2232 + }, + { + "epoch": 1.6968085106382977, + "grad_norm": 1.5586018562316895, + "learning_rate": 2.1679750780153265e-06, + "loss": 0.4723482131958008, + "mean_token_accuracy": 0.871384859085083, + "num_tokens": 19932738.0, + "step": 2233 + }, + { + "epoch": 1.6975683890577509, + "grad_norm": 2.014230728149414, + "learning_rate": 2.1658993528273196e-06, + "loss": 0.43307146430015564, + "mean_token_accuracy": 0.8677935600280762, + "num_tokens": 19940246.0, + "step": 2234 + }, + { + "epoch": 1.6983282674772036, + "grad_norm": 1.528979778289795, + "learning_rate": 2.163823862124007e-06, + "loss": 0.3897377550601959, + "mean_token_accuracy": 0.8737689256668091, + "num_tokens": 19951187.0, + "step": 2235 + }, + { + "epoch": 1.6990881458966567, + "grad_norm": 1.9856207370758057, + "learning_rate": 2.1617486073620496e-06, + "loss": 0.4285745620727539, + "mean_token_accuracy": 0.8744081258773804, + "num_tokens": 19957768.0, + "step": 2236 + }, + { + "epoch": 1.6998480243161094, + "grad_norm": 2.130525827407837, + "learning_rate": 2.15967358999794e-06, + "loss": 0.405293732881546, + "mean_token_accuracy": 0.8588452935218811, + "num_tokens": 19965354.0, + "step": 2237 + }, + { + "epoch": 1.7006079027355623, + "grad_norm": 1.665329098701477, + "learning_rate": 2.1575988114880057e-06, + "loss": 0.42987754940986633, + "mean_token_accuracy": 0.846322238445282, + "num_tokens": 19975780.0, + "step": 2238 + }, + { + "epoch": 1.7013677811550152, + "grad_norm": 1.0725677013397217, + "learning_rate": 2.155524273288405e-06, + "loss": 0.31892159581184387, + "mean_token_accuracy": 0.8692483305931091, + "num_tokens": 19995875.0, + "step": 2239 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 2.282604455947876, + "learning_rate": 2.15344997685513e-06, + "loss": 0.4460654556751251, + "mean_token_accuracy": 0.8623759746551514, + "num_tokens": 20001466.0, + "step": 2240 + }, + { + "epoch": 1.702887537993921, + "grad_norm": 1.1385949850082397, + "learning_rate": 2.1513759236440024e-06, + "loss": 0.37046104669570923, + "mean_token_accuracy": 0.8637164831161499, + "num_tokens": 20020998.0, + "step": 2241 + }, + { + "epoch": 1.7036474164133737, + "grad_norm": 1.5521315336227417, + "learning_rate": 2.1493021151106704e-06, + "loss": 0.4526556134223938, + "mean_token_accuracy": 0.8675785064697266, + "num_tokens": 20032750.0, + "step": 2242 + }, + { + "epoch": 1.7044072948328268, + "grad_norm": 1.7777446508407593, + "learning_rate": 2.147228552710614e-06, + "loss": 0.41294580698013306, + "mean_token_accuracy": 0.8597785234451294, + "num_tokens": 20041901.0, + "step": 2243 + }, + { + "epoch": 1.7051671732522795, + "grad_norm": 1.5157700777053833, + "learning_rate": 2.145155237899139e-06, + "loss": 0.4158926010131836, + "mean_token_accuracy": 0.8512611985206604, + "num_tokens": 20053705.0, + "step": 2244 + }, + { + "epoch": 1.7059270516717326, + "grad_norm": 1.5116809606552124, + "learning_rate": 2.143082172131378e-06, + "loss": 0.43943172693252563, + "mean_token_accuracy": 0.8429899215698242, + "num_tokens": 20069468.0, + "step": 2245 + }, + { + "epoch": 1.7066869300911853, + "grad_norm": 1.6095285415649414, + "learning_rate": 2.141009356862288e-06, + "loss": 0.41325604915618896, + "mean_token_accuracy": 0.8832963705062866, + "num_tokens": 20080596.0, + "step": 2246 + }, + { + "epoch": 1.7074468085106385, + "grad_norm": 1.39210844039917, + "learning_rate": 2.138936793546649e-06, + "loss": 0.3945302963256836, + "mean_token_accuracy": 0.8698325753211975, + "num_tokens": 20094158.0, + "step": 2247 + }, + { + "epoch": 1.7082066869300911, + "grad_norm": 2.9576594829559326, + "learning_rate": 2.1368644836390684e-06, + "loss": 0.16507276892662048, + "mean_token_accuracy": 0.9410445690155029, + "num_tokens": 20097002.0, + "step": 2248 + }, + { + "epoch": 1.708966565349544, + "grad_norm": 1.7631266117095947, + "learning_rate": 2.134792428593971e-06, + "loss": 0.519780695438385, + "mean_token_accuracy": 0.8276066780090332, + "num_tokens": 20107947.0, + "step": 2249 + }, + { + "epoch": 1.709726443768997, + "grad_norm": 2.144636869430542, + "learning_rate": 2.1327206298656055e-06, + "loss": 0.32923734188079834, + "mean_token_accuracy": 0.8766019344329834, + "num_tokens": 20113676.0, + "step": 2250 + }, + { + "epoch": 1.7104863221884499, + "grad_norm": 1.9511034488677979, + "learning_rate": 2.130649088908041e-06, + "loss": 0.4043842554092407, + "mean_token_accuracy": 0.8525843620300293, + "num_tokens": 20120787.0, + "step": 2251 + }, + { + "epoch": 1.7112462006079028, + "grad_norm": 1.5001336336135864, + "learning_rate": 2.1285778071751638e-06, + "loss": 0.4800187051296234, + "mean_token_accuracy": 0.8398486375808716, + "num_tokens": 20133534.0, + "step": 2252 + }, + { + "epoch": 1.7120060790273555, + "grad_norm": 1.435195803642273, + "learning_rate": 2.126506786120678e-06, + "loss": 0.44489604234695435, + "mean_token_accuracy": 0.8444881439208984, + "num_tokens": 20151787.0, + "step": 2253 + }, + { + "epoch": 1.7127659574468086, + "grad_norm": 1.3056137561798096, + "learning_rate": 2.1244360271981073e-06, + "loss": 0.300567090511322, + "mean_token_accuracy": 0.8903113007545471, + "num_tokens": 20163390.0, + "step": 2254 + }, + { + "epoch": 1.7135258358662613, + "grad_norm": 1.7347925901412964, + "learning_rate": 2.1223655318607907e-06, + "loss": 0.30601179599761963, + "mean_token_accuracy": 0.8845717906951904, + "num_tokens": 20171354.0, + "step": 2255 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.316306471824646, + "learning_rate": 2.1202953015618794e-06, + "loss": 0.3972984552383423, + "mean_token_accuracy": 0.845410943031311, + "num_tokens": 20184464.0, + "step": 2256 + }, + { + "epoch": 1.715045592705167, + "grad_norm": 2.1052892208099365, + "learning_rate": 2.1182253377543428e-06, + "loss": 0.3357020616531372, + "mean_token_accuracy": 0.8853542804718018, + "num_tokens": 20190539.0, + "step": 2257 + }, + { + "epoch": 1.71580547112462, + "grad_norm": 1.4192553758621216, + "learning_rate": 2.116155641890959e-06, + "loss": 0.3881692588329315, + "mean_token_accuracy": 0.8442144989967346, + "num_tokens": 20204570.0, + "step": 2258 + }, + { + "epoch": 1.716565349544073, + "grad_norm": 2.134113311767578, + "learning_rate": 2.1140862154243223e-06, + "loss": 0.37803274393081665, + "mean_token_accuracy": 0.8703107237815857, + "num_tokens": 20210535.0, + "step": 2259 + }, + { + "epoch": 1.7173252279635258, + "grad_norm": 2.9149155616760254, + "learning_rate": 2.1120170598068353e-06, + "loss": 0.34860676527023315, + "mean_token_accuracy": 0.8734345436096191, + "num_tokens": 20214375.0, + "step": 2260 + }, + { + "epoch": 1.7180851063829787, + "grad_norm": 1.6855589151382446, + "learning_rate": 2.109948176490711e-06, + "loss": 0.3676984906196594, + "mean_token_accuracy": 0.8531560301780701, + "num_tokens": 20223791.0, + "step": 2261 + }, + { + "epoch": 1.7188449848024316, + "grad_norm": 2.09671950340271, + "learning_rate": 2.10787956692797e-06, + "loss": 0.41744115948677063, + "mean_token_accuracy": 0.8570001125335693, + "num_tokens": 20231254.0, + "step": 2262 + }, + { + "epoch": 1.7196048632218845, + "grad_norm": 3.148813009262085, + "learning_rate": 2.1058112325704436e-06, + "loss": 0.20556189119815826, + "mean_token_accuracy": 0.926898717880249, + "num_tokens": 20234470.0, + "step": 2263 + }, + { + "epoch": 1.7203647416413372, + "grad_norm": 1.9707107543945312, + "learning_rate": 2.103743174869769e-06, + "loss": 0.40733110904693604, + "mean_token_accuracy": 0.8740406036376953, + "num_tokens": 20242286.0, + "step": 2264 + }, + { + "epoch": 1.7211246200607904, + "grad_norm": 1.2756069898605347, + "learning_rate": 2.1016753952773867e-06, + "loss": 0.3940718173980713, + "mean_token_accuracy": 0.860906720161438, + "num_tokens": 20260382.0, + "step": 2265 + }, + { + "epoch": 1.721884498480243, + "grad_norm": 1.5074653625488281, + "learning_rate": 2.0996078952445453e-06, + "loss": 0.3353617191314697, + "mean_token_accuracy": 0.8809853792190552, + "num_tokens": 20271665.0, + "step": 2266 + }, + { + "epoch": 1.7226443768996962, + "grad_norm": 1.4331210851669312, + "learning_rate": 2.0975406762222966e-06, + "loss": 0.32260069251060486, + "mean_token_accuracy": 0.901330828666687, + "num_tokens": 20283122.0, + "step": 2267 + }, + { + "epoch": 1.7234042553191489, + "grad_norm": 2.2378969192504883, + "learning_rate": 2.095473739661494e-06, + "loss": 0.39086243510246277, + "mean_token_accuracy": 0.8681687116622925, + "num_tokens": 20289243.0, + "step": 2268 + }, + { + "epoch": 1.7241641337386018, + "grad_norm": 2.754582405090332, + "learning_rate": 2.093407087012791e-06, + "loss": 0.42927244305610657, + "mean_token_accuracy": 0.8594136834144592, + "num_tokens": 20294537.0, + "step": 2269 + }, + { + "epoch": 1.7249240121580547, + "grad_norm": 2.2721824645996094, + "learning_rate": 2.091340719726647e-06, + "loss": 0.42479783296585083, + "mean_token_accuracy": 0.8411722183227539, + "num_tokens": 20301502.0, + "step": 2270 + }, + { + "epoch": 1.7256838905775076, + "grad_norm": 2.3230299949645996, + "learning_rate": 2.089274639253317e-06, + "loss": 0.4218963384628296, + "mean_token_accuracy": 0.8498032093048096, + "num_tokens": 20307710.0, + "step": 2271 + }, + { + "epoch": 1.7264437689969605, + "grad_norm": 2.3499748706817627, + "learning_rate": 2.0872088470428553e-06, + "loss": 0.4472277760505676, + "mean_token_accuracy": 0.8487255573272705, + "num_tokens": 20313945.0, + "step": 2272 + }, + { + "epoch": 1.7272036474164134, + "grad_norm": 1.3709690570831299, + "learning_rate": 2.0851433445451142e-06, + "loss": 0.38701117038726807, + "mean_token_accuracy": 0.8592075109481812, + "num_tokens": 20328023.0, + "step": 2273 + }, + { + "epoch": 1.7279635258358663, + "grad_norm": 1.1293425559997559, + "learning_rate": 2.0830781332097446e-06, + "loss": 0.34000539779663086, + "mean_token_accuracy": 0.8779317140579224, + "num_tokens": 20346767.0, + "step": 2274 + }, + { + "epoch": 1.728723404255319, + "grad_norm": 2.9770123958587646, + "learning_rate": 2.08101321448619e-06, + "loss": 0.4437636733055115, + "mean_token_accuracy": 0.8398602604866028, + "num_tokens": 20352306.0, + "step": 2275 + }, + { + "epoch": 1.7294832826747721, + "grad_norm": 3.510955572128296, + "learning_rate": 2.0789485898236897e-06, + "loss": 0.3359706401824951, + "mean_token_accuracy": 0.8872498273849487, + "num_tokens": 20355560.0, + "step": 2276 + }, + { + "epoch": 1.7302431610942248, + "grad_norm": 2.0873279571533203, + "learning_rate": 2.076884260671276e-06, + "loss": 0.38720619678497314, + "mean_token_accuracy": 0.865881621837616, + "num_tokens": 20362802.0, + "step": 2277 + }, + { + "epoch": 1.731003039513678, + "grad_norm": 2.4871230125427246, + "learning_rate": 2.0748202284777775e-06, + "loss": 0.3250775933265686, + "mean_token_accuracy": 0.8867610692977905, + "num_tokens": 20367080.0, + "step": 2278 + }, + { + "epoch": 1.7317629179331306, + "grad_norm": 3.5603582859039307, + "learning_rate": 2.072756494691809e-06, + "loss": 0.35600754618644714, + "mean_token_accuracy": 0.8781189918518066, + "num_tokens": 20370625.0, + "step": 2279 + }, + { + "epoch": 1.7325227963525835, + "grad_norm": 2.0948755741119385, + "learning_rate": 2.070693060761779e-06, + "loss": 0.3558604419231415, + "mean_token_accuracy": 0.902066707611084, + "num_tokens": 20376835.0, + "step": 2280 + }, + { + "epoch": 1.7332826747720365, + "grad_norm": 2.391188859939575, + "learning_rate": 2.0686299281358837e-06, + "loss": 0.36596938967704773, + "mean_token_accuracy": 0.8741272687911987, + "num_tokens": 20382282.0, + "step": 2281 + }, + { + "epoch": 1.7340425531914894, + "grad_norm": 1.6906369924545288, + "learning_rate": 2.0665670982621107e-06, + "loss": 0.5241266489028931, + "mean_token_accuracy": 0.8091107606887817, + "num_tokens": 20393736.0, + "step": 2282 + }, + { + "epoch": 1.7348024316109423, + "grad_norm": 1.7578394412994385, + "learning_rate": 2.0645045725882334e-06, + "loss": 0.37041786313056946, + "mean_token_accuracy": 0.8907113075256348, + "num_tokens": 20402715.0, + "step": 2283 + }, + { + "epoch": 1.7355623100303952, + "grad_norm": 2.191727638244629, + "learning_rate": 2.0624423525618097e-06, + "loss": 0.43301627039909363, + "mean_token_accuracy": 0.8706433773040771, + "num_tokens": 20409976.0, + "step": 2284 + }, + { + "epoch": 1.736322188449848, + "grad_norm": 1.958005666732788, + "learning_rate": 2.0603804396301875e-06, + "loss": 0.29002684354782104, + "mean_token_accuracy": 0.8914110660552979, + "num_tokens": 20417099.0, + "step": 2285 + }, + { + "epoch": 1.7370820668693008, + "grad_norm": 2.477837085723877, + "learning_rate": 2.058318835240495e-06, + "loss": 0.2953898310661316, + "mean_token_accuracy": 0.8975275754928589, + "num_tokens": 20422251.0, + "step": 2286 + }, + { + "epoch": 1.737841945288754, + "grad_norm": 2.156764268875122, + "learning_rate": 2.0562575408396475e-06, + "loss": 0.4063698649406433, + "mean_token_accuracy": 0.8497642278671265, + "num_tokens": 20429338.0, + "step": 2287 + }, + { + "epoch": 1.7386018237082066, + "grad_norm": 1.6748939752578735, + "learning_rate": 2.0541965578743373e-06, + "loss": 0.3272587060928345, + "mean_token_accuracy": 0.8646700382232666, + "num_tokens": 20439680.0, + "step": 2288 + }, + { + "epoch": 1.7393617021276597, + "grad_norm": 1.9948776960372925, + "learning_rate": 2.0521358877910446e-06, + "loss": 0.36843347549438477, + "mean_token_accuracy": 0.8613901138305664, + "num_tokens": 20448492.0, + "step": 2289 + }, + { + "epoch": 1.7401215805471124, + "grad_norm": 2.231428623199463, + "learning_rate": 2.0500755320360263e-06, + "loss": 0.3905152380466461, + "mean_token_accuracy": 0.8980990052223206, + "num_tokens": 20453945.0, + "step": 2290 + }, + { + "epoch": 1.7408814589665653, + "grad_norm": 2.2187650203704834, + "learning_rate": 2.048015492055319e-06, + "loss": 0.45920854806900024, + "mean_token_accuracy": 0.8282852172851562, + "num_tokens": 20462378.0, + "step": 2291 + }, + { + "epoch": 1.7416413373860182, + "grad_norm": 2.0668466091156006, + "learning_rate": 2.045955769294737e-06, + "loss": 0.3227751553058624, + "mean_token_accuracy": 0.8805934190750122, + "num_tokens": 20469822.0, + "step": 2292 + }, + { + "epoch": 1.7424012158054711, + "grad_norm": 1.9162774085998535, + "learning_rate": 2.0438963651998747e-06, + "loss": 0.4604800343513489, + "mean_token_accuracy": 0.8441175818443298, + "num_tokens": 20479099.0, + "step": 2293 + }, + { + "epoch": 1.743161094224924, + "grad_norm": 2.645329713821411, + "learning_rate": 2.0418372812161015e-06, + "loss": 0.3239654004573822, + "mean_token_accuracy": 0.8888648748397827, + "num_tokens": 20483926.0, + "step": 2294 + }, + { + "epoch": 1.743920972644377, + "grad_norm": 1.39468514919281, + "learning_rate": 2.03977851878856e-06, + "loss": 0.4003690183162689, + "mean_token_accuracy": 0.8769714832305908, + "num_tokens": 20496501.0, + "step": 2295 + }, + { + "epoch": 1.7446808510638299, + "grad_norm": 3.509174346923828, + "learning_rate": 2.0377200793621694e-06, + "loss": 0.2948213517665863, + "mean_token_accuracy": 0.8972329497337341, + "num_tokens": 20500000.0, + "step": 2296 + }, + { + "epoch": 1.7454407294832825, + "grad_norm": 1.5033894777297974, + "learning_rate": 2.0356619643816234e-06, + "loss": 0.40694737434387207, + "mean_token_accuracy": 0.8607243895530701, + "num_tokens": 20513473.0, + "step": 2297 + }, + { + "epoch": 1.7462006079027357, + "grad_norm": 1.4324895143508911, + "learning_rate": 2.0336041752913843e-06, + "loss": 0.3899157643318176, + "mean_token_accuracy": 0.858935534954071, + "num_tokens": 20524516.0, + "step": 2298 + }, + { + "epoch": 1.7469604863221884, + "grad_norm": 2.359544277191162, + "learning_rate": 2.031546713535688e-06, + "loss": 0.369213342666626, + "mean_token_accuracy": 0.8741403818130493, + "num_tokens": 20530421.0, + "step": 2299 + }, + { + "epoch": 1.7477203647416415, + "grad_norm": 2.282637357711792, + "learning_rate": 2.029489580558542e-06, + "loss": 0.3255441188812256, + "mean_token_accuracy": 0.9045462608337402, + "num_tokens": 20535954.0, + "step": 2300 + }, + { + "epoch": 1.7484802431610942, + "grad_norm": 1.7367198467254639, + "learning_rate": 2.0274327778037204e-06, + "loss": 0.43890488147735596, + "mean_token_accuracy": 0.8494667410850525, + "num_tokens": 20548638.0, + "step": 2301 + }, + { + "epoch": 1.749240121580547, + "grad_norm": 1.6236488819122314, + "learning_rate": 2.0253763067147657e-06, + "loss": 0.4440777897834778, + "mean_token_accuracy": 0.8414230942726135, + "num_tokens": 20559263.0, + "step": 2302 + }, + { + "epoch": 1.75, + "grad_norm": 1.3755455017089844, + "learning_rate": 2.0233201687349888e-06, + "loss": 0.3473797142505646, + "mean_token_accuracy": 0.8742472529411316, + "num_tokens": 20573109.0, + "step": 2303 + }, + { + "epoch": 1.750759878419453, + "grad_norm": 3.271153688430786, + "learning_rate": 2.0212643653074677e-06, + "loss": 0.4965784549713135, + "mean_token_accuracy": 0.8596988916397095, + "num_tokens": 20578525.0, + "step": 2304 + }, + { + "epoch": 1.7515197568389058, + "grad_norm": 2.6341168880462646, + "learning_rate": 2.019208897875043e-06, + "loss": 0.37775442004203796, + "mean_token_accuracy": 0.8721816539764404, + "num_tokens": 20583641.0, + "step": 2305 + }, + { + "epoch": 1.7522796352583585, + "grad_norm": 1.8308569192886353, + "learning_rate": 2.0171537678803222e-06, + "loss": 0.3243415355682373, + "mean_token_accuracy": 0.8837124109268188, + "num_tokens": 20591725.0, + "step": 2306 + }, + { + "epoch": 1.7530395136778116, + "grad_norm": 2.4362998008728027, + "learning_rate": 2.015098976765673e-06, + "loss": 0.3738787770271301, + "mean_token_accuracy": 0.8974303007125854, + "num_tokens": 20596587.0, + "step": 2307 + }, + { + "epoch": 1.7537993920972643, + "grad_norm": 3.2920920848846436, + "learning_rate": 2.0130445259732282e-06, + "loss": 0.33901530504226685, + "mean_token_accuracy": 0.9019063115119934, + "num_tokens": 20600379.0, + "step": 2308 + }, + { + "epoch": 1.7545592705167175, + "grad_norm": 1.290475606918335, + "learning_rate": 2.01099041694488e-06, + "loss": 0.37150678038597107, + "mean_token_accuracy": 0.8542044758796692, + "num_tokens": 20614340.0, + "step": 2309 + }, + { + "epoch": 1.7553191489361701, + "grad_norm": 2.7794933319091797, + "learning_rate": 2.0089366511222815e-06, + "loss": 0.3746095895767212, + "mean_token_accuracy": 0.8653185367584229, + "num_tokens": 20622056.0, + "step": 2310 + }, + { + "epoch": 1.756079027355623, + "grad_norm": 2.2112278938293457, + "learning_rate": 2.006883229946843e-06, + "loss": 0.35793858766555786, + "mean_token_accuracy": 0.875727653503418, + "num_tokens": 20628930.0, + "step": 2311 + }, + { + "epoch": 1.756838905775076, + "grad_norm": 1.5240603685379028, + "learning_rate": 2.0048301548597365e-06, + "loss": 0.512831449508667, + "mean_token_accuracy": 0.8139172792434692, + "num_tokens": 20643159.0, + "step": 2312 + }, + { + "epoch": 1.7575987841945289, + "grad_norm": 1.810485601425171, + "learning_rate": 2.0027774273018894e-06, + "loss": 0.43870818614959717, + "mean_token_accuracy": 0.8313089609146118, + "num_tokens": 20651914.0, + "step": 2313 + }, + { + "epoch": 1.7583586626139818, + "grad_norm": 1.748178243637085, + "learning_rate": 2.0007250487139827e-06, + "loss": 0.42277514934539795, + "mean_token_accuracy": 0.8463197946548462, + "num_tokens": 20660054.0, + "step": 2314 + }, + { + "epoch": 1.7591185410334347, + "grad_norm": 1.511717677116394, + "learning_rate": 1.998673020536456e-06, + "loss": 0.38304439187049866, + "mean_token_accuracy": 0.8508470058441162, + "num_tokens": 20673371.0, + "step": 2315 + }, + { + "epoch": 1.7598784194528876, + "grad_norm": 1.7790700197219849, + "learning_rate": 1.996621344209503e-06, + "loss": 0.3838311433792114, + "mean_token_accuracy": 0.8676829934120178, + "num_tokens": 20682072.0, + "step": 2316 + }, + { + "epoch": 1.7606382978723403, + "grad_norm": 1.9128468036651611, + "learning_rate": 1.994570021173067e-06, + "loss": 0.40384364128112793, + "mean_token_accuracy": 0.8747294545173645, + "num_tokens": 20689000.0, + "step": 2317 + }, + { + "epoch": 1.7613981762917934, + "grad_norm": 3.286569118499756, + "learning_rate": 1.9925190528668455e-06, + "loss": 0.38019680976867676, + "mean_token_accuracy": 0.8678069114685059, + "num_tokens": 20692763.0, + "step": 2318 + }, + { + "epoch": 1.762158054711246, + "grad_norm": 1.6108927726745605, + "learning_rate": 1.990468440730288e-06, + "loss": 0.3144170045852661, + "mean_token_accuracy": 0.8695170879364014, + "num_tokens": 20702620.0, + "step": 2319 + }, + { + "epoch": 1.7629179331306992, + "grad_norm": 3.185225009918213, + "learning_rate": 1.9884181862025938e-06, + "loss": 0.41619348526000977, + "mean_token_accuracy": 0.8543670177459717, + "num_tokens": 20706857.0, + "step": 2320 + }, + { + "epoch": 1.763677811550152, + "grad_norm": 2.3699469566345215, + "learning_rate": 1.986368290722709e-06, + "loss": 0.5115842819213867, + "mean_token_accuracy": 0.8141909837722778, + "num_tokens": 20713997.0, + "step": 2321 + }, + { + "epoch": 1.7644376899696048, + "grad_norm": 1.4449706077575684, + "learning_rate": 1.9843187557293286e-06, + "loss": 0.419655442237854, + "mean_token_accuracy": 0.8545533418655396, + "num_tokens": 20726548.0, + "step": 2322 + }, + { + "epoch": 1.7651975683890577, + "grad_norm": 2.127614974975586, + "learning_rate": 1.9822695826608975e-06, + "loss": 0.43722522258758545, + "mean_token_accuracy": 0.8542283773422241, + "num_tokens": 20733469.0, + "step": 2323 + }, + { + "epoch": 1.7659574468085106, + "grad_norm": 3.3081557750701904, + "learning_rate": 1.9802207729556023e-06, + "loss": 0.30904972553253174, + "mean_token_accuracy": 0.8896352648735046, + "num_tokens": 20737190.0, + "step": 2324 + }, + { + "epoch": 1.7667173252279635, + "grad_norm": 2.603506326675415, + "learning_rate": 1.978172328051377e-06, + "loss": 0.30952537059783936, + "mean_token_accuracy": 0.8868587017059326, + "num_tokens": 20741780.0, + "step": 2325 + }, + { + "epoch": 1.7674772036474165, + "grad_norm": 2.576824903488159, + "learning_rate": 1.9761242493858987e-06, + "loss": 0.29593953490257263, + "mean_token_accuracy": 0.888198733329773, + "num_tokens": 20746324.0, + "step": 2326 + }, + { + "epoch": 1.7682370820668694, + "grad_norm": 1.6168320178985596, + "learning_rate": 1.9740765383965894e-06, + "loss": 0.5093998908996582, + "mean_token_accuracy": 0.8301646709442139, + "num_tokens": 20760140.0, + "step": 2327 + }, + { + "epoch": 1.768996960486322, + "grad_norm": 2.1162400245666504, + "learning_rate": 1.9720291965206097e-06, + "loss": 0.36714404821395874, + "mean_token_accuracy": 0.8699671626091003, + "num_tokens": 20766961.0, + "step": 2328 + }, + { + "epoch": 1.7697568389057752, + "grad_norm": 1.046911597251892, + "learning_rate": 1.969982225194864e-06, + "loss": 0.40783989429473877, + "mean_token_accuracy": 0.8474892377853394, + "num_tokens": 20786737.0, + "step": 2329 + }, + { + "epoch": 1.7705167173252279, + "grad_norm": 1.7059568166732788, + "learning_rate": 1.9679356258559943e-06, + "loss": 0.44083845615386963, + "mean_token_accuracy": 0.841221034526825, + "num_tokens": 20798907.0, + "step": 2330 + }, + { + "epoch": 1.771276595744681, + "grad_norm": 1.5157767534255981, + "learning_rate": 1.9658893999403847e-06, + "loss": 0.4671107828617096, + "mean_token_accuracy": 0.8252813816070557, + "num_tokens": 20814304.0, + "step": 2331 + }, + { + "epoch": 1.7720364741641337, + "grad_norm": 2.1340525150299072, + "learning_rate": 1.9638435488841543e-06, + "loss": 0.4088709354400635, + "mean_token_accuracy": 0.8595127463340759, + "num_tokens": 20821827.0, + "step": 2332 + }, + { + "epoch": 1.7727963525835866, + "grad_norm": 1.948072910308838, + "learning_rate": 1.96179807412316e-06, + "loss": 0.3692860007286072, + "mean_token_accuracy": 0.8678920269012451, + "num_tokens": 20828612.0, + "step": 2333 + }, + { + "epoch": 1.7735562310030395, + "grad_norm": 1.5731977224349976, + "learning_rate": 1.959752977092995e-06, + "loss": 0.3743135929107666, + "mean_token_accuracy": 0.8723479509353638, + "num_tokens": 20838497.0, + "step": 2334 + }, + { + "epoch": 1.7743161094224924, + "grad_norm": 1.5506012439727783, + "learning_rate": 1.957708259228987e-06, + "loss": 0.4403391182422638, + "mean_token_accuracy": 0.854604959487915, + "num_tokens": 20851603.0, + "step": 2335 + }, + { + "epoch": 1.7750759878419453, + "grad_norm": 1.154336929321289, + "learning_rate": 1.9556639219661983e-06, + "loss": 0.5281188488006592, + "mean_token_accuracy": 0.8101300001144409, + "num_tokens": 20875661.0, + "step": 2336 + }, + { + "epoch": 1.7758358662613982, + "grad_norm": 4.720771312713623, + "learning_rate": 1.9536199667394217e-06, + "loss": 0.44419822096824646, + "mean_token_accuracy": 0.8740090131759644, + "num_tokens": 20886971.0, + "step": 2337 + }, + { + "epoch": 1.7765957446808511, + "grad_norm": 1.5492230653762817, + "learning_rate": 1.9515763949831852e-06, + "loss": 0.4538637697696686, + "mean_token_accuracy": 0.8362185955047607, + "num_tokens": 20899212.0, + "step": 2338 + }, + { + "epoch": 1.7773556231003038, + "grad_norm": 1.354101538658142, + "learning_rate": 1.9495332081317466e-06, + "loss": 0.4341534376144409, + "mean_token_accuracy": 0.8380170464515686, + "num_tokens": 20913065.0, + "step": 2339 + }, + { + "epoch": 1.778115501519757, + "grad_norm": 1.5805599689483643, + "learning_rate": 1.947490407619092e-06, + "loss": 0.40928739309310913, + "mean_token_accuracy": 0.8524469137191772, + "num_tokens": 20922919.0, + "step": 2340 + }, + { + "epoch": 1.7788753799392096, + "grad_norm": 2.097221851348877, + "learning_rate": 1.945447994878937e-06, + "loss": 0.4816104769706726, + "mean_token_accuracy": 0.888654351234436, + "num_tokens": 20931350.0, + "step": 2341 + }, + { + "epoch": 1.7796352583586628, + "grad_norm": 1.7193297147750854, + "learning_rate": 1.9434059713447264e-06, + "loss": 0.44925639033317566, + "mean_token_accuracy": 0.8500319123268127, + "num_tokens": 20940546.0, + "step": 2342 + }, + { + "epoch": 1.7803951367781155, + "grad_norm": 1.5971747636795044, + "learning_rate": 1.9413643384496315e-06, + "loss": 0.29559412598609924, + "mean_token_accuracy": 0.8871279954910278, + "num_tokens": 20950604.0, + "step": 2343 + }, + { + "epoch": 1.7811550151975684, + "grad_norm": 2.788029670715332, + "learning_rate": 1.9393230976265478e-06, + "loss": 0.31713539361953735, + "mean_token_accuracy": 0.8866176605224609, + "num_tokens": 20955296.0, + "step": 2344 + }, + { + "epoch": 1.7819148936170213, + "grad_norm": 1.5747952461242676, + "learning_rate": 1.937282250308096e-06, + "loss": 0.41813358664512634, + "mean_token_accuracy": 0.8418053984642029, + "num_tokens": 20967664.0, + "step": 2345 + }, + { + "epoch": 1.7826747720364742, + "grad_norm": 2.0813145637512207, + "learning_rate": 1.935241797926623e-06, + "loss": 0.39056286215782166, + "mean_token_accuracy": 0.8601781129837036, + "num_tokens": 20975895.0, + "step": 2346 + }, + { + "epoch": 1.783434650455927, + "grad_norm": 2.143022298812866, + "learning_rate": 1.933201741914196e-06, + "loss": 0.40797823667526245, + "mean_token_accuracy": 0.8846398591995239, + "num_tokens": 20983683.0, + "step": 2347 + }, + { + "epoch": 1.78419452887538, + "grad_norm": 1.8451775312423706, + "learning_rate": 1.931162083702606e-06, + "loss": 0.34083136916160583, + "mean_token_accuracy": 0.8643462657928467, + "num_tokens": 20992621.0, + "step": 2348 + }, + { + "epoch": 1.784954407294833, + "grad_norm": 1.8603935241699219, + "learning_rate": 1.9291228247233607e-06, + "loss": 0.4860231280326843, + "mean_token_accuracy": 0.8391251564025879, + "num_tokens": 21002427.0, + "step": 2349 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 2.751711845397949, + "learning_rate": 1.9270839664076937e-06, + "loss": 0.30588358640670776, + "mean_token_accuracy": 0.8836315274238586, + "num_tokens": 21006898.0, + "step": 2350 + }, + { + "epoch": 1.7864741641337387, + "grad_norm": 1.0335345268249512, + "learning_rate": 1.9250455101865526e-06, + "loss": 0.3119634985923767, + "mean_token_accuracy": 0.8912283182144165, + "num_tokens": 21024930.0, + "step": 2351 + }, + { + "epoch": 1.7872340425531914, + "grad_norm": 2.4693806171417236, + "learning_rate": 1.9230074574906043e-06, + "loss": 0.1976669877767563, + "mean_token_accuracy": 0.928974986076355, + "num_tokens": 21029027.0, + "step": 2352 + }, + { + "epoch": 1.7879939209726445, + "grad_norm": 1.2892690896987915, + "learning_rate": 1.920969809750234e-06, + "loss": 0.46008217334747314, + "mean_token_accuracy": 0.8299605846405029, + "num_tokens": 21047671.0, + "step": 2353 + }, + { + "epoch": 1.7887537993920972, + "grad_norm": 3.162534713745117, + "learning_rate": 1.91893256839554e-06, + "loss": 0.2916071116924286, + "mean_token_accuracy": 0.8932807445526123, + "num_tokens": 21051555.0, + "step": 2354 + }, + { + "epoch": 1.7895136778115501, + "grad_norm": 1.7627713680267334, + "learning_rate": 1.916895734856338e-06, + "loss": 0.3223535120487213, + "mean_token_accuracy": 0.8852578401565552, + "num_tokens": 21060056.0, + "step": 2355 + }, + { + "epoch": 1.790273556231003, + "grad_norm": 1.9448071718215942, + "learning_rate": 1.9148593105621542e-06, + "loss": 0.3650452196598053, + "mean_token_accuracy": 0.8709862232208252, + "num_tokens": 21067190.0, + "step": 2356 + }, + { + "epoch": 1.791033434650456, + "grad_norm": 2.026644229888916, + "learning_rate": 1.9128232969422318e-06, + "loss": 0.3620566427707672, + "mean_token_accuracy": 0.865707516670227, + "num_tokens": 21075197.0, + "step": 2357 + }, + { + "epoch": 1.7917933130699089, + "grad_norm": 2.2628564834594727, + "learning_rate": 1.9107876954255217e-06, + "loss": 0.353444367647171, + "mean_token_accuracy": 0.8590385913848877, + "num_tokens": 21080823.0, + "step": 2358 + }, + { + "epoch": 1.7925531914893615, + "grad_norm": 2.5959067344665527, + "learning_rate": 1.908752507440689e-06, + "loss": 0.43711763620376587, + "mean_token_accuracy": 0.8539710640907288, + "num_tokens": 21086016.0, + "step": 2359 + }, + { + "epoch": 1.7933130699088147, + "grad_norm": 1.6228864192962646, + "learning_rate": 1.906717734416105e-06, + "loss": 0.38630396127700806, + "mean_token_accuracy": 0.8611987829208374, + "num_tokens": 21096573.0, + "step": 2360 + }, + { + "epoch": 1.7940729483282674, + "grad_norm": 1.8471404314041138, + "learning_rate": 1.9046833777798534e-06, + "loss": 0.46608641743659973, + "mean_token_accuracy": 0.8782031536102295, + "num_tokens": 21105817.0, + "step": 2361 + }, + { + "epoch": 1.7948328267477205, + "grad_norm": 2.6532235145568848, + "learning_rate": 1.9026494389597239e-06, + "loss": 0.3310372829437256, + "mean_token_accuracy": 0.8781720399856567, + "num_tokens": 21111192.0, + "step": 2362 + }, + { + "epoch": 1.7955927051671732, + "grad_norm": 2.172534942626953, + "learning_rate": 1.9006159193832124e-06, + "loss": 0.49921661615371704, + "mean_token_accuracy": 0.8215196132659912, + "num_tokens": 21117878.0, + "step": 2363 + }, + { + "epoch": 1.7963525835866263, + "grad_norm": 1.6507720947265625, + "learning_rate": 1.8985828204775206e-06, + "loss": 0.4189162850379944, + "mean_token_accuracy": 0.8520572185516357, + "num_tokens": 21128287.0, + "step": 2364 + }, + { + "epoch": 1.797112462006079, + "grad_norm": 1.5932034254074097, + "learning_rate": 1.8965501436695578e-06, + "loss": 0.45531854033470154, + "mean_token_accuracy": 0.8391242027282715, + "num_tokens": 21140605.0, + "step": 2365 + }, + { + "epoch": 1.797872340425532, + "grad_norm": 2.4680638313293457, + "learning_rate": 1.894517890385933e-06, + "loss": 0.41174983978271484, + "mean_token_accuracy": 0.8616886138916016, + "num_tokens": 21147045.0, + "step": 2366 + }, + { + "epoch": 1.7986322188449848, + "grad_norm": 1.61875319480896, + "learning_rate": 1.8924860620529594e-06, + "loss": 0.47573935985565186, + "mean_token_accuracy": 0.8347671031951904, + "num_tokens": 21157253.0, + "step": 2367 + }, + { + "epoch": 1.7993920972644377, + "grad_norm": 3.4389333724975586, + "learning_rate": 1.8904546600966539e-06, + "loss": 0.34975939989089966, + "mean_token_accuracy": 0.8915865421295166, + "num_tokens": 21160486.0, + "step": 2368 + }, + { + "epoch": 1.8001519756838906, + "grad_norm": 2.0069527626037598, + "learning_rate": 1.888423685942732e-06, + "loss": 0.379585325717926, + "mean_token_accuracy": 0.8605983257293701, + "num_tokens": 21168016.0, + "step": 2369 + }, + { + "epoch": 1.8009118541033433, + "grad_norm": 3.0740530490875244, + "learning_rate": 1.886393141016609e-06, + "loss": 0.5244829058647156, + "mean_token_accuracy": 0.8282772302627563, + "num_tokens": 21172851.0, + "step": 2370 + }, + { + "epoch": 1.8016717325227964, + "grad_norm": 1.5724968910217285, + "learning_rate": 1.8843630267434e-06, + "loss": 0.2020694762468338, + "mean_token_accuracy": 0.8882503509521484, + "num_tokens": 21179866.0, + "step": 2371 + }, + { + "epoch": 1.8024316109422491, + "grad_norm": 2.1539509296417236, + "learning_rate": 1.8823333445479175e-06, + "loss": 0.37903186678886414, + "mean_token_accuracy": 0.8525497317314148, + "num_tokens": 21186941.0, + "step": 2372 + }, + { + "epoch": 1.8031914893617023, + "grad_norm": 2.0247764587402344, + "learning_rate": 1.8803040958546708e-06, + "loss": 0.293364018201828, + "mean_token_accuracy": 0.8954306244850159, + "num_tokens": 21193659.0, + "step": 2373 + }, + { + "epoch": 1.803951367781155, + "grad_norm": 1.7034926414489746, + "learning_rate": 1.8782752820878636e-06, + "loss": 0.33828210830688477, + "mean_token_accuracy": 0.9032940864562988, + "num_tokens": 21201399.0, + "step": 2374 + }, + { + "epoch": 1.8047112462006079, + "grad_norm": 1.7864601612091064, + "learning_rate": 1.8762469046713954e-06, + "loss": 0.3165147006511688, + "mean_token_accuracy": 0.8997465372085571, + "num_tokens": 21209105.0, + "step": 2375 + }, + { + "epoch": 1.8054711246200608, + "grad_norm": 2.3371729850769043, + "learning_rate": 1.8742189650288617e-06, + "loss": 0.4036901593208313, + "mean_token_accuracy": 0.8549420833587646, + "num_tokens": 21215429.0, + "step": 2376 + }, + { + "epoch": 1.8062310030395137, + "grad_norm": 1.7922348976135254, + "learning_rate": 1.872191464583547e-06, + "loss": 0.4366671144962311, + "mean_token_accuracy": 0.8614166975021362, + "num_tokens": 21226823.0, + "step": 2377 + }, + { + "epoch": 1.8069908814589666, + "grad_norm": 2.1667943000793457, + "learning_rate": 1.8701644047584294e-06, + "loss": 0.3543647825717926, + "mean_token_accuracy": 0.9031318426132202, + "num_tokens": 21232823.0, + "step": 2378 + }, + { + "epoch": 1.8077507598784195, + "grad_norm": 1.7554421424865723, + "learning_rate": 1.868137786976177e-06, + "loss": 0.32704365253448486, + "mean_token_accuracy": 0.8990532755851746, + "num_tokens": 21242036.0, + "step": 2379 + }, + { + "epoch": 1.8085106382978724, + "grad_norm": 1.6723839044570923, + "learning_rate": 1.8661116126591492e-06, + "loss": 0.3665752410888672, + "mean_token_accuracy": 0.8828305006027222, + "num_tokens": 21251290.0, + "step": 2380 + }, + { + "epoch": 1.809270516717325, + "grad_norm": 1.5078409910202026, + "learning_rate": 1.8640858832293924e-06, + "loss": 0.368108332157135, + "mean_token_accuracy": 0.8720884323120117, + "num_tokens": 21263510.0, + "step": 2381 + }, + { + "epoch": 1.8100303951367782, + "grad_norm": 2.245553493499756, + "learning_rate": 1.8620606001086423e-06, + "loss": 0.3189915716648102, + "mean_token_accuracy": 0.9015103578567505, + "num_tokens": 21269690.0, + "step": 2382 + }, + { + "epoch": 1.810790273556231, + "grad_norm": 1.780027151107788, + "learning_rate": 1.8600357647183188e-06, + "loss": 0.40369710326194763, + "mean_token_accuracy": 0.8539618253707886, + "num_tokens": 21278523.0, + "step": 2383 + }, + { + "epoch": 1.811550151975684, + "grad_norm": 2.1727912425994873, + "learning_rate": 1.8580113784795306e-06, + "loss": 0.29285651445388794, + "mean_token_accuracy": 0.8954071998596191, + "num_tokens": 21284717.0, + "step": 2384 + }, + { + "epoch": 1.8123100303951367, + "grad_norm": 2.310225248336792, + "learning_rate": 1.8559874428130708e-06, + "loss": 0.3090948760509491, + "mean_token_accuracy": 0.8853784203529358, + "num_tokens": 21290484.0, + "step": 2385 + }, + { + "epoch": 1.8130699088145896, + "grad_norm": 1.6556873321533203, + "learning_rate": 1.8539639591394131e-06, + "loss": 0.4425269663333893, + "mean_token_accuracy": 0.8488757610321045, + "num_tokens": 21302588.0, + "step": 2386 + }, + { + "epoch": 1.8138297872340425, + "grad_norm": 1.9238256216049194, + "learning_rate": 1.8519409288787182e-06, + "loss": 0.4781329929828644, + "mean_token_accuracy": 0.8392970561981201, + "num_tokens": 21310598.0, + "step": 2387 + }, + { + "epoch": 1.8145896656534954, + "grad_norm": 1.4976142644882202, + "learning_rate": 1.8499183534508263e-06, + "loss": 0.36829859018325806, + "mean_token_accuracy": 0.8687542676925659, + "num_tokens": 21322668.0, + "step": 2388 + }, + { + "epoch": 1.8153495440729484, + "grad_norm": 2.0216941833496094, + "learning_rate": 1.8478962342752584e-06, + "loss": 0.385962575674057, + "mean_token_accuracy": 0.8908089399337769, + "num_tokens": 21330378.0, + "step": 2389 + }, + { + "epoch": 1.8161094224924013, + "grad_norm": 1.647863507270813, + "learning_rate": 1.8458745727712142e-06, + "loss": 0.30903705954551697, + "mean_token_accuracy": 0.8914397954940796, + "num_tokens": 21339932.0, + "step": 2390 + }, + { + "epoch": 1.8168693009118542, + "grad_norm": 1.5832399129867554, + "learning_rate": 1.8438533703575757e-06, + "loss": 0.3636384606361389, + "mean_token_accuracy": 0.8611595630645752, + "num_tokens": 21351557.0, + "step": 2391 + }, + { + "epoch": 1.8176291793313069, + "grad_norm": 3.0069241523742676, + "learning_rate": 1.8418326284528997e-06, + "loss": 0.37970617413520813, + "mean_token_accuracy": 0.8620643615722656, + "num_tokens": 21355704.0, + "step": 2392 + }, + { + "epoch": 1.81838905775076, + "grad_norm": 2.004526376724243, + "learning_rate": 1.8398123484754204e-06, + "loss": 0.5333225131034851, + "mean_token_accuracy": 0.8062554597854614, + "num_tokens": 21364640.0, + "step": 2393 + }, + { + "epoch": 1.8191489361702127, + "grad_norm": 1.449981689453125, + "learning_rate": 1.8377925318430478e-06, + "loss": 0.3736325800418854, + "mean_token_accuracy": 0.858788251876831, + "num_tokens": 21377025.0, + "step": 2394 + }, + { + "epoch": 1.8199088145896658, + "grad_norm": 1.1959524154663086, + "learning_rate": 1.8357731799733686e-06, + "loss": 0.3272058963775635, + "mean_token_accuracy": 0.8840590715408325, + "num_tokens": 21395378.0, + "step": 2395 + }, + { + "epoch": 1.8206686930091185, + "grad_norm": 2.134742498397827, + "learning_rate": 1.8337542942836406e-06, + "loss": 0.3737856149673462, + "mean_token_accuracy": 0.8674061298370361, + "num_tokens": 21402106.0, + "step": 2396 + }, + { + "epoch": 1.8214285714285714, + "grad_norm": 2.2179460525512695, + "learning_rate": 1.8317358761907945e-06, + "loss": 0.37301796674728394, + "mean_token_accuracy": 0.8605623245239258, + "num_tokens": 21408367.0, + "step": 2397 + }, + { + "epoch": 1.8221884498480243, + "grad_norm": 2.1718010902404785, + "learning_rate": 1.8297179271114345e-06, + "loss": 0.2772231101989746, + "mean_token_accuracy": 0.8997501730918884, + "num_tokens": 21414274.0, + "step": 2398 + }, + { + "epoch": 1.8229483282674772, + "grad_norm": 1.410933494567871, + "learning_rate": 1.827700448461836e-06, + "loss": 0.4834601581096649, + "mean_token_accuracy": 0.8382522463798523, + "num_tokens": 21429120.0, + "step": 2399 + }, + { + "epoch": 1.8237082066869301, + "grad_norm": 3.4779679775238037, + "learning_rate": 1.8256834416579423e-06, + "loss": 0.44643428921699524, + "mean_token_accuracy": 0.8308249711990356, + "num_tokens": 21432437.0, + "step": 2400 + }, + { + "epoch": 1.824468085106383, + "grad_norm": 1.374484658241272, + "learning_rate": 1.8236669081153657e-06, + "loss": 0.3947869837284088, + "mean_token_accuracy": 0.8605848550796509, + "num_tokens": 21445656.0, + "step": 2401 + }, + { + "epoch": 1.825227963525836, + "grad_norm": 1.9599316120147705, + "learning_rate": 1.8216508492493887e-06, + "loss": 0.49040719866752625, + "mean_token_accuracy": 0.839459240436554, + "num_tokens": 21452889.0, + "step": 2402 + }, + { + "epoch": 1.8259878419452886, + "grad_norm": 2.1267881393432617, + "learning_rate": 1.8196352664749578e-06, + "loss": 0.3233179450035095, + "mean_token_accuracy": 0.8841243386268616, + "num_tokens": 21458788.0, + "step": 2403 + }, + { + "epoch": 1.8267477203647418, + "grad_norm": 2.6356115341186523, + "learning_rate": 1.8176201612066874e-06, + "loss": 0.43436336517333984, + "mean_token_accuracy": 0.850265622138977, + "num_tokens": 21464305.0, + "step": 2404 + }, + { + "epoch": 1.8275075987841944, + "grad_norm": 2.0232386589050293, + "learning_rate": 1.8156055348588548e-06, + "loss": 0.37281763553619385, + "mean_token_accuracy": 0.8616300821304321, + "num_tokens": 21471722.0, + "step": 2405 + }, + { + "epoch": 1.8282674772036476, + "grad_norm": 3.2616260051727295, + "learning_rate": 1.8135913888454034e-06, + "loss": 0.2882898151874542, + "mean_token_accuracy": 0.9001147747039795, + "num_tokens": 21475400.0, + "step": 2406 + }, + { + "epoch": 1.8290273556231003, + "grad_norm": 2.1665611267089844, + "learning_rate": 1.8115777245799383e-06, + "loss": 0.45269185304641724, + "mean_token_accuracy": 0.8420798778533936, + "num_tokens": 21481827.0, + "step": 2407 + }, + { + "epoch": 1.8297872340425532, + "grad_norm": 1.4406569004058838, + "learning_rate": 1.8095645434757261e-06, + "loss": 0.43665701150894165, + "mean_token_accuracy": 0.8401381969451904, + "num_tokens": 21496441.0, + "step": 2408 + }, + { + "epoch": 1.830547112462006, + "grad_norm": 1.6756342649459839, + "learning_rate": 1.8075518469456944e-06, + "loss": 0.3521783947944641, + "mean_token_accuracy": 0.8737466335296631, + "num_tokens": 21505568.0, + "step": 2409 + }, + { + "epoch": 1.831306990881459, + "grad_norm": 1.6623140573501587, + "learning_rate": 1.8055396364024318e-06, + "loss": 0.344537615776062, + "mean_token_accuracy": 0.886972188949585, + "num_tokens": 21513252.0, + "step": 2410 + }, + { + "epoch": 1.832066869300912, + "grad_norm": 2.064835548400879, + "learning_rate": 1.803527913258186e-06, + "loss": 0.3252706229686737, + "mean_token_accuracy": 0.885245680809021, + "num_tokens": 21520242.0, + "step": 2411 + }, + { + "epoch": 1.8328267477203646, + "grad_norm": 1.9969112873077393, + "learning_rate": 1.8015166789248606e-06, + "loss": 0.34694376587867737, + "mean_token_accuracy": 0.8818766474723816, + "num_tokens": 21527524.0, + "step": 2412 + }, + { + "epoch": 1.8335866261398177, + "grad_norm": 2.086148977279663, + "learning_rate": 1.7995059348140165e-06, + "loss": 0.23109188675880432, + "mean_token_accuracy": 0.912773609161377, + "num_tokens": 21532829.0, + "step": 2413 + }, + { + "epoch": 1.8343465045592704, + "grad_norm": 1.80828058719635, + "learning_rate": 1.7974956823368728e-06, + "loss": 0.5422223210334778, + "mean_token_accuracy": 0.8058640956878662, + "num_tokens": 21544440.0, + "step": 2414 + }, + { + "epoch": 1.8351063829787235, + "grad_norm": 1.8121788501739502, + "learning_rate": 1.7954859229043017e-06, + "loss": 0.3674035668373108, + "mean_token_accuracy": 0.8628277778625488, + "num_tokens": 21553160.0, + "step": 2415 + }, + { + "epoch": 1.8358662613981762, + "grad_norm": 1.9307979345321655, + "learning_rate": 1.7934766579268292e-06, + "loss": 0.4528796672821045, + "mean_token_accuracy": 0.8328302502632141, + "num_tokens": 21563485.0, + "step": 2416 + }, + { + "epoch": 1.8366261398176293, + "grad_norm": 1.2312756776809692, + "learning_rate": 1.7914678888146347e-06, + "loss": 0.40424543619155884, + "mean_token_accuracy": 0.8571025133132935, + "num_tokens": 21582662.0, + "step": 2417 + }, + { + "epoch": 1.837386018237082, + "grad_norm": 1.6305770874023438, + "learning_rate": 1.7894596169775514e-06, + "loss": 0.36575305461883545, + "mean_token_accuracy": 0.8768579959869385, + "num_tokens": 21592930.0, + "step": 2418 + }, + { + "epoch": 1.838145896656535, + "grad_norm": 1.8107178211212158, + "learning_rate": 1.7874518438250598e-06, + "loss": 0.3260963261127472, + "mean_token_accuracy": 0.896018385887146, + "num_tokens": 21600509.0, + "step": 2419 + }, + { + "epoch": 1.8389057750759878, + "grad_norm": 2.7195847034454346, + "learning_rate": 1.785444570766293e-06, + "loss": 0.2728347182273865, + "mean_token_accuracy": 0.9178709983825684, + "num_tokens": 21604489.0, + "step": 2420 + }, + { + "epoch": 1.8396656534954408, + "grad_norm": 1.9783591032028198, + "learning_rate": 1.7834377992100332e-06, + "loss": 0.3136378526687622, + "mean_token_accuracy": 0.8844017386436462, + "num_tokens": 21612060.0, + "step": 2421 + }, + { + "epoch": 1.8404255319148937, + "grad_norm": 2.1911418437957764, + "learning_rate": 1.7814315305647095e-06, + "loss": 0.39013993740081787, + "mean_token_accuracy": 0.8688976764678955, + "num_tokens": 21618778.0, + "step": 2422 + }, + { + "epoch": 1.8411854103343464, + "grad_norm": 1.9143604040145874, + "learning_rate": 1.779425766238398e-06, + "loss": 0.5113036632537842, + "mean_token_accuracy": 0.8329141139984131, + "num_tokens": 21628976.0, + "step": 2423 + }, + { + "epoch": 1.8419452887537995, + "grad_norm": 1.4184197187423706, + "learning_rate": 1.7774205076388207e-06, + "loss": 0.3821067810058594, + "mean_token_accuracy": 0.8604007959365845, + "num_tokens": 21643145.0, + "step": 2424 + }, + { + "epoch": 1.8427051671732522, + "grad_norm": 2.45896577835083, + "learning_rate": 1.7754157561733476e-06, + "loss": 0.3004961311817169, + "mean_token_accuracy": 0.89884352684021, + "num_tokens": 21647441.0, + "step": 2425 + }, + { + "epoch": 1.8434650455927053, + "grad_norm": 1.7999277114868164, + "learning_rate": 1.7734115132489887e-06, + "loss": 0.42533132433891296, + "mean_token_accuracy": 0.8838746547698975, + "num_tokens": 21657445.0, + "step": 2426 + }, + { + "epoch": 1.844224924012158, + "grad_norm": 2.099728584289551, + "learning_rate": 1.7714077802723994e-06, + "loss": 0.36200380325317383, + "mean_token_accuracy": 0.86548912525177, + "num_tokens": 21663966.0, + "step": 2427 + }, + { + "epoch": 1.844984802431611, + "grad_norm": 2.1970369815826416, + "learning_rate": 1.7694045586498754e-06, + "loss": 0.34944331645965576, + "mean_token_accuracy": 0.8670865297317505, + "num_tokens": 21670051.0, + "step": 2428 + }, + { + "epoch": 1.8457446808510638, + "grad_norm": 2.2928519248962402, + "learning_rate": 1.7674018497873568e-06, + "loss": 0.39500880241394043, + "mean_token_accuracy": 0.8744652271270752, + "num_tokens": 21676054.0, + "step": 2429 + }, + { + "epoch": 1.8465045592705167, + "grad_norm": 1.7598960399627686, + "learning_rate": 1.7653996550904208e-06, + "loss": 0.40113672614097595, + "mean_token_accuracy": 0.8552819490432739, + "num_tokens": 21685514.0, + "step": 2430 + }, + { + "epoch": 1.8472644376899696, + "grad_norm": 2.0529749393463135, + "learning_rate": 1.7633979759642844e-06, + "loss": 0.47586584091186523, + "mean_token_accuracy": 0.8412872552871704, + "num_tokens": 21693282.0, + "step": 2431 + }, + { + "epoch": 1.8480243161094225, + "grad_norm": 2.2423181533813477, + "learning_rate": 1.7613968138138027e-06, + "loss": 0.2757381796836853, + "mean_token_accuracy": 0.8992017507553101, + "num_tokens": 21698439.0, + "step": 2432 + }, + { + "epoch": 1.8487841945288754, + "grad_norm": 1.3280467987060547, + "learning_rate": 1.7593961700434692e-06, + "loss": 0.29535043239593506, + "mean_token_accuracy": 0.8943840861320496, + "num_tokens": 21711823.0, + "step": 2433 + }, + { + "epoch": 1.8495440729483281, + "grad_norm": 2.589221715927124, + "learning_rate": 1.7573960460574133e-06, + "loss": 0.46775516867637634, + "mean_token_accuracy": 0.8654797673225403, + "num_tokens": 21717180.0, + "step": 2434 + }, + { + "epoch": 1.8503039513677813, + "grad_norm": 2.1137642860412598, + "learning_rate": 1.7553964432593976e-06, + "loss": 0.3808780610561371, + "mean_token_accuracy": 0.8759565353393555, + "num_tokens": 21723980.0, + "step": 2435 + }, + { + "epoch": 1.851063829787234, + "grad_norm": 2.386967182159424, + "learning_rate": 1.75339736305282e-06, + "loss": 0.42688336968421936, + "mean_token_accuracy": 0.8488960266113281, + "num_tokens": 21730411.0, + "step": 2436 + }, + { + "epoch": 1.851823708206687, + "grad_norm": 1.586552619934082, + "learning_rate": 1.7513988068407145e-06, + "loss": 0.33497530221939087, + "mean_token_accuracy": 0.8809621334075928, + "num_tokens": 21740228.0, + "step": 2437 + }, + { + "epoch": 1.8525835866261398, + "grad_norm": 2.107167959213257, + "learning_rate": 1.7494007760257428e-06, + "loss": 0.3801528513431549, + "mean_token_accuracy": 0.8666986227035522, + "num_tokens": 21746718.0, + "step": 2438 + }, + { + "epoch": 1.8533434650455927, + "grad_norm": 2.514514684677124, + "learning_rate": 1.7474032720101991e-06, + "loss": 0.285498708486557, + "mean_token_accuracy": 0.901540219783783, + "num_tokens": 21751009.0, + "step": 2439 + }, + { + "epoch": 1.8541033434650456, + "grad_norm": 1.8152034282684326, + "learning_rate": 1.7454062961960102e-06, + "loss": 0.3704795241355896, + "mean_token_accuracy": 0.8630262613296509, + "num_tokens": 21760164.0, + "step": 2440 + }, + { + "epoch": 1.8548632218844985, + "grad_norm": 2.714531183242798, + "learning_rate": 1.7434098499847308e-06, + "loss": 0.5070809125900269, + "mean_token_accuracy": 0.8408594131469727, + "num_tokens": 21765602.0, + "step": 2441 + }, + { + "epoch": 1.8556231003039514, + "grad_norm": 2.173832893371582, + "learning_rate": 1.7414139347775423e-06, + "loss": 0.3500945568084717, + "mean_token_accuracy": 0.8733699321746826, + "num_tokens": 21772029.0, + "step": 2442 + }, + { + "epoch": 1.8563829787234043, + "grad_norm": 1.580376148223877, + "learning_rate": 1.7394185519752546e-06, + "loss": 0.5137908458709717, + "mean_token_accuracy": 0.8141944408416748, + "num_tokens": 21784531.0, + "step": 2443 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 2.079318046569824, + "learning_rate": 1.7374237029783064e-06, + "loss": 0.41820770502090454, + "mean_token_accuracy": 0.8513275384902954, + "num_tokens": 21792047.0, + "step": 2444 + }, + { + "epoch": 1.85790273556231, + "grad_norm": 2.6890387535095215, + "learning_rate": 1.7354293891867582e-06, + "loss": 0.3810037672519684, + "mean_token_accuracy": 0.8790096044540405, + "num_tokens": 21796634.0, + "step": 2445 + }, + { + "epoch": 1.858662613981763, + "grad_norm": 2.161081552505493, + "learning_rate": 1.7334356120002956e-06, + "loss": 0.48064762353897095, + "mean_token_accuracy": 0.8329977989196777, + "num_tokens": 21803509.0, + "step": 2446 + }, + { + "epoch": 1.8594224924012157, + "grad_norm": 1.9201551675796509, + "learning_rate": 1.7314423728182283e-06, + "loss": 0.36369895935058594, + "mean_token_accuracy": 0.8713955879211426, + "num_tokens": 21810528.0, + "step": 2447 + }, + { + "epoch": 1.8601823708206688, + "grad_norm": 1.8095223903656006, + "learning_rate": 1.7294496730394897e-06, + "loss": 0.41493499279022217, + "mean_token_accuracy": 0.855312705039978, + "num_tokens": 21821176.0, + "step": 2448 + }, + { + "epoch": 1.8609422492401215, + "grad_norm": 2.172389507293701, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.3467463552951813, + "mean_token_accuracy": 0.8801594972610474, + "num_tokens": 21827486.0, + "step": 2449 + }, + { + "epoch": 1.8617021276595744, + "grad_norm": 2.8139185905456543, + "learning_rate": 1.7254658972858293e-06, + "loss": 0.35121995210647583, + "mean_token_accuracy": 0.8741901516914368, + "num_tokens": 21831915.0, + "step": 2450 + }, + { + "epoch": 1.8624620060790273, + "grad_norm": 1.2572762966156006, + "learning_rate": 1.7234748241068742e-06, + "loss": 0.3775328993797302, + "mean_token_accuracy": 0.8547425866127014, + "num_tokens": 21849623.0, + "step": 2451 + }, + { + "epoch": 1.8632218844984803, + "grad_norm": 1.2357900142669678, + "learning_rate": 1.7214842959231796e-06, + "loss": 0.28715917468070984, + "mean_token_accuracy": 0.9034290313720703, + "num_tokens": 21864507.0, + "step": 2452 + }, + { + "epoch": 1.8639817629179332, + "grad_norm": 1.2349165678024292, + "learning_rate": 1.719494314131775e-06, + "loss": 0.27918580174446106, + "mean_token_accuracy": 0.9073119759559631, + "num_tokens": 21878519.0, + "step": 2453 + }, + { + "epoch": 1.864741641337386, + "grad_norm": 1.960353136062622, + "learning_rate": 1.7175048801293042e-06, + "loss": 0.49304282665252686, + "mean_token_accuracy": 0.8193954229354858, + "num_tokens": 21886861.0, + "step": 2454 + }, + { + "epoch": 1.865501519756839, + "grad_norm": 1.480118751525879, + "learning_rate": 1.7155159953120315e-06, + "loss": 0.39433127641677856, + "mean_token_accuracy": 0.8674266338348389, + "num_tokens": 21899131.0, + "step": 2455 + }, + { + "epoch": 1.8662613981762917, + "grad_norm": 2.3136367797851562, + "learning_rate": 1.7135276610758309e-06, + "loss": 0.40943437814712524, + "mean_token_accuracy": 0.8511340022087097, + "num_tokens": 21905550.0, + "step": 2456 + }, + { + "epoch": 1.8670212765957448, + "grad_norm": 1.3622872829437256, + "learning_rate": 1.7115398788161923e-06, + "loss": 0.4255254566669464, + "mean_token_accuracy": 0.8457357883453369, + "num_tokens": 21919943.0, + "step": 2457 + }, + { + "epoch": 1.8677811550151975, + "grad_norm": 1.8197853565216064, + "learning_rate": 1.7095526499282172e-06, + "loss": 0.33384573459625244, + "mean_token_accuracy": 0.8757365942001343, + "num_tokens": 21928368.0, + "step": 2458 + }, + { + "epoch": 1.8685410334346506, + "grad_norm": 1.8771090507507324, + "learning_rate": 1.7075659758066207e-06, + "loss": 0.38854318857192993, + "mean_token_accuracy": 0.8565001487731934, + "num_tokens": 21936624.0, + "step": 2459 + }, + { + "epoch": 1.8693009118541033, + "grad_norm": 1.449811577796936, + "learning_rate": 1.7055798578457267e-06, + "loss": 0.45504286885261536, + "mean_token_accuracy": 0.8338158130645752, + "num_tokens": 21952192.0, + "step": 2460 + }, + { + "epoch": 1.8700607902735562, + "grad_norm": 2.253678321838379, + "learning_rate": 1.703594297439469e-06, + "loss": 0.44300752878189087, + "mean_token_accuracy": 0.8451106548309326, + "num_tokens": 21959107.0, + "step": 2461 + }, + { + "epoch": 1.8708206686930091, + "grad_norm": 2.5431747436523438, + "learning_rate": 1.7016092959813892e-06, + "loss": 0.34692925214767456, + "mean_token_accuracy": 0.8823766708374023, + "num_tokens": 21964543.0, + "step": 2462 + }, + { + "epoch": 1.871580547112462, + "grad_norm": 2.7001953125, + "learning_rate": 1.6996248548646393e-06, + "loss": 0.5270686745643616, + "mean_token_accuracy": 0.8366886377334595, + "num_tokens": 21970157.0, + "step": 2463 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 2.3855581283569336, + "learning_rate": 1.6976409754819767e-06, + "loss": 0.40109893679618835, + "mean_token_accuracy": 0.8477234840393066, + "num_tokens": 21976046.0, + "step": 2464 + }, + { + "epoch": 1.8731003039513676, + "grad_norm": 1.6014364957809448, + "learning_rate": 1.6956576592257635e-06, + "loss": 0.4344262480735779, + "mean_token_accuracy": 0.8464433550834656, + "num_tokens": 21986299.0, + "step": 2465 + }, + { + "epoch": 1.8738601823708207, + "grad_norm": 2.221372127532959, + "learning_rate": 1.6936749074879663e-06, + "loss": 0.24239015579223633, + "mean_token_accuracy": 0.9185566306114197, + "num_tokens": 21991541.0, + "step": 2466 + }, + { + "epoch": 1.8746200607902734, + "grad_norm": 1.6672178506851196, + "learning_rate": 1.6916927216601593e-06, + "loss": 0.35219496488571167, + "mean_token_accuracy": 0.8668237328529358, + "num_tokens": 22000797.0, + "step": 2467 + }, + { + "epoch": 1.8753799392097266, + "grad_norm": 1.364131212234497, + "learning_rate": 1.6897111031335145e-06, + "loss": 0.4456409513950348, + "mean_token_accuracy": 0.8350487947463989, + "num_tokens": 22018297.0, + "step": 2468 + }, + { + "epoch": 1.8761398176291793, + "grad_norm": 1.4535794258117676, + "learning_rate": 1.6877300532988095e-06, + "loss": 0.395782470703125, + "mean_token_accuracy": 0.8482908010482788, + "num_tokens": 22030096.0, + "step": 2469 + }, + { + "epoch": 1.8768996960486324, + "grad_norm": 2.0192270278930664, + "learning_rate": 1.6857495735464196e-06, + "loss": 0.31406813859939575, + "mean_token_accuracy": 0.889453649520874, + "num_tokens": 22036082.0, + "step": 2470 + }, + { + "epoch": 1.877659574468085, + "grad_norm": 2.159257173538208, + "learning_rate": 1.6837696652663244e-06, + "loss": 0.43942126631736755, + "mean_token_accuracy": 0.8518660068511963, + "num_tokens": 22043413.0, + "step": 2471 + }, + { + "epoch": 1.878419452887538, + "grad_norm": 1.9774882793426514, + "learning_rate": 1.681790329848097e-06, + "loss": 0.42464935779571533, + "mean_token_accuracy": 0.8545591831207275, + "num_tokens": 22050290.0, + "step": 2472 + }, + { + "epoch": 1.8791793313069909, + "grad_norm": 1.0219167470932007, + "learning_rate": 1.6798115686809125e-06, + "loss": 0.36917346715927124, + "mean_token_accuracy": 0.8650286197662354, + "num_tokens": 22070408.0, + "step": 2473 + }, + { + "epoch": 1.8799392097264438, + "grad_norm": 1.2943378686904907, + "learning_rate": 1.677833383153542e-06, + "loss": 0.3434808850288391, + "mean_token_accuracy": 0.878541111946106, + "num_tokens": 22083567.0, + "step": 2474 + }, + { + "epoch": 1.8806990881458967, + "grad_norm": 3.582855224609375, + "learning_rate": 1.6758557746543518e-06, + "loss": 0.39738911390304565, + "mean_token_accuracy": 0.8951535224914551, + "num_tokens": 22086886.0, + "step": 2475 + }, + { + "epoch": 1.8814589665653494, + "grad_norm": 1.680220365524292, + "learning_rate": 1.673878744571304e-06, + "loss": 0.38146206736564636, + "mean_token_accuracy": 0.8596681356430054, + "num_tokens": 22095564.0, + "step": 2476 + }, + { + "epoch": 1.8822188449848025, + "grad_norm": 1.448194146156311, + "learning_rate": 1.6719022942919527e-06, + "loss": 0.43309977650642395, + "mean_token_accuracy": 0.8669528961181641, + "num_tokens": 22109333.0, + "step": 2477 + }, + { + "epoch": 1.8829787234042552, + "grad_norm": 1.5353537797927856, + "learning_rate": 1.6699264252034498e-06, + "loss": 0.4479079842567444, + "mean_token_accuracy": 0.8379873037338257, + "num_tokens": 22124735.0, + "step": 2478 + }, + { + "epoch": 1.8837386018237083, + "grad_norm": 1.1744320392608643, + "learning_rate": 1.6679511386925337e-06, + "loss": 0.31951260566711426, + "mean_token_accuracy": 0.8792685270309448, + "num_tokens": 22140882.0, + "step": 2479 + }, + { + "epoch": 1.884498480243161, + "grad_norm": 2.1996841430664062, + "learning_rate": 1.6659764361455383e-06, + "loss": 0.39045992493629456, + "mean_token_accuracy": 0.8587675094604492, + "num_tokens": 22146843.0, + "step": 2480 + }, + { + "epoch": 1.885258358662614, + "grad_norm": 3.494931697845459, + "learning_rate": 1.6640023189483836e-06, + "loss": 0.44756871461868286, + "mean_token_accuracy": 0.8643628358840942, + "num_tokens": 22150504.0, + "step": 2481 + }, + { + "epoch": 1.8860182370820668, + "grad_norm": 2.2455973625183105, + "learning_rate": 1.6620287884865831e-06, + "loss": 0.3308878540992737, + "mean_token_accuracy": 0.8748078942298889, + "num_tokens": 22156537.0, + "step": 2482 + }, + { + "epoch": 1.8867781155015197, + "grad_norm": 2.31868314743042, + "learning_rate": 1.6600558461452368e-06, + "loss": 0.46583569049835205, + "mean_token_accuracy": 0.8438903093338013, + "num_tokens": 22163501.0, + "step": 2483 + }, + { + "epoch": 1.8875379939209727, + "grad_norm": 1.5695412158966064, + "learning_rate": 1.65808349330903e-06, + "loss": 0.351986825466156, + "mean_token_accuracy": 0.8707568645477295, + "num_tokens": 22173880.0, + "step": 2484 + }, + { + "epoch": 1.8882978723404256, + "grad_norm": 1.4109563827514648, + "learning_rate": 1.656111731362236e-06, + "loss": 0.36058586835861206, + "mean_token_accuracy": 0.8606001138687134, + "num_tokens": 22189000.0, + "step": 2485 + }, + { + "epoch": 1.8890577507598785, + "grad_norm": 1.0398776531219482, + "learning_rate": 1.6541405616887138e-06, + "loss": 0.36524999141693115, + "mean_token_accuracy": 0.8690586090087891, + "num_tokens": 22209187.0, + "step": 2486 + }, + { + "epoch": 1.8898176291793312, + "grad_norm": 2.1050004959106445, + "learning_rate": 1.6521699856719065e-06, + "loss": 0.2988269329071045, + "mean_token_accuracy": 0.8887280225753784, + "num_tokens": 22215539.0, + "step": 2487 + }, + { + "epoch": 1.8905775075987843, + "grad_norm": 2.5606791973114014, + "learning_rate": 1.650200004694839e-06, + "loss": 0.41077330708503723, + "mean_token_accuracy": 0.8436049818992615, + "num_tokens": 22221133.0, + "step": 2488 + }, + { + "epoch": 1.891337386018237, + "grad_norm": 1.5786094665527344, + "learning_rate": 1.6482306201401211e-06, + "loss": 0.4217292368412018, + "mean_token_accuracy": 0.859939455986023, + "num_tokens": 22231578.0, + "step": 2489 + }, + { + "epoch": 1.89209726443769, + "grad_norm": 1.7131884098052979, + "learning_rate": 1.6462618333899422e-06, + "loss": 0.3945464789867401, + "mean_token_accuracy": 0.8679244518280029, + "num_tokens": 22241252.0, + "step": 2490 + }, + { + "epoch": 1.8928571428571428, + "grad_norm": 2.8350300788879395, + "learning_rate": 1.6442936458260723e-06, + "loss": 0.3992699384689331, + "mean_token_accuracy": 0.8717275857925415, + "num_tokens": 22246226.0, + "step": 2491 + }, + { + "epoch": 1.8936170212765957, + "grad_norm": 2.2180120944976807, + "learning_rate": 1.6423260588298608e-06, + "loss": 0.3381099998950958, + "mean_token_accuracy": 0.8968075513839722, + "num_tokens": 22252355.0, + "step": 2492 + }, + { + "epoch": 1.8943768996960486, + "grad_norm": 2.6498866081237793, + "learning_rate": 1.6403590737822378e-06, + "loss": 0.36339250206947327, + "mean_token_accuracy": 0.8633373379707336, + "num_tokens": 22257407.0, + "step": 2493 + }, + { + "epoch": 1.8951367781155015, + "grad_norm": 2.634241819381714, + "learning_rate": 1.6383926920637077e-06, + "loss": 0.2562698721885681, + "mean_token_accuracy": 0.8999600410461426, + "num_tokens": 22261858.0, + "step": 2494 + }, + { + "epoch": 1.8958966565349544, + "grad_norm": 2.0163333415985107, + "learning_rate": 1.6364269150543533e-06, + "loss": 0.3413389027118683, + "mean_token_accuracy": 0.8718398809432983, + "num_tokens": 22268517.0, + "step": 2495 + }, + { + "epoch": 1.8966565349544073, + "grad_norm": 2.8333005905151367, + "learning_rate": 1.6344617441338311e-06, + "loss": 0.4354540705680847, + "mean_token_accuracy": 0.8491238355636597, + "num_tokens": 22273648.0, + "step": 2496 + }, + { + "epoch": 1.8974164133738602, + "grad_norm": 1.6280957460403442, + "learning_rate": 1.6324971806813766e-06, + "loss": 0.3015792965888977, + "mean_token_accuracy": 0.8937206268310547, + "num_tokens": 22282521.0, + "step": 2497 + }, + { + "epoch": 1.898176291793313, + "grad_norm": 1.2246302366256714, + "learning_rate": 1.6305332260757937e-06, + "loss": 0.26619502902030945, + "mean_token_accuracy": 0.8886681199073792, + "num_tokens": 22295179.0, + "step": 2498 + }, + { + "epoch": 1.898936170212766, + "grad_norm": 2.4014432430267334, + "learning_rate": 1.6285698816954626e-06, + "loss": 0.3735058903694153, + "mean_token_accuracy": 0.8693109750747681, + "num_tokens": 22300681.0, + "step": 2499 + }, + { + "epoch": 1.8996960486322187, + "grad_norm": 1.4447300434112549, + "learning_rate": 1.6266071489183327e-06, + "loss": 0.40768876671791077, + "mean_token_accuracy": 0.8556059002876282, + "num_tokens": 22312442.0, + "step": 2500 + }, + { + "epoch": 1.9004559270516719, + "grad_norm": 2.1339821815490723, + "learning_rate": 1.6246450291219268e-06, + "loss": 0.33442017436027527, + "mean_token_accuracy": 0.8837105631828308, + "num_tokens": 22318779.0, + "step": 2501 + }, + { + "epoch": 1.9012158054711246, + "grad_norm": 2.8564913272857666, + "learning_rate": 1.6226835236833356e-06, + "loss": 0.36013197898864746, + "mean_token_accuracy": 0.8810569047927856, + "num_tokens": 22323390.0, + "step": 2502 + }, + { + "epoch": 1.9019756838905775, + "grad_norm": 2.1201915740966797, + "learning_rate": 1.620722633979219e-06, + "loss": 0.4587489664554596, + "mean_token_accuracy": 0.8517274856567383, + "num_tokens": 22330275.0, + "step": 2503 + }, + { + "epoch": 1.9027355623100304, + "grad_norm": 2.211402177810669, + "learning_rate": 1.6187623613858038e-06, + "loss": 0.3698349595069885, + "mean_token_accuracy": 0.8768182992935181, + "num_tokens": 22336041.0, + "step": 2504 + }, + { + "epoch": 1.9034954407294833, + "grad_norm": 1.421604871749878, + "learning_rate": 1.6168027072788868e-06, + "loss": 0.38086453080177307, + "mean_token_accuracy": 0.8622198104858398, + "num_tokens": 22349310.0, + "step": 2505 + }, + { + "epoch": 1.9042553191489362, + "grad_norm": 2.4304113388061523, + "learning_rate": 1.6148436730338279e-06, + "loss": 0.34694477915763855, + "mean_token_accuracy": 0.8833136558532715, + "num_tokens": 22355069.0, + "step": 2506 + }, + { + "epoch": 1.905015197568389, + "grad_norm": 2.1076772212982178, + "learning_rate": 1.6128852600255518e-06, + "loss": 0.4973800778388977, + "mean_token_accuracy": 0.851190984249115, + "num_tokens": 22362402.0, + "step": 2507 + }, + { + "epoch": 1.905775075987842, + "grad_norm": 3.0934200286865234, + "learning_rate": 1.6109274696285496e-06, + "loss": 0.46498024463653564, + "mean_token_accuracy": 0.8436626195907593, + "num_tokens": 22367390.0, + "step": 2508 + }, + { + "epoch": 1.9065349544072947, + "grad_norm": 2.0114359855651855, + "learning_rate": 1.6089703032168736e-06, + "loss": 0.45143815875053406, + "mean_token_accuracy": 0.852748692035675, + "num_tokens": 22377032.0, + "step": 2509 + }, + { + "epoch": 1.9072948328267478, + "grad_norm": 1.8780893087387085, + "learning_rate": 1.6070137621641382e-06, + "loss": 0.3977179527282715, + "mean_token_accuracy": 0.8556262850761414, + "num_tokens": 22386880.0, + "step": 2510 + }, + { + "epoch": 1.9080547112462005, + "grad_norm": 1.6748069524765015, + "learning_rate": 1.6050578478435184e-06, + "loss": 0.35590440034866333, + "mean_token_accuracy": 0.8702141046524048, + "num_tokens": 22396616.0, + "step": 2511 + }, + { + "epoch": 1.9088145896656536, + "grad_norm": 0.9799401760101318, + "learning_rate": 1.6031025616277512e-06, + "loss": 0.3325427770614624, + "mean_token_accuracy": 0.8771291971206665, + "num_tokens": 22419580.0, + "step": 2512 + }, + { + "epoch": 1.9095744680851063, + "grad_norm": 1.5084866285324097, + "learning_rate": 1.6011479048891323e-06, + "loss": 0.44336390495300293, + "mean_token_accuracy": 0.8786209225654602, + "num_tokens": 22434235.0, + "step": 2513 + }, + { + "epoch": 1.9103343465045592, + "grad_norm": 1.8544305562973022, + "learning_rate": 1.5991938789995138e-06, + "loss": 0.3055306375026703, + "mean_token_accuracy": 0.9043174982070923, + "num_tokens": 22442003.0, + "step": 2514 + }, + { + "epoch": 1.9110942249240122, + "grad_norm": 4.29932165145874, + "learning_rate": 1.5972404853303061e-06, + "loss": 0.386760413646698, + "mean_token_accuracy": 0.8914207220077515, + "num_tokens": 22444787.0, + "step": 2515 + }, + { + "epoch": 1.911854103343465, + "grad_norm": 1.7560505867004395, + "learning_rate": 1.595287725252478e-06, + "loss": 0.4141422510147095, + "mean_token_accuracy": 0.862310528755188, + "num_tokens": 22453625.0, + "step": 2516 + }, + { + "epoch": 1.912613981762918, + "grad_norm": 2.685443878173828, + "learning_rate": 1.5933356001365502e-06, + "loss": 0.36217260360717773, + "mean_token_accuracy": 0.868883490562439, + "num_tokens": 22458597.0, + "step": 2517 + }, + { + "epoch": 1.9133738601823707, + "grad_norm": 2.2587239742279053, + "learning_rate": 1.591384111352599e-06, + "loss": 0.5298880934715271, + "mean_token_accuracy": 0.821168839931488, + "num_tokens": 22466091.0, + "step": 2518 + }, + { + "epoch": 1.9141337386018238, + "grad_norm": 2.273380756378174, + "learning_rate": 1.5894332602702545e-06, + "loss": 0.3194117546081543, + "mean_token_accuracy": 0.8849239945411682, + "num_tokens": 22471785.0, + "step": 2519 + }, + { + "epoch": 1.9148936170212765, + "grad_norm": 2.314634084701538, + "learning_rate": 1.5874830482587003e-06, + "loss": 0.457550585269928, + "mean_token_accuracy": 0.8367670774459839, + "num_tokens": 22479091.0, + "step": 2520 + }, + { + "epoch": 1.9156534954407296, + "grad_norm": 2.16206693649292, + "learning_rate": 1.585533476686669e-06, + "loss": 0.43055859208106995, + "mean_token_accuracy": 0.8659856915473938, + "num_tokens": 22487379.0, + "step": 2521 + }, + { + "epoch": 1.9164133738601823, + "grad_norm": 2.2091798782348633, + "learning_rate": 1.5835845469224447e-06, + "loss": 0.45421302318573, + "mean_token_accuracy": 0.8418087959289551, + "num_tokens": 22493755.0, + "step": 2522 + }, + { + "epoch": 1.9171732522796354, + "grad_norm": 1.6166985034942627, + "learning_rate": 1.5816362603338632e-06, + "loss": 0.5211667418479919, + "mean_token_accuracy": 0.809440016746521, + "num_tokens": 22506648.0, + "step": 2523 + }, + { + "epoch": 1.917933130699088, + "grad_norm": 2.4998703002929688, + "learning_rate": 1.5796886182883053e-06, + "loss": 0.45915648341178894, + "mean_token_accuracy": 0.833067774772644, + "num_tokens": 22513216.0, + "step": 2524 + }, + { + "epoch": 1.918693009118541, + "grad_norm": 1.492928147315979, + "learning_rate": 1.577741622152702e-06, + "loss": 0.45581498742103577, + "mean_token_accuracy": 0.8531479835510254, + "num_tokens": 22524908.0, + "step": 2525 + }, + { + "epoch": 1.919452887537994, + "grad_norm": 2.0502207279205322, + "learning_rate": 1.5757952732935288e-06, + "loss": 0.4156759977340698, + "mean_token_accuracy": 0.8677599430084229, + "num_tokens": 22532275.0, + "step": 2526 + }, + { + "epoch": 1.9202127659574468, + "grad_norm": 2.4572031497955322, + "learning_rate": 1.5738495730768104e-06, + "loss": 0.43373313546180725, + "mean_token_accuracy": 0.8435516357421875, + "num_tokens": 22538272.0, + "step": 2527 + }, + { + "epoch": 1.9209726443768997, + "grad_norm": 2.071903705596924, + "learning_rate": 1.5719045228681127e-06, + "loss": 0.3211413621902466, + "mean_token_accuracy": 0.87841796875, + "num_tokens": 22545487.0, + "step": 2528 + }, + { + "epoch": 1.9217325227963524, + "grad_norm": 1.6742064952850342, + "learning_rate": 1.5699601240325474e-06, + "loss": 0.3704240322113037, + "mean_token_accuracy": 0.8646563291549683, + "num_tokens": 22554840.0, + "step": 2529 + }, + { + "epoch": 1.9224924012158056, + "grad_norm": 1.0941399335861206, + "learning_rate": 1.5680163779347668e-06, + "loss": 0.3595704436302185, + "mean_token_accuracy": 0.8680597543716431, + "num_tokens": 22572627.0, + "step": 2530 + }, + { + "epoch": 1.9232522796352582, + "grad_norm": 2.9815237522125244, + "learning_rate": 1.5660732859389687e-06, + "loss": 0.2941335141658783, + "mean_token_accuracy": 0.8847303986549377, + "num_tokens": 22576851.0, + "step": 2531 + }, + { + "epoch": 1.9240121580547114, + "grad_norm": 2.898106813430786, + "learning_rate": 1.5641308494088903e-06, + "loss": 0.4066317081451416, + "mean_token_accuracy": 0.8469538688659668, + "num_tokens": 22581431.0, + "step": 2532 + }, + { + "epoch": 1.924772036474164, + "grad_norm": 1.6757515668869019, + "learning_rate": 1.5621890697078069e-06, + "loss": 0.33923569321632385, + "mean_token_accuracy": 0.8790708184242249, + "num_tokens": 22590648.0, + "step": 2533 + }, + { + "epoch": 1.925531914893617, + "grad_norm": 1.747314214706421, + "learning_rate": 1.5602479481985333e-06, + "loss": 0.4865703582763672, + "mean_token_accuracy": 0.8314566612243652, + "num_tokens": 22600153.0, + "step": 2534 + }, + { + "epoch": 1.9262917933130699, + "grad_norm": 2.7927849292755127, + "learning_rate": 1.5583074862434254e-06, + "loss": 0.335658460855484, + "mean_token_accuracy": 0.8769067525863647, + "num_tokens": 22604864.0, + "step": 2535 + }, + { + "epoch": 1.9270516717325228, + "grad_norm": 2.2553000450134277, + "learning_rate": 1.5563676852043738e-06, + "loss": 0.4442562460899353, + "mean_token_accuracy": 0.8381515145301819, + "num_tokens": 22611102.0, + "step": 2536 + }, + { + "epoch": 1.9278115501519757, + "grad_norm": 1.1937638521194458, + "learning_rate": 1.5544285464428044e-06, + "loss": 0.38608425855636597, + "mean_token_accuracy": 0.8589644432067871, + "num_tokens": 22627781.0, + "step": 2537 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 3.282639980316162, + "learning_rate": 1.55249007131968e-06, + "loss": 0.31231993436813354, + "mean_token_accuracy": 0.8917703032493591, + "num_tokens": 22632341.0, + "step": 2538 + }, + { + "epoch": 1.9293313069908815, + "grad_norm": 2.3212976455688477, + "learning_rate": 1.5505522611954977e-06, + "loss": 0.34952571988105774, + "mean_token_accuracy": 0.8752106428146362, + "num_tokens": 22638572.0, + "step": 2539 + }, + { + "epoch": 1.9300911854103342, + "grad_norm": 1.389098882675171, + "learning_rate": 1.548615117430286e-06, + "loss": 0.4298851788043976, + "mean_token_accuracy": 0.871698260307312, + "num_tokens": 22651875.0, + "step": 2540 + }, + { + "epoch": 1.9308510638297873, + "grad_norm": 1.5333977937698364, + "learning_rate": 1.5466786413836077e-06, + "loss": 0.45540744066238403, + "mean_token_accuracy": 0.8409075736999512, + "num_tokens": 22662903.0, + "step": 2541 + }, + { + "epoch": 1.93161094224924, + "grad_norm": 1.7833251953125, + "learning_rate": 1.5447428344145565e-06, + "loss": 0.333247572183609, + "mean_token_accuracy": 0.8796100616455078, + "num_tokens": 22671125.0, + "step": 2542 + }, + { + "epoch": 1.9323708206686931, + "grad_norm": 1.5165303945541382, + "learning_rate": 1.5428076978817564e-06, + "loss": 0.3085063099861145, + "mean_token_accuracy": 0.888705849647522, + "num_tokens": 22681482.0, + "step": 2543 + }, + { + "epoch": 1.9331306990881458, + "grad_norm": 2.3556196689605713, + "learning_rate": 1.5408732331433596e-06, + "loss": 0.44008776545524597, + "mean_token_accuracy": 0.8578170537948608, + "num_tokens": 22686952.0, + "step": 2544 + }, + { + "epoch": 1.9338905775075987, + "grad_norm": 2.9572882652282715, + "learning_rate": 1.538939441557048e-06, + "loss": 0.3779261112213135, + "mean_token_accuracy": 0.8657241463661194, + "num_tokens": 22691211.0, + "step": 2545 + }, + { + "epoch": 1.9346504559270516, + "grad_norm": 2.373473644256592, + "learning_rate": 1.5370063244800326e-06, + "loss": 0.4113072454929352, + "mean_token_accuracy": 0.872116208076477, + "num_tokens": 22697442.0, + "step": 2546 + }, + { + "epoch": 1.9354103343465046, + "grad_norm": 2.270207643508911, + "learning_rate": 1.5350738832690479e-06, + "loss": 0.4021070897579193, + "mean_token_accuracy": 0.8750372529029846, + "num_tokens": 22703693.0, + "step": 2547 + }, + { + "epoch": 1.9361702127659575, + "grad_norm": 2.429445266723633, + "learning_rate": 1.5331421192803565e-06, + "loss": 0.40210235118865967, + "mean_token_accuracy": 0.8593704104423523, + "num_tokens": 22709285.0, + "step": 2548 + }, + { + "epoch": 1.9369300911854104, + "grad_norm": 1.4576458930969238, + "learning_rate": 1.5312110338697427e-06, + "loss": 0.44822201132774353, + "mean_token_accuracy": 0.8737322688102722, + "num_tokens": 22723743.0, + "step": 2549 + }, + { + "epoch": 1.9376899696048633, + "grad_norm": 2.1008098125457764, + "learning_rate": 1.5292806283925192e-06, + "loss": 0.3514235019683838, + "mean_token_accuracy": 0.8689005374908447, + "num_tokens": 22730135.0, + "step": 2550 + }, + { + "epoch": 1.938449848024316, + "grad_norm": 1.9786806106567383, + "learning_rate": 1.5273509042035172e-06, + "loss": 0.4483771324157715, + "mean_token_accuracy": 0.8353633880615234, + "num_tokens": 22738717.0, + "step": 2551 + }, + { + "epoch": 1.939209726443769, + "grad_norm": 1.0649693012237549, + "learning_rate": 1.5254218626570927e-06, + "loss": 0.30712205171585083, + "mean_token_accuracy": 0.8802675008773804, + "num_tokens": 22757346.0, + "step": 2552 + }, + { + "epoch": 1.9399696048632218, + "grad_norm": 3.0401108264923096, + "learning_rate": 1.5234935051071193e-06, + "loss": 0.5213959217071533, + "mean_token_accuracy": 0.8249514102935791, + "num_tokens": 22762169.0, + "step": 2553 + }, + { + "epoch": 1.940729483282675, + "grad_norm": 2.892486572265625, + "learning_rate": 1.521565832906994e-06, + "loss": 0.5694394111633301, + "mean_token_accuracy": 0.8139263391494751, + "num_tokens": 22767824.0, + "step": 2554 + }, + { + "epoch": 1.9414893617021276, + "grad_norm": 1.6187207698822021, + "learning_rate": 1.519638847409632e-06, + "loss": 0.46748271584510803, + "mean_token_accuracy": 0.8541051149368286, + "num_tokens": 22778195.0, + "step": 2555 + }, + { + "epoch": 1.9422492401215805, + "grad_norm": 1.3857731819152832, + "learning_rate": 1.5177125499674639e-06, + "loss": 0.35661786794662476, + "mean_token_accuracy": 0.8711516857147217, + "num_tokens": 22792353.0, + "step": 2556 + }, + { + "epoch": 1.9430091185410334, + "grad_norm": 1.108441710472107, + "learning_rate": 1.515786941932441e-06, + "loss": 0.3537200391292572, + "mean_token_accuracy": 0.8739079833030701, + "num_tokens": 22813185.0, + "step": 2557 + }, + { + "epoch": 1.9437689969604863, + "grad_norm": 2.0528404712677, + "learning_rate": 1.5138620246560295e-06, + "loss": 0.4161028265953064, + "mean_token_accuracy": 0.8385938405990601, + "num_tokens": 22821227.0, + "step": 2558 + }, + { + "epoch": 1.9445288753799392, + "grad_norm": 1.5123628377914429, + "learning_rate": 1.5119377994892095e-06, + "loss": 0.4420986473560333, + "mean_token_accuracy": 0.8664361834526062, + "num_tokens": 22835064.0, + "step": 2559 + }, + { + "epoch": 1.9452887537993921, + "grad_norm": 2.5354838371276855, + "learning_rate": 1.5100142677824752e-06, + "loss": 0.3837323784828186, + "mean_token_accuracy": 0.8607655763626099, + "num_tokens": 22840455.0, + "step": 2560 + }, + { + "epoch": 1.946048632218845, + "grad_norm": 1.1354057788848877, + "learning_rate": 1.5080914308858375e-06, + "loss": 0.39776813983917236, + "mean_token_accuracy": 0.8586497902870178, + "num_tokens": 22858828.0, + "step": 2561 + }, + { + "epoch": 1.9468085106382977, + "grad_norm": 1.576740026473999, + "learning_rate": 1.5061692901488161e-06, + "loss": 0.3167848289012909, + "mean_token_accuracy": 0.8876185417175293, + "num_tokens": 22868674.0, + "step": 2562 + }, + { + "epoch": 1.9475683890577509, + "grad_norm": 1.4835401773452759, + "learning_rate": 1.5042478469204437e-06, + "loss": 0.44950318336486816, + "mean_token_accuracy": 0.8526639342308044, + "num_tokens": 22883019.0, + "step": 2563 + }, + { + "epoch": 1.9483282674772036, + "grad_norm": 1.617073655128479, + "learning_rate": 1.502327102549262e-06, + "loss": 0.45711010694503784, + "mean_token_accuracy": 0.834361732006073, + "num_tokens": 22896834.0, + "step": 2564 + }, + { + "epoch": 1.9490881458966567, + "grad_norm": 1.3348414897918701, + "learning_rate": 1.5004070583833252e-06, + "loss": 0.3691314458847046, + "mean_token_accuracy": 0.8779371380805969, + "num_tokens": 22912350.0, + "step": 2565 + }, + { + "epoch": 1.9498480243161094, + "grad_norm": 1.711234450340271, + "learning_rate": 1.4984877157701932e-06, + "loss": 0.38726937770843506, + "mean_token_accuracy": 0.8704015016555786, + "num_tokens": 22922575.0, + "step": 2566 + }, + { + "epoch": 1.9506079027355623, + "grad_norm": 2.4587950706481934, + "learning_rate": 1.4965690760569346e-06, + "loss": 0.4455464482307434, + "mean_token_accuracy": 0.8481032252311707, + "num_tokens": 22928717.0, + "step": 2567 + }, + { + "epoch": 1.9513677811550152, + "grad_norm": 2.4189560413360596, + "learning_rate": 1.4946511405901237e-06, + "loss": 0.4120418429374695, + "mean_token_accuracy": 0.8519487380981445, + "num_tokens": 22934977.0, + "step": 2568 + }, + { + "epoch": 1.952127659574468, + "grad_norm": 1.2503050565719604, + "learning_rate": 1.4927339107158437e-06, + "loss": 0.4434332251548767, + "mean_token_accuracy": 0.8448144793510437, + "num_tokens": 22950061.0, + "step": 2569 + }, + { + "epoch": 1.952887537993921, + "grad_norm": 1.788493275642395, + "learning_rate": 1.4908173877796784e-06, + "loss": 0.49203023314476013, + "mean_token_accuracy": 0.8601495623588562, + "num_tokens": 22961838.0, + "step": 2570 + }, + { + "epoch": 1.9536474164133737, + "grad_norm": 1.4260050058364868, + "learning_rate": 1.4889015731267186e-06, + "loss": 0.3286570906639099, + "mean_token_accuracy": 0.882429838180542, + "num_tokens": 22973192.0, + "step": 2571 + }, + { + "epoch": 1.9544072948328268, + "grad_norm": 1.6754822731018066, + "learning_rate": 1.486986468101555e-06, + "loss": 0.34655290842056274, + "mean_token_accuracy": 0.8807861804962158, + "num_tokens": 22983661.0, + "step": 2572 + }, + { + "epoch": 1.9551671732522795, + "grad_norm": 1.9064570665359497, + "learning_rate": 1.4850720740482842e-06, + "loss": 0.34020254015922546, + "mean_token_accuracy": 0.86677086353302, + "num_tokens": 22991231.0, + "step": 2573 + }, + { + "epoch": 1.9559270516717326, + "grad_norm": 1.977444052696228, + "learning_rate": 1.4831583923105e-06, + "loss": 0.21505260467529297, + "mean_token_accuracy": 0.921241819858551, + "num_tokens": 22996828.0, + "step": 2574 + }, + { + "epoch": 1.9566869300911853, + "grad_norm": 1.1019235849380493, + "learning_rate": 1.481245424231298e-06, + "loss": 0.3804295063018799, + "mean_token_accuracy": 0.8582668900489807, + "num_tokens": 23016018.0, + "step": 2575 + }, + { + "epoch": 1.9574468085106385, + "grad_norm": 1.7943179607391357, + "learning_rate": 1.4793331711532743e-06, + "loss": 0.38565245270729065, + "mean_token_accuracy": 0.8599048256874084, + "num_tokens": 23024461.0, + "step": 2576 + }, + { + "epoch": 1.9582066869300911, + "grad_norm": 2.273824453353882, + "learning_rate": 1.4774216344185204e-06, + "loss": 0.46297723054885864, + "mean_token_accuracy": 0.8294345140457153, + "num_tokens": 23031687.0, + "step": 2577 + }, + { + "epoch": 1.958966565349544, + "grad_norm": 2.308509111404419, + "learning_rate": 1.4755108153686275e-06, + "loss": 0.4366525411605835, + "mean_token_accuracy": 0.8515903949737549, + "num_tokens": 23037072.0, + "step": 2578 + }, + { + "epoch": 1.959726443768997, + "grad_norm": 2.069028377532959, + "learning_rate": 1.4736007153446803e-06, + "loss": 0.33900877833366394, + "mean_token_accuracy": 0.8937177658081055, + "num_tokens": 23043207.0, + "step": 2579 + }, + { + "epoch": 1.9604863221884499, + "grad_norm": 2.905163288116455, + "learning_rate": 1.4716913356872614e-06, + "loss": 0.3708382844924927, + "mean_token_accuracy": 0.8936747312545776, + "num_tokens": 23047020.0, + "step": 2580 + }, + { + "epoch": 1.9612462006079028, + "grad_norm": 2.4153175354003906, + "learning_rate": 1.4697826777364478e-06, + "loss": 0.473562091588974, + "mean_token_accuracy": 0.8350275158882141, + "num_tokens": 23053282.0, + "step": 2581 + }, + { + "epoch": 1.9620060790273555, + "grad_norm": 2.21589994430542, + "learning_rate": 1.467874742831808e-06, + "loss": 0.3812660276889801, + "mean_token_accuracy": 0.8623865842819214, + "num_tokens": 23059399.0, + "step": 2582 + }, + { + "epoch": 1.9627659574468086, + "grad_norm": 1.0847623348236084, + "learning_rate": 1.4659675323124037e-06, + "loss": 0.3846944570541382, + "mean_token_accuracy": 0.8633466958999634, + "num_tokens": 23081005.0, + "step": 2583 + }, + { + "epoch": 1.9635258358662613, + "grad_norm": 1.8754645586013794, + "learning_rate": 1.46406104751679e-06, + "loss": 0.3460300862789154, + "mean_token_accuracy": 0.8757443428039551, + "num_tokens": 23088710.0, + "step": 2584 + }, + { + "epoch": 1.9642857142857144, + "grad_norm": 2.13075852394104, + "learning_rate": 1.462155289783011e-06, + "loss": 0.3060935139656067, + "mean_token_accuracy": 0.9070644378662109, + "num_tokens": 23094862.0, + "step": 2585 + }, + { + "epoch": 1.965045592705167, + "grad_norm": 2.9674458503723145, + "learning_rate": 1.4602502604486e-06, + "loss": 0.4464406371116638, + "mean_token_accuracy": 0.8497441411018372, + "num_tokens": 23099821.0, + "step": 2586 + }, + { + "epoch": 1.96580547112462, + "grad_norm": 1.9171007871627808, + "learning_rate": 1.45834596085058e-06, + "loss": 0.3905114531517029, + "mean_token_accuracy": 0.8564352989196777, + "num_tokens": 23107804.0, + "step": 2587 + }, + { + "epoch": 1.966565349544073, + "grad_norm": 2.0817408561706543, + "learning_rate": 1.456442392325463e-06, + "loss": 0.3903818130493164, + "mean_token_accuracy": 0.8671162128448486, + "num_tokens": 23115224.0, + "step": 2588 + }, + { + "epoch": 1.9673252279635258, + "grad_norm": 2.6379549503326416, + "learning_rate": 1.4545395562092467e-06, + "loss": 0.22965987026691437, + "mean_token_accuracy": 0.9160916805267334, + "num_tokens": 23119184.0, + "step": 2589 + }, + { + "epoch": 1.9680851063829787, + "grad_norm": 2.525221824645996, + "learning_rate": 1.4526374538374133e-06, + "loss": 0.4132574498653412, + "mean_token_accuracy": 0.8486990332603455, + "num_tokens": 23124679.0, + "step": 2590 + }, + { + "epoch": 1.9688449848024316, + "grad_norm": 2.0362391471862793, + "learning_rate": 1.4507360865449318e-06, + "loss": 0.29624345898628235, + "mean_token_accuracy": 0.888127863407135, + "num_tokens": 23130756.0, + "step": 2591 + }, + { + "epoch": 1.9696048632218845, + "grad_norm": 1.5150481462478638, + "learning_rate": 1.4488354556662553e-06, + "loss": 0.3852264881134033, + "mean_token_accuracy": 0.8532775640487671, + "num_tokens": 23141597.0, + "step": 2592 + }, + { + "epoch": 1.9703647416413372, + "grad_norm": 1.5255193710327148, + "learning_rate": 1.4469355625353199e-06, + "loss": 0.37015780806541443, + "mean_token_accuracy": 0.8669752478599548, + "num_tokens": 23152487.0, + "step": 2593 + }, + { + "epoch": 1.9711246200607904, + "grad_norm": 1.1780041456222534, + "learning_rate": 1.4450364084855433e-06, + "loss": 0.34421291947364807, + "mean_token_accuracy": 0.8593694567680359, + "num_tokens": 23168769.0, + "step": 2594 + }, + { + "epoch": 1.971884498480243, + "grad_norm": 2.4549946784973145, + "learning_rate": 1.4431379948498254e-06, + "loss": 0.4000544548034668, + "mean_token_accuracy": 0.8551953434944153, + "num_tokens": 23175428.0, + "step": 2595 + }, + { + "epoch": 1.9726443768996962, + "grad_norm": 2.374192476272583, + "learning_rate": 1.4412403229605453e-06, + "loss": 0.31329840421676636, + "mean_token_accuracy": 0.8917277455329895, + "num_tokens": 23180678.0, + "step": 2596 + }, + { + "epoch": 1.9734042553191489, + "grad_norm": 1.268515706062317, + "learning_rate": 1.4393433941495638e-06, + "loss": 0.34808623790740967, + "mean_token_accuracy": 0.8726245164871216, + "num_tokens": 23194733.0, + "step": 2597 + }, + { + "epoch": 1.9741641337386018, + "grad_norm": 2.0898988246917725, + "learning_rate": 1.4374472097482156e-06, + "loss": 0.45849233865737915, + "mean_token_accuracy": 0.8414266109466553, + "num_tokens": 23202211.0, + "step": 2598 + }, + { + "epoch": 1.9749240121580547, + "grad_norm": 2.1497802734375, + "learning_rate": 1.4355517710873184e-06, + "loss": 0.4304521977901459, + "mean_token_accuracy": 0.8502874374389648, + "num_tokens": 23209623.0, + "step": 2599 + }, + { + "epoch": 1.9756838905775076, + "grad_norm": 1.821786880493164, + "learning_rate": 1.4336570794971643e-06, + "loss": 0.3910462558269501, + "mean_token_accuracy": 0.8962477445602417, + "num_tokens": 23218904.0, + "step": 2600 + }, + { + "epoch": 1.9764437689969605, + "grad_norm": 2.2523093223571777, + "learning_rate": 1.4317631363075186e-06, + "loss": 0.3456020951271057, + "mean_token_accuracy": 0.8703117370605469, + "num_tokens": 23225602.0, + "step": 2601 + }, + { + "epoch": 1.9772036474164134, + "grad_norm": 1.6920030117034912, + "learning_rate": 1.4298699428476236e-06, + "loss": 0.4629668593406677, + "mean_token_accuracy": 0.841956615447998, + "num_tokens": 23236812.0, + "step": 2602 + }, + { + "epoch": 1.9779635258358663, + "grad_norm": 1.8796344995498657, + "learning_rate": 1.427977500446199e-06, + "loss": 0.3302173316478729, + "mean_token_accuracy": 0.8769404888153076, + "num_tokens": 23245851.0, + "step": 2603 + }, + { + "epoch": 1.978723404255319, + "grad_norm": 2.4003775119781494, + "learning_rate": 1.4260858104314299e-06, + "loss": 0.48402607440948486, + "mean_token_accuracy": 0.8477497100830078, + "num_tokens": 23252429.0, + "step": 2604 + }, + { + "epoch": 1.9794832826747721, + "grad_norm": 3.576800584793091, + "learning_rate": 1.4241948741309783e-06, + "loss": 0.2943669259548187, + "mean_token_accuracy": 0.8933546543121338, + "num_tokens": 23255431.0, + "step": 2605 + }, + { + "epoch": 1.9802431610942248, + "grad_norm": 2.7589938640594482, + "learning_rate": 1.4223046928719764e-06, + "loss": 0.5138746500015259, + "mean_token_accuracy": 0.817468523979187, + "num_tokens": 23261351.0, + "step": 2606 + }, + { + "epoch": 1.981003039513678, + "grad_norm": 1.6950130462646484, + "learning_rate": 1.420415267981026e-06, + "loss": 0.2744991183280945, + "mean_token_accuracy": 0.9005721211433411, + "num_tokens": 23269482.0, + "step": 2607 + }, + { + "epoch": 1.9817629179331306, + "grad_norm": 1.5962934494018555, + "learning_rate": 1.418526600784198e-06, + "loss": 0.4629114270210266, + "mean_token_accuracy": 0.8337699174880981, + "num_tokens": 23279796.0, + "step": 2608 + }, + { + "epoch": 1.9825227963525835, + "grad_norm": 1.4962197542190552, + "learning_rate": 1.4166386926070322e-06, + "loss": 0.4217689633369446, + "mean_token_accuracy": 0.8445580005645752, + "num_tokens": 23293050.0, + "step": 2609 + }, + { + "epoch": 1.9832826747720365, + "grad_norm": 1.4243721961975098, + "learning_rate": 1.414751544774535e-06, + "loss": 0.4888152480125427, + "mean_token_accuracy": 0.8298524022102356, + "num_tokens": 23308501.0, + "step": 2610 + }, + { + "epoch": 1.9840425531914894, + "grad_norm": 1.5776121616363525, + "learning_rate": 1.412865158611179e-06, + "loss": 0.3156965970993042, + "mean_token_accuracy": 0.8773540258407593, + "num_tokens": 23317401.0, + "step": 2611 + }, + { + "epoch": 1.9848024316109423, + "grad_norm": 1.4690552949905396, + "learning_rate": 1.4109795354409045e-06, + "loss": 0.35854774713516235, + "mean_token_accuracy": 0.869156002998352, + "num_tokens": 23328891.0, + "step": 2612 + }, + { + "epoch": 1.9855623100303952, + "grad_norm": 1.5036180019378662, + "learning_rate": 1.4090946765871105e-06, + "loss": 0.3579009771347046, + "mean_token_accuracy": 0.8698509931564331, + "num_tokens": 23340473.0, + "step": 2613 + }, + { + "epoch": 1.986322188449848, + "grad_norm": 2.0811538696289062, + "learning_rate": 1.4072105833726685e-06, + "loss": 0.2905905246734619, + "mean_token_accuracy": 0.9131759405136108, + "num_tokens": 23346480.0, + "step": 2614 + }, + { + "epoch": 1.9870820668693008, + "grad_norm": 1.2866275310516357, + "learning_rate": 1.4053272571199037e-06, + "loss": 0.4091147184371948, + "mean_token_accuracy": 0.8537255525588989, + "num_tokens": 23361957.0, + "step": 2615 + }, + { + "epoch": 1.987841945288754, + "grad_norm": 1.439497470855713, + "learning_rate": 1.4034446991506084e-06, + "loss": 0.4888972342014313, + "mean_token_accuracy": 0.8451695442199707, + "num_tokens": 23374936.0, + "step": 2616 + }, + { + "epoch": 1.9886018237082066, + "grad_norm": 1.758204698562622, + "learning_rate": 1.401562910786034e-06, + "loss": 0.4976118803024292, + "mean_token_accuracy": 0.8346713781356812, + "num_tokens": 23386102.0, + "step": 2617 + }, + { + "epoch": 1.9893617021276597, + "grad_norm": 1.436486840248108, + "learning_rate": 1.3996818933468926e-06, + "loss": 0.42407113313674927, + "mean_token_accuracy": 0.8529444932937622, + "num_tokens": 23398645.0, + "step": 2618 + }, + { + "epoch": 1.9901215805471124, + "grad_norm": 2.1466588973999023, + "learning_rate": 1.397801648153354e-06, + "loss": 0.45519331097602844, + "mean_token_accuracy": 0.8460411429405212, + "num_tokens": 23406162.0, + "step": 2619 + }, + { + "epoch": 1.9908814589665653, + "grad_norm": 2.0492005348205566, + "learning_rate": 1.395922176525047e-06, + "loss": 0.31093084812164307, + "mean_token_accuracy": 0.8927264213562012, + "num_tokens": 23412051.0, + "step": 2620 + }, + { + "epoch": 1.9916413373860182, + "grad_norm": 2.2639048099517822, + "learning_rate": 1.3940434797810567e-06, + "loss": 0.3804079592227936, + "mean_token_accuracy": 0.8720212578773499, + "num_tokens": 23418252.0, + "step": 2621 + }, + { + "epoch": 1.9924012158054711, + "grad_norm": 1.9541687965393066, + "learning_rate": 1.3921655592399256e-06, + "loss": 0.38776344060897827, + "mean_token_accuracy": 0.858753502368927, + "num_tokens": 23425901.0, + "step": 2622 + }, + { + "epoch": 1.993161094224924, + "grad_norm": 1.5119032859802246, + "learning_rate": 1.3902884162196509e-06, + "loss": 0.39581215381622314, + "mean_token_accuracy": 0.8539663553237915, + "num_tokens": 23439390.0, + "step": 2623 + }, + { + "epoch": 1.993920972644377, + "grad_norm": 2.1608591079711914, + "learning_rate": 1.388412052037682e-06, + "loss": 0.41801220178604126, + "mean_token_accuracy": 0.8703387975692749, + "num_tokens": 23445725.0, + "step": 2624 + }, + { + "epoch": 1.9946808510638299, + "grad_norm": 2.463165521621704, + "learning_rate": 1.3865364680109239e-06, + "loss": 0.3252835273742676, + "mean_token_accuracy": 0.9031686186790466, + "num_tokens": 23451122.0, + "step": 2625 + }, + { + "epoch": 1.9954407294832825, + "grad_norm": 1.1901201009750366, + "learning_rate": 1.384661665455736e-06, + "loss": 0.3358447253704071, + "mean_token_accuracy": 0.8767676949501038, + "num_tokens": 23467381.0, + "step": 2626 + }, + { + "epoch": 1.9962006079027357, + "grad_norm": 1.3035757541656494, + "learning_rate": 1.3827876456879247e-06, + "loss": 0.3736562430858612, + "mean_token_accuracy": 0.849855899810791, + "num_tokens": 23482192.0, + "step": 2627 + }, + { + "epoch": 1.9969604863221884, + "grad_norm": 1.8807034492492676, + "learning_rate": 1.3809144100227483e-06, + "loss": 0.45943766832351685, + "mean_token_accuracy": 0.8456380367279053, + "num_tokens": 23495167.0, + "step": 2628 + }, + { + "epoch": 1.9977203647416415, + "grad_norm": 2.3645784854888916, + "learning_rate": 1.3790419597749198e-06, + "loss": 0.4271511435508728, + "mean_token_accuracy": 0.846099853515625, + "num_tokens": 23500790.0, + "step": 2629 + }, + { + "epoch": 1.9984802431610942, + "grad_norm": 1.8451792001724243, + "learning_rate": 1.3771702962585928e-06, + "loss": 0.38092344999313354, + "mean_token_accuracy": 0.8641276359558105, + "num_tokens": 23508845.0, + "step": 2630 + }, + { + "epoch": 1.999240121580547, + "grad_norm": 1.1115045547485352, + "learning_rate": 1.3752994207873743e-06, + "loss": 0.35954269766807556, + "mean_token_accuracy": 0.8642125129699707, + "num_tokens": 23527929.0, + "step": 2631 + }, + { + "epoch": 2.0, + "grad_norm": 1.406253457069397, + "learning_rate": 1.373429334674317e-06, + "loss": 0.33467042446136475, + "mean_token_accuracy": 0.8713197708129883, + "num_tokens": 23539356.0, + "step": 2632 + }, + { + "epoch": 2.0007598784194527, + "grad_norm": 2.8150978088378906, + "learning_rate": 1.3715600392319186e-06, + "loss": 0.22929656505584717, + "mean_token_accuracy": 0.9197485446929932, + "num_tokens": 23543746.0, + "step": 2633 + }, + { + "epoch": 2.001519756838906, + "grad_norm": 2.6291964054107666, + "learning_rate": 1.369691535772123e-06, + "loss": 0.290000855922699, + "mean_token_accuracy": 0.8979663848876953, + "num_tokens": 23548633.0, + "step": 2634 + }, + { + "epoch": 2.0022796352583585, + "grad_norm": 1.724357008934021, + "learning_rate": 1.3678238256063193e-06, + "loss": 0.3717018663883209, + "mean_token_accuracy": 0.8743406534194946, + "num_tokens": 23557187.0, + "step": 2635 + }, + { + "epoch": 2.0030395136778116, + "grad_norm": 2.3801965713500977, + "learning_rate": 1.3659569100453346e-06, + "loss": 0.3452329635620117, + "mean_token_accuracy": 0.8799462914466858, + "num_tokens": 23563321.0, + "step": 2636 + }, + { + "epoch": 2.0037993920972643, + "grad_norm": 1.8925955295562744, + "learning_rate": 1.3640907903994455e-06, + "loss": 0.32880955934524536, + "mean_token_accuracy": 0.888347864151001, + "num_tokens": 23570571.0, + "step": 2637 + }, + { + "epoch": 2.0045592705167175, + "grad_norm": 1.0761849880218506, + "learning_rate": 1.3622254679783665e-06, + "loss": 0.395224004983902, + "mean_token_accuracy": 0.8637001514434814, + "num_tokens": 23589504.0, + "step": 2638 + }, + { + "epoch": 2.00531914893617, + "grad_norm": 2.1172127723693848, + "learning_rate": 1.3603609440912508e-06, + "loss": 0.32195356488227844, + "mean_token_accuracy": 0.8984324932098389, + "num_tokens": 23595586.0, + "step": 2639 + }, + { + "epoch": 2.0060790273556233, + "grad_norm": 2.127723217010498, + "learning_rate": 1.3584972200466936e-06, + "loss": 0.4710606634616852, + "mean_token_accuracy": 0.8563182950019836, + "num_tokens": 23602747.0, + "step": 2640 + }, + { + "epoch": 2.006838905775076, + "grad_norm": 1.9752192497253418, + "learning_rate": 1.356634297152729e-06, + "loss": 0.24204617738723755, + "mean_token_accuracy": 0.9082983136177063, + "num_tokens": 23609005.0, + "step": 2641 + }, + { + "epoch": 2.007598784194529, + "grad_norm": 2.5435397624969482, + "learning_rate": 1.3547721767168273e-06, + "loss": 0.16702288389205933, + "mean_token_accuracy": 0.9353867769241333, + "num_tokens": 23612852.0, + "step": 2642 + }, + { + "epoch": 2.0083586626139818, + "grad_norm": 1.8113304376602173, + "learning_rate": 1.3529108600458967e-06, + "loss": 0.4245433509349823, + "mean_token_accuracy": 0.8446527719497681, + "num_tokens": 23621462.0, + "step": 2643 + }, + { + "epoch": 2.0091185410334345, + "grad_norm": 1.0438088178634644, + "learning_rate": 1.3510503484462807e-06, + "loss": 0.3710743188858032, + "mean_token_accuracy": 0.8731123208999634, + "num_tokens": 23642029.0, + "step": 2644 + }, + { + "epoch": 2.0098784194528876, + "grad_norm": 1.9650516510009766, + "learning_rate": 1.349190643223758e-06, + "loss": 0.32384324073791504, + "mean_token_accuracy": 0.8859044313430786, + "num_tokens": 23648970.0, + "step": 2645 + }, + { + "epoch": 2.0106382978723403, + "grad_norm": 1.4213180541992188, + "learning_rate": 1.347331745683542e-06, + "loss": 0.42391857504844666, + "mean_token_accuracy": 0.8568997383117676, + "num_tokens": 23663012.0, + "step": 2646 + }, + { + "epoch": 2.0113981762917934, + "grad_norm": 1.852386236190796, + "learning_rate": 1.3454736571302761e-06, + "loss": 0.37283188104629517, + "mean_token_accuracy": 0.9096506834030151, + "num_tokens": 23671632.0, + "step": 2647 + }, + { + "epoch": 2.012158054711246, + "grad_norm": 1.8350872993469238, + "learning_rate": 1.3436163788680411e-06, + "loss": 0.21148793399333954, + "mean_token_accuracy": 0.9306647181510925, + "num_tokens": 23678554.0, + "step": 2648 + }, + { + "epoch": 2.012917933130699, + "grad_norm": 1.8285188674926758, + "learning_rate": 1.3417599122003464e-06, + "loss": 0.2638583183288574, + "mean_token_accuracy": 0.904695987701416, + "num_tokens": 23686905.0, + "step": 2649 + }, + { + "epoch": 2.013677811550152, + "grad_norm": 1.1955424547195435, + "learning_rate": 1.3399042584301298e-06, + "loss": 0.30598434805870056, + "mean_token_accuracy": 0.8953701257705688, + "num_tokens": 23702734.0, + "step": 2650 + }, + { + "epoch": 2.014437689969605, + "grad_norm": 1.5378512144088745, + "learning_rate": 1.3380494188597603e-06, + "loss": 0.33754611015319824, + "mean_token_accuracy": 0.9063926935195923, + "num_tokens": 23715891.0, + "step": 2651 + }, + { + "epoch": 2.0151975683890577, + "grad_norm": 1.6957111358642578, + "learning_rate": 1.3361953947910394e-06, + "loss": 0.26302939653396606, + "mean_token_accuracy": 0.90192711353302, + "num_tokens": 23724034.0, + "step": 2652 + }, + { + "epoch": 2.015957446808511, + "grad_norm": 1.1756837368011475, + "learning_rate": 1.334342187525189e-06, + "loss": 0.3312695622444153, + "mean_token_accuracy": 0.870500385761261, + "num_tokens": 23741241.0, + "step": 2653 + }, + { + "epoch": 2.0167173252279635, + "grad_norm": 1.027145266532898, + "learning_rate": 1.3324897983628621e-06, + "loss": 0.2534530758857727, + "mean_token_accuracy": 0.894199550151825, + "num_tokens": 23758399.0, + "step": 2654 + }, + { + "epoch": 2.0174772036474162, + "grad_norm": 2.2585113048553467, + "learning_rate": 1.330638228604137e-06, + "loss": 0.4558389186859131, + "mean_token_accuracy": 0.8372241258621216, + "num_tokens": 23766871.0, + "step": 2655 + }, + { + "epoch": 2.0182370820668694, + "grad_norm": 1.886893630027771, + "learning_rate": 1.3287874795485168e-06, + "loss": 0.29894912242889404, + "mean_token_accuracy": 0.9086098670959473, + "num_tokens": 23774935.0, + "step": 2656 + }, + { + "epoch": 2.018996960486322, + "grad_norm": 2.082537889480591, + "learning_rate": 1.3269375524949286e-06, + "loss": 0.39323803782463074, + "mean_token_accuracy": 0.8598287105560303, + "num_tokens": 23781303.0, + "step": 2657 + }, + { + "epoch": 2.019756838905775, + "grad_norm": 1.7059803009033203, + "learning_rate": 1.3250884487417227e-06, + "loss": 0.17909850180149078, + "mean_token_accuracy": 0.9276094436645508, + "num_tokens": 23789148.0, + "step": 2658 + }, + { + "epoch": 2.020516717325228, + "grad_norm": 2.150275945663452, + "learning_rate": 1.3232401695866686e-06, + "loss": 0.3707781434059143, + "mean_token_accuracy": 0.8587700128555298, + "num_tokens": 23795484.0, + "step": 2659 + }, + { + "epoch": 2.021276595744681, + "grad_norm": 2.0554518699645996, + "learning_rate": 1.321392716326963e-06, + "loss": 0.33217954635620117, + "mean_token_accuracy": 0.874828577041626, + "num_tokens": 23802968.0, + "step": 2660 + }, + { + "epoch": 2.0220364741641337, + "grad_norm": 2.4556071758270264, + "learning_rate": 1.3195460902592193e-06, + "loss": 0.2790899872779846, + "mean_token_accuracy": 0.9071618914604187, + "num_tokens": 23807788.0, + "step": 2661 + }, + { + "epoch": 2.022796352583587, + "grad_norm": 1.7501509189605713, + "learning_rate": 1.3177002926794685e-06, + "loss": 0.3080750107765198, + "mean_token_accuracy": 0.8942672610282898, + "num_tokens": 23816023.0, + "step": 2662 + }, + { + "epoch": 2.0235562310030395, + "grad_norm": 1.3934804201126099, + "learning_rate": 1.3158553248831658e-06, + "loss": 0.286912202835083, + "mean_token_accuracy": 0.9284837245941162, + "num_tokens": 23827186.0, + "step": 2663 + }, + { + "epoch": 2.024316109422492, + "grad_norm": 1.2530465126037598, + "learning_rate": 1.3140111881651773e-06, + "loss": 0.2630627155303955, + "mean_token_accuracy": 0.9029854536056519, + "num_tokens": 23841399.0, + "step": 2664 + }, + { + "epoch": 2.0250759878419453, + "grad_norm": 1.3417384624481201, + "learning_rate": 1.312167883819791e-06, + "loss": 0.37794870138168335, + "mean_token_accuracy": 0.8722256422042847, + "num_tokens": 23856061.0, + "step": 2665 + }, + { + "epoch": 2.025835866261398, + "grad_norm": 2.234257698059082, + "learning_rate": 1.3103254131407082e-06, + "loss": 0.2739933133125305, + "mean_token_accuracy": 0.9055665135383606, + "num_tokens": 23861865.0, + "step": 2666 + }, + { + "epoch": 2.026595744680851, + "grad_norm": 1.4187006950378418, + "learning_rate": 1.308483777421046e-06, + "loss": 0.24370817840099335, + "mean_token_accuracy": 0.9145886301994324, + "num_tokens": 23873632.0, + "step": 2667 + }, + { + "epoch": 2.027355623100304, + "grad_norm": 2.3645882606506348, + "learning_rate": 1.3066429779533352e-06, + "loss": 0.23659822344779968, + "mean_token_accuracy": 0.9209753274917603, + "num_tokens": 23878866.0, + "step": 2668 + }, + { + "epoch": 2.028115501519757, + "grad_norm": 1.4782226085662842, + "learning_rate": 1.3048030160295196e-06, + "loss": 0.3353138267993927, + "mean_token_accuracy": 0.8747807741165161, + "num_tokens": 23891089.0, + "step": 2669 + }, + { + "epoch": 2.0288753799392096, + "grad_norm": 2.051754951477051, + "learning_rate": 1.3029638929409555e-06, + "loss": 0.2905973196029663, + "mean_token_accuracy": 0.887441873550415, + "num_tokens": 23897653.0, + "step": 2670 + }, + { + "epoch": 2.0296352583586628, + "grad_norm": 1.322279453277588, + "learning_rate": 1.3011256099784103e-06, + "loss": 0.3938416540622711, + "mean_token_accuracy": 0.8911079168319702, + "num_tokens": 23912525.0, + "step": 2671 + }, + { + "epoch": 2.0303951367781155, + "grad_norm": 1.87980318069458, + "learning_rate": 1.2992881684320627e-06, + "loss": 0.16637520492076874, + "mean_token_accuracy": 0.9472321271896362, + "num_tokens": 23918752.0, + "step": 2672 + }, + { + "epoch": 2.0311550151975686, + "grad_norm": 2.0867233276367188, + "learning_rate": 1.297451569591498e-06, + "loss": 0.37282776832580566, + "mean_token_accuracy": 0.8688399195671082, + "num_tokens": 23925918.0, + "step": 2673 + }, + { + "epoch": 2.0319148936170213, + "grad_norm": 1.129468560218811, + "learning_rate": 1.2956158147457116e-06, + "loss": 0.33072173595428467, + "mean_token_accuracy": 0.8788217306137085, + "num_tokens": 23944702.0, + "step": 2674 + }, + { + "epoch": 2.032674772036474, + "grad_norm": 3.6016290187835693, + "learning_rate": 1.2937809051831102e-06, + "loss": 0.28343498706817627, + "mean_token_accuracy": 0.911794900894165, + "num_tokens": 23948417.0, + "step": 2675 + }, + { + "epoch": 2.033434650455927, + "grad_norm": 1.4904811382293701, + "learning_rate": 1.2919468421915008e-06, + "loss": 0.4072638750076294, + "mean_token_accuracy": 0.8615934252738953, + "num_tokens": 23963654.0, + "step": 2676 + }, + { + "epoch": 2.0341945288753798, + "grad_norm": 2.90740704536438, + "learning_rate": 1.2901136270580994e-06, + "loss": 0.3685106635093689, + "mean_token_accuracy": 0.8923419713973999, + "num_tokens": 23968608.0, + "step": 2677 + }, + { + "epoch": 2.034954407294833, + "grad_norm": 1.8772104978561401, + "learning_rate": 1.2882812610695305e-06, + "loss": 0.2947828471660614, + "mean_token_accuracy": 0.9065762758255005, + "num_tokens": 23978298.0, + "step": 2678 + }, + { + "epoch": 2.0357142857142856, + "grad_norm": 1.2135536670684814, + "learning_rate": 1.2864497455118152e-06, + "loss": 0.36015012860298157, + "mean_token_accuracy": 0.8481813073158264, + "num_tokens": 23995784.0, + "step": 2679 + }, + { + "epoch": 2.0364741641337387, + "grad_norm": 1.941889762878418, + "learning_rate": 1.2846190816703836e-06, + "loss": 0.3004198670387268, + "mean_token_accuracy": 0.8843618631362915, + "num_tokens": 24002651.0, + "step": 2680 + }, + { + "epoch": 2.0372340425531914, + "grad_norm": 1.8905075788497925, + "learning_rate": 1.2827892708300648e-06, + "loss": 0.26640570163726807, + "mean_token_accuracy": 0.9079146385192871, + "num_tokens": 24010400.0, + "step": 2681 + }, + { + "epoch": 2.0379939209726445, + "grad_norm": 1.2975934743881226, + "learning_rate": 1.280960314275092e-06, + "loss": 0.19093887507915497, + "mean_token_accuracy": 0.9277223348617554, + "num_tokens": 24021528.0, + "step": 2682 + }, + { + "epoch": 2.038753799392097, + "grad_norm": 1.6483098268508911, + "learning_rate": 1.279132213289096e-06, + "loss": 0.29260069131851196, + "mean_token_accuracy": 0.892486572265625, + "num_tokens": 24030470.0, + "step": 2683 + }, + { + "epoch": 2.0395136778115504, + "grad_norm": 1.6875916719436646, + "learning_rate": 1.2773049691551103e-06, + "loss": 0.3784627914428711, + "mean_token_accuracy": 0.8682783842086792, + "num_tokens": 24041608.0, + "step": 2684 + }, + { + "epoch": 2.040273556231003, + "grad_norm": 2.1055848598480225, + "learning_rate": 1.2754785831555617e-06, + "loss": 0.14676237106323242, + "mean_token_accuracy": 0.9532995223999023, + "num_tokens": 24046687.0, + "step": 2685 + }, + { + "epoch": 2.0410334346504557, + "grad_norm": 1.3862961530685425, + "learning_rate": 1.273653056572282e-06, + "loss": 0.34408485889434814, + "mean_token_accuracy": 0.8748919367790222, + "num_tokens": 24059147.0, + "step": 2686 + }, + { + "epoch": 2.041793313069909, + "grad_norm": 2.936876058578491, + "learning_rate": 1.2718283906864939e-06, + "loss": 0.2471027672290802, + "mean_token_accuracy": 0.9177526235580444, + "num_tokens": 24062963.0, + "step": 2687 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 1.3992520570755005, + "learning_rate": 1.2700045867788184e-06, + "loss": 0.421109139919281, + "mean_token_accuracy": 0.8664785623550415, + "num_tokens": 24077912.0, + "step": 2688 + }, + { + "epoch": 2.0433130699088147, + "grad_norm": 3.0531985759735107, + "learning_rate": 1.2681816461292715e-06, + "loss": 0.292591392993927, + "mean_token_accuracy": 0.8992351293563843, + "num_tokens": 24082058.0, + "step": 2689 + }, + { + "epoch": 2.0440729483282674, + "grad_norm": 1.4562251567840576, + "learning_rate": 1.2663595700172631e-06, + "loss": 0.39367130398750305, + "mean_token_accuracy": 0.8894597887992859, + "num_tokens": 24093954.0, + "step": 2690 + }, + { + "epoch": 2.0448328267477205, + "grad_norm": 1.9354028701782227, + "learning_rate": 1.2645383597215965e-06, + "loss": 0.28203579783439636, + "mean_token_accuracy": 0.9011955261230469, + "num_tokens": 24100590.0, + "step": 2691 + }, + { + "epoch": 2.045592705167173, + "grad_norm": 1.5010690689086914, + "learning_rate": 1.2627180165204671e-06, + "loss": 0.3463609516620636, + "mean_token_accuracy": 0.8978298306465149, + "num_tokens": 24111104.0, + "step": 2692 + }, + { + "epoch": 2.0463525835866263, + "grad_norm": 2.585813045501709, + "learning_rate": 1.2608985416914616e-06, + "loss": 0.2142711877822876, + "mean_token_accuracy": 0.9260460138320923, + "num_tokens": 24115301.0, + "step": 2693 + }, + { + "epoch": 2.047112462006079, + "grad_norm": 2.317268133163452, + "learning_rate": 1.259079936511558e-06, + "loss": 0.14454546570777893, + "mean_token_accuracy": 0.9498077034950256, + "num_tokens": 24120295.0, + "step": 2694 + }, + { + "epoch": 2.047872340425532, + "grad_norm": 1.966550350189209, + "learning_rate": 1.257262202257124e-06, + "loss": 0.20745311677455902, + "mean_token_accuracy": 0.9157166481018066, + "num_tokens": 24127158.0, + "step": 2695 + }, + { + "epoch": 2.048632218844985, + "grad_norm": 1.6521401405334473, + "learning_rate": 1.2554453402039124e-06, + "loss": 0.2547406256198883, + "mean_token_accuracy": 0.9356101751327515, + "num_tokens": 24135620.0, + "step": 2696 + }, + { + "epoch": 2.0493920972644375, + "grad_norm": 2.341756582260132, + "learning_rate": 1.2536293516270704e-06, + "loss": 0.35540008544921875, + "mean_token_accuracy": 0.874363899230957, + "num_tokens": 24141766.0, + "step": 2697 + }, + { + "epoch": 2.0501519756838906, + "grad_norm": 1.7938716411590576, + "learning_rate": 1.251814237801128e-06, + "loss": 0.37250861525535583, + "mean_token_accuracy": 0.8644422292709351, + "num_tokens": 24149997.0, + "step": 2698 + }, + { + "epoch": 2.0509118541033433, + "grad_norm": 2.0868122577667236, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.44527092576026917, + "mean_token_accuracy": 0.8510264158248901, + "num_tokens": 24158208.0, + "step": 2699 + }, + { + "epoch": 2.0516717325227964, + "grad_norm": 2.412604808807373, + "learning_rate": 1.24818663949699e-06, + "loss": 0.19276219606399536, + "mean_token_accuracy": 0.9317681789398193, + "num_tokens": 24162905.0, + "step": 2700 + }, + { + "epoch": 2.052431610942249, + "grad_norm": 1.4488455057144165, + "learning_rate": 1.246374157564785e-06, + "loss": 0.3493705093860626, + "mean_token_accuracy": 0.9016396999359131, + "num_tokens": 24175852.0, + "step": 2701 + }, + { + "epoch": 2.0531914893617023, + "grad_norm": 2.1629185676574707, + "learning_rate": 1.2445625554754526e-06, + "loss": 0.30588388442993164, + "mean_token_accuracy": 0.8871392011642456, + "num_tokens": 24181507.0, + "step": 2702 + }, + { + "epoch": 2.053951367781155, + "grad_norm": 2.0489449501037598, + "learning_rate": 1.2427518345004459e-06, + "loss": 0.4578161835670471, + "mean_token_accuracy": 0.8498104214668274, + "num_tokens": 24191918.0, + "step": 2703 + }, + { + "epoch": 2.054711246200608, + "grad_norm": 2.063019037246704, + "learning_rate": 1.2409419959105981e-06, + "loss": 0.31680572032928467, + "mean_token_accuracy": 0.8809083700180054, + "num_tokens": 24199336.0, + "step": 2704 + }, + { + "epoch": 2.0554711246200608, + "grad_norm": 2.4594223499298096, + "learning_rate": 1.239133040976124e-06, + "loss": 0.3048282265663147, + "mean_token_accuracy": 0.8897095322608948, + "num_tokens": 24205118.0, + "step": 2705 + }, + { + "epoch": 2.056231003039514, + "grad_norm": 1.6359999179840088, + "learning_rate": 1.237324970966618e-06, + "loss": 0.4312370717525482, + "mean_token_accuracy": 0.8526142835617065, + "num_tokens": 24215792.0, + "step": 2706 + }, + { + "epoch": 2.0569908814589666, + "grad_norm": 1.5534536838531494, + "learning_rate": 1.2355177871510538e-06, + "loss": 0.3647908568382263, + "mean_token_accuracy": 0.8680631518363953, + "num_tokens": 24235325.0, + "step": 2707 + }, + { + "epoch": 2.0577507598784193, + "grad_norm": 2.4902515411376953, + "learning_rate": 1.2337114907977798e-06, + "loss": 0.3605276942253113, + "mean_token_accuracy": 0.8776376843452454, + "num_tokens": 24241502.0, + "step": 2708 + }, + { + "epoch": 2.0585106382978724, + "grad_norm": 1.7282993793487549, + "learning_rate": 1.2319060831745273e-06, + "loss": 0.38326722383499146, + "mean_token_accuracy": 0.8531644344329834, + "num_tokens": 24252665.0, + "step": 2709 + }, + { + "epoch": 2.059270516717325, + "grad_norm": 1.4213361740112305, + "learning_rate": 1.2301015655484006e-06, + "loss": 0.32221150398254395, + "mean_token_accuracy": 0.8890664577484131, + "num_tokens": 24266409.0, + "step": 2710 + }, + { + "epoch": 2.060030395136778, + "grad_norm": 2.6412453651428223, + "learning_rate": 1.2282979391858767e-06, + "loss": 0.20225220918655396, + "mean_token_accuracy": 0.9287782311439514, + "num_tokens": 24271069.0, + "step": 2711 + }, + { + "epoch": 2.060790273556231, + "grad_norm": 3.2601654529571533, + "learning_rate": 1.2264952053528145e-06, + "loss": 0.23259003460407257, + "mean_token_accuracy": 0.9290606379508972, + "num_tokens": 24274992.0, + "step": 2712 + }, + { + "epoch": 2.061550151975684, + "grad_norm": 1.6633410453796387, + "learning_rate": 1.2246933653144386e-06, + "loss": 0.355314165353775, + "mean_token_accuracy": 0.870380163192749, + "num_tokens": 24284917.0, + "step": 2713 + }, + { + "epoch": 2.0623100303951367, + "grad_norm": 2.9081318378448486, + "learning_rate": 1.2228924203353507e-06, + "loss": 0.38050833344459534, + "mean_token_accuracy": 0.8879997730255127, + "num_tokens": 24289694.0, + "step": 2714 + }, + { + "epoch": 2.06306990881459, + "grad_norm": 3.2404227256774902, + "learning_rate": 1.2210923716795233e-06, + "loss": 0.2502570152282715, + "mean_token_accuracy": 0.9150978922843933, + "num_tokens": 24293254.0, + "step": 2715 + }, + { + "epoch": 2.0638297872340425, + "grad_norm": 1.9262174367904663, + "learning_rate": 1.2192932206103e-06, + "loss": 0.26763200759887695, + "mean_token_accuracy": 0.9203122854232788, + "num_tokens": 24300881.0, + "step": 2716 + }, + { + "epoch": 2.0645896656534957, + "grad_norm": 1.6790109872817993, + "learning_rate": 1.2174949683903943e-06, + "loss": 0.22275440394878387, + "mean_token_accuracy": 0.9212621450424194, + "num_tokens": 24309288.0, + "step": 2717 + }, + { + "epoch": 2.0653495440729484, + "grad_norm": 1.8272414207458496, + "learning_rate": 1.2156976162818895e-06, + "loss": 0.3183424472808838, + "mean_token_accuracy": 0.8813169002532959, + "num_tokens": 24316980.0, + "step": 2718 + }, + { + "epoch": 2.066109422492401, + "grad_norm": 2.7388651371002197, + "learning_rate": 1.2139011655462338e-06, + "loss": 0.24794816970825195, + "mean_token_accuracy": 0.9109550714492798, + "num_tokens": 24321867.0, + "step": 2719 + }, + { + "epoch": 2.066869300911854, + "grad_norm": 1.4866925477981567, + "learning_rate": 1.2121056174442484e-06, + "loss": 0.24177205562591553, + "mean_token_accuracy": 0.9102780818939209, + "num_tokens": 24332874.0, + "step": 2720 + }, + { + "epoch": 2.067629179331307, + "grad_norm": 1.6006059646606445, + "learning_rate": 1.2103109732361178e-06, + "loss": 0.29220807552337646, + "mean_token_accuracy": 0.8947570323944092, + "num_tokens": 24342790.0, + "step": 2721 + }, + { + "epoch": 2.06838905775076, + "grad_norm": 2.2688677310943604, + "learning_rate": 1.208517234181391e-06, + "loss": 0.39247143268585205, + "mean_token_accuracy": 0.8514304161071777, + "num_tokens": 24349329.0, + "step": 2722 + }, + { + "epoch": 2.0691489361702127, + "grad_norm": 2.404534339904785, + "learning_rate": 1.2067244015389829e-06, + "loss": 0.4461793303489685, + "mean_token_accuracy": 0.8531662821769714, + "num_tokens": 24356287.0, + "step": 2723 + }, + { + "epoch": 2.069908814589666, + "grad_norm": 1.813341498374939, + "learning_rate": 1.204932476567175e-06, + "loss": 0.38300177454948425, + "mean_token_accuracy": 0.8597674369812012, + "num_tokens": 24366181.0, + "step": 2724 + }, + { + "epoch": 2.0706686930091185, + "grad_norm": 3.49125337600708, + "learning_rate": 1.2031414605236066e-06, + "loss": 0.33281540870666504, + "mean_token_accuracy": 0.8774969577789307, + "num_tokens": 24370362.0, + "step": 2725 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 1.7682114839553833, + "learning_rate": 1.2013513546652827e-06, + "loss": 0.3001813590526581, + "mean_token_accuracy": 0.8840254545211792, + "num_tokens": 24380469.0, + "step": 2726 + }, + { + "epoch": 2.0721884498480243, + "grad_norm": 2.3688952922821045, + "learning_rate": 1.1995621602485685e-06, + "loss": 0.20055249333381653, + "mean_token_accuracy": 0.9246129989624023, + "num_tokens": 24385474.0, + "step": 2727 + }, + { + "epoch": 2.072948328267477, + "grad_norm": 2.3368382453918457, + "learning_rate": 1.1977738785291894e-06, + "loss": 0.18379954993724823, + "mean_token_accuracy": 0.9385529160499573, + "num_tokens": 24390002.0, + "step": 2728 + }, + { + "epoch": 2.07370820668693, + "grad_norm": 1.857473373413086, + "learning_rate": 1.1959865107622306e-06, + "loss": 0.4606894552707672, + "mean_token_accuracy": 0.8437427282333374, + "num_tokens": 24400880.0, + "step": 2729 + }, + { + "epoch": 2.074468085106383, + "grad_norm": 1.2714136838912964, + "learning_rate": 1.1942000582021355e-06, + "loss": 0.21171459555625916, + "mean_token_accuracy": 0.9216019511222839, + "num_tokens": 24413113.0, + "step": 2730 + }, + { + "epoch": 2.075227963525836, + "grad_norm": 2.2025210857391357, + "learning_rate": 1.1924145221027048e-06, + "loss": 0.44211941957473755, + "mean_token_accuracy": 0.8538386821746826, + "num_tokens": 24420504.0, + "step": 2731 + }, + { + "epoch": 2.0759878419452886, + "grad_norm": 1.6706589460372925, + "learning_rate": 1.190629903717097e-06, + "loss": 0.35163265466690063, + "mean_token_accuracy": 0.8716240525245667, + "num_tokens": 24430203.0, + "step": 2732 + }, + { + "epoch": 2.0767477203647418, + "grad_norm": 2.299182176589966, + "learning_rate": 1.1888462042978268e-06, + "loss": 0.30983975529670715, + "mean_token_accuracy": 0.8859797716140747, + "num_tokens": 24437387.0, + "step": 2733 + }, + { + "epoch": 2.0775075987841944, + "grad_norm": 2.975123167037964, + "learning_rate": 1.1870634250967606e-06, + "loss": 0.23585952818393707, + "mean_token_accuracy": 0.9167368412017822, + "num_tokens": 24441176.0, + "step": 2734 + }, + { + "epoch": 2.0782674772036476, + "grad_norm": 1.1052464246749878, + "learning_rate": 1.1852815673651246e-06, + "loss": 0.24136316776275635, + "mean_token_accuracy": 0.8897353410720825, + "num_tokens": 24457092.0, + "step": 2735 + }, + { + "epoch": 2.0790273556231003, + "grad_norm": 1.5531870126724243, + "learning_rate": 1.1835006323534926e-06, + "loss": 0.302223265171051, + "mean_token_accuracy": 0.8940514326095581, + "num_tokens": 24467643.0, + "step": 2736 + }, + { + "epoch": 2.0797872340425534, + "grad_norm": 1.706140398979187, + "learning_rate": 1.1817206213117943e-06, + "loss": 0.39235255122184753, + "mean_token_accuracy": 0.8615218997001648, + "num_tokens": 24477715.0, + "step": 2737 + }, + { + "epoch": 2.080547112462006, + "grad_norm": 2.1109750270843506, + "learning_rate": 1.1799415354893103e-06, + "loss": 0.2526751756668091, + "mean_token_accuracy": 0.9108465909957886, + "num_tokens": 24484248.0, + "step": 2738 + }, + { + "epoch": 2.0813069908814588, + "grad_norm": 1.9943277835845947, + "learning_rate": 1.178163376134671e-06, + "loss": 0.3540172874927521, + "mean_token_accuracy": 0.9131139516830444, + "num_tokens": 24492207.0, + "step": 2739 + }, + { + "epoch": 2.082066869300912, + "grad_norm": 1.9536099433898926, + "learning_rate": 1.1763861444958573e-06, + "loss": 0.3902950584888458, + "mean_token_accuracy": 0.8611530065536499, + "num_tokens": 24501567.0, + "step": 2740 + }, + { + "epoch": 2.0828267477203646, + "grad_norm": 3.146925926208496, + "learning_rate": 1.1746098418201987e-06, + "loss": 0.43440669775009155, + "mean_token_accuracy": 0.8709320425987244, + "num_tokens": 24506684.0, + "step": 2741 + }, + { + "epoch": 2.0835866261398177, + "grad_norm": 2.763427495956421, + "learning_rate": 1.172834469354373e-06, + "loss": 0.3513452410697937, + "mean_token_accuracy": 0.8774256110191345, + "num_tokens": 24511509.0, + "step": 2742 + }, + { + "epoch": 2.0843465045592704, + "grad_norm": 2.773829221725464, + "learning_rate": 1.1710600283444048e-06, + "loss": 0.24668049812316895, + "mean_token_accuracy": 0.9146889448165894, + "num_tokens": 24516030.0, + "step": 2743 + }, + { + "epoch": 2.0851063829787235, + "grad_norm": 1.666471242904663, + "learning_rate": 1.169286520035666e-06, + "loss": 0.36206915974617004, + "mean_token_accuracy": 0.8711973428726196, + "num_tokens": 24526656.0, + "step": 2744 + }, + { + "epoch": 2.085866261398176, + "grad_norm": 2.818890333175659, + "learning_rate": 1.1675139456728702e-06, + "loss": 0.32967281341552734, + "mean_token_accuracy": 0.880983829498291, + "num_tokens": 24531625.0, + "step": 2745 + }, + { + "epoch": 2.0866261398176293, + "grad_norm": 1.09058678150177, + "learning_rate": 1.1657423065000811e-06, + "loss": 0.36224377155303955, + "mean_token_accuracy": 0.8708326816558838, + "num_tokens": 24557123.0, + "step": 2746 + }, + { + "epoch": 2.087386018237082, + "grad_norm": 1.1434987783432007, + "learning_rate": 1.1639716037607036e-06, + "loss": 0.26490458846092224, + "mean_token_accuracy": 0.9131897687911987, + "num_tokens": 24573223.0, + "step": 2747 + }, + { + "epoch": 2.088145896656535, + "grad_norm": 2.437505006790161, + "learning_rate": 1.1622018386974829e-06, + "loss": 0.18964408338069916, + "mean_token_accuracy": 0.9271818399429321, + "num_tokens": 24578306.0, + "step": 2748 + }, + { + "epoch": 2.088905775075988, + "grad_norm": 1.797308325767517, + "learning_rate": 1.160433012552508e-06, + "loss": 0.3090781569480896, + "mean_token_accuracy": 0.8960750102996826, + "num_tokens": 24587562.0, + "step": 2749 + }, + { + "epoch": 2.0896656534954405, + "grad_norm": 2.4050841331481934, + "learning_rate": 1.1586651265672122e-06, + "loss": 0.4001041054725647, + "mean_token_accuracy": 0.8588370084762573, + "num_tokens": 24594223.0, + "step": 2750 + }, + { + "epoch": 2.0904255319148937, + "grad_norm": 1.8757156133651733, + "learning_rate": 1.1568981819823636e-06, + "loss": 0.37845075130462646, + "mean_token_accuracy": 0.866146445274353, + "num_tokens": 24602556.0, + "step": 2751 + }, + { + "epoch": 2.0911854103343464, + "grad_norm": 1.8205114603042603, + "learning_rate": 1.1551321800380722e-06, + "loss": 0.24738016724586487, + "mean_token_accuracy": 0.923284113407135, + "num_tokens": 24611627.0, + "step": 2752 + }, + { + "epoch": 2.0919452887537995, + "grad_norm": 2.107512950897217, + "learning_rate": 1.153367121973786e-06, + "loss": 0.3062688410282135, + "mean_token_accuracy": 0.8909003734588623, + "num_tokens": 24619569.0, + "step": 2753 + }, + { + "epoch": 2.092705167173252, + "grad_norm": 1.93110191822052, + "learning_rate": 1.1516030090282915e-06, + "loss": 0.38658422231674194, + "mean_token_accuracy": 0.869437038898468, + "num_tokens": 24628869.0, + "step": 2754 + }, + { + "epoch": 2.0934650455927053, + "grad_norm": 2.3618004322052, + "learning_rate": 1.1498398424397106e-06, + "loss": 0.19193072617053986, + "mean_token_accuracy": 0.9329519271850586, + "num_tokens": 24633724.0, + "step": 2755 + }, + { + "epoch": 2.094224924012158, + "grad_norm": 2.274510622024536, + "learning_rate": 1.1480776234455024e-06, + "loss": 0.24939998984336853, + "mean_token_accuracy": 0.9104958772659302, + "num_tokens": 24642762.0, + "step": 2756 + }, + { + "epoch": 2.094984802431611, + "grad_norm": 1.7468934059143066, + "learning_rate": 1.1463163532824572e-06, + "loss": 0.3876607418060303, + "mean_token_accuracy": 0.8540539145469666, + "num_tokens": 24652138.0, + "step": 2757 + }, + { + "epoch": 2.095744680851064, + "grad_norm": 2.905381441116333, + "learning_rate": 1.1445560331867054e-06, + "loss": 0.33666878938674927, + "mean_token_accuracy": 0.8805598616600037, + "num_tokens": 24656612.0, + "step": 2758 + }, + { + "epoch": 2.096504559270517, + "grad_norm": 1.5513007640838623, + "learning_rate": 1.142796664393707e-06, + "loss": 0.25168463587760925, + "mean_token_accuracy": 0.925534725189209, + "num_tokens": 24667132.0, + "step": 2759 + }, + { + "epoch": 2.0972644376899696, + "grad_norm": 1.6804249286651611, + "learning_rate": 1.141038248138253e-06, + "loss": 0.3862859010696411, + "mean_token_accuracy": 0.8686253428459167, + "num_tokens": 24679274.0, + "step": 2760 + }, + { + "epoch": 2.0980243161094223, + "grad_norm": 1.7432880401611328, + "learning_rate": 1.1392807856544682e-06, + "loss": 0.3200700879096985, + "mean_token_accuracy": 0.9188123941421509, + "num_tokens": 24688628.0, + "step": 2761 + }, + { + "epoch": 2.0987841945288754, + "grad_norm": 1.8734468221664429, + "learning_rate": 1.1375242781758077e-06, + "loss": 0.34758424758911133, + "mean_token_accuracy": 0.8724187016487122, + "num_tokens": 24698159.0, + "step": 2762 + }, + { + "epoch": 2.099544072948328, + "grad_norm": 3.7156829833984375, + "learning_rate": 1.1357687269350564e-06, + "loss": 0.30014732480049133, + "mean_token_accuracy": 0.9021577835083008, + "num_tokens": 24701797.0, + "step": 2763 + }, + { + "epoch": 2.1003039513677813, + "grad_norm": 1.5196985006332397, + "learning_rate": 1.1340141331643276e-06, + "loss": 0.45747464895248413, + "mean_token_accuracy": 0.839891791343689, + "num_tokens": 24716468.0, + "step": 2764 + }, + { + "epoch": 2.101063829787234, + "grad_norm": 1.978009581565857, + "learning_rate": 1.132260498095062e-06, + "loss": 0.3130183815956116, + "mean_token_accuracy": 0.90610271692276, + "num_tokens": 24723211.0, + "step": 2765 + }, + { + "epoch": 2.101823708206687, + "grad_norm": 1.5883251428604126, + "learning_rate": 1.1305078229580294e-06, + "loss": 0.30493029952049255, + "mean_token_accuracy": 0.8889745473861694, + "num_tokens": 24733839.0, + "step": 2766 + }, + { + "epoch": 2.1025835866261398, + "grad_norm": 1.2397783994674683, + "learning_rate": 1.128756108983325e-06, + "loss": 0.2606407105922699, + "mean_token_accuracy": 0.9061247110366821, + "num_tokens": 24747488.0, + "step": 2767 + }, + { + "epoch": 2.103343465045593, + "grad_norm": 1.3046784400939941, + "learning_rate": 1.1270053574003658e-06, + "loss": 0.38750404119491577, + "mean_token_accuracy": 0.8777017593383789, + "num_tokens": 24763893.0, + "step": 2768 + }, + { + "epoch": 2.1041033434650456, + "grad_norm": 1.499266266822815, + "learning_rate": 1.1252555694379005e-06, + "loss": 0.4804937243461609, + "mean_token_accuracy": 0.8344086408615112, + "num_tokens": 24779323.0, + "step": 2769 + }, + { + "epoch": 2.1048632218844983, + "grad_norm": 1.211094856262207, + "learning_rate": 1.123506746323997e-06, + "loss": 0.3579246997833252, + "mean_token_accuracy": 0.8705919981002808, + "num_tokens": 24794965.0, + "step": 2770 + }, + { + "epoch": 2.1056231003039514, + "grad_norm": 2.490551471710205, + "learning_rate": 1.1217588892860446e-06, + "loss": 0.4084790349006653, + "mean_token_accuracy": 0.8553222417831421, + "num_tokens": 24800614.0, + "step": 2771 + }, + { + "epoch": 2.106382978723404, + "grad_norm": 1.5249632596969604, + "learning_rate": 1.1200119995507572e-06, + "loss": 0.36853182315826416, + "mean_token_accuracy": 0.8847414255142212, + "num_tokens": 24812886.0, + "step": 2772 + }, + { + "epoch": 2.107142857142857, + "grad_norm": 1.8510968685150146, + "learning_rate": 1.1182660783441719e-06, + "loss": 0.2918103337287903, + "mean_token_accuracy": 0.8898224830627441, + "num_tokens": 24821545.0, + "step": 2773 + }, + { + "epoch": 2.10790273556231, + "grad_norm": 1.7721803188323975, + "learning_rate": 1.11652112689164e-06, + "loss": 0.2920452654361725, + "mean_token_accuracy": 0.8879085779190063, + "num_tokens": 24831526.0, + "step": 2774 + }, + { + "epoch": 2.108662613981763, + "grad_norm": 1.3987336158752441, + "learning_rate": 1.1147771464178378e-06, + "loss": 0.4407062828540802, + "mean_token_accuracy": 0.8472493886947632, + "num_tokens": 24845847.0, + "step": 2775 + }, + { + "epoch": 2.1094224924012157, + "grad_norm": 1.8927375078201294, + "learning_rate": 1.1130341381467569e-06, + "loss": 0.36293038725852966, + "mean_token_accuracy": 0.8881135582923889, + "num_tokens": 24854760.0, + "step": 2776 + }, + { + "epoch": 2.110182370820669, + "grad_norm": 3.0480666160583496, + "learning_rate": 1.111292103301708e-06, + "loss": 0.30395108461380005, + "mean_token_accuracy": 0.9036306142807007, + "num_tokens": 24859051.0, + "step": 2777 + }, + { + "epoch": 2.1109422492401215, + "grad_norm": 1.5833618640899658, + "learning_rate": 1.1095510431053176e-06, + "loss": 0.26424330472946167, + "mean_token_accuracy": 0.9020674824714661, + "num_tokens": 24869853.0, + "step": 2778 + }, + { + "epoch": 2.1117021276595747, + "grad_norm": 1.645459532737732, + "learning_rate": 1.1078109587795311e-06, + "loss": 0.3563994765281677, + "mean_token_accuracy": 0.8732106685638428, + "num_tokens": 24880184.0, + "step": 2779 + }, + { + "epoch": 2.1124620060790273, + "grad_norm": 2.2964093685150146, + "learning_rate": 1.1060718515456022e-06, + "loss": 0.19739922881126404, + "mean_token_accuracy": 0.9273765087127686, + "num_tokens": 24885398.0, + "step": 2780 + }, + { + "epoch": 2.11322188449848, + "grad_norm": 2.094024181365967, + "learning_rate": 1.1043337226241075e-06, + "loss": 0.3321923315525055, + "mean_token_accuracy": 0.8865819573402405, + "num_tokens": 24893908.0, + "step": 2781 + }, + { + "epoch": 2.113981762917933, + "grad_norm": 1.9787025451660156, + "learning_rate": 1.1025965732349318e-06, + "loss": 0.37631168961524963, + "mean_token_accuracy": 0.8808693885803223, + "num_tokens": 24901270.0, + "step": 2782 + }, + { + "epoch": 2.114741641337386, + "grad_norm": 2.376060724258423, + "learning_rate": 1.100860404597271e-06, + "loss": 0.2591894268989563, + "mean_token_accuracy": 0.9174780249595642, + "num_tokens": 24906578.0, + "step": 2783 + }, + { + "epoch": 2.115501519756839, + "grad_norm": 1.0967903137207031, + "learning_rate": 1.0991252179296389e-06, + "loss": 0.26626938581466675, + "mean_token_accuracy": 0.9305505752563477, + "num_tokens": 24922329.0, + "step": 2784 + }, + { + "epoch": 2.1162613981762917, + "grad_norm": 3.3701183795928955, + "learning_rate": 1.0973910144498534e-06, + "loss": 0.2710079848766327, + "mean_token_accuracy": 0.9095271825790405, + "num_tokens": 24925777.0, + "step": 2785 + }, + { + "epoch": 2.117021276595745, + "grad_norm": 1.636264681816101, + "learning_rate": 1.0956577953750461e-06, + "loss": 0.2995981276035309, + "mean_token_accuracy": 0.8988568782806396, + "num_tokens": 24934230.0, + "step": 2786 + }, + { + "epoch": 2.1177811550151975, + "grad_norm": 2.3107731342315674, + "learning_rate": 1.093925561921657e-06, + "loss": 0.3424459397792816, + "mean_token_accuracy": 0.9100210070610046, + "num_tokens": 24939830.0, + "step": 2787 + }, + { + "epoch": 2.1185410334346506, + "grad_norm": 1.814764380455017, + "learning_rate": 1.0921943153054343e-06, + "loss": 0.3182154893875122, + "mean_token_accuracy": 0.883027195930481, + "num_tokens": 24947764.0, + "step": 2788 + }, + { + "epoch": 2.1193009118541033, + "grad_norm": 1.693555235862732, + "learning_rate": 1.0904640567414332e-06, + "loss": 0.3685447573661804, + "mean_token_accuracy": 0.8900846242904663, + "num_tokens": 24957680.0, + "step": 2789 + }, + { + "epoch": 2.1200607902735564, + "grad_norm": 1.0726022720336914, + "learning_rate": 1.088734787444017e-06, + "loss": 0.28461548686027527, + "mean_token_accuracy": 0.9026681184768677, + "num_tokens": 24975181.0, + "step": 2790 + }, + { + "epoch": 2.120820668693009, + "grad_norm": 1.3013874292373657, + "learning_rate": 1.0870065086268506e-06, + "loss": 0.28222548961639404, + "mean_token_accuracy": 0.9041857719421387, + "num_tokens": 24993211.0, + "step": 2791 + }, + { + "epoch": 2.121580547112462, + "grad_norm": 2.592106580734253, + "learning_rate": 1.085279221502909e-06, + "loss": 0.31733593344688416, + "mean_token_accuracy": 0.90151047706604, + "num_tokens": 24998151.0, + "step": 2792 + }, + { + "epoch": 2.122340425531915, + "grad_norm": 2.649210214614868, + "learning_rate": 1.0835529272844694e-06, + "loss": 0.341595321893692, + "mean_token_accuracy": 0.8989696502685547, + "num_tokens": 25003399.0, + "step": 2793 + }, + { + "epoch": 2.1231003039513676, + "grad_norm": 2.376619577407837, + "learning_rate": 1.0818276271831094e-06, + "loss": 0.2770065665245056, + "mean_token_accuracy": 0.8967875242233276, + "num_tokens": 25009686.0, + "step": 2794 + }, + { + "epoch": 2.1238601823708207, + "grad_norm": 2.1539604663848877, + "learning_rate": 1.080103322409711e-06, + "loss": 0.37501147389411926, + "mean_token_accuracy": 0.8768513202667236, + "num_tokens": 25016339.0, + "step": 2795 + }, + { + "epoch": 2.1246200607902734, + "grad_norm": 2.5727670192718506, + "learning_rate": 1.0783800141744607e-06, + "loss": 0.31852903962135315, + "mean_token_accuracy": 0.8897477388381958, + "num_tokens": 25021410.0, + "step": 2796 + }, + { + "epoch": 2.1253799392097266, + "grad_norm": 2.1428916454315186, + "learning_rate": 1.0766577036868395e-06, + "loss": 0.2348000407218933, + "mean_token_accuracy": 0.9012142419815063, + "num_tokens": 25027375.0, + "step": 2797 + }, + { + "epoch": 2.1261398176291793, + "grad_norm": 2.4231064319610596, + "learning_rate": 1.074936392155631e-06, + "loss": 0.30580806732177734, + "mean_token_accuracy": 0.8963108658790588, + "num_tokens": 25033211.0, + "step": 2798 + }, + { + "epoch": 2.1268996960486324, + "grad_norm": 2.1027259826660156, + "learning_rate": 1.073216080788921e-06, + "loss": 0.2508814334869385, + "mean_token_accuracy": 0.9095165729522705, + "num_tokens": 25040316.0, + "step": 2799 + }, + { + "epoch": 2.127659574468085, + "grad_norm": 1.6513079404830933, + "learning_rate": 1.0714967707940876e-06, + "loss": 0.40694183111190796, + "mean_token_accuracy": 0.8895826935768127, + "num_tokens": 25054978.0, + "step": 2800 + }, + { + "epoch": 2.128419452887538, + "grad_norm": 2.0551133155822754, + "learning_rate": 1.0697784633778093e-06, + "loss": 0.3452662229537964, + "mean_token_accuracy": 0.8710684776306152, + "num_tokens": 25062755.0, + "step": 2801 + }, + { + "epoch": 2.129179331306991, + "grad_norm": 2.1780688762664795, + "learning_rate": 1.0680611597460607e-06, + "loss": 0.2918209135532379, + "mean_token_accuracy": 0.8689337968826294, + "num_tokens": 25069453.0, + "step": 2802 + }, + { + "epoch": 2.1299392097264436, + "grad_norm": 1.7905635833740234, + "learning_rate": 1.0663448611041114e-06, + "loss": 0.3535313308238983, + "mean_token_accuracy": 0.8762770295143127, + "num_tokens": 25080004.0, + "step": 2803 + }, + { + "epoch": 2.1306990881458967, + "grad_norm": 1.6187241077423096, + "learning_rate": 1.0646295686565258e-06, + "loss": 0.3042716681957245, + "mean_token_accuracy": 0.884156346321106, + "num_tokens": 25089652.0, + "step": 2804 + }, + { + "epoch": 2.1314589665653494, + "grad_norm": 2.667459011077881, + "learning_rate": 1.0629152836071633e-06, + "loss": 0.3904019892215729, + "mean_token_accuracy": 0.8603606224060059, + "num_tokens": 25095556.0, + "step": 2805 + }, + { + "epoch": 2.1322188449848025, + "grad_norm": 1.4227970838546753, + "learning_rate": 1.0612020071591722e-06, + "loss": 0.3765299320220947, + "mean_token_accuracy": 0.8655093908309937, + "num_tokens": 25108963.0, + "step": 2806 + }, + { + "epoch": 2.132978723404255, + "grad_norm": 2.262726068496704, + "learning_rate": 1.0594897405149994e-06, + "loss": 0.2727298140525818, + "mean_token_accuracy": 0.9005513191223145, + "num_tokens": 25115135.0, + "step": 2807 + }, + { + "epoch": 2.1337386018237083, + "grad_norm": 2.0810186862945557, + "learning_rate": 1.0577784848763773e-06, + "loss": 0.4001343250274658, + "mean_token_accuracy": 0.8537896871566772, + "num_tokens": 25123079.0, + "step": 2808 + }, + { + "epoch": 2.134498480243161, + "grad_norm": 1.6573376655578613, + "learning_rate": 1.0560682414443315e-06, + "loss": 0.4197486340999603, + "mean_token_accuracy": 0.8549862504005432, + "num_tokens": 25135398.0, + "step": 2809 + }, + { + "epoch": 2.135258358662614, + "grad_norm": 2.200150489807129, + "learning_rate": 1.0543590114191768e-06, + "loss": 0.32026296854019165, + "mean_token_accuracy": 0.8797904253005981, + "num_tokens": 25141382.0, + "step": 2810 + }, + { + "epoch": 2.136018237082067, + "grad_norm": 2.678558111190796, + "learning_rate": 1.0526507960005164e-06, + "loss": 0.30048054456710815, + "mean_token_accuracy": 0.8849201202392578, + "num_tokens": 25146235.0, + "step": 2811 + }, + { + "epoch": 2.13677811550152, + "grad_norm": 1.5207500457763672, + "learning_rate": 1.0509435963872422e-06, + "loss": 0.3706427216529846, + "mean_token_accuracy": 0.8740214109420776, + "num_tokens": 25157108.0, + "step": 2812 + }, + { + "epoch": 2.1375379939209727, + "grad_norm": 1.4632720947265625, + "learning_rate": 1.049237413777532e-06, + "loss": 0.27156776189804077, + "mean_token_accuracy": 0.8950715661048889, + "num_tokens": 25167937.0, + "step": 2813 + }, + { + "epoch": 2.1382978723404253, + "grad_norm": 2.101048469543457, + "learning_rate": 1.0475322493688506e-06, + "loss": 0.366736501455307, + "mean_token_accuracy": 0.8700850009918213, + "num_tokens": 25177043.0, + "step": 2814 + }, + { + "epoch": 2.1390577507598785, + "grad_norm": 2.54221248626709, + "learning_rate": 1.0458281043579482e-06, + "loss": 0.20383943617343903, + "mean_token_accuracy": 0.9226665496826172, + "num_tokens": 25182105.0, + "step": 2815 + }, + { + "epoch": 2.139817629179331, + "grad_norm": 1.7742674350738525, + "learning_rate": 1.04412497994086e-06, + "loss": 0.26852455735206604, + "mean_token_accuracy": 0.8987031579017639, + "num_tokens": 25190178.0, + "step": 2816 + }, + { + "epoch": 2.1405775075987843, + "grad_norm": 3.2856075763702393, + "learning_rate": 1.0424228773129019e-06, + "loss": 0.24643859267234802, + "mean_token_accuracy": 0.9189155101776123, + "num_tokens": 25194105.0, + "step": 2817 + }, + { + "epoch": 2.141337386018237, + "grad_norm": 3.374311923980713, + "learning_rate": 1.0407217976686777e-06, + "loss": 0.2575511336326599, + "mean_token_accuracy": 0.9143530130386353, + "num_tokens": 25197787.0, + "step": 2818 + }, + { + "epoch": 2.14209726443769, + "grad_norm": 1.4967217445373535, + "learning_rate": 1.03902174220207e-06, + "loss": 0.3054750859737396, + "mean_token_accuracy": 0.8989205360412598, + "num_tokens": 25209150.0, + "step": 2819 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 2.654459238052368, + "learning_rate": 1.0373227121062423e-06, + "loss": 0.27398061752319336, + "mean_token_accuracy": 0.9181102514266968, + "num_tokens": 25214015.0, + "step": 2820 + }, + { + "epoch": 2.143617021276596, + "grad_norm": 1.3205828666687012, + "learning_rate": 1.0356247085736388e-06, + "loss": 0.4085468053817749, + "mean_token_accuracy": 0.8745299577713013, + "num_tokens": 25230588.0, + "step": 2821 + }, + { + "epoch": 2.1443768996960486, + "grad_norm": 1.6965736150741577, + "learning_rate": 1.0339277327959863e-06, + "loss": 0.27269643545150757, + "mean_token_accuracy": 0.9001271724700928, + "num_tokens": 25239298.0, + "step": 2822 + }, + { + "epoch": 2.1451367781155017, + "grad_norm": 2.789114236831665, + "learning_rate": 1.0322317859642852e-06, + "loss": 0.2319176197052002, + "mean_token_accuracy": 0.9237110614776611, + "num_tokens": 25243286.0, + "step": 2823 + }, + { + "epoch": 2.1458966565349544, + "grad_norm": 1.8817718029022217, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.2917990982532501, + "mean_token_accuracy": 0.9211062788963318, + "num_tokens": 25250575.0, + "step": 2824 + }, + { + "epoch": 2.146656534954407, + "grad_norm": 2.1824984550476074, + "learning_rate": 1.0288429838991405e-06, + "loss": 0.39010798931121826, + "mean_token_accuracy": 0.8887852430343628, + "num_tokens": 25257947.0, + "step": 2825 + }, + { + "epoch": 2.1474164133738602, + "grad_norm": 1.302579641342163, + "learning_rate": 1.0271501310440882e-06, + "loss": 0.3511282503604889, + "mean_token_accuracy": 0.8728797435760498, + "num_tokens": 25272846.0, + "step": 2826 + }, + { + "epoch": 2.148176291793313, + "grad_norm": 1.691807746887207, + "learning_rate": 1.0254583118917699e-06, + "loss": 0.34246695041656494, + "mean_token_accuracy": 0.8743435144424438, + "num_tokens": 25283004.0, + "step": 2827 + }, + { + "epoch": 2.148936170212766, + "grad_norm": 1.2483569383621216, + "learning_rate": 1.0237675276295709e-06, + "loss": 0.3346659243106842, + "mean_token_accuracy": 0.8823951482772827, + "num_tokens": 25297786.0, + "step": 2828 + }, + { + "epoch": 2.1496960486322187, + "grad_norm": 3.7242841720581055, + "learning_rate": 1.022077779444145e-06, + "loss": 0.25516486167907715, + "mean_token_accuracy": 0.9189130663871765, + "num_tokens": 25301524.0, + "step": 2829 + }, + { + "epoch": 2.150455927051672, + "grad_norm": 2.5851144790649414, + "learning_rate": 1.020389068521426e-06, + "loss": 0.3543069362640381, + "mean_token_accuracy": 0.8942399621009827, + "num_tokens": 25307277.0, + "step": 2830 + }, + { + "epoch": 2.1512158054711246, + "grad_norm": 1.3453631401062012, + "learning_rate": 1.018701396046616e-06, + "loss": 0.2900702953338623, + "mean_token_accuracy": 0.8847548365592957, + "num_tokens": 25321366.0, + "step": 2831 + }, + { + "epoch": 2.1519756838905777, + "grad_norm": 1.6905686855316162, + "learning_rate": 1.0170147632041858e-06, + "loss": 0.24844832718372345, + "mean_token_accuracy": 0.9167388677597046, + "num_tokens": 25328916.0, + "step": 2832 + }, + { + "epoch": 2.1527355623100304, + "grad_norm": 2.6469411849975586, + "learning_rate": 1.0153291711778825e-06, + "loss": 0.18566903471946716, + "mean_token_accuracy": 0.9346771836280823, + "num_tokens": 25332871.0, + "step": 2833 + }, + { + "epoch": 2.1534954407294835, + "grad_norm": 1.3880906105041504, + "learning_rate": 1.0136446211507175e-06, + "loss": 0.37413570284843445, + "mean_token_accuracy": 0.8685535788536072, + "num_tokens": 25347447.0, + "step": 2834 + }, + { + "epoch": 2.154255319148936, + "grad_norm": 1.1376656293869019, + "learning_rate": 1.0119611143049731e-06, + "loss": 0.2844143509864807, + "mean_token_accuracy": 0.8910006284713745, + "num_tokens": 25365930.0, + "step": 2835 + }, + { + "epoch": 2.155015197568389, + "grad_norm": 2.259666919708252, + "learning_rate": 1.0102786518221997e-06, + "loss": 0.3148176074028015, + "mean_token_accuracy": 0.8851165175437927, + "num_tokens": 25373047.0, + "step": 2836 + }, + { + "epoch": 2.155775075987842, + "grad_norm": 3.304095506668091, + "learning_rate": 1.0085972348832138e-06, + "loss": 0.2042517364025116, + "mean_token_accuracy": 0.9247308969497681, + "num_tokens": 25376348.0, + "step": 2837 + }, + { + "epoch": 2.1565349544072947, + "grad_norm": 1.9856120347976685, + "learning_rate": 1.0069168646680985e-06, + "loss": 0.3547414541244507, + "mean_token_accuracy": 0.8941285610198975, + "num_tokens": 25384675.0, + "step": 2838 + }, + { + "epoch": 2.157294832826748, + "grad_norm": 2.8482213020324707, + "learning_rate": 1.0052375423562038e-06, + "loss": 0.3530133366584778, + "mean_token_accuracy": 0.8789700269699097, + "num_tokens": 25389631.0, + "step": 2839 + }, + { + "epoch": 2.1580547112462005, + "grad_norm": 1.4270408153533936, + "learning_rate": 1.0035592691261395e-06, + "loss": 0.34078776836395264, + "mean_token_accuracy": 0.8648165464401245, + "num_tokens": 25403746.0, + "step": 2840 + }, + { + "epoch": 2.1588145896656536, + "grad_norm": 0.9342723488807678, + "learning_rate": 1.0018820461557852e-06, + "loss": 0.2615935504436493, + "mean_token_accuracy": 0.9082236289978027, + "num_tokens": 25424695.0, + "step": 2841 + }, + { + "epoch": 2.1595744680851063, + "grad_norm": 2.695632219314575, + "learning_rate": 1.0002058746222807e-06, + "loss": 0.2202145904302597, + "mean_token_accuracy": 0.9221563339233398, + "num_tokens": 25428783.0, + "step": 2842 + }, + { + "epoch": 2.1603343465045595, + "grad_norm": 1.5679794549942017, + "learning_rate": 9.985307557020257e-07, + "loss": 0.24275024235248566, + "mean_token_accuracy": 0.9363338351249695, + "num_tokens": 25439104.0, + "step": 2843 + }, + { + "epoch": 2.161094224924012, + "grad_norm": 1.5985528230667114, + "learning_rate": 9.968566905706833e-07, + "loss": 0.2541901171207428, + "mean_token_accuracy": 0.9040743112564087, + "num_tokens": 25448829.0, + "step": 2844 + }, + { + "epoch": 2.161854103343465, + "grad_norm": 2.6022164821624756, + "learning_rate": 9.951836804031795e-07, + "loss": 0.24492180347442627, + "mean_token_accuracy": 0.9109418392181396, + "num_tokens": 25453902.0, + "step": 2845 + }, + { + "epoch": 2.162613981762918, + "grad_norm": 1.6719969511032104, + "learning_rate": 9.935117263736943e-07, + "loss": 0.43255117535591125, + "mean_token_accuracy": 0.868374228477478, + "num_tokens": 25465538.0, + "step": 2846 + }, + { + "epoch": 2.1633738601823707, + "grad_norm": 1.8284894227981567, + "learning_rate": 9.918408296556706e-07, + "loss": 0.32285982370376587, + "mean_token_accuracy": 0.9016412496566772, + "num_tokens": 25473721.0, + "step": 2847 + }, + { + "epoch": 2.164133738601824, + "grad_norm": 1.4488024711608887, + "learning_rate": 9.90170991421808e-07, + "loss": 0.35639309883117676, + "mean_token_accuracy": 0.8861881494522095, + "num_tokens": 25487535.0, + "step": 2848 + }, + { + "epoch": 2.1648936170212765, + "grad_norm": 2.089930534362793, + "learning_rate": 9.88502212844063e-07, + "loss": 0.2588546574115753, + "mean_token_accuracy": 0.9029642939567566, + "num_tokens": 25494567.0, + "step": 2849 + }, + { + "epoch": 2.1656534954407296, + "grad_norm": 1.1274315118789673, + "learning_rate": 9.86834495093649e-07, + "loss": 0.37268880009651184, + "mean_token_accuracy": 0.859347939491272, + "num_tokens": 25518278.0, + "step": 2850 + }, + { + "epoch": 2.1664133738601823, + "grad_norm": 2.3886640071868896, + "learning_rate": 9.851678393410343e-07, + "loss": 0.34938913583755493, + "mean_token_accuracy": 0.8724287748336792, + "num_tokens": 25524001.0, + "step": 2851 + }, + { + "epoch": 2.1671732522796354, + "grad_norm": 2.521230459213257, + "learning_rate": 9.83502246755942e-07, + "loss": 0.34781408309936523, + "mean_token_accuracy": 0.8970093131065369, + "num_tokens": 25529982.0, + "step": 2852 + }, + { + "epoch": 2.167933130699088, + "grad_norm": 2.467618942260742, + "learning_rate": 9.818377185073493e-07, + "loss": 0.29725387692451477, + "mean_token_accuracy": 0.8991899490356445, + "num_tokens": 25535356.0, + "step": 2853 + }, + { + "epoch": 2.1686930091185412, + "grad_norm": 2.335873603820801, + "learning_rate": 9.801742557634872e-07, + "loss": 0.39603036642074585, + "mean_token_accuracy": 0.8755916357040405, + "num_tokens": 25542526.0, + "step": 2854 + }, + { + "epoch": 2.169452887537994, + "grad_norm": 1.8388596773147583, + "learning_rate": 9.78511859691835e-07, + "loss": 0.3414672017097473, + "mean_token_accuracy": 0.8951467275619507, + "num_tokens": 25551904.0, + "step": 2855 + }, + { + "epoch": 2.1702127659574466, + "grad_norm": 1.86272394657135, + "learning_rate": 9.768505314591295e-07, + "loss": 0.45748448371887207, + "mean_token_accuracy": 0.8614354133605957, + "num_tokens": 25562197.0, + "step": 2856 + }, + { + "epoch": 2.1709726443768997, + "grad_norm": 1.9142264127731323, + "learning_rate": 9.751902722313527e-07, + "loss": 0.20877259969711304, + "mean_token_accuracy": 0.9316688179969788, + "num_tokens": 25569403.0, + "step": 2857 + }, + { + "epoch": 2.1717325227963524, + "grad_norm": 2.1138272285461426, + "learning_rate": 9.73531083173739e-07, + "loss": 0.37058722972869873, + "mean_token_accuracy": 0.8654135465621948, + "num_tokens": 25577200.0, + "step": 2858 + }, + { + "epoch": 2.1724924012158056, + "grad_norm": 1.973467469215393, + "learning_rate": 9.718729654507713e-07, + "loss": 0.4106993079185486, + "mean_token_accuracy": 0.8958662152290344, + "num_tokens": 25585694.0, + "step": 2859 + }, + { + "epoch": 2.1732522796352582, + "grad_norm": 1.957513451576233, + "learning_rate": 9.702159202261802e-07, + "loss": 0.2067333608865738, + "mean_token_accuracy": 0.9413473606109619, + "num_tokens": 25591604.0, + "step": 2860 + }, + { + "epoch": 2.1740121580547114, + "grad_norm": 2.7639806270599365, + "learning_rate": 9.685599486629444e-07, + "loss": 0.3446827232837677, + "mean_token_accuracy": 0.8837845325469971, + "num_tokens": 25596528.0, + "step": 2861 + }, + { + "epoch": 2.174772036474164, + "grad_norm": 2.483734607696533, + "learning_rate": 9.669050519232875e-07, + "loss": 0.21230249106884003, + "mean_token_accuracy": 0.9334918856620789, + "num_tokens": 25601182.0, + "step": 2862 + }, + { + "epoch": 2.175531914893617, + "grad_norm": 1.7194870710372925, + "learning_rate": 9.65251231168681e-07, + "loss": 0.2657586932182312, + "mean_token_accuracy": 0.9035707712173462, + "num_tokens": 25610561.0, + "step": 2863 + }, + { + "epoch": 2.17629179331307, + "grad_norm": 2.6709611415863037, + "learning_rate": 9.63598487559839e-07, + "loss": 0.3673030138015747, + "mean_token_accuracy": 0.8976202011108398, + "num_tokens": 25615822.0, + "step": 2864 + }, + { + "epoch": 2.1770516717325226, + "grad_norm": 1.6646889448165894, + "learning_rate": 9.619468222567216e-07, + "loss": 0.2796666622161865, + "mean_token_accuracy": 0.8698215484619141, + "num_tokens": 25626148.0, + "step": 2865 + }, + { + "epoch": 2.1778115501519757, + "grad_norm": 1.8341799974441528, + "learning_rate": 9.602962364185286e-07, + "loss": 0.44835132360458374, + "mean_token_accuracy": 0.84391850233078, + "num_tokens": 25636305.0, + "step": 2866 + }, + { + "epoch": 2.1785714285714284, + "grad_norm": 2.3579823970794678, + "learning_rate": 9.586467312037076e-07, + "loss": 0.2875673472881317, + "mean_token_accuracy": 0.889403223991394, + "num_tokens": 25642593.0, + "step": 2867 + }, + { + "epoch": 2.1793313069908815, + "grad_norm": 1.1284339427947998, + "learning_rate": 9.569983077699447e-07, + "loss": 0.3402171730995178, + "mean_token_accuracy": 0.8795222043991089, + "num_tokens": 25663734.0, + "step": 2868 + }, + { + "epoch": 2.180091185410334, + "grad_norm": 1.4705578088760376, + "learning_rate": 9.553509672741646e-07, + "loss": 0.4216107726097107, + "mean_token_accuracy": 0.845354437828064, + "num_tokens": 25678197.0, + "step": 2869 + }, + { + "epoch": 2.1808510638297873, + "grad_norm": 2.6181085109710693, + "learning_rate": 9.53704710872535e-07, + "loss": 0.2777765393257141, + "mean_token_accuracy": 0.8884872198104858, + "num_tokens": 25683808.0, + "step": 2870 + }, + { + "epoch": 2.18161094224924, + "grad_norm": 2.7285003662109375, + "learning_rate": 9.520595397204643e-07, + "loss": 0.33339786529541016, + "mean_token_accuracy": 0.8892828226089478, + "num_tokens": 25690125.0, + "step": 2871 + }, + { + "epoch": 2.182370820668693, + "grad_norm": 2.200571298599243, + "learning_rate": 9.504154549725944e-07, + "loss": 0.46546393632888794, + "mean_token_accuracy": 0.8389996290206909, + "num_tokens": 25697279.0, + "step": 2872 + }, + { + "epoch": 2.183130699088146, + "grad_norm": 3.491392135620117, + "learning_rate": 9.487724577828081e-07, + "loss": 0.17026299238204956, + "mean_token_accuracy": 0.9410334825515747, + "num_tokens": 25700263.0, + "step": 2873 + }, + { + "epoch": 2.183890577507599, + "grad_norm": 2.7800233364105225, + "learning_rate": 9.471305493042243e-07, + "loss": 0.2309894859790802, + "mean_token_accuracy": 0.9233936071395874, + "num_tokens": 25704486.0, + "step": 2874 + }, + { + "epoch": 2.1846504559270516, + "grad_norm": 2.6505582332611084, + "learning_rate": 9.454897306891972e-07, + "loss": 0.4378674328327179, + "mean_token_accuracy": 0.8846660852432251, + "num_tokens": 25710115.0, + "step": 2875 + }, + { + "epoch": 2.1854103343465043, + "grad_norm": 1.5393849611282349, + "learning_rate": 9.438500030893166e-07, + "loss": 0.42081019282341003, + "mean_token_accuracy": 0.8672939538955688, + "num_tokens": 25724598.0, + "step": 2876 + }, + { + "epoch": 2.1861702127659575, + "grad_norm": 1.911198377609253, + "learning_rate": 9.422113676554073e-07, + "loss": 0.19115394353866577, + "mean_token_accuracy": 0.9201297163963318, + "num_tokens": 25731040.0, + "step": 2877 + }, + { + "epoch": 2.18693009118541, + "grad_norm": 1.371443748474121, + "learning_rate": 9.405738255375243e-07, + "loss": 0.3639947772026062, + "mean_token_accuracy": 0.8653393983840942, + "num_tokens": 25745335.0, + "step": 2878 + }, + { + "epoch": 2.1876899696048633, + "grad_norm": 3.216238498687744, + "learning_rate": 9.389373778849612e-07, + "loss": 0.2623414397239685, + "mean_token_accuracy": 0.9046015739440918, + "num_tokens": 25749223.0, + "step": 2879 + }, + { + "epoch": 2.188449848024316, + "grad_norm": 2.7558846473693848, + "learning_rate": 9.37302025846237e-07, + "loss": 0.31921297311782837, + "mean_token_accuracy": 0.8903186321258545, + "num_tokens": 25754341.0, + "step": 2880 + }, + { + "epoch": 2.189209726443769, + "grad_norm": 2.06365704536438, + "learning_rate": 9.356677705691058e-07, + "loss": 0.357482373714447, + "mean_token_accuracy": 0.8661626577377319, + "num_tokens": 25761199.0, + "step": 2881 + }, + { + "epoch": 2.189969604863222, + "grad_norm": 3.240328550338745, + "learning_rate": 9.340346132005507e-07, + "loss": 0.3157888650894165, + "mean_token_accuracy": 0.8948285579681396, + "num_tokens": 25765099.0, + "step": 2882 + }, + { + "epoch": 2.190729483282675, + "grad_norm": 1.4671967029571533, + "learning_rate": 9.324025548867849e-07, + "loss": 0.32077109813690186, + "mean_token_accuracy": 0.8813248872756958, + "num_tokens": 25777636.0, + "step": 2883 + }, + { + "epoch": 2.1914893617021276, + "grad_norm": 2.6475353240966797, + "learning_rate": 9.307715967732492e-07, + "loss": 0.35567623376846313, + "mean_token_accuracy": 0.8738130331039429, + "num_tokens": 25783737.0, + "step": 2884 + }, + { + "epoch": 2.1922492401215807, + "grad_norm": 1.791491150856018, + "learning_rate": 9.29141740004613e-07, + "loss": 0.2556282877922058, + "mean_token_accuracy": 0.9223519563674927, + "num_tokens": 25792069.0, + "step": 2885 + }, + { + "epoch": 2.1930091185410334, + "grad_norm": 2.3944389820098877, + "learning_rate": 9.275129857247722e-07, + "loss": 0.3145869970321655, + "mean_token_accuracy": 0.8938079476356506, + "num_tokens": 25798400.0, + "step": 2886 + }, + { + "epoch": 2.193768996960486, + "grad_norm": 2.0802059173583984, + "learning_rate": 9.258853350768499e-07, + "loss": 0.37343069911003113, + "mean_token_accuracy": 0.8705670833587646, + "num_tokens": 25806567.0, + "step": 2887 + }, + { + "epoch": 2.1945288753799392, + "grad_norm": 2.10831880569458, + "learning_rate": 9.242587892031945e-07, + "loss": 0.1989251971244812, + "mean_token_accuracy": 0.931064248085022, + "num_tokens": 25812715.0, + "step": 2888 + }, + { + "epoch": 2.195288753799392, + "grad_norm": 2.1305530071258545, + "learning_rate": 9.226333492453759e-07, + "loss": 0.29377204179763794, + "mean_token_accuracy": 0.8942701816558838, + "num_tokens": 25819988.0, + "step": 2889 + }, + { + "epoch": 2.196048632218845, + "grad_norm": 2.179025411605835, + "learning_rate": 9.210090163441928e-07, + "loss": 0.37565115094184875, + "mean_token_accuracy": 0.8700202703475952, + "num_tokens": 25827777.0, + "step": 2890 + }, + { + "epoch": 2.1968085106382977, + "grad_norm": 3.177180290222168, + "learning_rate": 9.19385791639665e-07, + "loss": 0.16646479070186615, + "mean_token_accuracy": 0.9426749348640442, + "num_tokens": 25831724.0, + "step": 2891 + }, + { + "epoch": 2.197568389057751, + "grad_norm": 1.103196620941162, + "learning_rate": 9.177636762710321e-07, + "loss": 0.29140013456344604, + "mean_token_accuracy": 0.8789779543876648, + "num_tokens": 25854707.0, + "step": 2892 + }, + { + "epoch": 2.1983282674772036, + "grad_norm": 1.597692847251892, + "learning_rate": 9.161426713767574e-07, + "loss": 0.37799614667892456, + "mean_token_accuracy": 0.8623079061508179, + "num_tokens": 25868429.0, + "step": 2893 + }, + { + "epoch": 2.1990881458966567, + "grad_norm": 2.227132558822632, + "learning_rate": 9.145227780945265e-07, + "loss": 0.2683261036872864, + "mean_token_accuracy": 0.9092563390731812, + "num_tokens": 25875367.0, + "step": 2894 + }, + { + "epoch": 2.1998480243161094, + "grad_norm": 3.1229634284973145, + "learning_rate": 9.129039975612408e-07, + "loss": 0.21859994530677795, + "mean_token_accuracy": 0.9187530875205994, + "num_tokens": 25879456.0, + "step": 2895 + }, + { + "epoch": 2.2006079027355625, + "grad_norm": 2.3224828243255615, + "learning_rate": 9.112863309130235e-07, + "loss": 0.3557605743408203, + "mean_token_accuracy": 0.8735873103141785, + "num_tokens": 25886477.0, + "step": 2896 + }, + { + "epoch": 2.201367781155015, + "grad_norm": 1.7784863710403442, + "learning_rate": 9.096697792852155e-07, + "loss": 0.334577351808548, + "mean_token_accuracy": 0.8948780298233032, + "num_tokens": 25894977.0, + "step": 2897 + }, + { + "epoch": 2.202127659574468, + "grad_norm": 2.34066104888916, + "learning_rate": 9.080543438123746e-07, + "loss": 0.16479721665382385, + "mean_token_accuracy": 0.9405456781387329, + "num_tokens": 25900015.0, + "step": 2898 + }, + { + "epoch": 2.202887537993921, + "grad_norm": 1.944082498550415, + "learning_rate": 9.064400256282757e-07, + "loss": 0.40259572863578796, + "mean_token_accuracy": 0.8632713556289673, + "num_tokens": 25908749.0, + "step": 2899 + }, + { + "epoch": 2.2036474164133737, + "grad_norm": 1.2758828401565552, + "learning_rate": 9.048268258659098e-07, + "loss": 0.3939874470233917, + "mean_token_accuracy": 0.8652969598770142, + "num_tokens": 25924972.0, + "step": 2900 + }, + { + "epoch": 2.204407294832827, + "grad_norm": 1.4483891725540161, + "learning_rate": 9.032147456574822e-07, + "loss": 0.4132935404777527, + "mean_token_accuracy": 0.868486762046814, + "num_tokens": 25939785.0, + "step": 2901 + }, + { + "epoch": 2.2051671732522795, + "grad_norm": 1.4866713285446167, + "learning_rate": 9.01603786134413e-07, + "loss": 0.3644951581954956, + "mean_token_accuracy": 0.8750203847885132, + "num_tokens": 25952648.0, + "step": 2902 + }, + { + "epoch": 2.2059270516717326, + "grad_norm": 1.6555454730987549, + "learning_rate": 8.999939484273362e-07, + "loss": 0.48656779527664185, + "mean_token_accuracy": 0.8372372984886169, + "num_tokens": 25965062.0, + "step": 2903 + }, + { + "epoch": 2.2066869300911853, + "grad_norm": 2.3154168128967285, + "learning_rate": 8.983852336660959e-07, + "loss": 0.3768891990184784, + "mean_token_accuracy": 0.8614999055862427, + "num_tokens": 25972152.0, + "step": 2904 + }, + { + "epoch": 2.2074468085106385, + "grad_norm": 2.3618056774139404, + "learning_rate": 8.967776429797529e-07, + "loss": 0.24905793368816376, + "mean_token_accuracy": 0.9170958995819092, + "num_tokens": 25977808.0, + "step": 2905 + }, + { + "epoch": 2.208206686930091, + "grad_norm": 1.929051399230957, + "learning_rate": 8.951711774965741e-07, + "loss": 0.38099539279937744, + "mean_token_accuracy": 0.8812143802642822, + "num_tokens": 25987871.0, + "step": 2906 + }, + { + "epoch": 2.2089665653495443, + "grad_norm": 1.6529620885849, + "learning_rate": 8.93565838344039e-07, + "loss": 0.31784749031066895, + "mean_token_accuracy": 0.8929437398910522, + "num_tokens": 25997777.0, + "step": 2907 + }, + { + "epoch": 2.209726443768997, + "grad_norm": 2.1413469314575195, + "learning_rate": 8.919616266488373e-07, + "loss": 0.4043882191181183, + "mean_token_accuracy": 0.8937146663665771, + "num_tokens": 26005213.0, + "step": 2908 + }, + { + "epoch": 2.2104863221884496, + "grad_norm": 1.3838988542556763, + "learning_rate": 8.903585435368658e-07, + "loss": 0.2858969569206238, + "mean_token_accuracy": 0.9084860682487488, + "num_tokens": 26018371.0, + "step": 2909 + }, + { + "epoch": 2.211246200607903, + "grad_norm": 1.2853319644927979, + "learning_rate": 8.887565901332304e-07, + "loss": 0.3178713619709015, + "mean_token_accuracy": 0.872230589389801, + "num_tokens": 26034136.0, + "step": 2910 + }, + { + "epoch": 2.2120060790273555, + "grad_norm": 2.9032399654388428, + "learning_rate": 8.871557675622442e-07, + "loss": 0.20348960161209106, + "mean_token_accuracy": 0.9275314807891846, + "num_tokens": 26038299.0, + "step": 2911 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 2.4349892139434814, + "learning_rate": 8.855560769474237e-07, + "loss": 0.24282032251358032, + "mean_token_accuracy": 0.9103988409042358, + "num_tokens": 26043427.0, + "step": 2912 + }, + { + "epoch": 2.2135258358662613, + "grad_norm": 2.324664831161499, + "learning_rate": 8.839575194114958e-07, + "loss": 0.3808317184448242, + "mean_token_accuracy": 0.8598989844322205, + "num_tokens": 26049667.0, + "step": 2913 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 2.594947576522827, + "learning_rate": 8.823600960763901e-07, + "loss": 0.39623332023620605, + "mean_token_accuracy": 0.8738477230072021, + "num_tokens": 26055428.0, + "step": 2914 + }, + { + "epoch": 2.215045592705167, + "grad_norm": 1.674308180809021, + "learning_rate": 8.807638080632375e-07, + "loss": 0.2641369104385376, + "mean_token_accuracy": 0.9119734764099121, + "num_tokens": 26064355.0, + "step": 2915 + }, + { + "epoch": 2.2158054711246202, + "grad_norm": 2.9884912967681885, + "learning_rate": 8.791686564923746e-07, + "loss": 0.19229236245155334, + "mean_token_accuracy": 0.9388723969459534, + "num_tokens": 26067563.0, + "step": 2916 + }, + { + "epoch": 2.216565349544073, + "grad_norm": 1.8513846397399902, + "learning_rate": 8.775746424833428e-07, + "loss": 0.3076218366622925, + "mean_token_accuracy": 0.9165210723876953, + "num_tokens": 26075609.0, + "step": 2917 + }, + { + "epoch": 2.217325227963526, + "grad_norm": 1.229604721069336, + "learning_rate": 8.759817671548801e-07, + "loss": 0.2727023959159851, + "mean_token_accuracy": 0.8931418061256409, + "num_tokens": 26091183.0, + "step": 2918 + }, + { + "epoch": 2.2180851063829787, + "grad_norm": 2.384413957595825, + "learning_rate": 8.743900316249273e-07, + "loss": 0.27312609553337097, + "mean_token_accuracy": 0.8972288370132446, + "num_tokens": 26096677.0, + "step": 2919 + }, + { + "epoch": 2.2188449848024314, + "grad_norm": 2.186370611190796, + "learning_rate": 8.727994370106288e-07, + "loss": 0.36045557260513306, + "mean_token_accuracy": 0.8788503408432007, + "num_tokens": 26104464.0, + "step": 2920 + }, + { + "epoch": 2.2196048632218845, + "grad_norm": 2.769796848297119, + "learning_rate": 8.71209984428322e-07, + "loss": 0.3427591919898987, + "mean_token_accuracy": 0.892108678817749, + "num_tokens": 26109571.0, + "step": 2921 + }, + { + "epoch": 2.2203647416413372, + "grad_norm": 2.9888014793395996, + "learning_rate": 8.696216749935471e-07, + "loss": 0.20137615501880646, + "mean_token_accuracy": 0.9366025924682617, + "num_tokens": 26113165.0, + "step": 2922 + }, + { + "epoch": 2.2211246200607904, + "grad_norm": 1.484858751296997, + "learning_rate": 8.680345098210408e-07, + "loss": 0.2884698510169983, + "mean_token_accuracy": 0.8992507457733154, + "num_tokens": 26124385.0, + "step": 2923 + }, + { + "epoch": 2.221884498480243, + "grad_norm": 1.690119981765747, + "learning_rate": 8.664484900247363e-07, + "loss": 0.34275567531585693, + "mean_token_accuracy": 0.8682634234428406, + "num_tokens": 26134944.0, + "step": 2924 + }, + { + "epoch": 2.222644376899696, + "grad_norm": 1.6171982288360596, + "learning_rate": 8.64863616717764e-07, + "loss": 0.256338506937027, + "mean_token_accuracy": 0.9281957745552063, + "num_tokens": 26143586.0, + "step": 2925 + }, + { + "epoch": 2.223404255319149, + "grad_norm": 2.4853835105895996, + "learning_rate": 8.632798910124493e-07, + "loss": 0.26290056109428406, + "mean_token_accuracy": 0.9119559526443481, + "num_tokens": 26148931.0, + "step": 2926 + }, + { + "epoch": 2.224164133738602, + "grad_norm": 2.0014333724975586, + "learning_rate": 8.616973140203097e-07, + "loss": 0.33400261402130127, + "mean_token_accuracy": 0.8796782493591309, + "num_tokens": 26156246.0, + "step": 2927 + }, + { + "epoch": 2.2249240121580547, + "grad_norm": 1.4637027978897095, + "learning_rate": 8.601158868520617e-07, + "loss": 0.24374958872795105, + "mean_token_accuracy": 0.9116952419281006, + "num_tokens": 26166431.0, + "step": 2928 + }, + { + "epoch": 2.225683890577508, + "grad_norm": 2.2056987285614014, + "learning_rate": 8.585356106176093e-07, + "loss": 0.3419337570667267, + "mean_token_accuracy": 0.8703858852386475, + "num_tokens": 26173974.0, + "step": 2929 + }, + { + "epoch": 2.2264437689969605, + "grad_norm": 1.3687927722930908, + "learning_rate": 8.569564864260524e-07, + "loss": 0.43176111578941345, + "mean_token_accuracy": 0.8616900444030762, + "num_tokens": 26191632.0, + "step": 2930 + }, + { + "epoch": 2.227203647416413, + "grad_norm": 1.4975634813308716, + "learning_rate": 8.553785153856809e-07, + "loss": 0.38525745272636414, + "mean_token_accuracy": 0.8611687421798706, + "num_tokens": 26203300.0, + "step": 2931 + }, + { + "epoch": 2.2279635258358663, + "grad_norm": 1.970109462738037, + "learning_rate": 8.538016986039751e-07, + "loss": 0.31731468439102173, + "mean_token_accuracy": 0.884365975856781, + "num_tokens": 26210037.0, + "step": 2932 + }, + { + "epoch": 2.228723404255319, + "grad_norm": 2.681717872619629, + "learning_rate": 8.522260371876068e-07, + "loss": 0.2770140767097473, + "mean_token_accuracy": 0.9020107984542847, + "num_tokens": 26215460.0, + "step": 2933 + }, + { + "epoch": 2.229483282674772, + "grad_norm": 2.2324795722961426, + "learning_rate": 8.506515322424349e-07, + "loss": 0.30599141120910645, + "mean_token_accuracy": 0.8939633965492249, + "num_tokens": 26221260.0, + "step": 2934 + }, + { + "epoch": 2.230243161094225, + "grad_norm": 2.08915376663208, + "learning_rate": 8.49078184873508e-07, + "loss": 0.3609209954738617, + "mean_token_accuracy": 0.8776482343673706, + "num_tokens": 26228397.0, + "step": 2935 + }, + { + "epoch": 2.231003039513678, + "grad_norm": 1.641366958618164, + "learning_rate": 8.475059961850617e-07, + "loss": 0.2969125509262085, + "mean_token_accuracy": 0.8949217796325684, + "num_tokens": 26238533.0, + "step": 2936 + }, + { + "epoch": 2.2317629179331306, + "grad_norm": 1.082148551940918, + "learning_rate": 8.459349672805198e-07, + "loss": 0.23957109451293945, + "mean_token_accuracy": 0.9255712032318115, + "num_tokens": 26254154.0, + "step": 2937 + }, + { + "epoch": 2.2325227963525838, + "grad_norm": 2.495208740234375, + "learning_rate": 8.443650992624877e-07, + "loss": 0.2879767417907715, + "mean_token_accuracy": 0.8911515474319458, + "num_tokens": 26260812.0, + "step": 2938 + }, + { + "epoch": 2.2332826747720365, + "grad_norm": 3.566549062728882, + "learning_rate": 8.427963932327621e-07, + "loss": 0.31420570611953735, + "mean_token_accuracy": 0.8888009190559387, + "num_tokens": 26264592.0, + "step": 2939 + }, + { + "epoch": 2.2340425531914896, + "grad_norm": 2.217177391052246, + "learning_rate": 8.412288502923211e-07, + "loss": 0.30547618865966797, + "mean_token_accuracy": 0.9065294861793518, + "num_tokens": 26270729.0, + "step": 2940 + }, + { + "epoch": 2.2348024316109423, + "grad_norm": 1.404260277748108, + "learning_rate": 8.396624715413251e-07, + "loss": 0.32485032081604004, + "mean_token_accuracy": 0.8799532651901245, + "num_tokens": 26284280.0, + "step": 2941 + }, + { + "epoch": 2.235562310030395, + "grad_norm": 1.5519827604293823, + "learning_rate": 8.380972580791191e-07, + "loss": 0.3330575227737427, + "mean_token_accuracy": 0.8865892887115479, + "num_tokens": 26293635.0, + "step": 2942 + }, + { + "epoch": 2.236322188449848, + "grad_norm": 2.604766845703125, + "learning_rate": 8.365332110042323e-07, + "loss": 0.18986842036247253, + "mean_token_accuracy": 0.9276989102363586, + "num_tokens": 26298553.0, + "step": 2943 + }, + { + "epoch": 2.237082066869301, + "grad_norm": 2.1750004291534424, + "learning_rate": 8.349703314143712e-07, + "loss": 0.3661153018474579, + "mean_token_accuracy": 0.8879489302635193, + "num_tokens": 26305697.0, + "step": 2944 + }, + { + "epoch": 2.237841945288754, + "grad_norm": 2.247069835662842, + "learning_rate": 8.334086204064254e-07, + "loss": 0.3127560615539551, + "mean_token_accuracy": 0.8846344351768494, + "num_tokens": 26312347.0, + "step": 2945 + }, + { + "epoch": 2.2386018237082066, + "grad_norm": 1.905275821685791, + "learning_rate": 8.318480790764638e-07, + "loss": 0.44245776534080505, + "mean_token_accuracy": 0.87440425157547, + "num_tokens": 26322787.0, + "step": 2946 + }, + { + "epoch": 2.2393617021276597, + "grad_norm": 1.8596254587173462, + "learning_rate": 8.302887085197342e-07, + "loss": 0.30068373680114746, + "mean_token_accuracy": 0.8847110271453857, + "num_tokens": 26330437.0, + "step": 2947 + }, + { + "epoch": 2.2401215805471124, + "grad_norm": 2.0028860569000244, + "learning_rate": 8.28730509830663e-07, + "loss": 0.4276006817817688, + "mean_token_accuracy": 0.8406014442443848, + "num_tokens": 26340100.0, + "step": 2948 + }, + { + "epoch": 2.2408814589665655, + "grad_norm": 2.494434356689453, + "learning_rate": 8.271734841028553e-07, + "loss": 0.3874223232269287, + "mean_token_accuracy": 0.8782174587249756, + "num_tokens": 26345750.0, + "step": 2949 + }, + { + "epoch": 2.2416413373860182, + "grad_norm": 1.955613613128662, + "learning_rate": 8.256176324290885e-07, + "loss": 0.28770074248313904, + "mean_token_accuracy": 0.9004360437393188, + "num_tokens": 26353342.0, + "step": 2950 + }, + { + "epoch": 2.2424012158054714, + "grad_norm": 1.7579785585403442, + "learning_rate": 8.240629559013222e-07, + "loss": 0.2277943640947342, + "mean_token_accuracy": 0.9145861864089966, + "num_tokens": 26361348.0, + "step": 2951 + }, + { + "epoch": 2.243161094224924, + "grad_norm": 1.5848479270935059, + "learning_rate": 8.22509455610688e-07, + "loss": 0.32944542169570923, + "mean_token_accuracy": 0.8662827014923096, + "num_tokens": 26372006.0, + "step": 2952 + }, + { + "epoch": 2.2439209726443767, + "grad_norm": 2.6263222694396973, + "learning_rate": 8.209571326474897e-07, + "loss": 0.34646326303482056, + "mean_token_accuracy": 0.8817736506462097, + "num_tokens": 26377664.0, + "step": 2953 + }, + { + "epoch": 2.24468085106383, + "grad_norm": 2.407590627670288, + "learning_rate": 8.194059881012107e-07, + "loss": 0.41302192211151123, + "mean_token_accuracy": 0.8898757696151733, + "num_tokens": 26384225.0, + "step": 2954 + }, + { + "epoch": 2.2454407294832825, + "grad_norm": 2.5156402587890625, + "learning_rate": 8.178560230605012e-07, + "loss": 0.3468608558177948, + "mean_token_accuracy": 0.8879599571228027, + "num_tokens": 26389374.0, + "step": 2955 + }, + { + "epoch": 2.2462006079027357, + "grad_norm": 1.5076090097427368, + "learning_rate": 8.163072386131876e-07, + "loss": 0.3750625550746918, + "mean_token_accuracy": 0.8712738752365112, + "num_tokens": 26402674.0, + "step": 2956 + }, + { + "epoch": 2.2469604863221884, + "grad_norm": 1.5181068181991577, + "learning_rate": 8.147596358462662e-07, + "loss": 0.19113478064537048, + "mean_token_accuracy": 0.9323463439941406, + "num_tokens": 26411626.0, + "step": 2957 + }, + { + "epoch": 2.2477203647416415, + "grad_norm": 1.0806915760040283, + "learning_rate": 8.132132158459044e-07, + "loss": 0.3411233425140381, + "mean_token_accuracy": 0.8736830949783325, + "num_tokens": 26435891.0, + "step": 2958 + }, + { + "epoch": 2.248480243161094, + "grad_norm": 1.5527247190475464, + "learning_rate": 8.116679796974389e-07, + "loss": 0.425741970539093, + "mean_token_accuracy": 0.8448845148086548, + "num_tokens": 26448134.0, + "step": 2959 + }, + { + "epoch": 2.2492401215805473, + "grad_norm": 1.2390631437301636, + "learning_rate": 8.10123928485377e-07, + "loss": 0.38084933161735535, + "mean_token_accuracy": 0.8656617999076843, + "num_tokens": 26467213.0, + "step": 2960 + }, + { + "epoch": 2.25, + "grad_norm": 3.0672852993011475, + "learning_rate": 8.08581063293391e-07, + "loss": 0.29300111532211304, + "mean_token_accuracy": 0.8933638334274292, + "num_tokens": 26471599.0, + "step": 2961 + }, + { + "epoch": 2.250759878419453, + "grad_norm": 1.2359145879745483, + "learning_rate": 8.070393852043251e-07, + "loss": 0.41337621212005615, + "mean_token_accuracy": 0.854198694229126, + "num_tokens": 26488461.0, + "step": 2962 + }, + { + "epoch": 2.251519756838906, + "grad_norm": 1.8551225662231445, + "learning_rate": 8.054988953001889e-07, + "loss": 0.3036419153213501, + "mean_token_accuracy": 0.8883144855499268, + "num_tokens": 26496398.0, + "step": 2963 + }, + { + "epoch": 2.2522796352583585, + "grad_norm": 1.3691812753677368, + "learning_rate": 8.039595946621551e-07, + "loss": 0.3286219835281372, + "mean_token_accuracy": 0.892130434513092, + "num_tokens": 26510493.0, + "step": 2964 + }, + { + "epoch": 2.2530395136778116, + "grad_norm": 1.7371556758880615, + "learning_rate": 8.024214843705647e-07, + "loss": 0.4105026125907898, + "mean_token_accuracy": 0.8889180421829224, + "num_tokens": 26519148.0, + "step": 2965 + }, + { + "epoch": 2.2537993920972643, + "grad_norm": 2.211665630340576, + "learning_rate": 8.00884565504925e-07, + "loss": 0.3912196159362793, + "mean_token_accuracy": 0.8632891774177551, + "num_tokens": 26526314.0, + "step": 2966 + }, + { + "epoch": 2.2545592705167175, + "grad_norm": 2.476206064224243, + "learning_rate": 7.993488391439025e-07, + "loss": 0.20462508499622345, + "mean_token_accuracy": 0.9276266098022461, + "num_tokens": 26531781.0, + "step": 2967 + }, + { + "epoch": 2.25531914893617, + "grad_norm": 1.4944102764129639, + "learning_rate": 7.978143063653296e-07, + "loss": 0.2694895267486572, + "mean_token_accuracy": 0.9033881425857544, + "num_tokens": 26543780.0, + "step": 2968 + }, + { + "epoch": 2.2560790273556233, + "grad_norm": 1.7570104598999023, + "learning_rate": 7.962809682462008e-07, + "loss": 0.3060353100299835, + "mean_token_accuracy": 0.8908290863037109, + "num_tokens": 26551978.0, + "step": 2969 + }, + { + "epoch": 2.256838905775076, + "grad_norm": 2.215514898300171, + "learning_rate": 7.947488258626718e-07, + "loss": 0.2930528521537781, + "mean_token_accuracy": 0.8989757299423218, + "num_tokens": 26558267.0, + "step": 2970 + }, + { + "epoch": 2.2575987841945286, + "grad_norm": 2.3069000244140625, + "learning_rate": 7.93217880290059e-07, + "loss": 0.18501774966716766, + "mean_token_accuracy": 0.931271493434906, + "num_tokens": 26563286.0, + "step": 2971 + }, + { + "epoch": 2.2583586626139818, + "grad_norm": 1.6555116176605225, + "learning_rate": 7.916881326028387e-07, + "loss": 0.3178265392780304, + "mean_token_accuracy": 0.9016884565353394, + "num_tokens": 26572087.0, + "step": 2972 + }, + { + "epoch": 2.2591185410334345, + "grad_norm": 2.222161054611206, + "learning_rate": 7.901595838746471e-07, + "loss": 0.3013504445552826, + "mean_token_accuracy": 0.8942798376083374, + "num_tokens": 26578159.0, + "step": 2973 + }, + { + "epoch": 2.2598784194528876, + "grad_norm": 1.979411005973816, + "learning_rate": 7.886322351782782e-07, + "loss": 0.42746615409851074, + "mean_token_accuracy": 0.85303795337677, + "num_tokens": 26586252.0, + "step": 2974 + }, + { + "epoch": 2.2606382978723403, + "grad_norm": 1.4925786256790161, + "learning_rate": 7.871060875856854e-07, + "loss": 0.33495625853538513, + "mean_token_accuracy": 0.8911026120185852, + "num_tokens": 26599921.0, + "step": 2975 + }, + { + "epoch": 2.2613981762917934, + "grad_norm": 1.9037046432495117, + "learning_rate": 7.855811421679746e-07, + "loss": 0.31471866369247437, + "mean_token_accuracy": 0.9007552862167358, + "num_tokens": 26607954.0, + "step": 2976 + }, + { + "epoch": 2.262158054711246, + "grad_norm": 2.2751407623291016, + "learning_rate": 7.840573999954154e-07, + "loss": 0.26972368359565735, + "mean_token_accuracy": 0.8992317914962769, + "num_tokens": 26614036.0, + "step": 2977 + }, + { + "epoch": 2.262917933130699, + "grad_norm": 2.680572271347046, + "learning_rate": 7.825348621374257e-07, + "loss": 0.4264066219329834, + "mean_token_accuracy": 0.8547691106796265, + "num_tokens": 26619545.0, + "step": 2978 + }, + { + "epoch": 2.263677811550152, + "grad_norm": 2.3535876274108887, + "learning_rate": 7.810135296625817e-07, + "loss": 0.37871062755584717, + "mean_token_accuracy": 0.8621708750724792, + "num_tokens": 26626248.0, + "step": 2979 + }, + { + "epoch": 2.264437689969605, + "grad_norm": 1.2249537706375122, + "learning_rate": 7.794934036386139e-07, + "loss": 0.3877285122871399, + "mean_token_accuracy": 0.8593572378158569, + "num_tokens": 26648023.0, + "step": 2980 + }, + { + "epoch": 2.2651975683890577, + "grad_norm": 2.43371844291687, + "learning_rate": 7.779744851324048e-07, + "loss": 0.37463510036468506, + "mean_token_accuracy": 0.8646193742752075, + "num_tokens": 26654016.0, + "step": 2981 + }, + { + "epoch": 2.2659574468085104, + "grad_norm": 1.7429327964782715, + "learning_rate": 7.7645677520999e-07, + "loss": 0.4033060669898987, + "mean_token_accuracy": 0.8644014596939087, + "num_tokens": 26664447.0, + "step": 2982 + }, + { + "epoch": 2.2667173252279635, + "grad_norm": 2.4090006351470947, + "learning_rate": 7.749402749365573e-07, + "loss": 0.2981206774711609, + "mean_token_accuracy": 0.8886175751686096, + "num_tokens": 26670355.0, + "step": 2983 + }, + { + "epoch": 2.2674772036474162, + "grad_norm": 1.3855396509170532, + "learning_rate": 7.734249853764428e-07, + "loss": 0.35967472195625305, + "mean_token_accuracy": 0.8652631044387817, + "num_tokens": 26685385.0, + "step": 2984 + }, + { + "epoch": 2.2682370820668694, + "grad_norm": 1.328214168548584, + "learning_rate": 7.719109075931375e-07, + "loss": 0.3571951389312744, + "mean_token_accuracy": 0.8894522190093994, + "num_tokens": 26703265.0, + "step": 2985 + }, + { + "epoch": 2.268996960486322, + "grad_norm": 2.5001046657562256, + "learning_rate": 7.703980426492791e-07, + "loss": 0.3512844741344452, + "mean_token_accuracy": 0.887405514717102, + "num_tokens": 26709095.0, + "step": 2986 + }, + { + "epoch": 2.269756838905775, + "grad_norm": 1.8704569339752197, + "learning_rate": 7.688863916066524e-07, + "loss": 0.2746743857860565, + "mean_token_accuracy": 0.903412401676178, + "num_tokens": 26716815.0, + "step": 2987 + }, + { + "epoch": 2.270516717325228, + "grad_norm": 2.1134285926818848, + "learning_rate": 7.673759555261947e-07, + "loss": 0.38385504484176636, + "mean_token_accuracy": 0.8759124279022217, + "num_tokens": 26724046.0, + "step": 2988 + }, + { + "epoch": 2.271276595744681, + "grad_norm": 1.2651840448379517, + "learning_rate": 7.65866735467988e-07, + "loss": 0.3499506413936615, + "mean_token_accuracy": 0.8704953193664551, + "num_tokens": 26743024.0, + "step": 2989 + }, + { + "epoch": 2.2720364741641337, + "grad_norm": 1.7289817333221436, + "learning_rate": 7.643587324912597e-07, + "loss": 0.3768725097179413, + "mean_token_accuracy": 0.8623670339584351, + "num_tokens": 26754336.0, + "step": 2990 + }, + { + "epoch": 2.272796352583587, + "grad_norm": 1.6121667623519897, + "learning_rate": 7.628519476543839e-07, + "loss": 0.42746737599372864, + "mean_token_accuracy": 0.8425478935241699, + "num_tokens": 26766813.0, + "step": 2991 + }, + { + "epoch": 2.2735562310030395, + "grad_norm": 2.705442428588867, + "learning_rate": 7.613463820148831e-07, + "loss": 0.27137982845306396, + "mean_token_accuracy": 0.9014253616333008, + "num_tokens": 26772565.0, + "step": 2992 + }, + { + "epoch": 2.274316109422492, + "grad_norm": 1.3811960220336914, + "learning_rate": 7.598420366294185e-07, + "loss": 0.2957465350627899, + "mean_token_accuracy": 0.8935354351997375, + "num_tokens": 26787325.0, + "step": 2993 + }, + { + "epoch": 2.2750759878419453, + "grad_norm": 2.469336986541748, + "learning_rate": 7.583389125537982e-07, + "loss": 0.2811780273914337, + "mean_token_accuracy": 0.8956634998321533, + "num_tokens": 26793457.0, + "step": 2994 + }, + { + "epoch": 2.275835866261398, + "grad_norm": 2.945681571960449, + "learning_rate": 7.568370108429732e-07, + "loss": 0.3186708092689514, + "mean_token_accuracy": 0.8817545175552368, + "num_tokens": 26797867.0, + "step": 2995 + }, + { + "epoch": 2.276595744680851, + "grad_norm": 1.7748228311538696, + "learning_rate": 7.553363325510355e-07, + "loss": 0.3279818892478943, + "mean_token_accuracy": 0.884396493434906, + "num_tokens": 26806656.0, + "step": 2996 + }, + { + "epoch": 2.277355623100304, + "grad_norm": 1.312500238418579, + "learning_rate": 7.538368787312186e-07, + "loss": 0.3754822611808777, + "mean_token_accuracy": 0.8653179407119751, + "num_tokens": 26823126.0, + "step": 2997 + }, + { + "epoch": 2.278115501519757, + "grad_norm": 3.1305344104766846, + "learning_rate": 7.523386504358984e-07, + "loss": 0.3293214440345764, + "mean_token_accuracy": 0.8908799886703491, + "num_tokens": 26828250.0, + "step": 2998 + }, + { + "epoch": 2.2788753799392096, + "grad_norm": 2.6449344158172607, + "learning_rate": 7.508416487165862e-07, + "loss": 0.23732036352157593, + "mean_token_accuracy": 0.9029837846755981, + "num_tokens": 26833123.0, + "step": 2999 + }, + { + "epoch": 2.2796352583586628, + "grad_norm": 2.04388427734375, + "learning_rate": 7.49345874623939e-07, + "loss": 0.31240373849868774, + "mean_token_accuracy": 0.8860392570495605, + "num_tokens": 26840878.0, + "step": 3000 + }, + { + "epoch": 2.2803951367781155, + "grad_norm": 1.1828604936599731, + "learning_rate": 7.478513292077463e-07, + "loss": 0.32127636671066284, + "mean_token_accuracy": 0.8938446044921875, + "num_tokens": 26858916.0, + "step": 3001 + }, + { + "epoch": 2.2811550151975686, + "grad_norm": 2.5061612129211426, + "learning_rate": 7.46358013516938e-07, + "loss": 0.30558091402053833, + "mean_token_accuracy": 0.8819161653518677, + "num_tokens": 26864218.0, + "step": 3002 + }, + { + "epoch": 2.2819148936170213, + "grad_norm": 2.424044609069824, + "learning_rate": 7.448659285995808e-07, + "loss": 0.3008216917514801, + "mean_token_accuracy": 0.8751994371414185, + "num_tokens": 26869646.0, + "step": 3003 + }, + { + "epoch": 2.282674772036474, + "grad_norm": 1.3576173782348633, + "learning_rate": 7.433750755028774e-07, + "loss": 0.3001647889614105, + "mean_token_accuracy": 0.8996933698654175, + "num_tokens": 26884385.0, + "step": 3004 + }, + { + "epoch": 2.283434650455927, + "grad_norm": 2.237589120864868, + "learning_rate": 7.418854552731655e-07, + "loss": 0.3126741051673889, + "mean_token_accuracy": 0.8910979628562927, + "num_tokens": 26891109.0, + "step": 3005 + }, + { + "epoch": 2.2841945288753798, + "grad_norm": 2.1947414875030518, + "learning_rate": 7.403970689559184e-07, + "loss": 0.29793858528137207, + "mean_token_accuracy": 0.9057353734970093, + "num_tokens": 26897905.0, + "step": 3006 + }, + { + "epoch": 2.284954407294833, + "grad_norm": 1.4252705574035645, + "learning_rate": 7.389099175957426e-07, + "loss": 0.2873227298259735, + "mean_token_accuracy": 0.8910978436470032, + "num_tokens": 26910322.0, + "step": 3007 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 1.2200649976730347, + "learning_rate": 7.374240022363785e-07, + "loss": 0.2782876491546631, + "mean_token_accuracy": 0.8948163390159607, + "num_tokens": 26927253.0, + "step": 3008 + }, + { + "epoch": 2.2864741641337387, + "grad_norm": 2.1249423027038574, + "learning_rate": 7.359393239206991e-07, + "loss": 0.4046584367752075, + "mean_token_accuracy": 0.8653120994567871, + "num_tokens": 26934798.0, + "step": 3009 + }, + { + "epoch": 2.2872340425531914, + "grad_norm": 1.6851856708526611, + "learning_rate": 7.344558836907067e-07, + "loss": 0.3814213275909424, + "mean_token_accuracy": 0.8618872165679932, + "num_tokens": 26944984.0, + "step": 3010 + }, + { + "epoch": 2.2879939209726445, + "grad_norm": 1.5802191495895386, + "learning_rate": 7.329736825875388e-07, + "loss": 0.28643855452537537, + "mean_token_accuracy": 0.9038295745849609, + "num_tokens": 26957832.0, + "step": 3011 + }, + { + "epoch": 2.288753799392097, + "grad_norm": 1.6257383823394775, + "learning_rate": 7.314927216514617e-07, + "loss": 0.264072448015213, + "mean_token_accuracy": 0.9089190363883972, + "num_tokens": 26967621.0, + "step": 3012 + }, + { + "epoch": 2.2895136778115504, + "grad_norm": 2.107192039489746, + "learning_rate": 7.300130019218688e-07, + "loss": 0.2772635817527771, + "mean_token_accuracy": 0.9071067571640015, + "num_tokens": 26974669.0, + "step": 3013 + }, + { + "epoch": 2.290273556231003, + "grad_norm": 1.496505618095398, + "learning_rate": 7.285345244372843e-07, + "loss": 0.2936630845069885, + "mean_token_accuracy": 0.8946818113327026, + "num_tokens": 26985942.0, + "step": 3014 + }, + { + "epoch": 2.2910334346504557, + "grad_norm": 1.6122950315475464, + "learning_rate": 7.270572902353634e-07, + "loss": 0.2819349765777588, + "mean_token_accuracy": 0.8909854888916016, + "num_tokens": 26996231.0, + "step": 3015 + }, + { + "epoch": 2.291793313069909, + "grad_norm": 1.9463475942611694, + "learning_rate": 7.255813003528834e-07, + "loss": 0.2584724426269531, + "mean_token_accuracy": 0.9069744348526001, + "num_tokens": 27003253.0, + "step": 3016 + }, + { + "epoch": 2.2925531914893615, + "grad_norm": 2.1707770824432373, + "learning_rate": 7.241065558257513e-07, + "loss": 0.17524898052215576, + "mean_token_accuracy": 0.926141083240509, + "num_tokens": 27009501.0, + "step": 3017 + }, + { + "epoch": 2.2933130699088147, + "grad_norm": 2.1424882411956787, + "learning_rate": 7.226330576889998e-07, + "loss": 0.26512211561203003, + "mean_token_accuracy": 0.9059023857116699, + "num_tokens": 27016096.0, + "step": 3018 + }, + { + "epoch": 2.2940729483282674, + "grad_norm": 3.50669264793396, + "learning_rate": 7.211608069767867e-07, + "loss": 0.24738222360610962, + "mean_token_accuracy": 0.9179760217666626, + "num_tokens": 27019810.0, + "step": 3019 + }, + { + "epoch": 2.2948328267477205, + "grad_norm": 1.5426064729690552, + "learning_rate": 7.196898047223943e-07, + "loss": 0.2762960195541382, + "mean_token_accuracy": 0.8937389850616455, + "num_tokens": 27031952.0, + "step": 3020 + }, + { + "epoch": 2.295592705167173, + "grad_norm": 2.469064712524414, + "learning_rate": 7.182200519582283e-07, + "loss": 0.2877562940120697, + "mean_token_accuracy": 0.9252556562423706, + "num_tokens": 27036673.0, + "step": 3021 + }, + { + "epoch": 2.2963525835866263, + "grad_norm": 3.289813756942749, + "learning_rate": 7.167515497158179e-07, + "loss": 0.2837294340133667, + "mean_token_accuracy": 0.9070497155189514, + "num_tokens": 27041001.0, + "step": 3022 + }, + { + "epoch": 2.297112462006079, + "grad_norm": 1.7201104164123535, + "learning_rate": 7.152842990258147e-07, + "loss": 0.44239580631256104, + "mean_token_accuracy": 0.8443326354026794, + "num_tokens": 27052265.0, + "step": 3023 + }, + { + "epoch": 2.297872340425532, + "grad_norm": 1.3710078001022339, + "learning_rate": 7.138183009179922e-07, + "loss": 0.40450236201286316, + "mean_token_accuracy": 0.87160724401474, + "num_tokens": 27068475.0, + "step": 3024 + }, + { + "epoch": 2.298632218844985, + "grad_norm": 2.1379098892211914, + "learning_rate": 7.123535564212419e-07, + "loss": 0.3432690501213074, + "mean_token_accuracy": 0.8736584186553955, + "num_tokens": 27075548.0, + "step": 3025 + }, + { + "epoch": 2.2993920972644375, + "grad_norm": 2.423079252243042, + "learning_rate": 7.108900665635815e-07, + "loss": 0.27869731187820435, + "mean_token_accuracy": 0.9046810865402222, + "num_tokens": 27081560.0, + "step": 3026 + }, + { + "epoch": 2.3001519756838906, + "grad_norm": 1.2137898206710815, + "learning_rate": 7.094278323721418e-07, + "loss": 0.41351836919784546, + "mean_token_accuracy": 0.8553295135498047, + "num_tokens": 27098346.0, + "step": 3027 + }, + { + "epoch": 2.3009118541033433, + "grad_norm": 1.371337890625, + "learning_rate": 7.079668548731757e-07, + "loss": 0.29800572991371155, + "mean_token_accuracy": 0.9219756126403809, + "num_tokens": 27111678.0, + "step": 3028 + }, + { + "epoch": 2.3016717325227964, + "grad_norm": 3.133449077606201, + "learning_rate": 7.065071350920538e-07, + "loss": 0.39177340269088745, + "mean_token_accuracy": 0.8742524981498718, + "num_tokens": 27116496.0, + "step": 3029 + }, + { + "epoch": 2.302431610942249, + "grad_norm": 1.4038591384887695, + "learning_rate": 7.050486740532633e-07, + "loss": 0.2862081825733185, + "mean_token_accuracy": 0.8894703984260559, + "num_tokens": 27130806.0, + "step": 3030 + }, + { + "epoch": 2.3031914893617023, + "grad_norm": 1.806132197380066, + "learning_rate": 7.035914727804085e-07, + "loss": 0.42546606063842773, + "mean_token_accuracy": 0.876154363155365, + "num_tokens": 27143687.0, + "step": 3031 + }, + { + "epoch": 2.303951367781155, + "grad_norm": 1.8565905094146729, + "learning_rate": 7.021355322962103e-07, + "loss": 0.304633229970932, + "mean_token_accuracy": 0.896949052810669, + "num_tokens": 27152532.0, + "step": 3032 + }, + { + "epoch": 2.304711246200608, + "grad_norm": 2.8857851028442383, + "learning_rate": 7.006808536225009e-07, + "loss": 0.3943948745727539, + "mean_token_accuracy": 0.8629783391952515, + "num_tokens": 27157824.0, + "step": 3033 + }, + { + "epoch": 2.3054711246200608, + "grad_norm": 1.7708746194839478, + "learning_rate": 6.992274377802328e-07, + "loss": 0.46951010823249817, + "mean_token_accuracy": 0.8334795236587524, + "num_tokens": 27169445.0, + "step": 3034 + }, + { + "epoch": 2.306231003039514, + "grad_norm": 2.5275487899780273, + "learning_rate": 6.977752857894684e-07, + "loss": 0.3764885365962982, + "mean_token_accuracy": 0.8665527105331421, + "num_tokens": 27176545.0, + "step": 3035 + }, + { + "epoch": 2.3069908814589666, + "grad_norm": 1.9251405000686646, + "learning_rate": 6.963243986693832e-07, + "loss": 0.44473910331726074, + "mean_token_accuracy": 0.8828103542327881, + "num_tokens": 27187808.0, + "step": 3036 + }, + { + "epoch": 2.3077507598784193, + "grad_norm": 2.1559739112854004, + "learning_rate": 6.94874777438265e-07, + "loss": 0.35055795311927795, + "mean_token_accuracy": 0.8815537691116333, + "num_tokens": 27195493.0, + "step": 3037 + }, + { + "epoch": 2.3085106382978724, + "grad_norm": 1.2242814302444458, + "learning_rate": 6.934264231135163e-07, + "loss": 0.38762199878692627, + "mean_token_accuracy": 0.8607999086380005, + "num_tokens": 27213291.0, + "step": 3038 + }, + { + "epoch": 2.309270516717325, + "grad_norm": 3.787707805633545, + "learning_rate": 6.919793367116453e-07, + "loss": 0.299210786819458, + "mean_token_accuracy": 0.8993752002716064, + "num_tokens": 27216930.0, + "step": 3039 + }, + { + "epoch": 2.310030395136778, + "grad_norm": 1.4088979959487915, + "learning_rate": 6.905335192482734e-07, + "loss": 0.337495893239975, + "mean_token_accuracy": 0.8903428912162781, + "num_tokens": 27229441.0, + "step": 3040 + }, + { + "epoch": 2.310790273556231, + "grad_norm": 2.0042521953582764, + "learning_rate": 6.890889717381333e-07, + "loss": 0.2732951045036316, + "mean_token_accuracy": 0.8986722826957703, + "num_tokens": 27237525.0, + "step": 3041 + }, + { + "epoch": 2.311550151975684, + "grad_norm": 2.4301047325134277, + "learning_rate": 6.876456951950614e-07, + "loss": 0.25528258085250854, + "mean_token_accuracy": 0.9083898663520813, + "num_tokens": 27243073.0, + "step": 3042 + }, + { + "epoch": 2.3123100303951367, + "grad_norm": 1.4725151062011719, + "learning_rate": 6.862036906320055e-07, + "loss": 0.3366362452507019, + "mean_token_accuracy": 0.8746060729026794, + "num_tokens": 27255151.0, + "step": 3043 + }, + { + "epoch": 2.31306990881459, + "grad_norm": 2.687649965286255, + "learning_rate": 6.847629590610202e-07, + "loss": 0.30955633521080017, + "mean_token_accuracy": 0.8862895369529724, + "num_tokens": 27259909.0, + "step": 3044 + }, + { + "epoch": 2.3138297872340425, + "grad_norm": 1.9105106592178345, + "learning_rate": 6.833235014932662e-07, + "loss": 0.3366878628730774, + "mean_token_accuracy": 0.8920552134513855, + "num_tokens": 27268003.0, + "step": 3045 + }, + { + "epoch": 2.3145896656534957, + "grad_norm": 2.278108596801758, + "learning_rate": 6.818853189390104e-07, + "loss": 0.41192957758903503, + "mean_token_accuracy": 0.8558850288391113, + "num_tokens": 27275447.0, + "step": 3046 + }, + { + "epoch": 2.3153495440729484, + "grad_norm": 3.114295482635498, + "learning_rate": 6.804484124076249e-07, + "loss": 0.16981825232505798, + "mean_token_accuracy": 0.9305338859558105, + "num_tokens": 27279348.0, + "step": 3047 + }, + { + "epoch": 2.316109422492401, + "grad_norm": 1.188263177871704, + "learning_rate": 6.790127829075843e-07, + "loss": 0.3003719747066498, + "mean_token_accuracy": 0.8945091366767883, + "num_tokens": 27296576.0, + "step": 3048 + }, + { + "epoch": 2.316869300911854, + "grad_norm": 1.4627037048339844, + "learning_rate": 6.775784314464717e-07, + "loss": 0.42125576734542847, + "mean_token_accuracy": 0.85997474193573, + "num_tokens": 27310603.0, + "step": 3049 + }, + { + "epoch": 2.317629179331307, + "grad_norm": 1.86640465259552, + "learning_rate": 6.761453590309675e-07, + "loss": 0.27236056327819824, + "mean_token_accuracy": 0.8952003717422485, + "num_tokens": 27320635.0, + "step": 3050 + }, + { + "epoch": 2.31838905775076, + "grad_norm": 2.3250787258148193, + "learning_rate": 6.747135666668581e-07, + "loss": 0.35650634765625, + "mean_token_accuracy": 0.8870455026626587, + "num_tokens": 27326778.0, + "step": 3051 + }, + { + "epoch": 2.3191489361702127, + "grad_norm": 1.493028163909912, + "learning_rate": 6.732830553590305e-07, + "loss": 0.3086358308792114, + "mean_token_accuracy": 0.8837405443191528, + "num_tokens": 27341792.0, + "step": 3052 + }, + { + "epoch": 2.319908814589666, + "grad_norm": 1.9723037481307983, + "learning_rate": 6.718538261114727e-07, + "loss": 0.2970390021800995, + "mean_token_accuracy": 0.8897635340690613, + "num_tokens": 27349764.0, + "step": 3053 + }, + { + "epoch": 2.3206686930091185, + "grad_norm": 2.418403387069702, + "learning_rate": 6.704258799272723e-07, + "loss": 0.31288546323776245, + "mean_token_accuracy": 0.8795867562294006, + "num_tokens": 27355223.0, + "step": 3054 + }, + { + "epoch": 2.3214285714285716, + "grad_norm": 1.866711139678955, + "learning_rate": 6.689992178086174e-07, + "loss": 0.2915012240409851, + "mean_token_accuracy": 0.8901758790016174, + "num_tokens": 27363363.0, + "step": 3055 + }, + { + "epoch": 2.3221884498480243, + "grad_norm": 2.52559494972229, + "learning_rate": 6.675738407567941e-07, + "loss": 0.28706514835357666, + "mean_token_accuracy": 0.9131950736045837, + "num_tokens": 27368937.0, + "step": 3056 + }, + { + "epoch": 2.3229483282674774, + "grad_norm": 1.5393383502960205, + "learning_rate": 6.661497497721872e-07, + "loss": 0.41627925634384155, + "mean_token_accuracy": 0.8846169114112854, + "num_tokens": 27381824.0, + "step": 3057 + }, + { + "epoch": 2.32370820668693, + "grad_norm": 1.2711350917816162, + "learning_rate": 6.647269458542793e-07, + "loss": 0.3200211524963379, + "mean_token_accuracy": 0.8812989592552185, + "num_tokens": 27399489.0, + "step": 3058 + }, + { + "epoch": 2.324468085106383, + "grad_norm": 2.4790799617767334, + "learning_rate": 6.633054300016464e-07, + "loss": 0.21309956908226013, + "mean_token_accuracy": 0.9245274066925049, + "num_tokens": 27403825.0, + "step": 3059 + }, + { + "epoch": 2.325227963525836, + "grad_norm": 1.937660813331604, + "learning_rate": 6.618852032119655e-07, + "loss": 0.18426720798015594, + "mean_token_accuracy": 0.9317672252655029, + "num_tokens": 27410934.0, + "step": 3060 + }, + { + "epoch": 2.3259878419452886, + "grad_norm": 1.4951587915420532, + "learning_rate": 6.604662664820063e-07, + "loss": 0.27759790420532227, + "mean_token_accuracy": 0.9198849201202393, + "num_tokens": 27421281.0, + "step": 3061 + }, + { + "epoch": 2.3267477203647418, + "grad_norm": 1.6459094285964966, + "learning_rate": 6.590486208076319e-07, + "loss": 0.3164416551589966, + "mean_token_accuracy": 0.8805180788040161, + "num_tokens": 27431545.0, + "step": 3062 + }, + { + "epoch": 2.3275075987841944, + "grad_norm": 1.6612298488616943, + "learning_rate": 6.576322671838003e-07, + "loss": 0.35754746198654175, + "mean_token_accuracy": 0.8680465817451477, + "num_tokens": 27441566.0, + "step": 3063 + }, + { + "epoch": 2.3282674772036476, + "grad_norm": 2.4485018253326416, + "learning_rate": 6.562172066045655e-07, + "loss": 0.2957935929298401, + "mean_token_accuracy": 0.886491596698761, + "num_tokens": 27447186.0, + "step": 3064 + }, + { + "epoch": 2.3290273556231003, + "grad_norm": 1.9771100282669067, + "learning_rate": 6.548034400630693e-07, + "loss": 0.3137952387332916, + "mean_token_accuracy": 0.8874903321266174, + "num_tokens": 27454347.0, + "step": 3065 + }, + { + "epoch": 2.329787234042553, + "grad_norm": 4.502175331115723, + "learning_rate": 6.533909685515483e-07, + "loss": 0.30587732791900635, + "mean_token_accuracy": 0.8878371715545654, + "num_tokens": 27457322.0, + "step": 3066 + }, + { + "epoch": 2.330547112462006, + "grad_norm": 1.041748285293579, + "learning_rate": 6.519797930613289e-07, + "loss": 0.2936970889568329, + "mean_token_accuracy": 0.8899037837982178, + "num_tokens": 27476750.0, + "step": 3067 + }, + { + "epoch": 2.331306990881459, + "grad_norm": 1.57416570186615, + "learning_rate": 6.505699145828287e-07, + "loss": 0.2849736511707306, + "mean_token_accuracy": 0.8906558156013489, + "num_tokens": 27489326.0, + "step": 3068 + }, + { + "epoch": 2.332066869300912, + "grad_norm": 2.879692792892456, + "learning_rate": 6.491613341055547e-07, + "loss": 0.22944235801696777, + "mean_token_accuracy": 0.9167940616607666, + "num_tokens": 27493562.0, + "step": 3069 + }, + { + "epoch": 2.3328267477203646, + "grad_norm": 2.3187942504882812, + "learning_rate": 6.477540526181036e-07, + "loss": 0.3072662949562073, + "mean_token_accuracy": 0.8936570882797241, + "num_tokens": 27499670.0, + "step": 3070 + }, + { + "epoch": 2.3335866261398177, + "grad_norm": 1.3098584413528442, + "learning_rate": 6.463480711081577e-07, + "loss": 0.4124477505683899, + "mean_token_accuracy": 0.8422118425369263, + "num_tokens": 27518197.0, + "step": 3071 + }, + { + "epoch": 2.3343465045592704, + "grad_norm": 1.874219298362732, + "learning_rate": 6.449433905624916e-07, + "loss": 0.34171411395072937, + "mean_token_accuracy": 0.8761874437332153, + "num_tokens": 27526512.0, + "step": 3072 + }, + { + "epoch": 2.3351063829787235, + "grad_norm": 3.3637123107910156, + "learning_rate": 6.435400119669618e-07, + "loss": 0.23634830117225647, + "mean_token_accuracy": 0.9309012293815613, + "num_tokens": 27529921.0, + "step": 3073 + }, + { + "epoch": 2.335866261398176, + "grad_norm": 2.025264263153076, + "learning_rate": 6.421379363065142e-07, + "loss": 0.352272629737854, + "mean_token_accuracy": 0.8678278923034668, + "num_tokens": 27537122.0, + "step": 3074 + }, + { + "epoch": 2.3366261398176293, + "grad_norm": 1.7762253284454346, + "learning_rate": 6.407371645651808e-07, + "loss": 0.3190876841545105, + "mean_token_accuracy": 0.8870849609375, + "num_tokens": 27547436.0, + "step": 3075 + }, + { + "epoch": 2.337386018237082, + "grad_norm": 1.4258071184158325, + "learning_rate": 6.393376977260754e-07, + "loss": 0.24304701387882233, + "mean_token_accuracy": 0.9347224235534668, + "num_tokens": 27559322.0, + "step": 3076 + }, + { + "epoch": 2.3381458966565347, + "grad_norm": 2.015075922012329, + "learning_rate": 6.379395367713983e-07, + "loss": 0.37574928998947144, + "mean_token_accuracy": 0.8884165287017822, + "num_tokens": 27566564.0, + "step": 3077 + }, + { + "epoch": 2.338905775075988, + "grad_norm": 2.2211477756500244, + "learning_rate": 6.365426826824328e-07, + "loss": 0.3210097551345825, + "mean_token_accuracy": 0.8879522085189819, + "num_tokens": 27573643.0, + "step": 3078 + }, + { + "epoch": 2.339665653495441, + "grad_norm": 2.102496385574341, + "learning_rate": 6.351471364395448e-07, + "loss": 0.4013458490371704, + "mean_token_accuracy": 0.887574315071106, + "num_tokens": 27580724.0, + "step": 3079 + }, + { + "epoch": 2.3404255319148937, + "grad_norm": 1.6786696910858154, + "learning_rate": 6.337528990221822e-07, + "loss": 0.3980376124382019, + "mean_token_accuracy": 0.8881500363349915, + "num_tokens": 27592147.0, + "step": 3080 + }, + { + "epoch": 2.3411854103343464, + "grad_norm": 2.541473388671875, + "learning_rate": 6.323599714088754e-07, + "loss": 0.1682094782590866, + "mean_token_accuracy": 0.9426926374435425, + "num_tokens": 27596757.0, + "step": 3081 + }, + { + "epoch": 2.3419452887537995, + "grad_norm": 2.0378596782684326, + "learning_rate": 6.309683545772327e-07, + "loss": 0.4023628234863281, + "mean_token_accuracy": 0.8561117649078369, + "num_tokens": 27604923.0, + "step": 3082 + }, + { + "epoch": 2.342705167173252, + "grad_norm": 1.7666785717010498, + "learning_rate": 6.29578049503946e-07, + "loss": 0.37102991342544556, + "mean_token_accuracy": 0.8807623386383057, + "num_tokens": 27614106.0, + "step": 3083 + }, + { + "epoch": 2.3434650455927053, + "grad_norm": 1.6605560779571533, + "learning_rate": 6.281890571647853e-07, + "loss": 0.4239729642868042, + "mean_token_accuracy": 0.8428831696510315, + "num_tokens": 27626568.0, + "step": 3084 + }, + { + "epoch": 2.344224924012158, + "grad_norm": 1.9562166929244995, + "learning_rate": 6.268013785345969e-07, + "loss": 0.16737908124923706, + "mean_token_accuracy": 0.9457347393035889, + "num_tokens": 27632789.0, + "step": 3085 + }, + { + "epoch": 2.344984802431611, + "grad_norm": 2.274827480316162, + "learning_rate": 6.254150145873081e-07, + "loss": 0.3866672217845917, + "mean_token_accuracy": 0.8498655557632446, + "num_tokens": 27639692.0, + "step": 3086 + }, + { + "epoch": 2.345744680851064, + "grad_norm": 1.9612165689468384, + "learning_rate": 6.240299662959237e-07, + "loss": 0.2607918977737427, + "mean_token_accuracy": 0.9195128679275513, + "num_tokens": 27646911.0, + "step": 3087 + }, + { + "epoch": 2.3465045592705165, + "grad_norm": 1.6821730136871338, + "learning_rate": 6.226462346325221e-07, + "loss": 0.3244997560977936, + "mean_token_accuracy": 0.8889811038970947, + "num_tokens": 27656789.0, + "step": 3088 + }, + { + "epoch": 2.3472644376899696, + "grad_norm": 1.8024263381958008, + "learning_rate": 6.2126382056826e-07, + "loss": 0.28899791836738586, + "mean_token_accuracy": 0.8931136131286621, + "num_tokens": 27666153.0, + "step": 3089 + }, + { + "epoch": 2.3480243161094223, + "grad_norm": 2.8205342292785645, + "learning_rate": 6.198827250733694e-07, + "loss": 0.32387930154800415, + "mean_token_accuracy": 0.9032641649246216, + "num_tokens": 27671042.0, + "step": 3090 + }, + { + "epoch": 2.3487841945288754, + "grad_norm": 2.8001155853271484, + "learning_rate": 6.185029491171554e-07, + "loss": 0.3122251331806183, + "mean_token_accuracy": 0.9122956395149231, + "num_tokens": 27675732.0, + "step": 3091 + }, + { + "epoch": 2.349544072948328, + "grad_norm": 2.6694142818450928, + "learning_rate": 6.171244936679985e-07, + "loss": 0.3166629374027252, + "mean_token_accuracy": 0.875450074672699, + "num_tokens": 27681448.0, + "step": 3092 + }, + { + "epoch": 2.3503039513677813, + "grad_norm": 1.515966534614563, + "learning_rate": 6.157473596933517e-07, + "loss": 0.17373405396938324, + "mean_token_accuracy": 0.933076798915863, + "num_tokens": 27690654.0, + "step": 3093 + }, + { + "epoch": 2.351063829787234, + "grad_norm": 2.4486823081970215, + "learning_rate": 6.143715481597404e-07, + "loss": 0.18732565641403198, + "mean_token_accuracy": 0.9323808550834656, + "num_tokens": 27696111.0, + "step": 3094 + }, + { + "epoch": 2.351823708206687, + "grad_norm": 2.3000645637512207, + "learning_rate": 6.129970600327623e-07, + "loss": 0.267723023891449, + "mean_token_accuracy": 0.9053730964660645, + "num_tokens": 27702103.0, + "step": 3095 + }, + { + "epoch": 2.3525835866261398, + "grad_norm": 2.533583164215088, + "learning_rate": 6.116238962770868e-07, + "loss": 0.40778815746307373, + "mean_token_accuracy": 0.8500792980194092, + "num_tokens": 27708868.0, + "step": 3096 + }, + { + "epoch": 2.353343465045593, + "grad_norm": 1.9357147216796875, + "learning_rate": 6.102520578564508e-07, + "loss": 0.2880813479423523, + "mean_token_accuracy": 0.8895434141159058, + "num_tokens": 27716730.0, + "step": 3097 + }, + { + "epoch": 2.3541033434650456, + "grad_norm": 3.1041259765625, + "learning_rate": 6.088815457336664e-07, + "loss": 0.21810382604599, + "mean_token_accuracy": 0.9217148423194885, + "num_tokens": 27720792.0, + "step": 3098 + }, + { + "epoch": 2.3548632218844983, + "grad_norm": 2.890695095062256, + "learning_rate": 6.075123608706093e-07, + "loss": 0.4002879858016968, + "mean_token_accuracy": 0.8573901653289795, + "num_tokens": 27726201.0, + "step": 3099 + }, + { + "epoch": 2.3556231003039514, + "grad_norm": 3.4247958660125732, + "learning_rate": 6.061445042282271e-07, + "loss": 0.4269426465034485, + "mean_token_accuracy": 0.848825216293335, + "num_tokens": 27730419.0, + "step": 3100 + }, + { + "epoch": 2.356382978723404, + "grad_norm": 1.8903621435165405, + "learning_rate": 6.047779767665341e-07, + "loss": 0.460983544588089, + "mean_token_accuracy": 0.8535886406898499, + "num_tokens": 27741121.0, + "step": 3101 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 2.6975221633911133, + "learning_rate": 6.03412779444612e-07, + "loss": 0.34841713309288025, + "mean_token_accuracy": 0.8812501430511475, + "num_tokens": 27746537.0, + "step": 3102 + }, + { + "epoch": 2.35790273556231, + "grad_norm": 1.4414833784103394, + "learning_rate": 6.02048913220609e-07, + "loss": 0.34440115094184875, + "mean_token_accuracy": 0.8725030422210693, + "num_tokens": 27761085.0, + "step": 3103 + }, + { + "epoch": 2.358662613981763, + "grad_norm": 1.7643623352050781, + "learning_rate": 6.006863790517392e-07, + "loss": 0.31087273359298706, + "mean_token_accuracy": 0.9108829498291016, + "num_tokens": 27769320.0, + "step": 3104 + }, + { + "epoch": 2.3594224924012157, + "grad_norm": 1.365966558456421, + "learning_rate": 5.993251778942794e-07, + "loss": 0.501873254776001, + "mean_token_accuracy": 0.8246122598648071, + "num_tokens": 27791567.0, + "step": 3105 + }, + { + "epoch": 2.360182370820669, + "grad_norm": 2.5981390476226807, + "learning_rate": 5.979653107035754e-07, + "loss": 0.27364015579223633, + "mean_token_accuracy": 0.8946651816368103, + "num_tokens": 27796849.0, + "step": 3106 + }, + { + "epoch": 2.3609422492401215, + "grad_norm": 3.3564229011535645, + "learning_rate": 5.966067784340346e-07, + "loss": 0.2456880509853363, + "mean_token_accuracy": 0.9110729694366455, + "num_tokens": 27800785.0, + "step": 3107 + }, + { + "epoch": 2.3617021276595747, + "grad_norm": 1.6739033460617065, + "learning_rate": 5.952495820391244e-07, + "loss": 0.30737343430519104, + "mean_token_accuracy": 0.8898587226867676, + "num_tokens": 27811982.0, + "step": 3108 + }, + { + "epoch": 2.3624620060790273, + "grad_norm": 1.4430924654006958, + "learning_rate": 5.9389372247138e-07, + "loss": 0.46142861247062683, + "mean_token_accuracy": 0.8355259895324707, + "num_tokens": 27827765.0, + "step": 3109 + }, + { + "epoch": 2.36322188449848, + "grad_norm": 3.7220218181610107, + "learning_rate": 5.92539200682396e-07, + "loss": 0.18588921427726746, + "mean_token_accuracy": 0.9419732093811035, + "num_tokens": 27830551.0, + "step": 3110 + }, + { + "epoch": 2.363981762917933, + "grad_norm": 2.4770448207855225, + "learning_rate": 5.911860176228262e-07, + "loss": 0.3194807767868042, + "mean_token_accuracy": 0.8959789276123047, + "num_tokens": 27836529.0, + "step": 3111 + }, + { + "epoch": 2.364741641337386, + "grad_norm": 2.1989665031433105, + "learning_rate": 5.898341742423866e-07, + "loss": 0.23653598129749298, + "mean_token_accuracy": 0.9119038581848145, + "num_tokens": 27842019.0, + "step": 3112 + }, + { + "epoch": 2.365501519756839, + "grad_norm": 1.9562573432922363, + "learning_rate": 5.884836714898554e-07, + "loss": 0.320852130651474, + "mean_token_accuracy": 0.8902987837791443, + "num_tokens": 27850663.0, + "step": 3113 + }, + { + "epoch": 2.3662613981762917, + "grad_norm": 1.4759801626205444, + "learning_rate": 5.871345103130646e-07, + "loss": 0.2739158570766449, + "mean_token_accuracy": 0.9033761024475098, + "num_tokens": 27863451.0, + "step": 3114 + }, + { + "epoch": 2.367021276595745, + "grad_norm": 1.7798938751220703, + "learning_rate": 5.857866916589089e-07, + "loss": 0.35400229692459106, + "mean_token_accuracy": 0.8623180389404297, + "num_tokens": 27873669.0, + "step": 3115 + }, + { + "epoch": 2.3677811550151975, + "grad_norm": 2.269472360610962, + "learning_rate": 5.84440216473339e-07, + "loss": 0.3717876672744751, + "mean_token_accuracy": 0.8853007555007935, + "num_tokens": 27880307.0, + "step": 3116 + }, + { + "epoch": 2.3685410334346506, + "grad_norm": 1.5675846338272095, + "learning_rate": 5.830950857013629e-07, + "loss": 0.3465133011341095, + "mean_token_accuracy": 0.876459002494812, + "num_tokens": 27893889.0, + "step": 3117 + }, + { + "epoch": 2.3693009118541033, + "grad_norm": 2.782482147216797, + "learning_rate": 5.817513002870451e-07, + "loss": 0.14173674583435059, + "mean_token_accuracy": 0.9492213726043701, + "num_tokens": 27897693.0, + "step": 3118 + }, + { + "epoch": 2.3700607902735564, + "grad_norm": 1.830674171447754, + "learning_rate": 5.80408861173507e-07, + "loss": 0.2692085802555084, + "mean_token_accuracy": 0.9287421107292175, + "num_tokens": 27905261.0, + "step": 3119 + }, + { + "epoch": 2.370820668693009, + "grad_norm": 2.2477660179138184, + "learning_rate": 5.790677693029217e-07, + "loss": 0.32119685411453247, + "mean_token_accuracy": 0.8751975297927856, + "num_tokens": 27911581.0, + "step": 3120 + }, + { + "epoch": 2.371580547112462, + "grad_norm": 2.3288302421569824, + "learning_rate": 5.777280256165218e-07, + "loss": 0.34133443236351013, + "mean_token_accuracy": 0.8763091564178467, + "num_tokens": 27918603.0, + "step": 3121 + }, + { + "epoch": 2.372340425531915, + "grad_norm": 1.595375895500183, + "learning_rate": 5.763896310545893e-07, + "loss": 0.30863112211227417, + "mean_token_accuracy": 0.8858665823936462, + "num_tokens": 27929892.0, + "step": 3122 + }, + { + "epoch": 2.3731003039513676, + "grad_norm": 2.0553293228149414, + "learning_rate": 5.750525865564613e-07, + "loss": 0.28052228689193726, + "mean_token_accuracy": 0.8970555067062378, + "num_tokens": 27937532.0, + "step": 3123 + }, + { + "epoch": 2.3738601823708207, + "grad_norm": 1.4700267314910889, + "learning_rate": 5.737168930605272e-07, + "loss": 0.27994588017463684, + "mean_token_accuracy": 0.9026262760162354, + "num_tokens": 27948679.0, + "step": 3124 + }, + { + "epoch": 2.3746200607902734, + "grad_norm": 3.2083890438079834, + "learning_rate": 5.723825515042284e-07, + "loss": 0.1810106784105301, + "mean_token_accuracy": 0.9297720193862915, + "num_tokens": 27952090.0, + "step": 3125 + }, + { + "epoch": 2.3753799392097266, + "grad_norm": 1.4345086812973022, + "learning_rate": 5.710495628240567e-07, + "loss": 0.2929079830646515, + "mean_token_accuracy": 0.8950849771499634, + "num_tokens": 27964959.0, + "step": 3126 + }, + { + "epoch": 2.3761398176291793, + "grad_norm": 2.0222737789154053, + "learning_rate": 5.697179279555551e-07, + "loss": 0.41308528184890747, + "mean_token_accuracy": 0.8616737127304077, + "num_tokens": 27973803.0, + "step": 3127 + }, + { + "epoch": 2.3768996960486324, + "grad_norm": 1.2820483446121216, + "learning_rate": 5.683876478333161e-07, + "loss": 0.4069697856903076, + "mean_token_accuracy": 0.8547379970550537, + "num_tokens": 27991576.0, + "step": 3128 + }, + { + "epoch": 2.377659574468085, + "grad_norm": 2.3709049224853516, + "learning_rate": 5.670587233909819e-07, + "loss": 0.1923210471868515, + "mean_token_accuracy": 0.9360835552215576, + "num_tokens": 27997051.0, + "step": 3129 + }, + { + "epoch": 2.378419452887538, + "grad_norm": 1.874002456665039, + "learning_rate": 5.657311555612433e-07, + "loss": 0.431087851524353, + "mean_token_accuracy": 0.8736472129821777, + "num_tokens": 28004863.0, + "step": 3130 + }, + { + "epoch": 2.379179331306991, + "grad_norm": 1.0792341232299805, + "learning_rate": 5.64404945275836e-07, + "loss": 0.38039785623550415, + "mean_token_accuracy": 0.8523920178413391, + "num_tokens": 28027220.0, + "step": 3131 + }, + { + "epoch": 2.3799392097264436, + "grad_norm": 1.7947046756744385, + "learning_rate": 5.630800934655481e-07, + "loss": 0.29587826132774353, + "mean_token_accuracy": 0.8919603824615479, + "num_tokens": 28035495.0, + "step": 3132 + }, + { + "epoch": 2.3806990881458967, + "grad_norm": 3.4972469806671143, + "learning_rate": 5.617566010602113e-07, + "loss": 0.31223949790000916, + "mean_token_accuracy": 0.895270586013794, + "num_tokens": 28039135.0, + "step": 3133 + }, + { + "epoch": 2.3814589665653494, + "grad_norm": 2.331387758255005, + "learning_rate": 5.60434468988702e-07, + "loss": 0.30856233835220337, + "mean_token_accuracy": 0.8810996413230896, + "num_tokens": 28045572.0, + "step": 3134 + }, + { + "epoch": 2.3822188449848025, + "grad_norm": 1.9918609857559204, + "learning_rate": 5.591136981789439e-07, + "loss": 0.3031975328922272, + "mean_token_accuracy": 0.9028782844543457, + "num_tokens": 28051851.0, + "step": 3135 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 1.6089690923690796, + "learning_rate": 5.577942895579064e-07, + "loss": 0.34390494227409363, + "mean_token_accuracy": 0.8744557499885559, + "num_tokens": 28062705.0, + "step": 3136 + }, + { + "epoch": 2.3837386018237083, + "grad_norm": 1.4829623699188232, + "learning_rate": 5.564762440515994e-07, + "loss": 0.3172723650932312, + "mean_token_accuracy": 0.9192344546318054, + "num_tokens": 28073539.0, + "step": 3137 + }, + { + "epoch": 2.384498480243161, + "grad_norm": 1.4833530187606812, + "learning_rate": 5.551595625850786e-07, + "loss": 0.3714778423309326, + "mean_token_accuracy": 0.8697570562362671, + "num_tokens": 28085949.0, + "step": 3138 + }, + { + "epoch": 2.385258358662614, + "grad_norm": 3.140885829925537, + "learning_rate": 5.538442460824417e-07, + "loss": 0.3266214430332184, + "mean_token_accuracy": 0.9124236702919006, + "num_tokens": 28090639.0, + "step": 3139 + }, + { + "epoch": 2.386018237082067, + "grad_norm": 1.731658935546875, + "learning_rate": 5.525302954668285e-07, + "loss": 0.21903038024902344, + "mean_token_accuracy": 0.9181338548660278, + "num_tokens": 28099076.0, + "step": 3140 + }, + { + "epoch": 2.38677811550152, + "grad_norm": 1.2315683364868164, + "learning_rate": 5.5121771166042e-07, + "loss": 0.25057584047317505, + "mean_token_accuracy": 0.9130429029464722, + "num_tokens": 28113532.0, + "step": 3141 + }, + { + "epoch": 2.3875379939209727, + "grad_norm": 3.888575553894043, + "learning_rate": 5.499064955844383e-07, + "loss": 0.173577219247818, + "mean_token_accuracy": 0.9388964176177979, + "num_tokens": 28116683.0, + "step": 3142 + }, + { + "epoch": 2.3882978723404253, + "grad_norm": 1.4791816473007202, + "learning_rate": 5.48596648159145e-07, + "loss": 0.38739481568336487, + "mean_token_accuracy": 0.9086727499961853, + "num_tokens": 28129363.0, + "step": 3143 + }, + { + "epoch": 2.3890577507598785, + "grad_norm": 2.1314213275909424, + "learning_rate": 5.472881703038418e-07, + "loss": 0.3724244236946106, + "mean_token_accuracy": 0.8749525547027588, + "num_tokens": 28136421.0, + "step": 3144 + }, + { + "epoch": 2.389817629179331, + "grad_norm": 2.4120898246765137, + "learning_rate": 5.459810629368692e-07, + "loss": 0.36195144057273865, + "mean_token_accuracy": 0.869860053062439, + "num_tokens": 28143903.0, + "step": 3145 + }, + { + "epoch": 2.3905775075987843, + "grad_norm": 1.7327654361724854, + "learning_rate": 5.446753269756036e-07, + "loss": 0.3846886157989502, + "mean_token_accuracy": 0.859398603439331, + "num_tokens": 28155403.0, + "step": 3146 + }, + { + "epoch": 2.391337386018237, + "grad_norm": 1.2435929775238037, + "learning_rate": 5.433709633364637e-07, + "loss": 0.36000579595565796, + "mean_token_accuracy": 0.8722110986709595, + "num_tokens": 28171739.0, + "step": 3147 + }, + { + "epoch": 2.39209726443769, + "grad_norm": 1.746272325515747, + "learning_rate": 5.420679729348993e-07, + "loss": 0.36778098344802856, + "mean_token_accuracy": 0.8639050722122192, + "num_tokens": 28182326.0, + "step": 3148 + }, + { + "epoch": 2.392857142857143, + "grad_norm": 2.0103561878204346, + "learning_rate": 5.407663566854008e-07, + "loss": 0.3921544551849365, + "mean_token_accuracy": 0.8679144382476807, + "num_tokens": 28191456.0, + "step": 3149 + }, + { + "epoch": 2.393617021276596, + "grad_norm": 1.792054533958435, + "learning_rate": 5.394661155014921e-07, + "loss": 0.4300078749656677, + "mean_token_accuracy": 0.8496290445327759, + "num_tokens": 28201943.0, + "step": 3150 + }, + { + "epoch": 2.3943768996960486, + "grad_norm": 1.1109238862991333, + "learning_rate": 5.381672502957324e-07, + "loss": 0.3262210190296173, + "mean_token_accuracy": 0.8634629845619202, + "num_tokens": 28221353.0, + "step": 3151 + }, + { + "epoch": 2.3951367781155017, + "grad_norm": 1.855241060256958, + "learning_rate": 5.368697619797159e-07, + "loss": 0.3076592981815338, + "mean_token_accuracy": 0.9093140959739685, + "num_tokens": 28229172.0, + "step": 3152 + }, + { + "epoch": 2.3958966565349544, + "grad_norm": 2.416808605194092, + "learning_rate": 5.355736514640697e-07, + "loss": 0.27811431884765625, + "mean_token_accuracy": 0.9024926424026489, + "num_tokens": 28234877.0, + "step": 3153 + }, + { + "epoch": 2.396656534954407, + "grad_norm": 1.6434770822525024, + "learning_rate": 5.342789196584527e-07, + "loss": 0.43254753947257996, + "mean_token_accuracy": 0.8404601812362671, + "num_tokens": 28245905.0, + "step": 3154 + }, + { + "epoch": 2.3974164133738602, + "grad_norm": 2.4053826332092285, + "learning_rate": 5.329855674715592e-07, + "loss": 0.3984904885292053, + "mean_token_accuracy": 0.8764510154724121, + "num_tokens": 28251558.0, + "step": 3155 + }, + { + "epoch": 2.398176291793313, + "grad_norm": 1.60322904586792, + "learning_rate": 5.316935958111139e-07, + "loss": 0.34025734663009644, + "mean_token_accuracy": 0.8753441572189331, + "num_tokens": 28261596.0, + "step": 3156 + }, + { + "epoch": 2.398936170212766, + "grad_norm": 1.5645020008087158, + "learning_rate": 5.304030055838704e-07, + "loss": 0.35805732011795044, + "mean_token_accuracy": 0.8628225922584534, + "num_tokens": 28272233.0, + "step": 3157 + }, + { + "epoch": 2.3996960486322187, + "grad_norm": 2.0708835124969482, + "learning_rate": 5.291137976956148e-07, + "loss": 0.35056009888648987, + "mean_token_accuracy": 0.8771238923072815, + "num_tokens": 28279905.0, + "step": 3158 + }, + { + "epoch": 2.400455927051672, + "grad_norm": 1.9882023334503174, + "learning_rate": 5.278259730511651e-07, + "loss": 0.30454230308532715, + "mean_token_accuracy": 0.883628249168396, + "num_tokens": 28287183.0, + "step": 3159 + }, + { + "epoch": 2.4012158054711246, + "grad_norm": 2.3435161113739014, + "learning_rate": 5.26539532554364e-07, + "loss": 0.262816458940506, + "mean_token_accuracy": 0.8924182653427124, + "num_tokens": 28293816.0, + "step": 3160 + }, + { + "epoch": 2.4019756838905777, + "grad_norm": 1.5700311660766602, + "learning_rate": 5.252544771080853e-07, + "loss": 0.43474194407463074, + "mean_token_accuracy": 0.8594561815261841, + "num_tokens": 28306346.0, + "step": 3161 + }, + { + "epoch": 2.4027355623100304, + "grad_norm": 1.8969467878341675, + "learning_rate": 5.239708076142311e-07, + "loss": 0.309972882270813, + "mean_token_accuracy": 0.8846274614334106, + "num_tokens": 28314843.0, + "step": 3162 + }, + { + "epoch": 2.4034954407294835, + "grad_norm": 2.2149617671966553, + "learning_rate": 5.226885249737292e-07, + "loss": 0.40023672580718994, + "mean_token_accuracy": 0.8641965389251709, + "num_tokens": 28322124.0, + "step": 3163 + }, + { + "epoch": 2.404255319148936, + "grad_norm": 1.3280621767044067, + "learning_rate": 5.214076300865359e-07, + "loss": 0.31123194098472595, + "mean_token_accuracy": 0.8883715271949768, + "num_tokens": 28336490.0, + "step": 3164 + }, + { + "epoch": 2.405015197568389, + "grad_norm": 1.402884602546692, + "learning_rate": 5.201281238516318e-07, + "loss": 0.2590488791465759, + "mean_token_accuracy": 0.9011414051055908, + "num_tokens": 28349094.0, + "step": 3165 + }, + { + "epoch": 2.405775075987842, + "grad_norm": 1.6564174890518188, + "learning_rate": 5.188500071670235e-07, + "loss": 0.23672837018966675, + "mean_token_accuracy": 0.9133221507072449, + "num_tokens": 28357665.0, + "step": 3166 + }, + { + "epoch": 2.4065349544072947, + "grad_norm": 1.9133414030075073, + "learning_rate": 5.175732809297435e-07, + "loss": 0.40488386154174805, + "mean_token_accuracy": 0.8534098863601685, + "num_tokens": 28366519.0, + "step": 3167 + }, + { + "epoch": 2.407294832826748, + "grad_norm": 1.447898268699646, + "learning_rate": 5.16297946035847e-07, + "loss": 0.3679184913635254, + "mean_token_accuracy": 0.8696858882904053, + "num_tokens": 28379315.0, + "step": 3168 + }, + { + "epoch": 2.4080547112462005, + "grad_norm": 3.454120397567749, + "learning_rate": 5.150240033804116e-07, + "loss": 0.23210272192955017, + "mean_token_accuracy": 0.9179670214653015, + "num_tokens": 28382844.0, + "step": 3169 + }, + { + "epoch": 2.4088145896656536, + "grad_norm": 1.7603836059570312, + "learning_rate": 5.137514538575419e-07, + "loss": 0.4491140842437744, + "mean_token_accuracy": 0.8472066521644592, + "num_tokens": 28394064.0, + "step": 3170 + }, + { + "epoch": 2.4095744680851063, + "grad_norm": 1.3338149785995483, + "learning_rate": 5.124802983603602e-07, + "loss": 0.3237353563308716, + "mean_token_accuracy": 0.8897873163223267, + "num_tokens": 28410190.0, + "step": 3171 + }, + { + "epoch": 2.410334346504559, + "grad_norm": 2.6191205978393555, + "learning_rate": 5.112105377810128e-07, + "loss": 0.3119543194770813, + "mean_token_accuracy": 0.889589250087738, + "num_tokens": 28414838.0, + "step": 3172 + }, + { + "epoch": 2.411094224924012, + "grad_norm": 2.583130121231079, + "learning_rate": 5.099421730106669e-07, + "loss": 0.2616881728172302, + "mean_token_accuracy": 0.9155426621437073, + "num_tokens": 28419792.0, + "step": 3173 + }, + { + "epoch": 2.4118541033434653, + "grad_norm": 2.875683307647705, + "learning_rate": 5.086752049395094e-07, + "loss": 0.2567689120769501, + "mean_token_accuracy": 0.9075877666473389, + "num_tokens": 28424069.0, + "step": 3174 + }, + { + "epoch": 2.412613981762918, + "grad_norm": 1.695042371749878, + "learning_rate": 5.074096344567475e-07, + "loss": 0.3164510130882263, + "mean_token_accuracy": 0.8845095634460449, + "num_tokens": 28433279.0, + "step": 3175 + }, + { + "epoch": 2.4133738601823707, + "grad_norm": 2.110863447189331, + "learning_rate": 5.061454624506074e-07, + "loss": 0.22680208086967468, + "mean_token_accuracy": 0.9221781492233276, + "num_tokens": 28439569.0, + "step": 3176 + }, + { + "epoch": 2.414133738601824, + "grad_norm": 2.030958890914917, + "learning_rate": 5.048826898083331e-07, + "loss": 0.3482169210910797, + "mean_token_accuracy": 0.8853809833526611, + "num_tokens": 28447203.0, + "step": 3177 + }, + { + "epoch": 2.4148936170212765, + "grad_norm": 1.9921014308929443, + "learning_rate": 5.036213174161877e-07, + "loss": 0.29343554377555847, + "mean_token_accuracy": 0.893486499786377, + "num_tokens": 28454923.0, + "step": 3178 + }, + { + "epoch": 2.4156534954407296, + "grad_norm": 4.079009532928467, + "learning_rate": 5.023613461594512e-07, + "loss": 0.2569321095943451, + "mean_token_accuracy": 0.9205472469329834, + "num_tokens": 28458173.0, + "step": 3179 + }, + { + "epoch": 2.4164133738601823, + "grad_norm": 3.077458143234253, + "learning_rate": 5.01102776922418e-07, + "loss": 0.3203810453414917, + "mean_token_accuracy": 0.8863208293914795, + "num_tokens": 28462449.0, + "step": 3180 + }, + { + "epoch": 2.4171732522796354, + "grad_norm": 2.4658167362213135, + "learning_rate": 4.998456105884025e-07, + "loss": 0.33045345544815063, + "mean_token_accuracy": 0.8856333494186401, + "num_tokens": 28468051.0, + "step": 3181 + }, + { + "epoch": 2.417933130699088, + "grad_norm": 2.053370714187622, + "learning_rate": 4.985898480397322e-07, + "loss": 0.2415514886379242, + "mean_token_accuracy": 0.9296282529830933, + "num_tokens": 28473839.0, + "step": 3182 + }, + { + "epoch": 2.418693009118541, + "grad_norm": 2.705026149749756, + "learning_rate": 4.973354901577487e-07, + "loss": 0.3233085870742798, + "mean_token_accuracy": 0.8820867538452148, + "num_tokens": 28479419.0, + "step": 3183 + }, + { + "epoch": 2.419452887537994, + "grad_norm": 2.1648733615875244, + "learning_rate": 4.960825378228082e-07, + "loss": 0.25225499272346497, + "mean_token_accuracy": 0.9170141220092773, + "num_tokens": 28484968.0, + "step": 3184 + }, + { + "epoch": 2.420212765957447, + "grad_norm": 1.8317075967788696, + "learning_rate": 4.948309919142832e-07, + "loss": 0.3143184781074524, + "mean_token_accuracy": 0.8824752569198608, + "num_tokens": 28492904.0, + "step": 3185 + }, + { + "epoch": 2.4209726443768997, + "grad_norm": 2.591052770614624, + "learning_rate": 4.935808533105546e-07, + "loss": 0.31191521883010864, + "mean_token_accuracy": 0.9041938185691833, + "num_tokens": 28498136.0, + "step": 3186 + }, + { + "epoch": 2.4217325227963524, + "grad_norm": 2.200559139251709, + "learning_rate": 4.923321228890184e-07, + "loss": 0.23661679029464722, + "mean_token_accuracy": 0.9179906845092773, + "num_tokens": 28504246.0, + "step": 3187 + }, + { + "epoch": 2.4224924012158056, + "grad_norm": 1.6311591863632202, + "learning_rate": 4.910848015260822e-07, + "loss": 0.35421687364578247, + "mean_token_accuracy": 0.8728591799736023, + "num_tokens": 28515481.0, + "step": 3188 + }, + { + "epoch": 2.4232522796352582, + "grad_norm": 2.1564102172851562, + "learning_rate": 4.898388900971635e-07, + "loss": 0.30809515714645386, + "mean_token_accuracy": 0.8960262537002563, + "num_tokens": 28521294.0, + "step": 3189 + }, + { + "epoch": 2.4240121580547114, + "grad_norm": 2.1413958072662354, + "learning_rate": 4.885943894766909e-07, + "loss": 0.217842698097229, + "mean_token_accuracy": 0.9408326745033264, + "num_tokens": 28527104.0, + "step": 3190 + }, + { + "epoch": 2.424772036474164, + "grad_norm": 2.373764991760254, + "learning_rate": 4.873513005381042e-07, + "loss": 0.33814239501953125, + "mean_token_accuracy": 0.9007177352905273, + "num_tokens": 28533654.0, + "step": 3191 + }, + { + "epoch": 2.425531914893617, + "grad_norm": 1.8809123039245605, + "learning_rate": 4.861096241538483e-07, + "loss": 0.4467903971672058, + "mean_token_accuracy": 0.8424190282821655, + "num_tokens": 28543667.0, + "step": 3192 + }, + { + "epoch": 2.42629179331307, + "grad_norm": 1.4945175647735596, + "learning_rate": 4.848693611953825e-07, + "loss": 0.3123834729194641, + "mean_token_accuracy": 0.9072253704071045, + "num_tokens": 28554944.0, + "step": 3193 + }, + { + "epoch": 2.4270516717325226, + "grad_norm": 1.8136200904846191, + "learning_rate": 4.836305125331695e-07, + "loss": 0.27221372723579407, + "mean_token_accuracy": 0.9039586782455444, + "num_tokens": 28563082.0, + "step": 3194 + }, + { + "epoch": 2.4278115501519757, + "grad_norm": 4.269916534423828, + "learning_rate": 4.823930790366801e-07, + "loss": 0.2660295069217682, + "mean_token_accuracy": 0.9072147607803345, + "num_tokens": 28566246.0, + "step": 3195 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 2.273453950881958, + "learning_rate": 4.811570615743952e-07, + "loss": 0.27304959297180176, + "mean_token_accuracy": 0.9012454748153687, + "num_tokens": 28572906.0, + "step": 3196 + }, + { + "epoch": 2.4293313069908815, + "grad_norm": 2.0931496620178223, + "learning_rate": 4.799224610137975e-07, + "loss": 0.2358006238937378, + "mean_token_accuracy": 0.9263845682144165, + "num_tokens": 28579148.0, + "step": 3197 + }, + { + "epoch": 2.430091185410334, + "grad_norm": 1.885201334953308, + "learning_rate": 4.786892782213781e-07, + "loss": 0.346000611782074, + "mean_token_accuracy": 0.8616824150085449, + "num_tokens": 28587361.0, + "step": 3198 + }, + { + "epoch": 2.4308510638297873, + "grad_norm": 2.229367971420288, + "learning_rate": 4.774575140626317e-07, + "loss": 0.2951638400554657, + "mean_token_accuracy": 0.8938933610916138, + "num_tokens": 28593622.0, + "step": 3199 + }, + { + "epoch": 2.43161094224924, + "grad_norm": 2.681004762649536, + "learning_rate": 4.7622716940205787e-07, + "loss": 0.2588275671005249, + "mean_token_accuracy": 0.9101524353027344, + "num_tokens": 28597890.0, + "step": 3200 + }, + { + "epoch": 2.432370820668693, + "grad_norm": 1.8040683269500732, + "learning_rate": 4.7499824510316013e-07, + "loss": 0.3194184899330139, + "mean_token_accuracy": 0.906498908996582, + "num_tokens": 28606885.0, + "step": 3201 + }, + { + "epoch": 2.433130699088146, + "grad_norm": 3.4185421466827393, + "learning_rate": 4.7377074202844514e-07, + "loss": 0.4457589387893677, + "mean_token_accuracy": 0.8387380838394165, + "num_tokens": 28611709.0, + "step": 3202 + }, + { + "epoch": 2.433890577507599, + "grad_norm": 2.6594674587249756, + "learning_rate": 4.7254466103941995e-07, + "loss": 0.3260703384876251, + "mean_token_accuracy": 0.9050778150558472, + "num_tokens": 28616931.0, + "step": 3203 + }, + { + "epoch": 2.4346504559270516, + "grad_norm": 1.999886393547058, + "learning_rate": 4.713200029965978e-07, + "loss": 0.2933492660522461, + "mean_token_accuracy": 0.900344729423523, + "num_tokens": 28624034.0, + "step": 3204 + }, + { + "epoch": 2.4354103343465043, + "grad_norm": 1.932508111000061, + "learning_rate": 4.700967687594901e-07, + "loss": 0.29114463925361633, + "mean_token_accuracy": 0.9247289896011353, + "num_tokens": 28633528.0, + "step": 3205 + }, + { + "epoch": 2.4361702127659575, + "grad_norm": 4.55303430557251, + "learning_rate": 4.68874959186609e-07, + "loss": 0.2936939597129822, + "mean_token_accuracy": 0.9006574153900146, + "num_tokens": 28636289.0, + "step": 3206 + }, + { + "epoch": 2.43693009118541, + "grad_norm": 2.02156400680542, + "learning_rate": 4.6765457513546747e-07, + "loss": 0.3098263740539551, + "mean_token_accuracy": 0.8965007066726685, + "num_tokens": 28643491.0, + "step": 3207 + }, + { + "epoch": 2.4376899696048633, + "grad_norm": 2.018125295639038, + "learning_rate": 4.664356174625795e-07, + "loss": 0.4749948978424072, + "mean_token_accuracy": 0.8366118669509888, + "num_tokens": 28654136.0, + "step": 3208 + }, + { + "epoch": 2.438449848024316, + "grad_norm": 2.0175318717956543, + "learning_rate": 4.6521808702345516e-07, + "loss": 0.31277763843536377, + "mean_token_accuracy": 0.8878506422042847, + "num_tokens": 28662363.0, + "step": 3209 + }, + { + "epoch": 2.439209726443769, + "grad_norm": 1.37982177734375, + "learning_rate": 4.640019846726043e-07, + "loss": 0.3872165083885193, + "mean_token_accuracy": 0.8586703538894653, + "num_tokens": 28681637.0, + "step": 3210 + }, + { + "epoch": 2.439969604863222, + "grad_norm": 2.265124559402466, + "learning_rate": 4.6278731126353447e-07, + "loss": 0.20262989401817322, + "mean_token_accuracy": 0.9290857315063477, + "num_tokens": 28687032.0, + "step": 3211 + }, + { + "epoch": 2.440729483282675, + "grad_norm": 1.730516791343689, + "learning_rate": 4.615740676487507e-07, + "loss": 0.21819885075092316, + "mean_token_accuracy": 0.9351010322570801, + "num_tokens": 28694692.0, + "step": 3212 + }, + { + "epoch": 2.4414893617021276, + "grad_norm": 2.297240972518921, + "learning_rate": 4.603622546797534e-07, + "loss": 0.34703850746154785, + "mean_token_accuracy": 0.8764227628707886, + "num_tokens": 28700838.0, + "step": 3213 + }, + { + "epoch": 2.4422492401215807, + "grad_norm": 1.3174461126327515, + "learning_rate": 4.591518732070402e-07, + "loss": 0.27869975566864014, + "mean_token_accuracy": 0.8975766897201538, + "num_tokens": 28715114.0, + "step": 3214 + }, + { + "epoch": 2.4430091185410334, + "grad_norm": 1.8751143217086792, + "learning_rate": 4.5794292408010285e-07, + "loss": 0.4260019361972809, + "mean_token_accuracy": 0.8564238548278809, + "num_tokens": 28724176.0, + "step": 3215 + }, + { + "epoch": 2.443768996960486, + "grad_norm": 2.095414161682129, + "learning_rate": 4.5673540814742875e-07, + "loss": 0.2791098952293396, + "mean_token_accuracy": 0.896371603012085, + "num_tokens": 28730815.0, + "step": 3216 + }, + { + "epoch": 2.4445288753799392, + "grad_norm": 1.470991611480713, + "learning_rate": 4.555293262564994e-07, + "loss": 0.3128473162651062, + "mean_token_accuracy": 0.8857797980308533, + "num_tokens": 28743271.0, + "step": 3217 + }, + { + "epoch": 2.445288753799392, + "grad_norm": 1.8783953189849854, + "learning_rate": 4.5432467925378784e-07, + "loss": 0.24838949739933014, + "mean_token_accuracy": 0.9119431972503662, + "num_tokens": 28751291.0, + "step": 3218 + }, + { + "epoch": 2.446048632218845, + "grad_norm": 2.134469747543335, + "learning_rate": 4.53121467984764e-07, + "loss": 0.390994668006897, + "mean_token_accuracy": 0.8823093175888062, + "num_tokens": 28758262.0, + "step": 3219 + }, + { + "epoch": 2.4468085106382977, + "grad_norm": 1.369758129119873, + "learning_rate": 4.5191969329388627e-07, + "loss": 0.33717092871665955, + "mean_token_accuracy": 0.8909138441085815, + "num_tokens": 28770330.0, + "step": 3220 + }, + { + "epoch": 2.447568389057751, + "grad_norm": 1.3363337516784668, + "learning_rate": 4.5071935602460704e-07, + "loss": 0.41521191596984863, + "mean_token_accuracy": 0.8482609987258911, + "num_tokens": 28788148.0, + "step": 3221 + }, + { + "epoch": 2.4483282674772036, + "grad_norm": 1.5309195518493652, + "learning_rate": 4.495204570193687e-07, + "loss": 0.23737329244613647, + "mean_token_accuracy": 0.9094061851501465, + "num_tokens": 28798150.0, + "step": 3222 + }, + { + "epoch": 2.4490881458966567, + "grad_norm": 1.5956720113754272, + "learning_rate": 4.483229971196054e-07, + "loss": 0.24943354725837708, + "mean_token_accuracy": 0.9051728248596191, + "num_tokens": 28808116.0, + "step": 3223 + }, + { + "epoch": 2.4498480243161094, + "grad_norm": 1.515918254852295, + "learning_rate": 4.4712697716573994e-07, + "loss": 0.3883020281791687, + "mean_token_accuracy": 0.8599046468734741, + "num_tokens": 28822835.0, + "step": 3224 + }, + { + "epoch": 2.4506079027355625, + "grad_norm": 1.3584989309310913, + "learning_rate": 4.4593239799718636e-07, + "loss": 0.33565959334373474, + "mean_token_accuracy": 0.8725172281265259, + "num_tokens": 28841697.0, + "step": 3225 + }, + { + "epoch": 2.451367781155015, + "grad_norm": 2.800762414932251, + "learning_rate": 4.447392604523443e-07, + "loss": 0.36243852972984314, + "mean_token_accuracy": 0.8881685733795166, + "num_tokens": 28847164.0, + "step": 3226 + }, + { + "epoch": 2.452127659574468, + "grad_norm": 1.3506053686141968, + "learning_rate": 4.43547565368605e-07, + "loss": 0.21717754006385803, + "mean_token_accuracy": 0.9296318292617798, + "num_tokens": 28858658.0, + "step": 3227 + }, + { + "epoch": 2.452887537993921, + "grad_norm": 2.094951868057251, + "learning_rate": 4.423573135823464e-07, + "loss": 0.3554617762565613, + "mean_token_accuracy": 0.8762428760528564, + "num_tokens": 28866509.0, + "step": 3228 + }, + { + "epoch": 2.4536474164133737, + "grad_norm": 1.4730854034423828, + "learning_rate": 4.411685059289314e-07, + "loss": 0.2805292010307312, + "mean_token_accuracy": 0.9004697799682617, + "num_tokens": 28878151.0, + "step": 3229 + }, + { + "epoch": 2.454407294832827, + "grad_norm": 2.1443302631378174, + "learning_rate": 4.399811432427123e-07, + "loss": 0.3829796314239502, + "mean_token_accuracy": 0.866457462310791, + "num_tokens": 28886050.0, + "step": 3230 + }, + { + "epoch": 2.4551671732522795, + "grad_norm": 3.437201738357544, + "learning_rate": 4.387952263570261e-07, + "loss": 0.18470892310142517, + "mean_token_accuracy": 0.9365379810333252, + "num_tokens": 28889484.0, + "step": 3231 + }, + { + "epoch": 2.4559270516717326, + "grad_norm": 2.6203434467315674, + "learning_rate": 4.376107561041937e-07, + "loss": 0.25328633189201355, + "mean_token_accuracy": 0.921377956867218, + "num_tokens": 28893972.0, + "step": 3232 + }, + { + "epoch": 2.4566869300911853, + "grad_norm": 2.4467883110046387, + "learning_rate": 4.3642773331552203e-07, + "loss": 0.2748469412326813, + "mean_token_accuracy": 0.9046314358711243, + "num_tokens": 28899118.0, + "step": 3233 + }, + { + "epoch": 2.4574468085106385, + "grad_norm": 1.9845495223999023, + "learning_rate": 4.352461588213036e-07, + "loss": 0.443121075630188, + "mean_token_accuracy": 0.8609750866889954, + "num_tokens": 28909005.0, + "step": 3234 + }, + { + "epoch": 2.458206686930091, + "grad_norm": 2.8748083114624023, + "learning_rate": 4.340660334508115e-07, + "loss": 0.22461901605129242, + "mean_token_accuracy": 0.916649341583252, + "num_tokens": 28913366.0, + "step": 3235 + }, + { + "epoch": 2.4589665653495443, + "grad_norm": 1.7406567335128784, + "learning_rate": 4.328873580323034e-07, + "loss": 0.4147683382034302, + "mean_token_accuracy": 0.8523626327514648, + "num_tokens": 28924695.0, + "step": 3236 + }, + { + "epoch": 2.459726443768997, + "grad_norm": 1.767052412033081, + "learning_rate": 4.3171013339301905e-07, + "loss": 0.38994747400283813, + "mean_token_accuracy": 0.863203227519989, + "num_tokens": 28935163.0, + "step": 3237 + }, + { + "epoch": 2.4604863221884496, + "grad_norm": 1.2257410287857056, + "learning_rate": 4.305343603591802e-07, + "loss": 0.23309440910816193, + "mean_token_accuracy": 0.9016385674476624, + "num_tokens": 28948374.0, + "step": 3238 + }, + { + "epoch": 2.461246200607903, + "grad_norm": 1.3017674684524536, + "learning_rate": 4.293600397559897e-07, + "loss": 0.2825638949871063, + "mean_token_accuracy": 0.8953868746757507, + "num_tokens": 28961695.0, + "step": 3239 + }, + { + "epoch": 2.4620060790273555, + "grad_norm": 1.475160837173462, + "learning_rate": 4.2818717240763115e-07, + "loss": 0.30598294734954834, + "mean_token_accuracy": 0.8774391412734985, + "num_tokens": 28976399.0, + "step": 3240 + }, + { + "epoch": 2.4627659574468086, + "grad_norm": 2.1078310012817383, + "learning_rate": 4.2701575913726644e-07, + "loss": 0.4696943759918213, + "mean_token_accuracy": 0.8566044569015503, + "num_tokens": 28985515.0, + "step": 3241 + }, + { + "epoch": 2.4635258358662613, + "grad_norm": 2.587887763977051, + "learning_rate": 4.258458007670413e-07, + "loss": 0.32537540793418884, + "mean_token_accuracy": 0.8889709711074829, + "num_tokens": 28990365.0, + "step": 3242 + }, + { + "epoch": 2.4642857142857144, + "grad_norm": 1.622995138168335, + "learning_rate": 4.2467729811807497e-07, + "loss": 0.47171884775161743, + "mean_token_accuracy": 0.8305673599243164, + "num_tokens": 29002644.0, + "step": 3243 + }, + { + "epoch": 2.465045592705167, + "grad_norm": 2.0702009201049805, + "learning_rate": 4.235102520104681e-07, + "loss": 0.45754289627075195, + "mean_token_accuracy": 0.8536194562911987, + "num_tokens": 29011325.0, + "step": 3244 + }, + { + "epoch": 2.4658054711246202, + "grad_norm": 1.4394203424453735, + "learning_rate": 4.2234466326330023e-07, + "loss": 0.36623480916023254, + "mean_token_accuracy": 0.8834698796272278, + "num_tokens": 29028440.0, + "step": 3245 + }, + { + "epoch": 2.466565349544073, + "grad_norm": 1.6777557134628296, + "learning_rate": 4.211805326946247e-07, + "loss": 0.19617480039596558, + "mean_token_accuracy": 0.920343279838562, + "num_tokens": 29035936.0, + "step": 3246 + }, + { + "epoch": 2.467325227963526, + "grad_norm": 1.7396641969680786, + "learning_rate": 4.200178611214736e-07, + "loss": 0.3978565037250519, + "mean_token_accuracy": 0.8532278537750244, + "num_tokens": 29046734.0, + "step": 3247 + }, + { + "epoch": 2.4680851063829787, + "grad_norm": 2.9263904094696045, + "learning_rate": 4.18856649359855e-07, + "loss": 0.24883142113685608, + "mean_token_accuracy": 0.9077831506729126, + "num_tokens": 29051879.0, + "step": 3248 + }, + { + "epoch": 2.4688449848024314, + "grad_norm": 2.637763500213623, + "learning_rate": 4.1769689822475147e-07, + "loss": 0.3370334506034851, + "mean_token_accuracy": 0.8828175067901611, + "num_tokens": 29057684.0, + "step": 3249 + }, + { + "epoch": 2.4696048632218845, + "grad_norm": 1.768539309501648, + "learning_rate": 4.165386085301212e-07, + "loss": 0.32484760880470276, + "mean_token_accuracy": 0.8829447031021118, + "num_tokens": 29066105.0, + "step": 3250 + }, + { + "epoch": 2.4703647416413372, + "grad_norm": 1.4335054159164429, + "learning_rate": 4.1538178108889717e-07, + "loss": 0.442533940076828, + "mean_token_accuracy": 0.846094012260437, + "num_tokens": 29082385.0, + "step": 3251 + }, + { + "epoch": 2.4711246200607904, + "grad_norm": 2.007174491882324, + "learning_rate": 4.1422641671298336e-07, + "loss": 0.2856018841266632, + "mean_token_accuracy": 0.9205893278121948, + "num_tokens": 29089022.0, + "step": 3252 + }, + { + "epoch": 2.471884498480243, + "grad_norm": 2.225895404815674, + "learning_rate": 4.1307251621326124e-07, + "loss": 0.17259414494037628, + "mean_token_accuracy": 0.9244140386581421, + "num_tokens": 29094176.0, + "step": 3253 + }, + { + "epoch": 2.472644376899696, + "grad_norm": 2.6121842861175537, + "learning_rate": 4.1192008039958236e-07, + "loss": 0.37352171540260315, + "mean_token_accuracy": 0.8913992643356323, + "num_tokens": 29099565.0, + "step": 3254 + }, + { + "epoch": 2.473404255319149, + "grad_norm": 1.5645455121994019, + "learning_rate": 4.1076911008076895e-07, + "loss": 0.37237828969955444, + "mean_token_accuracy": 0.879361629486084, + "num_tokens": 29112039.0, + "step": 3255 + }, + { + "epoch": 2.474164133738602, + "grad_norm": 3.144536018371582, + "learning_rate": 4.096196060646168e-07, + "loss": 0.2038595974445343, + "mean_token_accuracy": 0.9299201369285583, + "num_tokens": 29115720.0, + "step": 3256 + }, + { + "epoch": 2.4749240121580547, + "grad_norm": 2.679821014404297, + "learning_rate": 4.0847156915789385e-07, + "loss": 0.41715145111083984, + "mean_token_accuracy": 0.862784743309021, + "num_tokens": 29120944.0, + "step": 3257 + }, + { + "epoch": 2.475683890577508, + "grad_norm": 2.243694305419922, + "learning_rate": 4.073250001663345e-07, + "loss": 0.43414735794067383, + "mean_token_accuracy": 0.8504310250282288, + "num_tokens": 29128842.0, + "step": 3258 + }, + { + "epoch": 2.4764437689969605, + "grad_norm": 2.636111259460449, + "learning_rate": 4.061798998946459e-07, + "loss": 0.2401021122932434, + "mean_token_accuracy": 0.910351037979126, + "num_tokens": 29133769.0, + "step": 3259 + }, + { + "epoch": 2.477203647416413, + "grad_norm": 2.6116414070129395, + "learning_rate": 4.050362691465032e-07, + "loss": 0.3290833830833435, + "mean_token_accuracy": 0.8770763278007507, + "num_tokens": 29138639.0, + "step": 3260 + }, + { + "epoch": 2.4779635258358663, + "grad_norm": 2.279324531555176, + "learning_rate": 4.038941087245507e-07, + "loss": 0.38752615451812744, + "mean_token_accuracy": 0.8624980449676514, + "num_tokens": 29145966.0, + "step": 3261 + }, + { + "epoch": 2.478723404255319, + "grad_norm": 2.2280423641204834, + "learning_rate": 4.0275341943040057e-07, + "loss": 0.3724668025970459, + "mean_token_accuracy": 0.8737661838531494, + "num_tokens": 29152705.0, + "step": 3262 + }, + { + "epoch": 2.479483282674772, + "grad_norm": 2.030075788497925, + "learning_rate": 4.0161420206463243e-07, + "loss": 0.32603174448013306, + "mean_token_accuracy": 0.8819995522499084, + "num_tokens": 29159853.0, + "step": 3263 + }, + { + "epoch": 2.480243161094225, + "grad_norm": 2.048346519470215, + "learning_rate": 4.0047645742679275e-07, + "loss": 0.3046466112136841, + "mean_token_accuracy": 0.8898575305938721, + "num_tokens": 29167744.0, + "step": 3264 + }, + { + "epoch": 2.481003039513678, + "grad_norm": 2.8435800075531006, + "learning_rate": 3.9934018631539506e-07, + "loss": 0.3660475015640259, + "mean_token_accuracy": 0.8754674196243286, + "num_tokens": 29173234.0, + "step": 3265 + }, + { + "epoch": 2.4817629179331306, + "grad_norm": 1.7785491943359375, + "learning_rate": 3.982053895279173e-07, + "loss": 0.39483463764190674, + "mean_token_accuracy": 0.8613039255142212, + "num_tokens": 29182555.0, + "step": 3266 + }, + { + "epoch": 2.4825227963525838, + "grad_norm": 2.384479522705078, + "learning_rate": 3.970720678608034e-07, + "loss": 0.3536769151687622, + "mean_token_accuracy": 0.8700416088104248, + "num_tokens": 29189742.0, + "step": 3267 + }, + { + "epoch": 2.4832826747720365, + "grad_norm": 2.368417978286743, + "learning_rate": 3.9594022210946355e-07, + "loss": 0.2937469780445099, + "mean_token_accuracy": 0.8970743417739868, + "num_tokens": 29194960.0, + "step": 3268 + }, + { + "epoch": 2.4840425531914896, + "grad_norm": 2.3920481204986572, + "learning_rate": 3.948098530682695e-07, + "loss": 0.29564806818962097, + "mean_token_accuracy": 0.913650393486023, + "num_tokens": 29200704.0, + "step": 3269 + }, + { + "epoch": 2.4848024316109423, + "grad_norm": 1.1830788850784302, + "learning_rate": 3.9368096153055783e-07, + "loss": 0.39095211029052734, + "mean_token_accuracy": 0.8536444902420044, + "num_tokens": 29224862.0, + "step": 3270 + }, + { + "epoch": 2.485562310030395, + "grad_norm": 1.4365004301071167, + "learning_rate": 3.925535482886286e-07, + "loss": 0.27921199798583984, + "mean_token_accuracy": 0.8939366936683655, + "num_tokens": 29237375.0, + "step": 3271 + }, + { + "epoch": 2.486322188449848, + "grad_norm": 2.1955132484436035, + "learning_rate": 3.9142761413374336e-07, + "loss": 0.41748374700546265, + "mean_token_accuracy": 0.8621724843978882, + "num_tokens": 29244655.0, + "step": 3272 + }, + { + "epoch": 2.487082066869301, + "grad_norm": 2.4120712280273438, + "learning_rate": 3.90303159856126e-07, + "loss": 0.2881275415420532, + "mean_token_accuracy": 0.8962163329124451, + "num_tokens": 29250350.0, + "step": 3273 + }, + { + "epoch": 2.487841945288754, + "grad_norm": 1.315206527709961, + "learning_rate": 3.891801862449629e-07, + "loss": 0.33958539366722107, + "mean_token_accuracy": 0.8800086379051208, + "num_tokens": 29264563.0, + "step": 3274 + }, + { + "epoch": 2.4886018237082066, + "grad_norm": 1.9663656949996948, + "learning_rate": 3.880586940883979e-07, + "loss": 0.35844963788986206, + "mean_token_accuracy": 0.8683270215988159, + "num_tokens": 29273782.0, + "step": 3275 + }, + { + "epoch": 2.4893617021276597, + "grad_norm": 1.4438722133636475, + "learning_rate": 3.869386841735395e-07, + "loss": 0.39307960867881775, + "mean_token_accuracy": 0.8902837038040161, + "num_tokens": 29288914.0, + "step": 3276 + }, + { + "epoch": 2.4901215805471124, + "grad_norm": 2.779317617416382, + "learning_rate": 3.8582015728645366e-07, + "loss": 0.237838476896286, + "mean_token_accuracy": 0.9132705926895142, + "num_tokens": 29293218.0, + "step": 3277 + }, + { + "epoch": 2.4908814589665655, + "grad_norm": 1.6183768510818481, + "learning_rate": 3.8470311421216435e-07, + "loss": 0.24135810136795044, + "mean_token_accuracy": 0.9351533055305481, + "num_tokens": 29301928.0, + "step": 3278 + }, + { + "epoch": 2.4916413373860182, + "grad_norm": 1.6468756198883057, + "learning_rate": 3.835875557346552e-07, + "loss": 0.34042105078697205, + "mean_token_accuracy": 0.8920395374298096, + "num_tokens": 29313740.0, + "step": 3279 + }, + { + "epoch": 2.4924012158054714, + "grad_norm": 1.6257606744766235, + "learning_rate": 3.8247348263687035e-07, + "loss": 0.3479476869106293, + "mean_token_accuracy": 0.8826069831848145, + "num_tokens": 29323650.0, + "step": 3280 + }, + { + "epoch": 2.493161094224924, + "grad_norm": 1.7144103050231934, + "learning_rate": 3.81360895700707e-07, + "loss": 0.3905973434448242, + "mean_token_accuracy": 0.8974796533584595, + "num_tokens": 29333192.0, + "step": 3281 + }, + { + "epoch": 2.4939209726443767, + "grad_norm": 1.3381150960922241, + "learning_rate": 3.802497957070225e-07, + "loss": 0.31121304631233215, + "mean_token_accuracy": 0.8921661376953125, + "num_tokens": 29348219.0, + "step": 3282 + }, + { + "epoch": 2.49468085106383, + "grad_norm": 1.8036452531814575, + "learning_rate": 3.7914018343562896e-07, + "loss": 0.4264541268348694, + "mean_token_accuracy": 0.8469835519790649, + "num_tokens": 29359632.0, + "step": 3283 + }, + { + "epoch": 2.4954407294832825, + "grad_norm": 1.7335898876190186, + "learning_rate": 3.780320596652956e-07, + "loss": 0.2710324823856354, + "mean_token_accuracy": 0.9050130844116211, + "num_tokens": 29368771.0, + "step": 3284 + }, + { + "epoch": 2.4962006079027357, + "grad_norm": 1.5694719552993774, + "learning_rate": 3.7692542517374615e-07, + "loss": 0.3114343285560608, + "mean_token_accuracy": 0.8869681358337402, + "num_tokens": 29379694.0, + "step": 3285 + }, + { + "epoch": 2.4969604863221884, + "grad_norm": 2.042365074157715, + "learning_rate": 3.75820280737659e-07, + "loss": 0.23643970489501953, + "mean_token_accuracy": 0.9191685318946838, + "num_tokens": 29385914.0, + "step": 3286 + }, + { + "epoch": 2.4977203647416415, + "grad_norm": 2.2526986598968506, + "learning_rate": 3.7471662713266744e-07, + "loss": 0.3166671097278595, + "mean_token_accuracy": 0.901310384273529, + "num_tokens": 29392128.0, + "step": 3287 + }, + { + "epoch": 2.498480243161094, + "grad_norm": 1.474029541015625, + "learning_rate": 3.7361446513335816e-07, + "loss": 0.4021439552307129, + "mean_token_accuracy": 0.9001395106315613, + "num_tokens": 29404742.0, + "step": 3288 + }, + { + "epoch": 2.499240121580547, + "grad_norm": 1.3057628870010376, + "learning_rate": 3.725137955132707e-07, + "loss": 0.30949655175209045, + "mean_token_accuracy": 0.8990561962127686, + "num_tokens": 29421839.0, + "step": 3289 + }, + { + "epoch": 2.5, + "grad_norm": 1.61989164352417, + "learning_rate": 3.7141461904489665e-07, + "loss": 0.3134443163871765, + "mean_token_accuracy": 0.8906387090682983, + "num_tokens": 29432127.0, + "step": 3290 + }, + { + "epoch": 2.500759878419453, + "grad_norm": 1.5306038856506348, + "learning_rate": 3.70316936499682e-07, + "loss": 0.4017624855041504, + "mean_token_accuracy": 0.845695436000824, + "num_tokens": 29444397.0, + "step": 3291 + }, + { + "epoch": 2.501519756838906, + "grad_norm": 1.2971603870391846, + "learning_rate": 3.6922074864802095e-07, + "loss": 0.4591655135154724, + "mean_token_accuracy": 0.8666995763778687, + "num_tokens": 29461121.0, + "step": 3292 + }, + { + "epoch": 2.5022796352583585, + "grad_norm": 1.9822273254394531, + "learning_rate": 3.681260562592609e-07, + "loss": 0.3666776716709137, + "mean_token_accuracy": 0.8733338117599487, + "num_tokens": 29469211.0, + "step": 3293 + }, + { + "epoch": 2.5030395136778116, + "grad_norm": 2.331378936767578, + "learning_rate": 3.670328601016995e-07, + "loss": 0.3511161506175995, + "mean_token_accuracy": 0.8734879493713379, + "num_tokens": 29475473.0, + "step": 3294 + }, + { + "epoch": 2.5037993920972643, + "grad_norm": 1.2138792276382446, + "learning_rate": 3.659411609425834e-07, + "loss": 0.2819535732269287, + "mean_token_accuracy": 0.9210860729217529, + "num_tokens": 29492447.0, + "step": 3295 + }, + { + "epoch": 2.5045592705167175, + "grad_norm": 1.4580892324447632, + "learning_rate": 3.648509595481095e-07, + "loss": 0.37376853823661804, + "mean_token_accuracy": 0.868643045425415, + "num_tokens": 29506128.0, + "step": 3296 + }, + { + "epoch": 2.50531914893617, + "grad_norm": 2.3763513565063477, + "learning_rate": 3.6376225668342287e-07, + "loss": 0.3229329586029053, + "mean_token_accuracy": 0.8802589178085327, + "num_tokens": 29512500.0, + "step": 3297 + }, + { + "epoch": 2.5060790273556233, + "grad_norm": 1.7995069026947021, + "learning_rate": 3.626750531126169e-07, + "loss": 0.2303360551595688, + "mean_token_accuracy": 0.9212342500686646, + "num_tokens": 29518867.0, + "step": 3298 + }, + { + "epoch": 2.506838905775076, + "grad_norm": 2.4798812866210938, + "learning_rate": 3.615893495987335e-07, + "loss": 0.15825161337852478, + "mean_token_accuracy": 0.9465295076370239, + "num_tokens": 29523418.0, + "step": 3299 + }, + { + "epoch": 2.5075987841945286, + "grad_norm": 2.6747193336486816, + "learning_rate": 3.6050514690376124e-07, + "loss": 0.3672150671482086, + "mean_token_accuracy": 0.8869320154190063, + "num_tokens": 29534685.0, + "step": 3300 + }, + { + "epoch": 2.5083586626139818, + "grad_norm": 1.47441828250885, + "learning_rate": 3.594224457886336e-07, + "loss": 0.3551298975944519, + "mean_token_accuracy": 0.8751654624938965, + "num_tokens": 29546692.0, + "step": 3301 + }, + { + "epoch": 2.509118541033435, + "grad_norm": 2.2134389877319336, + "learning_rate": 3.5834124701323414e-07, + "loss": 0.39865267276763916, + "mean_token_accuracy": 0.8581235408782959, + "num_tokens": 29553889.0, + "step": 3302 + }, + { + "epoch": 2.5098784194528876, + "grad_norm": 1.9763301610946655, + "learning_rate": 3.5726155133638915e-07, + "loss": 0.29025325179100037, + "mean_token_accuracy": 0.8915338516235352, + "num_tokens": 29562429.0, + "step": 3303 + }, + { + "epoch": 2.5106382978723403, + "grad_norm": 2.347961187362671, + "learning_rate": 3.561833595158698e-07, + "loss": 0.33726242184638977, + "mean_token_accuracy": 0.8788525462150574, + "num_tokens": 29568696.0, + "step": 3304 + }, + { + "epoch": 2.5113981762917934, + "grad_norm": 1.7410497665405273, + "learning_rate": 3.5510667230839237e-07, + "loss": 0.3604505956172943, + "mean_token_accuracy": 0.8745309114456177, + "num_tokens": 29579020.0, + "step": 3305 + }, + { + "epoch": 2.512158054711246, + "grad_norm": 2.8427274227142334, + "learning_rate": 3.540314904696196e-07, + "loss": 0.16700688004493713, + "mean_token_accuracy": 0.9461087584495544, + "num_tokens": 29583216.0, + "step": 3306 + }, + { + "epoch": 2.512917933130699, + "grad_norm": 3.4459211826324463, + "learning_rate": 3.529578147541532e-07, + "loss": 0.20073774456977844, + "mean_token_accuracy": 0.9330953359603882, + "num_tokens": 29586393.0, + "step": 3307 + }, + { + "epoch": 2.513677811550152, + "grad_norm": 1.2530099153518677, + "learning_rate": 3.518856459155409e-07, + "loss": 0.3268885016441345, + "mean_token_accuracy": 0.8808276653289795, + "num_tokens": 29602387.0, + "step": 3308 + }, + { + "epoch": 2.514437689969605, + "grad_norm": 2.64876389503479, + "learning_rate": 3.508149847062725e-07, + "loss": 0.328682541847229, + "mean_token_accuracy": 0.8907853364944458, + "num_tokens": 29608298.0, + "step": 3309 + }, + { + "epoch": 2.5151975683890577, + "grad_norm": 2.3505539894104004, + "learning_rate": 3.4974583187777853e-07, + "loss": 0.3768400549888611, + "mean_token_accuracy": 0.8646256327629089, + "num_tokens": 29615035.0, + "step": 3310 + }, + { + "epoch": 2.5159574468085104, + "grad_norm": 3.298685073852539, + "learning_rate": 3.4867818818043217e-07, + "loss": 0.4103941023349762, + "mean_token_accuracy": 0.8660793304443359, + "num_tokens": 29619522.0, + "step": 3311 + }, + { + "epoch": 2.5167173252279635, + "grad_norm": 1.8788949251174927, + "learning_rate": 3.476120543635469e-07, + "loss": 0.39368999004364014, + "mean_token_accuracy": 0.861727237701416, + "num_tokens": 29628297.0, + "step": 3312 + }, + { + "epoch": 2.5174772036474167, + "grad_norm": 1.3355047702789307, + "learning_rate": 3.4654743117537525e-07, + "loss": 0.30587559938430786, + "mean_token_accuracy": 0.8944345116615295, + "num_tokens": 29643010.0, + "step": 3313 + }, + { + "epoch": 2.5182370820668694, + "grad_norm": 1.6371463537216187, + "learning_rate": 3.4548431936311275e-07, + "loss": 0.35551705956459045, + "mean_token_accuracy": 0.8975727558135986, + "num_tokens": 29654169.0, + "step": 3314 + }, + { + "epoch": 2.518996960486322, + "grad_norm": 1.8126708269119263, + "learning_rate": 3.4442271967289083e-07, + "loss": 0.40501973032951355, + "mean_token_accuracy": 0.872620701789856, + "num_tokens": 29665965.0, + "step": 3315 + }, + { + "epoch": 2.519756838905775, + "grad_norm": 2.9103341102600098, + "learning_rate": 3.433626328497805e-07, + "loss": 0.21716530621051788, + "mean_token_accuracy": 0.9180731773376465, + "num_tokens": 29670529.0, + "step": 3316 + }, + { + "epoch": 2.520516717325228, + "grad_norm": 1.3893235921859741, + "learning_rate": 3.4230405963779357e-07, + "loss": 0.2638336420059204, + "mean_token_accuracy": 0.9039981365203857, + "num_tokens": 29681585.0, + "step": 3317 + }, + { + "epoch": 2.521276595744681, + "grad_norm": 2.408050298690796, + "learning_rate": 3.412470007798757e-07, + "loss": 0.4774054288864136, + "mean_token_accuracy": 0.835527777671814, + "num_tokens": 29688642.0, + "step": 3318 + }, + { + "epoch": 2.5220364741641337, + "grad_norm": 2.923038959503174, + "learning_rate": 3.4019145701791186e-07, + "loss": 0.24404606223106384, + "mean_token_accuracy": 0.9276547431945801, + "num_tokens": 29692516.0, + "step": 3319 + }, + { + "epoch": 2.522796352583587, + "grad_norm": 3.470700740814209, + "learning_rate": 3.3913742909272353e-07, + "loss": 0.26732707023620605, + "mean_token_accuracy": 0.910873293876648, + "num_tokens": 29695779.0, + "step": 3320 + }, + { + "epoch": 2.5235562310030395, + "grad_norm": 2.2419376373291016, + "learning_rate": 3.3808491774406817e-07, + "loss": 0.16050264239311218, + "mean_token_accuracy": 0.934256911277771, + "num_tokens": 29701486.0, + "step": 3321 + }, + { + "epoch": 2.524316109422492, + "grad_norm": 2.3232672214508057, + "learning_rate": 3.370339237106385e-07, + "loss": 0.23050843179225922, + "mean_token_accuracy": 0.9202409982681274, + "num_tokens": 29706780.0, + "step": 3322 + }, + { + "epoch": 2.5250759878419453, + "grad_norm": 3.012422800064087, + "learning_rate": 3.359844477300633e-07, + "loss": 0.22087830305099487, + "mean_token_accuracy": 0.9293035268783569, + "num_tokens": 29711164.0, + "step": 3323 + }, + { + "epoch": 2.5258358662613984, + "grad_norm": 3.0274150371551514, + "learning_rate": 3.3493649053890325e-07, + "loss": 0.1908535212278366, + "mean_token_accuracy": 0.9202175140380859, + "num_tokens": 29714988.0, + "step": 3324 + }, + { + "epoch": 2.526595744680851, + "grad_norm": 1.9113285541534424, + "learning_rate": 3.3389005287265713e-07, + "loss": 0.3098488748073578, + "mean_token_accuracy": 0.8901765942573547, + "num_tokens": 29722665.0, + "step": 3325 + }, + { + "epoch": 2.527355623100304, + "grad_norm": 2.3841238021850586, + "learning_rate": 3.32845135465755e-07, + "loss": 0.25352805852890015, + "mean_token_accuracy": 0.9079523682594299, + "num_tokens": 29727646.0, + "step": 3326 + }, + { + "epoch": 2.528115501519757, + "grad_norm": 2.134140968322754, + "learning_rate": 3.3180173905155906e-07, + "loss": 0.24720364809036255, + "mean_token_accuracy": 0.9039219617843628, + "num_tokens": 29734233.0, + "step": 3327 + }, + { + "epoch": 2.5288753799392096, + "grad_norm": 1.9245797395706177, + "learning_rate": 3.3075986436236494e-07, + "loss": 0.2697824537754059, + "mean_token_accuracy": 0.9077266454696655, + "num_tokens": 29742107.0, + "step": 3328 + }, + { + "epoch": 2.5296352583586628, + "grad_norm": 2.5044164657592773, + "learning_rate": 3.297195121294022e-07, + "loss": 0.3145396411418915, + "mean_token_accuracy": 0.8834670782089233, + "num_tokens": 29747755.0, + "step": 3329 + }, + { + "epoch": 2.5303951367781155, + "grad_norm": 3.475567102432251, + "learning_rate": 3.286806830828285e-07, + "loss": 0.14926226437091827, + "mean_token_accuracy": 0.9487104415893555, + "num_tokens": 29750730.0, + "step": 3330 + }, + { + "epoch": 2.5311550151975686, + "grad_norm": 2.0287671089172363, + "learning_rate": 3.2764337795173433e-07, + "loss": 0.3795855641365051, + "mean_token_accuracy": 0.8685719966888428, + "num_tokens": 29758328.0, + "step": 3331 + }, + { + "epoch": 2.5319148936170213, + "grad_norm": 1.4884649515151978, + "learning_rate": 3.2660759746414055e-07, + "loss": 0.3048096299171448, + "mean_token_accuracy": 0.8908923268318176, + "num_tokens": 29770486.0, + "step": 3332 + }, + { + "epoch": 2.532674772036474, + "grad_norm": 2.0645828247070312, + "learning_rate": 3.255733423469978e-07, + "loss": 0.3477875590324402, + "mean_token_accuracy": 0.8803027868270874, + "num_tokens": 29778363.0, + "step": 3333 + }, + { + "epoch": 2.533434650455927, + "grad_norm": 2.032289981842041, + "learning_rate": 3.245406133261858e-07, + "loss": 0.39452236890792847, + "mean_token_accuracy": 0.8499241471290588, + "num_tokens": 29786353.0, + "step": 3334 + }, + { + "epoch": 2.53419452887538, + "grad_norm": 2.146658420562744, + "learning_rate": 3.235094111265141e-07, + "loss": 0.250872939825058, + "mean_token_accuracy": 0.9086864590644836, + "num_tokens": 29793122.0, + "step": 3335 + }, + { + "epoch": 2.534954407294833, + "grad_norm": 1.407880187034607, + "learning_rate": 3.224797364717197e-07, + "loss": 0.30364125967025757, + "mean_token_accuracy": 0.875752329826355, + "num_tokens": 29806866.0, + "step": 3336 + }, + { + "epoch": 2.5357142857142856, + "grad_norm": 2.6231658458709717, + "learning_rate": 3.214515900844681e-07, + "loss": 0.31516194343566895, + "mean_token_accuracy": 0.8799179792404175, + "num_tokens": 29813035.0, + "step": 3337 + }, + { + "epoch": 2.5364741641337387, + "grad_norm": 2.3876113891601562, + "learning_rate": 3.204249726863523e-07, + "loss": 0.3034508526325226, + "mean_token_accuracy": 0.8916938304901123, + "num_tokens": 29818810.0, + "step": 3338 + }, + { + "epoch": 2.5372340425531914, + "grad_norm": 2.16711163520813, + "learning_rate": 3.1939988499789075e-07, + "loss": 0.25329700112342834, + "mean_token_accuracy": 0.9260494112968445, + "num_tokens": 29825472.0, + "step": 3339 + }, + { + "epoch": 2.5379939209726445, + "grad_norm": 2.5136961936950684, + "learning_rate": 3.18376327738531e-07, + "loss": 0.3313722312450409, + "mean_token_accuracy": 0.8868670463562012, + "num_tokens": 29831426.0, + "step": 3340 + }, + { + "epoch": 2.538753799392097, + "grad_norm": 1.7886340618133545, + "learning_rate": 3.1735430162664366e-07, + "loss": 0.3526390492916107, + "mean_token_accuracy": 0.8689097762107849, + "num_tokens": 29840212.0, + "step": 3341 + }, + { + "epoch": 2.5395136778115504, + "grad_norm": 2.2471916675567627, + "learning_rate": 3.1633380737952663e-07, + "loss": 0.21594303846359253, + "mean_token_accuracy": 0.9280022382736206, + "num_tokens": 29845696.0, + "step": 3342 + }, + { + "epoch": 2.540273556231003, + "grad_norm": 1.1835771799087524, + "learning_rate": 3.15314845713402e-07, + "loss": 0.2646978497505188, + "mean_token_accuracy": 0.8992418050765991, + "num_tokens": 29861802.0, + "step": 3343 + }, + { + "epoch": 2.5410334346504557, + "grad_norm": 2.2009525299072266, + "learning_rate": 3.14297417343416e-07, + "loss": 0.4950712323188782, + "mean_token_accuracy": 0.8226115703582764, + "num_tokens": 29869931.0, + "step": 3344 + }, + { + "epoch": 2.541793313069909, + "grad_norm": 1.2517180442810059, + "learning_rate": 3.1328152298363943e-07, + "loss": 0.26179224252700806, + "mean_token_accuracy": 0.9045941829681396, + "num_tokens": 29883562.0, + "step": 3345 + }, + { + "epoch": 2.5425531914893615, + "grad_norm": 2.1705822944641113, + "learning_rate": 3.122671633470664e-07, + "loss": 0.38098567724227905, + "mean_token_accuracy": 0.8638834357261658, + "num_tokens": 29891094.0, + "step": 3346 + }, + { + "epoch": 2.5433130699088147, + "grad_norm": 1.5869110822677612, + "learning_rate": 3.1125433914561185e-07, + "loss": 0.36774593591690063, + "mean_token_accuracy": 0.8730655908584595, + "num_tokens": 29901795.0, + "step": 3347 + }, + { + "epoch": 2.5440729483282674, + "grad_norm": 1.267867922782898, + "learning_rate": 3.1024305109011664e-07, + "loss": 0.30716824531555176, + "mean_token_accuracy": 0.8794038891792297, + "num_tokens": 29918112.0, + "step": 3348 + }, + { + "epoch": 2.5448328267477205, + "grad_norm": 1.7851269245147705, + "learning_rate": 3.092332998903416e-07, + "loss": 0.3374805748462677, + "mean_token_accuracy": 0.8766556978225708, + "num_tokens": 29927770.0, + "step": 3349 + }, + { + "epoch": 2.545592705167173, + "grad_norm": 1.7153595685958862, + "learning_rate": 3.082250862549671e-07, + "loss": 0.4149400293827057, + "mean_token_accuracy": 0.853299617767334, + "num_tokens": 29939361.0, + "step": 3350 + }, + { + "epoch": 2.5463525835866263, + "grad_norm": 2.676774740219116, + "learning_rate": 3.0721841089159823e-07, + "loss": 0.2004309445619583, + "mean_token_accuracy": 0.9245458841323853, + "num_tokens": 29943717.0, + "step": 3351 + }, + { + "epoch": 2.547112462006079, + "grad_norm": 3.0472381114959717, + "learning_rate": 3.0621327450675806e-07, + "loss": 0.31185799837112427, + "mean_token_accuracy": 0.8936638832092285, + "num_tokens": 29948613.0, + "step": 3352 + }, + { + "epoch": 2.547872340425532, + "grad_norm": 3.141087055206299, + "learning_rate": 3.0520967780588966e-07, + "loss": 0.34619835019111633, + "mean_token_accuracy": 0.8754764199256897, + "num_tokens": 29952477.0, + "step": 3353 + }, + { + "epoch": 2.548632218844985, + "grad_norm": 1.277807593345642, + "learning_rate": 3.0420762149335566e-07, + "loss": 0.41385579109191895, + "mean_token_accuracy": 0.8646053075790405, + "num_tokens": 29972620.0, + "step": 3354 + }, + { + "epoch": 2.5493920972644375, + "grad_norm": 1.8656301498413086, + "learning_rate": 3.0320710627243815e-07, + "loss": 0.33177047967910767, + "mean_token_accuracy": 0.884863018989563, + "num_tokens": 29980861.0, + "step": 3355 + }, + { + "epoch": 2.5501519756838906, + "grad_norm": 1.5590285062789917, + "learning_rate": 3.022081328453372e-07, + "loss": 0.35837340354919434, + "mean_token_accuracy": 0.8669678568840027, + "num_tokens": 29992920.0, + "step": 3356 + }, + { + "epoch": 2.5509118541033433, + "grad_norm": 1.3580808639526367, + "learning_rate": 3.0121070191317075e-07, + "loss": 0.30251336097717285, + "mean_token_accuracy": 0.891779363155365, + "num_tokens": 30006416.0, + "step": 3357 + }, + { + "epoch": 2.5516717325227964, + "grad_norm": 1.2978777885437012, + "learning_rate": 3.002148141759739e-07, + "loss": 0.3747216463088989, + "mean_token_accuracy": 0.8675031661987305, + "num_tokens": 30026730.0, + "step": 3358 + }, + { + "epoch": 2.552431610942249, + "grad_norm": 2.1855390071868896, + "learning_rate": 2.992204703326995e-07, + "loss": 0.25247129797935486, + "mean_token_accuracy": 0.9170730113983154, + "num_tokens": 30032920.0, + "step": 3359 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 1.46858811378479, + "learning_rate": 2.9822767108121623e-07, + "loss": 0.45840656757354736, + "mean_token_accuracy": 0.8472789525985718, + "num_tokens": 30046347.0, + "step": 3360 + }, + { + "epoch": 2.553951367781155, + "grad_norm": 1.7625445127487183, + "learning_rate": 2.9723641711830896e-07, + "loss": 0.34696075320243835, + "mean_token_accuracy": 0.8730940222740173, + "num_tokens": 30057264.0, + "step": 3361 + }, + { + "epoch": 2.5547112462006076, + "grad_norm": 2.3647844791412354, + "learning_rate": 2.96246709139677e-07, + "loss": 0.3888760209083557, + "mean_token_accuracy": 0.8829300403594971, + "num_tokens": 30064199.0, + "step": 3362 + }, + { + "epoch": 2.5554711246200608, + "grad_norm": 1.3508832454681396, + "learning_rate": 2.9525854783993696e-07, + "loss": 0.2998582720756531, + "mean_token_accuracy": 0.8910796642303467, + "num_tokens": 30078083.0, + "step": 3363 + }, + { + "epoch": 2.556231003039514, + "grad_norm": 1.8688349723815918, + "learning_rate": 2.942719339126171e-07, + "loss": 0.23044756054878235, + "mean_token_accuracy": 0.9150751233100891, + "num_tokens": 30086010.0, + "step": 3364 + }, + { + "epoch": 2.5569908814589666, + "grad_norm": 2.7221083641052246, + "learning_rate": 2.932868680501613e-07, + "loss": 0.30724483728408813, + "mean_token_accuracy": 0.9012277126312256, + "num_tokens": 30091524.0, + "step": 3365 + }, + { + "epoch": 2.5577507598784193, + "grad_norm": 2.5149598121643066, + "learning_rate": 2.92303350943928e-07, + "loss": 0.37096866965293884, + "mean_token_accuracy": 0.8573155403137207, + "num_tokens": 30097860.0, + "step": 3366 + }, + { + "epoch": 2.5585106382978724, + "grad_norm": 2.9985098838806152, + "learning_rate": 2.913213832841857e-07, + "loss": 0.3397367596626282, + "mean_token_accuracy": 0.8724661469459534, + "num_tokens": 30107543.0, + "step": 3367 + }, + { + "epoch": 2.559270516717325, + "grad_norm": 2.119527816772461, + "learning_rate": 2.9034096576011805e-07, + "loss": 0.34516414999961853, + "mean_token_accuracy": 0.8728296756744385, + "num_tokens": 30114737.0, + "step": 3368 + }, + { + "epoch": 2.560030395136778, + "grad_norm": 2.6809260845184326, + "learning_rate": 2.893620990598192e-07, + "loss": 0.4649572968482971, + "mean_token_accuracy": 0.8441047668457031, + "num_tokens": 30120640.0, + "step": 3369 + }, + { + "epoch": 2.560790273556231, + "grad_norm": 1.634458065032959, + "learning_rate": 2.8838478387029605e-07, + "loss": 0.3435993194580078, + "mean_token_accuracy": 0.8726693987846375, + "num_tokens": 30133091.0, + "step": 3370 + }, + { + "epoch": 2.561550151975684, + "grad_norm": 1.7352157831192017, + "learning_rate": 2.8740902087746604e-07, + "loss": 0.3171056807041168, + "mean_token_accuracy": 0.8962107300758362, + "num_tokens": 30141735.0, + "step": 3371 + }, + { + "epoch": 2.5623100303951367, + "grad_norm": 2.8209640979766846, + "learning_rate": 2.8643481076615717e-07, + "loss": 0.24519780278205872, + "mean_token_accuracy": 0.9098281860351562, + "num_tokens": 30146073.0, + "step": 3372 + }, + { + "epoch": 2.5630699088145894, + "grad_norm": 2.1111650466918945, + "learning_rate": 2.854621542201064e-07, + "loss": 0.34583622217178345, + "mean_token_accuracy": 0.8917075395584106, + "num_tokens": 30153104.0, + "step": 3373 + }, + { + "epoch": 2.5638297872340425, + "grad_norm": 1.5275969505310059, + "learning_rate": 2.844910519219632e-07, + "loss": 0.33743610978126526, + "mean_token_accuracy": 0.8789186477661133, + "num_tokens": 30166414.0, + "step": 3374 + }, + { + "epoch": 2.5645896656534957, + "grad_norm": 3.6885430812835693, + "learning_rate": 2.835215045532841e-07, + "loss": 0.3318662643432617, + "mean_token_accuracy": 0.880516767501831, + "num_tokens": 30170397.0, + "step": 3375 + }, + { + "epoch": 2.5653495440729484, + "grad_norm": 3.58422589302063, + "learning_rate": 2.8255351279453446e-07, + "loss": 0.24304428696632385, + "mean_token_accuracy": 0.911949634552002, + "num_tokens": 30173809.0, + "step": 3376 + }, + { + "epoch": 2.566109422492401, + "grad_norm": 2.180278778076172, + "learning_rate": 2.815870773250873e-07, + "loss": 0.2282833755016327, + "mean_token_accuracy": 0.9192917346954346, + "num_tokens": 30179431.0, + "step": 3377 + }, + { + "epoch": 2.566869300911854, + "grad_norm": 1.925766110420227, + "learning_rate": 2.8062219882322636e-07, + "loss": 0.38162487745285034, + "mean_token_accuracy": 0.8635650873184204, + "num_tokens": 30194252.0, + "step": 3378 + }, + { + "epoch": 2.567629179331307, + "grad_norm": 1.9528982639312744, + "learning_rate": 2.796588779661388e-07, + "loss": 0.3215118646621704, + "mean_token_accuracy": 0.8850376605987549, + "num_tokens": 30202341.0, + "step": 3379 + }, + { + "epoch": 2.56838905775076, + "grad_norm": 1.9466958045959473, + "learning_rate": 2.786971154299209e-07, + "loss": 0.3743375539779663, + "mean_token_accuracy": 0.8669804930686951, + "num_tokens": 30210657.0, + "step": 3380 + }, + { + "epoch": 2.5691489361702127, + "grad_norm": 1.0222121477127075, + "learning_rate": 2.777369118895745e-07, + "loss": 0.28801876306533813, + "mean_token_accuracy": 0.8983622789382935, + "num_tokens": 30232182.0, + "step": 3381 + }, + { + "epoch": 2.569908814589666, + "grad_norm": 1.5706082582473755, + "learning_rate": 2.767782680190073e-07, + "loss": 0.37556713819503784, + "mean_token_accuracy": 0.8659577369689941, + "num_tokens": 30244819.0, + "step": 3382 + }, + { + "epoch": 2.5706686930091185, + "grad_norm": 2.5092997550964355, + "learning_rate": 2.7582118449103273e-07, + "loss": 0.4440537691116333, + "mean_token_accuracy": 0.8627067804336548, + "num_tokens": 30251856.0, + "step": 3383 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 2.2710351943969727, + "learning_rate": 2.748656619773687e-07, + "loss": 0.12478743493556976, + "mean_token_accuracy": 0.9581196904182434, + "num_tokens": 30255765.0, + "step": 3384 + }, + { + "epoch": 2.5721884498480243, + "grad_norm": 1.5596920251846313, + "learning_rate": 2.739117011486378e-07, + "loss": 0.23946957290172577, + "mean_token_accuracy": 0.9149091243743896, + "num_tokens": 30265134.0, + "step": 3385 + }, + { + "epoch": 2.5729483282674774, + "grad_norm": 2.5665597915649414, + "learning_rate": 2.729593026743668e-07, + "loss": 0.22638919949531555, + "mean_token_accuracy": 0.9160120487213135, + "num_tokens": 30269971.0, + "step": 3386 + }, + { + "epoch": 2.57370820668693, + "grad_norm": 2.1374216079711914, + "learning_rate": 2.7200846722298503e-07, + "loss": 0.3681026101112366, + "mean_token_accuracy": 0.8709797263145447, + "num_tokens": 30277792.0, + "step": 3387 + }, + { + "epoch": 2.574468085106383, + "grad_norm": 1.5955793857574463, + "learning_rate": 2.710591954618247e-07, + "loss": 0.3560969829559326, + "mean_token_accuracy": 0.8950826525688171, + "num_tokens": 30289038.0, + "step": 3388 + }, + { + "epoch": 2.575227963525836, + "grad_norm": 1.561316967010498, + "learning_rate": 2.701114880571232e-07, + "loss": 0.29359546303749084, + "mean_token_accuracy": 0.9007925987243652, + "num_tokens": 30298228.0, + "step": 3389 + }, + { + "epoch": 2.5759878419452886, + "grad_norm": 1.7596205472946167, + "learning_rate": 2.6916534567401675e-07, + "loss": 0.29790499806404114, + "mean_token_accuracy": 0.8907828330993652, + "num_tokens": 30306908.0, + "step": 3390 + }, + { + "epoch": 2.5767477203647418, + "grad_norm": 2.1243667602539062, + "learning_rate": 2.6822076897654453e-07, + "loss": 0.26356661319732666, + "mean_token_accuracy": 0.9012589454650879, + "num_tokens": 30312971.0, + "step": 3391 + }, + { + "epoch": 2.5775075987841944, + "grad_norm": 2.35373592376709, + "learning_rate": 2.6727775862764703e-07, + "loss": 0.3303247094154358, + "mean_token_accuracy": 0.876170814037323, + "num_tokens": 30319303.0, + "step": 3392 + }, + { + "epoch": 2.5782674772036476, + "grad_norm": 2.5983684062957764, + "learning_rate": 2.663363152891654e-07, + "loss": 0.3094015121459961, + "mean_token_accuracy": 0.9034996628761292, + "num_tokens": 30324454.0, + "step": 3393 + }, + { + "epoch": 2.5790273556231003, + "grad_norm": 2.264035940170288, + "learning_rate": 2.653964396218406e-07, + "loss": 0.42449623346328735, + "mean_token_accuracy": 0.8461374044418335, + "num_tokens": 30331213.0, + "step": 3394 + }, + { + "epoch": 2.579787234042553, + "grad_norm": 1.591833233833313, + "learning_rate": 2.64458132285314e-07, + "loss": 0.3518860340118408, + "mean_token_accuracy": 0.8751099705696106, + "num_tokens": 30341328.0, + "step": 3395 + }, + { + "epoch": 2.580547112462006, + "grad_norm": 2.4209396839141846, + "learning_rate": 2.635213939381248e-07, + "loss": 0.3116898238658905, + "mean_token_accuracy": 0.9000394344329834, + "num_tokens": 30346970.0, + "step": 3396 + }, + { + "epoch": 2.581306990881459, + "grad_norm": 2.373574733734131, + "learning_rate": 2.625862252377129e-07, + "loss": 0.2558296322822571, + "mean_token_accuracy": 0.9050976037979126, + "num_tokens": 30352535.0, + "step": 3397 + }, + { + "epoch": 2.582066869300912, + "grad_norm": 2.3691492080688477, + "learning_rate": 2.61652626840416e-07, + "loss": 0.34974297881126404, + "mean_token_accuracy": 0.880367636680603, + "num_tokens": 30359008.0, + "step": 3398 + }, + { + "epoch": 2.5828267477203646, + "grad_norm": 2.6194329261779785, + "learning_rate": 2.6072059940146775e-07, + "loss": 0.302560031414032, + "mean_token_accuracy": 0.9090637564659119, + "num_tokens": 30364545.0, + "step": 3399 + }, + { + "epoch": 2.5835866261398177, + "grad_norm": 1.8017159700393677, + "learning_rate": 2.597901435750025e-07, + "loss": 0.2855827212333679, + "mean_token_accuracy": 0.8930953741073608, + "num_tokens": 30372593.0, + "step": 3400 + }, + { + "epoch": 2.5843465045592704, + "grad_norm": 1.736401915550232, + "learning_rate": 2.5886126001405e-07, + "loss": 0.38662317395210266, + "mean_token_accuracy": 0.876146674156189, + "num_tokens": 30382097.0, + "step": 3401 + }, + { + "epoch": 2.5851063829787235, + "grad_norm": 1.174890398979187, + "learning_rate": 2.579339493705355e-07, + "loss": 0.351195752620697, + "mean_token_accuracy": 0.8636453747749329, + "num_tokens": 30399208.0, + "step": 3402 + }, + { + "epoch": 2.585866261398176, + "grad_norm": 1.9311470985412598, + "learning_rate": 2.5700821229528164e-07, + "loss": 0.3222745656967163, + "mean_token_accuracy": 0.9071283936500549, + "num_tokens": 30406648.0, + "step": 3403 + }, + { + "epoch": 2.5866261398176293, + "grad_norm": 1.7329829931259155, + "learning_rate": 2.5608404943800627e-07, + "loss": 0.25072571635246277, + "mean_token_accuracy": 0.9056229591369629, + "num_tokens": 30414836.0, + "step": 3404 + }, + { + "epoch": 2.587386018237082, + "grad_norm": 2.8609302043914795, + "learning_rate": 2.5516146144732273e-07, + "loss": 0.23907656967639923, + "mean_token_accuracy": 0.9093331694602966, + "num_tokens": 30419027.0, + "step": 3405 + }, + { + "epoch": 2.5881458966565347, + "grad_norm": 2.0544052124023438, + "learning_rate": 2.5424044897073895e-07, + "loss": 0.28297221660614014, + "mean_token_accuracy": 0.8845421075820923, + "num_tokens": 30426720.0, + "step": 3406 + }, + { + "epoch": 2.588905775075988, + "grad_norm": 2.0454416275024414, + "learning_rate": 2.533210126546565e-07, + "loss": 0.40411946177482605, + "mean_token_accuracy": 0.8890959024429321, + "num_tokens": 30434413.0, + "step": 3407 + }, + { + "epoch": 2.589665653495441, + "grad_norm": 2.5405404567718506, + "learning_rate": 2.52403153144371e-07, + "loss": 0.2860855460166931, + "mean_token_accuracy": 0.9015365242958069, + "num_tokens": 30439934.0, + "step": 3408 + }, + { + "epoch": 2.5904255319148937, + "grad_norm": 1.7092106342315674, + "learning_rate": 2.514868710840723e-07, + "loss": 0.36490949988365173, + "mean_token_accuracy": 0.8814249634742737, + "num_tokens": 30450150.0, + "step": 3409 + }, + { + "epoch": 2.5911854103343464, + "grad_norm": 2.2119903564453125, + "learning_rate": 2.505721671168426e-07, + "loss": 0.3019217848777771, + "mean_token_accuracy": 0.915499210357666, + "num_tokens": 30456068.0, + "step": 3410 + }, + { + "epoch": 2.5919452887537995, + "grad_norm": 2.1960413455963135, + "learning_rate": 2.496590418846545e-07, + "loss": 0.21500837802886963, + "mean_token_accuracy": 0.9256033897399902, + "num_tokens": 30461738.0, + "step": 3411 + }, + { + "epoch": 2.592705167173252, + "grad_norm": 2.9918036460876465, + "learning_rate": 2.48747496028377e-07, + "loss": 0.3921341001987457, + "mean_token_accuracy": 0.8748230934143066, + "num_tokens": 30466747.0, + "step": 3412 + }, + { + "epoch": 2.5934650455927053, + "grad_norm": 3.091017723083496, + "learning_rate": 2.478375301877664e-07, + "loss": 0.19394469261169434, + "mean_token_accuracy": 0.9396419525146484, + "num_tokens": 30470176.0, + "step": 3413 + }, + { + "epoch": 2.594224924012158, + "grad_norm": 1.3302737474441528, + "learning_rate": 2.4692914500147185e-07, + "loss": 0.43362653255462646, + "mean_token_accuracy": 0.844821572303772, + "num_tokens": 30486501.0, + "step": 3414 + }, + { + "epoch": 2.594984802431611, + "grad_norm": 1.7620038986206055, + "learning_rate": 2.460223411070337e-07, + "loss": 0.2638559937477112, + "mean_token_accuracy": 0.8977950215339661, + "num_tokens": 30495182.0, + "step": 3415 + }, + { + "epoch": 2.595744680851064, + "grad_norm": 2.4946224689483643, + "learning_rate": 2.451171191408813e-07, + "loss": 0.1953703612089157, + "mean_token_accuracy": 0.9253969192504883, + "num_tokens": 30499923.0, + "step": 3416 + }, + { + "epoch": 2.5965045592705165, + "grad_norm": 1.3302149772644043, + "learning_rate": 2.4421347973833443e-07, + "loss": 0.32972219586372375, + "mean_token_accuracy": 0.9001818299293518, + "num_tokens": 30515685.0, + "step": 3417 + }, + { + "epoch": 2.5972644376899696, + "grad_norm": 1.4542583227157593, + "learning_rate": 2.4331142353360206e-07, + "loss": 0.2967185378074646, + "mean_token_accuracy": 0.9060331583023071, + "num_tokens": 30527102.0, + "step": 3418 + }, + { + "epoch": 2.5980243161094227, + "grad_norm": 2.951036214828491, + "learning_rate": 2.424109511597822e-07, + "loss": 0.2523000240325928, + "mean_token_accuracy": 0.9064282774925232, + "num_tokens": 30531309.0, + "step": 3419 + }, + { + "epoch": 2.5987841945288754, + "grad_norm": 2.1855340003967285, + "learning_rate": 2.4151206324886047e-07, + "loss": 0.35201045870780945, + "mean_token_accuracy": 0.8787988424301147, + "num_tokens": 30538014.0, + "step": 3420 + }, + { + "epoch": 2.599544072948328, + "grad_norm": 2.579791307449341, + "learning_rate": 2.406147604317119e-07, + "loss": 0.24575991928577423, + "mean_token_accuracy": 0.9309802055358887, + "num_tokens": 30542437.0, + "step": 3421 + }, + { + "epoch": 2.6003039513677813, + "grad_norm": 2.758512496948242, + "learning_rate": 2.397190433380964e-07, + "loss": 0.3121788501739502, + "mean_token_accuracy": 0.8949024677276611, + "num_tokens": 30547171.0, + "step": 3422 + }, + { + "epoch": 2.601063829787234, + "grad_norm": 1.7849500179290771, + "learning_rate": 2.388249125966646e-07, + "loss": 0.3810131251811981, + "mean_token_accuracy": 0.8799927830696106, + "num_tokens": 30556368.0, + "step": 3423 + }, + { + "epoch": 2.601823708206687, + "grad_norm": 2.701768636703491, + "learning_rate": 2.3793236883495164e-07, + "loss": 0.2190743237733841, + "mean_token_accuracy": 0.9288224577903748, + "num_tokens": 30561367.0, + "step": 3424 + }, + { + "epoch": 2.6025835866261398, + "grad_norm": 2.0361149311065674, + "learning_rate": 2.3704141267937797e-07, + "loss": 0.37623006105422974, + "mean_token_accuracy": 0.8677272796630859, + "num_tokens": 30569589.0, + "step": 3425 + }, + { + "epoch": 2.603343465045593, + "grad_norm": 1.094288945198059, + "learning_rate": 2.3615204475525096e-07, + "loss": 0.3885940909385681, + "mean_token_accuracy": 0.8518509864807129, + "num_tokens": 30592538.0, + "step": 3426 + }, + { + "epoch": 2.6041033434650456, + "grad_norm": 3.1634905338287354, + "learning_rate": 2.3526426568676485e-07, + "loss": 0.14411768317222595, + "mean_token_accuracy": 0.9483509063720703, + "num_tokens": 30595768.0, + "step": 3427 + }, + { + "epoch": 2.6048632218844983, + "grad_norm": 1.642171859741211, + "learning_rate": 2.3437807609699575e-07, + "loss": 0.28384336829185486, + "mean_token_accuracy": 0.8940542936325073, + "num_tokens": 30605361.0, + "step": 3428 + }, + { + "epoch": 2.6056231003039514, + "grad_norm": 1.567029356956482, + "learning_rate": 2.3349347660790582e-07, + "loss": 0.373100221157074, + "mean_token_accuracy": 0.8695693016052246, + "num_tokens": 30616182.0, + "step": 3429 + }, + { + "epoch": 2.6063829787234045, + "grad_norm": 1.5392675399780273, + "learning_rate": 2.3261046784034154e-07, + "loss": 0.4163264036178589, + "mean_token_accuracy": 0.8596208095550537, + "num_tokens": 30628601.0, + "step": 3430 + }, + { + "epoch": 2.607142857142857, + "grad_norm": 1.5044162273406982, + "learning_rate": 2.3172905041403181e-07, + "loss": 0.3813124895095825, + "mean_token_accuracy": 0.8577728271484375, + "num_tokens": 30641924.0, + "step": 3431 + }, + { + "epoch": 2.60790273556231, + "grad_norm": 1.4375652074813843, + "learning_rate": 2.3084922494758965e-07, + "loss": 0.33166638016700745, + "mean_token_accuracy": 0.8825733661651611, + "num_tokens": 30653849.0, + "step": 3432 + }, + { + "epoch": 2.608662613981763, + "grad_norm": 2.5562593936920166, + "learning_rate": 2.299709920585108e-07, + "loss": 0.3969959616661072, + "mean_token_accuracy": 0.8612505197525024, + "num_tokens": 30659471.0, + "step": 3433 + }, + { + "epoch": 2.6094224924012157, + "grad_norm": 3.2285826206207275, + "learning_rate": 2.2909435236317224e-07, + "loss": 0.24361640214920044, + "mean_token_accuracy": 0.9103600978851318, + "num_tokens": 30664129.0, + "step": 3434 + }, + { + "epoch": 2.610182370820669, + "grad_norm": 2.702500343322754, + "learning_rate": 2.2821930647683427e-07, + "loss": 0.28006303310394287, + "mean_token_accuracy": 0.9067277908325195, + "num_tokens": 30668453.0, + "step": 3435 + }, + { + "epoch": 2.6109422492401215, + "grad_norm": 1.6491931676864624, + "learning_rate": 2.2734585501363676e-07, + "loss": 0.38273465633392334, + "mean_token_accuracy": 0.8603695631027222, + "num_tokens": 30680159.0, + "step": 3436 + }, + { + "epoch": 2.6117021276595747, + "grad_norm": 2.628532648086548, + "learning_rate": 2.2647399858660156e-07, + "loss": 0.28879645466804504, + "mean_token_accuracy": 0.9014753103256226, + "num_tokens": 30685706.0, + "step": 3437 + }, + { + "epoch": 2.6124620060790273, + "grad_norm": 1.515868067741394, + "learning_rate": 2.2560373780763256e-07, + "loss": 0.3872387707233429, + "mean_token_accuracy": 0.8627544641494751, + "num_tokens": 30696942.0, + "step": 3438 + }, + { + "epoch": 2.61322188449848, + "grad_norm": 2.4761857986450195, + "learning_rate": 2.2473507328751086e-07, + "loss": 0.3222554624080658, + "mean_token_accuracy": 0.8839071989059448, + "num_tokens": 30703089.0, + "step": 3439 + }, + { + "epoch": 2.613981762917933, + "grad_norm": 1.5424152612686157, + "learning_rate": 2.238680056358991e-07, + "loss": 0.24553638696670532, + "mean_token_accuracy": 0.9186095595359802, + "num_tokens": 30712643.0, + "step": 3440 + }, + { + "epoch": 2.6147416413373863, + "grad_norm": 2.1723358631134033, + "learning_rate": 2.2300253546133883e-07, + "loss": 0.22538061439990997, + "mean_token_accuracy": 0.914456844329834, + "num_tokens": 30719326.0, + "step": 3441 + }, + { + "epoch": 2.615501519756839, + "grad_norm": 2.0607242584228516, + "learning_rate": 2.2213866337125022e-07, + "loss": 0.40517157316207886, + "mean_token_accuracy": 0.8558610677719116, + "num_tokens": 30726700.0, + "step": 3442 + }, + { + "epoch": 2.6162613981762917, + "grad_norm": 1.3590739965438843, + "learning_rate": 2.2127638997193196e-07, + "loss": 0.3030068874359131, + "mean_token_accuracy": 0.904723048210144, + "num_tokens": 30739090.0, + "step": 3443 + }, + { + "epoch": 2.617021276595745, + "grad_norm": 1.3497486114501953, + "learning_rate": 2.2041571586856104e-07, + "loss": 0.33204561471939087, + "mean_token_accuracy": 0.8720648288726807, + "num_tokens": 30755883.0, + "step": 3444 + }, + { + "epoch": 2.6177811550151975, + "grad_norm": 4.2434515953063965, + "learning_rate": 2.1955664166519036e-07, + "loss": 0.16747456789016724, + "mean_token_accuracy": 0.9386751651763916, + "num_tokens": 30758472.0, + "step": 3445 + }, + { + "epoch": 2.6185410334346506, + "grad_norm": 2.629639148712158, + "learning_rate": 2.1869916796475294e-07, + "loss": 0.3494086265563965, + "mean_token_accuracy": 0.8652780055999756, + "num_tokens": 30765014.0, + "step": 3446 + }, + { + "epoch": 2.6193009118541033, + "grad_norm": 1.56986403465271, + "learning_rate": 2.1784329536905653e-07, + "loss": 0.288389652967453, + "mean_token_accuracy": 0.8911552429199219, + "num_tokens": 30783893.0, + "step": 3447 + }, + { + "epoch": 2.6200607902735564, + "grad_norm": 2.137489080429077, + "learning_rate": 2.1698902447878478e-07, + "loss": 0.32554084062576294, + "mean_token_accuracy": 0.8827146291732788, + "num_tokens": 30791222.0, + "step": 3448 + }, + { + "epoch": 2.620820668693009, + "grad_norm": 1.7718229293823242, + "learning_rate": 2.1613635589349756e-07, + "loss": 0.433074414730072, + "mean_token_accuracy": 0.8626075983047485, + "num_tokens": 30800909.0, + "step": 3449 + }, + { + "epoch": 2.621580547112462, + "grad_norm": 1.8075933456420898, + "learning_rate": 2.1528529021163203e-07, + "loss": 0.3695775270462036, + "mean_token_accuracy": 0.8898511528968811, + "num_tokens": 30809680.0, + "step": 3450 + }, + { + "epoch": 2.622340425531915, + "grad_norm": 2.61863374710083, + "learning_rate": 2.1443582803049757e-07, + "loss": 0.3161890506744385, + "mean_token_accuracy": 0.9073872566223145, + "num_tokens": 30814582.0, + "step": 3451 + }, + { + "epoch": 2.6231003039513676, + "grad_norm": 1.9178471565246582, + "learning_rate": 2.1358796994628005e-07, + "loss": 0.26871830224990845, + "mean_token_accuracy": 0.9038676023483276, + "num_tokens": 30822408.0, + "step": 3452 + }, + { + "epoch": 2.6238601823708207, + "grad_norm": 1.4968323707580566, + "learning_rate": 2.1274171655403852e-07, + "loss": 0.30813854932785034, + "mean_token_accuracy": 0.8859157562255859, + "num_tokens": 30833627.0, + "step": 3453 + }, + { + "epoch": 2.6246200607902734, + "grad_norm": 2.584803581237793, + "learning_rate": 2.118970684477062e-07, + "loss": 0.3794214129447937, + "mean_token_accuracy": 0.8644092679023743, + "num_tokens": 30839801.0, + "step": 3454 + }, + { + "epoch": 2.6253799392097266, + "grad_norm": 1.4426536560058594, + "learning_rate": 2.1105402622008996e-07, + "loss": 0.3904871344566345, + "mean_token_accuracy": 0.8601649403572083, + "num_tokens": 30854293.0, + "step": 3455 + }, + { + "epoch": 2.6261398176291793, + "grad_norm": 2.42291522026062, + "learning_rate": 2.1021259046286907e-07, + "loss": 0.2830442786216736, + "mean_token_accuracy": 0.918805718421936, + "num_tokens": 30859112.0, + "step": 3456 + }, + { + "epoch": 2.6268996960486324, + "grad_norm": 1.4296268224716187, + "learning_rate": 2.0937276176659553e-07, + "loss": 0.3172294497489929, + "mean_token_accuracy": 0.8801482915878296, + "num_tokens": 30871653.0, + "step": 3457 + }, + { + "epoch": 2.627659574468085, + "grad_norm": 2.1253740787506104, + "learning_rate": 2.0853454072069402e-07, + "loss": 0.39093419909477234, + "mean_token_accuracy": 0.9099202156066895, + "num_tokens": 30879156.0, + "step": 3458 + }, + { + "epoch": 2.628419452887538, + "grad_norm": 2.829529047012329, + "learning_rate": 2.0769792791345945e-07, + "loss": 0.35299739241600037, + "mean_token_accuracy": 0.8719742894172668, + "num_tokens": 30883839.0, + "step": 3459 + }, + { + "epoch": 2.629179331306991, + "grad_norm": 1.4410310983657837, + "learning_rate": 2.068629239320588e-07, + "loss": 0.3550579845905304, + "mean_token_accuracy": 0.8580837249755859, + "num_tokens": 30897204.0, + "step": 3460 + }, + { + "epoch": 2.6299392097264436, + "grad_norm": 1.8980296850204468, + "learning_rate": 2.0602952936253112e-07, + "loss": 0.33656907081604004, + "mean_token_accuracy": 0.9022824764251709, + "num_tokens": 30905304.0, + "step": 3461 + }, + { + "epoch": 2.6306990881458967, + "grad_norm": 2.6224915981292725, + "learning_rate": 2.0519774478978404e-07, + "loss": 0.383074015378952, + "mean_token_accuracy": 0.8744953870773315, + "num_tokens": 30911118.0, + "step": 3462 + }, + { + "epoch": 2.6314589665653494, + "grad_norm": 1.8675706386566162, + "learning_rate": 2.043675707975959e-07, + "loss": 0.36784154176712036, + "mean_token_accuracy": 0.8660717606544495, + "num_tokens": 30919786.0, + "step": 3463 + }, + { + "epoch": 2.6322188449848025, + "grad_norm": 1.7601722478866577, + "learning_rate": 2.0353900796861503e-07, + "loss": 0.4188779294490814, + "mean_token_accuracy": 0.8655462861061096, + "num_tokens": 30930882.0, + "step": 3464 + }, + { + "epoch": 2.632978723404255, + "grad_norm": 1.760291337966919, + "learning_rate": 2.027120568843588e-07, + "loss": 0.31421059370040894, + "mean_token_accuracy": 0.8876073360443115, + "num_tokens": 30940524.0, + "step": 3465 + }, + { + "epoch": 2.6337386018237083, + "grad_norm": 2.0120749473571777, + "learning_rate": 2.0188671812521293e-07, + "loss": 0.4053173065185547, + "mean_token_accuracy": 0.855548620223999, + "num_tokens": 30949577.0, + "step": 3466 + }, + { + "epoch": 2.634498480243161, + "grad_norm": 1.1353741884231567, + "learning_rate": 2.0106299227043298e-07, + "loss": 0.24654456973075867, + "mean_token_accuracy": 0.905929684638977, + "num_tokens": 30965797.0, + "step": 3467 + }, + { + "epoch": 2.6352583586626137, + "grad_norm": 2.011974811553955, + "learning_rate": 2.002408798981395e-07, + "loss": 0.37494587898254395, + "mean_token_accuracy": 0.8785897493362427, + "num_tokens": 30974271.0, + "step": 3468 + }, + { + "epoch": 2.636018237082067, + "grad_norm": 1.3929005861282349, + "learning_rate": 1.9942038158532407e-07, + "loss": 0.43479201197624207, + "mean_token_accuracy": 0.8481380939483643, + "num_tokens": 30992451.0, + "step": 3469 + }, + { + "epoch": 2.63677811550152, + "grad_norm": 2.2714993953704834, + "learning_rate": 1.9860149790784432e-07, + "loss": 0.36299505829811096, + "mean_token_accuracy": 0.8728935718536377, + "num_tokens": 30999180.0, + "step": 3470 + }, + { + "epoch": 2.6375379939209727, + "grad_norm": 1.722923755645752, + "learning_rate": 1.977842294404228e-07, + "loss": 0.2461910843849182, + "mean_token_accuracy": 0.9146148562431335, + "num_tokens": 31008617.0, + "step": 3471 + }, + { + "epoch": 2.6382978723404253, + "grad_norm": 1.4508280754089355, + "learning_rate": 1.9696857675665122e-07, + "loss": 0.3511884808540344, + "mean_token_accuracy": 0.869759202003479, + "num_tokens": 31022158.0, + "step": 3472 + }, + { + "epoch": 2.6390577507598785, + "grad_norm": 2.5803074836730957, + "learning_rate": 1.9615454042898635e-07, + "loss": 0.27785009145736694, + "mean_token_accuracy": 0.9075050354003906, + "num_tokens": 31027176.0, + "step": 3473 + }, + { + "epoch": 2.639817629179331, + "grad_norm": 3.2428712844848633, + "learning_rate": 1.95342121028749e-07, + "loss": 0.30596673488616943, + "mean_token_accuracy": 0.8934510946273804, + "num_tokens": 31031140.0, + "step": 3474 + }, + { + "epoch": 2.6405775075987843, + "grad_norm": 1.5055527687072754, + "learning_rate": 1.9453131912612694e-07, + "loss": 0.3586134612560272, + "mean_token_accuracy": 0.87983238697052, + "num_tokens": 31041878.0, + "step": 3475 + }, + { + "epoch": 2.641337386018237, + "grad_norm": 2.8457231521606445, + "learning_rate": 1.9372213529017192e-07, + "loss": 0.314262330532074, + "mean_token_accuracy": 0.8857930302619934, + "num_tokens": 31046670.0, + "step": 3476 + }, + { + "epoch": 2.64209726443769, + "grad_norm": 2.661770820617676, + "learning_rate": 1.9291457008880077e-07, + "loss": 0.3096502125263214, + "mean_token_accuracy": 0.9015626907348633, + "num_tokens": 31052419.0, + "step": 3477 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 1.9692156314849854, + "learning_rate": 1.9210862408879373e-07, + "loss": 0.33081287145614624, + "mean_token_accuracy": 0.8793413639068604, + "num_tokens": 31060462.0, + "step": 3478 + }, + { + "epoch": 2.6436170212765955, + "grad_norm": 2.454256772994995, + "learning_rate": 1.9130429785579441e-07, + "loss": 0.486195832490921, + "mean_token_accuracy": 0.8472193479537964, + "num_tokens": 31066537.0, + "step": 3479 + }, + { + "epoch": 2.6443768996960486, + "grad_norm": 3.121835470199585, + "learning_rate": 1.9050159195431017e-07, + "loss": 0.28688520193099976, + "mean_token_accuracy": 0.8916707038879395, + "num_tokens": 31071061.0, + "step": 3480 + }, + { + "epoch": 2.6451367781155017, + "grad_norm": 2.0197176933288574, + "learning_rate": 1.8970050694771064e-07, + "loss": 0.2587219774723053, + "mean_token_accuracy": 0.9204096794128418, + "num_tokens": 31077438.0, + "step": 3481 + }, + { + "epoch": 2.6458966565349544, + "grad_norm": 2.305452585220337, + "learning_rate": 1.8890104339822913e-07, + "loss": 0.3234187960624695, + "mean_token_accuracy": 0.8695623874664307, + "num_tokens": 31084445.0, + "step": 3482 + }, + { + "epoch": 2.646656534954407, + "grad_norm": 2.671178102493286, + "learning_rate": 1.881032018669579e-07, + "loss": 0.31658151745796204, + "mean_token_accuracy": 0.9211946725845337, + "num_tokens": 31090229.0, + "step": 3483 + }, + { + "epoch": 2.6474164133738602, + "grad_norm": 1.9448342323303223, + "learning_rate": 1.8730698291385518e-07, + "loss": 0.4380547106266022, + "mean_token_accuracy": 0.8881628513336182, + "num_tokens": 31098328.0, + "step": 3484 + }, + { + "epoch": 2.648176291793313, + "grad_norm": 2.00927734375, + "learning_rate": 1.8651238709773646e-07, + "loss": 0.30627715587615967, + "mean_token_accuracy": 0.9037996530532837, + "num_tokens": 31106114.0, + "step": 3485 + }, + { + "epoch": 2.648936170212766, + "grad_norm": 1.800561547279358, + "learning_rate": 1.8571941497627976e-07, + "loss": 0.3352568745613098, + "mean_token_accuracy": 0.8773363828659058, + "num_tokens": 31114962.0, + "step": 3486 + }, + { + "epoch": 2.6496960486322187, + "grad_norm": 1.2112451791763306, + "learning_rate": 1.8492806710602495e-07, + "loss": 0.30349305272102356, + "mean_token_accuracy": 0.8948603272438049, + "num_tokens": 31131202.0, + "step": 3487 + }, + { + "epoch": 2.650455927051672, + "grad_norm": 1.241676926612854, + "learning_rate": 1.8413834404236857e-07, + "loss": 0.33237409591674805, + "mean_token_accuracy": 0.8674747943878174, + "num_tokens": 31146087.0, + "step": 3488 + }, + { + "epoch": 2.6512158054711246, + "grad_norm": 1.7932970523834229, + "learning_rate": 1.8335024633956977e-07, + "loss": 0.2946045696735382, + "mean_token_accuracy": 0.9197652339935303, + "num_tokens": 31153539.0, + "step": 3489 + }, + { + "epoch": 2.6519756838905773, + "grad_norm": 1.4799917936325073, + "learning_rate": 1.8256377455074526e-07, + "loss": 0.41131776571273804, + "mean_token_accuracy": 0.859546422958374, + "num_tokens": 31165330.0, + "step": 3490 + }, + { + "epoch": 2.6527355623100304, + "grad_norm": 1.196844458580017, + "learning_rate": 1.8177892922787154e-07, + "loss": 0.3251150846481323, + "mean_token_accuracy": 0.8738864660263062, + "num_tokens": 31182815.0, + "step": 3491 + }, + { + "epoch": 2.6534954407294835, + "grad_norm": 1.954189419746399, + "learning_rate": 1.809957109217833e-07, + "loss": 0.31352269649505615, + "mean_token_accuracy": 0.8898859024047852, + "num_tokens": 31190907.0, + "step": 3492 + }, + { + "epoch": 2.654255319148936, + "grad_norm": 2.5248095989227295, + "learning_rate": 1.802141201821736e-07, + "loss": 0.29824098944664, + "mean_token_accuracy": 0.9073196053504944, + "num_tokens": 31196077.0, + "step": 3493 + }, + { + "epoch": 2.655015197568389, + "grad_norm": 2.163174629211426, + "learning_rate": 1.7943415755759168e-07, + "loss": 0.3291153311729431, + "mean_token_accuracy": 0.8850691318511963, + "num_tokens": 31202843.0, + "step": 3494 + }, + { + "epoch": 2.655775075987842, + "grad_norm": 1.1075550317764282, + "learning_rate": 1.7865582359544664e-07, + "loss": 0.3335857093334198, + "mean_token_accuracy": 0.877744197845459, + "num_tokens": 31224407.0, + "step": 3495 + }, + { + "epoch": 2.6565349544072947, + "grad_norm": 3.600712299346924, + "learning_rate": 1.7787911884200314e-07, + "loss": 0.24402567744255066, + "mean_token_accuracy": 0.9030617475509644, + "num_tokens": 31228150.0, + "step": 3496 + }, + { + "epoch": 2.657294832826748, + "grad_norm": 2.5282156467437744, + "learning_rate": 1.7710404384238156e-07, + "loss": 0.3065975606441498, + "mean_token_accuracy": 0.8894387483596802, + "num_tokens": 31233676.0, + "step": 3497 + }, + { + "epoch": 2.6580547112462005, + "grad_norm": 5.057322025299072, + "learning_rate": 1.7633059914055976e-07, + "loss": 0.3121221661567688, + "mean_token_accuracy": 0.8697853088378906, + "num_tokens": 31241436.0, + "step": 3498 + }, + { + "epoch": 2.6588145896656536, + "grad_norm": 2.3506245613098145, + "learning_rate": 1.7555878527937164e-07, + "loss": 0.3100275993347168, + "mean_token_accuracy": 0.8860085010528564, + "num_tokens": 31249589.0, + "step": 3499 + }, + { + "epoch": 2.6595744680851063, + "grad_norm": 1.352675199508667, + "learning_rate": 1.7478860280050525e-07, + "loss": 0.3743774890899658, + "mean_token_accuracy": 0.8581909537315369, + "num_tokens": 31264177.0, + "step": 3500 + }, + { + "epoch": 2.660334346504559, + "grad_norm": 1.4283853769302368, + "learning_rate": 1.740200522445043e-07, + "loss": 0.3012605905532837, + "mean_token_accuracy": 0.8875954151153564, + "num_tokens": 31278104.0, + "step": 3501 + }, + { + "epoch": 2.661094224924012, + "grad_norm": 1.2291043996810913, + "learning_rate": 1.7325313415076705e-07, + "loss": 0.28256118297576904, + "mean_token_accuracy": 0.8932200074195862, + "num_tokens": 31295863.0, + "step": 3502 + }, + { + "epoch": 2.6618541033434653, + "grad_norm": 1.4281202554702759, + "learning_rate": 1.7248784905754656e-07, + "loss": 0.17757278680801392, + "mean_token_accuracy": 0.9204857349395752, + "num_tokens": 31304203.0, + "step": 3503 + }, + { + "epoch": 2.662613981762918, + "grad_norm": 1.369604229927063, + "learning_rate": 1.717241975019493e-07, + "loss": 0.35701876878738403, + "mean_token_accuracy": 0.8924071192741394, + "num_tokens": 31317585.0, + "step": 3504 + }, + { + "epoch": 2.6633738601823707, + "grad_norm": 1.8434638977050781, + "learning_rate": 1.7096218001993514e-07, + "loss": 0.2783927619457245, + "mean_token_accuracy": 0.9073910713195801, + "num_tokens": 31325380.0, + "step": 3505 + }, + { + "epoch": 2.664133738601824, + "grad_norm": 1.946325421333313, + "learning_rate": 1.702017971463174e-07, + "loss": 0.2873200476169586, + "mean_token_accuracy": 0.8956313133239746, + "num_tokens": 31333366.0, + "step": 3506 + }, + { + "epoch": 2.6648936170212765, + "grad_norm": 2.468369960784912, + "learning_rate": 1.6944304941476224e-07, + "loss": 0.2589072287082672, + "mean_token_accuracy": 0.9237367510795593, + "num_tokens": 31337721.0, + "step": 3507 + }, + { + "epoch": 2.6656534954407296, + "grad_norm": 1.1283265352249146, + "learning_rate": 1.686859373577876e-07, + "loss": 0.3271624445915222, + "mean_token_accuracy": 0.8839015960693359, + "num_tokens": 31355493.0, + "step": 3508 + }, + { + "epoch": 2.6664133738601823, + "grad_norm": 1.9863340854644775, + "learning_rate": 1.679304615067634e-07, + "loss": 0.24140994250774384, + "mean_token_accuracy": 0.9161529541015625, + "num_tokens": 31362707.0, + "step": 3509 + }, + { + "epoch": 2.6671732522796354, + "grad_norm": 1.8522552251815796, + "learning_rate": 1.671766223919133e-07, + "loss": 0.3312528133392334, + "mean_token_accuracy": 0.8730556964874268, + "num_tokens": 31371077.0, + "step": 3510 + }, + { + "epoch": 2.667933130699088, + "grad_norm": 2.4215502738952637, + "learning_rate": 1.6642442054230935e-07, + "loss": 0.3685656189918518, + "mean_token_accuracy": 0.8850007653236389, + "num_tokens": 31378208.0, + "step": 3511 + }, + { + "epoch": 2.668693009118541, + "grad_norm": 2.1833741664886475, + "learning_rate": 1.6567385648587563e-07, + "loss": 0.34506508708000183, + "mean_token_accuracy": 0.8798409104347229, + "num_tokens": 31384364.0, + "step": 3512 + }, + { + "epoch": 2.669452887537994, + "grad_norm": 1.5749074220657349, + "learning_rate": 1.6492493074938777e-07, + "loss": 0.426993191242218, + "mean_token_accuracy": 0.8461192846298218, + "num_tokens": 31399653.0, + "step": 3513 + }, + { + "epoch": 2.670212765957447, + "grad_norm": 1.782159686088562, + "learning_rate": 1.6417764385846996e-07, + "loss": 0.43299031257629395, + "mean_token_accuracy": 0.8456183075904846, + "num_tokens": 31410255.0, + "step": 3514 + }, + { + "epoch": 2.6709726443768997, + "grad_norm": 1.3696199655532837, + "learning_rate": 1.6343199633759715e-07, + "loss": 0.24636408686637878, + "mean_token_accuracy": 0.8885586261749268, + "num_tokens": 31422388.0, + "step": 3515 + }, + { + "epoch": 2.6717325227963524, + "grad_norm": 1.9061282873153687, + "learning_rate": 1.6268798871009405e-07, + "loss": 0.4061458706855774, + "mean_token_accuracy": 0.8875166177749634, + "num_tokens": 31431610.0, + "step": 3516 + }, + { + "epoch": 2.6724924012158056, + "grad_norm": 1.906085729598999, + "learning_rate": 1.6194562149813241e-07, + "loss": 0.4171827435493469, + "mean_token_accuracy": 0.848915159702301, + "num_tokens": 31440612.0, + "step": 3517 + }, + { + "epoch": 2.6732522796352582, + "grad_norm": 1.7384947538375854, + "learning_rate": 1.6120489522273548e-07, + "loss": 0.38559412956237793, + "mean_token_accuracy": 0.860315203666687, + "num_tokens": 31451002.0, + "step": 3518 + }, + { + "epoch": 2.6740121580547114, + "grad_norm": 3.150087356567383, + "learning_rate": 1.6046581040377317e-07, + "loss": 0.17975735664367676, + "mean_token_accuracy": 0.9390251636505127, + "num_tokens": 31454609.0, + "step": 3519 + }, + { + "epoch": 2.674772036474164, + "grad_norm": 1.9782978296279907, + "learning_rate": 1.5972836755996286e-07, + "loss": 0.4016202688217163, + "mean_token_accuracy": 0.8536617755889893, + "num_tokens": 31463351.0, + "step": 3520 + }, + { + "epoch": 2.675531914893617, + "grad_norm": 1.459272861480713, + "learning_rate": 1.589925672088713e-07, + "loss": 0.32752668857574463, + "mean_token_accuracy": 0.8932114839553833, + "num_tokens": 31475029.0, + "step": 3521 + }, + { + "epoch": 2.67629179331307, + "grad_norm": 1.5019307136535645, + "learning_rate": 1.5825840986691155e-07, + "loss": 0.47891637682914734, + "mean_token_accuracy": 0.8196566700935364, + "num_tokens": 31489340.0, + "step": 3522 + }, + { + "epoch": 2.6770516717325226, + "grad_norm": 1.9832415580749512, + "learning_rate": 1.5752589604934255e-07, + "loss": 0.3787233829498291, + "mean_token_accuracy": 0.8592989444732666, + "num_tokens": 31498173.0, + "step": 3523 + }, + { + "epoch": 2.6778115501519757, + "grad_norm": 1.6112871170043945, + "learning_rate": 1.567950262702714e-07, + "loss": 0.394833505153656, + "mean_token_accuracy": 0.8762246370315552, + "num_tokens": 31509701.0, + "step": 3524 + }, + { + "epoch": 2.678571428571429, + "grad_norm": 2.542189598083496, + "learning_rate": 1.560658010426505e-07, + "loss": 0.344679057598114, + "mean_token_accuracy": 0.8738337159156799, + "num_tokens": 31516174.0, + "step": 3525 + }, + { + "epoch": 2.6793313069908815, + "grad_norm": 1.6784722805023193, + "learning_rate": 1.5533822087827805e-07, + "loss": 0.2981395423412323, + "mean_token_accuracy": 0.9238042831420898, + "num_tokens": 31526373.0, + "step": 3526 + }, + { + "epoch": 2.680091185410334, + "grad_norm": 2.1711673736572266, + "learning_rate": 1.54612286287798e-07, + "loss": 0.32182997465133667, + "mean_token_accuracy": 0.8804676532745361, + "num_tokens": 31532221.0, + "step": 3527 + }, + { + "epoch": 2.6808510638297873, + "grad_norm": 2.920492172241211, + "learning_rate": 1.5388799778069896e-07, + "loss": 0.42035239934921265, + "mean_token_accuracy": 0.8616809844970703, + "num_tokens": 31537349.0, + "step": 3528 + }, + { + "epoch": 2.68161094224924, + "grad_norm": 1.6369318962097168, + "learning_rate": 1.5316535586531483e-07, + "loss": 0.3083080053329468, + "mean_token_accuracy": 0.8857955932617188, + "num_tokens": 31548063.0, + "step": 3529 + }, + { + "epoch": 2.682370820668693, + "grad_norm": 1.745784044265747, + "learning_rate": 1.5244436104882327e-07, + "loss": 0.3295830190181732, + "mean_token_accuracy": 0.8790948390960693, + "num_tokens": 31557297.0, + "step": 3530 + }, + { + "epoch": 2.683130699088146, + "grad_norm": 2.933802843093872, + "learning_rate": 1.5172501383724668e-07, + "loss": 0.20540538430213928, + "mean_token_accuracy": 0.9353891611099243, + "num_tokens": 31561267.0, + "step": 3531 + }, + { + "epoch": 2.683890577507599, + "grad_norm": 1.1792415380477905, + "learning_rate": 1.5100731473544932e-07, + "loss": 0.2857414484024048, + "mean_token_accuracy": 0.8919717073440552, + "num_tokens": 31577364.0, + "step": 3532 + }, + { + "epoch": 2.6846504559270516, + "grad_norm": 1.5752356052398682, + "learning_rate": 1.5029126424714186e-07, + "loss": 0.42933136224746704, + "mean_token_accuracy": 0.8738011717796326, + "num_tokens": 31593255.0, + "step": 3533 + }, + { + "epoch": 2.6854103343465043, + "grad_norm": 1.4097353219985962, + "learning_rate": 1.495768628748745e-07, + "loss": 0.41403159499168396, + "mean_token_accuracy": 0.8538030385971069, + "num_tokens": 31606689.0, + "step": 3534 + }, + { + "epoch": 2.6861702127659575, + "grad_norm": 1.3788182735443115, + "learning_rate": 1.4886411112004258e-07, + "loss": 0.3825019299983978, + "mean_token_accuracy": 0.870381236076355, + "num_tokens": 31623528.0, + "step": 3535 + }, + { + "epoch": 2.6869300911854106, + "grad_norm": 2.3032004833221436, + "learning_rate": 1.481530094828823e-07, + "loss": 0.28886643052101135, + "mean_token_accuracy": 0.9053950905799866, + "num_tokens": 31629949.0, + "step": 3536 + }, + { + "epoch": 2.6876899696048633, + "grad_norm": 1.8950154781341553, + "learning_rate": 1.4744355846247254e-07, + "loss": 0.3261764645576477, + "mean_token_accuracy": 0.8882689476013184, + "num_tokens": 31639482.0, + "step": 3537 + }, + { + "epoch": 2.688449848024316, + "grad_norm": 2.8152518272399902, + "learning_rate": 1.4673575855673278e-07, + "loss": 0.19367718696594238, + "mean_token_accuracy": 0.948776364326477, + "num_tokens": 31643354.0, + "step": 3538 + }, + { + "epoch": 2.689209726443769, + "grad_norm": 2.1745874881744385, + "learning_rate": 1.460296102624248e-07, + "loss": 0.3250897526741028, + "mean_token_accuracy": 0.8834096193313599, + "num_tokens": 31651085.0, + "step": 3539 + }, + { + "epoch": 2.689969604863222, + "grad_norm": 2.5239014625549316, + "learning_rate": 1.4532511407515022e-07, + "loss": 0.3069056570529938, + "mean_token_accuracy": 0.8939725160598755, + "num_tokens": 31656790.0, + "step": 3540 + }, + { + "epoch": 2.690729483282675, + "grad_norm": 2.19575572013855, + "learning_rate": 1.4462227048935185e-07, + "loss": 0.38596993684768677, + "mean_token_accuracy": 0.8545209169387817, + "num_tokens": 31664577.0, + "step": 3541 + }, + { + "epoch": 2.6914893617021276, + "grad_norm": 2.4618618488311768, + "learning_rate": 1.439210799983126e-07, + "loss": 0.43490833044052124, + "mean_token_accuracy": 0.8452163338661194, + "num_tokens": 31670328.0, + "step": 3542 + }, + { + "epoch": 2.6922492401215807, + "grad_norm": 1.6371922492980957, + "learning_rate": 1.4322154309415387e-07, + "loss": 0.36862409114837646, + "mean_token_accuracy": 0.8575112819671631, + "num_tokens": 31680342.0, + "step": 3543 + }, + { + "epoch": 2.6930091185410334, + "grad_norm": 3.311603546142578, + "learning_rate": 1.425236602678387e-07, + "loss": 0.3098670542240143, + "mean_token_accuracy": 0.8895800113677979, + "num_tokens": 31686819.0, + "step": 3544 + }, + { + "epoch": 2.693768996960486, + "grad_norm": 2.246453285217285, + "learning_rate": 1.4182743200916839e-07, + "loss": 0.2145545780658722, + "mean_token_accuracy": 0.9456803798675537, + "num_tokens": 31692024.0, + "step": 3545 + }, + { + "epoch": 2.6945288753799392, + "grad_norm": 2.962627410888672, + "learning_rate": 1.4113285880678145e-07, + "loss": 0.22648683190345764, + "mean_token_accuracy": 0.9368027448654175, + "num_tokens": 31696292.0, + "step": 3546 + }, + { + "epoch": 2.6952887537993924, + "grad_norm": 2.3828611373901367, + "learning_rate": 1.4043994114815663e-07, + "loss": 0.28031831979751587, + "mean_token_accuracy": 0.8995643854141235, + "num_tokens": 31701896.0, + "step": 3547 + }, + { + "epoch": 2.696048632218845, + "grad_norm": 2.749218463897705, + "learning_rate": 1.3974867951961097e-07, + "loss": 0.31309080123901367, + "mean_token_accuracy": 0.8827601671218872, + "num_tokens": 31707434.0, + "step": 3548 + }, + { + "epoch": 2.6968085106382977, + "grad_norm": 1.5682415962219238, + "learning_rate": 1.3905907440629752e-07, + "loss": 0.2794681191444397, + "mean_token_accuracy": 0.9000695943832397, + "num_tokens": 31718923.0, + "step": 3549 + }, + { + "epoch": 2.697568389057751, + "grad_norm": 2.2193145751953125, + "learning_rate": 1.38371126292208e-07, + "loss": 0.31643980741500854, + "mean_token_accuracy": 0.8916857242584229, + "num_tokens": 31724566.0, + "step": 3550 + }, + { + "epoch": 2.6983282674772036, + "grad_norm": 2.14003324508667, + "learning_rate": 1.3768483566017093e-07, + "loss": 0.3225042521953583, + "mean_token_accuracy": 0.8810629844665527, + "num_tokens": 31731363.0, + "step": 3551 + }, + { + "epoch": 2.6990881458966567, + "grad_norm": 2.594632863998413, + "learning_rate": 1.3700020299185156e-07, + "loss": 0.28227928280830383, + "mean_token_accuracy": 0.8986451625823975, + "num_tokens": 31736574.0, + "step": 3552 + }, + { + "epoch": 2.6998480243161094, + "grad_norm": 1.8695379495620728, + "learning_rate": 1.3631722876775137e-07, + "loss": 0.46631208062171936, + "mean_token_accuracy": 0.8425353765487671, + "num_tokens": 31746568.0, + "step": 3553 + }, + { + "epoch": 2.7006079027355625, + "grad_norm": 2.1246798038482666, + "learning_rate": 1.3563591346720806e-07, + "loss": 0.3978712260723114, + "mean_token_accuracy": 0.85677170753479, + "num_tokens": 31755499.0, + "step": 3554 + }, + { + "epoch": 2.701367781155015, + "grad_norm": 1.9348199367523193, + "learning_rate": 1.3495625756839464e-07, + "loss": 0.4381856620311737, + "mean_token_accuracy": 0.8389089107513428, + "num_tokens": 31765267.0, + "step": 3555 + }, + { + "epoch": 2.702127659574468, + "grad_norm": 3.3802061080932617, + "learning_rate": 1.342782615483204e-07, + "loss": 0.2558897137641907, + "mean_token_accuracy": 0.9038383960723877, + "num_tokens": 31769169.0, + "step": 3556 + }, + { + "epoch": 2.702887537993921, + "grad_norm": 1.8666874170303345, + "learning_rate": 1.3360192588282832e-07, + "loss": 0.3420698642730713, + "mean_token_accuracy": 0.8731567859649658, + "num_tokens": 31778500.0, + "step": 3557 + }, + { + "epoch": 2.7036474164133737, + "grad_norm": 2.2502217292785645, + "learning_rate": 1.3292725104659676e-07, + "loss": 0.33352571725845337, + "mean_token_accuracy": 0.889266848564148, + "num_tokens": 31786245.0, + "step": 3558 + }, + { + "epoch": 2.704407294832827, + "grad_norm": 1.7217984199523926, + "learning_rate": 1.3225423751313942e-07, + "loss": 0.3671357035636902, + "mean_token_accuracy": 0.8806703686714172, + "num_tokens": 31796242.0, + "step": 3559 + }, + { + "epoch": 2.7051671732522795, + "grad_norm": 2.5113964080810547, + "learning_rate": 1.315828857548024e-07, + "loss": 0.24104978144168854, + "mean_token_accuracy": 0.9279846549034119, + "num_tokens": 31801005.0, + "step": 3560 + }, + { + "epoch": 2.7059270516717326, + "grad_norm": 2.0345516204833984, + "learning_rate": 1.309131962427662e-07, + "loss": 0.3277859687805176, + "mean_token_accuracy": 0.8744111061096191, + "num_tokens": 31810184.0, + "step": 3561 + }, + { + "epoch": 2.7066869300911853, + "grad_norm": 1.2103748321533203, + "learning_rate": 1.3024516944704495e-07, + "loss": 0.34378400444984436, + "mean_token_accuracy": 0.8734696507453918, + "num_tokens": 31830255.0, + "step": 3562 + }, + { + "epoch": 2.7074468085106385, + "grad_norm": 2.3213655948638916, + "learning_rate": 1.2957880583648525e-07, + "loss": 0.38547977805137634, + "mean_token_accuracy": 0.8699804544448853, + "num_tokens": 31836624.0, + "step": 3563 + }, + { + "epoch": 2.708206686930091, + "grad_norm": 1.3899281024932861, + "learning_rate": 1.2891410587876714e-07, + "loss": 0.38521939516067505, + "mean_token_accuracy": 0.8629069924354553, + "num_tokens": 31851201.0, + "step": 3564 + }, + { + "epoch": 2.7089665653495443, + "grad_norm": 1.9310930967330933, + "learning_rate": 1.2825107004040272e-07, + "loss": 0.26716265082359314, + "mean_token_accuracy": 0.9009085893630981, + "num_tokens": 31858683.0, + "step": 3565 + }, + { + "epoch": 2.709726443768997, + "grad_norm": 2.839961290359497, + "learning_rate": 1.2758969878673504e-07, + "loss": 0.3741273880004883, + "mean_token_accuracy": 0.8934653997421265, + "num_tokens": 31864354.0, + "step": 3566 + }, + { + "epoch": 2.7104863221884496, + "grad_norm": 1.374866247177124, + "learning_rate": 1.269299925819409e-07, + "loss": 0.43979907035827637, + "mean_token_accuracy": 0.8200695514678955, + "num_tokens": 31879875.0, + "step": 3567 + }, + { + "epoch": 2.711246200607903, + "grad_norm": 1.149755597114563, + "learning_rate": 1.262719518890279e-07, + "loss": 0.375344842672348, + "mean_token_accuracy": 0.8663579225540161, + "num_tokens": 31902014.0, + "step": 3568 + }, + { + "epoch": 2.7120060790273555, + "grad_norm": 1.5612202882766724, + "learning_rate": 1.2561557716983308e-07, + "loss": 0.3224652409553528, + "mean_token_accuracy": 0.8762812614440918, + "num_tokens": 31913496.0, + "step": 3569 + }, + { + "epoch": 2.7127659574468086, + "grad_norm": 2.291853666305542, + "learning_rate": 1.2496086888502595e-07, + "loss": 0.299552321434021, + "mean_token_accuracy": 0.8792698383331299, + "num_tokens": 31919505.0, + "step": 3570 + }, + { + "epoch": 2.7135258358662613, + "grad_norm": 2.799447536468506, + "learning_rate": 1.2430782749410676e-07, + "loss": 0.16546699404716492, + "mean_token_accuracy": 0.943824052810669, + "num_tokens": 31923154.0, + "step": 3571 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.4593926668167114, + "learning_rate": 1.2365645345540383e-07, + "loss": 0.35158461332321167, + "mean_token_accuracy": 0.8825424909591675, + "num_tokens": 31936316.0, + "step": 3572 + }, + { + "epoch": 2.715045592705167, + "grad_norm": 1.3870587348937988, + "learning_rate": 1.2300674722607735e-07, + "loss": 0.25250178575515747, + "mean_token_accuracy": 0.900173544883728, + "num_tokens": 31948979.0, + "step": 3573 + }, + { + "epoch": 2.71580547112462, + "grad_norm": 1.8494576215744019, + "learning_rate": 1.223587092621162e-07, + "loss": 0.36176151037216187, + "mean_token_accuracy": 0.8696292638778687, + "num_tokens": 31957512.0, + "step": 3574 + }, + { + "epoch": 2.716565349544073, + "grad_norm": 2.2320656776428223, + "learning_rate": 1.2171234001833788e-07, + "loss": 0.3317434787750244, + "mean_token_accuracy": 0.8897237777709961, + "num_tokens": 31964788.0, + "step": 3575 + }, + { + "epoch": 2.717325227963526, + "grad_norm": 2.424726963043213, + "learning_rate": 1.2106763994838954e-07, + "loss": 0.2880811095237732, + "mean_token_accuracy": 0.8983594179153442, + "num_tokens": 31970888.0, + "step": 3576 + }, + { + "epoch": 2.7180851063829787, + "grad_norm": 1.7122806310653687, + "learning_rate": 1.204246095047465e-07, + "loss": 0.4846091568470001, + "mean_token_accuracy": 0.8358923196792603, + "num_tokens": 31981891.0, + "step": 3577 + }, + { + "epoch": 2.7188449848024314, + "grad_norm": 2.3445510864257812, + "learning_rate": 1.1978324913871214e-07, + "loss": 0.28702512383461, + "mean_token_accuracy": 0.8942852020263672, + "num_tokens": 31987375.0, + "step": 3578 + }, + { + "epoch": 2.7196048632218845, + "grad_norm": 2.418414831161499, + "learning_rate": 1.1914355930041838e-07, + "loss": 0.27506208419799805, + "mean_token_accuracy": 0.9329943656921387, + "num_tokens": 31992517.0, + "step": 3579 + }, + { + "epoch": 2.7203647416413372, + "grad_norm": 2.363285541534424, + "learning_rate": 1.1850554043882329e-07, + "loss": 0.32415682077407837, + "mean_token_accuracy": 0.9004105925559998, + "num_tokens": 31998223.0, + "step": 3580 + }, + { + "epoch": 2.7211246200607904, + "grad_norm": 1.5022046566009521, + "learning_rate": 1.178691930017134e-07, + "loss": 0.2446850836277008, + "mean_token_accuracy": 0.9055813550949097, + "num_tokens": 32008038.0, + "step": 3581 + }, + { + "epoch": 2.721884498480243, + "grad_norm": 1.7016842365264893, + "learning_rate": 1.172345174357023e-07, + "loss": 0.356515109539032, + "mean_token_accuracy": 0.876318097114563, + "num_tokens": 32018738.0, + "step": 3582 + }, + { + "epoch": 2.722644376899696, + "grad_norm": 2.113873243331909, + "learning_rate": 1.1660151418622923e-07, + "loss": 0.24748530983924866, + "mean_token_accuracy": 0.9214030504226685, + "num_tokens": 32025225.0, + "step": 3583 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 1.6737921237945557, + "learning_rate": 1.159701836975602e-07, + "loss": 0.30180150270462036, + "mean_token_accuracy": 0.9211363792419434, + "num_tokens": 32034579.0, + "step": 3584 + }, + { + "epoch": 2.7241641337386016, + "grad_norm": 1.4193580150604248, + "learning_rate": 1.153405264127877e-07, + "loss": 0.2939320504665375, + "mean_token_accuracy": 0.9005526304244995, + "num_tokens": 32046461.0, + "step": 3585 + }, + { + "epoch": 2.7249240121580547, + "grad_norm": 2.273599863052368, + "learning_rate": 1.1471254277382882e-07, + "loss": 0.3552356958389282, + "mean_token_accuracy": 0.8682018518447876, + "num_tokens": 32056210.0, + "step": 3586 + }, + { + "epoch": 2.725683890577508, + "grad_norm": 2.242373466491699, + "learning_rate": 1.1408623322142736e-07, + "loss": 0.37924283742904663, + "mean_token_accuracy": 0.8833099603652954, + "num_tokens": 32063545.0, + "step": 3587 + }, + { + "epoch": 2.7264437689969605, + "grad_norm": 2.039243459701538, + "learning_rate": 1.134615981951509e-07, + "loss": 0.29171228408813477, + "mean_token_accuracy": 0.8961814641952515, + "num_tokens": 32070870.0, + "step": 3588 + }, + { + "epoch": 2.727203647416413, + "grad_norm": 1.8081161975860596, + "learning_rate": 1.1283863813339263e-07, + "loss": 0.34568479657173157, + "mean_token_accuracy": 0.9093149900436401, + "num_tokens": 32078829.0, + "step": 3589 + }, + { + "epoch": 2.7279635258358663, + "grad_norm": 2.301534414291382, + "learning_rate": 1.1221735347336976e-07, + "loss": 0.32527366280555725, + "mean_token_accuracy": 0.8894226551055908, + "num_tokens": 32084533.0, + "step": 3590 + }, + { + "epoch": 2.728723404255319, + "grad_norm": 1.9389806985855103, + "learning_rate": 1.1159774465112433e-07, + "loss": 0.39770618081092834, + "mean_token_accuracy": 0.8613806962966919, + "num_tokens": 32092713.0, + "step": 3591 + }, + { + "epoch": 2.729483282674772, + "grad_norm": 1.6589549779891968, + "learning_rate": 1.1097981210152042e-07, + "loss": 0.3170590400695801, + "mean_token_accuracy": 0.8901652097702026, + "num_tokens": 32102904.0, + "step": 3592 + }, + { + "epoch": 2.730243161094225, + "grad_norm": 1.8090909719467163, + "learning_rate": 1.1036355625824808e-07, + "loss": 0.274291455745697, + "mean_token_accuracy": 0.9074428081512451, + "num_tokens": 32111009.0, + "step": 3593 + }, + { + "epoch": 2.731003039513678, + "grad_norm": 2.431757688522339, + "learning_rate": 1.0974897755381936e-07, + "loss": 0.30703026056289673, + "mean_token_accuracy": 0.9109988808631897, + "num_tokens": 32116173.0, + "step": 3594 + }, + { + "epoch": 2.7317629179331306, + "grad_norm": 1.8828567266464233, + "learning_rate": 1.0913607641956842e-07, + "loss": 0.34009286761283875, + "mean_token_accuracy": 0.8761146068572998, + "num_tokens": 32124273.0, + "step": 3595 + }, + { + "epoch": 2.7325227963525833, + "grad_norm": 1.2194745540618896, + "learning_rate": 1.0852485328565337e-07, + "loss": 0.2432229220867157, + "mean_token_accuracy": 0.8984386920928955, + "num_tokens": 32137430.0, + "step": 3596 + }, + { + "epoch": 2.7332826747720365, + "grad_norm": 2.3038880825042725, + "learning_rate": 1.0791530858105387e-07, + "loss": 0.2546696066856384, + "mean_token_accuracy": 0.9092214107513428, + "num_tokens": 32145207.0, + "step": 3597 + }, + { + "epoch": 2.7340425531914896, + "grad_norm": 2.807394504547119, + "learning_rate": 1.0730744273357213e-07, + "loss": 0.33576664328575134, + "mean_token_accuracy": 0.8793773651123047, + "num_tokens": 32150161.0, + "step": 3598 + }, + { + "epoch": 2.7348024316109423, + "grad_norm": 1.8207601308822632, + "learning_rate": 1.067012561698319e-07, + "loss": 0.43848833441734314, + "mean_token_accuracy": 0.8729845285415649, + "num_tokens": 32160180.0, + "step": 3599 + }, + { + "epoch": 2.735562310030395, + "grad_norm": 1.5954468250274658, + "learning_rate": 1.0609674931527786e-07, + "loss": 0.3471013307571411, + "mean_token_accuracy": 0.889906644821167, + "num_tokens": 32172442.0, + "step": 3600 + }, + { + "epoch": 2.736322188449848, + "grad_norm": 1.2474297285079956, + "learning_rate": 1.0549392259417646e-07, + "loss": 0.2967996299266815, + "mean_token_accuracy": 0.887985110282898, + "num_tokens": 32187624.0, + "step": 3601 + }, + { + "epoch": 2.737082066869301, + "grad_norm": 1.4285695552825928, + "learning_rate": 1.0489277642961481e-07, + "loss": 0.2793816924095154, + "mean_token_accuracy": 0.8948850631713867, + "num_tokens": 32199904.0, + "step": 3602 + }, + { + "epoch": 2.737841945288754, + "grad_norm": 1.4096852540969849, + "learning_rate": 1.0429331124350045e-07, + "loss": 0.39516502618789673, + "mean_token_accuracy": 0.8942514657974243, + "num_tokens": 32213145.0, + "step": 3603 + }, + { + "epoch": 2.7386018237082066, + "grad_norm": 1.4818166494369507, + "learning_rate": 1.0369552745656014e-07, + "loss": 0.3851013779640198, + "mean_token_accuracy": 0.8604148626327515, + "num_tokens": 32225576.0, + "step": 3604 + }, + { + "epoch": 2.7393617021276597, + "grad_norm": 2.0186386108398438, + "learning_rate": 1.0309942548834329e-07, + "loss": 0.2715086340904236, + "mean_token_accuracy": 0.9169677495956421, + "num_tokens": 32232808.0, + "step": 3605 + }, + { + "epoch": 2.7401215805471124, + "grad_norm": 2.3498101234436035, + "learning_rate": 1.0250500575721578e-07, + "loss": 0.2616893947124481, + "mean_token_accuracy": 0.9052878618240356, + "num_tokens": 32239209.0, + "step": 3606 + }, + { + "epoch": 2.740881458966565, + "grad_norm": 2.3760416507720947, + "learning_rate": 1.0191226868036419e-07, + "loss": 0.3654823303222656, + "mean_token_accuracy": 0.9066962003707886, + "num_tokens": 32245690.0, + "step": 3607 + }, + { + "epoch": 2.7416413373860182, + "grad_norm": 1.9187121391296387, + "learning_rate": 1.0132121467379574e-07, + "loss": 0.2764931321144104, + "mean_token_accuracy": 0.9288564920425415, + "num_tokens": 32252804.0, + "step": 3608 + }, + { + "epoch": 2.7424012158054714, + "grad_norm": 2.57564115524292, + "learning_rate": 1.0073184415233334e-07, + "loss": 0.2813187837600708, + "mean_token_accuracy": 0.890303909778595, + "num_tokens": 32258534.0, + "step": 3609 + }, + { + "epoch": 2.743161094224924, + "grad_norm": 2.0758004188537598, + "learning_rate": 1.0014415752962081e-07, + "loss": 0.29847270250320435, + "mean_token_accuracy": 0.8947038054466248, + "num_tokens": 32265373.0, + "step": 3610 + }, + { + "epoch": 2.7439209726443767, + "grad_norm": 3.005535840988159, + "learning_rate": 9.955815521811852e-08, + "loss": 0.2781291604042053, + "mean_token_accuracy": 0.899482250213623, + "num_tokens": 32269487.0, + "step": 3611 + }, + { + "epoch": 2.74468085106383, + "grad_norm": 2.131834030151367, + "learning_rate": 9.897383762910606e-08, + "loss": 0.2915271520614624, + "mean_token_accuracy": 0.8984331488609314, + "num_tokens": 32276242.0, + "step": 3612 + }, + { + "epoch": 2.7454407294832825, + "grad_norm": 2.048445463180542, + "learning_rate": 9.839120517267986e-08, + "loss": 0.38389909267425537, + "mean_token_accuracy": 0.8720065951347351, + "num_tokens": 32284956.0, + "step": 3613 + }, + { + "epoch": 2.7462006079027357, + "grad_norm": 3.3529200553894043, + "learning_rate": 9.781025825775392e-08, + "loss": 0.29694801568984985, + "mean_token_accuracy": 0.8991866111755371, + "num_tokens": 32289109.0, + "step": 3614 + }, + { + "epoch": 2.7469604863221884, + "grad_norm": 2.5099470615386963, + "learning_rate": 9.72309972920582e-08, + "loss": 0.2015802264213562, + "mean_token_accuracy": 0.9364612102508545, + "num_tokens": 32294163.0, + "step": 3615 + }, + { + "epoch": 2.7477203647416415, + "grad_norm": 1.7144349813461304, + "learning_rate": 9.665342268214167e-08, + "loss": 0.42185109853744507, + "mean_token_accuracy": 0.8469204902648926, + "num_tokens": 32304034.0, + "step": 3616 + }, + { + "epoch": 2.748480243161094, + "grad_norm": 1.7306944131851196, + "learning_rate": 9.607753483336812e-08, + "loss": 0.294491708278656, + "mean_token_accuracy": 0.8831486701965332, + "num_tokens": 32314079.0, + "step": 3617 + }, + { + "epoch": 2.749240121580547, + "grad_norm": 1.5339795351028442, + "learning_rate": 9.55033341499173e-08, + "loss": 0.4163019359111786, + "mean_token_accuracy": 0.8496603965759277, + "num_tokens": 32325707.0, + "step": 3618 + }, + { + "epoch": 2.75, + "grad_norm": 1.878015398979187, + "learning_rate": 9.493082103478519e-08, + "loss": 0.2632361650466919, + "mean_token_accuracy": 0.8944116830825806, + "num_tokens": 32333710.0, + "step": 3619 + }, + { + "epoch": 2.750759878419453, + "grad_norm": 1.771299958229065, + "learning_rate": 9.43599958897845e-08, + "loss": 0.3327634334564209, + "mean_token_accuracy": 0.8778671026229858, + "num_tokens": 32343311.0, + "step": 3620 + }, + { + "epoch": 2.751519756838906, + "grad_norm": 1.358282208442688, + "learning_rate": 9.379085911554148e-08, + "loss": 0.3822714686393738, + "mean_token_accuracy": 0.8586339354515076, + "num_tokens": 32361435.0, + "step": 3621 + }, + { + "epoch": 2.7522796352583585, + "grad_norm": 1.9158512353897095, + "learning_rate": 9.322341111149852e-08, + "loss": 0.23024609684944153, + "mean_token_accuracy": 0.9222040176391602, + "num_tokens": 32368371.0, + "step": 3622 + }, + { + "epoch": 2.7530395136778116, + "grad_norm": 1.247992753982544, + "learning_rate": 9.265765227591261e-08, + "loss": 0.3436150550842285, + "mean_token_accuracy": 0.8803039789199829, + "num_tokens": 32388723.0, + "step": 3623 + }, + { + "epoch": 2.7537993920972643, + "grad_norm": 1.913124918937683, + "learning_rate": 9.209358300585474e-08, + "loss": 0.35059863328933716, + "mean_token_accuracy": 0.875072717666626, + "num_tokens": 32397011.0, + "step": 3624 + }, + { + "epoch": 2.7545592705167175, + "grad_norm": 2.487434148788452, + "learning_rate": 9.153120369721047e-08, + "loss": 0.2234063446521759, + "mean_token_accuracy": 0.904019832611084, + "num_tokens": 32402316.0, + "step": 3625 + }, + { + "epoch": 2.75531914893617, + "grad_norm": 2.188255548477173, + "learning_rate": 9.09705147446796e-08, + "loss": 0.19389624893665314, + "mean_token_accuracy": 0.9302033185958862, + "num_tokens": 32408031.0, + "step": 3626 + }, + { + "epoch": 2.7560790273556233, + "grad_norm": 2.892735004425049, + "learning_rate": 9.041151654177488e-08, + "loss": 0.24316613376140594, + "mean_token_accuracy": 0.9222840070724487, + "num_tokens": 32412498.0, + "step": 3627 + }, + { + "epoch": 2.756838905775076, + "grad_norm": 2.6814024448394775, + "learning_rate": 8.985420948082329e-08, + "loss": 0.2725716233253479, + "mean_token_accuracy": 0.9069510698318481, + "num_tokens": 32417717.0, + "step": 3628 + }, + { + "epoch": 2.7575987841945286, + "grad_norm": 2.8956947326660156, + "learning_rate": 8.929859395296365e-08, + "loss": 0.3466540575027466, + "mean_token_accuracy": 0.8771743774414062, + "num_tokens": 32422425.0, + "step": 3629 + }, + { + "epoch": 2.7583586626139818, + "grad_norm": 2.393306016921997, + "learning_rate": 8.874467034814816e-08, + "loss": 0.40261518955230713, + "mean_token_accuracy": 0.8902627229690552, + "num_tokens": 32428512.0, + "step": 3630 + }, + { + "epoch": 2.759118541033435, + "grad_norm": 2.201388359069824, + "learning_rate": 8.819243905514308e-08, + "loss": 0.28923481702804565, + "mean_token_accuracy": 0.8948091268539429, + "num_tokens": 32434316.0, + "step": 3631 + }, + { + "epoch": 2.7598784194528876, + "grad_norm": 1.9007173776626587, + "learning_rate": 8.764190046152421e-08, + "loss": 0.3775410056114197, + "mean_token_accuracy": 0.8737541437149048, + "num_tokens": 32442785.0, + "step": 3632 + }, + { + "epoch": 2.7606382978723403, + "grad_norm": 1.0914241075515747, + "learning_rate": 8.709305495368137e-08, + "loss": 0.27528852224349976, + "mean_token_accuracy": 0.8981513977050781, + "num_tokens": 32462749.0, + "step": 3633 + }, + { + "epoch": 2.7613981762917934, + "grad_norm": 2.024019718170166, + "learning_rate": 8.654590291681531e-08, + "loss": 0.3178071677684784, + "mean_token_accuracy": 0.8825376033782959, + "num_tokens": 32470041.0, + "step": 3634 + }, + { + "epoch": 2.762158054711246, + "grad_norm": 1.038554072380066, + "learning_rate": 8.600044473493856e-08, + "loss": 0.26435115933418274, + "mean_token_accuracy": 0.9002813100814819, + "num_tokens": 32492633.0, + "step": 3635 + }, + { + "epoch": 2.762917933130699, + "grad_norm": 3.143336057662964, + "learning_rate": 8.545668079087438e-08, + "loss": 0.356077641248703, + "mean_token_accuracy": 0.890540361404419, + "num_tokens": 32497085.0, + "step": 3636 + }, + { + "epoch": 2.763677811550152, + "grad_norm": 1.8176860809326172, + "learning_rate": 8.491461146625774e-08, + "loss": 0.42660102248191833, + "mean_token_accuracy": 0.8467463254928589, + "num_tokens": 32506375.0, + "step": 3637 + }, + { + "epoch": 2.764437689969605, + "grad_norm": 1.7116483449935913, + "learning_rate": 8.437423714153292e-08, + "loss": 0.3794213533401489, + "mean_token_accuracy": 0.8674054145812988, + "num_tokens": 32517443.0, + "step": 3638 + }, + { + "epoch": 2.7651975683890577, + "grad_norm": 3.004796266555786, + "learning_rate": 8.383555819595601e-08, + "loss": 0.3199142515659332, + "mean_token_accuracy": 0.8825819492340088, + "num_tokens": 32527003.0, + "step": 3639 + }, + { + "epoch": 2.7659574468085104, + "grad_norm": 2.6139073371887207, + "learning_rate": 8.329857500759291e-08, + "loss": 0.4262070059776306, + "mean_token_accuracy": 0.8643308281898499, + "num_tokens": 32533227.0, + "step": 3640 + }, + { + "epoch": 2.7667173252279635, + "grad_norm": 1.4850772619247437, + "learning_rate": 8.2763287953318e-08, + "loss": 0.4211199879646301, + "mean_token_accuracy": 0.8522083759307861, + "num_tokens": 32546463.0, + "step": 3641 + }, + { + "epoch": 2.7674772036474167, + "grad_norm": 2.1967451572418213, + "learning_rate": 8.22296974088177e-08, + "loss": 0.32154369354248047, + "mean_token_accuracy": 0.9058319926261902, + "num_tokens": 32554292.0, + "step": 3642 + }, + { + "epoch": 2.7682370820668694, + "grad_norm": 1.4377225637435913, + "learning_rate": 8.169780374858577e-08, + "loss": 0.34665489196777344, + "mean_token_accuracy": 0.8763554096221924, + "num_tokens": 32567357.0, + "step": 3643 + }, + { + "epoch": 2.768996960486322, + "grad_norm": 1.8216571807861328, + "learning_rate": 8.116760734592527e-08, + "loss": 0.39765921235084534, + "mean_token_accuracy": 0.8595637679100037, + "num_tokens": 32577681.0, + "step": 3644 + }, + { + "epoch": 2.769756838905775, + "grad_norm": 3.732693672180176, + "learning_rate": 8.063910857294881e-08, + "loss": 0.16449159383773804, + "mean_token_accuracy": 0.9406331777572632, + "num_tokens": 32580792.0, + "step": 3645 + }, + { + "epoch": 2.770516717325228, + "grad_norm": 1.4248076677322388, + "learning_rate": 8.011230780057749e-08, + "loss": 0.43648213148117065, + "mean_token_accuracy": 0.8409627676010132, + "num_tokens": 32596950.0, + "step": 3646 + }, + { + "epoch": 2.771276595744681, + "grad_norm": 1.5802161693572998, + "learning_rate": 7.958720539853971e-08, + "loss": 0.41201114654541016, + "mean_token_accuracy": 0.8678973913192749, + "num_tokens": 32608870.0, + "step": 3647 + }, + { + "epoch": 2.7720364741641337, + "grad_norm": 1.864032506942749, + "learning_rate": 7.906380173537315e-08, + "loss": 0.3839274048805237, + "mean_token_accuracy": 0.863370418548584, + "num_tokens": 32619357.0, + "step": 3648 + }, + { + "epoch": 2.772796352583587, + "grad_norm": 2.0040485858917236, + "learning_rate": 7.854209717842231e-08, + "loss": 0.4682219624519348, + "mean_token_accuracy": 0.8341292142868042, + "num_tokens": 32628659.0, + "step": 3649 + }, + { + "epoch": 2.7735562310030395, + "grad_norm": 3.2517287731170654, + "learning_rate": 7.80220920938396e-08, + "loss": 0.3697377145290375, + "mean_token_accuracy": 0.8937886357307434, + "num_tokens": 32632724.0, + "step": 3650 + }, + { + "epoch": 2.774316109422492, + "grad_norm": 1.437434434890747, + "learning_rate": 7.750378684658444e-08, + "loss": 0.21713104844093323, + "mean_token_accuracy": 0.9223493337631226, + "num_tokens": 32643085.0, + "step": 3651 + }, + { + "epoch": 2.7750759878419453, + "grad_norm": 1.3312400579452515, + "learning_rate": 7.698718180042392e-08, + "loss": 0.3078494668006897, + "mean_token_accuracy": 0.8865747451782227, + "num_tokens": 32657205.0, + "step": 3652 + }, + { + "epoch": 2.7758358662613984, + "grad_norm": 1.3009766340255737, + "learning_rate": 7.647227731793078e-08, + "loss": 0.33374494314193726, + "mean_token_accuracy": 0.8755972385406494, + "num_tokens": 32670785.0, + "step": 3653 + }, + { + "epoch": 2.776595744680851, + "grad_norm": 1.7956385612487793, + "learning_rate": 7.595907376048512e-08, + "loss": 0.3185005486011505, + "mean_token_accuracy": 0.896104097366333, + "num_tokens": 32679376.0, + "step": 3654 + }, + { + "epoch": 2.777355623100304, + "grad_norm": 1.9820408821105957, + "learning_rate": 7.544757148827297e-08, + "loss": 0.34602630138397217, + "mean_token_accuracy": 0.9006669521331787, + "num_tokens": 32687327.0, + "step": 3655 + }, + { + "epoch": 2.778115501519757, + "grad_norm": 1.447498083114624, + "learning_rate": 7.493777086028608e-08, + "loss": 0.29633957147598267, + "mean_token_accuracy": 0.8827477693557739, + "num_tokens": 32698669.0, + "step": 3656 + }, + { + "epoch": 2.7788753799392096, + "grad_norm": 1.195237159729004, + "learning_rate": 7.442967223432212e-08, + "loss": 0.25846078991889954, + "mean_token_accuracy": 0.932551920413971, + "num_tokens": 32713411.0, + "step": 3657 + }, + { + "epoch": 2.7796352583586628, + "grad_norm": 1.4306368827819824, + "learning_rate": 7.392327596698474e-08, + "loss": 0.22794288396835327, + "mean_token_accuracy": 0.9128783941268921, + "num_tokens": 32724629.0, + "step": 3658 + }, + { + "epoch": 2.7803951367781155, + "grad_norm": 3.5105903148651123, + "learning_rate": 7.341858241368182e-08, + "loss": 0.21695205569267273, + "mean_token_accuracy": 0.9189575910568237, + "num_tokens": 32728392.0, + "step": 3659 + }, + { + "epoch": 2.7811550151975686, + "grad_norm": 2.8782589435577393, + "learning_rate": 7.291559192862701e-08, + "loss": 0.3374413847923279, + "mean_token_accuracy": 0.9080451726913452, + "num_tokens": 32733126.0, + "step": 3660 + }, + { + "epoch": 2.7819148936170213, + "grad_norm": 1.9232850074768066, + "learning_rate": 7.24143048648382e-08, + "loss": 0.2707790732383728, + "mean_token_accuracy": 0.9045628309249878, + "num_tokens": 32741378.0, + "step": 3661 + }, + { + "epoch": 2.782674772036474, + "grad_norm": 1.1166657209396362, + "learning_rate": 7.19147215741381e-08, + "loss": 0.2668237090110779, + "mean_token_accuracy": 0.8920862674713135, + "num_tokens": 32760317.0, + "step": 3662 + }, + { + "epoch": 2.783434650455927, + "grad_norm": 3.9177591800689697, + "learning_rate": 7.141684240715374e-08, + "loss": 0.18272298574447632, + "mean_token_accuracy": 0.94575434923172, + "num_tokens": 32763663.0, + "step": 3663 + }, + { + "epoch": 2.78419452887538, + "grad_norm": 1.9616899490356445, + "learning_rate": 7.092066771331507e-08, + "loss": 0.20110884308815002, + "mean_token_accuracy": 0.9169102907180786, + "num_tokens": 32770243.0, + "step": 3664 + }, + { + "epoch": 2.784954407294833, + "grad_norm": 3.5950927734375, + "learning_rate": 7.042619784085741e-08, + "loss": 0.24979421496391296, + "mean_token_accuracy": 0.9095007181167603, + "num_tokens": 32773985.0, + "step": 3665 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 1.8824433088302612, + "learning_rate": 6.993343313681872e-08, + "loss": 0.32540541887283325, + "mean_token_accuracy": 0.8754172921180725, + "num_tokens": 32782040.0, + "step": 3666 + }, + { + "epoch": 2.7864741641337387, + "grad_norm": 1.7720941305160522, + "learning_rate": 6.944237394703985e-08, + "loss": 0.2930932641029358, + "mean_token_accuracy": 0.8913610577583313, + "num_tokens": 32790338.0, + "step": 3667 + }, + { + "epoch": 2.7872340425531914, + "grad_norm": 1.6130414009094238, + "learning_rate": 6.895302061616483e-08, + "loss": 0.35470184683799744, + "mean_token_accuracy": 0.8745495676994324, + "num_tokens": 32801160.0, + "step": 3668 + }, + { + "epoch": 2.7879939209726445, + "grad_norm": 1.315376877784729, + "learning_rate": 6.846537348764116e-08, + "loss": 0.33905792236328125, + "mean_token_accuracy": 0.8629679679870605, + "num_tokens": 32816508.0, + "step": 3669 + }, + { + "epoch": 2.788753799392097, + "grad_norm": 1.9508394002914429, + "learning_rate": 6.797943290371839e-08, + "loss": 0.27722638845443726, + "mean_token_accuracy": 0.8903636932373047, + "num_tokens": 32824029.0, + "step": 3670 + }, + { + "epoch": 2.7895136778115504, + "grad_norm": 0.9335530996322632, + "learning_rate": 6.74951992054479e-08, + "loss": 0.3004249632358551, + "mean_token_accuracy": 0.887278139591217, + "num_tokens": 32849091.0, + "step": 3671 + }, + { + "epoch": 2.790273556231003, + "grad_norm": 1.8353229761123657, + "learning_rate": 6.701267273268392e-08, + "loss": 0.3471749424934387, + "mean_token_accuracy": 0.8823778629302979, + "num_tokens": 32858285.0, + "step": 3672 + }, + { + "epoch": 2.7910334346504557, + "grad_norm": 2.469905138015747, + "learning_rate": 6.653185382408195e-08, + "loss": 0.27492985129356384, + "mean_token_accuracy": 0.898033857345581, + "num_tokens": 32863568.0, + "step": 3673 + }, + { + "epoch": 2.791793313069909, + "grad_norm": 1.861342430114746, + "learning_rate": 6.605274281709929e-08, + "loss": 0.4201383590698242, + "mean_token_accuracy": 0.8511666655540466, + "num_tokens": 32873794.0, + "step": 3674 + }, + { + "epoch": 2.7925531914893615, + "grad_norm": 1.6716010570526123, + "learning_rate": 6.557534004799443e-08, + "loss": 0.31345364451408386, + "mean_token_accuracy": 0.8953241109848022, + "num_tokens": 32883515.0, + "step": 3675 + }, + { + "epoch": 2.7933130699088147, + "grad_norm": 1.566288709640503, + "learning_rate": 6.509964585182688e-08, + "loss": 0.36333587765693665, + "mean_token_accuracy": 0.866706132888794, + "num_tokens": 32895232.0, + "step": 3676 + }, + { + "epoch": 2.7940729483282674, + "grad_norm": 1.5501067638397217, + "learning_rate": 6.462566056245761e-08, + "loss": 0.2846035957336426, + "mean_token_accuracy": 0.9041277766227722, + "num_tokens": 32903854.0, + "step": 3677 + }, + { + "epoch": 2.7948328267477205, + "grad_norm": 2.15285325050354, + "learning_rate": 6.415338451254722e-08, + "loss": 0.35233989357948303, + "mean_token_accuracy": 0.8840795159339905, + "num_tokens": 32911633.0, + "step": 3678 + }, + { + "epoch": 2.795592705167173, + "grad_norm": 1.3108829259872437, + "learning_rate": 6.368281803355692e-08, + "loss": 0.3379764258861542, + "mean_token_accuracy": 0.9114458560943604, + "num_tokens": 32925455.0, + "step": 3679 + }, + { + "epoch": 2.7963525835866263, + "grad_norm": 1.818579912185669, + "learning_rate": 6.321396145574948e-08, + "loss": 0.32847997546195984, + "mean_token_accuracy": 0.8970182538032532, + "num_tokens": 32935029.0, + "step": 3680 + }, + { + "epoch": 2.797112462006079, + "grad_norm": 3.7173373699188232, + "learning_rate": 6.274681510818587e-08, + "loss": 0.18795353174209595, + "mean_token_accuracy": 0.9429396986961365, + "num_tokens": 32938652.0, + "step": 3681 + }, + { + "epoch": 2.797872340425532, + "grad_norm": 2.2997212409973145, + "learning_rate": 6.228137931872713e-08, + "loss": 0.34515100717544556, + "mean_token_accuracy": 0.878103494644165, + "num_tokens": 32945409.0, + "step": 3682 + }, + { + "epoch": 2.798632218844985, + "grad_norm": 2.424675941467285, + "learning_rate": 6.18176544140342e-08, + "loss": 0.2552722990512848, + "mean_token_accuracy": 0.9087961316108704, + "num_tokens": 32950721.0, + "step": 3683 + }, + { + "epoch": 2.7993920972644375, + "grad_norm": 2.662060022354126, + "learning_rate": 6.135564071956729e-08, + "loss": 0.2554262578487396, + "mean_token_accuracy": 0.9034075736999512, + "num_tokens": 32955891.0, + "step": 3684 + }, + { + "epoch": 2.8001519756838906, + "grad_norm": 1.1945017576217651, + "learning_rate": 6.089533855958508e-08, + "loss": 0.36223694682121277, + "mean_token_accuracy": 0.8567380905151367, + "num_tokens": 32971543.0, + "step": 3685 + }, + { + "epoch": 2.8009118541033433, + "grad_norm": 1.2724100351333618, + "learning_rate": 6.043674825714607e-08, + "loss": 0.35224610567092896, + "mean_token_accuracy": 0.8696926832199097, + "num_tokens": 32986452.0, + "step": 3686 + }, + { + "epoch": 2.8016717325227964, + "grad_norm": 1.3042409420013428, + "learning_rate": 5.997987013410533e-08, + "loss": 0.38680803775787354, + "mean_token_accuracy": 0.8600257635116577, + "num_tokens": 33005534.0, + "step": 3687 + }, + { + "epoch": 2.802431610942249, + "grad_norm": 2.448430299758911, + "learning_rate": 5.9524704511118305e-08, + "loss": 0.13345648348331451, + "mean_token_accuracy": 0.9592865705490112, + "num_tokens": 33009403.0, + "step": 3688 + }, + { + "epoch": 2.8031914893617023, + "grad_norm": 1.1455037593841553, + "learning_rate": 5.9071251707638056e-08, + "loss": 0.3144465982913971, + "mean_token_accuracy": 0.8841190338134766, + "num_tokens": 33028129.0, + "step": 3689 + }, + { + "epoch": 2.803951367781155, + "grad_norm": 2.0947425365448, + "learning_rate": 5.861951204191446e-08, + "loss": 0.36041027307510376, + "mean_token_accuracy": 0.8605015873908997, + "num_tokens": 33036379.0, + "step": 3690 + }, + { + "epoch": 2.8047112462006076, + "grad_norm": 3.1552155017852783, + "learning_rate": 5.8169485830996134e-08, + "loss": 0.32727721333503723, + "mean_token_accuracy": 0.9110068678855896, + "num_tokens": 33040276.0, + "step": 3691 + }, + { + "epoch": 2.8054711246200608, + "grad_norm": 2.5555851459503174, + "learning_rate": 5.772117339072902e-08, + "loss": 0.23542895913124084, + "mean_token_accuracy": 0.91229647397995, + "num_tokens": 33045308.0, + "step": 3692 + }, + { + "epoch": 2.806231003039514, + "grad_norm": 2.4970197677612305, + "learning_rate": 5.7274575035755896e-08, + "loss": 0.13501018285751343, + "mean_token_accuracy": 0.9495668411254883, + "num_tokens": 33049012.0, + "step": 3693 + }, + { + "epoch": 2.8069908814589666, + "grad_norm": 3.25179123878479, + "learning_rate": 5.68296910795163e-08, + "loss": 0.39757871627807617, + "mean_token_accuracy": 0.8692524433135986, + "num_tokens": 33053004.0, + "step": 3694 + }, + { + "epoch": 2.8077507598784193, + "grad_norm": 2.4152987003326416, + "learning_rate": 5.6386521834247696e-08, + "loss": 0.3562552332878113, + "mean_token_accuracy": 0.8817118406295776, + "num_tokens": 33059557.0, + "step": 3695 + }, + { + "epoch": 2.8085106382978724, + "grad_norm": 2.051687002182007, + "learning_rate": 5.5945067610982395e-08, + "loss": 0.5281018018722534, + "mean_token_accuracy": 0.8174080848693848, + "num_tokens": 33068691.0, + "step": 3696 + }, + { + "epoch": 2.809270516717325, + "grad_norm": 3.8002891540527344, + "learning_rate": 5.550532871955061e-08, + "loss": 0.20866292715072632, + "mean_token_accuracy": 0.9262990951538086, + "num_tokens": 33072085.0, + "step": 3697 + }, + { + "epoch": 2.810030395136778, + "grad_norm": 2.3774707317352295, + "learning_rate": 5.506730546857797e-08, + "loss": 0.2632027566432953, + "mean_token_accuracy": 0.9251352548599243, + "num_tokens": 33078720.0, + "step": 3698 + }, + { + "epoch": 2.810790273556231, + "grad_norm": 1.3897415399551392, + "learning_rate": 5.463099816548578e-08, + "loss": 0.3936246931552887, + "mean_token_accuracy": 0.8637404441833496, + "num_tokens": 33092660.0, + "step": 3699 + }, + { + "epoch": 2.811550151975684, + "grad_norm": 1.5614900588989258, + "learning_rate": 5.419640711649188e-08, + "loss": 0.44372743368148804, + "mean_token_accuracy": 0.8500189185142517, + "num_tokens": 33104431.0, + "step": 3700 + }, + { + "epoch": 2.8123100303951367, + "grad_norm": 1.466921329498291, + "learning_rate": 5.376353262660811e-08, + "loss": 0.3102647066116333, + "mean_token_accuracy": 0.8741628527641296, + "num_tokens": 33115290.0, + "step": 3701 + }, + { + "epoch": 2.8130699088145894, + "grad_norm": 1.6993112564086914, + "learning_rate": 5.333237499964283e-08, + "loss": 0.4017091989517212, + "mean_token_accuracy": 0.865143358707428, + "num_tokens": 33126710.0, + "step": 3702 + }, + { + "epoch": 2.8138297872340425, + "grad_norm": 2.2112064361572266, + "learning_rate": 5.290293453819956e-08, + "loss": 0.3109806776046753, + "mean_token_accuracy": 0.9097060561180115, + "num_tokens": 33133186.0, + "step": 3703 + }, + { + "epoch": 2.8145896656534957, + "grad_norm": 1.9934327602386475, + "learning_rate": 5.247521154367552e-08, + "loss": 0.35044047236442566, + "mean_token_accuracy": 0.874421238899231, + "num_tokens": 33140329.0, + "step": 3704 + }, + { + "epoch": 2.8153495440729484, + "grad_norm": 2.815687656402588, + "learning_rate": 5.2049206316263366e-08, + "loss": 0.2516332268714905, + "mean_token_accuracy": 0.9180612564086914, + "num_tokens": 33144861.0, + "step": 3705 + }, + { + "epoch": 2.816109422492401, + "grad_norm": 1.7479608058929443, + "learning_rate": 5.162491915495005e-08, + "loss": 0.16342511773109436, + "mean_token_accuracy": 0.9410310983657837, + "num_tokens": 33151936.0, + "step": 3706 + }, + { + "epoch": 2.816869300911854, + "grad_norm": 1.3695951700210571, + "learning_rate": 5.120235035751653e-08, + "loss": 0.2908460199832916, + "mean_token_accuracy": 0.9211517572402954, + "num_tokens": 33164151.0, + "step": 3707 + }, + { + "epoch": 2.817629179331307, + "grad_norm": 2.370861768722534, + "learning_rate": 5.0781500220537797e-08, + "loss": 0.26081186532974243, + "mean_token_accuracy": 0.9090365171432495, + "num_tokens": 33169551.0, + "step": 3708 + }, + { + "epoch": 2.81838905775076, + "grad_norm": 1.627031922340393, + "learning_rate": 5.036236903938285e-08, + "loss": 0.2977932393550873, + "mean_token_accuracy": 0.9078235626220703, + "num_tokens": 33179586.0, + "step": 3709 + }, + { + "epoch": 2.8191489361702127, + "grad_norm": 1.830381155014038, + "learning_rate": 4.9944957108213896e-08, + "loss": 0.2239128053188324, + "mean_token_accuracy": 0.9216980934143066, + "num_tokens": 33186754.0, + "step": 3710 + }, + { + "epoch": 2.819908814589666, + "grad_norm": 2.419703245162964, + "learning_rate": 4.952926471998687e-08, + "loss": 0.3302939832210541, + "mean_token_accuracy": 0.9000803232192993, + "num_tokens": 33192512.0, + "step": 3711 + }, + { + "epoch": 2.8206686930091185, + "grad_norm": 2.2166857719421387, + "learning_rate": 4.911529216645089e-08, + "loss": 0.2880767285823822, + "mean_token_accuracy": 0.9058420658111572, + "num_tokens": 33198274.0, + "step": 3712 + }, + { + "epoch": 2.821428571428571, + "grad_norm": 1.357695460319519, + "learning_rate": 4.8703039738147165e-08, + "loss": 0.38549065589904785, + "mean_token_accuracy": 0.8689560890197754, + "num_tokens": 33213015.0, + "step": 3713 + }, + { + "epoch": 2.8221884498480243, + "grad_norm": 1.3445006608963013, + "learning_rate": 4.829250772441091e-08, + "loss": 0.28673315048217773, + "mean_token_accuracy": 0.8871713876724243, + "num_tokens": 33226895.0, + "step": 3714 + }, + { + "epoch": 2.8229483282674774, + "grad_norm": 2.043430805206299, + "learning_rate": 4.788369641336943e-08, + "loss": 0.27235424518585205, + "mean_token_accuracy": 0.9001829624176025, + "num_tokens": 33233991.0, + "step": 3715 + }, + { + "epoch": 2.82370820668693, + "grad_norm": 3.290034294128418, + "learning_rate": 4.7476606091941544e-08, + "loss": 0.3277619481086731, + "mean_token_accuracy": 0.9064863324165344, + "num_tokens": 33238393.0, + "step": 3716 + }, + { + "epoch": 2.824468085106383, + "grad_norm": 3.1663918495178223, + "learning_rate": 4.707123704583927e-08, + "loss": 0.2841528058052063, + "mean_token_accuracy": 0.9187209606170654, + "num_tokens": 33242428.0, + "step": 3717 + }, + { + "epoch": 2.825227963525836, + "grad_norm": 1.2812966108322144, + "learning_rate": 4.6667589559566405e-08, + "loss": 0.4020092785358429, + "mean_token_accuracy": 0.8751412630081177, + "num_tokens": 33257996.0, + "step": 3718 + }, + { + "epoch": 2.8259878419452886, + "grad_norm": 1.4390029907226562, + "learning_rate": 4.626566391641774e-08, + "loss": 0.44845378398895264, + "mean_token_accuracy": 0.8416492938995361, + "num_tokens": 33271661.0, + "step": 3719 + }, + { + "epoch": 2.8267477203647418, + "grad_norm": 1.5283807516098022, + "learning_rate": 4.586546039848094e-08, + "loss": 0.28856372833251953, + "mean_token_accuracy": 0.8961426019668579, + "num_tokens": 33282969.0, + "step": 3720 + }, + { + "epoch": 2.8275075987841944, + "grad_norm": 1.5666929483413696, + "learning_rate": 4.546697928663357e-08, + "loss": 0.3489445149898529, + "mean_token_accuracy": 0.8704522848129272, + "num_tokens": 33293549.0, + "step": 3721 + }, + { + "epoch": 2.8282674772036476, + "grad_norm": 1.6343169212341309, + "learning_rate": 4.5070220860545244e-08, + "loss": 0.3505254089832306, + "mean_token_accuracy": 0.8735896348953247, + "num_tokens": 33304821.0, + "step": 3722 + }, + { + "epoch": 2.8290273556231003, + "grad_norm": 1.963257074356079, + "learning_rate": 4.467518539867655e-08, + "loss": 0.3180759847164154, + "mean_token_accuracy": 0.8902066946029663, + "num_tokens": 33312313.0, + "step": 3723 + }, + { + "epoch": 2.829787234042553, + "grad_norm": 3.3562021255493164, + "learning_rate": 4.428187317827848e-08, + "loss": 0.23085635900497437, + "mean_token_accuracy": 0.9242620468139648, + "num_tokens": 33315831.0, + "step": 3724 + }, + { + "epoch": 2.830547112462006, + "grad_norm": 1.7402317523956299, + "learning_rate": 4.3890284475392175e-08, + "loss": 0.27766430377960205, + "mean_token_accuracy": 0.8943138122558594, + "num_tokens": 33324982.0, + "step": 3725 + }, + { + "epoch": 2.831306990881459, + "grad_norm": 1.6835107803344727, + "learning_rate": 4.350041956485029e-08, + "loss": 0.35358738899230957, + "mean_token_accuracy": 0.8683137893676758, + "num_tokens": 33334979.0, + "step": 3726 + }, + { + "epoch": 2.832066869300912, + "grad_norm": 2.232856035232544, + "learning_rate": 4.311227872027479e-08, + "loss": 0.3305876851081848, + "mean_token_accuracy": 0.885346531867981, + "num_tokens": 33341951.0, + "step": 3727 + }, + { + "epoch": 2.8328267477203646, + "grad_norm": 1.763230800628662, + "learning_rate": 4.272586221407776e-08, + "loss": 0.3677369952201843, + "mean_token_accuracy": 0.8810771703720093, + "num_tokens": 33351110.0, + "step": 3728 + }, + { + "epoch": 2.8335866261398177, + "grad_norm": 1.3161970376968384, + "learning_rate": 4.2341170317461433e-08, + "loss": 0.4191834628582001, + "mean_token_accuracy": 0.8625809550285339, + "num_tokens": 33368231.0, + "step": 3729 + }, + { + "epoch": 2.8343465045592704, + "grad_norm": 2.151383399963379, + "learning_rate": 4.1958203300417056e-08, + "loss": 0.30521994829177856, + "mean_token_accuracy": 0.8904989957809448, + "num_tokens": 33374755.0, + "step": 3730 + }, + { + "epoch": 2.8351063829787235, + "grad_norm": 1.2751890420913696, + "learning_rate": 4.1576961431726016e-08, + "loss": 0.2024286538362503, + "mean_token_accuracy": 0.9254995584487915, + "num_tokens": 33385820.0, + "step": 3731 + }, + { + "epoch": 2.835866261398176, + "grad_norm": 2.5229005813598633, + "learning_rate": 4.119744497895817e-08, + "loss": 0.2631904184818268, + "mean_token_accuracy": 0.9213854074478149, + "num_tokens": 33390577.0, + "step": 3732 + }, + { + "epoch": 2.8366261398176293, + "grad_norm": 1.3829402923583984, + "learning_rate": 4.0819654208472947e-08, + "loss": 0.3373589813709259, + "mean_token_accuracy": 0.8810330629348755, + "num_tokens": 33404300.0, + "step": 3733 + }, + { + "epoch": 2.837386018237082, + "grad_norm": 1.395129919052124, + "learning_rate": 4.044358938541853e-08, + "loss": 0.27040547132492065, + "mean_token_accuracy": 0.8935626745223999, + "num_tokens": 33418071.0, + "step": 3734 + }, + { + "epoch": 2.8381458966565347, + "grad_norm": 2.4185354709625244, + "learning_rate": 4.006925077373158e-08, + "loss": 0.2641582489013672, + "mean_token_accuracy": 0.9196245670318604, + "num_tokens": 33423213.0, + "step": 3735 + }, + { + "epoch": 2.838905775075988, + "grad_norm": 1.9432255029678345, + "learning_rate": 3.969663863613721e-08, + "loss": 0.31337353587150574, + "mean_token_accuracy": 0.886800229549408, + "num_tokens": 33432442.0, + "step": 3736 + }, + { + "epoch": 2.839665653495441, + "grad_norm": 1.1473867893218994, + "learning_rate": 3.9325753234149276e-08, + "loss": 0.3156060576438904, + "mean_token_accuracy": 0.8809531331062317, + "num_tokens": 33452184.0, + "step": 3737 + }, + { + "epoch": 2.8404255319148937, + "grad_norm": 2.233121633529663, + "learning_rate": 3.8956594828069295e-08, + "loss": 0.31154608726501465, + "mean_token_accuracy": 0.8883147239685059, + "num_tokens": 33458643.0, + "step": 3738 + }, + { + "epoch": 2.8411854103343464, + "grad_norm": 2.165466070175171, + "learning_rate": 3.8589163676986674e-08, + "loss": 0.38480815291404724, + "mean_token_accuracy": 0.8609665036201477, + "num_tokens": 33466465.0, + "step": 3739 + }, + { + "epoch": 2.8419452887537995, + "grad_norm": 3.5072174072265625, + "learning_rate": 3.822346003877875e-08, + "loss": 0.45201557874679565, + "mean_token_accuracy": 0.8519665002822876, + "num_tokens": 33470826.0, + "step": 3740 + }, + { + "epoch": 2.842705167173252, + "grad_norm": 2.2038586139678955, + "learning_rate": 3.785948417011076e-08, + "loss": 0.34780675172805786, + "mean_token_accuracy": 0.8806177377700806, + "num_tokens": 33478706.0, + "step": 3741 + }, + { + "epoch": 2.8434650455927053, + "grad_norm": 1.8423243761062622, + "learning_rate": 3.749723632643476e-08, + "loss": 0.2681577205657959, + "mean_token_accuracy": 0.9055651426315308, + "num_tokens": 33486200.0, + "step": 3742 + }, + { + "epoch": 2.844224924012158, + "grad_norm": 1.3372201919555664, + "learning_rate": 3.713671676199016e-08, + "loss": 0.3277212381362915, + "mean_token_accuracy": 0.8801225423812866, + "num_tokens": 33499465.0, + "step": 3743 + }, + { + "epoch": 2.844984802431611, + "grad_norm": 2.303901195526123, + "learning_rate": 3.677792572980371e-08, + "loss": 0.2349717617034912, + "mean_token_accuracy": 0.9109916090965271, + "num_tokens": 33505491.0, + "step": 3744 + }, + { + "epoch": 2.845744680851064, + "grad_norm": 2.1374688148498535, + "learning_rate": 3.642086348168844e-08, + "loss": 0.3567136526107788, + "mean_token_accuracy": 0.8669205904006958, + "num_tokens": 33512665.0, + "step": 3745 + }, + { + "epoch": 2.8465045592705165, + "grad_norm": 3.476426362991333, + "learning_rate": 3.6065530268244445e-08, + "loss": 0.3189643621444702, + "mean_token_accuracy": 0.882624626159668, + "num_tokens": 33516449.0, + "step": 3746 + }, + { + "epoch": 2.8472644376899696, + "grad_norm": 1.094572901725769, + "learning_rate": 3.5711926338858335e-08, + "loss": 0.25354239344596863, + "mean_token_accuracy": 0.9008959531784058, + "num_tokens": 33536298.0, + "step": 3747 + }, + { + "epoch": 2.8480243161094227, + "grad_norm": 1.375033974647522, + "learning_rate": 3.536005194170328e-08, + "loss": 0.2859119772911072, + "mean_token_accuracy": 0.8998885154724121, + "num_tokens": 33548861.0, + "step": 3748 + }, + { + "epoch": 2.8487841945288754, + "grad_norm": 1.96660578250885, + "learning_rate": 3.5009907323737826e-08, + "loss": 0.35728299617767334, + "mean_token_accuracy": 0.8976923227310181, + "num_tokens": 33556270.0, + "step": 3749 + }, + { + "epoch": 2.849544072948328, + "grad_norm": 2.8434062004089355, + "learning_rate": 3.466149273070707e-08, + "loss": 0.25592705607414246, + "mean_token_accuracy": 0.9228044748306274, + "num_tokens": 33560603.0, + "step": 3750 + }, + { + "epoch": 2.8503039513677813, + "grad_norm": 2.7658159732818604, + "learning_rate": 3.431480840714152e-08, + "loss": 0.33110958337783813, + "mean_token_accuracy": 0.8761162161827087, + "num_tokens": 33565428.0, + "step": 3751 + }, + { + "epoch": 2.851063829787234, + "grad_norm": 3.696040391921997, + "learning_rate": 3.396985459635821e-08, + "loss": 0.29301607608795166, + "mean_token_accuracy": 0.9034254550933838, + "num_tokens": 33568866.0, + "step": 3752 + }, + { + "epoch": 2.851823708206687, + "grad_norm": 1.8923646211624146, + "learning_rate": 3.3626631540458754e-08, + "loss": 0.3817586600780487, + "mean_token_accuracy": 0.8635997772216797, + "num_tokens": 33578141.0, + "step": 3753 + }, + { + "epoch": 2.8525835866261398, + "grad_norm": 1.6717027425765991, + "learning_rate": 3.328513948032991e-08, + "loss": 0.37302929162979126, + "mean_token_accuracy": 0.8670454025268555, + "num_tokens": 33588694.0, + "step": 3754 + }, + { + "epoch": 2.853343465045593, + "grad_norm": 3.2549097537994385, + "learning_rate": 3.29453786556444e-08, + "loss": 0.27366238832473755, + "mean_token_accuracy": 0.9079047441482544, + "num_tokens": 33592813.0, + "step": 3755 + }, + { + "epoch": 2.8541033434650456, + "grad_norm": 1.3533412218093872, + "learning_rate": 3.260734930485926e-08, + "loss": 0.4412471652030945, + "mean_token_accuracy": 0.839799165725708, + "num_tokens": 33609765.0, + "step": 3756 + }, + { + "epoch": 2.8548632218844983, + "grad_norm": 2.876262903213501, + "learning_rate": 3.227105166521638e-08, + "loss": 0.3382536768913269, + "mean_token_accuracy": 0.879544734954834, + "num_tokens": 33614131.0, + "step": 3757 + }, + { + "epoch": 2.8556231003039514, + "grad_norm": 1.9969818592071533, + "learning_rate": 3.193648597274279e-08, + "loss": 0.24406743049621582, + "mean_token_accuracy": 0.9072264432907104, + "num_tokens": 33621867.0, + "step": 3758 + }, + { + "epoch": 2.8563829787234045, + "grad_norm": 2.934230089187622, + "learning_rate": 3.1603652462249e-08, + "loss": 0.0893428698182106, + "mean_token_accuracy": 0.9702994227409363, + "num_tokens": 33625133.0, + "step": 3759 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 3.162353038787842, + "learning_rate": 3.127255136733093e-08, + "loss": 0.2535284459590912, + "mean_token_accuracy": 0.8997728824615479, + "num_tokens": 33629391.0, + "step": 3760 + }, + { + "epoch": 2.85790273556231, + "grad_norm": 1.3975396156311035, + "learning_rate": 3.094318292036824e-08, + "loss": 0.37006449699401855, + "mean_token_accuracy": 0.8666602373123169, + "num_tokens": 33644831.0, + "step": 3761 + }, + { + "epoch": 2.858662613981763, + "grad_norm": 1.4016542434692383, + "learning_rate": 3.061554735252325e-08, + "loss": 0.33619073033332825, + "mean_token_accuracy": 0.8836570978164673, + "num_tokens": 33660918.0, + "step": 3762 + }, + { + "epoch": 2.8594224924012157, + "grad_norm": 1.3213437795639038, + "learning_rate": 3.028964489374453e-08, + "loss": 0.29083719849586487, + "mean_token_accuracy": 0.9077234864234924, + "num_tokens": 33672778.0, + "step": 3763 + }, + { + "epoch": 2.860182370820669, + "grad_norm": 2.157179594039917, + "learning_rate": 2.9965475772762154e-08, + "loss": 0.35480785369873047, + "mean_token_accuracy": 0.8937191367149353, + "num_tokens": 33680991.0, + "step": 3764 + }, + { + "epoch": 2.8609422492401215, + "grad_norm": 3.584878921508789, + "learning_rate": 2.96430402170908e-08, + "loss": 0.34448280930519104, + "mean_token_accuracy": 0.878994345664978, + "num_tokens": 33685137.0, + "step": 3765 + }, + { + "epoch": 2.8617021276595747, + "grad_norm": 1.7320963144302368, + "learning_rate": 2.9322338453028066e-08, + "loss": 0.40042293071746826, + "mean_token_accuracy": 0.8563319444656372, + "num_tokens": 33694591.0, + "step": 3766 + }, + { + "epoch": 2.8624620060790273, + "grad_norm": 1.6684232950210571, + "learning_rate": 2.900337070565473e-08, + "loss": 0.4402884542942047, + "mean_token_accuracy": 0.8600190877914429, + "num_tokens": 33708467.0, + "step": 3767 + }, + { + "epoch": 2.86322188449848, + "grad_norm": 1.9484777450561523, + "learning_rate": 2.8686137198834784e-08, + "loss": 0.2297988086938858, + "mean_token_accuracy": 0.9216253161430359, + "num_tokens": 33715825.0, + "step": 3768 + }, + { + "epoch": 2.863981762917933, + "grad_norm": 1.4594624042510986, + "learning_rate": 2.8370638155215125e-08, + "loss": 0.2471354901790619, + "mean_token_accuracy": 0.9343935251235962, + "num_tokens": 33726774.0, + "step": 3769 + }, + { + "epoch": 2.8647416413373863, + "grad_norm": 1.75857412815094, + "learning_rate": 2.805687379622446e-08, + "loss": 0.3599606156349182, + "mean_token_accuracy": 0.8600481748580933, + "num_tokens": 33738487.0, + "step": 3770 + }, + { + "epoch": 2.865501519756839, + "grad_norm": 2.5933029651641846, + "learning_rate": 2.774484434207525e-08, + "loss": 0.3705040514469147, + "mean_token_accuracy": 0.8960624933242798, + "num_tokens": 33743954.0, + "step": 3771 + }, + { + "epoch": 2.8662613981762917, + "grad_norm": 2.339298963546753, + "learning_rate": 2.7434550011761763e-08, + "loss": 0.35568612813949585, + "mean_token_accuracy": 0.8733487725257874, + "num_tokens": 33750214.0, + "step": 3772 + }, + { + "epoch": 2.867021276595745, + "grad_norm": 2.2959485054016113, + "learning_rate": 2.712599102306035e-08, + "loss": 0.2672561705112457, + "mean_token_accuracy": 0.9030044078826904, + "num_tokens": 33756736.0, + "step": 3773 + }, + { + "epoch": 2.8677811550151975, + "grad_norm": 1.500349521636963, + "learning_rate": 2.681916759252917e-08, + "loss": 0.41401299834251404, + "mean_token_accuracy": 0.8844438195228577, + "num_tokens": 33769268.0, + "step": 3774 + }, + { + "epoch": 2.8685410334346506, + "grad_norm": 1.7837727069854736, + "learning_rate": 2.6514079935509586e-08, + "loss": 0.2668437957763672, + "mean_token_accuracy": 0.8956533670425415, + "num_tokens": 33777122.0, + "step": 3775 + }, + { + "epoch": 2.8693009118541033, + "grad_norm": 1.717192530632019, + "learning_rate": 2.6210728266123364e-08, + "loss": 0.25972551107406616, + "mean_token_accuracy": 0.883383572101593, + "num_tokens": 33785044.0, + "step": 3776 + }, + { + "epoch": 2.8700607902735564, + "grad_norm": 1.9367283582687378, + "learning_rate": 2.5909112797274093e-08, + "loss": 0.44500526785850525, + "mean_token_accuracy": 0.8556182980537415, + "num_tokens": 33794610.0, + "step": 3777 + }, + { + "epoch": 2.870820668693009, + "grad_norm": 1.4821012020111084, + "learning_rate": 2.560923374064772e-08, + "loss": 0.3385273218154907, + "mean_token_accuracy": 0.873454749584198, + "num_tokens": 33807602.0, + "step": 3778 + }, + { + "epoch": 2.871580547112462, + "grad_norm": 2.105130195617676, + "learning_rate": 2.531109130671061e-08, + "loss": 0.2996317446231842, + "mean_token_accuracy": 0.8943172693252563, + "num_tokens": 33814280.0, + "step": 3779 + }, + { + "epoch": 2.872340425531915, + "grad_norm": 2.1374971866607666, + "learning_rate": 2.501468570471066e-08, + "loss": 0.3201690912246704, + "mean_token_accuracy": 0.8778494596481323, + "num_tokens": 33821842.0, + "step": 3780 + }, + { + "epoch": 2.8731003039513676, + "grad_norm": 2.2370989322662354, + "learning_rate": 2.4720017142676745e-08, + "loss": 0.4030833840370178, + "mean_token_accuracy": 0.8520782589912415, + "num_tokens": 33830051.0, + "step": 3781 + }, + { + "epoch": 2.8738601823708207, + "grad_norm": 2.3659868240356445, + "learning_rate": 2.4427085827418706e-08, + "loss": 0.2570466697216034, + "mean_token_accuracy": 0.9111968874931335, + "num_tokens": 33835753.0, + "step": 3782 + }, + { + "epoch": 2.8746200607902734, + "grad_norm": 2.252115249633789, + "learning_rate": 2.4135891964526535e-08, + "loss": 0.373632550239563, + "mean_token_accuracy": 0.8691182136535645, + "num_tokens": 33842183.0, + "step": 3783 + }, + { + "epoch": 2.8753799392097266, + "grad_norm": 1.216013789176941, + "learning_rate": 2.3846435758372034e-08, + "loss": 0.3572605848312378, + "mean_token_accuracy": 0.8590090274810791, + "num_tokens": 33860538.0, + "step": 3784 + }, + { + "epoch": 2.8761398176291793, + "grad_norm": 2.739243268966675, + "learning_rate": 2.3558717412106025e-08, + "loss": 0.3257160782814026, + "mean_token_accuracy": 0.8806333541870117, + "num_tokens": 33866134.0, + "step": 3785 + }, + { + "epoch": 2.8768996960486324, + "grad_norm": 1.683767557144165, + "learning_rate": 2.3272737127660595e-08, + "loss": 0.3267333507537842, + "mean_token_accuracy": 0.9005235433578491, + "num_tokens": 33875630.0, + "step": 3786 + }, + { + "epoch": 2.877659574468085, + "grad_norm": 1.4830154180526733, + "learning_rate": 2.2988495105748245e-08, + "loss": 0.28507307171821594, + "mean_token_accuracy": 0.9133665561676025, + "num_tokens": 33887110.0, + "step": 3787 + }, + { + "epoch": 2.878419452887538, + "grad_norm": 1.7522467374801636, + "learning_rate": 2.2705991545859953e-08, + "loss": 0.45354849100112915, + "mean_token_accuracy": 0.8899869918823242, + "num_tokens": 33898735.0, + "step": 3788 + }, + { + "epoch": 2.879179331306991, + "grad_norm": 2.4311375617980957, + "learning_rate": 2.242522664626823e-08, + "loss": 0.3417064845561981, + "mean_token_accuracy": 0.8724955320358276, + "num_tokens": 33906031.0, + "step": 3789 + }, + { + "epoch": 2.8799392097264436, + "grad_norm": 2.44846510887146, + "learning_rate": 2.2146200604024614e-08, + "loss": 0.3186315596103668, + "mean_token_accuracy": 0.8888083696365356, + "num_tokens": 33911248.0, + "step": 3790 + }, + { + "epoch": 2.8806990881458967, + "grad_norm": 1.4528448581695557, + "learning_rate": 2.1868913614959963e-08, + "loss": 0.34161821007728577, + "mean_token_accuracy": 0.87728351354599, + "num_tokens": 33923786.0, + "step": 3791 + }, + { + "epoch": 2.8814589665653494, + "grad_norm": 1.8418529033660889, + "learning_rate": 2.1593365873685544e-08, + "loss": 0.2751237452030182, + "mean_token_accuracy": 0.9060730338096619, + "num_tokens": 33930983.0, + "step": 3792 + }, + { + "epoch": 2.8822188449848025, + "grad_norm": 1.5349152088165283, + "learning_rate": 2.131955757359111e-08, + "loss": 0.31487759947776794, + "mean_token_accuracy": 0.8839719891548157, + "num_tokens": 33942600.0, + "step": 3793 + }, + { + "epoch": 2.882978723404255, + "grad_norm": 2.317296266555786, + "learning_rate": 2.1047488906845715e-08, + "loss": 0.22481049597263336, + "mean_token_accuracy": 0.9269076585769653, + "num_tokens": 33947548.0, + "step": 3794 + }, + { + "epoch": 2.8837386018237083, + "grad_norm": 1.9512174129486084, + "learning_rate": 2.0777160064397727e-08, + "loss": 0.33469653129577637, + "mean_token_accuracy": 0.8800324201583862, + "num_tokens": 33955699.0, + "step": 3795 + }, + { + "epoch": 2.884498480243161, + "grad_norm": 1.3140486478805542, + "learning_rate": 2.050857123597455e-08, + "loss": 0.3801634609699249, + "mean_token_accuracy": 0.8677546977996826, + "num_tokens": 33972033.0, + "step": 3796 + }, + { + "epoch": 2.8852583586626137, + "grad_norm": 2.4413559436798096, + "learning_rate": 2.024172261008178e-08, + "loss": 0.4444601535797119, + "mean_token_accuracy": 0.8535408973693848, + "num_tokens": 33978859.0, + "step": 3797 + }, + { + "epoch": 2.886018237082067, + "grad_norm": 1.8970952033996582, + "learning_rate": 1.997661437400461e-08, + "loss": 0.29712194204330444, + "mean_token_accuracy": 0.8958410024642944, + "num_tokens": 33988416.0, + "step": 3798 + }, + { + "epoch": 2.88677811550152, + "grad_norm": 1.4225033521652222, + "learning_rate": 1.9713246713805588e-08, + "loss": 0.2251742035150528, + "mean_token_accuracy": 0.9059432744979858, + "num_tokens": 33998579.0, + "step": 3799 + }, + { + "epoch": 2.8875379939209727, + "grad_norm": 3.485994338989258, + "learning_rate": 1.9451619814326307e-08, + "loss": 0.2449614405632019, + "mean_token_accuracy": 0.9136157035827637, + "num_tokens": 34002108.0, + "step": 3800 + }, + { + "epoch": 2.8882978723404253, + "grad_norm": 1.7383781671524048, + "learning_rate": 1.91917338591871e-08, + "loss": 0.3420751690864563, + "mean_token_accuracy": 0.8810985088348389, + "num_tokens": 34010102.0, + "step": 3801 + }, + { + "epoch": 2.8890577507598785, + "grad_norm": 2.587632894515991, + "learning_rate": 1.893358903078568e-08, + "loss": 0.38646167516708374, + "mean_token_accuracy": 0.8570578098297119, + "num_tokens": 34016684.0, + "step": 3802 + }, + { + "epoch": 2.889817629179331, + "grad_norm": 1.2580358982086182, + "learning_rate": 1.867718551029768e-08, + "loss": 0.23658394813537598, + "mean_token_accuracy": 0.9092692136764526, + "num_tokens": 34029808.0, + "step": 3803 + }, + { + "epoch": 2.8905775075987843, + "grad_norm": 1.808404803276062, + "learning_rate": 1.842252347767748e-08, + "loss": 0.2760203778743744, + "mean_token_accuracy": 0.8876132965087891, + "num_tokens": 34038138.0, + "step": 3804 + }, + { + "epoch": 2.891337386018237, + "grad_norm": 1.6140836477279663, + "learning_rate": 1.8169603111656554e-08, + "loss": 0.3449614346027374, + "mean_token_accuracy": 0.8777539730072021, + "num_tokens": 34048093.0, + "step": 3805 + }, + { + "epoch": 2.89209726443769, + "grad_norm": 3.784487724304199, + "learning_rate": 1.791842458974402e-08, + "loss": 0.3181925415992737, + "mean_token_accuracy": 0.8902693390846252, + "num_tokens": 34051903.0, + "step": 3806 + }, + { + "epoch": 2.892857142857143, + "grad_norm": 1.726521372795105, + "learning_rate": 1.7668988088226922e-08, + "loss": 0.3940914273262024, + "mean_token_accuracy": 0.8877660036087036, + "num_tokens": 34062433.0, + "step": 3807 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 2.1862759590148926, + "learning_rate": 1.7421293782168837e-08, + "loss": 0.2806234061717987, + "mean_token_accuracy": 0.9004480838775635, + "num_tokens": 34068835.0, + "step": 3808 + }, + { + "epoch": 2.8943768996960486, + "grad_norm": 1.8618063926696777, + "learning_rate": 1.717534184541153e-08, + "loss": 0.3391259014606476, + "mean_token_accuracy": 0.8807502388954163, + "num_tokens": 34078044.0, + "step": 3809 + }, + { + "epoch": 2.8951367781155017, + "grad_norm": 2.19085431098938, + "learning_rate": 1.6931132450573873e-08, + "loss": 0.34228384494781494, + "mean_token_accuracy": 0.8653440475463867, + "num_tokens": 34084925.0, + "step": 3810 + }, + { + "epoch": 2.8958966565349544, + "grad_norm": 2.0328660011291504, + "learning_rate": 1.6688665769050704e-08, + "loss": 0.3773893117904663, + "mean_token_accuracy": 0.8646367788314819, + "num_tokens": 34092740.0, + "step": 3811 + }, + { + "epoch": 2.896656534954407, + "grad_norm": 2.489732265472412, + "learning_rate": 1.644794197101507e-08, + "loss": 0.2722119688987732, + "mean_token_accuracy": 0.9241745471954346, + "num_tokens": 34097475.0, + "step": 3812 + }, + { + "epoch": 2.8974164133738602, + "grad_norm": 2.709529161453247, + "learning_rate": 1.620896122541571e-08, + "loss": 0.2608666718006134, + "mean_token_accuracy": 0.9132722020149231, + "num_tokens": 34101961.0, + "step": 3813 + }, + { + "epoch": 2.898176291793313, + "grad_norm": 2.089813709259033, + "learning_rate": 1.5971723699979015e-08, + "loss": 0.3234292268753052, + "mean_token_accuracy": 0.9032332897186279, + "num_tokens": 34109427.0, + "step": 3814 + }, + { + "epoch": 2.898936170212766, + "grad_norm": 1.3891119956970215, + "learning_rate": 1.5736229561207072e-08, + "loss": 0.2506135404109955, + "mean_token_accuracy": 0.8997396230697632, + "num_tokens": 34121770.0, + "step": 3815 + }, + { + "epoch": 2.8996960486322187, + "grad_norm": 1.9386579990386963, + "learning_rate": 1.5502478974378788e-08, + "loss": 0.29841434955596924, + "mean_token_accuracy": 0.8915755748748779, + "num_tokens": 34130111.0, + "step": 3816 + }, + { + "epoch": 2.900455927051672, + "grad_norm": 1.601960301399231, + "learning_rate": 1.5270472103549317e-08, + "loss": 0.34736987948417664, + "mean_token_accuracy": 0.876467227935791, + "num_tokens": 34142053.0, + "step": 3817 + }, + { + "epoch": 2.9012158054711246, + "grad_norm": 2.42319393157959, + "learning_rate": 1.5040209111550075e-08, + "loss": 0.24774286150932312, + "mean_token_accuracy": 0.9127346873283386, + "num_tokens": 34146627.0, + "step": 3818 + }, + { + "epoch": 2.9019756838905773, + "grad_norm": 2.711033582687378, + "learning_rate": 1.4811690159988456e-08, + "loss": 0.30365103483200073, + "mean_token_accuracy": 0.8981214165687561, + "num_tokens": 34151735.0, + "step": 3819 + }, + { + "epoch": 2.9027355623100304, + "grad_norm": 3.105949640274048, + "learning_rate": 1.4584915409248113e-08, + "loss": 0.38369080424308777, + "mean_token_accuracy": 0.8762385845184326, + "num_tokens": 34156484.0, + "step": 3820 + }, + { + "epoch": 2.9034954407294835, + "grad_norm": 1.9705839157104492, + "learning_rate": 1.435988501848784e-08, + "loss": 0.33529043197631836, + "mean_token_accuracy": 0.8921652436256409, + "num_tokens": 34164241.0, + "step": 3821 + }, + { + "epoch": 2.904255319148936, + "grad_norm": 2.084878921508789, + "learning_rate": 1.413659914564297e-08, + "loss": 0.24922935664653778, + "mean_token_accuracy": 0.9262560606002808, + "num_tokens": 34169898.0, + "step": 3822 + }, + { + "epoch": 2.905015197568389, + "grad_norm": 1.593758225440979, + "learning_rate": 1.3915057947423705e-08, + "loss": 0.3691917657852173, + "mean_token_accuracy": 0.8785613775253296, + "num_tokens": 34181419.0, + "step": 3823 + }, + { + "epoch": 2.905775075987842, + "grad_norm": 1.772596001625061, + "learning_rate": 1.3695261579316776e-08, + "loss": 0.358150839805603, + "mean_token_accuracy": 0.8747056722640991, + "num_tokens": 34190872.0, + "step": 3824 + }, + { + "epoch": 2.9065349544072947, + "grad_norm": 2.1670494079589844, + "learning_rate": 1.3477210195583234e-08, + "loss": 0.30586451292037964, + "mean_token_accuracy": 0.8851495981216431, + "num_tokens": 34197353.0, + "step": 3825 + }, + { + "epoch": 2.907294832826748, + "grad_norm": 2.7168121337890625, + "learning_rate": 1.3260903949260107e-08, + "loss": 0.2924152612686157, + "mean_token_accuracy": 0.8947597146034241, + "num_tokens": 34201889.0, + "step": 3826 + }, + { + "epoch": 2.9080547112462005, + "grad_norm": 1.576528787612915, + "learning_rate": 1.3046342992159567e-08, + "loss": 0.33903738856315613, + "mean_token_accuracy": 0.8710857033729553, + "num_tokens": 34212640.0, + "step": 3827 + }, + { + "epoch": 2.9088145896656536, + "grad_norm": 1.3831605911254883, + "learning_rate": 1.2833527474868657e-08, + "loss": 0.2891062796115875, + "mean_token_accuracy": 0.8909540176391602, + "num_tokens": 34223917.0, + "step": 3828 + }, + { + "epoch": 2.9095744680851063, + "grad_norm": 2.075225353240967, + "learning_rate": 1.2622457546749567e-08, + "loss": 0.14886733889579773, + "mean_token_accuracy": 0.9509548544883728, + "num_tokens": 34228609.0, + "step": 3829 + }, + { + "epoch": 2.910334346504559, + "grad_norm": 2.658463478088379, + "learning_rate": 1.2413133355939356e-08, + "loss": 0.13472142815589905, + "mean_token_accuracy": 0.957228422164917, + "num_tokens": 34232011.0, + "step": 3830 + }, + { + "epoch": 2.911094224924012, + "grad_norm": 1.8684933185577393, + "learning_rate": 1.2205555049349394e-08, + "loss": 0.13954663276672363, + "mean_token_accuracy": 0.953221321105957, + "num_tokens": 34237643.0, + "step": 3831 + }, + { + "epoch": 2.9118541033434653, + "grad_norm": 1.799784779548645, + "learning_rate": 1.1999722772666478e-08, + "loss": 0.28668212890625, + "mean_token_accuracy": 0.9035641551017761, + "num_tokens": 34246593.0, + "step": 3832 + }, + { + "epoch": 2.912613981762918, + "grad_norm": 1.3970232009887695, + "learning_rate": 1.1795636670351718e-08, + "loss": 0.2589891254901886, + "mean_token_accuracy": 0.9162927865982056, + "num_tokens": 34257535.0, + "step": 3833 + }, + { + "epoch": 2.9133738601823707, + "grad_norm": 2.5260443687438965, + "learning_rate": 1.1593296885640259e-08, + "loss": 0.452729195356369, + "mean_token_accuracy": 0.8569157123565674, + "num_tokens": 34263834.0, + "step": 3834 + }, + { + "epoch": 2.914133738601824, + "grad_norm": 1.879526972770691, + "learning_rate": 1.1392703560542118e-08, + "loss": 0.3608126640319824, + "mean_token_accuracy": 0.8750635385513306, + "num_tokens": 34272156.0, + "step": 3835 + }, + { + "epoch": 2.9148936170212765, + "grad_norm": 1.9857182502746582, + "learning_rate": 1.1193856835841344e-08, + "loss": 0.28058698773384094, + "mean_token_accuracy": 0.8984638452529907, + "num_tokens": 34280438.0, + "step": 3836 + }, + { + "epoch": 2.9156534954407296, + "grad_norm": 1.9187198877334595, + "learning_rate": 1.0996756851096579e-08, + "loss": 0.3203415870666504, + "mean_token_accuracy": 0.8920673131942749, + "num_tokens": 34288330.0, + "step": 3837 + }, + { + "epoch": 2.9164133738601823, + "grad_norm": 1.6627569198608398, + "learning_rate": 1.0801403744639672e-08, + "loss": 0.30393654108047485, + "mean_token_accuracy": 0.8877602815628052, + "num_tokens": 34297701.0, + "step": 3838 + }, + { + "epoch": 2.9171732522796354, + "grad_norm": 1.4527947902679443, + "learning_rate": 1.0607797653577333e-08, + "loss": 0.33950865268707275, + "mean_token_accuracy": 0.8850067853927612, + "num_tokens": 34311995.0, + "step": 3839 + }, + { + "epoch": 2.917933130699088, + "grad_norm": 1.694217324256897, + "learning_rate": 1.0415938713789487e-08, + "loss": 0.33595266938209534, + "mean_token_accuracy": 0.878333568572998, + "num_tokens": 34322095.0, + "step": 3840 + }, + { + "epoch": 2.918693009118541, + "grad_norm": 2.3357045650482178, + "learning_rate": 1.0225827059930082e-08, + "loss": 0.2966959476470947, + "mean_token_accuracy": 0.893630862236023, + "num_tokens": 34328400.0, + "step": 3841 + }, + { + "epoch": 2.919452887537994, + "grad_norm": 1.9848041534423828, + "learning_rate": 1.0037462825427113e-08, + "loss": 0.4187622368335724, + "mean_token_accuracy": 0.8641717433929443, + "num_tokens": 34337203.0, + "step": 3842 + }, + { + "epoch": 2.920212765957447, + "grad_norm": 1.7696800231933594, + "learning_rate": 9.850846142481773e-09, + "loss": 0.34298282861709595, + "mean_token_accuracy": 0.8812298774719238, + "num_tokens": 34346584.0, + "step": 3843 + }, + { + "epoch": 2.9209726443768997, + "grad_norm": 2.6058225631713867, + "learning_rate": 9.665977142068738e-09, + "loss": 0.2776247262954712, + "mean_token_accuracy": 0.908215343952179, + "num_tokens": 34351472.0, + "step": 3844 + }, + { + "epoch": 2.9217325227963524, + "grad_norm": 2.4990663528442383, + "learning_rate": 9.482855953936443e-09, + "loss": 0.2577187418937683, + "mean_token_accuracy": 0.9113357663154602, + "num_tokens": 34357101.0, + "step": 3845 + }, + { + "epoch": 2.9224924012158056, + "grad_norm": 3.2842514514923096, + "learning_rate": 9.30148270660569e-09, + "loss": 0.23392081260681152, + "mean_token_accuracy": 0.9370708465576172, + "num_tokens": 34360674.0, + "step": 3846 + }, + { + "epoch": 2.9232522796352582, + "grad_norm": 2.2124083042144775, + "learning_rate": 9.121857527372157e-09, + "loss": 0.3026091456413269, + "mean_token_accuracy": 0.886944055557251, + "num_tokens": 34367471.0, + "step": 3847 + }, + { + "epoch": 2.9240121580547114, + "grad_norm": 1.6130470037460327, + "learning_rate": 8.943980542302777e-09, + "loss": 0.33204811811447144, + "mean_token_accuracy": 0.8805426359176636, + "num_tokens": 34377461.0, + "step": 3848 + }, + { + "epoch": 2.924772036474164, + "grad_norm": 1.6536617279052734, + "learning_rate": 8.767851876239075e-09, + "loss": 0.33671748638153076, + "mean_token_accuracy": 0.8811848163604736, + "num_tokens": 34386732.0, + "step": 3849 + }, + { + "epoch": 2.925531914893617, + "grad_norm": 1.9558135271072388, + "learning_rate": 8.59347165279495e-09, + "loss": 0.3325084447860718, + "mean_token_accuracy": 0.8823798894882202, + "num_tokens": 34395705.0, + "step": 3850 + }, + { + "epoch": 2.92629179331307, + "grad_norm": 2.2350621223449707, + "learning_rate": 8.420839994356666e-09, + "loss": 0.28383463621139526, + "mean_token_accuracy": 0.8957310914993286, + "num_tokens": 34402470.0, + "step": 3851 + }, + { + "epoch": 2.9270516717325226, + "grad_norm": 1.9859482049942017, + "learning_rate": 8.249957022084254e-09, + "loss": 0.2720850110054016, + "mean_token_accuracy": 0.9078607559204102, + "num_tokens": 34410536.0, + "step": 3852 + }, + { + "epoch": 2.9278115501519757, + "grad_norm": 1.3174400329589844, + "learning_rate": 8.080822855909832e-09, + "loss": 0.330660879611969, + "mean_token_accuracy": 0.8777779936790466, + "num_tokens": 34425639.0, + "step": 3853 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 1.0108131170272827, + "learning_rate": 7.913437614538166e-09, + "loss": 0.3833892345428467, + "mean_token_accuracy": 0.8571817874908447, + "num_tokens": 34451572.0, + "step": 3854 + }, + { + "epoch": 2.9293313069908815, + "grad_norm": 1.347409725189209, + "learning_rate": 7.747801415446677e-09, + "loss": 0.3100135028362274, + "mean_token_accuracy": 0.9087803363800049, + "num_tokens": 34465396.0, + "step": 3855 + }, + { + "epoch": 2.930091185410334, + "grad_norm": 1.4636729955673218, + "learning_rate": 7.583914374885426e-09, + "loss": 0.32699912786483765, + "mean_token_accuracy": 0.87745201587677, + "num_tokens": 34477359.0, + "step": 3856 + }, + { + "epoch": 2.9308510638297873, + "grad_norm": 2.9707272052764893, + "learning_rate": 7.4217766078760185e-09, + "loss": 0.18189990520477295, + "mean_token_accuracy": 0.9301153421401978, + "num_tokens": 34481356.0, + "step": 3857 + }, + { + "epoch": 2.93161094224924, + "grad_norm": 2.3689684867858887, + "learning_rate": 7.261388228213817e-09, + "loss": 0.3382490873336792, + "mean_token_accuracy": 0.9132488965988159, + "num_tokens": 34487525.0, + "step": 3858 + }, + { + "epoch": 2.932370820668693, + "grad_norm": 2.3896703720092773, + "learning_rate": 7.102749348465166e-09, + "loss": 0.3891000747680664, + "mean_token_accuracy": 0.8888499736785889, + "num_tokens": 34493053.0, + "step": 3859 + }, + { + "epoch": 2.933130699088146, + "grad_norm": 3.2713520526885986, + "learning_rate": 6.945860079969058e-09, + "loss": 0.26146358251571655, + "mean_token_accuracy": 0.9090266227722168, + "num_tokens": 34496938.0, + "step": 3860 + }, + { + "epoch": 2.933890577507599, + "grad_norm": 2.9600296020507812, + "learning_rate": 6.790720532836026e-09, + "loss": 0.3506978750228882, + "mean_token_accuracy": 0.8768079280853271, + "num_tokens": 34501615.0, + "step": 3861 + }, + { + "epoch": 2.9346504559270516, + "grad_norm": 2.640066146850586, + "learning_rate": 6.6373308159495275e-09, + "loss": 0.39720577001571655, + "mean_token_accuracy": 0.8565619587898254, + "num_tokens": 34507438.0, + "step": 3862 + }, + { + "epoch": 2.9354103343465043, + "grad_norm": 1.9988795518875122, + "learning_rate": 6.485691036964003e-09, + "loss": 0.18736782670021057, + "mean_token_accuracy": 0.92908775806427, + "num_tokens": 34513939.0, + "step": 3863 + }, + { + "epoch": 2.9361702127659575, + "grad_norm": 1.0155757665634155, + "learning_rate": 6.3358013023062656e-09, + "loss": 0.2876095771789551, + "mean_token_accuracy": 0.8892107009887695, + "num_tokens": 34537761.0, + "step": 3864 + }, + { + "epoch": 2.9369300911854106, + "grad_norm": 1.1695115566253662, + "learning_rate": 6.1876617171743865e-09, + "loss": 0.2740359306335449, + "mean_token_accuracy": 0.9104942083358765, + "num_tokens": 34557302.0, + "step": 3865 + }, + { + "epoch": 2.9376899696048633, + "grad_norm": 2.36651349067688, + "learning_rate": 6.04127238553881e-09, + "loss": 0.2734505534172058, + "mean_token_accuracy": 0.897555947303772, + "num_tokens": 34563225.0, + "step": 3866 + }, + { + "epoch": 2.938449848024316, + "grad_norm": 3.6499621868133545, + "learning_rate": 5.896633410141239e-09, + "loss": 0.30723029375076294, + "mean_token_accuracy": 0.9058237075805664, + "num_tokens": 34567037.0, + "step": 3867 + }, + { + "epoch": 2.939209726443769, + "grad_norm": 2.1518232822418213, + "learning_rate": 5.753744892494639e-09, + "loss": 0.46499863266944885, + "mean_token_accuracy": 0.8726839423179626, + "num_tokens": 34576470.0, + "step": 3868 + }, + { + "epoch": 2.939969604863222, + "grad_norm": 2.8443753719329834, + "learning_rate": 5.612606932883513e-09, + "loss": 0.33730241656303406, + "mean_token_accuracy": 0.8874512910842896, + "num_tokens": 34582531.0, + "step": 3869 + }, + { + "epoch": 2.940729483282675, + "grad_norm": 1.732109546661377, + "learning_rate": 5.473219630364457e-09, + "loss": 0.28559115529060364, + "mean_token_accuracy": 0.894468367099762, + "num_tokens": 34591376.0, + "step": 3870 + }, + { + "epoch": 2.9414893617021276, + "grad_norm": 2.533249855041504, + "learning_rate": 5.335583082764495e-09, + "loss": 0.23319819569587708, + "mean_token_accuracy": 0.9174623489379883, + "num_tokens": 34596426.0, + "step": 3871 + }, + { + "epoch": 2.9422492401215807, + "grad_norm": 2.3505852222442627, + "learning_rate": 5.19969738668219e-09, + "loss": 0.3506584167480469, + "mean_token_accuracy": 0.8692278861999512, + "num_tokens": 34603058.0, + "step": 3872 + }, + { + "epoch": 2.9430091185410334, + "grad_norm": 1.9322142601013184, + "learning_rate": 5.065562637487365e-09, + "loss": 0.2503264546394348, + "mean_token_accuracy": 0.9093045592308044, + "num_tokens": 34610330.0, + "step": 3873 + }, + { + "epoch": 2.943768996960486, + "grad_norm": 2.398416519165039, + "learning_rate": 4.933178929321103e-09, + "loss": 0.3825327157974243, + "mean_token_accuracy": 0.8750842809677124, + "num_tokens": 34617198.0, + "step": 3874 + }, + { + "epoch": 2.9445288753799392, + "grad_norm": 2.3529703617095947, + "learning_rate": 4.802546355095472e-09, + "loss": 0.3877553343772888, + "mean_token_accuracy": 0.8654624223709106, + "num_tokens": 34624459.0, + "step": 3875 + }, + { + "epoch": 2.9452887537993924, + "grad_norm": 1.4786031246185303, + "learning_rate": 4.673665006492967e-09, + "loss": 0.42244911193847656, + "mean_token_accuracy": 0.8566311597824097, + "num_tokens": 34639359.0, + "step": 3876 + }, + { + "epoch": 2.946048632218845, + "grad_norm": 2.091810703277588, + "learning_rate": 4.546534973968175e-09, + "loss": 0.27733317017555237, + "mean_token_accuracy": 0.9033011794090271, + "num_tokens": 34646046.0, + "step": 3877 + }, + { + "epoch": 2.9468085106382977, + "grad_norm": 1.900180459022522, + "learning_rate": 4.4211563467452814e-09, + "loss": 0.44815146923065186, + "mean_token_accuracy": 0.8379489183425903, + "num_tokens": 34656329.0, + "step": 3878 + }, + { + "epoch": 2.947568389057751, + "grad_norm": 1.3837320804595947, + "learning_rate": 4.297529212820006e-09, + "loss": 0.33357739448547363, + "mean_token_accuracy": 0.8788042068481445, + "num_tokens": 34671303.0, + "step": 3879 + }, + { + "epoch": 2.9483282674772036, + "grad_norm": 1.3475737571716309, + "learning_rate": 4.175653658958501e-09, + "loss": 0.30217933654785156, + "mean_token_accuracy": 0.8932538032531738, + "num_tokens": 34685080.0, + "step": 3880 + }, + { + "epoch": 2.9490881458966567, + "grad_norm": 1.6425048112869263, + "learning_rate": 4.055529770698175e-09, + "loss": 0.4368054270744324, + "mean_token_accuracy": 0.8392083644866943, + "num_tokens": 34695104.0, + "step": 3881 + }, + { + "epoch": 2.9498480243161094, + "grad_norm": 1.729368805885315, + "learning_rate": 3.937157632346311e-09, + "loss": 0.28259193897247314, + "mean_token_accuracy": 0.9338148236274719, + "num_tokens": 34706664.0, + "step": 3882 + }, + { + "epoch": 2.9506079027355625, + "grad_norm": 1.8707934617996216, + "learning_rate": 3.820537326980622e-09, + "loss": 0.40049535036087036, + "mean_token_accuracy": 0.8617393374443054, + "num_tokens": 34715401.0, + "step": 3883 + }, + { + "epoch": 2.951367781155015, + "grad_norm": 1.7935676574707031, + "learning_rate": 3.7056689364503574e-09, + "loss": 0.3386167585849762, + "mean_token_accuracy": 0.8947521448135376, + "num_tokens": 34724093.0, + "step": 3884 + }, + { + "epoch": 2.952127659574468, + "grad_norm": 2.6346704959869385, + "learning_rate": 3.592552541374361e-09, + "loss": 0.1505163311958313, + "mean_token_accuracy": 0.9515544176101685, + "num_tokens": 34727908.0, + "step": 3885 + }, + { + "epoch": 2.952887537993921, + "grad_norm": 2.2813618183135986, + "learning_rate": 3.481188221142184e-09, + "loss": 0.3014339506626129, + "mean_token_accuracy": 0.8985507488250732, + "num_tokens": 34734037.0, + "step": 3886 + }, + { + "epoch": 2.9536474164133737, + "grad_norm": 2.482675313949585, + "learning_rate": 3.37157605391325e-09, + "loss": 0.3489428758621216, + "mean_token_accuracy": 0.8771353960037231, + "num_tokens": 34739874.0, + "step": 3887 + }, + { + "epoch": 2.954407294832827, + "grad_norm": 2.3721418380737305, + "learning_rate": 3.2637161166179654e-09, + "loss": 0.3582353889942169, + "mean_token_accuracy": 0.861088752746582, + "num_tokens": 34747007.0, + "step": 3888 + }, + { + "epoch": 2.9551671732522795, + "grad_norm": 2.1871862411499023, + "learning_rate": 3.1576084849563315e-09, + "loss": 0.30689212679862976, + "mean_token_accuracy": 0.8910759687423706, + "num_tokens": 34753361.0, + "step": 3889 + }, + { + "epoch": 2.9559270516717326, + "grad_norm": 2.1797537803649902, + "learning_rate": 3.0532532333987785e-09, + "loss": 0.3343493938446045, + "mean_token_accuracy": 0.880067765712738, + "num_tokens": 34760824.0, + "step": 3890 + }, + { + "epoch": 2.9566869300911853, + "grad_norm": 2.6021335124969482, + "learning_rate": 2.9506504351861644e-09, + "loss": 0.34991219639778137, + "mean_token_accuracy": 0.8728436231613159, + "num_tokens": 34766783.0, + "step": 3891 + }, + { + "epoch": 2.9574468085106385, + "grad_norm": 2.202974319458008, + "learning_rate": 2.849800162328664e-09, + "loss": 0.3138400912284851, + "mean_token_accuracy": 0.8995538949966431, + "num_tokens": 34773174.0, + "step": 3892 + }, + { + "epoch": 2.958206686930091, + "grad_norm": 1.687474250793457, + "learning_rate": 2.7507024856071595e-09, + "loss": 0.40479594469070435, + "mean_token_accuracy": 0.8831138610839844, + "num_tokens": 34785142.0, + "step": 3893 + }, + { + "epoch": 2.9589665653495443, + "grad_norm": 1.960195779800415, + "learning_rate": 2.6533574745718493e-09, + "loss": 0.3259456157684326, + "mean_token_accuracy": 0.8871631622314453, + "num_tokens": 34793001.0, + "step": 3894 + }, + { + "epoch": 2.959726443768997, + "grad_norm": 2.89237904548645, + "learning_rate": 2.557765197543638e-09, + "loss": 0.32338041067123413, + "mean_token_accuracy": 0.9038220047950745, + "num_tokens": 34797424.0, + "step": 3895 + }, + { + "epoch": 2.9604863221884496, + "grad_norm": 2.655599594116211, + "learning_rate": 2.4639257216127476e-09, + "loss": 0.2710941731929779, + "mean_token_accuracy": 0.8990030884742737, + "num_tokens": 34802629.0, + "step": 3896 + }, + { + "epoch": 2.961246200607903, + "grad_norm": 2.8153562545776367, + "learning_rate": 2.3718391126392735e-09, + "loss": 0.4303235411643982, + "mean_token_accuracy": 0.8491297960281372, + "num_tokens": 34807870.0, + "step": 3897 + }, + { + "epoch": 2.9620060790273555, + "grad_norm": 1.4196341037750244, + "learning_rate": 2.2815054352531842e-09, + "loss": 0.38827845454216003, + "mean_token_accuracy": 0.8595222234725952, + "num_tokens": 34823597.0, + "step": 3898 + }, + { + "epoch": 2.9627659574468086, + "grad_norm": 2.9653196334838867, + "learning_rate": 2.192924752854042e-09, + "loss": 0.2555926442146301, + "mean_token_accuracy": 0.9074755907058716, + "num_tokens": 34827781.0, + "step": 3899 + }, + { + "epoch": 2.9635258358662613, + "grad_norm": 1.4998196363449097, + "learning_rate": 2.106097127611284e-09, + "loss": 0.36219048500061035, + "mean_token_accuracy": 0.885735273361206, + "num_tokens": 34839234.0, + "step": 3900 + }, + { + "epoch": 2.9642857142857144, + "grad_norm": 1.718245029449463, + "learning_rate": 2.0210226204639414e-09, + "loss": 0.26162803173065186, + "mean_token_accuracy": 0.8963354825973511, + "num_tokens": 34848059.0, + "step": 3901 + }, + { + "epoch": 2.965045592705167, + "grad_norm": 2.0226235389709473, + "learning_rate": 1.9377012911203642e-09, + "loss": 0.3657612204551697, + "mean_token_accuracy": 0.8982006311416626, + "num_tokens": 34854617.0, + "step": 3902 + }, + { + "epoch": 2.96580547112462, + "grad_norm": 2.6306872367858887, + "learning_rate": 1.8561331980587738e-09, + "loss": 0.19888967275619507, + "mean_token_accuracy": 0.9320937395095825, + "num_tokens": 34859412.0, + "step": 3903 + }, + { + "epoch": 2.966565349544073, + "grad_norm": 1.2558201551437378, + "learning_rate": 1.7763183985269882e-09, + "loss": 0.39052504301071167, + "mean_token_accuracy": 0.8551221489906311, + "num_tokens": 34875603.0, + "step": 3904 + }, + { + "epoch": 2.967325227963526, + "grad_norm": 1.7441751956939697, + "learning_rate": 1.6982569485415879e-09, + "loss": 0.3208625912666321, + "mean_token_accuracy": 0.8973188996315002, + "num_tokens": 34884309.0, + "step": 3905 + }, + { + "epoch": 2.9680851063829787, + "grad_norm": 1.6294625997543335, + "learning_rate": 1.6219489028895808e-09, + "loss": 0.2818947732448578, + "mean_token_accuracy": 0.9170717000961304, + "num_tokens": 34894045.0, + "step": 3906 + }, + { + "epoch": 2.9688449848024314, + "grad_norm": 1.9129183292388916, + "learning_rate": 1.5473943151270155e-09, + "loss": 0.3932931423187256, + "mean_token_accuracy": 0.8591724038124084, + "num_tokens": 34903057.0, + "step": 3907 + }, + { + "epoch": 2.9696048632218845, + "grad_norm": 2.125586748123169, + "learning_rate": 1.474593237578703e-09, + "loss": 0.4141325056552887, + "mean_token_accuracy": 0.855269193649292, + "num_tokens": 34911138.0, + "step": 3908 + }, + { + "epoch": 2.9703647416413372, + "grad_norm": 2.039323329925537, + "learning_rate": 1.4035457213393278e-09, + "loss": 0.30452996492385864, + "mean_token_accuracy": 0.8897982835769653, + "num_tokens": 34918685.0, + "step": 3909 + }, + { + "epoch": 2.9711246200607904, + "grad_norm": 1.213478446006775, + "learning_rate": 1.3342518162728913e-09, + "loss": 0.3703617751598358, + "mean_token_accuracy": 0.8672454357147217, + "num_tokens": 34936658.0, + "step": 3910 + }, + { + "epoch": 2.971884498480243, + "grad_norm": 1.2648811340332031, + "learning_rate": 1.2667115710127131e-09, + "loss": 0.4004117250442505, + "mean_token_accuracy": 0.8572319149971008, + "num_tokens": 34955480.0, + "step": 3911 + }, + { + "epoch": 2.972644376899696, + "grad_norm": 2.34121036529541, + "learning_rate": 1.2009250329608757e-09, + "loss": 0.12352144718170166, + "mean_token_accuracy": 0.9538272619247437, + "num_tokens": 34959942.0, + "step": 3912 + }, + { + "epoch": 2.973404255319149, + "grad_norm": 1.5843939781188965, + "learning_rate": 1.1368922482887789e-09, + "loss": 0.27862548828125, + "mean_token_accuracy": 0.8930153846740723, + "num_tokens": 34969425.0, + "step": 3913 + }, + { + "epoch": 2.9741641337386016, + "grad_norm": 1.2919771671295166, + "learning_rate": 1.0746132619374184e-09, + "loss": 0.38437312841415405, + "mean_token_accuracy": 0.8620239496231079, + "num_tokens": 34987289.0, + "step": 3914 + }, + { + "epoch": 2.9749240121580547, + "grad_norm": 2.299374580383301, + "learning_rate": 1.0140881176165517e-09, + "loss": 0.3482919931411743, + "mean_token_accuracy": 0.8766785860061646, + "num_tokens": 34993701.0, + "step": 3915 + }, + { + "epoch": 2.975683890577508, + "grad_norm": 2.1415762901306152, + "learning_rate": 9.553168578049776e-10, + "loss": 0.3619397282600403, + "mean_token_accuracy": 0.8685888051986694, + "num_tokens": 35000430.0, + "step": 3916 + }, + { + "epoch": 2.9764437689969605, + "grad_norm": 1.1967521905899048, + "learning_rate": 8.982995237505343e-10, + "loss": 0.289741188287735, + "mean_token_accuracy": 0.9111574292182922, + "num_tokens": 35015151.0, + "step": 3917 + }, + { + "epoch": 2.977203647416413, + "grad_norm": 2.4301388263702393, + "learning_rate": 8.430361554701005e-10, + "loss": 0.3439575433731079, + "mean_token_accuracy": 0.8783204555511475, + "num_tokens": 35020729.0, + "step": 3918 + }, + { + "epoch": 2.9779635258358663, + "grad_norm": 1.7229973077774048, + "learning_rate": 7.895267917501503e-10, + "loss": 0.379913330078125, + "mean_token_accuracy": 0.8735131025314331, + "num_tokens": 35031484.0, + "step": 3919 + }, + { + "epoch": 2.978723404255319, + "grad_norm": 1.468673825263977, + "learning_rate": 7.377714701450877e-10, + "loss": 0.369578093290329, + "mean_token_accuracy": 0.8639857172966003, + "num_tokens": 35044582.0, + "step": 3920 + }, + { + "epoch": 2.979483282674772, + "grad_norm": 1.45562744140625, + "learning_rate": 6.877702269786346e-10, + "loss": 0.33700788021087646, + "mean_token_accuracy": 0.8805566430091858, + "num_tokens": 35058539.0, + "step": 3921 + }, + { + "epoch": 2.980243161094225, + "grad_norm": 1.483021855354309, + "learning_rate": 6.395230973443856e-10, + "loss": 0.4657078981399536, + "mean_token_accuracy": 0.8335970640182495, + "num_tokens": 35072770.0, + "step": 3922 + }, + { + "epoch": 2.981003039513678, + "grad_norm": 2.2210497856140137, + "learning_rate": 5.930301151033102e-10, + "loss": 0.3754214644432068, + "mean_token_accuracy": 0.8667312264442444, + "num_tokens": 35079930.0, + "step": 3923 + }, + { + "epoch": 2.9817629179331306, + "grad_norm": 1.8546303510665894, + "learning_rate": 5.48291312886251e-10, + "loss": 0.27907687425613403, + "mean_token_accuracy": 0.9037660360336304, + "num_tokens": 35089005.0, + "step": 3924 + }, + { + "epoch": 2.9825227963525833, + "grad_norm": 2.201045513153076, + "learning_rate": 5.053067220925356e-10, + "loss": 0.27560052275657654, + "mean_token_accuracy": 0.9001410603523254, + "num_tokens": 35095726.0, + "step": 3925 + }, + { + "epoch": 2.9832826747720365, + "grad_norm": 1.4042561054229736, + "learning_rate": 4.640763728908093e-10, + "loss": 0.33435091376304626, + "mean_token_accuracy": 0.9042688608169556, + "num_tokens": 35108469.0, + "step": 3926 + }, + { + "epoch": 2.9840425531914896, + "grad_norm": 1.213336706161499, + "learning_rate": 4.246002942173699e-10, + "loss": 0.28249555826187134, + "mean_token_accuracy": 0.8767675161361694, + "num_tokens": 35124765.0, + "step": 3927 + }, + { + "epoch": 2.9848024316109423, + "grad_norm": 1.9213181734085083, + "learning_rate": 3.868785137786657e-10, + "loss": 0.22949065268039703, + "mean_token_accuracy": 0.9307032823562622, + "num_tokens": 35131459.0, + "step": 3928 + }, + { + "epoch": 2.985562310030395, + "grad_norm": 1.7959866523742676, + "learning_rate": 3.509110580490749e-10, + "loss": 0.2500322461128235, + "mean_token_accuracy": 0.9240894913673401, + "num_tokens": 35138772.0, + "step": 3929 + }, + { + "epoch": 2.986322188449848, + "grad_norm": 1.6845020055770874, + "learning_rate": 3.166979522717384e-10, + "loss": 0.3233460485935211, + "mean_token_accuracy": 0.8901629447937012, + "num_tokens": 35148787.0, + "step": 3930 + }, + { + "epoch": 2.987082066869301, + "grad_norm": 1.60831618309021, + "learning_rate": 2.842392204591149e-10, + "loss": 0.28861671686172485, + "mean_token_accuracy": 0.8791468143463135, + "num_tokens": 35158830.0, + "step": 3931 + }, + { + "epoch": 2.987841945288754, + "grad_norm": 2.2622485160827637, + "learning_rate": 2.5353488539187066e-10, + "loss": 0.35594597458839417, + "mean_token_accuracy": 0.8696492910385132, + "num_tokens": 35165616.0, + "step": 3932 + }, + { + "epoch": 2.9886018237082066, + "grad_norm": 1.8257495164871216, + "learning_rate": 2.24584968619157e-10, + "loss": 0.35592517256736755, + "mean_token_accuracy": 0.8911738395690918, + "num_tokens": 35174892.0, + "step": 3933 + }, + { + "epoch": 2.9893617021276597, + "grad_norm": 1.8350274562835693, + "learning_rate": 1.9738949045972068e-10, + "loss": 0.1599535346031189, + "mean_token_accuracy": 0.9352525472640991, + "num_tokens": 35181249.0, + "step": 3934 + }, + { + "epoch": 2.9901215805471124, + "grad_norm": 1.3198978900909424, + "learning_rate": 1.7194846999996073e-10, + "loss": 0.24172186851501465, + "mean_token_accuracy": 0.9095510840415955, + "num_tokens": 35192328.0, + "step": 3935 + }, + { + "epoch": 2.990881458966565, + "grad_norm": 1.6335922479629517, + "learning_rate": 1.4826192509559412e-10, + "loss": 0.4396868348121643, + "mean_token_accuracy": 0.8414689898490906, + "num_tokens": 35204736.0, + "step": 3936 + }, + { + "epoch": 2.9916413373860182, + "grad_norm": 2.0894503593444824, + "learning_rate": 1.2632987237054527e-10, + "loss": 0.2892245948314667, + "mean_token_accuracy": 0.9252790212631226, + "num_tokens": 35211186.0, + "step": 3937 + }, + { + "epoch": 2.9924012158054714, + "grad_norm": 2.221811294555664, + "learning_rate": 1.061523272177789e-10, + "loss": 0.40185099840164185, + "mean_token_accuracy": 0.8510832190513611, + "num_tokens": 35220326.0, + "step": 3938 + }, + { + "epoch": 2.993161094224924, + "grad_norm": 1.7605009078979492, + "learning_rate": 8.772930379846723e-11, + "loss": 0.38544684648513794, + "mean_token_accuracy": 0.8694577217102051, + "num_tokens": 35229889.0, + "step": 3939 + }, + { + "epoch": 2.9939209726443767, + "grad_norm": 2.6683199405670166, + "learning_rate": 7.106081504254514e-11, + "loss": 0.16490477323532104, + "mean_token_accuracy": 0.9414010047912598, + "num_tokens": 35233835.0, + "step": 3940 + }, + { + "epoch": 2.99468085106383, + "grad_norm": 2.2280800342559814, + "learning_rate": 5.6146872648987774e-11, + "loss": 0.41871631145477295, + "mean_token_accuracy": 0.8475867509841919, + "num_tokens": 35241042.0, + "step": 3941 + }, + { + "epoch": 2.9954407294832825, + "grad_norm": 2.169602870941162, + "learning_rate": 4.298748708470024e-11, + "loss": 0.3991228938102722, + "mean_token_accuracy": 0.8692910671234131, + "num_tokens": 35248427.0, + "step": 3942 + }, + { + "epoch": 2.9962006079027357, + "grad_norm": 2.665966033935547, + "learning_rate": 3.158266758562789e-11, + "loss": 0.25984981656074524, + "mean_token_accuracy": 0.9204732179641724, + "num_tokens": 35253086.0, + "step": 3943 + }, + { + "epoch": 2.9969604863221884, + "grad_norm": 1.8087493181228638, + "learning_rate": 2.1932422155923618e-11, + "loss": 0.41246354579925537, + "mean_token_accuracy": 0.8548201322555542, + "num_tokens": 35263360.0, + "step": 3944 + }, + { + "epoch": 2.9977203647416415, + "grad_norm": 2.6384191513061523, + "learning_rate": 1.4036757568502978e-11, + "loss": 0.32927870750427246, + "mean_token_accuracy": 0.8796735405921936, + "num_tokens": 35269214.0, + "step": 3945 + }, + { + "epoch": 2.998480243161094, + "grad_norm": 1.2011899948120117, + "learning_rate": 7.89567936476665e-12, + "loss": 0.2989211678504944, + "mean_token_accuracy": 0.8949509859085083, + "num_tokens": 35283851.0, + "step": 3946 + }, + { + "epoch": 2.999240121580547, + "grad_norm": 1.6725144386291504, + "learning_rate": 3.509191854877969e-12, + "loss": 0.30066749453544617, + "mean_token_accuracy": 0.9032993316650391, + "num_tokens": 35300894.0, + "step": 3947 + }, + { + "epoch": 3.0, + "grad_norm": 2.00422739982605, + "learning_rate": 8.77298117762937e-13, + "loss": 0.4101974368095398, + "mean_token_accuracy": 0.8702684640884399, + "num_tokens": 35309034.0, + "step": 3948 + } + ], + "logging_steps": 1.0, + "max_steps": 3948, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.848873914830684e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3948/training_args.bin b/checkpoint-3948/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2fc4f538d721f958cdceda5408f2f4e1a35f4210 --- /dev/null +++ b/checkpoint-3948/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021 +size 6225 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c351e5fb52f50ea6e07b40981aef81c80f9df7e4 --- /dev/null +++ b/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "float32", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 262144, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151662, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 5000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.5.3", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2104b83493c2833855e8fe32a7a784805ab5c2ee --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151662, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "5.5.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f787ad62bc7ccc577c324b6d71689c0739123f0c --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7db19800bbcf792dcb25dea9b5ae39f4e934a0d56f64ed6f74d7d89e87ae928 +size 17645743048 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c7afbed2efcdf019f88ab0572ec29d3bf595dfe2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e47e52c4e7f0b2bcf2103a878790216f3f6436d --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 1010000, + "pad_token": "<|fim_pad|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2fc4f538d721f958cdceda5408f2f4e1a35f4210 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021 +size 6225