diff --git a/README.md b/README.md
deleted file mode 100644
index 6b1ad030d8dc51c98535bc6be58bc0d0d780757a..0000000000000000000000000000000000000000
--- a/README.md
+++ /dev/null
@@ -1,58 +0,0 @@
----
-base_model: Qwen/Qwen3-4B-Instruct-2507
-library_name: transformers
-model_name: Qwen3-8B_n3000_math
-tags:
-- generated_from_trainer
-- sft
-- trl
-licence: license
----
-
-# Model Card for Qwen3-8B_n3000_math
-
-This model is a fine-tuned version of [Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507).
-It has been trained using [TRL](https://github.com/huggingface/trl).
-
-## Quick start
-
-```python
-from transformers import pipeline
-
-question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
-generator = pipeline("text-generation", model="None", device="cuda")
-output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
-print(output["generated_text"])
-```
-
-## Training procedure
-
-
-
-
-
-This model was trained with SFT.
-
-### Framework versions
-
-- TRL: 0.29.0
-- Transformers: 5.5.3
-- Pytorch: 2.8.0
-- Datasets: 4.5.0
-- Tokenizers: 0.22.2
-
-## Citations
-
-
-
-Cite TRL as:
-
-```bibtex
-@software{vonwerra2020trl,
- title = {{TRL: Transformers Reinforcement Learning}},
- author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
- license = {Apache-2.0},
- url = {https://github.com/huggingface/trl},
- year = {2020}
-}
-```
\ No newline at end of file
diff --git a/chat_template.jinja b/chat_template.jinja
deleted file mode 100644
index 70adff8a08fb31e0636f618564838d4bf3c05286..0000000000000000000000000000000000000000
--- a/chat_template.jinja
+++ /dev/null
@@ -1,61 +0,0 @@
-{%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0].role == 'system' %}
- {{- messages[0].content + '\n\n' }}
- {%- endif %}
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
-{%- else %}
- {%- if messages[0].role == 'system' %}
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
- {%- endif %}
-{%- endif %}
-{%- for message in messages %}
- {%- if message.content is string %}
- {%- set content = message.content %}
- {%- else %}
- {%- set content = '' %}
- {%- endif %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
- {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- if message.tool_calls %}
- {%- for tool_call in message.tool_calls %}
- {%- if (loop.first and content) or (not loop.first) %}
- {{- '\n' }}
- {%- endif %}
- {%- if tool_call.function %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {%- if tool_call.arguments is string %}
- {{- tool_call.arguments }}
- {%- else %}
- {{- tool_call.arguments | tojson }}
- {%- endif %}
- {{- '}\n' }}
- {%- endfor %}
- {%- endif %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n\n' }}
- {{- content }}
- {{- '\n' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
-{%- endif %}
\ No newline at end of file
diff --git a/checkpoint-1000/chat_template.jinja b/checkpoint-1000/chat_template.jinja
deleted file mode 100644
index 70adff8a08fb31e0636f618564838d4bf3c05286..0000000000000000000000000000000000000000
--- a/checkpoint-1000/chat_template.jinja
+++ /dev/null
@@ -1,61 +0,0 @@
-{%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0].role == 'system' %}
- {{- messages[0].content + '\n\n' }}
- {%- endif %}
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
-{%- else %}
- {%- if messages[0].role == 'system' %}
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
- {%- endif %}
-{%- endif %}
-{%- for message in messages %}
- {%- if message.content is string %}
- {%- set content = message.content %}
- {%- else %}
- {%- set content = '' %}
- {%- endif %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
- {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- if message.tool_calls %}
- {%- for tool_call in message.tool_calls %}
- {%- if (loop.first and content) or (not loop.first) %}
- {{- '\n' }}
- {%- endif %}
- {%- if tool_call.function %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {%- if tool_call.arguments is string %}
- {{- tool_call.arguments }}
- {%- else %}
- {{- tool_call.arguments | tojson }}
- {%- endif %}
- {{- '}\n' }}
- {%- endfor %}
- {%- endif %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n\n' }}
- {{- content }}
- {{- '\n' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
-{%- endif %}
\ No newline at end of file
diff --git a/checkpoint-1000/config.json b/checkpoint-1000/config.json
deleted file mode 100644
index c351e5fb52f50ea6e07b40981aef81c80f9df7e4..0000000000000000000000000000000000000000
--- a/checkpoint-1000/config.json
+++ /dev/null
@@ -1,71 +0,0 @@
-{
- "architectures": [
- "Qwen3ForCausalLM"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": null,
- "dtype": "float32",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2560,
- "initializer_range": 0.02,
- "intermediate_size": 9728,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 262144,
- "max_window_layers": 36,
- "model_type": "qwen3",
- "num_attention_heads": 32,
- "num_hidden_layers": 36,
- "num_key_value_heads": 8,
- "pad_token_id": 151662,
- "rms_norm_eps": 1e-06,
- "rope_parameters": {
- "rope_theta": 5000000,
- "rope_type": "default"
- },
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "5.5.3",
- "use_cache": false,
- "use_sliding_window": false,
- "vocab_size": 151936
-}
diff --git a/checkpoint-1000/generation_config.json b/checkpoint-1000/generation_config.json
deleted file mode 100644
index 2104b83493c2833855e8fe32a7a784805ab5c2ee..0000000000000000000000000000000000000000
--- a/checkpoint-1000/generation_config.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
- "do_sample": true,
- "eos_token_id": [
- 151645,
- 151643
- ],
- "pad_token_id": 151662,
- "temperature": 0.7,
- "top_k": 20,
- "top_p": 0.8,
- "transformers_version": "5.5.3"
-}
diff --git a/checkpoint-1000/model.safetensors b/checkpoint-1000/model.safetensors
deleted file mode 100644
index bfb8d255b000e1103e63ca8cab7616c5309ec35c..0000000000000000000000000000000000000000
--- a/checkpoint-1000/model.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:16cf530a69292d5ebcdc898ff6e27f40e9fa97d07ec9a6fff92606a1cbec50f4
-size 17645743048
diff --git a/checkpoint-1000/optimizer.bin b/checkpoint-1000/optimizer.bin
deleted file mode 100644
index aa4e8c63ff91cc412c4b55b37e5e1b5cd2c26f25..0000000000000000000000000000000000000000
--- a/checkpoint-1000/optimizer.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ad09a9b1f9d56fb5e24fccb31bc61995bcb8aa26d3d4e5771bcd332a90d2d66e
-size 32180124005
diff --git a/checkpoint-1000/pytorch_model_fsdp.bin b/checkpoint-1000/pytorch_model_fsdp.bin
deleted file mode 100644
index d7e23f6b1f13fa3e7366b86beba7bead4ecc98c3..0000000000000000000000000000000000000000
--- a/checkpoint-1000/pytorch_model_fsdp.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cde7e1f8a53dcc9407e8636dd3c4261b755f26602abf7c70e6eb4291c93496bd
-size 17645897996
diff --git a/checkpoint-1000/rng_state_0.pth b/checkpoint-1000/rng_state_0.pth
deleted file mode 100644
index 3fc68e18ddaf65dfbdec55893d9a925ff5e43a18..0000000000000000000000000000000000000000
--- a/checkpoint-1000/rng_state_0.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4dd7671ce88d469c49c0530724ac76b2306574002d1ecd1ca9294e41621fd96a
-size 14917
diff --git a/checkpoint-1000/rng_state_1.pth b/checkpoint-1000/rng_state_1.pth
deleted file mode 100644
index 79d9de29ae34b3b0c10ea8ba0348aeafb0c12226..0000000000000000000000000000000000000000
--- a/checkpoint-1000/rng_state_1.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3246ef1170ccca541a03b89ad6f20e01c51eb6834a2c2211c78c71c70f896879
-size 14917
diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt
deleted file mode 100644
index 4c433f8131571742884bf6317ca17c54e07b544d..0000000000000000000000000000000000000000
--- a/checkpoint-1000/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3e3184dc815b4354af3c63c9b5b618608d5206305b4414657ef8e0195f7ad089
-size 1465
diff --git a/checkpoint-1000/tokenizer.json b/checkpoint-1000/tokenizer.json
deleted file mode 100644
index c7afbed2efcdf019f88ab0572ec29d3bf595dfe2..0000000000000000000000000000000000000000
--- a/checkpoint-1000/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
-size 11422650
diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json
deleted file mode 100644
index 4e47e52c4e7f0b2bcf2103a878790216f3f6436d..0000000000000000000000000000000000000000
--- a/checkpoint-1000/tokenizer_config.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
- "add_prefix_space": false,
- "backend": "tokenizers",
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "is_local": false,
- "model_max_length": 1010000,
- "pad_token": "<|fim_pad|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
-}
diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json
deleted file mode 100644
index 87532bf7321a64560106312a2f81138a0e52ebd6..0000000000000000000000000000000000000000
--- a/checkpoint-1000/trainer_state.json
+++ /dev/null
@@ -1,9034 +0,0 @@
-{
- "best_global_step": null,
- "best_metric": null,
- "best_model_checkpoint": null,
- "epoch": 0.7598784194528876,
- "eval_steps": 500,
- "global_step": 1000,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0007598784194528875,
- "grad_norm": 11.767926216125488,
- "learning_rate": 0.0,
- "loss": 0.7937269806861877,
- "mean_token_accuracy": 0.7822731137275696,
- "num_tokens": 10507.0,
- "step": 1
- },
- {
- "epoch": 0.001519756838905775,
- "grad_norm": 14.9199800491333,
- "learning_rate": 2.5252525252525256e-08,
- "loss": 0.7665389776229858,
- "mean_token_accuracy": 0.8342233300209045,
- "num_tokens": 14806.0,
- "step": 2
- },
- {
- "epoch": 0.0022796352583586625,
- "grad_norm": 11.991217613220215,
- "learning_rate": 5.050505050505051e-08,
- "loss": 0.9597002267837524,
- "mean_token_accuracy": 0.7054992318153381,
- "num_tokens": 27170.0,
- "step": 3
- },
- {
- "epoch": 0.00303951367781155,
- "grad_norm": 12.958333015441895,
- "learning_rate": 7.575757575757576e-08,
- "loss": 0.9971482753753662,
- "mean_token_accuracy": 0.7261134386062622,
- "num_tokens": 33729.0,
- "step": 4
- },
- {
- "epoch": 0.003799392097264438,
- "grad_norm": 13.5665283203125,
- "learning_rate": 1.0101010101010103e-07,
- "loss": 0.9504883885383606,
- "mean_token_accuracy": 0.745307445526123,
- "num_tokens": 41174.0,
- "step": 5
- },
- {
- "epoch": 0.004559270516717325,
- "grad_norm": 10.09444808959961,
- "learning_rate": 1.2626262626262626e-07,
- "loss": 0.759548008441925,
- "mean_token_accuracy": 0.7842121124267578,
- "num_tokens": 47943.0,
- "step": 6
- },
- {
- "epoch": 0.005319148936170213,
- "grad_norm": 10.741650581359863,
- "learning_rate": 1.5151515151515152e-07,
- "loss": 0.8231598138809204,
- "mean_token_accuracy": 0.7550969123840332,
- "num_tokens": 56665.0,
- "step": 7
- },
- {
- "epoch": 0.0060790273556231,
- "grad_norm": 12.250170707702637,
- "learning_rate": 1.767676767676768e-07,
- "loss": 0.8576581478118896,
- "mean_token_accuracy": 0.7568671703338623,
- "num_tokens": 67606.0,
- "step": 8
- },
- {
- "epoch": 0.006838905775075988,
- "grad_norm": 12.828629493713379,
- "learning_rate": 2.0202020202020205e-07,
- "loss": 0.9886435866355896,
- "mean_token_accuracy": 0.733400285243988,
- "num_tokens": 74272.0,
- "step": 9
- },
- {
- "epoch": 0.007598784194528876,
- "grad_norm": 15.966923713684082,
- "learning_rate": 2.2727272727272729e-07,
- "loss": 1.064985990524292,
- "mean_token_accuracy": 0.7101132869720459,
- "num_tokens": 80524.0,
- "step": 10
- },
- {
- "epoch": 0.008358662613981762,
- "grad_norm": 10.864850044250488,
- "learning_rate": 2.525252525252525e-07,
- "loss": 0.8311550617218018,
- "mean_token_accuracy": 0.7431639432907104,
- "num_tokens": 96292.0,
- "step": 11
- },
- {
- "epoch": 0.00911854103343465,
- "grad_norm": 16.438785552978516,
- "learning_rate": 2.7777777777777776e-07,
- "loss": 1.0579866170883179,
- "mean_token_accuracy": 0.7222976684570312,
- "num_tokens": 102992.0,
- "step": 12
- },
- {
- "epoch": 0.009878419452887538,
- "grad_norm": 11.179214477539062,
- "learning_rate": 3.0303030303030305e-07,
- "loss": 0.9816144704818726,
- "mean_token_accuracy": 0.7206371426582336,
- "num_tokens": 113571.0,
- "step": 13
- },
- {
- "epoch": 0.010638297872340425,
- "grad_norm": 12.780299186706543,
- "learning_rate": 3.2828282828282834e-07,
- "loss": 0.847449004650116,
- "mean_token_accuracy": 0.7826199531555176,
- "num_tokens": 119568.0,
- "step": 14
- },
- {
- "epoch": 0.011398176291793313,
- "grad_norm": 14.800421714782715,
- "learning_rate": 3.535353535353536e-07,
- "loss": 0.9275516271591187,
- "mean_token_accuracy": 0.7655045986175537,
- "num_tokens": 126258.0,
- "step": 15
- },
- {
- "epoch": 0.0121580547112462,
- "grad_norm": 11.267602920532227,
- "learning_rate": 3.787878787878788e-07,
- "loss": 0.8464037179946899,
- "mean_token_accuracy": 0.7606508731842041,
- "num_tokens": 136831.0,
- "step": 16
- },
- {
- "epoch": 0.012917933130699088,
- "grad_norm": 12.891013145446777,
- "learning_rate": 4.040404040404041e-07,
- "loss": 0.9903074502944946,
- "mean_token_accuracy": 0.7247487306594849,
- "num_tokens": 150434.0,
- "step": 17
- },
- {
- "epoch": 0.013677811550151976,
- "grad_norm": 11.13957691192627,
- "learning_rate": 4.2929292929292934e-07,
- "loss": 0.8287211656570435,
- "mean_token_accuracy": 0.7621913552284241,
- "num_tokens": 158516.0,
- "step": 18
- },
- {
- "epoch": 0.014437689969604863,
- "grad_norm": 18.39569664001465,
- "learning_rate": 4.5454545454545457e-07,
- "loss": 1.150015115737915,
- "mean_token_accuracy": 0.7349498271942139,
- "num_tokens": 162214.0,
- "step": 19
- },
- {
- "epoch": 0.015197568389057751,
- "grad_norm": 9.353750228881836,
- "learning_rate": 4.797979797979798e-07,
- "loss": 0.7228299379348755,
- "mean_token_accuracy": 0.7969573736190796,
- "num_tokens": 173035.0,
- "step": 20
- },
- {
- "epoch": 0.015957446808510637,
- "grad_norm": 8.267163276672363,
- "learning_rate": 5.05050505050505e-07,
- "loss": 0.7358136177062988,
- "mean_token_accuracy": 0.7903937101364136,
- "num_tokens": 183568.0,
- "step": 21
- },
- {
- "epoch": 0.016717325227963525,
- "grad_norm": 11.137128829956055,
- "learning_rate": 5.303030303030304e-07,
- "loss": 1.0075397491455078,
- "mean_token_accuracy": 0.702807605266571,
- "num_tokens": 192759.0,
- "step": 22
- },
- {
- "epoch": 0.017477203647416412,
- "grad_norm": 10.734103202819824,
- "learning_rate": 5.555555555555555e-07,
- "loss": 0.8925919532775879,
- "mean_token_accuracy": 0.7475671768188477,
- "num_tokens": 201280.0,
- "step": 23
- },
- {
- "epoch": 0.0182370820668693,
- "grad_norm": 11.945566177368164,
- "learning_rate": 5.808080808080809e-07,
- "loss": 0.7260514497756958,
- "mean_token_accuracy": 0.7859152555465698,
- "num_tokens": 218053.0,
- "step": 24
- },
- {
- "epoch": 0.018996960486322188,
- "grad_norm": 18.610652923583984,
- "learning_rate": 6.060606060606061e-07,
- "loss": 0.8995465636253357,
- "mean_token_accuracy": 0.7931990623474121,
- "num_tokens": 220953.0,
- "step": 25
- },
- {
- "epoch": 0.019756838905775075,
- "grad_norm": 10.51898193359375,
- "learning_rate": 6.313131313131314e-07,
- "loss": 0.9532671570777893,
- "mean_token_accuracy": 0.7257645726203918,
- "num_tokens": 231200.0,
- "step": 26
- },
- {
- "epoch": 0.020516717325227963,
- "grad_norm": 9.581812858581543,
- "learning_rate": 6.565656565656567e-07,
- "loss": 0.9038010239601135,
- "mean_token_accuracy": 0.7390379905700684,
- "num_tokens": 237711.0,
- "step": 27
- },
- {
- "epoch": 0.02127659574468085,
- "grad_norm": 12.297484397888184,
- "learning_rate": 6.818181818181818e-07,
- "loss": 1.048936367034912,
- "mean_token_accuracy": 0.7175670862197876,
- "num_tokens": 242503.0,
- "step": 28
- },
- {
- "epoch": 0.022036474164133738,
- "grad_norm": 7.437953472137451,
- "learning_rate": 7.070707070707071e-07,
- "loss": 0.8308826684951782,
- "mean_token_accuracy": 0.7415335774421692,
- "num_tokens": 250842.0,
- "step": 29
- },
- {
- "epoch": 0.022796352583586626,
- "grad_norm": 6.134475231170654,
- "learning_rate": 7.323232323232324e-07,
- "loss": 0.647913932800293,
- "mean_token_accuracy": 0.8124054670333862,
- "num_tokens": 267453.0,
- "step": 30
- },
- {
- "epoch": 0.023556231003039513,
- "grad_norm": 6.678966045379639,
- "learning_rate": 7.575757575757576e-07,
- "loss": 0.7052810192108154,
- "mean_token_accuracy": 0.7908754348754883,
- "num_tokens": 284416.0,
- "step": 31
- },
- {
- "epoch": 0.0243161094224924,
- "grad_norm": 7.42232084274292,
- "learning_rate": 7.82828282828283e-07,
- "loss": 1.022383213043213,
- "mean_token_accuracy": 0.7053230404853821,
- "num_tokens": 292073.0,
- "step": 32
- },
- {
- "epoch": 0.02507598784194529,
- "grad_norm": 6.463219165802002,
- "learning_rate": 8.080808080808082e-07,
- "loss": 0.7603012323379517,
- "mean_token_accuracy": 0.7728140354156494,
- "num_tokens": 298550.0,
- "step": 33
- },
- {
- "epoch": 0.025835866261398176,
- "grad_norm": 5.668411731719971,
- "learning_rate": 8.333333333333333e-07,
- "loss": 0.7707852721214294,
- "mean_token_accuracy": 0.7827773094177246,
- "num_tokens": 306683.0,
- "step": 34
- },
- {
- "epoch": 0.026595744680851064,
- "grad_norm": 4.984964847564697,
- "learning_rate": 8.585858585858587e-07,
- "loss": 0.6317349672317505,
- "mean_token_accuracy": 0.8106861114501953,
- "num_tokens": 318842.0,
- "step": 35
- },
- {
- "epoch": 0.02735562310030395,
- "grad_norm": 4.421732425689697,
- "learning_rate": 8.838383838383839e-07,
- "loss": 0.6228617429733276,
- "mean_token_accuracy": 0.8023355603218079,
- "num_tokens": 329850.0,
- "step": 36
- },
- {
- "epoch": 0.02811550151975684,
- "grad_norm": 5.970808029174805,
- "learning_rate": 9.090909090909091e-07,
- "loss": 0.8443238139152527,
- "mean_token_accuracy": 0.7462409734725952,
- "num_tokens": 335844.0,
- "step": 37
- },
- {
- "epoch": 0.028875379939209727,
- "grad_norm": 4.5389084815979,
- "learning_rate": 9.343434343434345e-07,
- "loss": 0.6976436376571655,
- "mean_token_accuracy": 0.790410041809082,
- "num_tokens": 348768.0,
- "step": 38
- },
- {
- "epoch": 0.029635258358662615,
- "grad_norm": 4.116631507873535,
- "learning_rate": 9.595959595959596e-07,
- "loss": 0.6698519587516785,
- "mean_token_accuracy": 0.7818127870559692,
- "num_tokens": 355460.0,
- "step": 39
- },
- {
- "epoch": 0.030395136778115502,
- "grad_norm": 3.3714773654937744,
- "learning_rate": 9.84848484848485e-07,
- "loss": 0.5723201036453247,
- "mean_token_accuracy": 0.8100086450576782,
- "num_tokens": 368507.0,
- "step": 40
- },
- {
- "epoch": 0.03115501519756839,
- "grad_norm": 4.4438347816467285,
- "learning_rate": 1.01010101010101e-06,
- "loss": 0.7508786916732788,
- "mean_token_accuracy": 0.7711942791938782,
- "num_tokens": 376467.0,
- "step": 41
- },
- {
- "epoch": 0.031914893617021274,
- "grad_norm": 5.609974384307861,
- "learning_rate": 1.0353535353535354e-06,
- "loss": 0.566256046295166,
- "mean_token_accuracy": 0.8319284319877625,
- "num_tokens": 381399.0,
- "step": 42
- },
- {
- "epoch": 0.03267477203647416,
- "grad_norm": 5.124386787414551,
- "learning_rate": 1.0606060606060608e-06,
- "loss": 0.8151067495346069,
- "mean_token_accuracy": 0.7537785768508911,
- "num_tokens": 387389.0,
- "step": 43
- },
- {
- "epoch": 0.03343465045592705,
- "grad_norm": 3.6318116188049316,
- "learning_rate": 1.085858585858586e-06,
- "loss": 0.5989949107170105,
- "mean_token_accuracy": 0.8129256963729858,
- "num_tokens": 395302.0,
- "step": 44
- },
- {
- "epoch": 0.03419452887537994,
- "grad_norm": 2.694424629211426,
- "learning_rate": 1.111111111111111e-06,
- "loss": 0.5831396579742432,
- "mean_token_accuracy": 0.8056820631027222,
- "num_tokens": 409920.0,
- "step": 45
- },
- {
- "epoch": 0.034954407294832825,
- "grad_norm": 2.2949178218841553,
- "learning_rate": 1.1363636363636364e-06,
- "loss": 0.472550630569458,
- "mean_token_accuracy": 0.8343006372451782,
- "num_tokens": 428323.0,
- "step": 46
- },
- {
- "epoch": 0.03571428571428571,
- "grad_norm": 3.3930575847625732,
- "learning_rate": 1.1616161616161617e-06,
- "loss": 0.6246505379676819,
- "mean_token_accuracy": 0.783149003982544,
- "num_tokens": 435889.0,
- "step": 47
- },
- {
- "epoch": 0.0364741641337386,
- "grad_norm": 3.692598819732666,
- "learning_rate": 1.186868686868687e-06,
- "loss": 0.46132946014404297,
- "mean_token_accuracy": 0.8583089113235474,
- "num_tokens": 441192.0,
- "step": 48
- },
- {
- "epoch": 0.03723404255319149,
- "grad_norm": 6.571533203125,
- "learning_rate": 1.2121212121212122e-06,
- "loss": 0.9351121783256531,
- "mean_token_accuracy": 0.7580878734588623,
- "num_tokens": 444277.0,
- "step": 49
- },
- {
- "epoch": 0.037993920972644375,
- "grad_norm": 5.029570579528809,
- "learning_rate": 1.2373737373737375e-06,
- "loss": 0.6921554803848267,
- "mean_token_accuracy": 0.8131166100502014,
- "num_tokens": 447646.0,
- "step": 50
- },
- {
- "epoch": 0.03875379939209726,
- "grad_norm": 2.9174208641052246,
- "learning_rate": 1.2626262626262629e-06,
- "loss": 0.591706395149231,
- "mean_token_accuracy": 0.8108617067337036,
- "num_tokens": 461397.0,
- "step": 51
- },
- {
- "epoch": 0.03951367781155015,
- "grad_norm": 4.315536022186279,
- "learning_rate": 1.287878787878788e-06,
- "loss": 0.6986310482025146,
- "mean_token_accuracy": 0.7710754871368408,
- "num_tokens": 472047.0,
- "step": 52
- },
- {
- "epoch": 0.04027355623100304,
- "grad_norm": 2.6216275691986084,
- "learning_rate": 1.3131313131313134e-06,
- "loss": 0.5553690791130066,
- "mean_token_accuracy": 0.8167896866798401,
- "num_tokens": 482795.0,
- "step": 53
- },
- {
- "epoch": 0.041033434650455926,
- "grad_norm": 3.0562477111816406,
- "learning_rate": 1.3383838383838385e-06,
- "loss": 0.6909202337265015,
- "mean_token_accuracy": 0.7859863638877869,
- "num_tokens": 494818.0,
- "step": 54
- },
- {
- "epoch": 0.04179331306990881,
- "grad_norm": 2.1420412063598633,
- "learning_rate": 1.3636363636363636e-06,
- "loss": 0.5415265560150146,
- "mean_token_accuracy": 0.818886399269104,
- "num_tokens": 513695.0,
- "step": 55
- },
- {
- "epoch": 0.0425531914893617,
- "grad_norm": 2.9610488414764404,
- "learning_rate": 1.3888888888888892e-06,
- "loss": 0.6602212190628052,
- "mean_token_accuracy": 0.7830734252929688,
- "num_tokens": 523784.0,
- "step": 56
- },
- {
- "epoch": 0.04331306990881459,
- "grad_norm": 2.511972665786743,
- "learning_rate": 1.4141414141414143e-06,
- "loss": 0.5717809796333313,
- "mean_token_accuracy": 0.8053616285324097,
- "num_tokens": 546308.0,
- "step": 57
- },
- {
- "epoch": 0.044072948328267476,
- "grad_norm": 3.52642822265625,
- "learning_rate": 1.4393939393939396e-06,
- "loss": 0.6242594718933105,
- "mean_token_accuracy": 0.8162082433700562,
- "num_tokens": 552019.0,
- "step": 58
- },
- {
- "epoch": 0.044832826747720364,
- "grad_norm": 3.02362322807312,
- "learning_rate": 1.4646464646464648e-06,
- "loss": 0.6634255647659302,
- "mean_token_accuracy": 0.7682032585144043,
- "num_tokens": 560009.0,
- "step": 59
- },
- {
- "epoch": 0.04559270516717325,
- "grad_norm": 2.3910107612609863,
- "learning_rate": 1.48989898989899e-06,
- "loss": 0.5519146919250488,
- "mean_token_accuracy": 0.8270269632339478,
- "num_tokens": 571005.0,
- "step": 60
- },
- {
- "epoch": 0.04635258358662614,
- "grad_norm": 4.28154993057251,
- "learning_rate": 1.5151515151515152e-06,
- "loss": 0.7437789440155029,
- "mean_token_accuracy": 0.7782418131828308,
- "num_tokens": 574950.0,
- "step": 61
- },
- {
- "epoch": 0.04711246200607903,
- "grad_norm": 3.4078686237335205,
- "learning_rate": 1.5404040404040404e-06,
- "loss": 0.6345915198326111,
- "mean_token_accuracy": 0.7903392314910889,
- "num_tokens": 581657.0,
- "step": 62
- },
- {
- "epoch": 0.047872340425531915,
- "grad_norm": 2.6834158897399902,
- "learning_rate": 1.565656565656566e-06,
- "loss": 0.5981127023696899,
- "mean_token_accuracy": 0.7911489605903625,
- "num_tokens": 591267.0,
- "step": 63
- },
- {
- "epoch": 0.0486322188449848,
- "grad_norm": 2.1054461002349854,
- "learning_rate": 1.590909090909091e-06,
- "loss": 0.5523523688316345,
- "mean_token_accuracy": 0.8194501399993896,
- "num_tokens": 606787.0,
- "step": 64
- },
- {
- "epoch": 0.04939209726443769,
- "grad_norm": 3.322596788406372,
- "learning_rate": 1.6161616161616164e-06,
- "loss": 0.48417025804519653,
- "mean_token_accuracy": 0.8293706178665161,
- "num_tokens": 611068.0,
- "step": 65
- },
- {
- "epoch": 0.05015197568389058,
- "grad_norm": 2.302450180053711,
- "learning_rate": 1.6414141414141415e-06,
- "loss": 0.6498389840126038,
- "mean_token_accuracy": 0.7728497385978699,
- "num_tokens": 624452.0,
- "step": 66
- },
- {
- "epoch": 0.050911854103343465,
- "grad_norm": 2.680191993713379,
- "learning_rate": 1.6666666666666667e-06,
- "loss": 0.6347037553787231,
- "mean_token_accuracy": 0.8108306527137756,
- "num_tokens": 638049.0,
- "step": 67
- },
- {
- "epoch": 0.05167173252279635,
- "grad_norm": 3.0297021865844727,
- "learning_rate": 1.6919191919191922e-06,
- "loss": 0.5344363451004028,
- "mean_token_accuracy": 0.8113535046577454,
- "num_tokens": 643892.0,
- "step": 68
- },
- {
- "epoch": 0.05243161094224924,
- "grad_norm": 2.9283676147460938,
- "learning_rate": 1.7171717171717173e-06,
- "loss": 0.6999260187149048,
- "mean_token_accuracy": 0.7782022356987,
- "num_tokens": 654418.0,
- "step": 69
- },
- {
- "epoch": 0.05319148936170213,
- "grad_norm": 3.4098572731018066,
- "learning_rate": 1.7424242424242427e-06,
- "loss": 0.6508946418762207,
- "mean_token_accuracy": 0.7942900657653809,
- "num_tokens": 659837.0,
- "step": 70
- },
- {
- "epoch": 0.053951367781155016,
- "grad_norm": 2.6756019592285156,
- "learning_rate": 1.7676767676767678e-06,
- "loss": 0.603486180305481,
- "mean_token_accuracy": 0.8015457391738892,
- "num_tokens": 668361.0,
- "step": 71
- },
- {
- "epoch": 0.0547112462006079,
- "grad_norm": 2.2630293369293213,
- "learning_rate": 1.792929292929293e-06,
- "loss": 0.6608274579048157,
- "mean_token_accuracy": 0.7753809690475464,
- "num_tokens": 679025.0,
- "step": 72
- },
- {
- "epoch": 0.05547112462006079,
- "grad_norm": 2.123962879180908,
- "learning_rate": 1.8181818181818183e-06,
- "loss": 0.4525482654571533,
- "mean_token_accuracy": 0.8425612449645996,
- "num_tokens": 688574.0,
- "step": 73
- },
- {
- "epoch": 0.05623100303951368,
- "grad_norm": 7.90519905090332,
- "learning_rate": 1.8434343434343434e-06,
- "loss": 0.6507195830345154,
- "mean_token_accuracy": 0.7714964151382446,
- "num_tokens": 694534.0,
- "step": 74
- },
- {
- "epoch": 0.056990881458966566,
- "grad_norm": 2.372203826904297,
- "learning_rate": 1.868686868686869e-06,
- "loss": 0.4458143413066864,
- "mean_token_accuracy": 0.7991449236869812,
- "num_tokens": 703114.0,
- "step": 75
- },
- {
- "epoch": 0.057750759878419454,
- "grad_norm": 2.918677568435669,
- "learning_rate": 1.8939393939393941e-06,
- "loss": 0.5614339113235474,
- "mean_token_accuracy": 0.8211464881896973,
- "num_tokens": 709038.0,
- "step": 76
- },
- {
- "epoch": 0.05851063829787234,
- "grad_norm": 1.6106709241867065,
- "learning_rate": 1.9191919191919192e-06,
- "loss": 0.5802098512649536,
- "mean_token_accuracy": 0.8055065870285034,
- "num_tokens": 730482.0,
- "step": 77
- },
- {
- "epoch": 0.05927051671732523,
- "grad_norm": 2.8069989681243896,
- "learning_rate": 1.944444444444445e-06,
- "loss": 0.5709059238433838,
- "mean_token_accuracy": 0.8024872541427612,
- "num_tokens": 751817.0,
- "step": 78
- },
- {
- "epoch": 0.06003039513677812,
- "grad_norm": 2.641667127609253,
- "learning_rate": 1.96969696969697e-06,
- "loss": 0.6480152606964111,
- "mean_token_accuracy": 0.7912271618843079,
- "num_tokens": 759236.0,
- "step": 79
- },
- {
- "epoch": 0.060790273556231005,
- "grad_norm": 2.6034350395202637,
- "learning_rate": 1.994949494949495e-06,
- "loss": 0.5535176396369934,
- "mean_token_accuracy": 0.7980542778968811,
- "num_tokens": 766496.0,
- "step": 80
- },
- {
- "epoch": 0.06155015197568389,
- "grad_norm": 1.7095069885253906,
- "learning_rate": 2.02020202020202e-06,
- "loss": 0.4545496106147766,
- "mean_token_accuracy": 0.8229660391807556,
- "num_tokens": 780124.0,
- "step": 81
- },
- {
- "epoch": 0.06231003039513678,
- "grad_norm": 3.788830518722534,
- "learning_rate": 2.0454545454545457e-06,
- "loss": 0.6679391264915466,
- "mean_token_accuracy": 0.7942397594451904,
- "num_tokens": 784555.0,
- "step": 82
- },
- {
- "epoch": 0.06306990881458967,
- "grad_norm": 2.009831666946411,
- "learning_rate": 2.070707070707071e-06,
- "loss": 0.5067101120948792,
- "mean_token_accuracy": 0.8276634216308594,
- "num_tokens": 797459.0,
- "step": 83
- },
- {
- "epoch": 0.06382978723404255,
- "grad_norm": 2.201627731323242,
- "learning_rate": 2.095959595959596e-06,
- "loss": 0.5012127161026001,
- "mean_token_accuracy": 0.8432504534721375,
- "num_tokens": 810817.0,
- "step": 84
- },
- {
- "epoch": 0.06458966565349544,
- "grad_norm": 2.492568016052246,
- "learning_rate": 2.1212121212121216e-06,
- "loss": 0.6142797470092773,
- "mean_token_accuracy": 0.8338661193847656,
- "num_tokens": 818191.0,
- "step": 85
- },
- {
- "epoch": 0.06534954407294832,
- "grad_norm": 2.8360862731933594,
- "learning_rate": 2.1464646464646467e-06,
- "loss": 0.5569300651550293,
- "mean_token_accuracy": 0.8121030330657959,
- "num_tokens": 825325.0,
- "step": 86
- },
- {
- "epoch": 0.06610942249240122,
- "grad_norm": 2.407548427581787,
- "learning_rate": 2.171717171717172e-06,
- "loss": 0.6442930102348328,
- "mean_token_accuracy": 0.792514443397522,
- "num_tokens": 834439.0,
- "step": 87
- },
- {
- "epoch": 0.0668693009118541,
- "grad_norm": 2.340728759765625,
- "learning_rate": 2.196969696969697e-06,
- "loss": 0.6494365930557251,
- "mean_token_accuracy": 0.7746615409851074,
- "num_tokens": 843078.0,
- "step": 88
- },
- {
- "epoch": 0.067629179331307,
- "grad_norm": 1.7703697681427002,
- "learning_rate": 2.222222222222222e-06,
- "loss": 0.598991870880127,
- "mean_token_accuracy": 0.7992157340049744,
- "num_tokens": 860171.0,
- "step": 89
- },
- {
- "epoch": 0.06838905775075987,
- "grad_norm": 2.5779271125793457,
- "learning_rate": 2.2474747474747476e-06,
- "loss": 0.5693082809448242,
- "mean_token_accuracy": 0.8093700408935547,
- "num_tokens": 866669.0,
- "step": 90
- },
- {
- "epoch": 0.06914893617021277,
- "grad_norm": 2.014092206954956,
- "learning_rate": 2.2727272727272728e-06,
- "loss": 0.5346695780754089,
- "mean_token_accuracy": 0.8165590763092041,
- "num_tokens": 876698.0,
- "step": 91
- },
- {
- "epoch": 0.06990881458966565,
- "grad_norm": 1.7555919885635376,
- "learning_rate": 2.2979797979797983e-06,
- "loss": 0.5321458578109741,
- "mean_token_accuracy": 0.8166656494140625,
- "num_tokens": 889488.0,
- "step": 92
- },
- {
- "epoch": 0.07066869300911854,
- "grad_norm": 1.8631824254989624,
- "learning_rate": 2.3232323232323234e-06,
- "loss": 0.5246532559394836,
- "mean_token_accuracy": 0.8088107705116272,
- "num_tokens": 901322.0,
- "step": 93
- },
- {
- "epoch": 0.07142857142857142,
- "grad_norm": 3.2332139015197754,
- "learning_rate": 2.348484848484849e-06,
- "loss": 0.5141711235046387,
- "mean_token_accuracy": 0.8382217884063721,
- "num_tokens": 905792.0,
- "step": 94
- },
- {
- "epoch": 0.07218844984802432,
- "grad_norm": 1.7806555032730103,
- "learning_rate": 2.373737373737374e-06,
- "loss": 0.5233149528503418,
- "mean_token_accuracy": 0.8101529479026794,
- "num_tokens": 917320.0,
- "step": 95
- },
- {
- "epoch": 0.0729483282674772,
- "grad_norm": 1.8169859647750854,
- "learning_rate": 2.3989898989898993e-06,
- "loss": 0.578881561756134,
- "mean_token_accuracy": 0.8044873476028442,
- "num_tokens": 931062.0,
- "step": 96
- },
- {
- "epoch": 0.0737082066869301,
- "grad_norm": 4.677402496337891,
- "learning_rate": 2.4242424242424244e-06,
- "loss": 0.7842556238174438,
- "mean_token_accuracy": 0.7579764127731323,
- "num_tokens": 934712.0,
- "step": 97
- },
- {
- "epoch": 0.07446808510638298,
- "grad_norm": 2.6987264156341553,
- "learning_rate": 2.4494949494949495e-06,
- "loss": 0.5669287443161011,
- "mean_token_accuracy": 0.8186933994293213,
- "num_tokens": 941058.0,
- "step": 98
- },
- {
- "epoch": 0.07522796352583587,
- "grad_norm": 1.6906023025512695,
- "learning_rate": 2.474747474747475e-06,
- "loss": 0.4976363778114319,
- "mean_token_accuracy": 0.8198553323745728,
- "num_tokens": 956509.0,
- "step": 99
- },
- {
- "epoch": 0.07598784194528875,
- "grad_norm": 2.7256152629852295,
- "learning_rate": 2.5e-06,
- "loss": 0.7138420343399048,
- "mean_token_accuracy": 0.7752805948257446,
- "num_tokens": 963920.0,
- "step": 100
- },
- {
- "epoch": 0.07674772036474165,
- "grad_norm": 2.174870491027832,
- "learning_rate": 2.5252525252525258e-06,
- "loss": 0.6733541488647461,
- "mean_token_accuracy": 0.7745175361633301,
- "num_tokens": 975268.0,
- "step": 101
- },
- {
- "epoch": 0.07750759878419453,
- "grad_norm": 1.5587213039398193,
- "learning_rate": 2.5505050505050505e-06,
- "loss": 0.44223445653915405,
- "mean_token_accuracy": 0.8278359174728394,
- "num_tokens": 991837.0,
- "step": 102
- },
- {
- "epoch": 0.07826747720364742,
- "grad_norm": 2.181840658187866,
- "learning_rate": 2.575757575757576e-06,
- "loss": 0.625128448009491,
- "mean_token_accuracy": 0.7941786050796509,
- "num_tokens": 1004325.0,
- "step": 103
- },
- {
- "epoch": 0.0790273556231003,
- "grad_norm": 1.4986687898635864,
- "learning_rate": 2.601010101010101e-06,
- "loss": 0.39262527227401733,
- "mean_token_accuracy": 0.8412648439407349,
- "num_tokens": 1018331.0,
- "step": 104
- },
- {
- "epoch": 0.0797872340425532,
- "grad_norm": 2.3416061401367188,
- "learning_rate": 2.6262626262626267e-06,
- "loss": 0.5495132803916931,
- "mean_token_accuracy": 0.8193322420120239,
- "num_tokens": 1026090.0,
- "step": 105
- },
- {
- "epoch": 0.08054711246200608,
- "grad_norm": 3.8168859481811523,
- "learning_rate": 2.6515151515151514e-06,
- "loss": 0.4898706376552582,
- "mean_token_accuracy": 0.8467956185340881,
- "num_tokens": 1029955.0,
- "step": 106
- },
- {
- "epoch": 0.08130699088145897,
- "grad_norm": 4.113908767700195,
- "learning_rate": 2.676767676767677e-06,
- "loss": 0.6189584732055664,
- "mean_token_accuracy": 0.8019394278526306,
- "num_tokens": 1033598.0,
- "step": 107
- },
- {
- "epoch": 0.08206686930091185,
- "grad_norm": 2.50003981590271,
- "learning_rate": 2.7020202020202025e-06,
- "loss": 0.6479471921920776,
- "mean_token_accuracy": 0.7790026664733887,
- "num_tokens": 1042533.0,
- "step": 108
- },
- {
- "epoch": 0.08282674772036475,
- "grad_norm": 1.408934473991394,
- "learning_rate": 2.7272727272727272e-06,
- "loss": 0.3909248113632202,
- "mean_token_accuracy": 0.8477586507797241,
- "num_tokens": 1061755.0,
- "step": 109
- },
- {
- "epoch": 0.08358662613981763,
- "grad_norm": 3.360633611679077,
- "learning_rate": 2.7525252525252528e-06,
- "loss": 0.6952459812164307,
- "mean_token_accuracy": 0.777535080909729,
- "num_tokens": 1067316.0,
- "step": 110
- },
- {
- "epoch": 0.08434650455927052,
- "grad_norm": 1.8631696701049805,
- "learning_rate": 2.7777777777777783e-06,
- "loss": 0.5420593023300171,
- "mean_token_accuracy": 0.8157662749290466,
- "num_tokens": 1079930.0,
- "step": 111
- },
- {
- "epoch": 0.0851063829787234,
- "grad_norm": 2.4308314323425293,
- "learning_rate": 2.803030303030303e-06,
- "loss": 0.5863882303237915,
- "mean_token_accuracy": 0.8206346035003662,
- "num_tokens": 1088069.0,
- "step": 112
- },
- {
- "epoch": 0.0858662613981763,
- "grad_norm": 2.922808885574341,
- "learning_rate": 2.8282828282828286e-06,
- "loss": 0.5217319130897522,
- "mean_token_accuracy": 0.8253234028816223,
- "num_tokens": 1093607.0,
- "step": 113
- },
- {
- "epoch": 0.08662613981762918,
- "grad_norm": 2.3596107959747314,
- "learning_rate": 2.8535353535353537e-06,
- "loss": 0.5070714950561523,
- "mean_token_accuracy": 0.8258323669433594,
- "num_tokens": 1100405.0,
- "step": 114
- },
- {
- "epoch": 0.08738601823708207,
- "grad_norm": 3.0853066444396973,
- "learning_rate": 2.8787878787878793e-06,
- "loss": 0.591964840888977,
- "mean_token_accuracy": 0.8047322630882263,
- "num_tokens": 1107535.0,
- "step": 115
- },
- {
- "epoch": 0.08814589665653495,
- "grad_norm": 1.9251092672348022,
- "learning_rate": 2.904040404040404e-06,
- "loss": 0.5226191878318787,
- "mean_token_accuracy": 0.8022720217704773,
- "num_tokens": 1118716.0,
- "step": 116
- },
- {
- "epoch": 0.08890577507598785,
- "grad_norm": 1.9692988395690918,
- "learning_rate": 2.9292929292929295e-06,
- "loss": 0.5462069511413574,
- "mean_token_accuracy": 0.8157015442848206,
- "num_tokens": 1131917.0,
- "step": 117
- },
- {
- "epoch": 0.08966565349544073,
- "grad_norm": 1.4738909006118774,
- "learning_rate": 2.954545454545455e-06,
- "loss": 0.4564219117164612,
- "mean_token_accuracy": 0.849632978439331,
- "num_tokens": 1148534.0,
- "step": 118
- },
- {
- "epoch": 0.09042553191489362,
- "grad_norm": 2.72646164894104,
- "learning_rate": 2.97979797979798e-06,
- "loss": 0.6654808521270752,
- "mean_token_accuracy": 0.7752684354782104,
- "num_tokens": 1155438.0,
- "step": 119
- },
- {
- "epoch": 0.0911854103343465,
- "grad_norm": 2.7843852043151855,
- "learning_rate": 3.0050505050505054e-06,
- "loss": 0.5354680418968201,
- "mean_token_accuracy": 0.8196378946304321,
- "num_tokens": 1161815.0,
- "step": 120
- },
- {
- "epoch": 0.0919452887537994,
- "grad_norm": 2.8052573204040527,
- "learning_rate": 3.0303030303030305e-06,
- "loss": 0.6366757154464722,
- "mean_token_accuracy": 0.7967483997344971,
- "num_tokens": 1168295.0,
- "step": 121
- },
- {
- "epoch": 0.09270516717325228,
- "grad_norm": 2.7462735176086426,
- "learning_rate": 3.055555555555556e-06,
- "loss": 0.59470534324646,
- "mean_token_accuracy": 0.8023771047592163,
- "num_tokens": 1174502.0,
- "step": 122
- },
- {
- "epoch": 0.09346504559270517,
- "grad_norm": 2.2743821144104004,
- "learning_rate": 3.0808080808080807e-06,
- "loss": 0.5720560550689697,
- "mean_token_accuracy": 0.8162771463394165,
- "num_tokens": 1183615.0,
- "step": 123
- },
- {
- "epoch": 0.09422492401215805,
- "grad_norm": 1.8669533729553223,
- "learning_rate": 3.1060606060606063e-06,
- "loss": 0.4655378758907318,
- "mean_token_accuracy": 0.8360732793807983,
- "num_tokens": 1193761.0,
- "step": 124
- },
- {
- "epoch": 0.09498480243161095,
- "grad_norm": 1.7666901350021362,
- "learning_rate": 3.131313131313132e-06,
- "loss": 0.5524153709411621,
- "mean_token_accuracy": 0.8252713680267334,
- "num_tokens": 1207870.0,
- "step": 125
- },
- {
- "epoch": 0.09574468085106383,
- "grad_norm": 2.4720070362091064,
- "learning_rate": 3.1565656565656566e-06,
- "loss": 0.5003011226654053,
- "mean_token_accuracy": 0.8491042852401733,
- "num_tokens": 1214603.0,
- "step": 126
- },
- {
- "epoch": 0.09650455927051672,
- "grad_norm": 1.6500422954559326,
- "learning_rate": 3.181818181818182e-06,
- "loss": 0.5137069225311279,
- "mean_token_accuracy": 0.8273531198501587,
- "num_tokens": 1228717.0,
- "step": 127
- },
- {
- "epoch": 0.0972644376899696,
- "grad_norm": 3.402543067932129,
- "learning_rate": 3.2070707070707072e-06,
- "loss": 0.708167552947998,
- "mean_token_accuracy": 0.7705385684967041,
- "num_tokens": 1234361.0,
- "step": 128
- },
- {
- "epoch": 0.0980243161094225,
- "grad_norm": 2.547285795211792,
- "learning_rate": 3.232323232323233e-06,
- "loss": 0.6020137071609497,
- "mean_token_accuracy": 0.7981340289115906,
- "num_tokens": 1244169.0,
- "step": 129
- },
- {
- "epoch": 0.09878419452887538,
- "grad_norm": 2.0578792095184326,
- "learning_rate": 3.257575757575758e-06,
- "loss": 0.4425000250339508,
- "mean_token_accuracy": 0.8567807674407959,
- "num_tokens": 1252709.0,
- "step": 130
- },
- {
- "epoch": 0.09954407294832827,
- "grad_norm": 1.672614336013794,
- "learning_rate": 3.282828282828283e-06,
- "loss": 0.4860966205596924,
- "mean_token_accuracy": 0.8393139243125916,
- "num_tokens": 1265766.0,
- "step": 131
- },
- {
- "epoch": 0.10030395136778116,
- "grad_norm": 3.2560198307037354,
- "learning_rate": 3.3080808080808086e-06,
- "loss": 0.624736487865448,
- "mean_token_accuracy": 0.7875322699546814,
- "num_tokens": 1270779.0,
- "step": 132
- },
- {
- "epoch": 0.10106382978723404,
- "grad_norm": 2.4468185901641846,
- "learning_rate": 3.3333333333333333e-06,
- "loss": 0.5062227249145508,
- "mean_token_accuracy": 0.8217229843139648,
- "num_tokens": 1277113.0,
- "step": 133
- },
- {
- "epoch": 0.10182370820668693,
- "grad_norm": 2.6371328830718994,
- "learning_rate": 3.358585858585859e-06,
- "loss": 0.477113276720047,
- "mean_token_accuracy": 0.8605583906173706,
- "num_tokens": 1282514.0,
- "step": 134
- },
- {
- "epoch": 0.10258358662613981,
- "grad_norm": 2.48421311378479,
- "learning_rate": 3.3838383838383844e-06,
- "loss": 0.40855684876441956,
- "mean_token_accuracy": 0.864548921585083,
- "num_tokens": 1287859.0,
- "step": 135
- },
- {
- "epoch": 0.1033434650455927,
- "grad_norm": 1.993099331855774,
- "learning_rate": 3.409090909090909e-06,
- "loss": 0.5913145542144775,
- "mean_token_accuracy": 0.8248485922813416,
- "num_tokens": 1301074.0,
- "step": 136
- },
- {
- "epoch": 0.10410334346504559,
- "grad_norm": 3.5947680473327637,
- "learning_rate": 3.4343434343434347e-06,
- "loss": 0.5028599500656128,
- "mean_token_accuracy": 0.8367215394973755,
- "num_tokens": 1305219.0,
- "step": 137
- },
- {
- "epoch": 0.10486322188449848,
- "grad_norm": 2.5778582096099854,
- "learning_rate": 3.45959595959596e-06,
- "loss": 0.5297672748565674,
- "mean_token_accuracy": 0.8232187032699585,
- "num_tokens": 1312482.0,
- "step": 138
- },
- {
- "epoch": 0.10562310030395136,
- "grad_norm": 1.8961588144302368,
- "learning_rate": 3.4848484848484854e-06,
- "loss": 0.39954107999801636,
- "mean_token_accuracy": 0.8605833053588867,
- "num_tokens": 1323404.0,
- "step": 139
- },
- {
- "epoch": 0.10638297872340426,
- "grad_norm": 1.9687960147857666,
- "learning_rate": 3.51010101010101e-06,
- "loss": 0.48791587352752686,
- "mean_token_accuracy": 0.8200347423553467,
- "num_tokens": 1333027.0,
- "step": 140
- },
- {
- "epoch": 0.10714285714285714,
- "grad_norm": 2.520242691040039,
- "learning_rate": 3.5353535353535356e-06,
- "loss": 0.6106002330780029,
- "mean_token_accuracy": 0.790692150592804,
- "num_tokens": 1340999.0,
- "step": 141
- },
- {
- "epoch": 0.10790273556231003,
- "grad_norm": 3.751617431640625,
- "learning_rate": 3.560606060606061e-06,
- "loss": 0.48141729831695557,
- "mean_token_accuracy": 0.8421382904052734,
- "num_tokens": 1344687.0,
- "step": 142
- },
- {
- "epoch": 0.10866261398176291,
- "grad_norm": 2.7101709842681885,
- "learning_rate": 3.585858585858586e-06,
- "loss": 0.5375241637229919,
- "mean_token_accuracy": 0.8061438202857971,
- "num_tokens": 1350192.0,
- "step": 143
- },
- {
- "epoch": 0.1094224924012158,
- "grad_norm": 2.583484411239624,
- "learning_rate": 3.6111111111111115e-06,
- "loss": 0.6492470502853394,
- "mean_token_accuracy": 0.7863001823425293,
- "num_tokens": 1358148.0,
- "step": 144
- },
- {
- "epoch": 0.11018237082066869,
- "grad_norm": 1.792561650276184,
- "learning_rate": 3.6363636363636366e-06,
- "loss": 0.48480600118637085,
- "mean_token_accuracy": 0.8358709812164307,
- "num_tokens": 1369519.0,
- "step": 145
- },
- {
- "epoch": 0.11094224924012158,
- "grad_norm": 2.6480472087860107,
- "learning_rate": 3.661616161616162e-06,
- "loss": 0.5268933176994324,
- "mean_token_accuracy": 0.8214013576507568,
- "num_tokens": 1375862.0,
- "step": 146
- },
- {
- "epoch": 0.11170212765957446,
- "grad_norm": 2.3174469470977783,
- "learning_rate": 3.686868686868687e-06,
- "loss": 0.42517897486686707,
- "mean_token_accuracy": 0.8523461222648621,
- "num_tokens": 1381546.0,
- "step": 147
- },
- {
- "epoch": 0.11246200607902736,
- "grad_norm": 3.0090949535369873,
- "learning_rate": 3.7121212121212124e-06,
- "loss": 0.4042336940765381,
- "mean_token_accuracy": 0.8670448064804077,
- "num_tokens": 1385896.0,
- "step": 148
- },
- {
- "epoch": 0.11322188449848024,
- "grad_norm": 2.4928104877471924,
- "learning_rate": 3.737373737373738e-06,
- "loss": 0.6498878598213196,
- "mean_token_accuracy": 0.7967068552970886,
- "num_tokens": 1394169.0,
- "step": 149
- },
- {
- "epoch": 0.11398176291793313,
- "grad_norm": 1.5984913110733032,
- "learning_rate": 3.7626262626262627e-06,
- "loss": 0.546096920967102,
- "mean_token_accuracy": 0.8035850524902344,
- "num_tokens": 1408785.0,
- "step": 150
- },
- {
- "epoch": 0.11474164133738601,
- "grad_norm": 2.3663532733917236,
- "learning_rate": 3.7878787878787882e-06,
- "loss": 0.6111721992492676,
- "mean_token_accuracy": 0.8015355467796326,
- "num_tokens": 1417510.0,
- "step": 151
- },
- {
- "epoch": 0.11550151975683891,
- "grad_norm": 2.518932819366455,
- "learning_rate": 3.8131313131313138e-06,
- "loss": 0.5274964570999146,
- "mean_token_accuracy": 0.8155480623245239,
- "num_tokens": 1424186.0,
- "step": 152
- },
- {
- "epoch": 0.11626139817629179,
- "grad_norm": 2.14353609085083,
- "learning_rate": 3.8383838383838385e-06,
- "loss": 0.5283297896385193,
- "mean_token_accuracy": 0.8275758028030396,
- "num_tokens": 1432630.0,
- "step": 153
- },
- {
- "epoch": 0.11702127659574468,
- "grad_norm": 1.8243604898452759,
- "learning_rate": 3.863636363636364e-06,
- "loss": 0.41854870319366455,
- "mean_token_accuracy": 0.8222295045852661,
- "num_tokens": 1442691.0,
- "step": 154
- },
- {
- "epoch": 0.11778115501519756,
- "grad_norm": 2.088212251663208,
- "learning_rate": 3.88888888888889e-06,
- "loss": 0.6062943339347839,
- "mean_token_accuracy": 0.8009427785873413,
- "num_tokens": 1456890.0,
- "step": 155
- },
- {
- "epoch": 0.11854103343465046,
- "grad_norm": 1.3469511270523071,
- "learning_rate": 3.914141414141415e-06,
- "loss": 0.4390433728694916,
- "mean_token_accuracy": 0.8436295986175537,
- "num_tokens": 1475349.0,
- "step": 156
- },
- {
- "epoch": 0.11930091185410334,
- "grad_norm": 3.247023105621338,
- "learning_rate": 3.93939393939394e-06,
- "loss": 0.6490433216094971,
- "mean_token_accuracy": 0.8037861585617065,
- "num_tokens": 1479952.0,
- "step": 157
- },
- {
- "epoch": 0.12006079027355623,
- "grad_norm": 2.6610445976257324,
- "learning_rate": 3.964646464646465e-06,
- "loss": 0.6221826076507568,
- "mean_token_accuracy": 0.7848749160766602,
- "num_tokens": 1487306.0,
- "step": 158
- },
- {
- "epoch": 0.12082066869300911,
- "grad_norm": 2.3060810565948486,
- "learning_rate": 3.98989898989899e-06,
- "loss": 0.5052388310432434,
- "mean_token_accuracy": 0.8281195759773254,
- "num_tokens": 1495367.0,
- "step": 159
- },
- {
- "epoch": 0.12158054711246201,
- "grad_norm": 2.504448652267456,
- "learning_rate": 4.015151515151515e-06,
- "loss": 0.5005477666854858,
- "mean_token_accuracy": 0.8408058881759644,
- "num_tokens": 1502069.0,
- "step": 160
- },
- {
- "epoch": 0.12234042553191489,
- "grad_norm": 3.993938446044922,
- "learning_rate": 4.04040404040404e-06,
- "loss": 0.5569638013839722,
- "mean_token_accuracy": 0.8095242977142334,
- "num_tokens": 1510224.0,
- "step": 161
- },
- {
- "epoch": 0.12310030395136778,
- "grad_norm": 2.2287683486938477,
- "learning_rate": 4.065656565656566e-06,
- "loss": 0.524042546749115,
- "mean_token_accuracy": 0.8102203607559204,
- "num_tokens": 1518364.0,
- "step": 162
- },
- {
- "epoch": 0.12386018237082067,
- "grad_norm": 1.9531738758087158,
- "learning_rate": 4.0909090909090915e-06,
- "loss": 0.45794573426246643,
- "mean_token_accuracy": 0.8560376167297363,
- "num_tokens": 1528097.0,
- "step": 163
- },
- {
- "epoch": 0.12462006079027356,
- "grad_norm": 1.5841206312179565,
- "learning_rate": 4.116161616161617e-06,
- "loss": 0.5420972108840942,
- "mean_token_accuracy": 0.8092726469039917,
- "num_tokens": 1544119.0,
- "step": 164
- },
- {
- "epoch": 0.12537993920972645,
- "grad_norm": 1.7536218166351318,
- "learning_rate": 4.141414141414142e-06,
- "loss": 0.554668664932251,
- "mean_token_accuracy": 0.8193825483322144,
- "num_tokens": 1559140.0,
- "step": 165
- },
- {
- "epoch": 0.12613981762917933,
- "grad_norm": 3.545454740524292,
- "learning_rate": 4.166666666666667e-06,
- "loss": 0.580947995185852,
- "mean_token_accuracy": 0.8286383152008057,
- "num_tokens": 1563625.0,
- "step": 166
- },
- {
- "epoch": 0.12689969604863222,
- "grad_norm": 1.6608915328979492,
- "learning_rate": 4.191919191919192e-06,
- "loss": 0.5523324012756348,
- "mean_token_accuracy": 0.8155215978622437,
- "num_tokens": 1574945.0,
- "step": 167
- },
- {
- "epoch": 0.1276595744680851,
- "grad_norm": 1.4832708835601807,
- "learning_rate": 4.217171717171717e-06,
- "loss": 0.5133191347122192,
- "mean_token_accuracy": 0.8367571830749512,
- "num_tokens": 1595865.0,
- "step": 168
- },
- {
- "epoch": 0.128419452887538,
- "grad_norm": 1.7807520627975464,
- "learning_rate": 4.242424242424243e-06,
- "loss": 0.5131410360336304,
- "mean_token_accuracy": 0.8129367232322693,
- "num_tokens": 1608723.0,
- "step": 169
- },
- {
- "epoch": 0.12917933130699089,
- "grad_norm": 2.707569122314453,
- "learning_rate": 4.267676767676767e-06,
- "loss": 0.6129013299942017,
- "mean_token_accuracy": 0.7926048040390015,
- "num_tokens": 1616136.0,
- "step": 170
- },
- {
- "epoch": 0.12993920972644377,
- "grad_norm": 2.5831644535064697,
- "learning_rate": 4.292929292929293e-06,
- "loss": 0.6264227628707886,
- "mean_token_accuracy": 0.8074911236763,
- "num_tokens": 1624228.0,
- "step": 171
- },
- {
- "epoch": 0.13069908814589665,
- "grad_norm": 3.1124250888824463,
- "learning_rate": 4.3181818181818185e-06,
- "loss": 0.41763827204704285,
- "mean_token_accuracy": 0.8565453290939331,
- "num_tokens": 1628098.0,
- "step": 172
- },
- {
- "epoch": 0.13145896656534956,
- "grad_norm": 2.3214211463928223,
- "learning_rate": 4.343434343434344e-06,
- "loss": 0.421974778175354,
- "mean_token_accuracy": 0.8391546010971069,
- "num_tokens": 1634950.0,
- "step": 173
- },
- {
- "epoch": 0.13221884498480244,
- "grad_norm": 2.1010327339172363,
- "learning_rate": 4.368686868686869e-06,
- "loss": 0.5307331681251526,
- "mean_token_accuracy": 0.8139588236808777,
- "num_tokens": 1644132.0,
- "step": 174
- },
- {
- "epoch": 0.13297872340425532,
- "grad_norm": 2.533612012863159,
- "learning_rate": 4.393939393939394e-06,
- "loss": 0.5626664161682129,
- "mean_token_accuracy": 0.8029808402061462,
- "num_tokens": 1651637.0,
- "step": 175
- },
- {
- "epoch": 0.1337386018237082,
- "grad_norm": 1.669508457183838,
- "learning_rate": 4.41919191919192e-06,
- "loss": 0.5351508259773254,
- "mean_token_accuracy": 0.8281655311584473,
- "num_tokens": 1666776.0,
- "step": 176
- },
- {
- "epoch": 0.1344984802431611,
- "grad_norm": 1.7579659223556519,
- "learning_rate": 4.444444444444444e-06,
- "loss": 0.5235031247138977,
- "mean_token_accuracy": 0.8143284320831299,
- "num_tokens": 1679241.0,
- "step": 177
- },
- {
- "epoch": 0.135258358662614,
- "grad_norm": 3.123563528060913,
- "learning_rate": 4.46969696969697e-06,
- "loss": 0.43051332235336304,
- "mean_token_accuracy": 0.8518186211585999,
- "num_tokens": 1683317.0,
- "step": 178
- },
- {
- "epoch": 0.13601823708206687,
- "grad_norm": 2.2411575317382812,
- "learning_rate": 4.494949494949495e-06,
- "loss": 0.5471380949020386,
- "mean_token_accuracy": 0.8267596960067749,
- "num_tokens": 1691366.0,
- "step": 179
- },
- {
- "epoch": 0.13677811550151975,
- "grad_norm": 2.621973991394043,
- "learning_rate": 4.520202020202021e-06,
- "loss": 0.5685839653015137,
- "mean_token_accuracy": 0.8260642290115356,
- "num_tokens": 1698148.0,
- "step": 180
- },
- {
- "epoch": 0.13753799392097266,
- "grad_norm": 2.1553852558135986,
- "learning_rate": 4.5454545454545455e-06,
- "loss": 0.5703883171081543,
- "mean_token_accuracy": 0.8219090700149536,
- "num_tokens": 1707225.0,
- "step": 181
- },
- {
- "epoch": 0.13829787234042554,
- "grad_norm": 5.1767897605896,
- "learning_rate": 4.5707070707070715e-06,
- "loss": 0.32704639434814453,
- "mean_token_accuracy": 0.8754568099975586,
- "num_tokens": 1712748.0,
- "step": 182
- },
- {
- "epoch": 0.13905775075987842,
- "grad_norm": 2.609168291091919,
- "learning_rate": 4.595959595959597e-06,
- "loss": 0.5939987301826477,
- "mean_token_accuracy": 0.8034975528717041,
- "num_tokens": 1719932.0,
- "step": 183
- },
- {
- "epoch": 0.1398176291793313,
- "grad_norm": 2.2059099674224854,
- "learning_rate": 4.621212121212122e-06,
- "loss": 0.5310720205307007,
- "mean_token_accuracy": 0.8177368640899658,
- "num_tokens": 1727640.0,
- "step": 184
- },
- {
- "epoch": 0.1405775075987842,
- "grad_norm": 2.6367759704589844,
- "learning_rate": 4.646464646464647e-06,
- "loss": 0.522086501121521,
- "mean_token_accuracy": 0.826233983039856,
- "num_tokens": 1733609.0,
- "step": 185
- },
- {
- "epoch": 0.1413373860182371,
- "grad_norm": 3.326732873916626,
- "learning_rate": 4.671717171717172e-06,
- "loss": 0.4127829074859619,
- "mean_token_accuracy": 0.8551101684570312,
- "num_tokens": 1737256.0,
- "step": 186
- },
- {
- "epoch": 0.14209726443768997,
- "grad_norm": 1.828412413597107,
- "learning_rate": 4.696969696969698e-06,
- "loss": 0.5444269180297852,
- "mean_token_accuracy": 0.8350818157196045,
- "num_tokens": 1750196.0,
- "step": 187
- },
- {
- "epoch": 0.14285714285714285,
- "grad_norm": 3.209203004837036,
- "learning_rate": 4.722222222222222e-06,
- "loss": 0.5087994933128357,
- "mean_token_accuracy": 0.8349015712738037,
- "num_tokens": 1754836.0,
- "step": 188
- },
- {
- "epoch": 0.14361702127659576,
- "grad_norm": 1.7339166402816772,
- "learning_rate": 4.747474747474748e-06,
- "loss": 0.5151352286338806,
- "mean_token_accuracy": 0.8321266174316406,
- "num_tokens": 1766015.0,
- "step": 189
- },
- {
- "epoch": 0.14437689969604864,
- "grad_norm": 2.699068069458008,
- "learning_rate": 4.772727272727273e-06,
- "loss": 0.4406203031539917,
- "mean_token_accuracy": 0.8425000905990601,
- "num_tokens": 1771684.0,
- "step": 190
- },
- {
- "epoch": 0.14513677811550152,
- "grad_norm": 2.8117282390594482,
- "learning_rate": 4.7979797979797985e-06,
- "loss": 0.40428489446640015,
- "mean_token_accuracy": 0.8654326796531677,
- "num_tokens": 1776301.0,
- "step": 191
- },
- {
- "epoch": 0.1458966565349544,
- "grad_norm": 2.9204647541046143,
- "learning_rate": 4.823232323232324e-06,
- "loss": 0.4191770553588867,
- "mean_token_accuracy": 0.8574687242507935,
- "num_tokens": 1781678.0,
- "step": 192
- },
- {
- "epoch": 0.1466565349544073,
- "grad_norm": 2.1648988723754883,
- "learning_rate": 4.848484848484849e-06,
- "loss": 0.5839012861251831,
- "mean_token_accuracy": 0.8053664565086365,
- "num_tokens": 1792516.0,
- "step": 193
- },
- {
- "epoch": 0.1474164133738602,
- "grad_norm": 2.3221631050109863,
- "learning_rate": 4.873737373737374e-06,
- "loss": 0.5037894248962402,
- "mean_token_accuracy": 0.8427227139472961,
- "num_tokens": 1800192.0,
- "step": 194
- },
- {
- "epoch": 0.14817629179331307,
- "grad_norm": 2.4536430835723877,
- "learning_rate": 4.898989898989899e-06,
- "loss": 0.42326074838638306,
- "mean_token_accuracy": 0.8510633111000061,
- "num_tokens": 1806159.0,
- "step": 195
- },
- {
- "epoch": 0.14893617021276595,
- "grad_norm": 2.4875805377960205,
- "learning_rate": 4.924242424242425e-06,
- "loss": 0.539531409740448,
- "mean_token_accuracy": 0.8060250282287598,
- "num_tokens": 1813392.0,
- "step": 196
- },
- {
- "epoch": 0.14969604863221886,
- "grad_norm": 2.1664798259735107,
- "learning_rate": 4.94949494949495e-06,
- "loss": 0.42502015829086304,
- "mean_token_accuracy": 0.8503251075744629,
- "num_tokens": 1821424.0,
- "step": 197
- },
- {
- "epoch": 0.15045592705167174,
- "grad_norm": 2.568808078765869,
- "learning_rate": 4.974747474747475e-06,
- "loss": 0.5025098323822021,
- "mean_token_accuracy": 0.8182311058044434,
- "num_tokens": 1827225.0,
- "step": 198
- },
- {
- "epoch": 0.15121580547112462,
- "grad_norm": 1.9116802215576172,
- "learning_rate": 5e-06,
- "loss": 0.4907258450984955,
- "mean_token_accuracy": 0.8310189843177795,
- "num_tokens": 1836297.0,
- "step": 199
- },
- {
- "epoch": 0.1519756838905775,
- "grad_norm": 3.150765895843506,
- "learning_rate": 4.999999122701883e-06,
- "loss": 0.390616774559021,
- "mean_token_accuracy": 0.8626647591590881,
- "num_tokens": 1839984.0,
- "step": 200
- },
- {
- "epoch": 0.15273556231003038,
- "grad_norm": 3.2229044437408447,
- "learning_rate": 4.999996490808146e-06,
- "loss": 0.48009657859802246,
- "mean_token_accuracy": 0.825214147567749,
- "num_tokens": 1844610.0,
- "step": 201
- },
- {
- "epoch": 0.1534954407294833,
- "grad_norm": 1.4473289251327515,
- "learning_rate": 4.9999921043206356e-06,
- "loss": 0.40135183930397034,
- "mean_token_accuracy": 0.8537827730178833,
- "num_tokens": 1859573.0,
- "step": 202
- },
- {
- "epoch": 0.15425531914893617,
- "grad_norm": 4.072319507598877,
- "learning_rate": 4.999985963242432e-06,
- "loss": 0.6158689260482788,
- "mean_token_accuracy": 0.8075432777404785,
- "num_tokens": 1863147.0,
- "step": 203
- },
- {
- "epoch": 0.15501519756838905,
- "grad_norm": 3.15741229057312,
- "learning_rate": 4.999978067577844e-06,
- "loss": 0.4603108763694763,
- "mean_token_accuracy": 0.8418779373168945,
- "num_tokens": 1867201.0,
- "step": 204
- },
- {
- "epoch": 0.15577507598784193,
- "grad_norm": 2.1925418376922607,
- "learning_rate": 4.999968417332415e-06,
- "loss": 0.5552488565444946,
- "mean_token_accuracy": 0.8216016292572021,
- "num_tokens": 1874837.0,
- "step": 205
- },
- {
- "epoch": 0.15653495440729484,
- "grad_norm": 2.2518117427825928,
- "learning_rate": 4.999957012512916e-06,
- "loss": 0.4912569522857666,
- "mean_token_accuracy": 0.8284667730331421,
- "num_tokens": 1881842.0,
- "step": 206
- },
- {
- "epoch": 0.15729483282674772,
- "grad_norm": 1.8223762512207031,
- "learning_rate": 4.999943853127351e-06,
- "loss": 0.47709137201309204,
- "mean_token_accuracy": 0.8311659097671509,
- "num_tokens": 1890805.0,
- "step": 207
- },
- {
- "epoch": 0.1580547112462006,
- "grad_norm": 2.066499948501587,
- "learning_rate": 4.999928939184958e-06,
- "loss": 0.44794657826423645,
- "mean_token_accuracy": 0.8513424396514893,
- "num_tokens": 1898264.0,
- "step": 208
- },
- {
- "epoch": 0.15881458966565348,
- "grad_norm": 3.53865909576416,
- "learning_rate": 4.999912270696202e-06,
- "loss": 0.5978270769119263,
- "mean_token_accuracy": 0.8080137968063354,
- "num_tokens": 1902435.0,
- "step": 209
- },
- {
- "epoch": 0.1595744680851064,
- "grad_norm": 2.0760679244995117,
- "learning_rate": 4.999893847672783e-06,
- "loss": 0.5930601358413696,
- "mean_token_accuracy": 0.8028650283813477,
- "num_tokens": 1912252.0,
- "step": 210
- },
- {
- "epoch": 0.16033434650455927,
- "grad_norm": 2.21551513671875,
- "learning_rate": 4.99987367012763e-06,
- "loss": 0.6336753964424133,
- "mean_token_accuracy": 0.7902286648750305,
- "num_tokens": 1922095.0,
- "step": 211
- },
- {
- "epoch": 0.16109422492401215,
- "grad_norm": 1.7654480934143066,
- "learning_rate": 4.999851738074904e-06,
- "loss": 0.6373403668403625,
- "mean_token_accuracy": 0.7802424430847168,
- "num_tokens": 1938962.0,
- "step": 212
- },
- {
- "epoch": 0.16185410334346503,
- "grad_norm": 2.852834701538086,
- "learning_rate": 4.9998280515300006e-06,
- "loss": 0.6418683528900146,
- "mean_token_accuracy": 0.7895716428756714,
- "num_tokens": 1944668.0,
- "step": 213
- },
- {
- "epoch": 0.16261398176291794,
- "grad_norm": 3.4737212657928467,
- "learning_rate": 4.999802610509541e-06,
- "loss": 0.6323273181915283,
- "mean_token_accuracy": 0.7982614636421204,
- "num_tokens": 1949142.0,
- "step": 214
- },
- {
- "epoch": 0.16337386018237082,
- "grad_norm": 3.0802664756774902,
- "learning_rate": 4.999775415031381e-06,
- "loss": 0.5929068326950073,
- "mean_token_accuracy": 0.8112219572067261,
- "num_tokens": 1954141.0,
- "step": 215
- },
- {
- "epoch": 0.1641337386018237,
- "grad_norm": 2.9808855056762695,
- "learning_rate": 4.999746465114609e-06,
- "loss": 0.5556406378746033,
- "mean_token_accuracy": 0.8117628693580627,
- "num_tokens": 1959406.0,
- "step": 216
- },
- {
- "epoch": 0.16489361702127658,
- "grad_norm": 1.7346166372299194,
- "learning_rate": 4.999715760779541e-06,
- "loss": 0.5122925043106079,
- "mean_token_accuracy": 0.8040724992752075,
- "num_tokens": 1971921.0,
- "step": 217
- },
- {
- "epoch": 0.1656534954407295,
- "grad_norm": 1.4183907508850098,
- "learning_rate": 4.999683302047729e-06,
- "loss": 0.46471893787384033,
- "mean_token_accuracy": 0.8381330966949463,
- "num_tokens": 1988863.0,
- "step": 218
- },
- {
- "epoch": 0.16641337386018237,
- "grad_norm": 1.6797802448272705,
- "learning_rate": 4.999649088941951e-06,
- "loss": 0.38348832726478577,
- "mean_token_accuracy": 0.8344278931617737,
- "num_tokens": 2000003.0,
- "step": 219
- },
- {
- "epoch": 0.16717325227963525,
- "grad_norm": 3.036963939666748,
- "learning_rate": 4.999613121486222e-06,
- "loss": 0.6062780618667603,
- "mean_token_accuracy": 0.8217900991439819,
- "num_tokens": 2004813.0,
- "step": 220
- },
- {
- "epoch": 0.16793313069908813,
- "grad_norm": 2.0343217849731445,
- "learning_rate": 4.999575399705782e-06,
- "loss": 0.5052450895309448,
- "mean_token_accuracy": 0.8368623852729797,
- "num_tokens": 2013565.0,
- "step": 221
- },
- {
- "epoch": 0.16869300911854104,
- "grad_norm": 2.1162009239196777,
- "learning_rate": 4.9995359236271094e-06,
- "loss": 0.5169756412506104,
- "mean_token_accuracy": 0.8339958190917969,
- "num_tokens": 2025763.0,
- "step": 222
- },
- {
- "epoch": 0.16945288753799392,
- "grad_norm": 2.055333375930786,
- "learning_rate": 4.9994946932779076e-06,
- "loss": 0.6327048540115356,
- "mean_token_accuracy": 0.8078711032867432,
- "num_tokens": 2037005.0,
- "step": 223
- },
- {
- "epoch": 0.1702127659574468,
- "grad_norm": 3.334620475769043,
- "learning_rate": 4.999451708687114e-06,
- "loss": 0.5688358545303345,
- "mean_token_accuracy": 0.8015589714050293,
- "num_tokens": 2041473.0,
- "step": 224
- },
- {
- "epoch": 0.17097264437689969,
- "grad_norm": 2.3734676837921143,
- "learning_rate": 4.999406969884897e-06,
- "loss": 0.5673821568489075,
- "mean_token_accuracy": 0.8054057359695435,
- "num_tokens": 2049397.0,
- "step": 225
- },
- {
- "epoch": 0.1717325227963526,
- "grad_norm": 1.807358980178833,
- "learning_rate": 4.999360476902656e-06,
- "loss": 0.4376158118247986,
- "mean_token_accuracy": 0.8456039428710938,
- "num_tokens": 2058721.0,
- "step": 226
- },
- {
- "epoch": 0.17249240121580547,
- "grad_norm": 3.231638193130493,
- "learning_rate": 4.999312229773022e-06,
- "loss": 0.5592809915542603,
- "mean_token_accuracy": 0.8170154094696045,
- "num_tokens": 2063455.0,
- "step": 227
- },
- {
- "epoch": 0.17325227963525835,
- "grad_norm": 2.2717151641845703,
- "learning_rate": 4.999262228529855e-06,
- "loss": 0.6144396066665649,
- "mean_token_accuracy": 0.7948470115661621,
- "num_tokens": 2071686.0,
- "step": 228
- },
- {
- "epoch": 0.17401215805471124,
- "grad_norm": 1.4171342849731445,
- "learning_rate": 4.99921047320825e-06,
- "loss": 0.43680912256240845,
- "mean_token_accuracy": 0.84850013256073,
- "num_tokens": 2086999.0,
- "step": 229
- },
- {
- "epoch": 0.17477203647416414,
- "grad_norm": 3.162736654281616,
- "learning_rate": 4.99915696384453e-06,
- "loss": 0.6025407910346985,
- "mean_token_accuracy": 0.8042335510253906,
- "num_tokens": 2092001.0,
- "step": 230
- },
- {
- "epoch": 0.17553191489361702,
- "grad_norm": 1.8672804832458496,
- "learning_rate": 4.99910170047625e-06,
- "loss": 0.5843087434768677,
- "mean_token_accuracy": 0.8016980886459351,
- "num_tokens": 2103372.0,
- "step": 231
- },
- {
- "epoch": 0.1762917933130699,
- "grad_norm": 2.967587471008301,
- "learning_rate": 4.999044683142196e-06,
- "loss": 0.5123642086982727,
- "mean_token_accuracy": 0.8216149806976318,
- "num_tokens": 2108008.0,
- "step": 232
- },
- {
- "epoch": 0.1770516717325228,
- "grad_norm": 1.9651981592178345,
- "learning_rate": 4.998985911882383e-06,
- "loss": 0.5868178606033325,
- "mean_token_accuracy": 0.7904198169708252,
- "num_tokens": 2119009.0,
- "step": 233
- },
- {
- "epoch": 0.1778115501519757,
- "grad_norm": 2.7785449028015137,
- "learning_rate": 4.998925386738063e-06,
- "loss": 0.5075510144233704,
- "mean_token_accuracy": 0.8280210494995117,
- "num_tokens": 2124915.0,
- "step": 234
- },
- {
- "epoch": 0.17857142857142858,
- "grad_norm": 2.957470417022705,
- "learning_rate": 4.998863107751711e-06,
- "loss": 0.5351958274841309,
- "mean_token_accuracy": 0.846825122833252,
- "num_tokens": 2129905.0,
- "step": 235
- },
- {
- "epoch": 0.17933130699088146,
- "grad_norm": 3.207671880722046,
- "learning_rate": 4.99879907496704e-06,
- "loss": 0.6209091544151306,
- "mean_token_accuracy": 0.789960503578186,
- "num_tokens": 2135027.0,
- "step": 236
- },
- {
- "epoch": 0.18009118541033434,
- "grad_norm": 2.018953800201416,
- "learning_rate": 4.998733288428987e-06,
- "loss": 0.601510763168335,
- "mean_token_accuracy": 0.8136930465698242,
- "num_tokens": 2147016.0,
- "step": 237
- },
- {
- "epoch": 0.18085106382978725,
- "grad_norm": 2.437281847000122,
- "learning_rate": 4.998665748183727e-06,
- "loss": 0.5813639163970947,
- "mean_token_accuracy": 0.8116716146469116,
- "num_tokens": 2155386.0,
- "step": 238
- },
- {
- "epoch": 0.18161094224924013,
- "grad_norm": 1.5708180665969849,
- "learning_rate": 4.998596454278661e-06,
- "loss": 0.5252395272254944,
- "mean_token_accuracy": 0.8193864822387695,
- "num_tokens": 2170295.0,
- "step": 239
- },
- {
- "epoch": 0.182370820668693,
- "grad_norm": 1.9921495914459229,
- "learning_rate": 4.998525406762422e-06,
- "loss": 0.5335029363632202,
- "mean_token_accuracy": 0.8120872974395752,
- "num_tokens": 2180012.0,
- "step": 240
- },
- {
- "epoch": 0.1831306990881459,
- "grad_norm": 2.6562681198120117,
- "learning_rate": 4.998452605684874e-06,
- "loss": 0.48021435737609863,
- "mean_token_accuracy": 0.8388714790344238,
- "num_tokens": 2185607.0,
- "step": 241
- },
- {
- "epoch": 0.1838905775075988,
- "grad_norm": 2.2535853385925293,
- "learning_rate": 4.998378051097111e-06,
- "loss": 0.5747300386428833,
- "mean_token_accuracy": 0.8004639148712158,
- "num_tokens": 2194105.0,
- "step": 242
- },
- {
- "epoch": 0.18465045592705168,
- "grad_norm": 1.6151788234710693,
- "learning_rate": 4.998301743051459e-06,
- "loss": 0.6190565824508667,
- "mean_token_accuracy": 0.7816627621650696,
- "num_tokens": 2210629.0,
- "step": 243
- },
- {
- "epoch": 0.18541033434650456,
- "grad_norm": 2.1088173389434814,
- "learning_rate": 4.9982236816014735e-06,
- "loss": 0.4715560972690582,
- "mean_token_accuracy": 0.8485721349716187,
- "num_tokens": 2218958.0,
- "step": 244
- },
- {
- "epoch": 0.18617021276595744,
- "grad_norm": 2.6168735027313232,
- "learning_rate": 4.998143866801941e-06,
- "loss": 0.6077103018760681,
- "mean_token_accuracy": 0.8057924509048462,
- "num_tokens": 2226368.0,
- "step": 245
- },
- {
- "epoch": 0.18693009118541035,
- "grad_norm": 2.5988616943359375,
- "learning_rate": 4.99806229870888e-06,
- "loss": 0.5021637678146362,
- "mean_token_accuracy": 0.8361666202545166,
- "num_tokens": 2232485.0,
- "step": 246
- },
- {
- "epoch": 0.18768996960486323,
- "grad_norm": 2.015887498855591,
- "learning_rate": 4.9979789773795365e-06,
- "loss": 0.4309737980365753,
- "mean_token_accuracy": 0.8508044481277466,
- "num_tokens": 2240819.0,
- "step": 247
- },
- {
- "epoch": 0.1884498480243161,
- "grad_norm": 2.3115265369415283,
- "learning_rate": 4.997893902872389e-06,
- "loss": 0.5776500701904297,
- "mean_token_accuracy": 0.8079549074172974,
- "num_tokens": 2249460.0,
- "step": 248
- },
- {
- "epoch": 0.189209726443769,
- "grad_norm": 1.7387021780014038,
- "learning_rate": 4.997807075247147e-06,
- "loss": 0.430944561958313,
- "mean_token_accuracy": 0.8483544588088989,
- "num_tokens": 2259124.0,
- "step": 249
- },
- {
- "epoch": 0.1899696048632219,
- "grad_norm": 1.6378381252288818,
- "learning_rate": 4.997718494564747e-06,
- "loss": 0.4123363792896271,
- "mean_token_accuracy": 0.8557409644126892,
- "num_tokens": 2269899.0,
- "step": 250
- },
- {
- "epoch": 0.19072948328267478,
- "grad_norm": 1.336282730102539,
- "learning_rate": 4.997628160887361e-06,
- "loss": 0.502329409122467,
- "mean_token_accuracy": 0.8186938166618347,
- "num_tokens": 2292821.0,
- "step": 251
- },
- {
- "epoch": 0.19148936170212766,
- "grad_norm": 3.3335583209991455,
- "learning_rate": 4.997536074278388e-06,
- "loss": 0.584446907043457,
- "mean_token_accuracy": 0.8062717318534851,
- "num_tokens": 2297175.0,
- "step": 252
- },
- {
- "epoch": 0.19224924012158054,
- "grad_norm": 2.246727228164673,
- "learning_rate": 4.9974422348024565e-06,
- "loss": 0.5683060884475708,
- "mean_token_accuracy": 0.8193703293800354,
- "num_tokens": 2305456.0,
- "step": 253
- },
- {
- "epoch": 0.19300911854103345,
- "grad_norm": 2.3520865440368652,
- "learning_rate": 4.997346642525429e-06,
- "loss": 0.4724946618080139,
- "mean_token_accuracy": 0.8426719307899475,
- "num_tokens": 2312241.0,
- "step": 254
- },
- {
- "epoch": 0.19376899696048633,
- "grad_norm": 2.7115702629089355,
- "learning_rate": 4.9972492975143936e-06,
- "loss": 0.5019032955169678,
- "mean_token_accuracy": 0.8253573179244995,
- "num_tokens": 2318094.0,
- "step": 255
- },
- {
- "epoch": 0.1945288753799392,
- "grad_norm": 1.705528974533081,
- "learning_rate": 4.997150199837671e-06,
- "loss": 0.45588475465774536,
- "mean_token_accuracy": 0.836666464805603,
- "num_tokens": 2329025.0,
- "step": 256
- },
- {
- "epoch": 0.1952887537993921,
- "grad_norm": 2.161400318145752,
- "learning_rate": 4.997049349564814e-06,
- "loss": 0.5170183777809143,
- "mean_token_accuracy": 0.8287534117698669,
- "num_tokens": 2337448.0,
- "step": 257
- },
- {
- "epoch": 0.196048632218845,
- "grad_norm": 2.629669189453125,
- "learning_rate": 4.996946746766602e-06,
- "loss": 0.44650501012802124,
- "mean_token_accuracy": 0.850114107131958,
- "num_tokens": 2343207.0,
- "step": 258
- },
- {
- "epoch": 0.19680851063829788,
- "grad_norm": 1.6735503673553467,
- "learning_rate": 4.996842391515045e-06,
- "loss": 0.5247820019721985,
- "mean_token_accuracy": 0.8285071849822998,
- "num_tokens": 2356801.0,
- "step": 259
- },
- {
- "epoch": 0.19756838905775076,
- "grad_norm": 1.2753115892410278,
- "learning_rate": 4.996736283883382e-06,
- "loss": 0.41870927810668945,
- "mean_token_accuracy": 0.8448047637939453,
- "num_tokens": 2377306.0,
- "step": 260
- },
- {
- "epoch": 0.19832826747720364,
- "grad_norm": 2.6947314739227295,
- "learning_rate": 4.9966284239460875e-06,
- "loss": 0.5059205889701843,
- "mean_token_accuracy": 0.8430814743041992,
- "num_tokens": 2383352.0,
- "step": 261
- },
- {
- "epoch": 0.19908814589665655,
- "grad_norm": 2.0509963035583496,
- "learning_rate": 4.996518811778858e-06,
- "loss": 0.4565388560295105,
- "mean_token_accuracy": 0.8453130722045898,
- "num_tokens": 2391149.0,
- "step": 262
- },
- {
- "epoch": 0.19984802431610943,
- "grad_norm": 2.1856348514556885,
- "learning_rate": 4.996407447458626e-06,
- "loss": 0.531380832195282,
- "mean_token_accuracy": 0.8387004137039185,
- "num_tokens": 2399875.0,
- "step": 263
- },
- {
- "epoch": 0.2006079027355623,
- "grad_norm": 2.7348573207855225,
- "learning_rate": 4.99629433106355e-06,
- "loss": 0.5242817401885986,
- "mean_token_accuracy": 0.8177423477172852,
- "num_tokens": 2406586.0,
- "step": 264
- },
- {
- "epoch": 0.2013677811550152,
- "grad_norm": 1.76587975025177,
- "learning_rate": 4.99617946267302e-06,
- "loss": 0.49298471212387085,
- "mean_token_accuracy": 0.8271149396896362,
- "num_tokens": 2418683.0,
- "step": 265
- },
- {
- "epoch": 0.20212765957446807,
- "grad_norm": 2.8129730224609375,
- "learning_rate": 4.996062842367655e-06,
- "loss": 0.46420302987098694,
- "mean_token_accuracy": 0.8453244566917419,
- "num_tokens": 2422929.0,
- "step": 266
- },
- {
- "epoch": 0.20288753799392098,
- "grad_norm": 2.575744152069092,
- "learning_rate": 4.9959444702293025e-06,
- "loss": 0.43208545446395874,
- "mean_token_accuracy": 0.8494843244552612,
- "num_tokens": 2429567.0,
- "step": 267
- },
- {
- "epoch": 0.20364741641337386,
- "grad_norm": 2.7586750984191895,
- "learning_rate": 4.995824346341041e-06,
- "loss": 0.4390473961830139,
- "mean_token_accuracy": 0.8348895311355591,
- "num_tokens": 2434700.0,
- "step": 268
- },
- {
- "epoch": 0.20440729483282674,
- "grad_norm": 1.972145438194275,
- "learning_rate": 4.99570247078718e-06,
- "loss": 0.6219544410705566,
- "mean_token_accuracy": 0.7939999103546143,
- "num_tokens": 2447007.0,
- "step": 269
- },
- {
- "epoch": 0.20516717325227962,
- "grad_norm": 2.2963485717773438,
- "learning_rate": 4.995578843653255e-06,
- "loss": 0.5008970499038696,
- "mean_token_accuracy": 0.8255308866500854,
- "num_tokens": 2453936.0,
- "step": 270
- },
- {
- "epoch": 0.20592705167173253,
- "grad_norm": 1.8897721767425537,
- "learning_rate": 4.995453465026033e-06,
- "loss": 0.5436089038848877,
- "mean_token_accuracy": 0.819086492061615,
- "num_tokens": 2464494.0,
- "step": 271
- },
- {
- "epoch": 0.2066869300911854,
- "grad_norm": 2.319728374481201,
- "learning_rate": 4.995326334993508e-06,
- "loss": 0.5136368870735168,
- "mean_token_accuracy": 0.820817232131958,
- "num_tokens": 2470938.0,
- "step": 272
- },
- {
- "epoch": 0.2074468085106383,
- "grad_norm": 2.230414390563965,
- "learning_rate": 4.9951974536449055e-06,
- "loss": 0.5272846817970276,
- "mean_token_accuracy": 0.8203279972076416,
- "num_tokens": 2478629.0,
- "step": 273
- },
- {
- "epoch": 0.20820668693009117,
- "grad_norm": 3.401937484741211,
- "learning_rate": 4.9950668210706795e-06,
- "loss": 0.4389592111110687,
- "mean_token_accuracy": 0.8647899031639099,
- "num_tokens": 2482193.0,
- "step": 274
- },
- {
- "epoch": 0.20896656534954408,
- "grad_norm": 2.1278507709503174,
- "learning_rate": 4.994934437362513e-06,
- "loss": 0.598863422870636,
- "mean_token_accuracy": 0.7945119738578796,
- "num_tokens": 2492465.0,
- "step": 275
- },
- {
- "epoch": 0.20972644376899696,
- "grad_norm": 1.9259960651397705,
- "learning_rate": 4.994800302613318e-06,
- "loss": 0.49520939588546753,
- "mean_token_accuracy": 0.8371536135673523,
- "num_tokens": 2500825.0,
- "step": 276
- },
- {
- "epoch": 0.21048632218844984,
- "grad_norm": 2.346418857574463,
- "learning_rate": 4.994664416917236e-06,
- "loss": 0.5412614345550537,
- "mean_token_accuracy": 0.810661792755127,
- "num_tokens": 2509513.0,
- "step": 277
- },
- {
- "epoch": 0.21124620060790272,
- "grad_norm": 1.3092039823532104,
- "learning_rate": 4.994526780369636e-06,
- "loss": 0.46305379271507263,
- "mean_token_accuracy": 0.8358527421951294,
- "num_tokens": 2531405.0,
- "step": 278
- },
- {
- "epoch": 0.21200607902735563,
- "grad_norm": 2.924611806869507,
- "learning_rate": 4.9943873930671175e-06,
- "loss": 0.6134544610977173,
- "mean_token_accuracy": 0.7947378754615784,
- "num_tokens": 2536744.0,
- "step": 279
- },
- {
- "epoch": 0.2127659574468085,
- "grad_norm": 2.8290598392486572,
- "learning_rate": 4.994246255107506e-06,
- "loss": 0.465520441532135,
- "mean_token_accuracy": 0.8440108299255371,
- "num_tokens": 2541184.0,
- "step": 280
- },
- {
- "epoch": 0.2135258358662614,
- "grad_norm": 3.8081259727478027,
- "learning_rate": 4.994103366589859e-06,
- "loss": 0.43394139409065247,
- "mean_token_accuracy": 0.8579148054122925,
- "num_tokens": 2545395.0,
- "step": 281
- },
- {
- "epoch": 0.21428571428571427,
- "grad_norm": 1.7994529008865356,
- "learning_rate": 4.993958727614462e-06,
- "loss": 0.5076484680175781,
- "mean_token_accuracy": 0.8270803093910217,
- "num_tokens": 2556541.0,
- "step": 282
- },
- {
- "epoch": 0.21504559270516718,
- "grad_norm": 2.5582659244537354,
- "learning_rate": 4.993812338282826e-06,
- "loss": 0.4453684389591217,
- "mean_token_accuracy": 0.8488293886184692,
- "num_tokens": 2562949.0,
- "step": 283
- },
- {
- "epoch": 0.21580547112462006,
- "grad_norm": 1.6448938846588135,
- "learning_rate": 4.993664198697694e-06,
- "loss": 0.461971640586853,
- "mean_token_accuracy": 0.824763298034668,
- "num_tokens": 2576407.0,
- "step": 284
- },
- {
- "epoch": 0.21656534954407294,
- "grad_norm": 2.1264469623565674,
- "learning_rate": 4.993514308963037e-06,
- "loss": 0.6241602897644043,
- "mean_token_accuracy": 0.7916014790534973,
- "num_tokens": 2585695.0,
- "step": 285
- },
- {
- "epoch": 0.21732522796352582,
- "grad_norm": 3.629991292953491,
- "learning_rate": 4.993362669184051e-06,
- "loss": 0.610355019569397,
- "mean_token_accuracy": 0.7847568988800049,
- "num_tokens": 2589778.0,
- "step": 286
- },
- {
- "epoch": 0.21808510638297873,
- "grad_norm": 1.9070756435394287,
- "learning_rate": 4.993209279467164e-06,
- "loss": 0.5513623952865601,
- "mean_token_accuracy": 0.7911607027053833,
- "num_tokens": 2600920.0,
- "step": 287
- },
- {
- "epoch": 0.2188449848024316,
- "grad_norm": 1.761062741279602,
- "learning_rate": 4.993054139920031e-06,
- "loss": 0.4579957127571106,
- "mean_token_accuracy": 0.8189530372619629,
- "num_tokens": 2611856.0,
- "step": 288
- },
- {
- "epoch": 0.2196048632218845,
- "grad_norm": 1.7264713048934937,
- "learning_rate": 4.992897250651535e-06,
- "loss": 0.5871305465698242,
- "mean_token_accuracy": 0.7918527126312256,
- "num_tokens": 2624730.0,
- "step": 289
- },
- {
- "epoch": 0.22036474164133737,
- "grad_norm": 1.7455977201461792,
- "learning_rate": 4.992738611771787e-06,
- "loss": 0.5475119948387146,
- "mean_token_accuracy": 0.8226917386054993,
- "num_tokens": 2635705.0,
- "step": 290
- },
- {
- "epoch": 0.22112462006079028,
- "grad_norm": 2.095095157623291,
- "learning_rate": 4.992578223392124e-06,
- "loss": 0.5952225923538208,
- "mean_token_accuracy": 0.8078469038009644,
- "num_tokens": 2643954.0,
- "step": 291
- },
- {
- "epoch": 0.22188449848024316,
- "grad_norm": 2.994664192199707,
- "learning_rate": 4.992416085625115e-06,
- "loss": 0.5432442426681519,
- "mean_token_accuracy": 0.8329008221626282,
- "num_tokens": 2648800.0,
- "step": 292
- },
- {
- "epoch": 0.22264437689969604,
- "grad_norm": 2.796790361404419,
- "learning_rate": 4.992252198584554e-06,
- "loss": 0.5168961882591248,
- "mean_token_accuracy": 0.8393474817276001,
- "num_tokens": 2653546.0,
- "step": 293
- },
- {
- "epoch": 0.22340425531914893,
- "grad_norm": 1.8610522747039795,
- "learning_rate": 4.992086562385462e-06,
- "loss": 0.5728024244308472,
- "mean_token_accuracy": 0.797406792640686,
- "num_tokens": 2667483.0,
- "step": 294
- },
- {
- "epoch": 0.22416413373860183,
- "grad_norm": 1.695472002029419,
- "learning_rate": 4.9919191771440905e-06,
- "loss": 0.5460028648376465,
- "mean_token_accuracy": 0.8123016357421875,
- "num_tokens": 2683574.0,
- "step": 295
- },
- {
- "epoch": 0.22492401215805471,
- "grad_norm": 2.8627376556396484,
- "learning_rate": 4.9917500429779165e-06,
- "loss": 0.5566985011100769,
- "mean_token_accuracy": 0.815531313419342,
- "num_tokens": 2688985.0,
- "step": 296
- },
- {
- "epoch": 0.2256838905775076,
- "grad_norm": 2.73323655128479,
- "learning_rate": 4.991579160005644e-06,
- "loss": 0.48197102546691895,
- "mean_token_accuracy": 0.8471829295158386,
- "num_tokens": 2694799.0,
- "step": 297
- },
- {
- "epoch": 0.22644376899696048,
- "grad_norm": 1.8436161279678345,
- "learning_rate": 4.991406528347206e-06,
- "loss": 0.4528339207172394,
- "mean_token_accuracy": 0.8603188395500183,
- "num_tokens": 2707321.0,
- "step": 298
- },
- {
- "epoch": 0.22720364741641338,
- "grad_norm": 2.6231515407562256,
- "learning_rate": 4.9912321481237616e-06,
- "loss": 0.5916541814804077,
- "mean_token_accuracy": 0.8050242066383362,
- "num_tokens": 2714233.0,
- "step": 299
- },
- {
- "epoch": 0.22796352583586627,
- "grad_norm": 3.08776593208313,
- "learning_rate": 4.991056019457697e-06,
- "loss": 0.4860580563545227,
- "mean_token_accuracy": 0.8464088439941406,
- "num_tokens": 2718443.0,
- "step": 300
- },
- {
- "epoch": 0.22872340425531915,
- "grad_norm": 2.2537803649902344,
- "learning_rate": 4.990878142472628e-06,
- "loss": 0.5158311128616333,
- "mean_token_accuracy": 0.824694812297821,
- "num_tokens": 2726158.0,
- "step": 301
- },
- {
- "epoch": 0.22948328267477203,
- "grad_norm": 2.1122705936431885,
- "learning_rate": 4.990698517293394e-06,
- "loss": 0.495265394449234,
- "mean_token_accuracy": 0.8343238830566406,
- "num_tokens": 2735022.0,
- "step": 302
- },
- {
- "epoch": 0.23024316109422494,
- "grad_norm": 3.5503528118133545,
- "learning_rate": 4.9905171440460645e-06,
- "loss": 0.46063232421875,
- "mean_token_accuracy": 0.8420047760009766,
- "num_tokens": 2738550.0,
- "step": 303
- },
- {
- "epoch": 0.23100303951367782,
- "grad_norm": 3.9858486652374268,
- "learning_rate": 4.990334022857932e-06,
- "loss": 0.5832710266113281,
- "mean_token_accuracy": 0.8144199848175049,
- "num_tokens": 2741720.0,
- "step": 304
- },
- {
- "epoch": 0.2317629179331307,
- "grad_norm": 2.407231330871582,
- "learning_rate": 4.990149153857519e-06,
- "loss": 0.4692630171775818,
- "mean_token_accuracy": 0.8429223299026489,
- "num_tokens": 2748693.0,
- "step": 305
- },
- {
- "epoch": 0.23252279635258358,
- "grad_norm": 1.6996397972106934,
- "learning_rate": 4.989962537174573e-06,
- "loss": 0.49143946170806885,
- "mean_token_accuracy": 0.8340128064155579,
- "num_tokens": 2761254.0,
- "step": 306
- },
- {
- "epoch": 0.23328267477203649,
- "grad_norm": 3.746432065963745,
- "learning_rate": 4.989774172940071e-06,
- "loss": 0.6282026767730713,
- "mean_token_accuracy": 0.775698184967041,
- "num_tokens": 2765115.0,
- "step": 307
- },
- {
- "epoch": 0.23404255319148937,
- "grad_norm": 2.212872266769409,
- "learning_rate": 4.989584061286211e-06,
- "loss": 0.5193763971328735,
- "mean_token_accuracy": 0.8168246746063232,
- "num_tokens": 2772345.0,
- "step": 308
- },
- {
- "epoch": 0.23480243161094225,
- "grad_norm": 1.752297282218933,
- "learning_rate": 4.989392202346423e-06,
- "loss": 0.4437984824180603,
- "mean_token_accuracy": 0.8451256155967712,
- "num_tokens": 2783072.0,
- "step": 309
- },
- {
- "epoch": 0.23556231003039513,
- "grad_norm": 2.386019706726074,
- "learning_rate": 4.989198596255361e-06,
- "loss": 0.4090752899646759,
- "mean_token_accuracy": 0.8480085134506226,
- "num_tokens": 2788757.0,
- "step": 310
- },
- {
- "epoch": 0.23632218844984804,
- "grad_norm": 3.9981489181518555,
- "learning_rate": 4.989003243148904e-06,
- "loss": 0.5149132013320923,
- "mean_token_accuracy": 0.8179056644439697,
- "num_tokens": 2792096.0,
- "step": 311
- },
- {
- "epoch": 0.23708206686930092,
- "grad_norm": 1.8723100423812866,
- "learning_rate": 4.988806143164159e-06,
- "loss": 0.4531487822532654,
- "mean_token_accuracy": 0.8400167226791382,
- "num_tokens": 2802210.0,
- "step": 312
- },
- {
- "epoch": 0.2378419452887538,
- "grad_norm": 2.3415136337280273,
- "learning_rate": 4.988607296439459e-06,
- "loss": 0.5974439978599548,
- "mean_token_accuracy": 0.8035976886749268,
- "num_tokens": 2810088.0,
- "step": 313
- },
- {
- "epoch": 0.23860182370820668,
- "grad_norm": 1.5317577123641968,
- "learning_rate": 4.98840670311436e-06,
- "loss": 0.49247145652770996,
- "mean_token_accuracy": 0.8292540311813354,
- "num_tokens": 2824005.0,
- "step": 314
- },
- {
- "epoch": 0.2393617021276596,
- "grad_norm": 2.170772075653076,
- "learning_rate": 4.988204363329648e-06,
- "loss": 0.6359974145889282,
- "mean_token_accuracy": 0.7785564661026001,
- "num_tokens": 2834680.0,
- "step": 315
- },
- {
- "epoch": 0.24012158054711247,
- "grad_norm": 3.2655932903289795,
- "learning_rate": 4.988000277227334e-06,
- "loss": 0.5080196857452393,
- "mean_token_accuracy": 0.8295877575874329,
- "num_tokens": 2838735.0,
- "step": 316
- },
- {
- "epoch": 0.24088145896656535,
- "grad_norm": 3.406589984893799,
- "learning_rate": 4.987794444950651e-06,
- "loss": 0.3939085006713867,
- "mean_token_accuracy": 0.8700719475746155,
- "num_tokens": 2842127.0,
- "step": 317
- },
- {
- "epoch": 0.24164133738601823,
- "grad_norm": 1.8211106061935425,
- "learning_rate": 4.987586866644061e-06,
- "loss": 0.5270540118217468,
- "mean_token_accuracy": 0.826683521270752,
- "num_tokens": 2853656.0,
- "step": 318
- },
- {
- "epoch": 0.24240121580547114,
- "grad_norm": 1.8429969549179077,
- "learning_rate": 4.9873775424532515e-06,
- "loss": 0.4705049991607666,
- "mean_token_accuracy": 0.8355701565742493,
- "num_tokens": 2863513.0,
- "step": 319
- },
- {
- "epoch": 0.24316109422492402,
- "grad_norm": 2.2425320148468018,
- "learning_rate": 4.9871664725251314e-06,
- "loss": 0.485736608505249,
- "mean_token_accuracy": 0.835182785987854,
- "num_tokens": 2871556.0,
- "step": 320
- },
- {
- "epoch": 0.2439209726443769,
- "grad_norm": 1.6202056407928467,
- "learning_rate": 4.986953657007841e-06,
- "loss": 0.4437887370586395,
- "mean_token_accuracy": 0.8282591700553894,
- "num_tokens": 2884335.0,
- "step": 321
- },
- {
- "epoch": 0.24468085106382978,
- "grad_norm": 1.1027268171310425,
- "learning_rate": 4.98673909605074e-06,
- "loss": 0.3770800828933716,
- "mean_token_accuracy": 0.8325437307357788,
- "num_tokens": 2904286.0,
- "step": 322
- },
- {
- "epoch": 0.2454407294832827,
- "grad_norm": 2.3239076137542725,
- "learning_rate": 4.986522789804417e-06,
- "loss": 0.5387254953384399,
- "mean_token_accuracy": 0.806242823600769,
- "num_tokens": 2910975.0,
- "step": 323
- },
- {
- "epoch": 0.24620060790273557,
- "grad_norm": 2.243482828140259,
- "learning_rate": 4.986304738420684e-06,
- "loss": 0.4396553039550781,
- "mean_token_accuracy": 0.8561904430389404,
- "num_tokens": 2917087.0,
- "step": 324
- },
- {
- "epoch": 0.24696048632218845,
- "grad_norm": 2.537264347076416,
- "learning_rate": 4.986084942052577e-06,
- "loss": 0.395110160112381,
- "mean_token_accuracy": 0.8636915683746338,
- "num_tokens": 2921887.0,
- "step": 325
- },
- {
- "epoch": 0.24772036474164133,
- "grad_norm": 2.319399118423462,
- "learning_rate": 4.9858634008543574e-06,
- "loss": 0.581517219543457,
- "mean_token_accuracy": 0.8157487511634827,
- "num_tokens": 2928996.0,
- "step": 326
- },
- {
- "epoch": 0.24848024316109424,
- "grad_norm": 1.9787474870681763,
- "learning_rate": 4.985640114981513e-06,
- "loss": 0.5084106922149658,
- "mean_token_accuracy": 0.835221529006958,
- "num_tokens": 2940302.0,
- "step": 327
- },
- {
- "epoch": 0.24924012158054712,
- "grad_norm": 2.4783265590667725,
- "learning_rate": 4.985415084590752e-06,
- "loss": 0.6062222719192505,
- "mean_token_accuracy": 0.7885516285896301,
- "num_tokens": 2946386.0,
- "step": 328
- },
- {
- "epoch": 0.25,
- "grad_norm": 2.4081411361694336,
- "learning_rate": 4.985188309840012e-06,
- "loss": 0.5079880356788635,
- "mean_token_accuracy": 0.8313904404640198,
- "num_tokens": 2952323.0,
- "step": 329
- },
- {
- "epoch": 0.2507598784194529,
- "grad_norm": 2.64993953704834,
- "learning_rate": 4.984959790888451e-06,
- "loss": 0.5461447834968567,
- "mean_token_accuracy": 0.8125468492507935,
- "num_tokens": 2958119.0,
- "step": 330
- },
- {
- "epoch": 0.25151975683890576,
- "grad_norm": 2.549734115600586,
- "learning_rate": 4.984729527896451e-06,
- "loss": 0.5998573303222656,
- "mean_token_accuracy": 0.8076666593551636,
- "num_tokens": 2964947.0,
- "step": 331
- },
- {
- "epoch": 0.25227963525835867,
- "grad_norm": 3.2185161113739014,
- "learning_rate": 4.984497521025622e-06,
- "loss": 0.4232945442199707,
- "mean_token_accuracy": 0.8543803095817566,
- "num_tokens": 2968598.0,
- "step": 332
- },
- {
- "epoch": 0.2530395136778115,
- "grad_norm": 2.588994264602661,
- "learning_rate": 4.984263770438793e-06,
- "loss": 0.460967481136322,
- "mean_token_accuracy": 0.8416207432746887,
- "num_tokens": 2974510.0,
- "step": 333
- },
- {
- "epoch": 0.25379939209726443,
- "grad_norm": 2.1373162269592285,
- "learning_rate": 4.984028276300021e-06,
- "loss": 0.49382102489471436,
- "mean_token_accuracy": 0.8388048410415649,
- "num_tokens": 2981632.0,
- "step": 334
- },
- {
- "epoch": 0.25455927051671734,
- "grad_norm": 2.2524826526641846,
- "learning_rate": 4.983791038774585e-06,
- "loss": 0.4947671890258789,
- "mean_token_accuracy": 0.8066365122795105,
- "num_tokens": 2988736.0,
- "step": 335
- },
- {
- "epoch": 0.2553191489361702,
- "grad_norm": 1.7244199514389038,
- "learning_rate": 4.983552058028985e-06,
- "loss": 0.48096776008605957,
- "mean_token_accuracy": 0.830735445022583,
- "num_tokens": 3003576.0,
- "step": 336
- },
- {
- "epoch": 0.2560790273556231,
- "grad_norm": 3.0628933906555176,
- "learning_rate": 4.9833113342309495e-06,
- "loss": 0.6027032136917114,
- "mean_token_accuracy": 0.8008694648742676,
- "num_tokens": 3009549.0,
- "step": 337
- },
- {
- "epoch": 0.256838905775076,
- "grad_norm": 2.438674211502075,
- "learning_rate": 4.983068867549427e-06,
- "loss": 0.517090916633606,
- "mean_token_accuracy": 0.827893853187561,
- "num_tokens": 3015236.0,
- "step": 338
- },
- {
- "epoch": 0.25759878419452886,
- "grad_norm": 2.131535053253174,
- "learning_rate": 4.982824658154589e-06,
- "loss": 0.6656812429428101,
- "mean_token_accuracy": 0.7772425413131714,
- "num_tokens": 3028142.0,
- "step": 339
- },
- {
- "epoch": 0.25835866261398177,
- "grad_norm": 2.3206584453582764,
- "learning_rate": 4.9825787062178315e-06,
- "loss": 0.5757625699043274,
- "mean_token_accuracy": 0.8073873519897461,
- "num_tokens": 3040996.0,
- "step": 340
- },
- {
- "epoch": 0.2591185410334346,
- "grad_norm": 1.3905521631240845,
- "learning_rate": 4.982331011911774e-06,
- "loss": 0.4193805456161499,
- "mean_token_accuracy": 0.8399466872215271,
- "num_tokens": 3061931.0,
- "step": 341
- },
- {
- "epoch": 0.25987841945288753,
- "grad_norm": 2.184173345565796,
- "learning_rate": 4.982081575410256e-06,
- "loss": 0.4751223921775818,
- "mean_token_accuracy": 0.8409271240234375,
- "num_tokens": 3069081.0,
- "step": 342
- },
- {
- "epoch": 0.26063829787234044,
- "grad_norm": 3.538764238357544,
- "learning_rate": 4.9818303968883445e-06,
- "loss": 0.8119601011276245,
- "mean_token_accuracy": 0.7442739009857178,
- "num_tokens": 3073628.0,
- "step": 343
- },
- {
- "epoch": 0.2613981762917933,
- "grad_norm": 1.8063762187957764,
- "learning_rate": 4.981577476522323e-06,
- "loss": 0.5615730881690979,
- "mean_token_accuracy": 0.8207751512527466,
- "num_tokens": 3086596.0,
- "step": 344
- },
- {
- "epoch": 0.2621580547112462,
- "grad_norm": 2.4346961975097656,
- "learning_rate": 4.981322814489703e-06,
- "loss": 0.5266709327697754,
- "mean_token_accuracy": 0.8211277723312378,
- "num_tokens": 3092631.0,
- "step": 345
- },
- {
- "epoch": 0.2629179331306991,
- "grad_norm": 1.91289484500885,
- "learning_rate": 4.981066410969215e-06,
- "loss": 0.5047177672386169,
- "mean_token_accuracy": 0.8356877565383911,
- "num_tokens": 3101102.0,
- "step": 346
- },
- {
- "epoch": 0.26367781155015196,
- "grad_norm": 2.1495707035064697,
- "learning_rate": 4.980808266140813e-06,
- "loss": 0.47876280546188354,
- "mean_token_accuracy": 0.8364313244819641,
- "num_tokens": 3107998.0,
- "step": 347
- },
- {
- "epoch": 0.26443768996960487,
- "grad_norm": 2.5961992740631104,
- "learning_rate": 4.9805483801856744e-06,
- "loss": 0.5512958765029907,
- "mean_token_accuracy": 0.8181467652320862,
- "num_tokens": 3113848.0,
- "step": 348
- },
- {
- "epoch": 0.2651975683890577,
- "grad_norm": 3.2828900814056396,
- "learning_rate": 4.980286753286196e-06,
- "loss": 0.4217945635318756,
- "mean_token_accuracy": 0.8617103099822998,
- "num_tokens": 3117652.0,
- "step": 349
- },
- {
- "epoch": 0.26595744680851063,
- "grad_norm": 1.425554871559143,
- "learning_rate": 4.980023385625996e-06,
- "loss": 0.4042487144470215,
- "mean_token_accuracy": 0.8492785692214966,
- "num_tokens": 3132336.0,
- "step": 350
- },
- {
- "epoch": 0.26671732522796354,
- "grad_norm": 2.933504104614258,
- "learning_rate": 4.979758277389919e-06,
- "loss": 0.5406704545021057,
- "mean_token_accuracy": 0.8035423755645752,
- "num_tokens": 3137544.0,
- "step": 351
- },
- {
- "epoch": 0.2674772036474164,
- "grad_norm": 1.9958966970443726,
- "learning_rate": 4.9794914287640264e-06,
- "loss": 0.5857555270195007,
- "mean_token_accuracy": 0.7965140342712402,
- "num_tokens": 3149705.0,
- "step": 352
- },
- {
- "epoch": 0.2682370820668693,
- "grad_norm": 2.467694044113159,
- "learning_rate": 4.979222839935602e-06,
- "loss": 0.6404043436050415,
- "mean_token_accuracy": 0.7823755741119385,
- "num_tokens": 3158353.0,
- "step": 353
- },
- {
- "epoch": 0.2689969604863222,
- "grad_norm": 2.0102720260620117,
- "learning_rate": 4.9789525110931545e-06,
- "loss": 0.5681496858596802,
- "mean_token_accuracy": 0.8108169436454773,
- "num_tokens": 3167121.0,
- "step": 354
- },
- {
- "epoch": 0.26975683890577506,
- "grad_norm": 2.6017866134643555,
- "learning_rate": 4.978680442426409e-06,
- "loss": 0.6309828162193298,
- "mean_token_accuracy": 0.7742617130279541,
- "num_tokens": 3175012.0,
- "step": 355
- },
- {
- "epoch": 0.270516717325228,
- "grad_norm": 1.8799268007278442,
- "learning_rate": 4.978406634126315e-06,
- "loss": 0.524029016494751,
- "mean_token_accuracy": 0.8317689895629883,
- "num_tokens": 3185331.0,
- "step": 356
- },
- {
- "epoch": 0.2712765957446808,
- "grad_norm": 1.508332371711731,
- "learning_rate": 4.978131086385041e-06,
- "loss": 0.46656402945518494,
- "mean_token_accuracy": 0.8339117765426636,
- "num_tokens": 3198813.0,
- "step": 357
- },
- {
- "epoch": 0.27203647416413373,
- "grad_norm": 3.595707654953003,
- "learning_rate": 4.977853799395976e-06,
- "loss": 0.5101234912872314,
- "mean_token_accuracy": 0.8251723051071167,
- "num_tokens": 3206557.0,
- "step": 358
- },
- {
- "epoch": 0.27279635258358664,
- "grad_norm": 3.5317916870117188,
- "learning_rate": 4.977574773353732e-06,
- "loss": 0.5684665441513062,
- "mean_token_accuracy": 0.8124493360519409,
- "num_tokens": 3210912.0,
- "step": 359
- },
- {
- "epoch": 0.2735562310030395,
- "grad_norm": 2.8606204986572266,
- "learning_rate": 4.97729400845414e-06,
- "loss": 0.4746384620666504,
- "mean_token_accuracy": 0.8195606470108032,
- "num_tokens": 3215365.0,
- "step": 360
- },
- {
- "epoch": 0.2743161094224924,
- "grad_norm": 1.8214033842086792,
- "learning_rate": 4.977011504894253e-06,
- "loss": 0.4842769503593445,
- "mean_token_accuracy": 0.82928866147995,
- "num_tokens": 3224037.0,
- "step": 361
- },
- {
- "epoch": 0.2750759878419453,
- "grad_norm": 1.628746509552002,
- "learning_rate": 4.97672726287234e-06,
- "loss": 0.4397493302822113,
- "mean_token_accuracy": 0.8606528043746948,
- "num_tokens": 3235589.0,
- "step": 362
- },
- {
- "epoch": 0.27583586626139817,
- "grad_norm": 3.557973861694336,
- "learning_rate": 4.976441282587894e-06,
- "loss": 0.5732032060623169,
- "mean_token_accuracy": 0.8041545748710632,
- "num_tokens": 3239958.0,
- "step": 363
- },
- {
- "epoch": 0.2765957446808511,
- "grad_norm": 1.3467901945114136,
- "learning_rate": 4.9761535642416284e-06,
- "loss": 0.4525323510169983,
- "mean_token_accuracy": 0.8281061053276062,
- "num_tokens": 3257703.0,
- "step": 364
- },
- {
- "epoch": 0.2773556231003039,
- "grad_norm": 2.2649986743927,
- "learning_rate": 4.9758641080354745e-06,
- "loss": 0.5074734687805176,
- "mean_token_accuracy": 0.8447474241256714,
- "num_tokens": 3264334.0,
- "step": 365
- },
- {
- "epoch": 0.27811550151975684,
- "grad_norm": 2.8667566776275635,
- "learning_rate": 4.975572914172581e-06,
- "loss": 0.5759559869766235,
- "mean_token_accuracy": 0.7976793050765991,
- "num_tokens": 3269314.0,
- "step": 366
- },
- {
- "epoch": 0.27887537993920974,
- "grad_norm": 2.2514986991882324,
- "learning_rate": 4.975279982857324e-06,
- "loss": 0.5786465406417847,
- "mean_token_accuracy": 0.8058781623840332,
- "num_tokens": 3277324.0,
- "step": 367
- },
- {
- "epoch": 0.2796352583586626,
- "grad_norm": 1.3826723098754883,
- "learning_rate": 4.97498531429529e-06,
- "loss": 0.40801727771759033,
- "mean_token_accuracy": 0.8601310849189758,
- "num_tokens": 3290530.0,
- "step": 368
- },
- {
- "epoch": 0.2803951367781155,
- "grad_norm": 2.084092617034912,
- "learning_rate": 4.97468890869329e-06,
- "loss": 0.47076648473739624,
- "mean_token_accuracy": 0.8310186862945557,
- "num_tokens": 3298325.0,
- "step": 369
- },
- {
- "epoch": 0.2811550151975684,
- "grad_norm": 1.3467998504638672,
- "learning_rate": 4.974390766259353e-06,
- "loss": 0.44668465852737427,
- "mean_token_accuracy": 0.8275353908538818,
- "num_tokens": 3314302.0,
- "step": 370
- },
- {
- "epoch": 0.28191489361702127,
- "grad_norm": 2.5921075344085693,
- "learning_rate": 4.974090887202726e-06,
- "loss": 0.5343953967094421,
- "mean_token_accuracy": 0.8110706806182861,
- "num_tokens": 3320963.0,
- "step": 371
- },
- {
- "epoch": 0.2826747720364742,
- "grad_norm": 2.042781352996826,
- "learning_rate": 4.973789271733877e-06,
- "loss": 0.6293343305587769,
- "mean_token_accuracy": 0.7800243496894836,
- "num_tokens": 3332742.0,
- "step": 372
- },
- {
- "epoch": 0.28343465045592703,
- "grad_norm": 4.822193145751953,
- "learning_rate": 4.973485920064491e-06,
- "loss": 0.6256728768348694,
- "mean_token_accuracy": 0.7962433099746704,
- "num_tokens": 3335872.0,
- "step": 373
- },
- {
- "epoch": 0.28419452887537994,
- "grad_norm": 1.260988473892212,
- "learning_rate": 4.973180832407471e-06,
- "loss": 0.38731223344802856,
- "mean_token_accuracy": 0.8385066986083984,
- "num_tokens": 3351884.0,
- "step": 374
- },
- {
- "epoch": 0.28495440729483285,
- "grad_norm": 2.669966697692871,
- "learning_rate": 4.97287400897694e-06,
- "loss": 0.5594710111618042,
- "mean_token_accuracy": 0.8097212314605713,
- "num_tokens": 3358197.0,
- "step": 375
- },
- {
- "epoch": 0.2857142857142857,
- "grad_norm": 3.0344486236572266,
- "learning_rate": 4.972565449988238e-06,
- "loss": 0.34449583292007446,
- "mean_token_accuracy": 0.8813316822052002,
- "num_tokens": 3362133.0,
- "step": 376
- },
- {
- "epoch": 0.2864741641337386,
- "grad_norm": 2.562251091003418,
- "learning_rate": 4.972255155657925e-06,
- "loss": 0.5331522822380066,
- "mean_token_accuracy": 0.8212941288948059,
- "num_tokens": 3370346.0,
- "step": 377
- },
- {
- "epoch": 0.2872340425531915,
- "grad_norm": 2.7083740234375,
- "learning_rate": 4.9719431262037755e-06,
- "loss": 0.5403046011924744,
- "mean_token_accuracy": 0.8108335733413696,
- "num_tokens": 3375588.0,
- "step": 378
- },
- {
- "epoch": 0.28799392097264437,
- "grad_norm": 1.396430492401123,
- "learning_rate": 4.971629361844785e-06,
- "loss": 0.4041529893875122,
- "mean_token_accuracy": 0.8588063716888428,
- "num_tokens": 3390749.0,
- "step": 379
- },
- {
- "epoch": 0.2887537993920973,
- "grad_norm": 1.9872784614562988,
- "learning_rate": 4.971313862801166e-06,
- "loss": 0.4336993098258972,
- "mean_token_accuracy": 0.8511303663253784,
- "num_tokens": 3399064.0,
- "step": 380
- },
- {
- "epoch": 0.28951367781155013,
- "grad_norm": 1.9652575254440308,
- "learning_rate": 4.9709966292943455e-06,
- "loss": 0.4578358232975006,
- "mean_token_accuracy": 0.8229440450668335,
- "num_tokens": 3407229.0,
- "step": 381
- },
- {
- "epoch": 0.29027355623100304,
- "grad_norm": 1.6626898050308228,
- "learning_rate": 4.970677661546972e-06,
- "loss": 0.5427594184875488,
- "mean_token_accuracy": 0.815427303314209,
- "num_tokens": 3422321.0,
- "step": 382
- },
- {
- "epoch": 0.29103343465045595,
- "grad_norm": 3.5265562534332275,
- "learning_rate": 4.970356959782909e-06,
- "loss": 0.6661460995674133,
- "mean_token_accuracy": 0.7856965065002441,
- "num_tokens": 3427442.0,
- "step": 383
- },
- {
- "epoch": 0.2917933130699088,
- "grad_norm": 1.667205572128296,
- "learning_rate": 4.970034524227239e-06,
- "loss": 0.36256325244903564,
- "mean_token_accuracy": 0.8711205720901489,
- "num_tokens": 3436662.0,
- "step": 384
- },
- {
- "epoch": 0.2925531914893617,
- "grad_norm": 1.3389486074447632,
- "learning_rate": 4.969710355106256e-06,
- "loss": 0.4282698631286621,
- "mean_token_accuracy": 0.838951587677002,
- "num_tokens": 3450060.0,
- "step": 385
- },
- {
- "epoch": 0.2933130699088146,
- "grad_norm": 2.5163397789001465,
- "learning_rate": 4.969384452647477e-06,
- "loss": 0.5176984071731567,
- "mean_token_accuracy": 0.8235267996788025,
- "num_tokens": 3456990.0,
- "step": 386
- },
- {
- "epoch": 0.29407294832826747,
- "grad_norm": 1.7588495016098022,
- "learning_rate": 4.969056817079633e-06,
- "loss": 0.49710947275161743,
- "mean_token_accuracy": 0.818520724773407,
- "num_tokens": 3468098.0,
- "step": 387
- },
- {
- "epoch": 0.2948328267477204,
- "grad_norm": 2.6381046772003174,
- "learning_rate": 4.968727448632669e-06,
- "loss": 0.4425308108329773,
- "mean_token_accuracy": 0.8451643586158752,
- "num_tokens": 3472899.0,
- "step": 388
- },
- {
- "epoch": 0.29559270516717323,
- "grad_norm": 1.6345038414001465,
- "learning_rate": 4.968396347537751e-06,
- "loss": 0.4177059829235077,
- "mean_token_accuracy": 0.8498886227607727,
- "num_tokens": 3484826.0,
- "step": 389
- },
- {
- "epoch": 0.29635258358662614,
- "grad_norm": 3.0466468334198,
- "learning_rate": 4.968063514027258e-06,
- "loss": 0.4274463951587677,
- "mean_token_accuracy": 0.8387278318405151,
- "num_tokens": 3488610.0,
- "step": 390
- },
- {
- "epoch": 0.29711246200607905,
- "grad_norm": 2.6509406566619873,
- "learning_rate": 4.967728948334784e-06,
- "loss": 0.5401753783226013,
- "mean_token_accuracy": 0.8252490162849426,
- "num_tokens": 3493657.0,
- "step": 391
- },
- {
- "epoch": 0.2978723404255319,
- "grad_norm": 1.6372219324111938,
- "learning_rate": 4.967392650695141e-06,
- "loss": 0.3862472176551819,
- "mean_token_accuracy": 0.8555525541305542,
- "num_tokens": 3505588.0,
- "step": 392
- },
- {
- "epoch": 0.2986322188449848,
- "grad_norm": 2.1615452766418457,
- "learning_rate": 4.967054621344356e-06,
- "loss": 0.57850581407547,
- "mean_token_accuracy": 0.8222678899765015,
- "num_tokens": 3514396.0,
- "step": 393
- },
- {
- "epoch": 0.2993920972644377,
- "grad_norm": 1.8610916137695312,
- "learning_rate": 4.96671486051967e-06,
- "loss": 0.5440595149993896,
- "mean_token_accuracy": 0.8196715116500854,
- "num_tokens": 3523604.0,
- "step": 394
- },
- {
- "epoch": 0.30015197568389057,
- "grad_norm": 2.9585862159729004,
- "learning_rate": 4.966373368459542e-06,
- "loss": 0.6921588182449341,
- "mean_token_accuracy": 0.7816659808158875,
- "num_tokens": 3529849.0,
- "step": 395
- },
- {
- "epoch": 0.3009118541033435,
- "grad_norm": 1.9374035596847534,
- "learning_rate": 4.966030145403642e-06,
- "loss": 0.5494055151939392,
- "mean_token_accuracy": 0.8126792907714844,
- "num_tokens": 3539529.0,
- "step": 396
- },
- {
- "epoch": 0.30167173252279633,
- "grad_norm": 1.730530023574829,
- "learning_rate": 4.965685191592859e-06,
- "loss": 0.4271572232246399,
- "mean_token_accuracy": 0.8383668661117554,
- "num_tokens": 3550972.0,
- "step": 397
- },
- {
- "epoch": 0.30243161094224924,
- "grad_norm": 3.9635560512542725,
- "learning_rate": 4.9653385072692935e-06,
- "loss": 0.5576210021972656,
- "mean_token_accuracy": 0.799404501914978,
- "num_tokens": 3554147.0,
- "step": 398
- },
- {
- "epoch": 0.30319148936170215,
- "grad_norm": 2.5731968879699707,
- "learning_rate": 4.964990092676263e-06,
- "loss": 0.5478942394256592,
- "mean_token_accuracy": 0.8220961093902588,
- "num_tokens": 3559972.0,
- "step": 399
- },
- {
- "epoch": 0.303951367781155,
- "grad_norm": 2.2096588611602783,
- "learning_rate": 4.964639948058297e-06,
- "loss": 0.35461270809173584,
- "mean_token_accuracy": 0.8640927076339722,
- "num_tokens": 3565770.0,
- "step": 400
- },
- {
- "epoch": 0.3047112462006079,
- "grad_norm": 1.7874189615249634,
- "learning_rate": 4.964288073661142e-06,
- "loss": 0.38849619030952454,
- "mean_token_accuracy": 0.8443037271499634,
- "num_tokens": 3574514.0,
- "step": 401
- },
- {
- "epoch": 0.30547112462006076,
- "grad_norm": 1.5583146810531616,
- "learning_rate": 4.963934469731756e-06,
- "loss": 0.48909449577331543,
- "mean_token_accuracy": 0.8429768681526184,
- "num_tokens": 3585877.0,
- "step": 402
- },
- {
- "epoch": 0.30623100303951367,
- "grad_norm": 3.026599645614624,
- "learning_rate": 4.963579136518312e-06,
- "loss": 0.5138992071151733,
- "mean_token_accuracy": 0.8283728361129761,
- "num_tokens": 3590412.0,
- "step": 403
- },
- {
- "epoch": 0.3069908814589666,
- "grad_norm": 2.777505874633789,
- "learning_rate": 4.963222074270197e-06,
- "loss": 0.6241534948348999,
- "mean_token_accuracy": 0.8130464553833008,
- "num_tokens": 3596246.0,
- "step": 404
- },
- {
- "epoch": 0.30775075987841943,
- "grad_norm": 2.4772839546203613,
- "learning_rate": 4.962863283238011e-06,
- "loss": 0.5930814146995544,
- "mean_token_accuracy": 0.8036394715309143,
- "num_tokens": 3602878.0,
- "step": 405
- },
- {
- "epoch": 0.30851063829787234,
- "grad_norm": 1.5049982070922852,
- "learning_rate": 4.962502763673566e-06,
- "loss": 0.4903082549571991,
- "mean_token_accuracy": 0.8184912204742432,
- "num_tokens": 3617018.0,
- "step": 406
- },
- {
- "epoch": 0.30927051671732525,
- "grad_norm": 2.453155040740967,
- "learning_rate": 4.96214051582989e-06,
- "loss": 0.5138067603111267,
- "mean_token_accuracy": 0.8336835503578186,
- "num_tokens": 3624188.0,
- "step": 407
- },
- {
- "epoch": 0.3100303951367781,
- "grad_norm": 2.4038336277008057,
- "learning_rate": 4.961776539961222e-06,
- "loss": 0.5752760171890259,
- "mean_token_accuracy": 0.8054730892181396,
- "num_tokens": 3634152.0,
- "step": 408
- },
- {
- "epoch": 0.310790273556231,
- "grad_norm": 2.629068374633789,
- "learning_rate": 4.961410836323014e-06,
- "loss": 0.5580606460571289,
- "mean_token_accuracy": 0.8121089935302734,
- "num_tokens": 3639528.0,
- "step": 409
- },
- {
- "epoch": 0.31155015197568386,
- "grad_norm": 1.4245928525924683,
- "learning_rate": 4.961043405171931e-06,
- "loss": 0.5399882793426514,
- "mean_token_accuracy": 0.812280535697937,
- "num_tokens": 3655744.0,
- "step": 410
- },
- {
- "epoch": 0.3123100303951368,
- "grad_norm": 1.5236459970474243,
- "learning_rate": 4.9606742467658505e-06,
- "loss": 0.5234690308570862,
- "mean_token_accuracy": 0.8188928365707397,
- "num_tokens": 3675010.0,
- "step": 411
- },
- {
- "epoch": 0.3130699088145897,
- "grad_norm": 2.27961802482605,
- "learning_rate": 4.960303361363863e-06,
- "loss": 0.5502505898475647,
- "mean_token_accuracy": 0.8161963224411011,
- "num_tokens": 3682328.0,
- "step": 412
- },
- {
- "epoch": 0.31382978723404253,
- "grad_norm": 1.554518222808838,
- "learning_rate": 4.959930749226269e-06,
- "loss": 0.420867919921875,
- "mean_token_accuracy": 0.8499157428741455,
- "num_tokens": 3694980.0,
- "step": 413
- },
- {
- "epoch": 0.31458966565349544,
- "grad_norm": 2.609218120574951,
- "learning_rate": 4.9595564106145825e-06,
- "loss": 0.4706704318523407,
- "mean_token_accuracy": 0.8412490487098694,
- "num_tokens": 3700033.0,
- "step": 414
- },
- {
- "epoch": 0.31534954407294835,
- "grad_norm": 1.5303231477737427,
- "learning_rate": 4.959180345791528e-06,
- "loss": 0.4668654799461365,
- "mean_token_accuracy": 0.8125015497207642,
- "num_tokens": 3715012.0,
- "step": 415
- },
- {
- "epoch": 0.3161094224924012,
- "grad_norm": 1.2774665355682373,
- "learning_rate": 4.958802555021042e-06,
- "loss": 0.4339369237422943,
- "mean_token_accuracy": 0.8442851901054382,
- "num_tokens": 3733928.0,
- "step": 416
- },
- {
- "epoch": 0.3168693009118541,
- "grad_norm": 2.1240181922912598,
- "learning_rate": 4.958423038568274e-06,
- "loss": 0.4029104709625244,
- "mean_token_accuracy": 0.8627674579620361,
- "num_tokens": 3740202.0,
- "step": 417
- },
- {
- "epoch": 0.31762917933130697,
- "grad_norm": 2.00538969039917,
- "learning_rate": 4.958041796699583e-06,
- "loss": 0.5229607820510864,
- "mean_token_accuracy": 0.8282366394996643,
- "num_tokens": 3749308.0,
- "step": 418
- },
- {
- "epoch": 0.3183890577507599,
- "grad_norm": 2.6555092334747314,
- "learning_rate": 4.957658829682539e-06,
- "loss": 0.5344101190567017,
- "mean_token_accuracy": 0.8183202743530273,
- "num_tokens": 3754595.0,
- "step": 419
- },
- {
- "epoch": 0.3191489361702128,
- "grad_norm": 1.7468839883804321,
- "learning_rate": 4.9572741377859225e-06,
- "loss": 0.5667245984077454,
- "mean_token_accuracy": 0.8080123662948608,
- "num_tokens": 3765761.0,
- "step": 420
- },
- {
- "epoch": 0.31990881458966564,
- "grad_norm": 2.9612457752227783,
- "learning_rate": 4.956887721279726e-06,
- "loss": 0.5389559864997864,
- "mean_token_accuracy": 0.8019476532936096,
- "num_tokens": 3770844.0,
- "step": 421
- },
- {
- "epoch": 0.32066869300911854,
- "grad_norm": 1.842403769493103,
- "learning_rate": 4.95649958043515e-06,
- "loss": 0.38279837369918823,
- "mean_token_accuracy": 0.858866810798645,
- "num_tokens": 3778094.0,
- "step": 422
- },
- {
- "epoch": 0.32142857142857145,
- "grad_norm": 2.3108131885528564,
- "learning_rate": 4.956109715524609e-06,
- "loss": 0.5453893542289734,
- "mean_token_accuracy": 0.8085013031959534,
- "num_tokens": 3785015.0,
- "step": 423
- },
- {
- "epoch": 0.3221884498480243,
- "grad_norm": 3.0326945781707764,
- "learning_rate": 4.9557181268217225e-06,
- "loss": 0.5550523400306702,
- "mean_token_accuracy": 0.8125876188278198,
- "num_tokens": 3789830.0,
- "step": 424
- },
- {
- "epoch": 0.3229483282674772,
- "grad_norm": 1.8851977586746216,
- "learning_rate": 4.955324814601324e-06,
- "loss": 0.4902324974536896,
- "mean_token_accuracy": 0.8205406665802002,
- "num_tokens": 3799862.0,
- "step": 425
- },
- {
- "epoch": 0.32370820668693007,
- "grad_norm": 2.6018171310424805,
- "learning_rate": 4.954929779139455e-06,
- "loss": 0.5920133590698242,
- "mean_token_accuracy": 0.8340690732002258,
- "num_tokens": 3806617.0,
- "step": 426
- },
- {
- "epoch": 0.324468085106383,
- "grad_norm": 2.4283878803253174,
- "learning_rate": 4.954533020713367e-06,
- "loss": 0.5305854082107544,
- "mean_token_accuracy": 0.8137468099594116,
- "num_tokens": 3813843.0,
- "step": 427
- },
- {
- "epoch": 0.3252279635258359,
- "grad_norm": 2.667978525161743,
- "learning_rate": 4.954134539601519e-06,
- "loss": 0.5333638787269592,
- "mean_token_accuracy": 0.8402629494667053,
- "num_tokens": 3819450.0,
- "step": 428
- },
- {
- "epoch": 0.32598784194528874,
- "grad_norm": 1.7302523851394653,
- "learning_rate": 4.953734336083582e-06,
- "loss": 0.422895610332489,
- "mean_token_accuracy": 0.8709704875946045,
- "num_tokens": 3831027.0,
- "step": 429
- },
- {
- "epoch": 0.32674772036474165,
- "grad_norm": 2.427192211151123,
- "learning_rate": 4.953332410440434e-06,
- "loss": 0.6334598064422607,
- "mean_token_accuracy": 0.7817479968070984,
- "num_tokens": 3841776.0,
- "step": 430
- },
- {
- "epoch": 0.32750759878419455,
- "grad_norm": 1.460949182510376,
- "learning_rate": 4.952928762954161e-06,
- "loss": 0.3654777705669403,
- "mean_token_accuracy": 0.8780122995376587,
- "num_tokens": 3852213.0,
- "step": 431
- },
- {
- "epoch": 0.3282674772036474,
- "grad_norm": 1.9855005741119385,
- "learning_rate": 4.952523393908059e-06,
- "loss": 0.5117089748382568,
- "mean_token_accuracy": 0.811911404132843,
- "num_tokens": 3861176.0,
- "step": 432
- },
- {
- "epoch": 0.3290273556231003,
- "grad_norm": 2.2653207778930664,
- "learning_rate": 4.952116303586631e-06,
- "loss": 0.42514950037002563,
- "mean_token_accuracy": 0.8448518514633179,
- "num_tokens": 3867164.0,
- "step": 433
- },
- {
- "epoch": 0.32978723404255317,
- "grad_norm": 1.9780964851379395,
- "learning_rate": 4.951707492275589e-06,
- "loss": 0.5095293521881104,
- "mean_token_accuracy": 0.8262748718261719,
- "num_tokens": 3876406.0,
- "step": 434
- },
- {
- "epoch": 0.3305471124620061,
- "grad_norm": 2.9480233192443848,
- "learning_rate": 4.951296960261853e-06,
- "loss": 0.3494448959827423,
- "mean_token_accuracy": 0.8781307935714722,
- "num_tokens": 3880298.0,
- "step": 435
- },
- {
- "epoch": 0.331306990881459,
- "grad_norm": 2.335571527481079,
- "learning_rate": 4.95088470783355e-06,
- "loss": 0.5456914901733398,
- "mean_token_accuracy": 0.816297173500061,
- "num_tokens": 3886487.0,
- "step": 436
- },
- {
- "epoch": 0.33206686930091184,
- "grad_norm": 2.3046419620513916,
- "learning_rate": 4.950470735280013e-06,
- "loss": 0.4835948944091797,
- "mean_token_accuracy": 0.8539175391197205,
- "num_tokens": 3892706.0,
- "step": 437
- },
- {
- "epoch": 0.33282674772036475,
- "grad_norm": 2.44047474861145,
- "learning_rate": 4.950055042891786e-06,
- "loss": 0.5154092907905579,
- "mean_token_accuracy": 0.8579919338226318,
- "num_tokens": 3899532.0,
- "step": 438
- },
- {
- "epoch": 0.33358662613981765,
- "grad_norm": 4.826764106750488,
- "learning_rate": 4.949637630960618e-06,
- "loss": 0.5270259976387024,
- "mean_token_accuracy": 0.8172192573547363,
- "num_tokens": 3902260.0,
- "step": 439
- },
- {
- "epoch": 0.3343465045592705,
- "grad_norm": 2.001574754714966,
- "learning_rate": 4.949218499779462e-06,
- "loss": 0.5413002967834473,
- "mean_token_accuracy": 0.8162837028503418,
- "num_tokens": 3911706.0,
- "step": 440
- },
- {
- "epoch": 0.3351063829787234,
- "grad_norm": 1.7998944520950317,
- "learning_rate": 4.948797649642484e-06,
- "loss": 0.5131614208221436,
- "mean_token_accuracy": 0.8367440700531006,
- "num_tokens": 3923490.0,
- "step": 441
- },
- {
- "epoch": 0.33586626139817627,
- "grad_norm": 3.4566173553466797,
- "learning_rate": 4.94837508084505e-06,
- "loss": 0.7258909940719604,
- "mean_token_accuracy": 0.771377444267273,
- "num_tokens": 3928099.0,
- "step": 442
- },
- {
- "epoch": 0.3366261398176292,
- "grad_norm": 2.0040442943573,
- "learning_rate": 4.9479507936837364e-06,
- "loss": 0.482135534286499,
- "mean_token_accuracy": 0.8339327573776245,
- "num_tokens": 3937328.0,
- "step": 443
- },
- {
- "epoch": 0.3373860182370821,
- "grad_norm": 2.949502944946289,
- "learning_rate": 4.947524788456325e-06,
- "loss": 0.6474795341491699,
- "mean_token_accuracy": 0.7951677441596985,
- "num_tokens": 3942529.0,
- "step": 444
- },
- {
- "epoch": 0.33814589665653494,
- "grad_norm": 1.5528364181518555,
- "learning_rate": 4.947097065461801e-06,
- "loss": 0.48791584372520447,
- "mean_token_accuracy": 0.8425545692443848,
- "num_tokens": 3955200.0,
- "step": 445
- },
- {
- "epoch": 0.33890577507598785,
- "grad_norm": 1.8813284635543823,
- "learning_rate": 4.946667625000358e-06,
- "loss": 0.45922309160232544,
- "mean_token_accuracy": 0.8206527233123779,
- "num_tokens": 3962975.0,
- "step": 446
- },
- {
- "epoch": 0.33966565349544076,
- "grad_norm": 1.7157847881317139,
- "learning_rate": 4.946236467373392e-06,
- "loss": 0.5454182028770447,
- "mean_token_accuracy": 0.8049604892730713,
- "num_tokens": 3973956.0,
- "step": 447
- },
- {
- "epoch": 0.3404255319148936,
- "grad_norm": 2.008857250213623,
- "learning_rate": 4.945803592883509e-06,
- "loss": 0.5151860117912292,
- "mean_token_accuracy": 0.8262045383453369,
- "num_tokens": 3982853.0,
- "step": 448
- },
- {
- "epoch": 0.3411854103343465,
- "grad_norm": 1.6632496118545532,
- "learning_rate": 4.9453690018345144e-06,
- "loss": 0.42710691690444946,
- "mean_token_accuracy": 0.8521314859390259,
- "num_tokens": 3993838.0,
- "step": 449
- },
- {
- "epoch": 0.34194528875379937,
- "grad_norm": 1.365234375,
- "learning_rate": 4.944932694531423e-06,
- "loss": 0.5172526836395264,
- "mean_token_accuracy": 0.8277045488357544,
- "num_tokens": 4014179.0,
- "step": 450
- },
- {
- "epoch": 0.3427051671732523,
- "grad_norm": 1.7610243558883667,
- "learning_rate": 4.94449467128045e-06,
- "loss": 0.42104798555374146,
- "mean_token_accuracy": 0.8552065491676331,
- "num_tokens": 4023663.0,
- "step": 451
- },
- {
- "epoch": 0.3434650455927052,
- "grad_norm": 2.3732354640960693,
- "learning_rate": 4.944054932389018e-06,
- "loss": 0.5471175909042358,
- "mean_token_accuracy": 0.8487317562103271,
- "num_tokens": 4030100.0,
- "step": 452
- },
- {
- "epoch": 0.34422492401215804,
- "grad_norm": 1.5973623991012573,
- "learning_rate": 4.943613478165753e-06,
- "loss": 0.419813871383667,
- "mean_token_accuracy": 0.8484025001525879,
- "num_tokens": 4041124.0,
- "step": 453
- },
- {
- "epoch": 0.34498480243161095,
- "grad_norm": 2.966381549835205,
- "learning_rate": 4.943170308920484e-06,
- "loss": 0.5370652675628662,
- "mean_token_accuracy": 0.8439491987228394,
- "num_tokens": 4045675.0,
- "step": 454
- },
- {
- "epoch": 0.34574468085106386,
- "grad_norm": 2.5097248554229736,
- "learning_rate": 4.9427254249642445e-06,
- "loss": 0.5776349306106567,
- "mean_token_accuracy": 0.8060523867607117,
- "num_tokens": 4053250.0,
- "step": 455
- },
- {
- "epoch": 0.3465045592705167,
- "grad_norm": 1.6779125928878784,
- "learning_rate": 4.942278826609272e-06,
- "loss": 0.5245476961135864,
- "mean_token_accuracy": 0.8168526887893677,
- "num_tokens": 4064106.0,
- "step": 456
- },
- {
- "epoch": 0.3472644376899696,
- "grad_norm": 1.5945546627044678,
- "learning_rate": 4.9418305141690045e-06,
- "loss": 0.4972047209739685,
- "mean_token_accuracy": 0.8257735967636108,
- "num_tokens": 4077687.0,
- "step": 457
- },
- {
- "epoch": 0.34802431610942247,
- "grad_norm": 2.864778757095337,
- "learning_rate": 4.9413804879580865e-06,
- "loss": 0.5372499823570251,
- "mean_token_accuracy": 0.8423776626586914,
- "num_tokens": 4082632.0,
- "step": 458
- },
- {
- "epoch": 0.3487841945288754,
- "grad_norm": 1.4797078371047974,
- "learning_rate": 4.940928748292363e-06,
- "loss": 0.5903409719467163,
- "mean_token_accuracy": 0.8061295747756958,
- "num_tokens": 4104218.0,
- "step": 459
- },
- {
- "epoch": 0.3495440729483283,
- "grad_norm": 2.4376983642578125,
- "learning_rate": 4.940475295488882e-06,
- "loss": 0.4534894824028015,
- "mean_token_accuracy": 0.8395825028419495,
- "num_tokens": 4110530.0,
- "step": 460
- },
- {
- "epoch": 0.35030395136778114,
- "grad_norm": 1.2955626249313354,
- "learning_rate": 4.940020129865895e-06,
- "loss": 0.47155818343162537,
- "mean_token_accuracy": 0.8253582715988159,
- "num_tokens": 4128398.0,
- "step": 461
- },
- {
- "epoch": 0.35106382978723405,
- "grad_norm": 2.066575527191162,
- "learning_rate": 4.9395632517428546e-06,
- "loss": 0.5555641651153564,
- "mean_token_accuracy": 0.814624547958374,
- "num_tokens": 4137623.0,
- "step": 462
- },
- {
- "epoch": 0.3518237082066869,
- "grad_norm": 1.6407525539398193,
- "learning_rate": 4.939104661440415e-06,
- "loss": 0.4361790418624878,
- "mean_token_accuracy": 0.8544459342956543,
- "num_tokens": 4152803.0,
- "step": 463
- },
- {
- "epoch": 0.3525835866261398,
- "grad_norm": 2.1685116291046143,
- "learning_rate": 4.938644359280433e-06,
- "loss": 0.5347012877464294,
- "mean_token_accuracy": 0.853853702545166,
- "num_tokens": 4160778.0,
- "step": 464
- },
- {
- "epoch": 0.3533434650455927,
- "grad_norm": 1.8824869394302368,
- "learning_rate": 4.938182345585967e-06,
- "loss": 0.5512481927871704,
- "mean_token_accuracy": 0.7985891699790955,
- "num_tokens": 4170380.0,
- "step": 465
- },
- {
- "epoch": 0.3541033434650456,
- "grad_norm": 2.2229504585266113,
- "learning_rate": 4.937718620681273e-06,
- "loss": 0.516828179359436,
- "mean_token_accuracy": 0.8265621066093445,
- "num_tokens": 4178179.0,
- "step": 466
- },
- {
- "epoch": 0.3548632218844985,
- "grad_norm": 1.955990195274353,
- "learning_rate": 4.9372531848918145e-06,
- "loss": 0.5586158037185669,
- "mean_token_accuracy": 0.8367916345596313,
- "num_tokens": 4188626.0,
- "step": 467
- },
- {
- "epoch": 0.3556231003039514,
- "grad_norm": 1.9687023162841797,
- "learning_rate": 4.936786038544251e-06,
- "loss": 0.5517531633377075,
- "mean_token_accuracy": 0.8134098052978516,
- "num_tokens": 4198144.0,
- "step": 468
- },
- {
- "epoch": 0.35638297872340424,
- "grad_norm": 1.405516505241394,
- "learning_rate": 4.9363171819664434e-06,
- "loss": 0.5305492877960205,
- "mean_token_accuracy": 0.8014427423477173,
- "num_tokens": 4222818.0,
- "step": 469
- },
- {
- "epoch": 0.35714285714285715,
- "grad_norm": 2.6355695724487305,
- "learning_rate": 4.9358466154874535e-06,
- "loss": 0.5303391218185425,
- "mean_token_accuracy": 0.8028861284255981,
- "num_tokens": 4228318.0,
- "step": 470
- },
- {
- "epoch": 0.35790273556231,
- "grad_norm": 1.5133824348449707,
- "learning_rate": 4.935374339437543e-06,
- "loss": 0.5329189300537109,
- "mean_token_accuracy": 0.8479441404342651,
- "num_tokens": 4244527.0,
- "step": 471
- },
- {
- "epoch": 0.3586626139817629,
- "grad_norm": 3.4356725215911865,
- "learning_rate": 4.934900354148173e-06,
- "loss": 0.5431582927703857,
- "mean_token_accuracy": 0.8328983783721924,
- "num_tokens": 4248034.0,
- "step": 472
- },
- {
- "epoch": 0.3594224924012158,
- "grad_norm": 2.5789499282836914,
- "learning_rate": 4.934424659952006e-06,
- "loss": 0.4141455292701721,
- "mean_token_accuracy": 0.8658635020256042,
- "num_tokens": 4252953.0,
- "step": 473
- },
- {
- "epoch": 0.3601823708206687,
- "grad_norm": 1.145262598991394,
- "learning_rate": 4.933947257182901e-06,
- "loss": 0.40294092893600464,
- "mean_token_accuracy": 0.8565847277641296,
- "num_tokens": 4277813.0,
- "step": 474
- },
- {
- "epoch": 0.3609422492401216,
- "grad_norm": 1.7242133617401123,
- "learning_rate": 4.933468146175918e-06,
- "loss": 0.6036738753318787,
- "mean_token_accuracy": 0.8072597980499268,
- "num_tokens": 4291088.0,
- "step": 475
- },
- {
- "epoch": 0.3617021276595745,
- "grad_norm": 2.3490941524505615,
- "learning_rate": 4.932987327267317e-06,
- "loss": 0.49456146359443665,
- "mean_token_accuracy": 0.8372673988342285,
- "num_tokens": 4297376.0,
- "step": 476
- },
- {
- "epoch": 0.36246200607902734,
- "grad_norm": 1.3605526685714722,
- "learning_rate": 4.932504800794553e-06,
- "loss": 0.43595948815345764,
- "mean_token_accuracy": 0.8415953516960144,
- "num_tokens": 4312054.0,
- "step": 477
- },
- {
- "epoch": 0.36322188449848025,
- "grad_norm": 1.4525885581970215,
- "learning_rate": 4.9320205670962815e-06,
- "loss": 0.5390371680259705,
- "mean_token_accuracy": 0.8101649284362793,
- "num_tokens": 4328701.0,
- "step": 478
- },
- {
- "epoch": 0.3639817629179331,
- "grad_norm": 1.9862419366836548,
- "learning_rate": 4.931534626512359e-06,
- "loss": 0.45436930656433105,
- "mean_token_accuracy": 0.8352861404418945,
- "num_tokens": 4338372.0,
- "step": 479
- },
- {
- "epoch": 0.364741641337386,
- "grad_norm": 1.7804961204528809,
- "learning_rate": 4.931046979383836e-06,
- "loss": 0.4677754044532776,
- "mean_token_accuracy": 0.840467095375061,
- "num_tokens": 4347897.0,
- "step": 480
- },
- {
- "epoch": 0.3655015197568389,
- "grad_norm": 2.066632032394409,
- "learning_rate": 4.930557626052961e-06,
- "loss": 0.42418140172958374,
- "mean_token_accuracy": 0.8528275489807129,
- "num_tokens": 4354061.0,
- "step": 481
- },
- {
- "epoch": 0.3662613981762918,
- "grad_norm": 1.6155282258987427,
- "learning_rate": 4.930066566863182e-06,
- "loss": 0.5424284934997559,
- "mean_token_accuracy": 0.825040876865387,
- "num_tokens": 4370400.0,
- "step": 482
- },
- {
- "epoch": 0.3670212765957447,
- "grad_norm": 2.1452953815460205,
- "learning_rate": 4.929573802159143e-06,
- "loss": 0.5105804204940796,
- "mean_token_accuracy": 0.8284053802490234,
- "num_tokens": 4377579.0,
- "step": 483
- },
- {
- "epoch": 0.3677811550151976,
- "grad_norm": 1.8940945863723755,
- "learning_rate": 4.929079332286685e-06,
- "loss": 0.43478304147720337,
- "mean_token_accuracy": 0.8505665063858032,
- "num_tokens": 4385686.0,
- "step": 484
- },
- {
- "epoch": 0.36854103343465044,
- "grad_norm": 1.6785860061645508,
- "learning_rate": 4.928583157592846e-06,
- "loss": 0.40227848291397095,
- "mean_token_accuracy": 0.8623573780059814,
- "num_tokens": 4396128.0,
- "step": 485
- },
- {
- "epoch": 0.36930091185410335,
- "grad_norm": 1.6416733264923096,
- "learning_rate": 4.928085278425862e-06,
- "loss": 0.526267409324646,
- "mean_token_accuracy": 0.8284667730331421,
- "num_tokens": 4407963.0,
- "step": 486
- },
- {
- "epoch": 0.3700607902735562,
- "grad_norm": 1.8882389068603516,
- "learning_rate": 4.927585695135162e-06,
- "loss": 0.5555213093757629,
- "mean_token_accuracy": 0.8115293979644775,
- "num_tokens": 4418057.0,
- "step": 487
- },
- {
- "epoch": 0.3708206686930091,
- "grad_norm": 2.300248384475708,
- "learning_rate": 4.9270844080713735e-06,
- "loss": 0.5812339186668396,
- "mean_token_accuracy": 0.800270676612854,
- "num_tokens": 4425358.0,
- "step": 488
- },
- {
- "epoch": 0.371580547112462,
- "grad_norm": 1.6802922487258911,
- "learning_rate": 4.926581417586319e-06,
- "loss": 0.5134941935539246,
- "mean_token_accuracy": 0.8247408866882324,
- "num_tokens": 4437702.0,
- "step": 489
- },
- {
- "epoch": 0.3723404255319149,
- "grad_norm": 1.7620291709899902,
- "learning_rate": 4.926076724033016e-06,
- "loss": 0.5233973264694214,
- "mean_token_accuracy": 0.8102161884307861,
- "num_tokens": 4448584.0,
- "step": 490
- },
- {
- "epoch": 0.3731003039513678,
- "grad_norm": 1.6911998987197876,
- "learning_rate": 4.925570327765678e-06,
- "loss": 0.5337274074554443,
- "mean_token_accuracy": 0.845306396484375,
- "num_tokens": 4462651.0,
- "step": 491
- },
- {
- "epoch": 0.3738601823708207,
- "grad_norm": 1.7991242408752441,
- "learning_rate": 4.9250622291397144e-06,
- "loss": 0.31018948554992676,
- "mean_token_accuracy": 0.8857606053352356,
- "num_tokens": 4469971.0,
- "step": 492
- },
- {
- "epoch": 0.37462006079027355,
- "grad_norm": 4.9776835441589355,
- "learning_rate": 4.924552428511727e-06,
- "loss": 0.44114983081817627,
- "mean_token_accuracy": 0.8429906368255615,
- "num_tokens": 4478275.0,
- "step": 493
- },
- {
- "epoch": 0.37537993920972645,
- "grad_norm": 1.8007272481918335,
- "learning_rate": 4.924040926239515e-06,
- "loss": 0.574328601360321,
- "mean_token_accuracy": 0.7669196128845215,
- "num_tokens": 4491551.0,
- "step": 494
- },
- {
- "epoch": 0.3761398176291793,
- "grad_norm": 2.021300792694092,
- "learning_rate": 4.92352772268207e-06,
- "loss": 0.45636120438575745,
- "mean_token_accuracy": 0.840438723564148,
- "num_tokens": 4498658.0,
- "step": 495
- },
- {
- "epoch": 0.3768996960486322,
- "grad_norm": 2.369748592376709,
- "learning_rate": 4.923012818199576e-06,
- "loss": 0.5206376910209656,
- "mean_token_accuracy": 0.8521823287010193,
- "num_tokens": 4504648.0,
- "step": 496
- },
- {
- "epoch": 0.3776595744680851,
- "grad_norm": 2.733485221862793,
- "learning_rate": 4.922496213153416e-06,
- "loss": 0.5067723989486694,
- "mean_token_accuracy": 0.8168281316757202,
- "num_tokens": 4509990.0,
- "step": 497
- },
- {
- "epoch": 0.378419452887538,
- "grad_norm": 2.3751676082611084,
- "learning_rate": 4.921977907906161e-06,
- "loss": 0.49757206439971924,
- "mean_token_accuracy": 0.8325017690658569,
- "num_tokens": 4518373.0,
- "step": 498
- },
- {
- "epoch": 0.3791793313069909,
- "grad_norm": 2.1672775745391846,
- "learning_rate": 4.921457902821578e-06,
- "loss": 0.4237566590309143,
- "mean_token_accuracy": 0.8404698371887207,
- "num_tokens": 4524338.0,
- "step": 499
- },
- {
- "epoch": 0.3799392097264438,
- "grad_norm": 1.8374360799789429,
- "learning_rate": 4.9209361982646275e-06,
- "loss": 0.4995468854904175,
- "mean_token_accuracy": 0.8299649953842163,
- "num_tokens": 4533396.0,
- "step": 500
- },
- {
- "epoch": 0.38069908814589665,
- "grad_norm": 2.083967924118042,
- "learning_rate": 4.920412794601461e-06,
- "loss": 0.489935040473938,
- "mean_token_accuracy": 0.8315291404724121,
- "num_tokens": 4540941.0,
- "step": 501
- },
- {
- "epoch": 0.38145896656534956,
- "grad_norm": 2.2075610160827637,
- "learning_rate": 4.919887692199423e-06,
- "loss": 0.5233147740364075,
- "mean_token_accuracy": 0.804171085357666,
- "num_tokens": 4548215.0,
- "step": 502
- },
- {
- "epoch": 0.3822188449848024,
- "grad_norm": 2.076775312423706,
- "learning_rate": 4.9193608914270515e-06,
- "loss": 0.5785550475120544,
- "mean_token_accuracy": 0.7993186116218567,
- "num_tokens": 4558204.0,
- "step": 503
- },
- {
- "epoch": 0.3829787234042553,
- "grad_norm": 2.238546133041382,
- "learning_rate": 4.918832392654075e-06,
- "loss": 0.5287384390830994,
- "mean_token_accuracy": 0.8214945793151855,
- "num_tokens": 4565407.0,
- "step": 504
- },
- {
- "epoch": 0.3837386018237082,
- "grad_norm": 1.6783074140548706,
- "learning_rate": 4.9183021962514145e-06,
- "loss": 0.6063359379768372,
- "mean_token_accuracy": 0.7914625406265259,
- "num_tokens": 4580991.0,
- "step": 505
- },
- {
- "epoch": 0.3844984802431611,
- "grad_norm": 1.6287449598312378,
- "learning_rate": 4.917770302591183e-06,
- "loss": 0.3598247766494751,
- "mean_token_accuracy": 0.8706809878349304,
- "num_tokens": 4590579.0,
- "step": 506
- },
- {
- "epoch": 0.385258358662614,
- "grad_norm": 1.5432041883468628,
- "learning_rate": 4.917236712046682e-06,
- "loss": 0.5267890095710754,
- "mean_token_accuracy": 0.8032117486000061,
- "num_tokens": 4608380.0,
- "step": 507
- },
- {
- "epoch": 0.3860182370820669,
- "grad_norm": 1.7664037942886353,
- "learning_rate": 4.9167014249924075e-06,
- "loss": 0.3552354574203491,
- "mean_token_accuracy": 0.8569793701171875,
- "num_tokens": 4616426.0,
- "step": 508
- },
- {
- "epoch": 0.38677811550151975,
- "grad_norm": 2.1147472858428955,
- "learning_rate": 4.916164441804044e-06,
- "loss": 0.5212404727935791,
- "mean_token_accuracy": 0.8196578025817871,
- "num_tokens": 4623908.0,
- "step": 509
- },
- {
- "epoch": 0.38753799392097266,
- "grad_norm": 2.1092333793640137,
- "learning_rate": 4.915625762858467e-06,
- "loss": 0.5197038650512695,
- "mean_token_accuracy": 0.8245604634284973,
- "num_tokens": 4630956.0,
- "step": 510
- },
- {
- "epoch": 0.3882978723404255,
- "grad_norm": 1.23331880569458,
- "learning_rate": 4.915085388533743e-06,
- "loss": 0.4759839177131653,
- "mean_token_accuracy": 0.8192248344421387,
- "num_tokens": 4651269.0,
- "step": 511
- },
- {
- "epoch": 0.3890577507598784,
- "grad_norm": 2.424199104309082,
- "learning_rate": 4.914543319209126e-06,
- "loss": 0.5576270818710327,
- "mean_token_accuracy": 0.8203302621841431,
- "num_tokens": 4657296.0,
- "step": 512
- },
- {
- "epoch": 0.3898176291793313,
- "grad_norm": 2.725156307220459,
- "learning_rate": 4.913999555265062e-06,
- "loss": 0.4337949752807617,
- "mean_token_accuracy": 0.8382406234741211,
- "num_tokens": 4661850.0,
- "step": 513
- },
- {
- "epoch": 0.3905775075987842,
- "grad_norm": 2.3120534420013428,
- "learning_rate": 4.913454097083185e-06,
- "loss": 0.4941597580909729,
- "mean_token_accuracy": 0.8302834033966064,
- "num_tokens": 4667769.0,
- "step": 514
- },
- {
- "epoch": 0.3913373860182371,
- "grad_norm": 2.3111207485198975,
- "learning_rate": 4.912906945046319e-06,
- "loss": 0.5253715515136719,
- "mean_token_accuracy": 0.84515380859375,
- "num_tokens": 4674537.0,
- "step": 515
- },
- {
- "epoch": 0.39209726443769,
- "grad_norm": 1.4117841720581055,
- "learning_rate": 4.912358099538476e-06,
- "loss": 0.4521017074584961,
- "mean_token_accuracy": 0.8208256959915161,
- "num_tokens": 4690605.0,
- "step": 516
- },
- {
- "epoch": 0.39285714285714285,
- "grad_norm": 2.3742799758911133,
- "learning_rate": 4.911807560944858e-06,
- "loss": 0.41572901606559753,
- "mean_token_accuracy": 0.8550551533699036,
- "num_tokens": 4706437.0,
- "step": 517
- },
- {
- "epoch": 0.39361702127659576,
- "grad_norm": 2.4052202701568604,
- "learning_rate": 4.911255329651852e-06,
- "loss": 0.6003736257553101,
- "mean_token_accuracy": 0.8247885704040527,
- "num_tokens": 4712746.0,
- "step": 518
- },
- {
- "epoch": 0.3943768996960486,
- "grad_norm": 1.9335490465164185,
- "learning_rate": 4.910701406047037e-06,
- "loss": 0.5457713603973389,
- "mean_token_accuracy": 0.787429690361023,
- "num_tokens": 4731937.0,
- "step": 519
- },
- {
- "epoch": 0.3951367781155015,
- "grad_norm": 2.257706880569458,
- "learning_rate": 4.910145790519177e-06,
- "loss": 0.5300652980804443,
- "mean_token_accuracy": 0.8192912936210632,
- "num_tokens": 4739422.0,
- "step": 520
- },
- {
- "epoch": 0.3958966565349544,
- "grad_norm": 1.2099462747573853,
- "learning_rate": 4.9095884834582256e-06,
- "loss": 0.45872747898101807,
- "mean_token_accuracy": 0.8362667560577393,
- "num_tokens": 4757113.0,
- "step": 521
- },
- {
- "epoch": 0.3966565349544073,
- "grad_norm": 2.7991135120391846,
- "learning_rate": 4.909029485255321e-06,
- "loss": 0.49039560556411743,
- "mean_token_accuracy": 0.8260016441345215,
- "num_tokens": 4761709.0,
- "step": 522
- },
- {
- "epoch": 0.3974164133738602,
- "grad_norm": 2.2360129356384277,
- "learning_rate": 4.90846879630279e-06,
- "loss": 0.49556830525398254,
- "mean_token_accuracy": 0.827864408493042,
- "num_tokens": 4769048.0,
- "step": 523
- },
- {
- "epoch": 0.3981762917933131,
- "grad_norm": 2.5953688621520996,
- "learning_rate": 4.907906416994146e-06,
- "loss": 0.387208491563797,
- "mean_token_accuracy": 0.8467001914978027,
- "num_tokens": 4774637.0,
- "step": 524
- },
- {
- "epoch": 0.39893617021276595,
- "grad_norm": 2.1046814918518066,
- "learning_rate": 4.907342347724088e-06,
- "loss": 0.5477259755134583,
- "mean_token_accuracy": 0.8060322999954224,
- "num_tokens": 4782774.0,
- "step": 525
- },
- {
- "epoch": 0.39969604863221886,
- "grad_norm": 2.5622646808624268,
- "learning_rate": 4.906776588888502e-06,
- "loss": 0.5684159398078918,
- "mean_token_accuracy": 0.8095303177833557,
- "num_tokens": 4788766.0,
- "step": 526
- },
- {
- "epoch": 0.4004559270516717,
- "grad_norm": 1.9027913808822632,
- "learning_rate": 4.906209140884459e-06,
- "loss": 0.535524845123291,
- "mean_token_accuracy": 0.815237820148468,
- "num_tokens": 4798492.0,
- "step": 527
- },
- {
- "epoch": 0.4012158054711246,
- "grad_norm": 2.1447622776031494,
- "learning_rate": 4.905640004110216e-06,
- "loss": 0.5628632307052612,
- "mean_token_accuracy": 0.8085395097732544,
- "num_tokens": 4805737.0,
- "step": 528
- },
- {
- "epoch": 0.40197568389057753,
- "grad_norm": 1.6754741668701172,
- "learning_rate": 4.905069178965215e-06,
- "loss": 0.5046736598014832,
- "mean_token_accuracy": 0.8247535228729248,
- "num_tokens": 4816912.0,
- "step": 529
- },
- {
- "epoch": 0.4027355623100304,
- "grad_norm": 2.271230459213257,
- "learning_rate": 4.904496665850083e-06,
- "loss": 0.6086187958717346,
- "mean_token_accuracy": 0.7935276627540588,
- "num_tokens": 4824577.0,
- "step": 530
- },
- {
- "epoch": 0.4034954407294833,
- "grad_norm": 2.107595205307007,
- "learning_rate": 4.903922465166633e-06,
- "loss": 0.5431341528892517,
- "mean_token_accuracy": 0.8129537105560303,
- "num_tokens": 4831772.0,
- "step": 531
- },
- {
- "epoch": 0.40425531914893614,
- "grad_norm": 1.3860732316970825,
- "learning_rate": 4.903346577317859e-06,
- "loss": 0.45816320180892944,
- "mean_token_accuracy": 0.8328287601470947,
- "num_tokens": 4850302.0,
- "step": 532
- },
- {
- "epoch": 0.40501519756838905,
- "grad_norm": 1.9186837673187256,
- "learning_rate": 4.902769002707942e-06,
- "loss": 0.3294633626937866,
- "mean_token_accuracy": 0.8853933811187744,
- "num_tokens": 4856624.0,
- "step": 533
- },
- {
- "epoch": 0.40577507598784196,
- "grad_norm": 1.516194462776184,
- "learning_rate": 4.902189741742247e-06,
- "loss": 0.45482105016708374,
- "mean_token_accuracy": 0.8370342254638672,
- "num_tokens": 4870395.0,
- "step": 534
- },
- {
- "epoch": 0.4065349544072948,
- "grad_norm": 2.3235628604888916,
- "learning_rate": 4.901608794827321e-06,
- "loss": 0.40688639879226685,
- "mean_token_accuracy": 0.8643521666526794,
- "num_tokens": 4875645.0,
- "step": 535
- },
- {
- "epoch": 0.4072948328267477,
- "grad_norm": 2.29286527633667,
- "learning_rate": 4.9010261623708945e-06,
- "loss": 0.45482826232910156,
- "mean_token_accuracy": 0.8429383039474487,
- "num_tokens": 4881772.0,
- "step": 536
- },
- {
- "epoch": 0.40805471124620063,
- "grad_norm": 1.5907070636749268,
- "learning_rate": 4.900441844781882e-06,
- "loss": 0.5266948342323303,
- "mean_token_accuracy": 0.8348641395568848,
- "num_tokens": 4894289.0,
- "step": 537
- },
- {
- "epoch": 0.4088145896656535,
- "grad_norm": 2.1816294193267822,
- "learning_rate": 4.89985584247038e-06,
- "loss": 0.4797617793083191,
- "mean_token_accuracy": 0.8549500703811646,
- "num_tokens": 4901106.0,
- "step": 538
- },
- {
- "epoch": 0.4095744680851064,
- "grad_norm": 1.7347146272659302,
- "learning_rate": 4.899268155847667e-06,
- "loss": 0.4754739999771118,
- "mean_token_accuracy": 0.8278418183326721,
- "num_tokens": 4912131.0,
- "step": 539
- },
- {
- "epoch": 0.41033434650455924,
- "grad_norm": 2.0694527626037598,
- "learning_rate": 4.898678785326205e-06,
- "loss": 0.5071008801460266,
- "mean_token_accuracy": 0.8157946467399597,
- "num_tokens": 4921141.0,
- "step": 540
- },
- {
- "epoch": 0.41109422492401215,
- "grad_norm": 2.570047616958618,
- "learning_rate": 4.898087731319637e-06,
- "loss": 0.43639278411865234,
- "mean_token_accuracy": 0.8682913780212402,
- "num_tokens": 4926182.0,
- "step": 541
- },
- {
- "epoch": 0.41185410334346506,
- "grad_norm": 4.064006805419922,
- "learning_rate": 4.8974949942427854e-06,
- "loss": 0.539260745048523,
- "mean_token_accuracy": 0.8225528001785278,
- "num_tokens": 4929449.0,
- "step": 542
- },
- {
- "epoch": 0.4126139817629179,
- "grad_norm": 1.7644332647323608,
- "learning_rate": 4.896900574511657e-06,
- "loss": 0.472618043422699,
- "mean_token_accuracy": 0.8332902193069458,
- "num_tokens": 4939443.0,
- "step": 543
- },
- {
- "epoch": 0.4133738601823708,
- "grad_norm": 2.879918336868286,
- "learning_rate": 4.89630447254344e-06,
- "loss": 0.6360667943954468,
- "mean_token_accuracy": 0.8215296268463135,
- "num_tokens": 4950838.0,
- "step": 544
- },
- {
- "epoch": 0.41413373860182373,
- "grad_norm": 1.4575570821762085,
- "learning_rate": 4.8957066887565005e-06,
- "loss": 0.45617997646331787,
- "mean_token_accuracy": 0.8373187184333801,
- "num_tokens": 4965222.0,
- "step": 545
- },
- {
- "epoch": 0.4148936170212766,
- "grad_norm": 2.4829535484313965,
- "learning_rate": 4.895107223570386e-06,
- "loss": 0.42285341024398804,
- "mean_token_accuracy": 0.8686380386352539,
- "num_tokens": 4970724.0,
- "step": 546
- },
- {
- "epoch": 0.4156534954407295,
- "grad_norm": 2.639474630355835,
- "learning_rate": 4.894506077405824e-06,
- "loss": 0.5906289219856262,
- "mean_token_accuracy": 0.8174435496330261,
- "num_tokens": 4976766.0,
- "step": 547
- },
- {
- "epoch": 0.41641337386018235,
- "grad_norm": 2.7960562705993652,
- "learning_rate": 4.893903250684723e-06,
- "loss": 0.4518949091434479,
- "mean_token_accuracy": 0.8387585282325745,
- "num_tokens": 4980991.0,
- "step": 548
- },
- {
- "epoch": 0.41717325227963525,
- "grad_norm": 2.184176206588745,
- "learning_rate": 4.893298743830168e-06,
- "loss": 0.5223842859268188,
- "mean_token_accuracy": 0.8170937299728394,
- "num_tokens": 4987781.0,
- "step": 549
- },
- {
- "epoch": 0.41793313069908816,
- "grad_norm": 2.2393438816070557,
- "learning_rate": 4.892692557266429e-06,
- "loss": 0.5238431692123413,
- "mean_token_accuracy": 0.8217905759811401,
- "num_tokens": 4994321.0,
- "step": 550
- },
- {
- "epoch": 0.418693009118541,
- "grad_norm": 3.579047441482544,
- "learning_rate": 4.8920846914189465e-06,
- "loss": 0.5367584228515625,
- "mean_token_accuracy": 0.8312011361122131,
- "num_tokens": 4997951.0,
- "step": 551
- },
- {
- "epoch": 0.4194528875379939,
- "grad_norm": 1.6330240964889526,
- "learning_rate": 4.891475146714348e-06,
- "loss": 0.6054705381393433,
- "mean_token_accuracy": 0.7938206791877747,
- "num_tokens": 5012726.0,
- "step": 552
- },
- {
- "epoch": 0.42021276595744683,
- "grad_norm": 1.5775716304779053,
- "learning_rate": 4.8908639235804324e-06,
- "loss": 0.4774656891822815,
- "mean_token_accuracy": 0.828762948513031,
- "num_tokens": 5026751.0,
- "step": 553
- },
- {
- "epoch": 0.4209726443768997,
- "grad_norm": 1.5719101428985596,
- "learning_rate": 4.890251022446181e-06,
- "loss": 0.549429178237915,
- "mean_token_accuracy": 0.8110791444778442,
- "num_tokens": 5041861.0,
- "step": 554
- },
- {
- "epoch": 0.4217325227963526,
- "grad_norm": 1.8585275411605835,
- "learning_rate": 4.889636443741752e-06,
- "loss": 0.4448118805885315,
- "mean_token_accuracy": 0.8462690711021423,
- "num_tokens": 5052690.0,
- "step": 555
- },
- {
- "epoch": 0.42249240121580545,
- "grad_norm": 2.189202070236206,
- "learning_rate": 4.88902018789848e-06,
- "loss": 0.4296762943267822,
- "mean_token_accuracy": 0.8488791584968567,
- "num_tokens": 5058964.0,
- "step": 556
- },
- {
- "epoch": 0.42325227963525835,
- "grad_norm": 1.9328460693359375,
- "learning_rate": 4.888402255348877e-06,
- "loss": 0.5369474291801453,
- "mean_token_accuracy": 0.8184729814529419,
- "num_tokens": 5068465.0,
- "step": 557
- },
- {
- "epoch": 0.42401215805471126,
- "grad_norm": 1.6233323812484741,
- "learning_rate": 4.887782646526631e-06,
- "loss": 0.5284391641616821,
- "mean_token_accuracy": 0.8276044726371765,
- "num_tokens": 5081052.0,
- "step": 558
- },
- {
- "epoch": 0.4247720364741641,
- "grad_norm": 2.222813844680786,
- "learning_rate": 4.887161361866608e-06,
- "loss": 0.5679137706756592,
- "mean_token_accuracy": 0.8012375831604004,
- "num_tokens": 5090001.0,
- "step": 559
- },
- {
- "epoch": 0.425531914893617,
- "grad_norm": 2.1062207221984863,
- "learning_rate": 4.8865384018048494e-06,
- "loss": 0.5554201602935791,
- "mean_token_accuracy": 0.8128066062927246,
- "num_tokens": 5097644.0,
- "step": 560
- },
- {
- "epoch": 0.42629179331306993,
- "grad_norm": 1.5380984544754028,
- "learning_rate": 4.8859137667785735e-06,
- "loss": 0.4948265850543976,
- "mean_token_accuracy": 0.8258291482925415,
- "num_tokens": 5110069.0,
- "step": 561
- },
- {
- "epoch": 0.4270516717325228,
- "grad_norm": 2.0290257930755615,
- "learning_rate": 4.8852874572261715e-06,
- "loss": 0.4969530403614044,
- "mean_token_accuracy": 0.8297134637832642,
- "num_tokens": 5117452.0,
- "step": 562
- },
- {
- "epoch": 0.4278115501519757,
- "grad_norm": 1.5651452541351318,
- "learning_rate": 4.884659473587213e-06,
- "loss": 0.5353102087974548,
- "mean_token_accuracy": 0.8161719441413879,
- "num_tokens": 5133756.0,
- "step": 563
- },
- {
- "epoch": 0.42857142857142855,
- "grad_norm": 2.2470998764038086,
- "learning_rate": 4.884029816302441e-06,
- "loss": 0.5104288458824158,
- "mean_token_accuracy": 0.8081635236740112,
- "num_tokens": 5140278.0,
- "step": 564
- },
- {
- "epoch": 0.42933130699088146,
- "grad_norm": 1.726891279220581,
- "learning_rate": 4.883398485813772e-06,
- "loss": 0.4508771002292633,
- "mean_token_accuracy": 0.8548800349235535,
- "num_tokens": 5150115.0,
- "step": 565
- },
- {
- "epoch": 0.43009118541033436,
- "grad_norm": 1.4779289960861206,
- "learning_rate": 4.8827654825642984e-06,
- "loss": 0.46861088275909424,
- "mean_token_accuracy": 0.8209476470947266,
- "num_tokens": 5163225.0,
- "step": 566
- },
- {
- "epoch": 0.4308510638297872,
- "grad_norm": 1.2361034154891968,
- "learning_rate": 4.882130806998287e-06,
- "loss": 0.4591076672077179,
- "mean_token_accuracy": 0.803041934967041,
- "num_tokens": 5180342.0,
- "step": 567
- },
- {
- "epoch": 0.4316109422492401,
- "grad_norm": 1.882467269897461,
- "learning_rate": 4.881494459561177e-06,
- "loss": 0.579258143901825,
- "mean_token_accuracy": 0.8007112741470337,
- "num_tokens": 5189595.0,
- "step": 568
- },
- {
- "epoch": 0.43237082066869303,
- "grad_norm": 1.095462441444397,
- "learning_rate": 4.880856440699582e-06,
- "loss": 0.3806574046611786,
- "mean_token_accuracy": 0.8650111556053162,
- "num_tokens": 5211642.0,
- "step": 569
- },
- {
- "epoch": 0.4331306990881459,
- "grad_norm": 1.6469846963882446,
- "learning_rate": 4.880216750861288e-06,
- "loss": 0.544589638710022,
- "mean_token_accuracy": 0.8060122728347778,
- "num_tokens": 5224137.0,
- "step": 570
- },
- {
- "epoch": 0.4338905775075988,
- "grad_norm": 1.8561251163482666,
- "learning_rate": 4.879575390495254e-06,
- "loss": 0.4094924330711365,
- "mean_token_accuracy": 0.8591406345367432,
- "num_tokens": 5231588.0,
- "step": 571
- },
- {
- "epoch": 0.43465045592705165,
- "grad_norm": 3.01326847076416,
- "learning_rate": 4.878932360051611e-06,
- "loss": 0.6139192581176758,
- "mean_token_accuracy": 0.8108739852905273,
- "num_tokens": 5236853.0,
- "step": 572
- },
- {
- "epoch": 0.43541033434650456,
- "grad_norm": 2.1753034591674805,
- "learning_rate": 4.878287659981663e-06,
- "loss": 0.49082931876182556,
- "mean_token_accuracy": 0.862828254699707,
- "num_tokens": 5243264.0,
- "step": 573
- },
- {
- "epoch": 0.43617021276595747,
- "grad_norm": 1.4437755346298218,
- "learning_rate": 4.8776412907378845e-06,
- "loss": 0.5608728528022766,
- "mean_token_accuracy": 0.8271626234054565,
- "num_tokens": 5261757.0,
- "step": 574
- },
- {
- "epoch": 0.4369300911854103,
- "grad_norm": 1.786683440208435,
- "learning_rate": 4.876993252773923e-06,
- "loss": 0.4377627968788147,
- "mean_token_accuracy": 0.844936192035675,
- "num_tokens": 5271038.0,
- "step": 575
- },
- {
- "epoch": 0.4376899696048632,
- "grad_norm": 1.3425915241241455,
- "learning_rate": 4.876343546544596e-06,
- "loss": 0.44762521982192993,
- "mean_token_accuracy": 0.8397793769836426,
- "num_tokens": 5285555.0,
- "step": 576
- },
- {
- "epoch": 0.43844984802431614,
- "grad_norm": 2.1549675464630127,
- "learning_rate": 4.8756921725058935e-06,
- "loss": 0.5332942008972168,
- "mean_token_accuracy": 0.820149302482605,
- "num_tokens": 5294595.0,
- "step": 577
- },
- {
- "epoch": 0.439209726443769,
- "grad_norm": 1.5254042148590088,
- "learning_rate": 4.875039131114975e-06,
- "loss": 0.3646543622016907,
- "mean_token_accuracy": 0.8442583084106445,
- "num_tokens": 5304955.0,
- "step": 578
- },
- {
- "epoch": 0.4399696048632219,
- "grad_norm": 1.5751557350158691,
- "learning_rate": 4.8743844228301676e-06,
- "loss": 0.4854734539985657,
- "mean_token_accuracy": 0.8317523002624512,
- "num_tokens": 5317351.0,
- "step": 579
- },
- {
- "epoch": 0.44072948328267475,
- "grad_norm": 1.6950466632843018,
- "learning_rate": 4.873728048110973e-06,
- "loss": 0.5907570719718933,
- "mean_token_accuracy": 0.7946986556053162,
- "num_tokens": 5332542.0,
- "step": 580
- },
- {
- "epoch": 0.44148936170212766,
- "grad_norm": 2.1180708408355713,
- "learning_rate": 4.873070007418059e-06,
- "loss": 0.5220296382904053,
- "mean_token_accuracy": 0.8037363290786743,
- "num_tokens": 5341722.0,
- "step": 581
- },
- {
- "epoch": 0.44224924012158057,
- "grad_norm": 1.3643816709518433,
- "learning_rate": 4.872410301213265e-06,
- "loss": 0.4865502417087555,
- "mean_token_accuracy": 0.8377852439880371,
- "num_tokens": 5359359.0,
- "step": 582
- },
- {
- "epoch": 0.4430091185410334,
- "grad_norm": 1.483280897140503,
- "learning_rate": 4.871748929959598e-06,
- "loss": 0.36856764554977417,
- "mean_token_accuracy": 0.8709549903869629,
- "num_tokens": 5369749.0,
- "step": 583
- },
- {
- "epoch": 0.44376899696048633,
- "grad_norm": 1.6891541481018066,
- "learning_rate": 4.871085894121234e-06,
- "loss": 0.5768930912017822,
- "mean_token_accuracy": 0.8030461668968201,
- "num_tokens": 5383912.0,
- "step": 584
- },
- {
- "epoch": 0.44452887537993924,
- "grad_norm": 2.1318740844726562,
- "learning_rate": 4.870421194163515e-06,
- "loss": 0.4337100386619568,
- "mean_token_accuracy": 0.8562518358230591,
- "num_tokens": 5389412.0,
- "step": 585
- },
- {
- "epoch": 0.4452887537993921,
- "grad_norm": 2.540255546569824,
- "learning_rate": 4.869754830552956e-06,
- "loss": 0.4708256125450134,
- "mean_token_accuracy": 0.8446552753448486,
- "num_tokens": 5394762.0,
- "step": 586
- },
- {
- "epoch": 0.446048632218845,
- "grad_norm": 2.048015594482422,
- "learning_rate": 4.869086803757235e-06,
- "loss": 0.5265800952911377,
- "mean_token_accuracy": 0.8181137442588806,
- "num_tokens": 5402379.0,
- "step": 587
- },
- {
- "epoch": 0.44680851063829785,
- "grad_norm": 2.9821012020111084,
- "learning_rate": 4.868417114245199e-06,
- "loss": 0.6299797296524048,
- "mean_token_accuracy": 0.8237329125404358,
- "num_tokens": 5408229.0,
- "step": 588
- },
- {
- "epoch": 0.44756838905775076,
- "grad_norm": 1.7807202339172363,
- "learning_rate": 4.867745762486862e-06,
- "loss": 0.5176759958267212,
- "mean_token_accuracy": 0.8184244632720947,
- "num_tokens": 5418383.0,
- "step": 589
- },
- {
- "epoch": 0.44832826747720367,
- "grad_norm": 1.5466399192810059,
- "learning_rate": 4.8670727489534035e-06,
- "loss": 0.5137228965759277,
- "mean_token_accuracy": 0.8365053534507751,
- "num_tokens": 5432127.0,
- "step": 590
- },
- {
- "epoch": 0.4490881458966565,
- "grad_norm": 2.9521141052246094,
- "learning_rate": 4.866398074117173e-06,
- "loss": 0.4056887924671173,
- "mean_token_accuracy": 0.8561501502990723,
- "num_tokens": 5436062.0,
- "step": 591
- },
- {
- "epoch": 0.44984802431610943,
- "grad_norm": 2.058743953704834,
- "learning_rate": 4.86572173845168e-06,
- "loss": 0.6124799251556396,
- "mean_token_accuracy": 0.8007957339286804,
- "num_tokens": 5444989.0,
- "step": 592
- },
- {
- "epoch": 0.4506079027355623,
- "grad_norm": 2.1243767738342285,
- "learning_rate": 4.865043742431605e-06,
- "loss": 0.5659694671630859,
- "mean_token_accuracy": 0.8084750175476074,
- "num_tokens": 5453865.0,
- "step": 593
- },
- {
- "epoch": 0.4513677811550152,
- "grad_norm": 1.6732314825057983,
- "learning_rate": 4.864364086532792e-06,
- "loss": 0.47879064083099365,
- "mean_token_accuracy": 0.8346436023712158,
- "num_tokens": 5466398.0,
- "step": 594
- },
- {
- "epoch": 0.4521276595744681,
- "grad_norm": 1.3793858289718628,
- "learning_rate": 4.863682771232249e-06,
- "loss": 0.45989373326301575,
- "mean_token_accuracy": 0.8254791498184204,
- "num_tokens": 5482121.0,
- "step": 595
- },
- {
- "epoch": 0.45288753799392095,
- "grad_norm": 1.9812315702438354,
- "learning_rate": 4.862999797008149e-06,
- "loss": 0.5778874754905701,
- "mean_token_accuracy": 0.8041508197784424,
- "num_tokens": 5493000.0,
- "step": 596
- },
- {
- "epoch": 0.45364741641337386,
- "grad_norm": 3.3065083026885986,
- "learning_rate": 4.862315164339829e-06,
- "loss": 0.4623975157737732,
- "mean_token_accuracy": 0.8426318168640137,
- "num_tokens": 5496723.0,
- "step": 597
- },
- {
- "epoch": 0.45440729483282677,
- "grad_norm": 3.167119026184082,
- "learning_rate": 4.861628873707792e-06,
- "loss": 0.6984533667564392,
- "mean_token_accuracy": 0.772136926651001,
- "num_tokens": 5501161.0,
- "step": 598
- },
- {
- "epoch": 0.4551671732522796,
- "grad_norm": 2.2130985260009766,
- "learning_rate": 4.860940925593703e-06,
- "loss": 0.4823192059993744,
- "mean_token_accuracy": 0.8462972640991211,
- "num_tokens": 5509544.0,
- "step": 599
- },
- {
- "epoch": 0.45592705167173253,
- "grad_norm": 3.029191732406616,
- "learning_rate": 4.86025132048039e-06,
- "loss": 0.523664116859436,
- "mean_token_accuracy": 0.8229140043258667,
- "num_tokens": 5514586.0,
- "step": 600
- },
- {
- "epoch": 0.4566869300911854,
- "grad_norm": 1.6983962059020996,
- "learning_rate": 4.859560058851844e-06,
- "loss": 0.4832698106765747,
- "mean_token_accuracy": 0.8403248190879822,
- "num_tokens": 5525773.0,
- "step": 601
- },
- {
- "epoch": 0.4574468085106383,
- "grad_norm": 3.0504038333892822,
- "learning_rate": 4.8588671411932195e-06,
- "loss": 0.5158926248550415,
- "mean_token_accuracy": 0.8098392486572266,
- "num_tokens": 5529739.0,
- "step": 602
- },
- {
- "epoch": 0.4582066869300912,
- "grad_norm": 2.584836483001709,
- "learning_rate": 4.858172567990832e-06,
- "loss": 0.5724587440490723,
- "mean_token_accuracy": 0.8128519058227539,
- "num_tokens": 5535763.0,
- "step": 603
- },
- {
- "epoch": 0.45896656534954405,
- "grad_norm": 2.0514042377471924,
- "learning_rate": 4.857476339732162e-06,
- "loss": 0.4337679445743561,
- "mean_token_accuracy": 0.8405929207801819,
- "num_tokens": 5543075.0,
- "step": 604
- },
- {
- "epoch": 0.45972644376899696,
- "grad_norm": 2.2949347496032715,
- "learning_rate": 4.856778456905846e-06,
- "loss": 0.46532145142555237,
- "mean_token_accuracy": 0.8345137238502502,
- "num_tokens": 5549035.0,
- "step": 605
- },
- {
- "epoch": 0.46048632218844987,
- "grad_norm": 2.2067551612854004,
- "learning_rate": 4.856078920001689e-06,
- "loss": 0.5855136513710022,
- "mean_token_accuracy": 0.8043795228004456,
- "num_tokens": 5555545.0,
- "step": 606
- },
- {
- "epoch": 0.4612462006079027,
- "grad_norm": 2.101945161819458,
- "learning_rate": 4.855377729510648e-06,
- "loss": 0.6071814298629761,
- "mean_token_accuracy": 0.7973253130912781,
- "num_tokens": 5563615.0,
- "step": 607
- },
- {
- "epoch": 0.46200607902735563,
- "grad_norm": 2.5958821773529053,
- "learning_rate": 4.8546748859248504e-06,
- "loss": 0.6278061866760254,
- "mean_token_accuracy": 0.7864972352981567,
- "num_tokens": 5570078.0,
- "step": 608
- },
- {
- "epoch": 0.4627659574468085,
- "grad_norm": 2.778101921081543,
- "learning_rate": 4.853970389737576e-06,
- "loss": 0.35521194338798523,
- "mean_token_accuracy": 0.8752605319023132,
- "num_tokens": 5573995.0,
- "step": 609
- },
- {
- "epoch": 0.4635258358662614,
- "grad_norm": 2.600534677505493,
- "learning_rate": 4.8532642414432675e-06,
- "loss": 0.6541563868522644,
- "mean_token_accuracy": 0.7843613028526306,
- "num_tokens": 5580333.0,
- "step": 610
- },
- {
- "epoch": 0.4642857142857143,
- "grad_norm": 1.778337836265564,
- "learning_rate": 4.852556441537528e-06,
- "loss": 0.3561405837535858,
- "mean_token_accuracy": 0.8579353094100952,
- "num_tokens": 5588430.0,
- "step": 611
- },
- {
- "epoch": 0.46504559270516715,
- "grad_norm": 1.5653862953186035,
- "learning_rate": 4.851846990517118e-06,
- "loss": 0.6067906618118286,
- "mean_token_accuracy": 0.7919317483901978,
- "num_tokens": 5601700.0,
- "step": 612
- },
- {
- "epoch": 0.46580547112462006,
- "grad_norm": 1.6097723245620728,
- "learning_rate": 4.851135888879958e-06,
- "loss": 0.446664422750473,
- "mean_token_accuracy": 0.8441969156265259,
- "num_tokens": 5612063.0,
- "step": 613
- },
- {
- "epoch": 0.46656534954407297,
- "grad_norm": 1.961207389831543,
- "learning_rate": 4.850423137125126e-06,
- "loss": 0.5508605241775513,
- "mean_token_accuracy": 0.8240450024604797,
- "num_tokens": 5620245.0,
- "step": 614
- },
- {
- "epoch": 0.4673252279635258,
- "grad_norm": 2.2189085483551025,
- "learning_rate": 4.8497087357528585e-06,
- "loss": 0.6805076599121094,
- "mean_token_accuracy": 0.771978497505188,
- "num_tokens": 5629590.0,
- "step": 615
- },
- {
- "epoch": 0.46808510638297873,
- "grad_norm": 2.5176279544830322,
- "learning_rate": 4.8489926852645505e-06,
- "loss": 0.4512156844139099,
- "mean_token_accuracy": 0.836459755897522,
- "num_tokens": 5635259.0,
- "step": 616
- },
- {
- "epoch": 0.4688449848024316,
- "grad_norm": 1.5327287912368774,
- "learning_rate": 4.848274986162754e-06,
- "loss": 0.4884302616119385,
- "mean_token_accuracy": 0.8194037079811096,
- "num_tokens": 5649993.0,
- "step": 617
- },
- {
- "epoch": 0.4696048632218845,
- "grad_norm": 2.184554100036621,
- "learning_rate": 4.847555638951177e-06,
- "loss": 0.5141451358795166,
- "mean_token_accuracy": 0.8245922327041626,
- "num_tokens": 5657375.0,
- "step": 618
- },
- {
- "epoch": 0.4703647416413374,
- "grad_norm": 1.6143407821655273,
- "learning_rate": 4.846834644134686e-06,
- "loss": 0.4276641607284546,
- "mean_token_accuracy": 0.8481845855712891,
- "num_tokens": 5667941.0,
- "step": 619
- },
- {
- "epoch": 0.47112462006079026,
- "grad_norm": 2.3747270107269287,
- "learning_rate": 4.846112002219301e-06,
- "loss": 0.5608246922492981,
- "mean_token_accuracy": 0.8073011040687561,
- "num_tokens": 5675042.0,
- "step": 620
- },
- {
- "epoch": 0.47188449848024316,
- "grad_norm": 2.390404224395752,
- "learning_rate": 4.845387713712203e-06,
- "loss": 0.46616724133491516,
- "mean_token_accuracy": 0.8468319177627563,
- "num_tokens": 5680207.0,
- "step": 621
- },
- {
- "epoch": 0.4726443768996961,
- "grad_norm": 1.7245099544525146,
- "learning_rate": 4.844661779121723e-06,
- "loss": 0.5652435421943665,
- "mean_token_accuracy": 0.8010749816894531,
- "num_tokens": 5693759.0,
- "step": 622
- },
- {
- "epoch": 0.4734042553191489,
- "grad_norm": 2.6923108100891113,
- "learning_rate": 4.843934198957351e-06,
- "loss": 0.6254661679267883,
- "mean_token_accuracy": 0.8236024975776672,
- "num_tokens": 5699916.0,
- "step": 623
- },
- {
- "epoch": 0.47416413373860183,
- "grad_norm": 2.516901969909668,
- "learning_rate": 4.84320497372973e-06,
- "loss": 0.6334252953529358,
- "mean_token_accuracy": 0.7803834676742554,
- "num_tokens": 5706554.0,
- "step": 624
- },
- {
- "epoch": 0.4749240121580547,
- "grad_norm": 2.3744447231292725,
- "learning_rate": 4.842474103950658e-06,
- "loss": 0.4221811890602112,
- "mean_token_accuracy": 0.8639545440673828,
- "num_tokens": 5711756.0,
- "step": 625
- },
- {
- "epoch": 0.4756838905775076,
- "grad_norm": 3.2373476028442383,
- "learning_rate": 4.841741590133089e-06,
- "loss": 0.6637828946113586,
- "mean_token_accuracy": 0.7968347072601318,
- "num_tokens": 5716458.0,
- "step": 626
- },
- {
- "epoch": 0.4764437689969605,
- "grad_norm": 2.153888463973999,
- "learning_rate": 4.841007432791129e-06,
- "loss": 0.4877486228942871,
- "mean_token_accuracy": 0.8345249891281128,
- "num_tokens": 5723155.0,
- "step": 627
- },
- {
- "epoch": 0.47720364741641336,
- "grad_norm": 2.120497703552246,
- "learning_rate": 4.8402716324400375e-06,
- "loss": 0.37323033809661865,
- "mean_token_accuracy": 0.8734050393104553,
- "num_tokens": 5729171.0,
- "step": 628
- },
- {
- "epoch": 0.47796352583586627,
- "grad_norm": 1.5294172763824463,
- "learning_rate": 4.839534189596228e-06,
- "loss": 0.4057067334651947,
- "mean_token_accuracy": 0.8523319959640503,
- "num_tokens": 5740112.0,
- "step": 629
- },
- {
- "epoch": 0.4787234042553192,
- "grad_norm": 2.1913886070251465,
- "learning_rate": 4.8387951047772656e-06,
- "loss": 0.4835960865020752,
- "mean_token_accuracy": 0.8438145518302917,
- "num_tokens": 5746838.0,
- "step": 630
- },
- {
- "epoch": 0.479483282674772,
- "grad_norm": 1.482897162437439,
- "learning_rate": 4.838054378501868e-06,
- "loss": 0.46967992186546326,
- "mean_token_accuracy": 0.8315759897232056,
- "num_tokens": 5760428.0,
- "step": 631
- },
- {
- "epoch": 0.48024316109422494,
- "grad_norm": 1.38850998878479,
- "learning_rate": 4.837312011289907e-06,
- "loss": 0.41845446825027466,
- "mean_token_accuracy": 0.8557186126708984,
- "num_tokens": 5773437.0,
- "step": 632
- },
- {
- "epoch": 0.4810030395136778,
- "grad_norm": 3.8337457180023193,
- "learning_rate": 4.836568003662403e-06,
- "loss": 0.5102912187576294,
- "mean_token_accuracy": 0.830644965171814,
- "num_tokens": 5776367.0,
- "step": 633
- },
- {
- "epoch": 0.4817629179331307,
- "grad_norm": 1.2084007263183594,
- "learning_rate": 4.8358223561415304e-06,
- "loss": 0.3835333585739136,
- "mean_token_accuracy": 0.8639016151428223,
- "num_tokens": 5792246.0,
- "step": 634
- },
- {
- "epoch": 0.4825227963525836,
- "grad_norm": 1.939408540725708,
- "learning_rate": 4.835075069250613e-06,
- "loss": 0.4044850468635559,
- "mean_token_accuracy": 0.8488376140594482,
- "num_tokens": 5799853.0,
- "step": 635
- },
- {
- "epoch": 0.48328267477203646,
- "grad_norm": 1.345870852470398,
- "learning_rate": 4.8343261435141245e-06,
- "loss": 0.46660199761390686,
- "mean_token_accuracy": 0.8371681571006775,
- "num_tokens": 5817478.0,
- "step": 636
- },
- {
- "epoch": 0.48404255319148937,
- "grad_norm": 1.6531339883804321,
- "learning_rate": 4.833575579457691e-06,
- "loss": 0.3886989951133728,
- "mean_token_accuracy": 0.8763507008552551,
- "num_tokens": 5825739.0,
- "step": 637
- },
- {
- "epoch": 0.4848024316109423,
- "grad_norm": 1.6443969011306763,
- "learning_rate": 4.832823377608088e-06,
- "loss": 0.4070289731025696,
- "mean_token_accuracy": 0.8586630821228027,
- "num_tokens": 5837917.0,
- "step": 638
- },
- {
- "epoch": 0.48556231003039513,
- "grad_norm": 2.005136013031006,
- "learning_rate": 4.832069538493237e-06,
- "loss": 0.40616685152053833,
- "mean_token_accuracy": 0.8571510314941406,
- "num_tokens": 5845250.0,
- "step": 639
- },
- {
- "epoch": 0.48632218844984804,
- "grad_norm": 1.5244266986846924,
- "learning_rate": 4.831314062642213e-06,
- "loss": 0.49530288577079773,
- "mean_token_accuracy": 0.8328841924667358,
- "num_tokens": 5857407.0,
- "step": 640
- },
- {
- "epoch": 0.4870820668693009,
- "grad_norm": 1.9876971244812012,
- "learning_rate": 4.830556950585239e-06,
- "loss": 0.4583776593208313,
- "mean_token_accuracy": 0.8427221179008484,
- "num_tokens": 5865391.0,
- "step": 641
- },
- {
- "epoch": 0.4878419452887538,
- "grad_norm": 3.023336172103882,
- "learning_rate": 4.829798202853683e-06,
- "loss": 0.6134771108627319,
- "mean_token_accuracy": 0.7981935739517212,
- "num_tokens": 5870729.0,
- "step": 642
- },
- {
- "epoch": 0.4886018237082067,
- "grad_norm": 1.8889515399932861,
- "learning_rate": 4.829037819980065e-06,
- "loss": 0.4420135021209717,
- "mean_token_accuracy": 0.8480775356292725,
- "num_tokens": 5878982.0,
- "step": 643
- },
- {
- "epoch": 0.48936170212765956,
- "grad_norm": 2.2408435344696045,
- "learning_rate": 4.828275802498051e-06,
- "loss": 0.525706946849823,
- "mean_token_accuracy": 0.8271557092666626,
- "num_tokens": 5885097.0,
- "step": 644
- },
- {
- "epoch": 0.49012158054711247,
- "grad_norm": 1.9734224081039429,
- "learning_rate": 4.827512150942454e-06,
- "loss": 0.44246578216552734,
- "mean_token_accuracy": 0.8456668257713318,
- "num_tokens": 5893941.0,
- "step": 645
- },
- {
- "epoch": 0.4908814589665654,
- "grad_norm": 1.9618173837661743,
- "learning_rate": 4.8267468658492335e-06,
- "loss": 0.5119768381118774,
- "mean_token_accuracy": 0.8355510830879211,
- "num_tokens": 5902829.0,
- "step": 646
- },
- {
- "epoch": 0.49164133738601823,
- "grad_norm": 1.7181587219238281,
- "learning_rate": 4.825979947755496e-06,
- "loss": 0.5666520595550537,
- "mean_token_accuracy": 0.7951971888542175,
- "num_tokens": 5915212.0,
- "step": 647
- },
- {
- "epoch": 0.49240121580547114,
- "grad_norm": 3.0121164321899414,
- "learning_rate": 4.8252113971994955e-06,
- "loss": 0.628632128238678,
- "mean_token_accuracy": 0.8041050434112549,
- "num_tokens": 5921410.0,
- "step": 648
- },
- {
- "epoch": 0.493161094224924,
- "grad_norm": 2.9980475902557373,
- "learning_rate": 4.824441214720629e-06,
- "loss": 0.4507424831390381,
- "mean_token_accuracy": 0.8636263608932495,
- "num_tokens": 5925179.0,
- "step": 649
- },
- {
- "epoch": 0.4939209726443769,
- "grad_norm": 2.0096445083618164,
- "learning_rate": 4.823669400859441e-06,
- "loss": 0.602759838104248,
- "mean_token_accuracy": 0.8104915618896484,
- "num_tokens": 5934160.0,
- "step": 650
- },
- {
- "epoch": 0.4946808510638298,
- "grad_norm": 1.1186442375183105,
- "learning_rate": 4.8228959561576195e-06,
- "loss": 0.41168469190597534,
- "mean_token_accuracy": 0.8461419939994812,
- "num_tokens": 5954163.0,
- "step": 651
- },
- {
- "epoch": 0.49544072948328266,
- "grad_norm": 1.855465054512024,
- "learning_rate": 4.822120881157998e-06,
- "loss": 0.5049735307693481,
- "mean_token_accuracy": 0.8225747346878052,
- "num_tokens": 5963840.0,
- "step": 652
- },
- {
- "epoch": 0.49620060790273557,
- "grad_norm": 3.550563335418701,
- "learning_rate": 4.821344176404554e-06,
- "loss": 0.49025264382362366,
- "mean_token_accuracy": 0.8265978693962097,
- "num_tokens": 5967358.0,
- "step": 653
- },
- {
- "epoch": 0.4969604863221885,
- "grad_norm": 3.063910484313965,
- "learning_rate": 4.820565842442408e-06,
- "loss": 0.5652767419815063,
- "mean_token_accuracy": 0.811700701713562,
- "num_tokens": 5971858.0,
- "step": 654
- },
- {
- "epoch": 0.49772036474164133,
- "grad_norm": 2.4613308906555176,
- "learning_rate": 4.819785879817827e-06,
- "loss": 0.5296125411987305,
- "mean_token_accuracy": 0.8336488008499146,
- "num_tokens": 5977442.0,
- "step": 655
- },
- {
- "epoch": 0.49848024316109424,
- "grad_norm": 2.342519760131836,
- "learning_rate": 4.819004289078217e-06,
- "loss": 0.5753380060195923,
- "mean_token_accuracy": 0.7922406792640686,
- "num_tokens": 5984531.0,
- "step": 656
- },
- {
- "epoch": 0.4992401215805471,
- "grad_norm": 2.0410680770874023,
- "learning_rate": 4.818221070772129e-06,
- "loss": 0.5433275699615479,
- "mean_token_accuracy": 0.8043830990791321,
- "num_tokens": 5992642.0,
- "step": 657
- },
- {
- "epoch": 0.5,
- "grad_norm": 1.4999698400497437,
- "learning_rate": 4.8174362254492555e-06,
- "loss": 0.5248899459838867,
- "mean_token_accuracy": 0.8107168674468994,
- "num_tokens": 6005543.0,
- "step": 658
- },
- {
- "epoch": 0.5007598784194529,
- "grad_norm": 1.9494401216506958,
- "learning_rate": 4.816649753660431e-06,
- "loss": 0.41291385889053345,
- "mean_token_accuracy": 0.8650569915771484,
- "num_tokens": 6012185.0,
- "step": 659
- },
- {
- "epoch": 0.5015197568389058,
- "grad_norm": 2.7514095306396484,
- "learning_rate": 4.815861655957632e-06,
- "loss": 0.4244142770767212,
- "mean_token_accuracy": 0.8485112190246582,
- "num_tokens": 6016809.0,
- "step": 660
- },
- {
- "epoch": 0.5022796352583586,
- "grad_norm": 1.4354928731918335,
- "learning_rate": 4.815071932893976e-06,
- "loss": 0.4332060217857361,
- "mean_token_accuracy": 0.8386815786361694,
- "num_tokens": 6034795.0,
- "step": 661
- },
- {
- "epoch": 0.5030395136778115,
- "grad_norm": 1.3113417625427246,
- "learning_rate": 4.81428058502372e-06,
- "loss": 0.5415540933609009,
- "mean_token_accuracy": 0.8115285038948059,
- "num_tokens": 6053624.0,
- "step": 662
- },
- {
- "epoch": 0.5037993920972644,
- "grad_norm": 1.820868730545044,
- "learning_rate": 4.813487612902265e-06,
- "loss": 0.5360245108604431,
- "mean_token_accuracy": 0.8313555717468262,
- "num_tokens": 6063399.0,
- "step": 663
- },
- {
- "epoch": 0.5045592705167173,
- "grad_norm": 2.347001552581787,
- "learning_rate": 4.812693017086145e-06,
- "loss": 0.4926982820034027,
- "mean_token_accuracy": 0.8137006759643555,
- "num_tokens": 6070111.0,
- "step": 664
- },
- {
- "epoch": 0.5053191489361702,
- "grad_norm": 1.8830888271331787,
- "learning_rate": 4.811896798133042e-06,
- "loss": 0.5419014692306519,
- "mean_token_accuracy": 0.8027454614639282,
- "num_tokens": 6081090.0,
- "step": 665
- },
- {
- "epoch": 0.506079027355623,
- "grad_norm": 2.3258056640625,
- "learning_rate": 4.811098956601772e-06,
- "loss": 0.4629337787628174,
- "mean_token_accuracy": 0.8416580557823181,
- "num_tokens": 6087921.0,
- "step": 666
- },
- {
- "epoch": 0.506838905775076,
- "grad_norm": 1.9578291177749634,
- "learning_rate": 4.810299493052289e-06,
- "loss": 0.40305402874946594,
- "mean_token_accuracy": 0.8529061079025269,
- "num_tokens": 6100034.0,
- "step": 667
- },
- {
- "epoch": 0.5075987841945289,
- "grad_norm": 2.800635576248169,
- "learning_rate": 4.809498408045691e-06,
- "loss": 0.5087342262268066,
- "mean_token_accuracy": 0.8214689493179321,
- "num_tokens": 6104742.0,
- "step": 668
- },
- {
- "epoch": 0.5083586626139818,
- "grad_norm": 1.5318149328231812,
- "learning_rate": 4.808695702144206e-06,
- "loss": 0.4733222723007202,
- "mean_token_accuracy": 0.837577223777771,
- "num_tokens": 6117242.0,
- "step": 669
- },
- {
- "epoch": 0.5091185410334347,
- "grad_norm": 1.2368661165237427,
- "learning_rate": 4.807891375911207e-06,
- "loss": 0.3929097056388855,
- "mean_token_accuracy": 0.8331400752067566,
- "num_tokens": 6133509.0,
- "step": 670
- },
- {
- "epoch": 0.5098784194528876,
- "grad_norm": 2.4711415767669678,
- "learning_rate": 4.8070854299112e-06,
- "loss": 0.6294851303100586,
- "mean_token_accuracy": 0.7956781983375549,
- "num_tokens": 6140294.0,
- "step": 671
- },
- {
- "epoch": 0.5106382978723404,
- "grad_norm": 2.590961217880249,
- "learning_rate": 4.806277864709828e-06,
- "loss": 0.580160915851593,
- "mean_token_accuracy": 0.809589684009552,
- "num_tokens": 6145803.0,
- "step": 672
- },
- {
- "epoch": 0.5113981762917933,
- "grad_norm": 2.4653842449188232,
- "learning_rate": 4.805468680873874e-06,
- "loss": 0.5262120366096497,
- "mean_token_accuracy": 0.822458803653717,
- "num_tokens": 6151236.0,
- "step": 673
- },
- {
- "epoch": 0.5121580547112462,
- "grad_norm": 2.860720157623291,
- "learning_rate": 4.804657878971252e-06,
- "loss": 0.4007391035556793,
- "mean_token_accuracy": 0.8637382984161377,
- "num_tokens": 6155310.0,
- "step": 674
- },
- {
- "epoch": 0.5129179331306991,
- "grad_norm": 2.520282030105591,
- "learning_rate": 4.803845459571014e-06,
- "loss": 0.45798182487487793,
- "mean_token_accuracy": 0.8270114660263062,
- "num_tokens": 6160326.0,
- "step": 675
- },
- {
- "epoch": 0.513677811550152,
- "grad_norm": 2.7290921211242676,
- "learning_rate": 4.803031423243349e-06,
- "loss": 0.5745848417282104,
- "mean_token_accuracy": 0.8401234745979309,
- "num_tokens": 6165709.0,
- "step": 676
- },
- {
- "epoch": 0.5144376899696048,
- "grad_norm": 1.6678650379180908,
- "learning_rate": 4.802215770559578e-06,
- "loss": 0.5257721543312073,
- "mean_token_accuracy": 0.8241991996765137,
- "num_tokens": 6177875.0,
- "step": 677
- },
- {
- "epoch": 0.5151975683890577,
- "grad_norm": 2.1720468997955322,
- "learning_rate": 4.801398502092156e-06,
- "loss": 0.45342206954956055,
- "mean_token_accuracy": 0.8463799953460693,
- "num_tokens": 6185415.0,
- "step": 678
- },
- {
- "epoch": 0.5159574468085106,
- "grad_norm": 2.282259702682495,
- "learning_rate": 4.800579618414677e-06,
- "loss": 0.4864169955253601,
- "mean_token_accuracy": 0.8300632238388062,
- "num_tokens": 6191832.0,
- "step": 679
- },
- {
- "epoch": 0.5167173252279635,
- "grad_norm": 2.0092248916625977,
- "learning_rate": 4.799759120101861e-06,
- "loss": 0.5781463980674744,
- "mean_token_accuracy": 0.8267031908035278,
- "num_tokens": 6199440.0,
- "step": 680
- },
- {
- "epoch": 0.5174772036474165,
- "grad_norm": 1.396580696105957,
- "learning_rate": 4.798937007729568e-06,
- "loss": 0.49689239263534546,
- "mean_token_accuracy": 0.8257499933242798,
- "num_tokens": 6213840.0,
- "step": 681
- },
- {
- "epoch": 0.5182370820668692,
- "grad_norm": 1.9060769081115723,
- "learning_rate": 4.798113281874788e-06,
- "loss": 0.48969539999961853,
- "mean_token_accuracy": 0.8171790838241577,
- "num_tokens": 6223006.0,
- "step": 682
- },
- {
- "epoch": 0.5189969604863222,
- "grad_norm": 1.6255282163619995,
- "learning_rate": 4.797287943115642e-06,
- "loss": 0.5532330870628357,
- "mean_token_accuracy": 0.8173393607139587,
- "num_tokens": 6234857.0,
- "step": 683
- },
- {
- "epoch": 0.5197568389057751,
- "grad_norm": 1.6923905611038208,
- "learning_rate": 4.796460992031386e-06,
- "loss": 0.4880887269973755,
- "mean_token_accuracy": 0.834983229637146,
- "num_tokens": 6245252.0,
- "step": 684
- },
- {
- "epoch": 0.520516717325228,
- "grad_norm": 2.13161301612854,
- "learning_rate": 4.7956324292024045e-06,
- "loss": 0.5687593817710876,
- "mean_token_accuracy": 0.7996571063995361,
- "num_tokens": 6253726.0,
- "step": 685
- },
- {
- "epoch": 0.5212765957446809,
- "grad_norm": 2.509375810623169,
- "learning_rate": 4.794802255210217e-06,
- "loss": 0.5396929979324341,
- "mean_token_accuracy": 0.8007107973098755,
- "num_tokens": 6259238.0,
- "step": 686
- },
- {
- "epoch": 0.5220364741641338,
- "grad_norm": 2.393710136413574,
- "learning_rate": 4.793970470637469e-06,
- "loss": 0.6165191531181335,
- "mean_token_accuracy": 0.7891418933868408,
- "num_tokens": 6266325.0,
- "step": 687
- },
- {
- "epoch": 0.5227963525835866,
- "grad_norm": 1.511647343635559,
- "learning_rate": 4.7931370760679415e-06,
- "loss": 0.4773876965045929,
- "mean_token_accuracy": 0.8381044864654541,
- "num_tokens": 6277447.0,
- "step": 688
- },
- {
- "epoch": 0.5235562310030395,
- "grad_norm": 2.206587314605713,
- "learning_rate": 4.792302072086542e-06,
- "loss": 0.5482058525085449,
- "mean_token_accuracy": 0.8239108920097351,
- "num_tokens": 6285163.0,
- "step": 689
- },
- {
- "epoch": 0.5243161094224924,
- "grad_norm": 3.018146514892578,
- "learning_rate": 4.7914654592793065e-06,
- "loss": 0.4880615472793579,
- "mean_token_accuracy": 0.8361308574676514,
- "num_tokens": 6289386.0,
- "step": 690
- },
- {
- "epoch": 0.5250759878419453,
- "grad_norm": 1.6469231843948364,
- "learning_rate": 4.790627238233405e-06,
- "loss": 0.4164774715900421,
- "mean_token_accuracy": 0.8496290445327759,
- "num_tokens": 6298915.0,
- "step": 691
- },
- {
- "epoch": 0.5258358662613982,
- "grad_norm": 2.352505922317505,
- "learning_rate": 4.789787409537131e-06,
- "loss": 0.5366303324699402,
- "mean_token_accuracy": 0.8350417613983154,
- "num_tokens": 6306130.0,
- "step": 692
- },
- {
- "epoch": 0.526595744680851,
- "grad_norm": 1.7463021278381348,
- "learning_rate": 4.7889459737799105e-06,
- "loss": 0.4389137923717499,
- "mean_token_accuracy": 0.8463300466537476,
- "num_tokens": 6315503.0,
- "step": 693
- },
- {
- "epoch": 0.5273556231003039,
- "grad_norm": 2.257706642150879,
- "learning_rate": 4.788102931552294e-06,
- "loss": 0.5309344530105591,
- "mean_token_accuracy": 0.8164352178573608,
- "num_tokens": 6321852.0,
- "step": 694
- },
- {
- "epoch": 0.5281155015197568,
- "grad_norm": 2.392732620239258,
- "learning_rate": 4.787258283445962e-06,
- "loss": 0.3956204056739807,
- "mean_token_accuracy": 0.8671456575393677,
- "num_tokens": 6327380.0,
- "step": 695
- },
- {
- "epoch": 0.5288753799392097,
- "grad_norm": 2.210514545440674,
- "learning_rate": 4.786412030053721e-06,
- "loss": 0.4842875003814697,
- "mean_token_accuracy": 0.8508446216583252,
- "num_tokens": 6334898.0,
- "step": 696
- },
- {
- "epoch": 0.5296352583586627,
- "grad_norm": 1.8678946495056152,
- "learning_rate": 4.785564171969503e-06,
- "loss": 0.47399595379829407,
- "mean_token_accuracy": 0.8514996767044067,
- "num_tokens": 6346374.0,
- "step": 697
- },
- {
- "epoch": 0.5303951367781155,
- "grad_norm": 2.604079484939575,
- "learning_rate": 4.784714709788368e-06,
- "loss": 0.5950228571891785,
- "mean_token_accuracy": 0.7983481884002686,
- "num_tokens": 6351648.0,
- "step": 698
- },
- {
- "epoch": 0.5311550151975684,
- "grad_norm": 1.662381649017334,
- "learning_rate": 4.783863644106502e-06,
- "loss": 0.41616758704185486,
- "mean_token_accuracy": 0.8554803133010864,
- "num_tokens": 6360506.0,
- "step": 699
- },
- {
- "epoch": 0.5319148936170213,
- "grad_norm": 1.6300342082977295,
- "learning_rate": 4.783010975521216e-06,
- "loss": 0.43029269576072693,
- "mean_token_accuracy": 0.8443028926849365,
- "num_tokens": 6370675.0,
- "step": 700
- },
- {
- "epoch": 0.5326747720364742,
- "grad_norm": 1.731873869895935,
- "learning_rate": 4.782156704630944e-06,
- "loss": 0.4383814334869385,
- "mean_token_accuracy": 0.8443183898925781,
- "num_tokens": 6381803.0,
- "step": 701
- },
- {
- "epoch": 0.5334346504559271,
- "grad_norm": 3.1788413524627686,
- "learning_rate": 4.7813008320352475e-06,
- "loss": 0.32194480299949646,
- "mean_token_accuracy": 0.8870962858200073,
- "num_tokens": 6389263.0,
- "step": 702
- },
- {
- "epoch": 0.53419452887538,
- "grad_norm": 2.099513530731201,
- "learning_rate": 4.78044335833481e-06,
- "loss": 0.36962923407554626,
- "mean_token_accuracy": 0.8661133646965027,
- "num_tokens": 6395589.0,
- "step": 703
- },
- {
- "epoch": 0.5349544072948328,
- "grad_norm": 1.4859435558319092,
- "learning_rate": 4.77958428413144e-06,
- "loss": 0.4619954824447632,
- "mean_token_accuracy": 0.8438555002212524,
- "num_tokens": 6407470.0,
- "step": 704
- },
- {
- "epoch": 0.5357142857142857,
- "grad_norm": 1.2561073303222656,
- "learning_rate": 4.7787236100280685e-06,
- "loss": 0.3770977258682251,
- "mean_token_accuracy": 0.8515733480453491,
- "num_tokens": 6422888.0,
- "step": 705
- },
- {
- "epoch": 0.5364741641337386,
- "grad_norm": 1.4455817937850952,
- "learning_rate": 4.777861336628751e-06,
- "loss": 0.46481069922447205,
- "mean_token_accuracy": 0.8502002954483032,
- "num_tokens": 6441266.0,
- "step": 706
- },
- {
- "epoch": 0.5372340425531915,
- "grad_norm": 1.1387295722961426,
- "learning_rate": 4.7769974645386616e-06,
- "loss": 0.36964765191078186,
- "mean_token_accuracy": 0.8719524145126343,
- "num_tokens": 6463686.0,
- "step": 707
- },
- {
- "epoch": 0.5379939209726444,
- "grad_norm": 1.7179663181304932,
- "learning_rate": 4.776131994364102e-06,
- "loss": 0.4231719970703125,
- "mean_token_accuracy": 0.8416585922241211,
- "num_tokens": 6472956.0,
- "step": 708
- },
- {
- "epoch": 0.5387537993920972,
- "grad_norm": 1.6328502893447876,
- "learning_rate": 4.775264926712489e-06,
- "loss": 0.5836569666862488,
- "mean_token_accuracy": 0.8039724230766296,
- "num_tokens": 6485773.0,
- "step": 709
- },
- {
- "epoch": 0.5395136778115501,
- "grad_norm": 1.8515360355377197,
- "learning_rate": 4.774396262192368e-06,
- "loss": 0.5477553009986877,
- "mean_token_accuracy": 0.8136521577835083,
- "num_tokens": 6496379.0,
- "step": 710
- },
- {
- "epoch": 0.540273556231003,
- "grad_norm": 1.741858959197998,
- "learning_rate": 4.7735260014133986e-06,
- "loss": 0.4663267731666565,
- "mean_token_accuracy": 0.8473691940307617,
- "num_tokens": 6507652.0,
- "step": 711
- },
- {
- "epoch": 0.541033434650456,
- "grad_norm": 1.7516659498214722,
- "learning_rate": 4.772654144986364e-06,
- "loss": 0.374914288520813,
- "mean_token_accuracy": 0.8600220680236816,
- "num_tokens": 6519030.0,
- "step": 712
- },
- {
- "epoch": 0.5417933130699089,
- "grad_norm": 2.662343978881836,
- "learning_rate": 4.7717806935231665e-06,
- "loss": 0.4206875264644623,
- "mean_token_accuracy": 0.8544126749038696,
- "num_tokens": 6523669.0,
- "step": 713
- },
- {
- "epoch": 0.5425531914893617,
- "grad_norm": 1.4088834524154663,
- "learning_rate": 4.770905647636828e-06,
- "loss": 0.5824331045150757,
- "mean_token_accuracy": 0.7857901453971863,
- "num_tokens": 6540560.0,
- "step": 714
- },
- {
- "epoch": 0.5433130699088146,
- "grad_norm": 2.173656940460205,
- "learning_rate": 4.77002900794149e-06,
- "loss": 0.555023729801178,
- "mean_token_accuracy": 0.8067290782928467,
- "num_tokens": 6548946.0,
- "step": 715
- },
- {
- "epoch": 0.5440729483282675,
- "grad_norm": 2.121018648147583,
- "learning_rate": 4.769150775052411e-06,
- "loss": 0.559730052947998,
- "mean_token_accuracy": 0.8166372776031494,
- "num_tokens": 6556065.0,
- "step": 716
- },
- {
- "epoch": 0.5448328267477204,
- "grad_norm": 3.335866928100586,
- "learning_rate": 4.768270949585968e-06,
- "loss": 0.6442267894744873,
- "mean_token_accuracy": 0.7858607769012451,
- "num_tokens": 6560615.0,
- "step": 717
- },
- {
- "epoch": 0.5455927051671733,
- "grad_norm": 2.3813695907592773,
- "learning_rate": 4.767389532159659e-06,
- "loss": 0.4027421474456787,
- "mean_token_accuracy": 0.8635619282722473,
- "num_tokens": 6565841.0,
- "step": 718
- },
- {
- "epoch": 0.5463525835866262,
- "grad_norm": 2.0657708644866943,
- "learning_rate": 4.766506523392095e-06,
- "loss": 0.38899827003479004,
- "mean_token_accuracy": 0.8660480380058289,
- "num_tokens": 6572362.0,
- "step": 719
- },
- {
- "epoch": 0.547112462006079,
- "grad_norm": 1.093705415725708,
- "learning_rate": 4.765621923903005e-06,
- "loss": 0.45967352390289307,
- "mean_token_accuracy": 0.8338102102279663,
- "num_tokens": 6595998.0,
- "step": 720
- },
- {
- "epoch": 0.5478723404255319,
- "grad_norm": 2.942065954208374,
- "learning_rate": 4.764735734313236e-06,
- "loss": 0.42910510301589966,
- "mean_token_accuracy": 0.8406122922897339,
- "num_tokens": 6601075.0,
- "step": 721
- },
- {
- "epoch": 0.5486322188449848,
- "grad_norm": 2.049011707305908,
- "learning_rate": 4.763847955244749e-06,
- "loss": 0.5584231615066528,
- "mean_token_accuracy": 0.8171684741973877,
- "num_tokens": 6609310.0,
- "step": 722
- },
- {
- "epoch": 0.5493920972644377,
- "grad_norm": 2.485543966293335,
- "learning_rate": 4.762958587320623e-06,
- "loss": 0.5396170020103455,
- "mean_token_accuracy": 0.8158525824546814,
- "num_tokens": 6616185.0,
- "step": 723
- },
- {
- "epoch": 0.5501519756838906,
- "grad_norm": 1.87015962600708,
- "learning_rate": 4.762067631165049e-06,
- "loss": 0.49739527702331543,
- "mean_token_accuracy": 0.8303765654563904,
- "num_tokens": 6625629.0,
- "step": 724
- },
- {
- "epoch": 0.5509118541033434,
- "grad_norm": 4.239654541015625,
- "learning_rate": 4.761175087403336e-06,
- "loss": 0.6029239296913147,
- "mean_token_accuracy": 0.8123486042022705,
- "num_tokens": 6629194.0,
- "step": 725
- },
- {
- "epoch": 0.5516717325227963,
- "grad_norm": 2.0134730339050293,
- "learning_rate": 4.760280956661904e-06,
- "loss": 0.4777873754501343,
- "mean_token_accuracy": 0.8283513784408569,
- "num_tokens": 6636929.0,
- "step": 726
- },
- {
- "epoch": 0.5524316109422492,
- "grad_norm": 1.991780400276184,
- "learning_rate": 4.75938523956829e-06,
- "loss": 0.4631248116493225,
- "mean_token_accuracy": 0.8275107741355896,
- "num_tokens": 6645135.0,
- "step": 727
- },
- {
- "epoch": 0.5531914893617021,
- "grad_norm": 1.423792839050293,
- "learning_rate": 4.75848793675114e-06,
- "loss": 0.49630722403526306,
- "mean_token_accuracy": 0.8388000130653381,
- "num_tokens": 6662690.0,
- "step": 728
- },
- {
- "epoch": 0.5539513677811551,
- "grad_norm": 2.345294952392578,
- "learning_rate": 4.757589048840219e-06,
- "loss": 0.37830638885498047,
- "mean_token_accuracy": 0.8782080411911011,
- "num_tokens": 6667285.0,
- "step": 729
- },
- {
- "epoch": 0.5547112462006079,
- "grad_norm": 2.7452144622802734,
- "learning_rate": 4.756688576466398e-06,
- "loss": 0.51595538854599,
- "mean_token_accuracy": 0.8441770672798157,
- "num_tokens": 6672324.0,
- "step": 730
- },
- {
- "epoch": 0.5554711246200608,
- "grad_norm": 1.5247859954833984,
- "learning_rate": 4.755786520261666e-06,
- "loss": 0.48365193605422974,
- "mean_token_accuracy": 0.8276445269584656,
- "num_tokens": 6685296.0,
- "step": 731
- },
- {
- "epoch": 0.5562310030395137,
- "grad_norm": 1.4018276929855347,
- "learning_rate": 4.75488288085912e-06,
- "loss": 0.3876481354236603,
- "mean_token_accuracy": 0.8612343072891235,
- "num_tokens": 6697515.0,
- "step": 732
- },
- {
- "epoch": 0.5569908814589666,
- "grad_norm": 2.9570324420928955,
- "learning_rate": 4.753977658892967e-06,
- "loss": 0.5468149185180664,
- "mean_token_accuracy": 0.8054271340370178,
- "num_tokens": 6702194.0,
- "step": 733
- },
- {
- "epoch": 0.5577507598784195,
- "grad_norm": 1.9282715320587158,
- "learning_rate": 4.753070854998529e-06,
- "loss": 0.4758574962615967,
- "mean_token_accuracy": 0.8379775285720825,
- "num_tokens": 6709938.0,
- "step": 734
- },
- {
- "epoch": 0.5585106382978723,
- "grad_norm": 1.981264591217041,
- "learning_rate": 4.752162469812234e-06,
- "loss": 0.48461222648620605,
- "mean_token_accuracy": 0.833509087562561,
- "num_tokens": 6718125.0,
- "step": 735
- },
- {
- "epoch": 0.5592705167173252,
- "grad_norm": 1.1643427610397339,
- "learning_rate": 4.751252503971624e-06,
- "loss": 0.410121887922287,
- "mean_token_accuracy": 0.8221402764320374,
- "num_tokens": 6735125.0,
- "step": 736
- },
- {
- "epoch": 0.5600303951367781,
- "grad_norm": 1.786566972732544,
- "learning_rate": 4.750340958115346e-06,
- "loss": 0.5964341163635254,
- "mean_token_accuracy": 0.8038164377212524,
- "num_tokens": 6747369.0,
- "step": 737
- },
- {
- "epoch": 0.560790273556231,
- "grad_norm": 1.7256991863250732,
- "learning_rate": 4.749427832883158e-06,
- "loss": 0.48737066984176636,
- "mean_token_accuracy": 0.830894947052002,
- "num_tokens": 6758115.0,
- "step": 738
- },
- {
- "epoch": 0.5615501519756839,
- "grad_norm": 1.997747540473938,
- "learning_rate": 4.748513128915928e-06,
- "loss": 0.5238886475563049,
- "mean_token_accuracy": 0.8066858053207397,
- "num_tokens": 6766111.0,
- "step": 739
- },
- {
- "epoch": 0.5623100303951368,
- "grad_norm": 2.127016305923462,
- "learning_rate": 4.747596846855629e-06,
- "loss": 0.5045586228370667,
- "mean_token_accuracy": 0.821424126625061,
- "num_tokens": 6772893.0,
- "step": 740
- },
- {
- "epoch": 0.5630699088145896,
- "grad_norm": 1.7664796113967896,
- "learning_rate": 4.7466789873453446e-06,
- "loss": 0.42954835295677185,
- "mean_token_accuracy": 0.8533384799957275,
- "num_tokens": 6785133.0,
- "step": 741
- },
- {
- "epoch": 0.5638297872340425,
- "grad_norm": 1.4987404346466064,
- "learning_rate": 4.7457595510292615e-06,
- "loss": 0.5378558039665222,
- "mean_token_accuracy": 0.8184819221496582,
- "num_tokens": 6799563.0,
- "step": 742
- },
- {
- "epoch": 0.5645896656534954,
- "grad_norm": 1.4444655179977417,
- "learning_rate": 4.744838538552678e-06,
- "loss": 0.42193782329559326,
- "mean_token_accuracy": 0.837514340877533,
- "num_tokens": 6812470.0,
- "step": 743
- },
- {
- "epoch": 0.5653495440729484,
- "grad_norm": 3.867751121520996,
- "learning_rate": 4.7439159505619946e-06,
- "loss": 0.4457814693450928,
- "mean_token_accuracy": 0.8630104660987854,
- "num_tokens": 6815652.0,
- "step": 744
- },
- {
- "epoch": 0.5661094224924013,
- "grad_norm": 2.1250710487365723,
- "learning_rate": 4.74299178770472e-06,
- "loss": 0.5638922452926636,
- "mean_token_accuracy": 0.7969781160354614,
- "num_tokens": 6824566.0,
- "step": 745
- },
- {
- "epoch": 0.5668693009118541,
- "grad_norm": 2.547072410583496,
- "learning_rate": 4.742066050629465e-06,
- "loss": 0.5516207814216614,
- "mean_token_accuracy": 0.8160669803619385,
- "num_tokens": 6830589.0,
- "step": 746
- },
- {
- "epoch": 0.567629179331307,
- "grad_norm": 1.2975233793258667,
- "learning_rate": 4.741138739985951e-06,
- "loss": 0.3823344111442566,
- "mean_token_accuracy": 0.8668368458747864,
- "num_tokens": 6842707.0,
- "step": 747
- },
- {
- "epoch": 0.5683890577507599,
- "grad_norm": 1.3410450220108032,
- "learning_rate": 4.740209856424998e-06,
- "loss": 0.5148671269416809,
- "mean_token_accuracy": 0.8188045024871826,
- "num_tokens": 6857624.0,
- "step": 748
- },
- {
- "epoch": 0.5691489361702128,
- "grad_norm": 1.219467282295227,
- "learning_rate": 4.7392794005985324e-06,
- "loss": 0.3998957872390747,
- "mean_token_accuracy": 0.855175256729126,
- "num_tokens": 6875064.0,
- "step": 749
- },
- {
- "epoch": 0.5699088145896657,
- "grad_norm": 1.3530343770980835,
- "learning_rate": 4.738347373159585e-06,
- "loss": 0.5359633564949036,
- "mean_token_accuracy": 0.8178457021713257,
- "num_tokens": 6890911.0,
- "step": 750
- },
- {
- "epoch": 0.5706686930091185,
- "grad_norm": 2.146988868713379,
- "learning_rate": 4.737413774762287e-06,
- "loss": 0.4460008144378662,
- "mean_token_accuracy": 0.8172903060913086,
- "num_tokens": 6896959.0,
- "step": 751
- },
- {
- "epoch": 0.5714285714285714,
- "grad_norm": 1.456023097038269,
- "learning_rate": 4.736478606061876e-06,
- "loss": 0.43616920709609985,
- "mean_token_accuracy": 0.8465108871459961,
- "num_tokens": 6908904.0,
- "step": 752
- },
- {
- "epoch": 0.5721884498480243,
- "grad_norm": 2.9696967601776123,
- "learning_rate": 4.735541867714687e-06,
- "loss": 0.43464532494544983,
- "mean_token_accuracy": 0.8608652353286743,
- "num_tokens": 6913026.0,
- "step": 753
- },
- {
- "epoch": 0.5729483282674772,
- "grad_norm": 2.2990667819976807,
- "learning_rate": 4.73460356037816e-06,
- "loss": 0.6619116067886353,
- "mean_token_accuracy": 0.7821142673492432,
- "num_tokens": 6920588.0,
- "step": 754
- },
- {
- "epoch": 0.5737082066869301,
- "grad_norm": 2.054746389389038,
- "learning_rate": 4.733663684710835e-06,
- "loss": 0.5304250717163086,
- "mean_token_accuracy": 0.8265531063079834,
- "num_tokens": 6928910.0,
- "step": 755
- },
- {
- "epoch": 0.574468085106383,
- "grad_norm": 2.0050594806671143,
- "learning_rate": 4.732722241372354e-06,
- "loss": 0.6393026113510132,
- "mean_token_accuracy": 0.796819806098938,
- "num_tokens": 6940217.0,
- "step": 756
- },
- {
- "epoch": 0.5752279635258358,
- "grad_norm": 1.4285320043563843,
- "learning_rate": 4.731779231023456e-06,
- "loss": 0.5432837009429932,
- "mean_token_accuracy": 0.8104778528213501,
- "num_tokens": 6959101.0,
- "step": 757
- },
- {
- "epoch": 0.5759878419452887,
- "grad_norm": 2.3941943645477295,
- "learning_rate": 4.730834654325984e-06,
- "loss": 0.46550673246383667,
- "mean_token_accuracy": 0.8444503545761108,
- "num_tokens": 6965036.0,
- "step": 758
- },
- {
- "epoch": 0.5767477203647416,
- "grad_norm": 2.3850574493408203,
- "learning_rate": 4.729888511942877e-06,
- "loss": 0.4916389584541321,
- "mean_token_accuracy": 0.8228527307510376,
- "num_tokens": 6971184.0,
- "step": 759
- },
- {
- "epoch": 0.5775075987841946,
- "grad_norm": 1.627480149269104,
- "learning_rate": 4.728940804538176e-06,
- "loss": 0.5863215923309326,
- "mean_token_accuracy": 0.7995302677154541,
- "num_tokens": 6982569.0,
- "step": 760
- },
- {
- "epoch": 0.5782674772036475,
- "grad_norm": 1.1723195314407349,
- "learning_rate": 4.727991532777016e-06,
- "loss": 0.36908864974975586,
- "mean_token_accuracy": 0.8355655670166016,
- "num_tokens": 6998659.0,
- "step": 761
- },
- {
- "epoch": 0.5790273556231003,
- "grad_norm": 1.5324925184249878,
- "learning_rate": 4.727040697325634e-06,
- "loss": 0.557658851146698,
- "mean_token_accuracy": 0.8141458034515381,
- "num_tokens": 7012969.0,
- "step": 762
- },
- {
- "epoch": 0.5797872340425532,
- "grad_norm": 2.4106390476226807,
- "learning_rate": 4.726088298851362e-06,
- "loss": 0.5004243850708008,
- "mean_token_accuracy": 0.8376860618591309,
- "num_tokens": 7018301.0,
- "step": 763
- },
- {
- "epoch": 0.5805471124620061,
- "grad_norm": 2.2594921588897705,
- "learning_rate": 4.725134338022631e-06,
- "loss": 0.6067016124725342,
- "mean_token_accuracy": 0.8100241422653198,
- "num_tokens": 7025201.0,
- "step": 764
- },
- {
- "epoch": 0.581306990881459,
- "grad_norm": 1.4649826288223267,
- "learning_rate": 4.724178815508967e-06,
- "loss": 0.36200693249702454,
- "mean_token_accuracy": 0.8621826171875,
- "num_tokens": 7035112.0,
- "step": 765
- },
- {
- "epoch": 0.5820668693009119,
- "grad_norm": 2.3634560108184814,
- "learning_rate": 4.723221731980993e-06,
- "loss": 0.41862213611602783,
- "mean_token_accuracy": 0.8541463613510132,
- "num_tokens": 7040339.0,
- "step": 766
- },
- {
- "epoch": 0.5828267477203647,
- "grad_norm": 2.7798104286193848,
- "learning_rate": 4.722263088110426e-06,
- "loss": 0.4647108018398285,
- "mean_token_accuracy": 0.8505672216415405,
- "num_tokens": 7044880.0,
- "step": 767
- },
- {
- "epoch": 0.5835866261398176,
- "grad_norm": 2.070528507232666,
- "learning_rate": 4.721302884570079e-06,
- "loss": 0.5147565007209778,
- "mean_token_accuracy": 0.8113877773284912,
- "num_tokens": 7052433.0,
- "step": 768
- },
- {
- "epoch": 0.5843465045592705,
- "grad_norm": 2.1953284740448,
- "learning_rate": 4.720341122033862e-06,
- "loss": 0.5075466632843018,
- "mean_token_accuracy": 0.8474211096763611,
- "num_tokens": 7058686.0,
- "step": 769
- },
- {
- "epoch": 0.5851063829787234,
- "grad_norm": 1.9287755489349365,
- "learning_rate": 4.719377801176774e-06,
- "loss": 0.5382202863693237,
- "mean_token_accuracy": 0.8148090243339539,
- "num_tokens": 7067538.0,
- "step": 770
- },
- {
- "epoch": 0.5858662613981763,
- "grad_norm": 1.5574456453323364,
- "learning_rate": 4.718412922674913e-06,
- "loss": 0.43406790494918823,
- "mean_token_accuracy": 0.8477081060409546,
- "num_tokens": 7077853.0,
- "step": 771
- },
- {
- "epoch": 0.5866261398176292,
- "grad_norm": 1.5490336418151855,
- "learning_rate": 4.717446487205466e-06,
- "loss": 0.43164271116256714,
- "mean_token_accuracy": 0.8504570126533508,
- "num_tokens": 7091728.0,
- "step": 772
- },
- {
- "epoch": 0.587386018237082,
- "grad_norm": 1.6945984363555908,
- "learning_rate": 4.716478495446717e-06,
- "loss": 0.5153743624687195,
- "mean_token_accuracy": 0.8213579058647156,
- "num_tokens": 7108680.0,
- "step": 773
- },
- {
- "epoch": 0.5881458966565349,
- "grad_norm": 2.2633883953094482,
- "learning_rate": 4.715508948078037e-06,
- "loss": 0.45254790782928467,
- "mean_token_accuracy": 0.8392219543457031,
- "num_tokens": 7115546.0,
- "step": 774
- },
- {
- "epoch": 0.5889057750759878,
- "grad_norm": 1.5731090307235718,
- "learning_rate": 4.714537845779894e-06,
- "loss": 0.38678881525993347,
- "mean_token_accuracy": 0.8800252676010132,
- "num_tokens": 7126360.0,
- "step": 775
- },
- {
- "epoch": 0.5896656534954408,
- "grad_norm": 2.4873392581939697,
- "learning_rate": 4.7135651892338445e-06,
- "loss": 0.5190927386283875,
- "mean_token_accuracy": 0.8145407438278198,
- "num_tokens": 7135705.0,
- "step": 776
- },
- {
- "epoch": 0.5904255319148937,
- "grad_norm": 1.2931004762649536,
- "learning_rate": 4.712590979122534e-06,
- "loss": 0.3686544895172119,
- "mean_token_accuracy": 0.8720537424087524,
- "num_tokens": 7150688.0,
- "step": 777
- },
- {
- "epoch": 0.5911854103343465,
- "grad_norm": 1.6353671550750732,
- "learning_rate": 4.7116152161297045e-06,
- "loss": 0.49065062403678894,
- "mean_token_accuracy": 0.8203760385513306,
- "num_tokens": 7161040.0,
- "step": 778
- },
- {
- "epoch": 0.5919452887537994,
- "grad_norm": 1.2345483303070068,
- "learning_rate": 4.710637900940181e-06,
- "loss": 0.4004976451396942,
- "mean_token_accuracy": 0.8302007913589478,
- "num_tokens": 7178074.0,
- "step": 779
- },
- {
- "epoch": 0.5927051671732523,
- "grad_norm": 2.2506837844848633,
- "learning_rate": 4.7096590342398825e-06,
- "loss": 0.45142874121665955,
- "mean_token_accuracy": 0.8481036424636841,
- "num_tokens": 7184153.0,
- "step": 780
- },
- {
- "epoch": 0.5934650455927052,
- "grad_norm": 1.420479416847229,
- "learning_rate": 4.708678616715815e-06,
- "loss": 0.4802100360393524,
- "mean_token_accuracy": 0.8586992025375366,
- "num_tokens": 7202810.0,
- "step": 781
- },
- {
- "epoch": 0.5942249240121581,
- "grad_norm": 3.457632303237915,
- "learning_rate": 4.707696649056073e-06,
- "loss": 0.5265094041824341,
- "mean_token_accuracy": 0.8260114192962646,
- "num_tokens": 7206396.0,
- "step": 782
- },
- {
- "epoch": 0.5949848024316109,
- "grad_norm": 1.1592093706130981,
- "learning_rate": 4.706713131949839e-06,
- "loss": 0.3708173632621765,
- "mean_token_accuracy": 0.8476542234420776,
- "num_tokens": 7225034.0,
- "step": 783
- },
- {
- "epoch": 0.5957446808510638,
- "grad_norm": 1.6761400699615479,
- "learning_rate": 4.705728066087384e-06,
- "loss": 0.4137252867221832,
- "mean_token_accuracy": 0.8462049961090088,
- "num_tokens": 7237101.0,
- "step": 784
- },
- {
- "epoch": 0.5965045592705167,
- "grad_norm": 2.320185422897339,
- "learning_rate": 4.704741452160064e-06,
- "loss": 0.5157154202461243,
- "mean_token_accuracy": 0.8391785621643066,
- "num_tokens": 7243826.0,
- "step": 785
- },
- {
- "epoch": 0.5972644376899696,
- "grad_norm": 2.079423427581787,
- "learning_rate": 4.703753290860323e-06,
- "loss": 0.4734993278980255,
- "mean_token_accuracy": 0.8353281021118164,
- "num_tokens": 7250175.0,
- "step": 786
- },
- {
- "epoch": 0.5980243161094225,
- "grad_norm": 1.8215159177780151,
- "learning_rate": 4.702763582881692e-06,
- "loss": 0.520193338394165,
- "mean_token_accuracy": 0.844062864780426,
- "num_tokens": 7258868.0,
- "step": 787
- },
- {
- "epoch": 0.5987841945288754,
- "grad_norm": 1.3823071718215942,
- "learning_rate": 4.701772328918784e-06,
- "loss": 0.4177844822406769,
- "mean_token_accuracy": 0.8363165259361267,
- "num_tokens": 7271744.0,
- "step": 788
- },
- {
- "epoch": 0.5995440729483282,
- "grad_norm": 2.4749298095703125,
- "learning_rate": 4.700779529667301e-06,
- "loss": 0.5115069150924683,
- "mean_token_accuracy": 0.8473520278930664,
- "num_tokens": 7277040.0,
- "step": 789
- },
- {
- "epoch": 0.6003039513677811,
- "grad_norm": 1.7072296142578125,
- "learning_rate": 4.699785185824026e-06,
- "loss": 0.5265800952911377,
- "mean_token_accuracy": 0.8161447048187256,
- "num_tokens": 7288288.0,
- "step": 790
- },
- {
- "epoch": 0.601063829787234,
- "grad_norm": 1.6479384899139404,
- "learning_rate": 4.69878929808683e-06,
- "loss": 0.4445168972015381,
- "mean_token_accuracy": 0.8381255865097046,
- "num_tokens": 7298640.0,
- "step": 791
- },
- {
- "epoch": 0.601823708206687,
- "grad_norm": 1.9095896482467651,
- "learning_rate": 4.6977918671546635e-06,
- "loss": 0.5841238498687744,
- "mean_token_accuracy": 0.7971454858779907,
- "num_tokens": 7307220.0,
- "step": 792
- },
- {
- "epoch": 0.6025835866261399,
- "grad_norm": 1.9614146947860718,
- "learning_rate": 4.696792893727562e-06,
- "loss": 0.34684082865715027,
- "mean_token_accuracy": 0.8739526271820068,
- "num_tokens": 7313875.0,
- "step": 793
- },
- {
- "epoch": 0.6033434650455927,
- "grad_norm": 2.015570640563965,
- "learning_rate": 4.695792378506645e-06,
- "loss": 0.42779117822647095,
- "mean_token_accuracy": 0.8625012636184692,
- "num_tokens": 7321439.0,
- "step": 794
- },
- {
- "epoch": 0.6041033434650456,
- "grad_norm": 2.8581228256225586,
- "learning_rate": 4.694790322194111e-06,
- "loss": 0.6519991159439087,
- "mean_token_accuracy": 0.7629562616348267,
- "num_tokens": 7326916.0,
- "step": 795
- },
- {
- "epoch": 0.6048632218844985,
- "grad_norm": 2.482715368270874,
- "learning_rate": 4.693786725493242e-06,
- "loss": 0.532963216304779,
- "mean_token_accuracy": 0.832184910774231,
- "num_tokens": 7333311.0,
- "step": 796
- },
- {
- "epoch": 0.6056231003039514,
- "grad_norm": 1.6076741218566895,
- "learning_rate": 4.692781589108402e-06,
- "loss": 0.43381205201148987,
- "mean_token_accuracy": 0.8402494192123413,
- "num_tokens": 7343731.0,
- "step": 797
- },
- {
- "epoch": 0.6063829787234043,
- "grad_norm": 2.2133216857910156,
- "learning_rate": 4.691774913745033e-06,
- "loss": 0.4380851089954376,
- "mean_token_accuracy": 0.8600908517837524,
- "num_tokens": 7350224.0,
- "step": 798
- },
- {
- "epoch": 0.6071428571428571,
- "grad_norm": 2.046280860900879,
- "learning_rate": 4.690766700109659e-06,
- "loss": 0.3821919560432434,
- "mean_token_accuracy": 0.8691814541816711,
- "num_tokens": 7356717.0,
- "step": 799
- },
- {
- "epoch": 0.60790273556231,
- "grad_norm": 1.8482693433761597,
- "learning_rate": 4.689756948909884e-06,
- "loss": 0.5217651128768921,
- "mean_token_accuracy": 0.803473711013794,
- "num_tokens": 7365806.0,
- "step": 800
- },
- {
- "epoch": 0.6086626139817629,
- "grad_norm": 2.192134141921997,
- "learning_rate": 4.688745660854388e-06,
- "loss": 0.573980987071991,
- "mean_token_accuracy": 0.8198676109313965,
- "num_tokens": 7380281.0,
- "step": 801
- },
- {
- "epoch": 0.6094224924012158,
- "grad_norm": 2.363626718521118,
- "learning_rate": 4.687732836652935e-06,
- "loss": 0.5204599499702454,
- "mean_token_accuracy": 0.8373252153396606,
- "num_tokens": 7386938.0,
- "step": 802
- },
- {
- "epoch": 0.6101823708206687,
- "grad_norm": 1.9320523738861084,
- "learning_rate": 4.686718477016361e-06,
- "loss": 0.47316622734069824,
- "mean_token_accuracy": 0.830596923828125,
- "num_tokens": 7395069.0,
- "step": 803
- },
- {
- "epoch": 0.6109422492401215,
- "grad_norm": 2.6573057174682617,
- "learning_rate": 4.6857025826565845e-06,
- "loss": 0.5495861768722534,
- "mean_token_accuracy": 0.8187421560287476,
- "num_tokens": 7400563.0,
- "step": 804
- },
- {
- "epoch": 0.6117021276595744,
- "grad_norm": 2.0893123149871826,
- "learning_rate": 4.684685154286599e-06,
- "loss": 0.5362675786018372,
- "mean_token_accuracy": 0.8394701480865479,
- "num_tokens": 7406973.0,
- "step": 805
- },
- {
- "epoch": 0.6124620060790273,
- "grad_norm": 2.455130100250244,
- "learning_rate": 4.683666192620474e-06,
- "loss": 0.5405995845794678,
- "mean_token_accuracy": 0.8079100847244263,
- "num_tokens": 7412931.0,
- "step": 806
- },
- {
- "epoch": 0.6132218844984803,
- "grad_norm": 2.311915636062622,
- "learning_rate": 4.682645698373357e-06,
- "loss": 0.5395106077194214,
- "mean_token_accuracy": 0.8156260251998901,
- "num_tokens": 7419699.0,
- "step": 807
- },
- {
- "epoch": 0.6139817629179332,
- "grad_norm": 1.686838984489441,
- "learning_rate": 4.6816236722614694e-06,
- "loss": 0.6034521460533142,
- "mean_token_accuracy": 0.7855954170227051,
- "num_tokens": 7431899.0,
- "step": 808
- },
- {
- "epoch": 0.6147416413373861,
- "grad_norm": 1.682759165763855,
- "learning_rate": 4.680600115002109e-06,
- "loss": 0.48593831062316895,
- "mean_token_accuracy": 0.8229435682296753,
- "num_tokens": 7443187.0,
- "step": 809
- },
- {
- "epoch": 0.6155015197568389,
- "grad_norm": 2.064589738845825,
- "learning_rate": 4.679575027313649e-06,
- "loss": 0.5098468661308289,
- "mean_token_accuracy": 0.8234638571739197,
- "num_tokens": 7450868.0,
- "step": 810
- },
- {
- "epoch": 0.6162613981762918,
- "grad_norm": 2.2063486576080322,
- "learning_rate": 4.6785484099155324e-06,
- "loss": 0.5138497352600098,
- "mean_token_accuracy": 0.8152111172676086,
- "num_tokens": 7457176.0,
- "step": 811
- },
- {
- "epoch": 0.6170212765957447,
- "grad_norm": 1.6258726119995117,
- "learning_rate": 4.67752026352828e-06,
- "loss": 0.4064181447029114,
- "mean_token_accuracy": 0.8720619678497314,
- "num_tokens": 7466557.0,
- "step": 812
- },
- {
- "epoch": 0.6177811550151976,
- "grad_norm": 2.3309383392333984,
- "learning_rate": 4.676490588873486e-06,
- "loss": 0.5180112719535828,
- "mean_token_accuracy": 0.8233879804611206,
- "num_tokens": 7472650.0,
- "step": 813
- },
- {
- "epoch": 0.6185410334346505,
- "grad_norm": 1.4545246362686157,
- "learning_rate": 4.675459386673815e-06,
- "loss": 0.37917959690093994,
- "mean_token_accuracy": 0.8598103523254395,
- "num_tokens": 7485171.0,
- "step": 814
- },
- {
- "epoch": 0.6193009118541033,
- "grad_norm": 2.654231071472168,
- "learning_rate": 4.674426657653003e-06,
- "loss": 0.554074227809906,
- "mean_token_accuracy": 0.8026446104049683,
- "num_tokens": 7490787.0,
- "step": 815
- },
- {
- "epoch": 0.6200607902735562,
- "grad_norm": 1.5543994903564453,
- "learning_rate": 4.67339240253586e-06,
- "loss": 0.6335440278053284,
- "mean_token_accuracy": 0.783241868019104,
- "num_tokens": 7505975.0,
- "step": 816
- },
- {
- "epoch": 0.6208206686930091,
- "grad_norm": 2.079998016357422,
- "learning_rate": 4.672356622048266e-06,
- "loss": 0.5169394016265869,
- "mean_token_accuracy": 0.8088761568069458,
- "num_tokens": 7513470.0,
- "step": 817
- },
- {
- "epoch": 0.621580547112462,
- "grad_norm": 1.5971896648406982,
- "learning_rate": 4.671319316917172e-06,
- "loss": 0.44588586688041687,
- "mean_token_accuracy": 0.8518649339675903,
- "num_tokens": 7524352.0,
- "step": 818
- },
- {
- "epoch": 0.6223404255319149,
- "grad_norm": 2.477579116821289,
- "learning_rate": 4.670280487870599e-06,
- "loss": 0.5713893175125122,
- "mean_token_accuracy": 0.8116940259933472,
- "num_tokens": 7530359.0,
- "step": 819
- },
- {
- "epoch": 0.6231003039513677,
- "grad_norm": 2.066211700439453,
- "learning_rate": 4.669240135637635e-06,
- "loss": 0.5295331478118896,
- "mean_token_accuracy": 0.819536566734314,
- "num_tokens": 7536963.0,
- "step": 820
- },
- {
- "epoch": 0.6238601823708206,
- "grad_norm": 2.1217997074127197,
- "learning_rate": 4.668198260948442e-06,
- "loss": 0.6146406531333923,
- "mean_token_accuracy": 0.7932635545730591,
- "num_tokens": 7545800.0,
- "step": 821
- },
- {
- "epoch": 0.6246200607902735,
- "grad_norm": 2.0173542499542236,
- "learning_rate": 4.667154864534245e-06,
- "loss": 0.6240535974502563,
- "mean_token_accuracy": 0.7883644104003906,
- "num_tokens": 7556165.0,
- "step": 822
- },
- {
- "epoch": 0.6253799392097265,
- "grad_norm": 2.014526128768921,
- "learning_rate": 4.666109947127343e-06,
- "loss": 0.40367332100868225,
- "mean_token_accuracy": 0.8653522729873657,
- "num_tokens": 7562665.0,
- "step": 823
- },
- {
- "epoch": 0.6261398176291794,
- "grad_norm": 2.5078861713409424,
- "learning_rate": 4.665063509461098e-06,
- "loss": 0.5903617739677429,
- "mean_token_accuracy": 0.7902897596359253,
- "num_tokens": 7568922.0,
- "step": 824
- },
- {
- "epoch": 0.6268996960486323,
- "grad_norm": 2.454622745513916,
- "learning_rate": 4.664015552269938e-06,
- "loss": 0.5238361358642578,
- "mean_token_accuracy": 0.838546872138977,
- "num_tokens": 7575965.0,
- "step": 825
- },
- {
- "epoch": 0.6276595744680851,
- "grad_norm": 2.920919418334961,
- "learning_rate": 4.662966076289363e-06,
- "loss": 0.5028782486915588,
- "mean_token_accuracy": 0.8311152458190918,
- "num_tokens": 7580193.0,
- "step": 826
- },
- {
- "epoch": 0.628419452887538,
- "grad_norm": 1.545382022857666,
- "learning_rate": 4.661915082255932e-06,
- "loss": 0.4817378520965576,
- "mean_token_accuracy": 0.8373227119445801,
- "num_tokens": 7593024.0,
- "step": 827
- },
- {
- "epoch": 0.6291793313069909,
- "grad_norm": 1.5152469873428345,
- "learning_rate": 4.6608625709072766e-06,
- "loss": 0.4693033695220947,
- "mean_token_accuracy": 0.8150848150253296,
- "num_tokens": 7606459.0,
- "step": 828
- },
- {
- "epoch": 0.6299392097264438,
- "grad_norm": 2.1310224533081055,
- "learning_rate": 4.659808542982089e-06,
- "loss": 0.4653395414352417,
- "mean_token_accuracy": 0.8286294341087341,
- "num_tokens": 7613036.0,
- "step": 829
- },
- {
- "epoch": 0.6306990881458967,
- "grad_norm": 2.1949679851531982,
- "learning_rate": 4.658752999220125e-06,
- "loss": 0.3698633909225464,
- "mean_token_accuracy": 0.871590793132782,
- "num_tokens": 7618527.0,
- "step": 830
- },
- {
- "epoch": 0.6314589665653495,
- "grad_norm": 2.2770416736602783,
- "learning_rate": 4.657695940362207e-06,
- "loss": 0.5202419757843018,
- "mean_token_accuracy": 0.817577600479126,
- "num_tokens": 7624459.0,
- "step": 831
- },
- {
- "epoch": 0.6322188449848024,
- "grad_norm": 1.402042269706726,
- "learning_rate": 4.65663736715022e-06,
- "loss": 0.51531583070755,
- "mean_token_accuracy": 0.8228116631507874,
- "num_tokens": 7639371.0,
- "step": 832
- },
- {
- "epoch": 0.6329787234042553,
- "grad_norm": 3.3554883003234863,
- "learning_rate": 4.65557728032711e-06,
- "loss": 0.6771188378334045,
- "mean_token_accuracy": 0.7880028486251831,
- "num_tokens": 7643924.0,
- "step": 833
- },
- {
- "epoch": 0.6337386018237082,
- "grad_norm": 2.081040143966675,
- "learning_rate": 4.654515680636888e-06,
- "loss": 0.5712796449661255,
- "mean_token_accuracy": 0.8177868127822876,
- "num_tokens": 7651881.0,
- "step": 834
- },
- {
- "epoch": 0.6344984802431611,
- "grad_norm": 0.9128716588020325,
- "learning_rate": 4.653452568824625e-06,
- "loss": 0.3423936069011688,
- "mean_token_accuracy": 0.8782886266708374,
- "num_tokens": 7677829.0,
- "step": 835
- },
- {
- "epoch": 0.6352583586626139,
- "grad_norm": 3.49015736579895,
- "learning_rate": 4.652387945636454e-06,
- "loss": 0.34657734632492065,
- "mean_token_accuracy": 0.8770567178726196,
- "num_tokens": 7680796.0,
- "step": 836
- },
- {
- "epoch": 0.6360182370820668,
- "grad_norm": 2.026247501373291,
- "learning_rate": 4.651321811819568e-06,
- "loss": 0.5098431706428528,
- "mean_token_accuracy": 0.8216961622238159,
- "num_tokens": 7688746.0,
- "step": 837
- },
- {
- "epoch": 0.6367781155015197,
- "grad_norm": 2.444343090057373,
- "learning_rate": 4.650254168122222e-06,
- "loss": 0.5490090250968933,
- "mean_token_accuracy": 0.8092857599258423,
- "num_tokens": 7695220.0,
- "step": 838
- },
- {
- "epoch": 0.6375379939209727,
- "grad_norm": 2.0171122550964355,
- "learning_rate": 4.649185015293728e-06,
- "loss": 0.47221142053604126,
- "mean_token_accuracy": 0.8514408469200134,
- "num_tokens": 7702759.0,
- "step": 839
- },
- {
- "epoch": 0.6382978723404256,
- "grad_norm": 1.9800984859466553,
- "learning_rate": 4.64811435408446e-06,
- "loss": 0.5238803625106812,
- "mean_token_accuracy": 0.8479194641113281,
- "num_tokens": 7714017.0,
- "step": 840
- },
- {
- "epoch": 0.6390577507598785,
- "grad_norm": 3.0674357414245605,
- "learning_rate": 4.647042185245848e-06,
- "loss": 0.4668245315551758,
- "mean_token_accuracy": 0.8381714820861816,
- "num_tokens": 7717801.0,
- "step": 841
- },
- {
- "epoch": 0.6398176291793313,
- "grad_norm": 1.5672820806503296,
- "learning_rate": 4.645968509530381e-06,
- "loss": 0.4428741931915283,
- "mean_token_accuracy": 0.8416479825973511,
- "num_tokens": 7728342.0,
- "step": 842
- },
- {
- "epoch": 0.6405775075987842,
- "grad_norm": 2.3042354583740234,
- "learning_rate": 4.644893327691608e-06,
- "loss": 0.49937760829925537,
- "mean_token_accuracy": 0.827070951461792,
- "num_tokens": 7734576.0,
- "step": 843
- },
- {
- "epoch": 0.6413373860182371,
- "grad_norm": 2.057772159576416,
- "learning_rate": 4.6438166404841316e-06,
- "loss": 0.5912986993789673,
- "mean_token_accuracy": 0.805509090423584,
- "num_tokens": 7742481.0,
- "step": 844
- },
- {
- "epoch": 0.64209726443769,
- "grad_norm": 1.9688186645507812,
- "learning_rate": 4.6427384486636115e-06,
- "loss": 0.482401967048645,
- "mean_token_accuracy": 0.8358086347579956,
- "num_tokens": 7750002.0,
- "step": 845
- },
- {
- "epoch": 0.6428571428571429,
- "grad_norm": 2.6852948665618896,
- "learning_rate": 4.6416587529867665e-06,
- "loss": 0.5479315519332886,
- "mean_token_accuracy": 0.8091106414794922,
- "num_tokens": 7755578.0,
- "step": 846
- },
- {
- "epoch": 0.6436170212765957,
- "grad_norm": 2.0547337532043457,
- "learning_rate": 4.640577554211366e-06,
- "loss": 0.5327274203300476,
- "mean_token_accuracy": 0.8280376195907593,
- "num_tokens": 7763513.0,
- "step": 847
- },
- {
- "epoch": 0.6443768996960486,
- "grad_norm": 2.0328633785247803,
- "learning_rate": 4.63949485309624e-06,
- "loss": 0.4814409613609314,
- "mean_token_accuracy": 0.8527672290802002,
- "num_tokens": 7771131.0,
- "step": 848
- },
- {
- "epoch": 0.6451367781155015,
- "grad_norm": 1.5892863273620605,
- "learning_rate": 4.638410650401267e-06,
- "loss": 0.4492785334587097,
- "mean_token_accuracy": 0.846997857093811,
- "num_tokens": 7781572.0,
- "step": 849
- },
- {
- "epoch": 0.6458966565349544,
- "grad_norm": 1.8295910358428955,
- "learning_rate": 4.637324946887384e-06,
- "loss": 0.37088239192962646,
- "mean_token_accuracy": 0.8616628646850586,
- "num_tokens": 7788604.0,
- "step": 850
- },
- {
- "epoch": 0.6466565349544073,
- "grad_norm": 3.380040168762207,
- "learning_rate": 4.636237743316578e-06,
- "loss": 0.4737280607223511,
- "mean_token_accuracy": 0.855940580368042,
- "num_tokens": 7792504.0,
- "step": 851
- },
- {
- "epoch": 0.6474164133738601,
- "grad_norm": 2.8790009021759033,
- "learning_rate": 4.635149040451891e-06,
- "loss": 0.39790448546409607,
- "mean_token_accuracy": 0.8710698485374451,
- "num_tokens": 7796333.0,
- "step": 852
- },
- {
- "epoch": 0.648176291793313,
- "grad_norm": 1.914914608001709,
- "learning_rate": 4.634058839057417e-06,
- "loss": 0.2954312562942505,
- "mean_token_accuracy": 0.8880234956741333,
- "num_tokens": 7802456.0,
- "step": 853
- },
- {
- "epoch": 0.648936170212766,
- "grad_norm": 1.3709120750427246,
- "learning_rate": 4.632967139898301e-06,
- "loss": 0.43224576115608215,
- "mean_token_accuracy": 0.8446190357208252,
- "num_tokens": 7816770.0,
- "step": 854
- },
- {
- "epoch": 0.6496960486322189,
- "grad_norm": 1.6579312086105347,
- "learning_rate": 4.63187394374074e-06,
- "loss": 0.3535553514957428,
- "mean_token_accuracy": 0.8738704919815063,
- "num_tokens": 7824963.0,
- "step": 855
- },
- {
- "epoch": 0.6504559270516718,
- "grad_norm": 2.4055678844451904,
- "learning_rate": 4.63077925135198e-06,
- "loss": 0.5078744292259216,
- "mean_token_accuracy": 0.8430874347686768,
- "num_tokens": 7830962.0,
- "step": 856
- },
- {
- "epoch": 0.6512158054711246,
- "grad_norm": 2.5171499252319336,
- "learning_rate": 4.629683063500319e-06,
- "loss": 0.5172419548034668,
- "mean_token_accuracy": 0.8087141513824463,
- "num_tokens": 7836638.0,
- "step": 857
- },
- {
- "epoch": 0.6519756838905775,
- "grad_norm": 1.7588486671447754,
- "learning_rate": 4.628585380955104e-06,
- "loss": 0.5759496092796326,
- "mean_token_accuracy": 0.8043236136436462,
- "num_tokens": 7844654.0,
- "step": 858
- },
- {
- "epoch": 0.6527355623100304,
- "grad_norm": 1.5887070894241333,
- "learning_rate": 4.62748620448673e-06,
- "loss": 0.41849038004875183,
- "mean_token_accuracy": 0.8556643724441528,
- "num_tokens": 7855642.0,
- "step": 859
- },
- {
- "epoch": 0.6534954407294833,
- "grad_norm": 3.227942705154419,
- "learning_rate": 4.626385534866642e-06,
- "loss": 0.5279449224472046,
- "mean_token_accuracy": 0.8250958323478699,
- "num_tokens": 7859890.0,
- "step": 860
- },
- {
- "epoch": 0.6542553191489362,
- "grad_norm": 2.440467119216919,
- "learning_rate": 4.625283372867333e-06,
- "loss": 0.5294933319091797,
- "mean_token_accuracy": 0.8235013484954834,
- "num_tokens": 7866766.0,
- "step": 861
- },
- {
- "epoch": 0.6550151975683891,
- "grad_norm": 2.4106903076171875,
- "learning_rate": 4.624179719262342e-06,
- "loss": 0.5662813186645508,
- "mean_token_accuracy": 0.8061668872833252,
- "num_tokens": 7872809.0,
- "step": 862
- },
- {
- "epoch": 0.6557750759878419,
- "grad_norm": 3.5151145458221436,
- "learning_rate": 4.623074574826254e-06,
- "loss": 0.5471097230911255,
- "mean_token_accuracy": 0.8220691084861755,
- "num_tokens": 7876136.0,
- "step": 863
- },
- {
- "epoch": 0.6565349544072948,
- "grad_norm": 1.5319840908050537,
- "learning_rate": 4.621967940334705e-06,
- "loss": 0.4178982377052307,
- "mean_token_accuracy": 0.8517135977745056,
- "num_tokens": 7886113.0,
- "step": 864
- },
- {
- "epoch": 0.6572948328267477,
- "grad_norm": 1.63701331615448,
- "learning_rate": 4.620859816564371e-06,
- "loss": 0.4666512608528137,
- "mean_token_accuracy": 0.8223508596420288,
- "num_tokens": 7897982.0,
- "step": 865
- },
- {
- "epoch": 0.6580547112462006,
- "grad_norm": 2.1515414714813232,
- "learning_rate": 4.619750204292978e-06,
- "loss": 0.5359305143356323,
- "mean_token_accuracy": 0.8192868232727051,
- "num_tokens": 7904947.0,
- "step": 866
- },
- {
- "epoch": 0.6588145896656535,
- "grad_norm": 2.2140955924987793,
- "learning_rate": 4.618639104299294e-06,
- "loss": 0.5275633931159973,
- "mean_token_accuracy": 0.8120715618133545,
- "num_tokens": 7913913.0,
- "step": 867
- },
- {
- "epoch": 0.6595744680851063,
- "grad_norm": 1.3956893682479858,
- "learning_rate": 4.6175265173631304e-06,
- "loss": 0.4378768503665924,
- "mean_token_accuracy": 0.8479125499725342,
- "num_tokens": 7927979.0,
- "step": 868
- },
- {
- "epoch": 0.6603343465045592,
- "grad_norm": 2.98103928565979,
- "learning_rate": 4.616412444265344e-06,
- "loss": 0.42614591121673584,
- "mean_token_accuracy": 0.8595094680786133,
- "num_tokens": 7934293.0,
- "step": 869
- },
- {
- "epoch": 0.6610942249240122,
- "grad_norm": 2.554845094680786,
- "learning_rate": 4.6152968857878365e-06,
- "loss": 0.3698030412197113,
- "mean_token_accuracy": 0.8717041015625,
- "num_tokens": 7938547.0,
- "step": 870
- },
- {
- "epoch": 0.6618541033434651,
- "grad_norm": 3.0901825428009033,
- "learning_rate": 4.6141798427135475e-06,
- "loss": 0.5037497282028198,
- "mean_token_accuracy": 0.8354041576385498,
- "num_tokens": 7942829.0,
- "step": 871
- },
- {
- "epoch": 0.662613981762918,
- "grad_norm": 2.8692073822021484,
- "learning_rate": 4.6130613158264605e-06,
- "loss": 0.5418164134025574,
- "mean_token_accuracy": 0.8298909664154053,
- "num_tokens": 7949303.0,
- "step": 872
- },
- {
- "epoch": 0.6633738601823708,
- "grad_norm": 3.960404396057129,
- "learning_rate": 4.611941305911602e-06,
- "loss": 0.6284480094909668,
- "mean_token_accuracy": 0.837495744228363,
- "num_tokens": 7952486.0,
- "step": 873
- },
- {
- "epoch": 0.6641337386018237,
- "grad_norm": 2.6690115928649902,
- "learning_rate": 4.610819813755038e-06,
- "loss": 0.5214360952377319,
- "mean_token_accuracy": 0.8213508129119873,
- "num_tokens": 7957559.0,
- "step": 874
- },
- {
- "epoch": 0.6648936170212766,
- "grad_norm": 2.3376171588897705,
- "learning_rate": 4.609696840143875e-06,
- "loss": 0.46887528896331787,
- "mean_token_accuracy": 0.8438819646835327,
- "num_tokens": 7962826.0,
- "step": 875
- },
- {
- "epoch": 0.6656534954407295,
- "grad_norm": 2.2222683429718018,
- "learning_rate": 4.6085723858662575e-06,
- "loss": 0.5607719421386719,
- "mean_token_accuracy": 0.8128405809402466,
- "num_tokens": 7970131.0,
- "step": 876
- },
- {
- "epoch": 0.6664133738601824,
- "grad_norm": 2.069091558456421,
- "learning_rate": 4.607446451711372e-06,
- "loss": 0.506301760673523,
- "mean_token_accuracy": 0.8256827592849731,
- "num_tokens": 7977524.0,
- "step": 877
- },
- {
- "epoch": 0.6671732522796353,
- "grad_norm": 1.3724967241287231,
- "learning_rate": 4.606319038469443e-06,
- "loss": 0.43285101652145386,
- "mean_token_accuracy": 0.8525032997131348,
- "num_tokens": 7989174.0,
- "step": 878
- },
- {
- "epoch": 0.6679331306990881,
- "grad_norm": 2.278205156326294,
- "learning_rate": 4.605190146931731e-06,
- "loss": 0.4845905303955078,
- "mean_token_accuracy": 0.8284652829170227,
- "num_tokens": 7998524.0,
- "step": 879
- },
- {
- "epoch": 0.668693009118541,
- "grad_norm": 1.3871766328811646,
- "learning_rate": 4.604059777890537e-06,
- "loss": 0.5736679434776306,
- "mean_token_accuracy": 0.8223285675048828,
- "num_tokens": 8015776.0,
- "step": 880
- },
- {
- "epoch": 0.6694528875379939,
- "grad_norm": 1.926164984703064,
- "learning_rate": 4.602927932139197e-06,
- "loss": 0.4133230447769165,
- "mean_token_accuracy": 0.8653768301010132,
- "num_tokens": 8022979.0,
- "step": 881
- },
- {
- "epoch": 0.6702127659574468,
- "grad_norm": 2.109272003173828,
- "learning_rate": 4.601794610472083e-06,
- "loss": 0.7005600929260254,
- "mean_token_accuracy": 0.7777010202407837,
- "num_tokens": 8032618.0,
- "step": 882
- },
- {
- "epoch": 0.6709726443768997,
- "grad_norm": 2.077977418899536,
- "learning_rate": 4.6006598136846056e-06,
- "loss": 0.5278208255767822,
- "mean_token_accuracy": 0.8230358958244324,
- "num_tokens": 8040534.0,
- "step": 883
- },
- {
- "epoch": 0.6717325227963525,
- "grad_norm": 1.678581714630127,
- "learning_rate": 4.599523542573207e-06,
- "loss": 0.4955351650714874,
- "mean_token_accuracy": 0.8270003795623779,
- "num_tokens": 8052249.0,
- "step": 884
- },
- {
- "epoch": 0.6724924012158054,
- "grad_norm": 2.0751662254333496,
- "learning_rate": 4.598385797935368e-06,
- "loss": 0.5266247987747192,
- "mean_token_accuracy": 0.8263581991195679,
- "num_tokens": 8060600.0,
- "step": 885
- },
- {
- "epoch": 0.6732522796352584,
- "grad_norm": 2.418405771255493,
- "learning_rate": 4.5972465805696e-06,
- "loss": 0.4481425881385803,
- "mean_token_accuracy": 0.846164345741272,
- "num_tokens": 8066025.0,
- "step": 886
- },
- {
- "epoch": 0.6740121580547113,
- "grad_norm": 2.3936474323272705,
- "learning_rate": 4.596105891275449e-06,
- "loss": 0.4553404450416565,
- "mean_token_accuracy": 0.8412896394729614,
- "num_tokens": 8071544.0,
- "step": 887
- },
- {
- "epoch": 0.6747720364741642,
- "grad_norm": 2.2024407386779785,
- "learning_rate": 4.594963730853497e-06,
- "loss": 0.6218541860580444,
- "mean_token_accuracy": 0.7890232801437378,
- "num_tokens": 8079061.0,
- "step": 888
- },
- {
- "epoch": 0.675531914893617,
- "grad_norm": 2.51015567779541,
- "learning_rate": 4.593820100105355e-06,
- "loss": 0.5149124264717102,
- "mean_token_accuracy": 0.8241918087005615,
- "num_tokens": 8084293.0,
- "step": 889
- },
- {
- "epoch": 0.6762917933130699,
- "grad_norm": 1.8748939037322998,
- "learning_rate": 4.5926749998336665e-06,
- "loss": 0.50836181640625,
- "mean_token_accuracy": 0.8067223429679871,
- "num_tokens": 8092511.0,
- "step": 890
- },
- {
- "epoch": 0.6770516717325228,
- "grad_norm": 1.801193118095398,
- "learning_rate": 4.5915284308421075e-06,
- "loss": 0.4372861683368683,
- "mean_token_accuracy": 0.8510604500770569,
- "num_tokens": 8101174.0,
- "step": 891
- },
- {
- "epoch": 0.6778115501519757,
- "grad_norm": 2.6476457118988037,
- "learning_rate": 4.590380393935383e-06,
- "loss": 0.38700711727142334,
- "mean_token_accuracy": 0.8659796714782715,
- "num_tokens": 8105398.0,
- "step": 892
- },
- {
- "epoch": 0.6785714285714286,
- "grad_norm": 1.1147183179855347,
- "learning_rate": 4.589230889919232e-06,
- "loss": 0.38546115159988403,
- "mean_token_accuracy": 0.8570581674575806,
- "num_tokens": 8127394.0,
- "step": 893
- },
- {
- "epoch": 0.6793313069908815,
- "grad_norm": 2.908905506134033,
- "learning_rate": 4.588079919600419e-06,
- "loss": 0.5108504295349121,
- "mean_token_accuracy": 0.8121406435966492,
- "num_tokens": 8131801.0,
- "step": 894
- },
- {
- "epoch": 0.6800911854103343,
- "grad_norm": 3.1522326469421387,
- "learning_rate": 4.586927483786739e-06,
- "loss": 0.44059112668037415,
- "mean_token_accuracy": 0.8448011875152588,
- "num_tokens": 8154416.0,
- "step": 895
- },
- {
- "epoch": 0.6808510638297872,
- "grad_norm": 1.5142440795898438,
- "learning_rate": 4.585773583287017e-06,
- "loss": 0.513217568397522,
- "mean_token_accuracy": 0.8386049270629883,
- "num_tokens": 8171156.0,
- "step": 896
- },
- {
- "epoch": 0.6816109422492401,
- "grad_norm": 2.597881317138672,
- "learning_rate": 4.584618218911104e-06,
- "loss": 0.4937712550163269,
- "mean_token_accuracy": 0.8223681449890137,
- "num_tokens": 8176124.0,
- "step": 897
- },
- {
- "epoch": 0.682370820668693,
- "grad_norm": 1.8185619115829468,
- "learning_rate": 4.583461391469879e-06,
- "loss": 0.519811749458313,
- "mean_token_accuracy": 0.8169777393341064,
- "num_tokens": 8185136.0,
- "step": 898
- },
- {
- "epoch": 0.6831306990881459,
- "grad_norm": 3.2061994075775146,
- "learning_rate": 4.582303101775249e-06,
- "loss": 0.4655115008354187,
- "mean_token_accuracy": 0.8425977230072021,
- "num_tokens": 8188864.0,
- "step": 899
- },
- {
- "epoch": 0.6838905775075987,
- "grad_norm": 1.3485229015350342,
- "learning_rate": 4.581143350640146e-06,
- "loss": 0.5014470815658569,
- "mean_token_accuracy": 0.8273109197616577,
- "num_tokens": 8203460.0,
- "step": 900
- },
- {
- "epoch": 0.6846504559270516,
- "grad_norm": 1.3264713287353516,
- "learning_rate": 4.579982138878527e-06,
- "loss": 0.5073703527450562,
- "mean_token_accuracy": 0.8259357213973999,
- "num_tokens": 8219348.0,
- "step": 901
- },
- {
- "epoch": 0.6854103343465046,
- "grad_norm": 2.4436347484588623,
- "learning_rate": 4.578819467305375e-06,
- "loss": 0.47020310163497925,
- "mean_token_accuracy": 0.8567265272140503,
- "num_tokens": 8224427.0,
- "step": 902
- },
- {
- "epoch": 0.6861702127659575,
- "grad_norm": 1.921749234199524,
- "learning_rate": 4.5776553367367e-06,
- "loss": 0.622514009475708,
- "mean_token_accuracy": 0.7863982319831848,
- "num_tokens": 8233151.0,
- "step": 903
- },
- {
- "epoch": 0.6869300911854104,
- "grad_norm": 1.8815616369247437,
- "learning_rate": 4.576489747989532e-06,
- "loss": 0.4910545349121094,
- "mean_token_accuracy": 0.8147122859954834,
- "num_tokens": 8240762.0,
- "step": 904
- },
- {
- "epoch": 0.6876899696048632,
- "grad_norm": 1.2366989850997925,
- "learning_rate": 4.575322701881926e-06,
- "loss": 0.3947566747665405,
- "mean_token_accuracy": 0.873993992805481,
- "num_tokens": 8259381.0,
- "step": 905
- },
- {
- "epoch": 0.6884498480243161,
- "grad_norm": 1.5767735242843628,
- "learning_rate": 4.57415419923296e-06,
- "loss": 0.57136070728302,
- "mean_token_accuracy": 0.8028088808059692,
- "num_tokens": 8273296.0,
- "step": 906
- },
- {
- "epoch": 0.689209726443769,
- "grad_norm": 2.378675699234009,
- "learning_rate": 4.572984240862733e-06,
- "loss": 0.5894849896430969,
- "mean_token_accuracy": 0.7977708578109741,
- "num_tokens": 8280083.0,
- "step": 907
- },
- {
- "epoch": 0.6899696048632219,
- "grad_norm": 2.0401132106781006,
- "learning_rate": 4.57181282759237e-06,
- "loss": 0.5524613261222839,
- "mean_token_accuracy": 0.8138598203659058,
- "num_tokens": 8288236.0,
- "step": 908
- },
- {
- "epoch": 0.6907294832826748,
- "grad_norm": 2.293701648712158,
- "learning_rate": 4.570639960244011e-06,
- "loss": 0.5154546499252319,
- "mean_token_accuracy": 0.8234660625457764,
- "num_tokens": 8294493.0,
- "step": 909
- },
- {
- "epoch": 0.6914893617021277,
- "grad_norm": 1.9286527633666992,
- "learning_rate": 4.56946563964082e-06,
- "loss": 0.5364264845848083,
- "mean_token_accuracy": 0.8147368431091309,
- "num_tokens": 8303441.0,
- "step": 910
- },
- {
- "epoch": 0.6922492401215805,
- "grad_norm": 1.2571251392364502,
- "learning_rate": 4.5682898666069815e-06,
- "loss": 0.43535223603248596,
- "mean_token_accuracy": 0.859239935874939,
- "num_tokens": 8321548.0,
- "step": 911
- },
- {
- "epoch": 0.6930091185410334,
- "grad_norm": 1.2224860191345215,
- "learning_rate": 4.567112641967697e-06,
- "loss": 0.40205076336860657,
- "mean_token_accuracy": 0.8724711537361145,
- "num_tokens": 8335205.0,
- "step": 912
- },
- {
- "epoch": 0.6937689969604863,
- "grad_norm": 1.2064491510391235,
- "learning_rate": 4.5659339665491894e-06,
- "loss": 0.37790587544441223,
- "mean_token_accuracy": 0.8464339971542358,
- "num_tokens": 8350926.0,
- "step": 913
- },
- {
- "epoch": 0.6945288753799392,
- "grad_norm": 2.1755270957946777,
- "learning_rate": 4.5647538411786965e-06,
- "loss": 0.42034298181533813,
- "mean_token_accuracy": 0.84148108959198,
- "num_tokens": 8356739.0,
- "step": 914
- },
- {
- "epoch": 0.6952887537993921,
- "grad_norm": 1.234864592552185,
- "learning_rate": 4.563572266684478e-06,
- "loss": 0.5062938332557678,
- "mean_token_accuracy": 0.8132052421569824,
- "num_tokens": 8373660.0,
- "step": 915
- },
- {
- "epoch": 0.6960486322188449,
- "grad_norm": 2.4250621795654297,
- "learning_rate": 4.562389243895807e-06,
- "loss": 0.4907791018486023,
- "mean_token_accuracy": 0.8337979912757874,
- "num_tokens": 8378661.0,
- "step": 916
- },
- {
- "epoch": 0.6968085106382979,
- "grad_norm": 1.5018314123153687,
- "learning_rate": 4.561204773642974e-06,
- "loss": 0.41041281819343567,
- "mean_token_accuracy": 0.8569784164428711,
- "num_tokens": 8390322.0,
- "step": 917
- },
- {
- "epoch": 0.6975683890577508,
- "grad_norm": 2.797269344329834,
- "learning_rate": 4.5600188567572874e-06,
- "loss": 0.3146931529045105,
- "mean_token_accuracy": 0.8913302421569824,
- "num_tokens": 8393567.0,
- "step": 918
- },
- {
- "epoch": 0.6983282674772037,
- "grad_norm": 1.4002827405929565,
- "learning_rate": 4.558831494071069e-06,
- "loss": 0.4275597333908081,
- "mean_token_accuracy": 0.8504893779754639,
- "num_tokens": 8407119.0,
- "step": 919
- },
- {
- "epoch": 0.6990881458966566,
- "grad_norm": 1.7045831680297852,
- "learning_rate": 4.557642686417654e-06,
- "loss": 0.49593430757522583,
- "mean_token_accuracy": 0.8185091018676758,
- "num_tokens": 8417408.0,
- "step": 920
- },
- {
- "epoch": 0.6998480243161094,
- "grad_norm": 2.8818066120147705,
- "learning_rate": 4.556452434631396e-06,
- "loss": 0.637908935546875,
- "mean_token_accuracy": 0.7883946895599365,
- "num_tokens": 8422319.0,
- "step": 921
- },
- {
- "epoch": 0.7006079027355623,
- "grad_norm": 2.3587265014648438,
- "learning_rate": 4.555260739547657e-06,
- "loss": 0.38749319314956665,
- "mean_token_accuracy": 0.8774704933166504,
- "num_tokens": 8427315.0,
- "step": 922
- },
- {
- "epoch": 0.7013677811550152,
- "grad_norm": 1.6648749113082886,
- "learning_rate": 4.554067602002815e-06,
- "loss": 0.4044865369796753,
- "mean_token_accuracy": 0.8524141311645508,
- "num_tokens": 8438662.0,
- "step": 923
- },
- {
- "epoch": 0.7021276595744681,
- "grad_norm": 3.467787742614746,
- "learning_rate": 4.55287302283426e-06,
- "loss": 0.591016411781311,
- "mean_token_accuracy": 0.81184983253479,
- "num_tokens": 8442237.0,
- "step": 924
- },
- {
- "epoch": 0.702887537993921,
- "grad_norm": 2.1458635330200195,
- "learning_rate": 4.551677002880395e-06,
- "loss": 0.5017476677894592,
- "mean_token_accuracy": 0.822914183139801,
- "num_tokens": 8449494.0,
- "step": 925
- },
- {
- "epoch": 0.7036474164133738,
- "grad_norm": 2.521714448928833,
- "learning_rate": 4.550479542980632e-06,
- "loss": 0.531912088394165,
- "mean_token_accuracy": 0.8225687742233276,
- "num_tokens": 8454983.0,
- "step": 926
- },
- {
- "epoch": 0.7044072948328267,
- "grad_norm": 3.5248100757598877,
- "learning_rate": 4.549280643975394e-06,
- "loss": 0.4631815254688263,
- "mean_token_accuracy": 0.8443771600723267,
- "num_tokens": 8458504.0,
- "step": 927
- },
- {
- "epoch": 0.7051671732522796,
- "grad_norm": 2.5105819702148438,
- "learning_rate": 4.548080306706114e-06,
- "loss": 0.30487123131752014,
- "mean_token_accuracy": 0.9018767476081848,
- "num_tokens": 8462589.0,
- "step": 928
- },
- {
- "epoch": 0.7059270516717325,
- "grad_norm": 1.3367713689804077,
- "learning_rate": 4.5468785320152365e-06,
- "loss": 0.4355026185512543,
- "mean_token_accuracy": 0.8323584794998169,
- "num_tokens": 8478450.0,
- "step": 929
- },
- {
- "epoch": 0.7066869300911854,
- "grad_norm": 2.2506282329559326,
- "learning_rate": 4.545675320746212e-06,
- "loss": 0.5082957744598389,
- "mean_token_accuracy": 0.823430597782135,
- "num_tokens": 8485991.0,
- "step": 930
- },
- {
- "epoch": 0.7074468085106383,
- "grad_norm": 1.7164632081985474,
- "learning_rate": 4.544470673743502e-06,
- "loss": 0.3960164785385132,
- "mean_token_accuracy": 0.8592486381530762,
- "num_tokens": 8495217.0,
- "step": 931
- },
- {
- "epoch": 0.7082066869300911,
- "grad_norm": 1.5864969491958618,
- "learning_rate": 4.543264591852572e-06,
- "loss": 0.49114471673965454,
- "mean_token_accuracy": 0.8330780267715454,
- "num_tokens": 8508904.0,
- "step": 932
- },
- {
- "epoch": 0.708966565349544,
- "grad_norm": 2.1707003116607666,
- "learning_rate": 4.542057075919898e-06,
- "loss": 0.49895772337913513,
- "mean_token_accuracy": 0.8327431082725525,
- "num_tokens": 8515792.0,
- "step": 933
- },
- {
- "epoch": 0.709726443768997,
- "grad_norm": 1.9002083539962769,
- "learning_rate": 4.54084812679296e-06,
- "loss": 0.4548531472682953,
- "mean_token_accuracy": 0.834532618522644,
- "num_tokens": 8524006.0,
- "step": 934
- },
- {
- "epoch": 0.7104863221884499,
- "grad_norm": 1.8505141735076904,
- "learning_rate": 4.539637745320247e-06,
- "loss": 0.35716521739959717,
- "mean_token_accuracy": 0.872222900390625,
- "num_tokens": 8533647.0,
- "step": 935
- },
- {
- "epoch": 0.7112462006079028,
- "grad_norm": 2.092620849609375,
- "learning_rate": 4.53842593235125e-06,
- "loss": 0.4673694372177124,
- "mean_token_accuracy": 0.8460999131202698,
- "num_tokens": 8540734.0,
- "step": 936
- },
- {
- "epoch": 0.7120060790273556,
- "grad_norm": 2.689514636993408,
- "learning_rate": 4.537212688736466e-06,
- "loss": 0.45461273193359375,
- "mean_token_accuracy": 0.8450704216957092,
- "num_tokens": 8544948.0,
- "step": 937
- },
- {
- "epoch": 0.7127659574468085,
- "grad_norm": 2.4507734775543213,
- "learning_rate": 4.535998015327396e-06,
- "loss": 0.4571906626224518,
- "mean_token_accuracy": 0.8429360389709473,
- "num_tokens": 8550445.0,
- "step": 938
- },
- {
- "epoch": 0.7135258358662614,
- "grad_norm": 1.8960013389587402,
- "learning_rate": 4.534781912976546e-06,
- "loss": 0.4461391568183899,
- "mean_token_accuracy": 0.8487973213195801,
- "num_tokens": 8557630.0,
- "step": 939
- },
- {
- "epoch": 0.7142857142857143,
- "grad_norm": 1.602611780166626,
- "learning_rate": 4.533564382537421e-06,
- "loss": 0.5277102589607239,
- "mean_token_accuracy": 0.8330916166305542,
- "num_tokens": 8570397.0,
- "step": 940
- },
- {
- "epoch": 0.7150455927051672,
- "grad_norm": 1.8936395645141602,
- "learning_rate": 4.532345424864533e-06,
- "loss": 0.38619571924209595,
- "mean_token_accuracy": 0.8514572381973267,
- "num_tokens": 8582673.0,
- "step": 941
- },
- {
- "epoch": 0.71580547112462,
- "grad_norm": 1.3898619413375854,
- "learning_rate": 4.531125040813392e-06,
- "loss": 0.4825032949447632,
- "mean_token_accuracy": 0.833012580871582,
- "num_tokens": 8597239.0,
- "step": 942
- },
- {
- "epoch": 0.7165653495440729,
- "grad_norm": 2.128230571746826,
- "learning_rate": 4.529903231240511e-06,
- "loss": 0.4862118065357208,
- "mean_token_accuracy": 0.8210917711257935,
- "num_tokens": 8605877.0,
- "step": 943
- },
- {
- "epoch": 0.7173252279635258,
- "grad_norm": 1.6552259922027588,
- "learning_rate": 4.528679997003403e-06,
- "loss": 0.5092059373855591,
- "mean_token_accuracy": 0.8247389793395996,
- "num_tokens": 8617060.0,
- "step": 944
- },
- {
- "epoch": 0.7180851063829787,
- "grad_norm": 2.1174771785736084,
- "learning_rate": 4.52745533896058e-06,
- "loss": 0.39110174775123596,
- "mean_token_accuracy": 0.8672944903373718,
- "num_tokens": 8623306.0,
- "step": 945
- },
- {
- "epoch": 0.7188449848024316,
- "grad_norm": 2.8648383617401123,
- "learning_rate": 4.526229257971556e-06,
- "loss": 0.49864327907562256,
- "mean_token_accuracy": 0.8305130004882812,
- "num_tokens": 8627466.0,
- "step": 946
- },
- {
- "epoch": 0.7196048632218845,
- "grad_norm": 2.155514717102051,
- "learning_rate": 4.52500175489684e-06,
- "loss": 0.5070191025733948,
- "mean_token_accuracy": 0.8311188817024231,
- "num_tokens": 8634759.0,
- "step": 947
- },
- {
- "epoch": 0.7203647416413373,
- "grad_norm": 1.8432683944702148,
- "learning_rate": 4.523772830597942e-06,
- "loss": 0.5569252371788025,
- "mean_token_accuracy": 0.8070821762084961,
- "num_tokens": 8644160.0,
- "step": 948
- },
- {
- "epoch": 0.7211246200607903,
- "grad_norm": 2.8912241458892822,
- "learning_rate": 4.522542485937369e-06,
- "loss": 0.4799427390098572,
- "mean_token_accuracy": 0.8443552851676941,
- "num_tokens": 8648377.0,
- "step": 949
- },
- {
- "epoch": 0.7218844984802432,
- "grad_norm": 3.3449625968933105,
- "learning_rate": 4.521310721778622e-06,
- "loss": 0.44043463468551636,
- "mean_token_accuracy": 0.8521315455436707,
- "num_tokens": 8651846.0,
- "step": 950
- },
- {
- "epoch": 0.7226443768996961,
- "grad_norm": 1.4127917289733887,
- "learning_rate": 4.520077538986203e-06,
- "loss": 0.4700999855995178,
- "mean_token_accuracy": 0.8377952575683594,
- "num_tokens": 8665199.0,
- "step": 951
- },
- {
- "epoch": 0.723404255319149,
- "grad_norm": 2.1607301235198975,
- "learning_rate": 4.518842938425606e-06,
- "loss": 0.4374256730079651,
- "mean_token_accuracy": 0.8448896408081055,
- "num_tokens": 8672158.0,
- "step": 952
- },
- {
- "epoch": 0.7241641337386018,
- "grad_norm": 1.3442779779434204,
- "learning_rate": 4.51760692096332e-06,
- "loss": 0.38948923349380493,
- "mean_token_accuracy": 0.8598923683166504,
- "num_tokens": 8684532.0,
- "step": 953
- },
- {
- "epoch": 0.7249240121580547,
- "grad_norm": 2.0003178119659424,
- "learning_rate": 4.516369487466832e-06,
- "loss": 0.3797217011451721,
- "mean_token_accuracy": 0.8652102947235107,
- "num_tokens": 8691460.0,
- "step": 954
- },
- {
- "epoch": 0.7256838905775076,
- "grad_norm": 1.8196535110473633,
- "learning_rate": 4.5151306388046175e-06,
- "loss": 0.5676811933517456,
- "mean_token_accuracy": 0.818500816822052,
- "num_tokens": 8701624.0,
- "step": 955
- },
- {
- "epoch": 0.7264437689969605,
- "grad_norm": 2.1962296962738037,
- "learning_rate": 4.513890375846152e-06,
- "loss": 0.45399484038352966,
- "mean_token_accuracy": 0.8463879227638245,
- "num_tokens": 8707410.0,
- "step": 956
- },
- {
- "epoch": 0.7272036474164134,
- "grad_norm": 1.8798872232437134,
- "learning_rate": 4.512648699461897e-06,
- "loss": 0.5679811239242554,
- "mean_token_accuracy": 0.8089900016784668,
- "num_tokens": 8715630.0,
- "step": 957
- },
- {
- "epoch": 0.7279635258358662,
- "grad_norm": 2.3540258407592773,
- "learning_rate": 4.511405610523309e-06,
- "loss": 0.5282865762710571,
- "mean_token_accuracy": 0.8196114301681519,
- "num_tokens": 8721934.0,
- "step": 958
- },
- {
- "epoch": 0.7287234042553191,
- "grad_norm": 2.5630908012390137,
- "learning_rate": 4.510161109902837e-06,
- "loss": 0.39442378282546997,
- "mean_token_accuracy": 0.8400980830192566,
- "num_tokens": 8726511.0,
- "step": 959
- },
- {
- "epoch": 0.729483282674772,
- "grad_norm": 1.9829226732254028,
- "learning_rate": 4.508915198473919e-06,
- "loss": 0.4611976742744446,
- "mean_token_accuracy": 0.8439624309539795,
- "num_tokens": 8733460.0,
- "step": 960
- },
- {
- "epoch": 0.7302431610942249,
- "grad_norm": 3.0291950702667236,
- "learning_rate": 4.507667877110982e-06,
- "loss": 0.5158340930938721,
- "mean_token_accuracy": 0.8300060033798218,
- "num_tokens": 8737629.0,
- "step": 961
- },
- {
- "epoch": 0.7310030395136778,
- "grad_norm": 1.9208252429962158,
- "learning_rate": 4.506419146689445e-06,
- "loss": 0.3807099163532257,
- "mean_token_accuracy": 0.871469259262085,
- "num_tokens": 8744615.0,
- "step": 962
- },
- {
- "epoch": 0.7317629179331308,
- "grad_norm": 3.051565408706665,
- "learning_rate": 4.505169008085717e-06,
- "loss": 0.38461726903915405,
- "mean_token_accuracy": 0.874465823173523,
- "num_tokens": 8748154.0,
- "step": 963
- },
- {
- "epoch": 0.7325227963525835,
- "grad_norm": 1.375466227531433,
- "learning_rate": 4.503917462177192e-06,
- "loss": 0.42490679025650024,
- "mean_token_accuracy": 0.8457326889038086,
- "num_tokens": 8760965.0,
- "step": 964
- },
- {
- "epoch": 0.7332826747720365,
- "grad_norm": 2.216681957244873,
- "learning_rate": 4.5026645098422515e-06,
- "loss": 0.43149900436401367,
- "mean_token_accuracy": 0.8527278900146484,
- "num_tokens": 8766996.0,
- "step": 965
- },
- {
- "epoch": 0.7340425531914894,
- "grad_norm": 1.9422595500946045,
- "learning_rate": 4.5014101519602684e-06,
- "loss": 0.4964504539966583,
- "mean_token_accuracy": 0.8137556314468384,
- "num_tokens": 8774411.0,
- "step": 966
- },
- {
- "epoch": 0.7348024316109423,
- "grad_norm": 2.058887004852295,
- "learning_rate": 4.500154389411598e-06,
- "loss": 0.4977570176124573,
- "mean_token_accuracy": 0.8254626989364624,
- "num_tokens": 8782220.0,
- "step": 967
- },
- {
- "epoch": 0.7355623100303952,
- "grad_norm": 2.9977786540985107,
- "learning_rate": 4.498897223077582e-06,
- "loss": 0.4061415195465088,
- "mean_token_accuracy": 0.8752427101135254,
- "num_tokens": 8786120.0,
- "step": 968
- },
- {
- "epoch": 0.736322188449848,
- "grad_norm": 2.2636303901672363,
- "learning_rate": 4.49763865384055e-06,
- "loss": 0.5062161087989807,
- "mean_token_accuracy": 0.8171653747558594,
- "num_tokens": 8792459.0,
- "step": 969
- },
- {
- "epoch": 0.7370820668693009,
- "grad_norm": 1.8850842714309692,
- "learning_rate": 4.496378682583813e-06,
- "loss": 0.5014280676841736,
- "mean_token_accuracy": 0.8547511100769043,
- "num_tokens": 8800675.0,
- "step": 970
- },
- {
- "epoch": 0.7378419452887538,
- "grad_norm": 1.191985011100769,
- "learning_rate": 4.495117310191667e-06,
- "loss": 0.4713883101940155,
- "mean_token_accuracy": 0.8213596343994141,
- "num_tokens": 8820740.0,
- "step": 971
- },
- {
- "epoch": 0.7386018237082067,
- "grad_norm": 1.823000192642212,
- "learning_rate": 4.493854537549393e-06,
- "loss": 0.46332645416259766,
- "mean_token_accuracy": 0.8359860777854919,
- "num_tokens": 8828884.0,
- "step": 972
- },
- {
- "epoch": 0.7393617021276596,
- "grad_norm": 2.590446949005127,
- "learning_rate": 4.492590365543253e-06,
- "loss": 0.49074703454971313,
- "mean_token_accuracy": 0.8433758020401001,
- "num_tokens": 8833859.0,
- "step": 973
- },
- {
- "epoch": 0.7401215805471124,
- "grad_norm": 2.2762670516967773,
- "learning_rate": 4.491324795060491e-06,
- "loss": 0.39465656876564026,
- "mean_token_accuracy": 0.8734766244888306,
- "num_tokens": 8839350.0,
- "step": 974
- },
- {
- "epoch": 0.7408814589665653,
- "grad_norm": 2.698725461959839,
- "learning_rate": 4.490057826989333e-06,
- "loss": 0.5552085041999817,
- "mean_token_accuracy": 0.8132266998291016,
- "num_tokens": 8844373.0,
- "step": 975
- },
- {
- "epoch": 0.7416413373860182,
- "grad_norm": 2.704606294631958,
- "learning_rate": 4.488789462218988e-06,
- "loss": 0.3447791635990143,
- "mean_token_accuracy": 0.8736170530319214,
- "num_tokens": 8848236.0,
- "step": 976
- },
- {
- "epoch": 0.7424012158054711,
- "grad_norm": 3.1260716915130615,
- "learning_rate": 4.487519701639641e-06,
- "loss": 0.5945233702659607,
- "mean_token_accuracy": 0.7997599840164185,
- "num_tokens": 8852935.0,
- "step": 977
- },
- {
- "epoch": 0.743161094224924,
- "grad_norm": 1.6895452737808228,
- "learning_rate": 4.486248546142459e-06,
- "loss": 0.4823892116546631,
- "mean_token_accuracy": 0.8279662132263184,
- "num_tokens": 8861743.0,
- "step": 978
- },
- {
- "epoch": 0.743920972644377,
- "grad_norm": 1.9161452054977417,
- "learning_rate": 4.4849759966195885e-06,
- "loss": 0.5266581773757935,
- "mean_token_accuracy": 0.8218623399734497,
- "num_tokens": 8870601.0,
- "step": 979
- },
- {
- "epoch": 0.7446808510638298,
- "grad_norm": 1.6894301176071167,
- "learning_rate": 4.483702053964154e-06,
- "loss": 0.4186219573020935,
- "mean_token_accuracy": 0.8471781015396118,
- "num_tokens": 8885617.0,
- "step": 980
- },
- {
- "epoch": 0.7454407294832827,
- "grad_norm": 1.6319992542266846,
- "learning_rate": 4.482426719070258e-06,
- "loss": 0.541317880153656,
- "mean_token_accuracy": 0.8216162323951721,
- "num_tokens": 8897595.0,
- "step": 981
- },
- {
- "epoch": 0.7462006079027356,
- "grad_norm": 5.102413177490234,
- "learning_rate": 4.4811499928329775e-06,
- "loss": 0.3928517699241638,
- "mean_token_accuracy": 0.858033299446106,
- "num_tokens": 8901682.0,
- "step": 982
- },
- {
- "epoch": 0.7469604863221885,
- "grad_norm": 2.213860273361206,
- "learning_rate": 4.479871876148368e-06,
- "loss": 0.4276347756385803,
- "mean_token_accuracy": 0.8529798984527588,
- "num_tokens": 8908088.0,
- "step": 983
- },
- {
- "epoch": 0.7477203647416414,
- "grad_norm": 1.2180038690567017,
- "learning_rate": 4.478592369913464e-06,
- "loss": 0.3941590189933777,
- "mean_token_accuracy": 0.8608149290084839,
- "num_tokens": 8925876.0,
- "step": 984
- },
- {
- "epoch": 0.7484802431610942,
- "grad_norm": 2.849802255630493,
- "learning_rate": 4.477311475026271e-06,
- "loss": 0.42190325260162354,
- "mean_token_accuracy": 0.860505223274231,
- "num_tokens": 8930190.0,
- "step": 985
- },
- {
- "epoch": 0.7492401215805471,
- "grad_norm": 1.704128384590149,
- "learning_rate": 4.476029192385769e-06,
- "loss": 0.4786282777786255,
- "mean_token_accuracy": 0.8302322626113892,
- "num_tokens": 8938340.0,
- "step": 986
- },
- {
- "epoch": 0.75,
- "grad_norm": 2.06322979927063,
- "learning_rate": 4.474745522891915e-06,
- "loss": 0.4648786187171936,
- "mean_token_accuracy": 0.8366481065750122,
- "num_tokens": 8944633.0,
- "step": 987
- },
- {
- "epoch": 0.7507598784194529,
- "grad_norm": 2.0745396614074707,
- "learning_rate": 4.473460467445637e-06,
- "loss": 0.5744885206222534,
- "mean_token_accuracy": 0.8357284069061279,
- "num_tokens": 8954457.0,
- "step": 988
- },
- {
- "epoch": 0.7515197568389058,
- "grad_norm": 1.9281407594680786,
- "learning_rate": 4.472174026948836e-06,
- "loss": 0.528974175453186,
- "mean_token_accuracy": 0.8083580732345581,
- "num_tokens": 8962701.0,
- "step": 989
- },
- {
- "epoch": 0.7522796352583586,
- "grad_norm": 3.012381076812744,
- "learning_rate": 4.470886202304385e-06,
- "loss": 0.48754751682281494,
- "mean_token_accuracy": 0.8368391990661621,
- "num_tokens": 8967272.0,
- "step": 990
- },
- {
- "epoch": 0.7530395136778115,
- "grad_norm": 1.691826581954956,
- "learning_rate": 4.469596994416131e-06,
- "loss": 0.484740674495697,
- "mean_token_accuracy": 0.8500643968582153,
- "num_tokens": 8976615.0,
- "step": 991
- },
- {
- "epoch": 0.7537993920972644,
- "grad_norm": 2.4961965084075928,
- "learning_rate": 4.468306404188887e-06,
- "loss": 0.50777268409729,
- "mean_token_accuracy": 0.8168395757675171,
- "num_tokens": 8983235.0,
- "step": 992
- },
- {
- "epoch": 0.7545592705167173,
- "grad_norm": 1.512007713317871,
- "learning_rate": 4.467014432528441e-06,
- "loss": 0.4583340287208557,
- "mean_token_accuracy": 0.8465162515640259,
- "num_tokens": 8993815.0,
- "step": 993
- },
- {
- "epoch": 0.7553191489361702,
- "grad_norm": 1.9362257719039917,
- "learning_rate": 4.465721080341547e-06,
- "loss": 0.6027892827987671,
- "mean_token_accuracy": 0.8052380084991455,
- "num_tokens": 9002697.0,
- "step": 994
- },
- {
- "epoch": 0.756079027355623,
- "grad_norm": 2.473632335662842,
- "learning_rate": 4.4644263485359316e-06,
- "loss": 0.5394320487976074,
- "mean_token_accuracy": 0.834665834903717,
- "num_tokens": 9007428.0,
- "step": 995
- },
- {
- "epoch": 0.756838905775076,
- "grad_norm": 2.2527434825897217,
- "learning_rate": 4.463130238020284e-06,
- "loss": 0.5485198497772217,
- "mean_token_accuracy": 0.8090173006057739,
- "num_tokens": 9013570.0,
- "step": 996
- },
- {
- "epoch": 0.7575987841945289,
- "grad_norm": 1.4130940437316895,
- "learning_rate": 4.4618327497042676e-06,
- "loss": 0.37994423508644104,
- "mean_token_accuracy": 0.8625167012214661,
- "num_tokens": 9025485.0,
- "step": 997
- },
- {
- "epoch": 0.7583586626139818,
- "grad_norm": 2.685115098953247,
- "learning_rate": 4.460533884498509e-06,
- "loss": 0.447973370552063,
- "mean_token_accuracy": 0.8564165234565735,
- "num_tokens": 9030355.0,
- "step": 998
- },
- {
- "epoch": 0.7591185410334347,
- "grad_norm": 3.2743139266967773,
- "learning_rate": 4.4592336433146e-06,
- "loss": 0.45275989174842834,
- "mean_token_accuracy": 0.8462578058242798,
- "num_tokens": 9034406.0,
- "step": 999
- },
- {
- "epoch": 0.7598784194528876,
- "grad_norm": 1.9383049011230469,
- "learning_rate": 4.457932027065102e-06,
- "loss": 0.5387729406356812,
- "mean_token_accuracy": 0.8357330560684204,
- "num_tokens": 9041502.0,
- "step": 1000
- }
- ],
- "logging_steps": 1.0,
- "max_steps": 3948,
- "num_input_tokens_seen": 0,
- "num_train_epochs": 3,
- "save_steps": 1000,
- "stateful_callbacks": {
- "TrainerControl": {
- "args": {
- "should_epoch_stop": false,
- "should_evaluate": false,
- "should_log": false,
- "should_save": true,
- "should_training_stop": false
- },
- "attributes": {}
- }
- },
- "total_flos": 9.855721706985882e+16,
- "train_batch_size": 1,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin
deleted file mode 100644
index 2fc4f538d721f958cdceda5408f2f4e1a35f4210..0000000000000000000000000000000000000000
--- a/checkpoint-1000/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021
-size 6225
diff --git a/checkpoint-2000/chat_template.jinja b/checkpoint-2000/chat_template.jinja
deleted file mode 100644
index 70adff8a08fb31e0636f618564838d4bf3c05286..0000000000000000000000000000000000000000
--- a/checkpoint-2000/chat_template.jinja
+++ /dev/null
@@ -1,61 +0,0 @@
-{%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0].role == 'system' %}
- {{- messages[0].content + '\n\n' }}
- {%- endif %}
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
-{%- else %}
- {%- if messages[0].role == 'system' %}
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
- {%- endif %}
-{%- endif %}
-{%- for message in messages %}
- {%- if message.content is string %}
- {%- set content = message.content %}
- {%- else %}
- {%- set content = '' %}
- {%- endif %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
- {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- if message.tool_calls %}
- {%- for tool_call in message.tool_calls %}
- {%- if (loop.first and content) or (not loop.first) %}
- {{- '\n' }}
- {%- endif %}
- {%- if tool_call.function %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {%- if tool_call.arguments is string %}
- {{- tool_call.arguments }}
- {%- else %}
- {{- tool_call.arguments | tojson }}
- {%- endif %}
- {{- '}\n' }}
- {%- endfor %}
- {%- endif %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n\n' }}
- {{- content }}
- {{- '\n' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
-{%- endif %}
\ No newline at end of file
diff --git a/checkpoint-2000/config.json b/checkpoint-2000/config.json
deleted file mode 100644
index c351e5fb52f50ea6e07b40981aef81c80f9df7e4..0000000000000000000000000000000000000000
--- a/checkpoint-2000/config.json
+++ /dev/null
@@ -1,71 +0,0 @@
-{
- "architectures": [
- "Qwen3ForCausalLM"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": null,
- "dtype": "float32",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2560,
- "initializer_range": 0.02,
- "intermediate_size": 9728,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 262144,
- "max_window_layers": 36,
- "model_type": "qwen3",
- "num_attention_heads": 32,
- "num_hidden_layers": 36,
- "num_key_value_heads": 8,
- "pad_token_id": 151662,
- "rms_norm_eps": 1e-06,
- "rope_parameters": {
- "rope_theta": 5000000,
- "rope_type": "default"
- },
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "5.5.3",
- "use_cache": false,
- "use_sliding_window": false,
- "vocab_size": 151936
-}
diff --git a/checkpoint-2000/generation_config.json b/checkpoint-2000/generation_config.json
deleted file mode 100644
index 2104b83493c2833855e8fe32a7a784805ab5c2ee..0000000000000000000000000000000000000000
--- a/checkpoint-2000/generation_config.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
- "do_sample": true,
- "eos_token_id": [
- 151645,
- 151643
- ],
- "pad_token_id": 151662,
- "temperature": 0.7,
- "top_k": 20,
- "top_p": 0.8,
- "transformers_version": "5.5.3"
-}
diff --git a/checkpoint-2000/model.safetensors b/checkpoint-2000/model.safetensors
deleted file mode 100644
index 7d26c4ee9aa6752602ff6db19d02edfff6e062f7..0000000000000000000000000000000000000000
--- a/checkpoint-2000/model.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1b1ce241be74f81ade1793d7d1184e1cf7ce2e9afe46f5dd9418012bd1861b43
-size 17645743048
diff --git a/checkpoint-2000/optimizer.bin b/checkpoint-2000/optimizer.bin
deleted file mode 100644
index 9bf26616282816435a39edb78ec22ebe2461696f..0000000000000000000000000000000000000000
--- a/checkpoint-2000/optimizer.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:07e07657f743306d7736d8218c799dfc731283d7dedfca7eb48d4dcc64c64623
-size 32180124005
diff --git a/checkpoint-2000/pytorch_model_fsdp.bin b/checkpoint-2000/pytorch_model_fsdp.bin
deleted file mode 100644
index 675400f377bfee7718a7693c8e10f410f7ec7242..0000000000000000000000000000000000000000
--- a/checkpoint-2000/pytorch_model_fsdp.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:27df8f98b77baf9afbd9bdac0a9ff6cc9e53f4d44310a5d8c665d45656911b2e
-size 17645897996
diff --git a/checkpoint-2000/rng_state_0.pth b/checkpoint-2000/rng_state_0.pth
deleted file mode 100644
index 870021e3fa5ac35c2f711adf0c93a556ab4842da..0000000000000000000000000000000000000000
--- a/checkpoint-2000/rng_state_0.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:95e5fc2074c0df31522a514f862c86cb00d71c946a7f15cc9ec0e53a69fb28a7
-size 14917
diff --git a/checkpoint-2000/rng_state_1.pth b/checkpoint-2000/rng_state_1.pth
deleted file mode 100644
index 21f20da5eb1da017f08aaa88bd19cf24d40e3fbf..0000000000000000000000000000000000000000
--- a/checkpoint-2000/rng_state_1.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0e7153eae67b6c9232a41bc996a2bf5b83229b8c7230d61911ac0fd40e64154e
-size 14917
diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt
deleted file mode 100644
index 19b70f1f9806f4fbfc99dc7c3b253116b6abd505..0000000000000000000000000000000000000000
--- a/checkpoint-2000/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7c70c34042f727a1ef06eb662d77f90fe87f01cf21415dce97c8cb4c779b5625
-size 1465
diff --git a/checkpoint-2000/tokenizer.json b/checkpoint-2000/tokenizer.json
deleted file mode 100644
index c7afbed2efcdf019f88ab0572ec29d3bf595dfe2..0000000000000000000000000000000000000000
--- a/checkpoint-2000/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
-size 11422650
diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-2000/tokenizer_config.json
deleted file mode 100644
index 4e47e52c4e7f0b2bcf2103a878790216f3f6436d..0000000000000000000000000000000000000000
--- a/checkpoint-2000/tokenizer_config.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
- "add_prefix_space": false,
- "backend": "tokenizers",
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "is_local": false,
- "model_max_length": 1010000,
- "pad_token": "<|fim_pad|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
-}
diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json
deleted file mode 100644
index 18d493bffa53c88bc213582a98da8699d575acdc..0000000000000000000000000000000000000000
--- a/checkpoint-2000/trainer_state.json
+++ /dev/null
@@ -1,18034 +0,0 @@
-{
- "best_global_step": null,
- "best_metric": null,
- "best_model_checkpoint": null,
- "epoch": 1.5197568389057752,
- "eval_steps": 500,
- "global_step": 2000,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0007598784194528875,
- "grad_norm": 11.767926216125488,
- "learning_rate": 0.0,
- "loss": 0.7937269806861877,
- "mean_token_accuracy": 0.7822731137275696,
- "num_tokens": 10507.0,
- "step": 1
- },
- {
- "epoch": 0.001519756838905775,
- "grad_norm": 14.9199800491333,
- "learning_rate": 2.5252525252525256e-08,
- "loss": 0.7665389776229858,
- "mean_token_accuracy": 0.8342233300209045,
- "num_tokens": 14806.0,
- "step": 2
- },
- {
- "epoch": 0.0022796352583586625,
- "grad_norm": 11.991217613220215,
- "learning_rate": 5.050505050505051e-08,
- "loss": 0.9597002267837524,
- "mean_token_accuracy": 0.7054992318153381,
- "num_tokens": 27170.0,
- "step": 3
- },
- {
- "epoch": 0.00303951367781155,
- "grad_norm": 12.958333015441895,
- "learning_rate": 7.575757575757576e-08,
- "loss": 0.9971482753753662,
- "mean_token_accuracy": 0.7261134386062622,
- "num_tokens": 33729.0,
- "step": 4
- },
- {
- "epoch": 0.003799392097264438,
- "grad_norm": 13.5665283203125,
- "learning_rate": 1.0101010101010103e-07,
- "loss": 0.9504883885383606,
- "mean_token_accuracy": 0.745307445526123,
- "num_tokens": 41174.0,
- "step": 5
- },
- {
- "epoch": 0.004559270516717325,
- "grad_norm": 10.09444808959961,
- "learning_rate": 1.2626262626262626e-07,
- "loss": 0.759548008441925,
- "mean_token_accuracy": 0.7842121124267578,
- "num_tokens": 47943.0,
- "step": 6
- },
- {
- "epoch": 0.005319148936170213,
- "grad_norm": 10.741650581359863,
- "learning_rate": 1.5151515151515152e-07,
- "loss": 0.8231598138809204,
- "mean_token_accuracy": 0.7550969123840332,
- "num_tokens": 56665.0,
- "step": 7
- },
- {
- "epoch": 0.0060790273556231,
- "grad_norm": 12.250170707702637,
- "learning_rate": 1.767676767676768e-07,
- "loss": 0.8576581478118896,
- "mean_token_accuracy": 0.7568671703338623,
- "num_tokens": 67606.0,
- "step": 8
- },
- {
- "epoch": 0.006838905775075988,
- "grad_norm": 12.828629493713379,
- "learning_rate": 2.0202020202020205e-07,
- "loss": 0.9886435866355896,
- "mean_token_accuracy": 0.733400285243988,
- "num_tokens": 74272.0,
- "step": 9
- },
- {
- "epoch": 0.007598784194528876,
- "grad_norm": 15.966923713684082,
- "learning_rate": 2.2727272727272729e-07,
- "loss": 1.064985990524292,
- "mean_token_accuracy": 0.7101132869720459,
- "num_tokens": 80524.0,
- "step": 10
- },
- {
- "epoch": 0.008358662613981762,
- "grad_norm": 10.864850044250488,
- "learning_rate": 2.525252525252525e-07,
- "loss": 0.8311550617218018,
- "mean_token_accuracy": 0.7431639432907104,
- "num_tokens": 96292.0,
- "step": 11
- },
- {
- "epoch": 0.00911854103343465,
- "grad_norm": 16.438785552978516,
- "learning_rate": 2.7777777777777776e-07,
- "loss": 1.0579866170883179,
- "mean_token_accuracy": 0.7222976684570312,
- "num_tokens": 102992.0,
- "step": 12
- },
- {
- "epoch": 0.009878419452887538,
- "grad_norm": 11.179214477539062,
- "learning_rate": 3.0303030303030305e-07,
- "loss": 0.9816144704818726,
- "mean_token_accuracy": 0.7206371426582336,
- "num_tokens": 113571.0,
- "step": 13
- },
- {
- "epoch": 0.010638297872340425,
- "grad_norm": 12.780299186706543,
- "learning_rate": 3.2828282828282834e-07,
- "loss": 0.847449004650116,
- "mean_token_accuracy": 0.7826199531555176,
- "num_tokens": 119568.0,
- "step": 14
- },
- {
- "epoch": 0.011398176291793313,
- "grad_norm": 14.800421714782715,
- "learning_rate": 3.535353535353536e-07,
- "loss": 0.9275516271591187,
- "mean_token_accuracy": 0.7655045986175537,
- "num_tokens": 126258.0,
- "step": 15
- },
- {
- "epoch": 0.0121580547112462,
- "grad_norm": 11.267602920532227,
- "learning_rate": 3.787878787878788e-07,
- "loss": 0.8464037179946899,
- "mean_token_accuracy": 0.7606508731842041,
- "num_tokens": 136831.0,
- "step": 16
- },
- {
- "epoch": 0.012917933130699088,
- "grad_norm": 12.891013145446777,
- "learning_rate": 4.040404040404041e-07,
- "loss": 0.9903074502944946,
- "mean_token_accuracy": 0.7247487306594849,
- "num_tokens": 150434.0,
- "step": 17
- },
- {
- "epoch": 0.013677811550151976,
- "grad_norm": 11.13957691192627,
- "learning_rate": 4.2929292929292934e-07,
- "loss": 0.8287211656570435,
- "mean_token_accuracy": 0.7621913552284241,
- "num_tokens": 158516.0,
- "step": 18
- },
- {
- "epoch": 0.014437689969604863,
- "grad_norm": 18.39569664001465,
- "learning_rate": 4.5454545454545457e-07,
- "loss": 1.150015115737915,
- "mean_token_accuracy": 0.7349498271942139,
- "num_tokens": 162214.0,
- "step": 19
- },
- {
- "epoch": 0.015197568389057751,
- "grad_norm": 9.353750228881836,
- "learning_rate": 4.797979797979798e-07,
- "loss": 0.7228299379348755,
- "mean_token_accuracy": 0.7969573736190796,
- "num_tokens": 173035.0,
- "step": 20
- },
- {
- "epoch": 0.015957446808510637,
- "grad_norm": 8.267163276672363,
- "learning_rate": 5.05050505050505e-07,
- "loss": 0.7358136177062988,
- "mean_token_accuracy": 0.7903937101364136,
- "num_tokens": 183568.0,
- "step": 21
- },
- {
- "epoch": 0.016717325227963525,
- "grad_norm": 11.137128829956055,
- "learning_rate": 5.303030303030304e-07,
- "loss": 1.0075397491455078,
- "mean_token_accuracy": 0.702807605266571,
- "num_tokens": 192759.0,
- "step": 22
- },
- {
- "epoch": 0.017477203647416412,
- "grad_norm": 10.734103202819824,
- "learning_rate": 5.555555555555555e-07,
- "loss": 0.8925919532775879,
- "mean_token_accuracy": 0.7475671768188477,
- "num_tokens": 201280.0,
- "step": 23
- },
- {
- "epoch": 0.0182370820668693,
- "grad_norm": 11.945566177368164,
- "learning_rate": 5.808080808080809e-07,
- "loss": 0.7260514497756958,
- "mean_token_accuracy": 0.7859152555465698,
- "num_tokens": 218053.0,
- "step": 24
- },
- {
- "epoch": 0.018996960486322188,
- "grad_norm": 18.610652923583984,
- "learning_rate": 6.060606060606061e-07,
- "loss": 0.8995465636253357,
- "mean_token_accuracy": 0.7931990623474121,
- "num_tokens": 220953.0,
- "step": 25
- },
- {
- "epoch": 0.019756838905775075,
- "grad_norm": 10.51898193359375,
- "learning_rate": 6.313131313131314e-07,
- "loss": 0.9532671570777893,
- "mean_token_accuracy": 0.7257645726203918,
- "num_tokens": 231200.0,
- "step": 26
- },
- {
- "epoch": 0.020516717325227963,
- "grad_norm": 9.581812858581543,
- "learning_rate": 6.565656565656567e-07,
- "loss": 0.9038010239601135,
- "mean_token_accuracy": 0.7390379905700684,
- "num_tokens": 237711.0,
- "step": 27
- },
- {
- "epoch": 0.02127659574468085,
- "grad_norm": 12.297484397888184,
- "learning_rate": 6.818181818181818e-07,
- "loss": 1.048936367034912,
- "mean_token_accuracy": 0.7175670862197876,
- "num_tokens": 242503.0,
- "step": 28
- },
- {
- "epoch": 0.022036474164133738,
- "grad_norm": 7.437953472137451,
- "learning_rate": 7.070707070707071e-07,
- "loss": 0.8308826684951782,
- "mean_token_accuracy": 0.7415335774421692,
- "num_tokens": 250842.0,
- "step": 29
- },
- {
- "epoch": 0.022796352583586626,
- "grad_norm": 6.134475231170654,
- "learning_rate": 7.323232323232324e-07,
- "loss": 0.647913932800293,
- "mean_token_accuracy": 0.8124054670333862,
- "num_tokens": 267453.0,
- "step": 30
- },
- {
- "epoch": 0.023556231003039513,
- "grad_norm": 6.678966045379639,
- "learning_rate": 7.575757575757576e-07,
- "loss": 0.7052810192108154,
- "mean_token_accuracy": 0.7908754348754883,
- "num_tokens": 284416.0,
- "step": 31
- },
- {
- "epoch": 0.0243161094224924,
- "grad_norm": 7.42232084274292,
- "learning_rate": 7.82828282828283e-07,
- "loss": 1.022383213043213,
- "mean_token_accuracy": 0.7053230404853821,
- "num_tokens": 292073.0,
- "step": 32
- },
- {
- "epoch": 0.02507598784194529,
- "grad_norm": 6.463219165802002,
- "learning_rate": 8.080808080808082e-07,
- "loss": 0.7603012323379517,
- "mean_token_accuracy": 0.7728140354156494,
- "num_tokens": 298550.0,
- "step": 33
- },
- {
- "epoch": 0.025835866261398176,
- "grad_norm": 5.668411731719971,
- "learning_rate": 8.333333333333333e-07,
- "loss": 0.7707852721214294,
- "mean_token_accuracy": 0.7827773094177246,
- "num_tokens": 306683.0,
- "step": 34
- },
- {
- "epoch": 0.026595744680851064,
- "grad_norm": 4.984964847564697,
- "learning_rate": 8.585858585858587e-07,
- "loss": 0.6317349672317505,
- "mean_token_accuracy": 0.8106861114501953,
- "num_tokens": 318842.0,
- "step": 35
- },
- {
- "epoch": 0.02735562310030395,
- "grad_norm": 4.421732425689697,
- "learning_rate": 8.838383838383839e-07,
- "loss": 0.6228617429733276,
- "mean_token_accuracy": 0.8023355603218079,
- "num_tokens": 329850.0,
- "step": 36
- },
- {
- "epoch": 0.02811550151975684,
- "grad_norm": 5.970808029174805,
- "learning_rate": 9.090909090909091e-07,
- "loss": 0.8443238139152527,
- "mean_token_accuracy": 0.7462409734725952,
- "num_tokens": 335844.0,
- "step": 37
- },
- {
- "epoch": 0.028875379939209727,
- "grad_norm": 4.5389084815979,
- "learning_rate": 9.343434343434345e-07,
- "loss": 0.6976436376571655,
- "mean_token_accuracy": 0.790410041809082,
- "num_tokens": 348768.0,
- "step": 38
- },
- {
- "epoch": 0.029635258358662615,
- "grad_norm": 4.116631507873535,
- "learning_rate": 9.595959595959596e-07,
- "loss": 0.6698519587516785,
- "mean_token_accuracy": 0.7818127870559692,
- "num_tokens": 355460.0,
- "step": 39
- },
- {
- "epoch": 0.030395136778115502,
- "grad_norm": 3.3714773654937744,
- "learning_rate": 9.84848484848485e-07,
- "loss": 0.5723201036453247,
- "mean_token_accuracy": 0.8100086450576782,
- "num_tokens": 368507.0,
- "step": 40
- },
- {
- "epoch": 0.03115501519756839,
- "grad_norm": 4.4438347816467285,
- "learning_rate": 1.01010101010101e-06,
- "loss": 0.7508786916732788,
- "mean_token_accuracy": 0.7711942791938782,
- "num_tokens": 376467.0,
- "step": 41
- },
- {
- "epoch": 0.031914893617021274,
- "grad_norm": 5.609974384307861,
- "learning_rate": 1.0353535353535354e-06,
- "loss": 0.566256046295166,
- "mean_token_accuracy": 0.8319284319877625,
- "num_tokens": 381399.0,
- "step": 42
- },
- {
- "epoch": 0.03267477203647416,
- "grad_norm": 5.124386787414551,
- "learning_rate": 1.0606060606060608e-06,
- "loss": 0.8151067495346069,
- "mean_token_accuracy": 0.7537785768508911,
- "num_tokens": 387389.0,
- "step": 43
- },
- {
- "epoch": 0.03343465045592705,
- "grad_norm": 3.6318116188049316,
- "learning_rate": 1.085858585858586e-06,
- "loss": 0.5989949107170105,
- "mean_token_accuracy": 0.8129256963729858,
- "num_tokens": 395302.0,
- "step": 44
- },
- {
- "epoch": 0.03419452887537994,
- "grad_norm": 2.694424629211426,
- "learning_rate": 1.111111111111111e-06,
- "loss": 0.5831396579742432,
- "mean_token_accuracy": 0.8056820631027222,
- "num_tokens": 409920.0,
- "step": 45
- },
- {
- "epoch": 0.034954407294832825,
- "grad_norm": 2.2949178218841553,
- "learning_rate": 1.1363636363636364e-06,
- "loss": 0.472550630569458,
- "mean_token_accuracy": 0.8343006372451782,
- "num_tokens": 428323.0,
- "step": 46
- },
- {
- "epoch": 0.03571428571428571,
- "grad_norm": 3.3930575847625732,
- "learning_rate": 1.1616161616161617e-06,
- "loss": 0.6246505379676819,
- "mean_token_accuracy": 0.783149003982544,
- "num_tokens": 435889.0,
- "step": 47
- },
- {
- "epoch": 0.0364741641337386,
- "grad_norm": 3.692598819732666,
- "learning_rate": 1.186868686868687e-06,
- "loss": 0.46132946014404297,
- "mean_token_accuracy": 0.8583089113235474,
- "num_tokens": 441192.0,
- "step": 48
- },
- {
- "epoch": 0.03723404255319149,
- "grad_norm": 6.571533203125,
- "learning_rate": 1.2121212121212122e-06,
- "loss": 0.9351121783256531,
- "mean_token_accuracy": 0.7580878734588623,
- "num_tokens": 444277.0,
- "step": 49
- },
- {
- "epoch": 0.037993920972644375,
- "grad_norm": 5.029570579528809,
- "learning_rate": 1.2373737373737375e-06,
- "loss": 0.6921554803848267,
- "mean_token_accuracy": 0.8131166100502014,
- "num_tokens": 447646.0,
- "step": 50
- },
- {
- "epoch": 0.03875379939209726,
- "grad_norm": 2.9174208641052246,
- "learning_rate": 1.2626262626262629e-06,
- "loss": 0.591706395149231,
- "mean_token_accuracy": 0.8108617067337036,
- "num_tokens": 461397.0,
- "step": 51
- },
- {
- "epoch": 0.03951367781155015,
- "grad_norm": 4.315536022186279,
- "learning_rate": 1.287878787878788e-06,
- "loss": 0.6986310482025146,
- "mean_token_accuracy": 0.7710754871368408,
- "num_tokens": 472047.0,
- "step": 52
- },
- {
- "epoch": 0.04027355623100304,
- "grad_norm": 2.6216275691986084,
- "learning_rate": 1.3131313131313134e-06,
- "loss": 0.5553690791130066,
- "mean_token_accuracy": 0.8167896866798401,
- "num_tokens": 482795.0,
- "step": 53
- },
- {
- "epoch": 0.041033434650455926,
- "grad_norm": 3.0562477111816406,
- "learning_rate": 1.3383838383838385e-06,
- "loss": 0.6909202337265015,
- "mean_token_accuracy": 0.7859863638877869,
- "num_tokens": 494818.0,
- "step": 54
- },
- {
- "epoch": 0.04179331306990881,
- "grad_norm": 2.1420412063598633,
- "learning_rate": 1.3636363636363636e-06,
- "loss": 0.5415265560150146,
- "mean_token_accuracy": 0.818886399269104,
- "num_tokens": 513695.0,
- "step": 55
- },
- {
- "epoch": 0.0425531914893617,
- "grad_norm": 2.9610488414764404,
- "learning_rate": 1.3888888888888892e-06,
- "loss": 0.6602212190628052,
- "mean_token_accuracy": 0.7830734252929688,
- "num_tokens": 523784.0,
- "step": 56
- },
- {
- "epoch": 0.04331306990881459,
- "grad_norm": 2.511972665786743,
- "learning_rate": 1.4141414141414143e-06,
- "loss": 0.5717809796333313,
- "mean_token_accuracy": 0.8053616285324097,
- "num_tokens": 546308.0,
- "step": 57
- },
- {
- "epoch": 0.044072948328267476,
- "grad_norm": 3.52642822265625,
- "learning_rate": 1.4393939393939396e-06,
- "loss": 0.6242594718933105,
- "mean_token_accuracy": 0.8162082433700562,
- "num_tokens": 552019.0,
- "step": 58
- },
- {
- "epoch": 0.044832826747720364,
- "grad_norm": 3.02362322807312,
- "learning_rate": 1.4646464646464648e-06,
- "loss": 0.6634255647659302,
- "mean_token_accuracy": 0.7682032585144043,
- "num_tokens": 560009.0,
- "step": 59
- },
- {
- "epoch": 0.04559270516717325,
- "grad_norm": 2.3910107612609863,
- "learning_rate": 1.48989898989899e-06,
- "loss": 0.5519146919250488,
- "mean_token_accuracy": 0.8270269632339478,
- "num_tokens": 571005.0,
- "step": 60
- },
- {
- "epoch": 0.04635258358662614,
- "grad_norm": 4.28154993057251,
- "learning_rate": 1.5151515151515152e-06,
- "loss": 0.7437789440155029,
- "mean_token_accuracy": 0.7782418131828308,
- "num_tokens": 574950.0,
- "step": 61
- },
- {
- "epoch": 0.04711246200607903,
- "grad_norm": 3.4078686237335205,
- "learning_rate": 1.5404040404040404e-06,
- "loss": 0.6345915198326111,
- "mean_token_accuracy": 0.7903392314910889,
- "num_tokens": 581657.0,
- "step": 62
- },
- {
- "epoch": 0.047872340425531915,
- "grad_norm": 2.6834158897399902,
- "learning_rate": 1.565656565656566e-06,
- "loss": 0.5981127023696899,
- "mean_token_accuracy": 0.7911489605903625,
- "num_tokens": 591267.0,
- "step": 63
- },
- {
- "epoch": 0.0486322188449848,
- "grad_norm": 2.1054461002349854,
- "learning_rate": 1.590909090909091e-06,
- "loss": 0.5523523688316345,
- "mean_token_accuracy": 0.8194501399993896,
- "num_tokens": 606787.0,
- "step": 64
- },
- {
- "epoch": 0.04939209726443769,
- "grad_norm": 3.322596788406372,
- "learning_rate": 1.6161616161616164e-06,
- "loss": 0.48417025804519653,
- "mean_token_accuracy": 0.8293706178665161,
- "num_tokens": 611068.0,
- "step": 65
- },
- {
- "epoch": 0.05015197568389058,
- "grad_norm": 2.302450180053711,
- "learning_rate": 1.6414141414141415e-06,
- "loss": 0.6498389840126038,
- "mean_token_accuracy": 0.7728497385978699,
- "num_tokens": 624452.0,
- "step": 66
- },
- {
- "epoch": 0.050911854103343465,
- "grad_norm": 2.680191993713379,
- "learning_rate": 1.6666666666666667e-06,
- "loss": 0.6347037553787231,
- "mean_token_accuracy": 0.8108306527137756,
- "num_tokens": 638049.0,
- "step": 67
- },
- {
- "epoch": 0.05167173252279635,
- "grad_norm": 3.0297021865844727,
- "learning_rate": 1.6919191919191922e-06,
- "loss": 0.5344363451004028,
- "mean_token_accuracy": 0.8113535046577454,
- "num_tokens": 643892.0,
- "step": 68
- },
- {
- "epoch": 0.05243161094224924,
- "grad_norm": 2.9283676147460938,
- "learning_rate": 1.7171717171717173e-06,
- "loss": 0.6999260187149048,
- "mean_token_accuracy": 0.7782022356987,
- "num_tokens": 654418.0,
- "step": 69
- },
- {
- "epoch": 0.05319148936170213,
- "grad_norm": 3.4098572731018066,
- "learning_rate": 1.7424242424242427e-06,
- "loss": 0.6508946418762207,
- "mean_token_accuracy": 0.7942900657653809,
- "num_tokens": 659837.0,
- "step": 70
- },
- {
- "epoch": 0.053951367781155016,
- "grad_norm": 2.6756019592285156,
- "learning_rate": 1.7676767676767678e-06,
- "loss": 0.603486180305481,
- "mean_token_accuracy": 0.8015457391738892,
- "num_tokens": 668361.0,
- "step": 71
- },
- {
- "epoch": 0.0547112462006079,
- "grad_norm": 2.2630293369293213,
- "learning_rate": 1.792929292929293e-06,
- "loss": 0.6608274579048157,
- "mean_token_accuracy": 0.7753809690475464,
- "num_tokens": 679025.0,
- "step": 72
- },
- {
- "epoch": 0.05547112462006079,
- "grad_norm": 2.123962879180908,
- "learning_rate": 1.8181818181818183e-06,
- "loss": 0.4525482654571533,
- "mean_token_accuracy": 0.8425612449645996,
- "num_tokens": 688574.0,
- "step": 73
- },
- {
- "epoch": 0.05623100303951368,
- "grad_norm": 7.90519905090332,
- "learning_rate": 1.8434343434343434e-06,
- "loss": 0.6507195830345154,
- "mean_token_accuracy": 0.7714964151382446,
- "num_tokens": 694534.0,
- "step": 74
- },
- {
- "epoch": 0.056990881458966566,
- "grad_norm": 2.372203826904297,
- "learning_rate": 1.868686868686869e-06,
- "loss": 0.4458143413066864,
- "mean_token_accuracy": 0.7991449236869812,
- "num_tokens": 703114.0,
- "step": 75
- },
- {
- "epoch": 0.057750759878419454,
- "grad_norm": 2.918677568435669,
- "learning_rate": 1.8939393939393941e-06,
- "loss": 0.5614339113235474,
- "mean_token_accuracy": 0.8211464881896973,
- "num_tokens": 709038.0,
- "step": 76
- },
- {
- "epoch": 0.05851063829787234,
- "grad_norm": 1.6106709241867065,
- "learning_rate": 1.9191919191919192e-06,
- "loss": 0.5802098512649536,
- "mean_token_accuracy": 0.8055065870285034,
- "num_tokens": 730482.0,
- "step": 77
- },
- {
- "epoch": 0.05927051671732523,
- "grad_norm": 2.8069989681243896,
- "learning_rate": 1.944444444444445e-06,
- "loss": 0.5709059238433838,
- "mean_token_accuracy": 0.8024872541427612,
- "num_tokens": 751817.0,
- "step": 78
- },
- {
- "epoch": 0.06003039513677812,
- "grad_norm": 2.641667127609253,
- "learning_rate": 1.96969696969697e-06,
- "loss": 0.6480152606964111,
- "mean_token_accuracy": 0.7912271618843079,
- "num_tokens": 759236.0,
- "step": 79
- },
- {
- "epoch": 0.060790273556231005,
- "grad_norm": 2.6034350395202637,
- "learning_rate": 1.994949494949495e-06,
- "loss": 0.5535176396369934,
- "mean_token_accuracy": 0.7980542778968811,
- "num_tokens": 766496.0,
- "step": 80
- },
- {
- "epoch": 0.06155015197568389,
- "grad_norm": 1.7095069885253906,
- "learning_rate": 2.02020202020202e-06,
- "loss": 0.4545496106147766,
- "mean_token_accuracy": 0.8229660391807556,
- "num_tokens": 780124.0,
- "step": 81
- },
- {
- "epoch": 0.06231003039513678,
- "grad_norm": 3.788830518722534,
- "learning_rate": 2.0454545454545457e-06,
- "loss": 0.6679391264915466,
- "mean_token_accuracy": 0.7942397594451904,
- "num_tokens": 784555.0,
- "step": 82
- },
- {
- "epoch": 0.06306990881458967,
- "grad_norm": 2.009831666946411,
- "learning_rate": 2.070707070707071e-06,
- "loss": 0.5067101120948792,
- "mean_token_accuracy": 0.8276634216308594,
- "num_tokens": 797459.0,
- "step": 83
- },
- {
- "epoch": 0.06382978723404255,
- "grad_norm": 2.201627731323242,
- "learning_rate": 2.095959595959596e-06,
- "loss": 0.5012127161026001,
- "mean_token_accuracy": 0.8432504534721375,
- "num_tokens": 810817.0,
- "step": 84
- },
- {
- "epoch": 0.06458966565349544,
- "grad_norm": 2.492568016052246,
- "learning_rate": 2.1212121212121216e-06,
- "loss": 0.6142797470092773,
- "mean_token_accuracy": 0.8338661193847656,
- "num_tokens": 818191.0,
- "step": 85
- },
- {
- "epoch": 0.06534954407294832,
- "grad_norm": 2.8360862731933594,
- "learning_rate": 2.1464646464646467e-06,
- "loss": 0.5569300651550293,
- "mean_token_accuracy": 0.8121030330657959,
- "num_tokens": 825325.0,
- "step": 86
- },
- {
- "epoch": 0.06610942249240122,
- "grad_norm": 2.407548427581787,
- "learning_rate": 2.171717171717172e-06,
- "loss": 0.6442930102348328,
- "mean_token_accuracy": 0.792514443397522,
- "num_tokens": 834439.0,
- "step": 87
- },
- {
- "epoch": 0.0668693009118541,
- "grad_norm": 2.340728759765625,
- "learning_rate": 2.196969696969697e-06,
- "loss": 0.6494365930557251,
- "mean_token_accuracy": 0.7746615409851074,
- "num_tokens": 843078.0,
- "step": 88
- },
- {
- "epoch": 0.067629179331307,
- "grad_norm": 1.7703697681427002,
- "learning_rate": 2.222222222222222e-06,
- "loss": 0.598991870880127,
- "mean_token_accuracy": 0.7992157340049744,
- "num_tokens": 860171.0,
- "step": 89
- },
- {
- "epoch": 0.06838905775075987,
- "grad_norm": 2.5779271125793457,
- "learning_rate": 2.2474747474747476e-06,
- "loss": 0.5693082809448242,
- "mean_token_accuracy": 0.8093700408935547,
- "num_tokens": 866669.0,
- "step": 90
- },
- {
- "epoch": 0.06914893617021277,
- "grad_norm": 2.014092206954956,
- "learning_rate": 2.2727272727272728e-06,
- "loss": 0.5346695780754089,
- "mean_token_accuracy": 0.8165590763092041,
- "num_tokens": 876698.0,
- "step": 91
- },
- {
- "epoch": 0.06990881458966565,
- "grad_norm": 1.7555919885635376,
- "learning_rate": 2.2979797979797983e-06,
- "loss": 0.5321458578109741,
- "mean_token_accuracy": 0.8166656494140625,
- "num_tokens": 889488.0,
- "step": 92
- },
- {
- "epoch": 0.07066869300911854,
- "grad_norm": 1.8631824254989624,
- "learning_rate": 2.3232323232323234e-06,
- "loss": 0.5246532559394836,
- "mean_token_accuracy": 0.8088107705116272,
- "num_tokens": 901322.0,
- "step": 93
- },
- {
- "epoch": 0.07142857142857142,
- "grad_norm": 3.2332139015197754,
- "learning_rate": 2.348484848484849e-06,
- "loss": 0.5141711235046387,
- "mean_token_accuracy": 0.8382217884063721,
- "num_tokens": 905792.0,
- "step": 94
- },
- {
- "epoch": 0.07218844984802432,
- "grad_norm": 1.7806555032730103,
- "learning_rate": 2.373737373737374e-06,
- "loss": 0.5233149528503418,
- "mean_token_accuracy": 0.8101529479026794,
- "num_tokens": 917320.0,
- "step": 95
- },
- {
- "epoch": 0.0729483282674772,
- "grad_norm": 1.8169859647750854,
- "learning_rate": 2.3989898989898993e-06,
- "loss": 0.578881561756134,
- "mean_token_accuracy": 0.8044873476028442,
- "num_tokens": 931062.0,
- "step": 96
- },
- {
- "epoch": 0.0737082066869301,
- "grad_norm": 4.677402496337891,
- "learning_rate": 2.4242424242424244e-06,
- "loss": 0.7842556238174438,
- "mean_token_accuracy": 0.7579764127731323,
- "num_tokens": 934712.0,
- "step": 97
- },
- {
- "epoch": 0.07446808510638298,
- "grad_norm": 2.6987264156341553,
- "learning_rate": 2.4494949494949495e-06,
- "loss": 0.5669287443161011,
- "mean_token_accuracy": 0.8186933994293213,
- "num_tokens": 941058.0,
- "step": 98
- },
- {
- "epoch": 0.07522796352583587,
- "grad_norm": 1.6906023025512695,
- "learning_rate": 2.474747474747475e-06,
- "loss": 0.4976363778114319,
- "mean_token_accuracy": 0.8198553323745728,
- "num_tokens": 956509.0,
- "step": 99
- },
- {
- "epoch": 0.07598784194528875,
- "grad_norm": 2.7256152629852295,
- "learning_rate": 2.5e-06,
- "loss": 0.7138420343399048,
- "mean_token_accuracy": 0.7752805948257446,
- "num_tokens": 963920.0,
- "step": 100
- },
- {
- "epoch": 0.07674772036474165,
- "grad_norm": 2.174870491027832,
- "learning_rate": 2.5252525252525258e-06,
- "loss": 0.6733541488647461,
- "mean_token_accuracy": 0.7745175361633301,
- "num_tokens": 975268.0,
- "step": 101
- },
- {
- "epoch": 0.07750759878419453,
- "grad_norm": 1.5587213039398193,
- "learning_rate": 2.5505050505050505e-06,
- "loss": 0.44223445653915405,
- "mean_token_accuracy": 0.8278359174728394,
- "num_tokens": 991837.0,
- "step": 102
- },
- {
- "epoch": 0.07826747720364742,
- "grad_norm": 2.181840658187866,
- "learning_rate": 2.575757575757576e-06,
- "loss": 0.625128448009491,
- "mean_token_accuracy": 0.7941786050796509,
- "num_tokens": 1004325.0,
- "step": 103
- },
- {
- "epoch": 0.0790273556231003,
- "grad_norm": 1.4986687898635864,
- "learning_rate": 2.601010101010101e-06,
- "loss": 0.39262527227401733,
- "mean_token_accuracy": 0.8412648439407349,
- "num_tokens": 1018331.0,
- "step": 104
- },
- {
- "epoch": 0.0797872340425532,
- "grad_norm": 2.3416061401367188,
- "learning_rate": 2.6262626262626267e-06,
- "loss": 0.5495132803916931,
- "mean_token_accuracy": 0.8193322420120239,
- "num_tokens": 1026090.0,
- "step": 105
- },
- {
- "epoch": 0.08054711246200608,
- "grad_norm": 3.8168859481811523,
- "learning_rate": 2.6515151515151514e-06,
- "loss": 0.4898706376552582,
- "mean_token_accuracy": 0.8467956185340881,
- "num_tokens": 1029955.0,
- "step": 106
- },
- {
- "epoch": 0.08130699088145897,
- "grad_norm": 4.113908767700195,
- "learning_rate": 2.676767676767677e-06,
- "loss": 0.6189584732055664,
- "mean_token_accuracy": 0.8019394278526306,
- "num_tokens": 1033598.0,
- "step": 107
- },
- {
- "epoch": 0.08206686930091185,
- "grad_norm": 2.50003981590271,
- "learning_rate": 2.7020202020202025e-06,
- "loss": 0.6479471921920776,
- "mean_token_accuracy": 0.7790026664733887,
- "num_tokens": 1042533.0,
- "step": 108
- },
- {
- "epoch": 0.08282674772036475,
- "grad_norm": 1.408934473991394,
- "learning_rate": 2.7272727272727272e-06,
- "loss": 0.3909248113632202,
- "mean_token_accuracy": 0.8477586507797241,
- "num_tokens": 1061755.0,
- "step": 109
- },
- {
- "epoch": 0.08358662613981763,
- "grad_norm": 3.360633611679077,
- "learning_rate": 2.7525252525252528e-06,
- "loss": 0.6952459812164307,
- "mean_token_accuracy": 0.777535080909729,
- "num_tokens": 1067316.0,
- "step": 110
- },
- {
- "epoch": 0.08434650455927052,
- "grad_norm": 1.8631696701049805,
- "learning_rate": 2.7777777777777783e-06,
- "loss": 0.5420593023300171,
- "mean_token_accuracy": 0.8157662749290466,
- "num_tokens": 1079930.0,
- "step": 111
- },
- {
- "epoch": 0.0851063829787234,
- "grad_norm": 2.4308314323425293,
- "learning_rate": 2.803030303030303e-06,
- "loss": 0.5863882303237915,
- "mean_token_accuracy": 0.8206346035003662,
- "num_tokens": 1088069.0,
- "step": 112
- },
- {
- "epoch": 0.0858662613981763,
- "grad_norm": 2.922808885574341,
- "learning_rate": 2.8282828282828286e-06,
- "loss": 0.5217319130897522,
- "mean_token_accuracy": 0.8253234028816223,
- "num_tokens": 1093607.0,
- "step": 113
- },
- {
- "epoch": 0.08662613981762918,
- "grad_norm": 2.3596107959747314,
- "learning_rate": 2.8535353535353537e-06,
- "loss": 0.5070714950561523,
- "mean_token_accuracy": 0.8258323669433594,
- "num_tokens": 1100405.0,
- "step": 114
- },
- {
- "epoch": 0.08738601823708207,
- "grad_norm": 3.0853066444396973,
- "learning_rate": 2.8787878787878793e-06,
- "loss": 0.591964840888977,
- "mean_token_accuracy": 0.8047322630882263,
- "num_tokens": 1107535.0,
- "step": 115
- },
- {
- "epoch": 0.08814589665653495,
- "grad_norm": 1.9251092672348022,
- "learning_rate": 2.904040404040404e-06,
- "loss": 0.5226191878318787,
- "mean_token_accuracy": 0.8022720217704773,
- "num_tokens": 1118716.0,
- "step": 116
- },
- {
- "epoch": 0.08890577507598785,
- "grad_norm": 1.9692988395690918,
- "learning_rate": 2.9292929292929295e-06,
- "loss": 0.5462069511413574,
- "mean_token_accuracy": 0.8157015442848206,
- "num_tokens": 1131917.0,
- "step": 117
- },
- {
- "epoch": 0.08966565349544073,
- "grad_norm": 1.4738909006118774,
- "learning_rate": 2.954545454545455e-06,
- "loss": 0.4564219117164612,
- "mean_token_accuracy": 0.849632978439331,
- "num_tokens": 1148534.0,
- "step": 118
- },
- {
- "epoch": 0.09042553191489362,
- "grad_norm": 2.72646164894104,
- "learning_rate": 2.97979797979798e-06,
- "loss": 0.6654808521270752,
- "mean_token_accuracy": 0.7752684354782104,
- "num_tokens": 1155438.0,
- "step": 119
- },
- {
- "epoch": 0.0911854103343465,
- "grad_norm": 2.7843852043151855,
- "learning_rate": 3.0050505050505054e-06,
- "loss": 0.5354680418968201,
- "mean_token_accuracy": 0.8196378946304321,
- "num_tokens": 1161815.0,
- "step": 120
- },
- {
- "epoch": 0.0919452887537994,
- "grad_norm": 2.8052573204040527,
- "learning_rate": 3.0303030303030305e-06,
- "loss": 0.6366757154464722,
- "mean_token_accuracy": 0.7967483997344971,
- "num_tokens": 1168295.0,
- "step": 121
- },
- {
- "epoch": 0.09270516717325228,
- "grad_norm": 2.7462735176086426,
- "learning_rate": 3.055555555555556e-06,
- "loss": 0.59470534324646,
- "mean_token_accuracy": 0.8023771047592163,
- "num_tokens": 1174502.0,
- "step": 122
- },
- {
- "epoch": 0.09346504559270517,
- "grad_norm": 2.2743821144104004,
- "learning_rate": 3.0808080808080807e-06,
- "loss": 0.5720560550689697,
- "mean_token_accuracy": 0.8162771463394165,
- "num_tokens": 1183615.0,
- "step": 123
- },
- {
- "epoch": 0.09422492401215805,
- "grad_norm": 1.8669533729553223,
- "learning_rate": 3.1060606060606063e-06,
- "loss": 0.4655378758907318,
- "mean_token_accuracy": 0.8360732793807983,
- "num_tokens": 1193761.0,
- "step": 124
- },
- {
- "epoch": 0.09498480243161095,
- "grad_norm": 1.7666901350021362,
- "learning_rate": 3.131313131313132e-06,
- "loss": 0.5524153709411621,
- "mean_token_accuracy": 0.8252713680267334,
- "num_tokens": 1207870.0,
- "step": 125
- },
- {
- "epoch": 0.09574468085106383,
- "grad_norm": 2.4720070362091064,
- "learning_rate": 3.1565656565656566e-06,
- "loss": 0.5003011226654053,
- "mean_token_accuracy": 0.8491042852401733,
- "num_tokens": 1214603.0,
- "step": 126
- },
- {
- "epoch": 0.09650455927051672,
- "grad_norm": 1.6500422954559326,
- "learning_rate": 3.181818181818182e-06,
- "loss": 0.5137069225311279,
- "mean_token_accuracy": 0.8273531198501587,
- "num_tokens": 1228717.0,
- "step": 127
- },
- {
- "epoch": 0.0972644376899696,
- "grad_norm": 3.402543067932129,
- "learning_rate": 3.2070707070707072e-06,
- "loss": 0.708167552947998,
- "mean_token_accuracy": 0.7705385684967041,
- "num_tokens": 1234361.0,
- "step": 128
- },
- {
- "epoch": 0.0980243161094225,
- "grad_norm": 2.547285795211792,
- "learning_rate": 3.232323232323233e-06,
- "loss": 0.6020137071609497,
- "mean_token_accuracy": 0.7981340289115906,
- "num_tokens": 1244169.0,
- "step": 129
- },
- {
- "epoch": 0.09878419452887538,
- "grad_norm": 2.0578792095184326,
- "learning_rate": 3.257575757575758e-06,
- "loss": 0.4425000250339508,
- "mean_token_accuracy": 0.8567807674407959,
- "num_tokens": 1252709.0,
- "step": 130
- },
- {
- "epoch": 0.09954407294832827,
- "grad_norm": 1.672614336013794,
- "learning_rate": 3.282828282828283e-06,
- "loss": 0.4860966205596924,
- "mean_token_accuracy": 0.8393139243125916,
- "num_tokens": 1265766.0,
- "step": 131
- },
- {
- "epoch": 0.10030395136778116,
- "grad_norm": 3.2560198307037354,
- "learning_rate": 3.3080808080808086e-06,
- "loss": 0.624736487865448,
- "mean_token_accuracy": 0.7875322699546814,
- "num_tokens": 1270779.0,
- "step": 132
- },
- {
- "epoch": 0.10106382978723404,
- "grad_norm": 2.4468185901641846,
- "learning_rate": 3.3333333333333333e-06,
- "loss": 0.5062227249145508,
- "mean_token_accuracy": 0.8217229843139648,
- "num_tokens": 1277113.0,
- "step": 133
- },
- {
- "epoch": 0.10182370820668693,
- "grad_norm": 2.6371328830718994,
- "learning_rate": 3.358585858585859e-06,
- "loss": 0.477113276720047,
- "mean_token_accuracy": 0.8605583906173706,
- "num_tokens": 1282514.0,
- "step": 134
- },
- {
- "epoch": 0.10258358662613981,
- "grad_norm": 2.48421311378479,
- "learning_rate": 3.3838383838383844e-06,
- "loss": 0.40855684876441956,
- "mean_token_accuracy": 0.864548921585083,
- "num_tokens": 1287859.0,
- "step": 135
- },
- {
- "epoch": 0.1033434650455927,
- "grad_norm": 1.993099331855774,
- "learning_rate": 3.409090909090909e-06,
- "loss": 0.5913145542144775,
- "mean_token_accuracy": 0.8248485922813416,
- "num_tokens": 1301074.0,
- "step": 136
- },
- {
- "epoch": 0.10410334346504559,
- "grad_norm": 3.5947680473327637,
- "learning_rate": 3.4343434343434347e-06,
- "loss": 0.5028599500656128,
- "mean_token_accuracy": 0.8367215394973755,
- "num_tokens": 1305219.0,
- "step": 137
- },
- {
- "epoch": 0.10486322188449848,
- "grad_norm": 2.5778582096099854,
- "learning_rate": 3.45959595959596e-06,
- "loss": 0.5297672748565674,
- "mean_token_accuracy": 0.8232187032699585,
- "num_tokens": 1312482.0,
- "step": 138
- },
- {
- "epoch": 0.10562310030395136,
- "grad_norm": 1.8961588144302368,
- "learning_rate": 3.4848484848484854e-06,
- "loss": 0.39954107999801636,
- "mean_token_accuracy": 0.8605833053588867,
- "num_tokens": 1323404.0,
- "step": 139
- },
- {
- "epoch": 0.10638297872340426,
- "grad_norm": 1.9687960147857666,
- "learning_rate": 3.51010101010101e-06,
- "loss": 0.48791587352752686,
- "mean_token_accuracy": 0.8200347423553467,
- "num_tokens": 1333027.0,
- "step": 140
- },
- {
- "epoch": 0.10714285714285714,
- "grad_norm": 2.520242691040039,
- "learning_rate": 3.5353535353535356e-06,
- "loss": 0.6106002330780029,
- "mean_token_accuracy": 0.790692150592804,
- "num_tokens": 1340999.0,
- "step": 141
- },
- {
- "epoch": 0.10790273556231003,
- "grad_norm": 3.751617431640625,
- "learning_rate": 3.560606060606061e-06,
- "loss": 0.48141729831695557,
- "mean_token_accuracy": 0.8421382904052734,
- "num_tokens": 1344687.0,
- "step": 142
- },
- {
- "epoch": 0.10866261398176291,
- "grad_norm": 2.7101709842681885,
- "learning_rate": 3.585858585858586e-06,
- "loss": 0.5375241637229919,
- "mean_token_accuracy": 0.8061438202857971,
- "num_tokens": 1350192.0,
- "step": 143
- },
- {
- "epoch": 0.1094224924012158,
- "grad_norm": 2.583484411239624,
- "learning_rate": 3.6111111111111115e-06,
- "loss": 0.6492470502853394,
- "mean_token_accuracy": 0.7863001823425293,
- "num_tokens": 1358148.0,
- "step": 144
- },
- {
- "epoch": 0.11018237082066869,
- "grad_norm": 1.792561650276184,
- "learning_rate": 3.6363636363636366e-06,
- "loss": 0.48480600118637085,
- "mean_token_accuracy": 0.8358709812164307,
- "num_tokens": 1369519.0,
- "step": 145
- },
- {
- "epoch": 0.11094224924012158,
- "grad_norm": 2.6480472087860107,
- "learning_rate": 3.661616161616162e-06,
- "loss": 0.5268933176994324,
- "mean_token_accuracy": 0.8214013576507568,
- "num_tokens": 1375862.0,
- "step": 146
- },
- {
- "epoch": 0.11170212765957446,
- "grad_norm": 2.3174469470977783,
- "learning_rate": 3.686868686868687e-06,
- "loss": 0.42517897486686707,
- "mean_token_accuracy": 0.8523461222648621,
- "num_tokens": 1381546.0,
- "step": 147
- },
- {
- "epoch": 0.11246200607902736,
- "grad_norm": 3.0090949535369873,
- "learning_rate": 3.7121212121212124e-06,
- "loss": 0.4042336940765381,
- "mean_token_accuracy": 0.8670448064804077,
- "num_tokens": 1385896.0,
- "step": 148
- },
- {
- "epoch": 0.11322188449848024,
- "grad_norm": 2.4928104877471924,
- "learning_rate": 3.737373737373738e-06,
- "loss": 0.6498878598213196,
- "mean_token_accuracy": 0.7967068552970886,
- "num_tokens": 1394169.0,
- "step": 149
- },
- {
- "epoch": 0.11398176291793313,
- "grad_norm": 1.5984913110733032,
- "learning_rate": 3.7626262626262627e-06,
- "loss": 0.546096920967102,
- "mean_token_accuracy": 0.8035850524902344,
- "num_tokens": 1408785.0,
- "step": 150
- },
- {
- "epoch": 0.11474164133738601,
- "grad_norm": 2.3663532733917236,
- "learning_rate": 3.7878787878787882e-06,
- "loss": 0.6111721992492676,
- "mean_token_accuracy": 0.8015355467796326,
- "num_tokens": 1417510.0,
- "step": 151
- },
- {
- "epoch": 0.11550151975683891,
- "grad_norm": 2.518932819366455,
- "learning_rate": 3.8131313131313138e-06,
- "loss": 0.5274964570999146,
- "mean_token_accuracy": 0.8155480623245239,
- "num_tokens": 1424186.0,
- "step": 152
- },
- {
- "epoch": 0.11626139817629179,
- "grad_norm": 2.14353609085083,
- "learning_rate": 3.8383838383838385e-06,
- "loss": 0.5283297896385193,
- "mean_token_accuracy": 0.8275758028030396,
- "num_tokens": 1432630.0,
- "step": 153
- },
- {
- "epoch": 0.11702127659574468,
- "grad_norm": 1.8243604898452759,
- "learning_rate": 3.863636363636364e-06,
- "loss": 0.41854870319366455,
- "mean_token_accuracy": 0.8222295045852661,
- "num_tokens": 1442691.0,
- "step": 154
- },
- {
- "epoch": 0.11778115501519756,
- "grad_norm": 2.088212251663208,
- "learning_rate": 3.88888888888889e-06,
- "loss": 0.6062943339347839,
- "mean_token_accuracy": 0.8009427785873413,
- "num_tokens": 1456890.0,
- "step": 155
- },
- {
- "epoch": 0.11854103343465046,
- "grad_norm": 1.3469511270523071,
- "learning_rate": 3.914141414141415e-06,
- "loss": 0.4390433728694916,
- "mean_token_accuracy": 0.8436295986175537,
- "num_tokens": 1475349.0,
- "step": 156
- },
- {
- "epoch": 0.11930091185410334,
- "grad_norm": 3.247023105621338,
- "learning_rate": 3.93939393939394e-06,
- "loss": 0.6490433216094971,
- "mean_token_accuracy": 0.8037861585617065,
- "num_tokens": 1479952.0,
- "step": 157
- },
- {
- "epoch": 0.12006079027355623,
- "grad_norm": 2.6610445976257324,
- "learning_rate": 3.964646464646465e-06,
- "loss": 0.6221826076507568,
- "mean_token_accuracy": 0.7848749160766602,
- "num_tokens": 1487306.0,
- "step": 158
- },
- {
- "epoch": 0.12082066869300911,
- "grad_norm": 2.3060810565948486,
- "learning_rate": 3.98989898989899e-06,
- "loss": 0.5052388310432434,
- "mean_token_accuracy": 0.8281195759773254,
- "num_tokens": 1495367.0,
- "step": 159
- },
- {
- "epoch": 0.12158054711246201,
- "grad_norm": 2.504448652267456,
- "learning_rate": 4.015151515151515e-06,
- "loss": 0.5005477666854858,
- "mean_token_accuracy": 0.8408058881759644,
- "num_tokens": 1502069.0,
- "step": 160
- },
- {
- "epoch": 0.12234042553191489,
- "grad_norm": 3.993938446044922,
- "learning_rate": 4.04040404040404e-06,
- "loss": 0.5569638013839722,
- "mean_token_accuracy": 0.8095242977142334,
- "num_tokens": 1510224.0,
- "step": 161
- },
- {
- "epoch": 0.12310030395136778,
- "grad_norm": 2.2287683486938477,
- "learning_rate": 4.065656565656566e-06,
- "loss": 0.524042546749115,
- "mean_token_accuracy": 0.8102203607559204,
- "num_tokens": 1518364.0,
- "step": 162
- },
- {
- "epoch": 0.12386018237082067,
- "grad_norm": 1.9531738758087158,
- "learning_rate": 4.0909090909090915e-06,
- "loss": 0.45794573426246643,
- "mean_token_accuracy": 0.8560376167297363,
- "num_tokens": 1528097.0,
- "step": 163
- },
- {
- "epoch": 0.12462006079027356,
- "grad_norm": 1.5841206312179565,
- "learning_rate": 4.116161616161617e-06,
- "loss": 0.5420972108840942,
- "mean_token_accuracy": 0.8092726469039917,
- "num_tokens": 1544119.0,
- "step": 164
- },
- {
- "epoch": 0.12537993920972645,
- "grad_norm": 1.7536218166351318,
- "learning_rate": 4.141414141414142e-06,
- "loss": 0.554668664932251,
- "mean_token_accuracy": 0.8193825483322144,
- "num_tokens": 1559140.0,
- "step": 165
- },
- {
- "epoch": 0.12613981762917933,
- "grad_norm": 3.545454740524292,
- "learning_rate": 4.166666666666667e-06,
- "loss": 0.580947995185852,
- "mean_token_accuracy": 0.8286383152008057,
- "num_tokens": 1563625.0,
- "step": 166
- },
- {
- "epoch": 0.12689969604863222,
- "grad_norm": 1.6608915328979492,
- "learning_rate": 4.191919191919192e-06,
- "loss": 0.5523324012756348,
- "mean_token_accuracy": 0.8155215978622437,
- "num_tokens": 1574945.0,
- "step": 167
- },
- {
- "epoch": 0.1276595744680851,
- "grad_norm": 1.4832708835601807,
- "learning_rate": 4.217171717171717e-06,
- "loss": 0.5133191347122192,
- "mean_token_accuracy": 0.8367571830749512,
- "num_tokens": 1595865.0,
- "step": 168
- },
- {
- "epoch": 0.128419452887538,
- "grad_norm": 1.7807520627975464,
- "learning_rate": 4.242424242424243e-06,
- "loss": 0.5131410360336304,
- "mean_token_accuracy": 0.8129367232322693,
- "num_tokens": 1608723.0,
- "step": 169
- },
- {
- "epoch": 0.12917933130699089,
- "grad_norm": 2.707569122314453,
- "learning_rate": 4.267676767676767e-06,
- "loss": 0.6129013299942017,
- "mean_token_accuracy": 0.7926048040390015,
- "num_tokens": 1616136.0,
- "step": 170
- },
- {
- "epoch": 0.12993920972644377,
- "grad_norm": 2.5831644535064697,
- "learning_rate": 4.292929292929293e-06,
- "loss": 0.6264227628707886,
- "mean_token_accuracy": 0.8074911236763,
- "num_tokens": 1624228.0,
- "step": 171
- },
- {
- "epoch": 0.13069908814589665,
- "grad_norm": 3.1124250888824463,
- "learning_rate": 4.3181818181818185e-06,
- "loss": 0.41763827204704285,
- "mean_token_accuracy": 0.8565453290939331,
- "num_tokens": 1628098.0,
- "step": 172
- },
- {
- "epoch": 0.13145896656534956,
- "grad_norm": 2.3214211463928223,
- "learning_rate": 4.343434343434344e-06,
- "loss": 0.421974778175354,
- "mean_token_accuracy": 0.8391546010971069,
- "num_tokens": 1634950.0,
- "step": 173
- },
- {
- "epoch": 0.13221884498480244,
- "grad_norm": 2.1010327339172363,
- "learning_rate": 4.368686868686869e-06,
- "loss": 0.5307331681251526,
- "mean_token_accuracy": 0.8139588236808777,
- "num_tokens": 1644132.0,
- "step": 174
- },
- {
- "epoch": 0.13297872340425532,
- "grad_norm": 2.533612012863159,
- "learning_rate": 4.393939393939394e-06,
- "loss": 0.5626664161682129,
- "mean_token_accuracy": 0.8029808402061462,
- "num_tokens": 1651637.0,
- "step": 175
- },
- {
- "epoch": 0.1337386018237082,
- "grad_norm": 1.669508457183838,
- "learning_rate": 4.41919191919192e-06,
- "loss": 0.5351508259773254,
- "mean_token_accuracy": 0.8281655311584473,
- "num_tokens": 1666776.0,
- "step": 176
- },
- {
- "epoch": 0.1344984802431611,
- "grad_norm": 1.7579659223556519,
- "learning_rate": 4.444444444444444e-06,
- "loss": 0.5235031247138977,
- "mean_token_accuracy": 0.8143284320831299,
- "num_tokens": 1679241.0,
- "step": 177
- },
- {
- "epoch": 0.135258358662614,
- "grad_norm": 3.123563528060913,
- "learning_rate": 4.46969696969697e-06,
- "loss": 0.43051332235336304,
- "mean_token_accuracy": 0.8518186211585999,
- "num_tokens": 1683317.0,
- "step": 178
- },
- {
- "epoch": 0.13601823708206687,
- "grad_norm": 2.2411575317382812,
- "learning_rate": 4.494949494949495e-06,
- "loss": 0.5471380949020386,
- "mean_token_accuracy": 0.8267596960067749,
- "num_tokens": 1691366.0,
- "step": 179
- },
- {
- "epoch": 0.13677811550151975,
- "grad_norm": 2.621973991394043,
- "learning_rate": 4.520202020202021e-06,
- "loss": 0.5685839653015137,
- "mean_token_accuracy": 0.8260642290115356,
- "num_tokens": 1698148.0,
- "step": 180
- },
- {
- "epoch": 0.13753799392097266,
- "grad_norm": 2.1553852558135986,
- "learning_rate": 4.5454545454545455e-06,
- "loss": 0.5703883171081543,
- "mean_token_accuracy": 0.8219090700149536,
- "num_tokens": 1707225.0,
- "step": 181
- },
- {
- "epoch": 0.13829787234042554,
- "grad_norm": 5.1767897605896,
- "learning_rate": 4.5707070707070715e-06,
- "loss": 0.32704639434814453,
- "mean_token_accuracy": 0.8754568099975586,
- "num_tokens": 1712748.0,
- "step": 182
- },
- {
- "epoch": 0.13905775075987842,
- "grad_norm": 2.609168291091919,
- "learning_rate": 4.595959595959597e-06,
- "loss": 0.5939987301826477,
- "mean_token_accuracy": 0.8034975528717041,
- "num_tokens": 1719932.0,
- "step": 183
- },
- {
- "epoch": 0.1398176291793313,
- "grad_norm": 2.2059099674224854,
- "learning_rate": 4.621212121212122e-06,
- "loss": 0.5310720205307007,
- "mean_token_accuracy": 0.8177368640899658,
- "num_tokens": 1727640.0,
- "step": 184
- },
- {
- "epoch": 0.1405775075987842,
- "grad_norm": 2.6367759704589844,
- "learning_rate": 4.646464646464647e-06,
- "loss": 0.522086501121521,
- "mean_token_accuracy": 0.826233983039856,
- "num_tokens": 1733609.0,
- "step": 185
- },
- {
- "epoch": 0.1413373860182371,
- "grad_norm": 3.326732873916626,
- "learning_rate": 4.671717171717172e-06,
- "loss": 0.4127829074859619,
- "mean_token_accuracy": 0.8551101684570312,
- "num_tokens": 1737256.0,
- "step": 186
- },
- {
- "epoch": 0.14209726443768997,
- "grad_norm": 1.828412413597107,
- "learning_rate": 4.696969696969698e-06,
- "loss": 0.5444269180297852,
- "mean_token_accuracy": 0.8350818157196045,
- "num_tokens": 1750196.0,
- "step": 187
- },
- {
- "epoch": 0.14285714285714285,
- "grad_norm": 3.209203004837036,
- "learning_rate": 4.722222222222222e-06,
- "loss": 0.5087994933128357,
- "mean_token_accuracy": 0.8349015712738037,
- "num_tokens": 1754836.0,
- "step": 188
- },
- {
- "epoch": 0.14361702127659576,
- "grad_norm": 1.7339166402816772,
- "learning_rate": 4.747474747474748e-06,
- "loss": 0.5151352286338806,
- "mean_token_accuracy": 0.8321266174316406,
- "num_tokens": 1766015.0,
- "step": 189
- },
- {
- "epoch": 0.14437689969604864,
- "grad_norm": 2.699068069458008,
- "learning_rate": 4.772727272727273e-06,
- "loss": 0.4406203031539917,
- "mean_token_accuracy": 0.8425000905990601,
- "num_tokens": 1771684.0,
- "step": 190
- },
- {
- "epoch": 0.14513677811550152,
- "grad_norm": 2.8117282390594482,
- "learning_rate": 4.7979797979797985e-06,
- "loss": 0.40428489446640015,
- "mean_token_accuracy": 0.8654326796531677,
- "num_tokens": 1776301.0,
- "step": 191
- },
- {
- "epoch": 0.1458966565349544,
- "grad_norm": 2.9204647541046143,
- "learning_rate": 4.823232323232324e-06,
- "loss": 0.4191770553588867,
- "mean_token_accuracy": 0.8574687242507935,
- "num_tokens": 1781678.0,
- "step": 192
- },
- {
- "epoch": 0.1466565349544073,
- "grad_norm": 2.1648988723754883,
- "learning_rate": 4.848484848484849e-06,
- "loss": 0.5839012861251831,
- "mean_token_accuracy": 0.8053664565086365,
- "num_tokens": 1792516.0,
- "step": 193
- },
- {
- "epoch": 0.1474164133738602,
- "grad_norm": 2.3221631050109863,
- "learning_rate": 4.873737373737374e-06,
- "loss": 0.5037894248962402,
- "mean_token_accuracy": 0.8427227139472961,
- "num_tokens": 1800192.0,
- "step": 194
- },
- {
- "epoch": 0.14817629179331307,
- "grad_norm": 2.4536430835723877,
- "learning_rate": 4.898989898989899e-06,
- "loss": 0.42326074838638306,
- "mean_token_accuracy": 0.8510633111000061,
- "num_tokens": 1806159.0,
- "step": 195
- },
- {
- "epoch": 0.14893617021276595,
- "grad_norm": 2.4875805377960205,
- "learning_rate": 4.924242424242425e-06,
- "loss": 0.539531409740448,
- "mean_token_accuracy": 0.8060250282287598,
- "num_tokens": 1813392.0,
- "step": 196
- },
- {
- "epoch": 0.14969604863221886,
- "grad_norm": 2.1664798259735107,
- "learning_rate": 4.94949494949495e-06,
- "loss": 0.42502015829086304,
- "mean_token_accuracy": 0.8503251075744629,
- "num_tokens": 1821424.0,
- "step": 197
- },
- {
- "epoch": 0.15045592705167174,
- "grad_norm": 2.568808078765869,
- "learning_rate": 4.974747474747475e-06,
- "loss": 0.5025098323822021,
- "mean_token_accuracy": 0.8182311058044434,
- "num_tokens": 1827225.0,
- "step": 198
- },
- {
- "epoch": 0.15121580547112462,
- "grad_norm": 1.9116802215576172,
- "learning_rate": 5e-06,
- "loss": 0.4907258450984955,
- "mean_token_accuracy": 0.8310189843177795,
- "num_tokens": 1836297.0,
- "step": 199
- },
- {
- "epoch": 0.1519756838905775,
- "grad_norm": 3.150765895843506,
- "learning_rate": 4.999999122701883e-06,
- "loss": 0.390616774559021,
- "mean_token_accuracy": 0.8626647591590881,
- "num_tokens": 1839984.0,
- "step": 200
- },
- {
- "epoch": 0.15273556231003038,
- "grad_norm": 3.2229044437408447,
- "learning_rate": 4.999996490808146e-06,
- "loss": 0.48009657859802246,
- "mean_token_accuracy": 0.825214147567749,
- "num_tokens": 1844610.0,
- "step": 201
- },
- {
- "epoch": 0.1534954407294833,
- "grad_norm": 1.4473289251327515,
- "learning_rate": 4.9999921043206356e-06,
- "loss": 0.40135183930397034,
- "mean_token_accuracy": 0.8537827730178833,
- "num_tokens": 1859573.0,
- "step": 202
- },
- {
- "epoch": 0.15425531914893617,
- "grad_norm": 4.072319507598877,
- "learning_rate": 4.999985963242432e-06,
- "loss": 0.6158689260482788,
- "mean_token_accuracy": 0.8075432777404785,
- "num_tokens": 1863147.0,
- "step": 203
- },
- {
- "epoch": 0.15501519756838905,
- "grad_norm": 3.15741229057312,
- "learning_rate": 4.999978067577844e-06,
- "loss": 0.4603108763694763,
- "mean_token_accuracy": 0.8418779373168945,
- "num_tokens": 1867201.0,
- "step": 204
- },
- {
- "epoch": 0.15577507598784193,
- "grad_norm": 2.1925418376922607,
- "learning_rate": 4.999968417332415e-06,
- "loss": 0.5552488565444946,
- "mean_token_accuracy": 0.8216016292572021,
- "num_tokens": 1874837.0,
- "step": 205
- },
- {
- "epoch": 0.15653495440729484,
- "grad_norm": 2.2518117427825928,
- "learning_rate": 4.999957012512916e-06,
- "loss": 0.4912569522857666,
- "mean_token_accuracy": 0.8284667730331421,
- "num_tokens": 1881842.0,
- "step": 206
- },
- {
- "epoch": 0.15729483282674772,
- "grad_norm": 1.8223762512207031,
- "learning_rate": 4.999943853127351e-06,
- "loss": 0.47709137201309204,
- "mean_token_accuracy": 0.8311659097671509,
- "num_tokens": 1890805.0,
- "step": 207
- },
- {
- "epoch": 0.1580547112462006,
- "grad_norm": 2.066499948501587,
- "learning_rate": 4.999928939184958e-06,
- "loss": 0.44794657826423645,
- "mean_token_accuracy": 0.8513424396514893,
- "num_tokens": 1898264.0,
- "step": 208
- },
- {
- "epoch": 0.15881458966565348,
- "grad_norm": 3.53865909576416,
- "learning_rate": 4.999912270696202e-06,
- "loss": 0.5978270769119263,
- "mean_token_accuracy": 0.8080137968063354,
- "num_tokens": 1902435.0,
- "step": 209
- },
- {
- "epoch": 0.1595744680851064,
- "grad_norm": 2.0760679244995117,
- "learning_rate": 4.999893847672783e-06,
- "loss": 0.5930601358413696,
- "mean_token_accuracy": 0.8028650283813477,
- "num_tokens": 1912252.0,
- "step": 210
- },
- {
- "epoch": 0.16033434650455927,
- "grad_norm": 2.21551513671875,
- "learning_rate": 4.99987367012763e-06,
- "loss": 0.6336753964424133,
- "mean_token_accuracy": 0.7902286648750305,
- "num_tokens": 1922095.0,
- "step": 211
- },
- {
- "epoch": 0.16109422492401215,
- "grad_norm": 1.7654480934143066,
- "learning_rate": 4.999851738074904e-06,
- "loss": 0.6373403668403625,
- "mean_token_accuracy": 0.7802424430847168,
- "num_tokens": 1938962.0,
- "step": 212
- },
- {
- "epoch": 0.16185410334346503,
- "grad_norm": 2.852834701538086,
- "learning_rate": 4.9998280515300006e-06,
- "loss": 0.6418683528900146,
- "mean_token_accuracy": 0.7895716428756714,
- "num_tokens": 1944668.0,
- "step": 213
- },
- {
- "epoch": 0.16261398176291794,
- "grad_norm": 3.4737212657928467,
- "learning_rate": 4.999802610509541e-06,
- "loss": 0.6323273181915283,
- "mean_token_accuracy": 0.7982614636421204,
- "num_tokens": 1949142.0,
- "step": 214
- },
- {
- "epoch": 0.16337386018237082,
- "grad_norm": 3.0802664756774902,
- "learning_rate": 4.999775415031381e-06,
- "loss": 0.5929068326950073,
- "mean_token_accuracy": 0.8112219572067261,
- "num_tokens": 1954141.0,
- "step": 215
- },
- {
- "epoch": 0.1641337386018237,
- "grad_norm": 2.9808855056762695,
- "learning_rate": 4.999746465114609e-06,
- "loss": 0.5556406378746033,
- "mean_token_accuracy": 0.8117628693580627,
- "num_tokens": 1959406.0,
- "step": 216
- },
- {
- "epoch": 0.16489361702127658,
- "grad_norm": 1.7346166372299194,
- "learning_rate": 4.999715760779541e-06,
- "loss": 0.5122925043106079,
- "mean_token_accuracy": 0.8040724992752075,
- "num_tokens": 1971921.0,
- "step": 217
- },
- {
- "epoch": 0.1656534954407295,
- "grad_norm": 1.4183907508850098,
- "learning_rate": 4.999683302047729e-06,
- "loss": 0.46471893787384033,
- "mean_token_accuracy": 0.8381330966949463,
- "num_tokens": 1988863.0,
- "step": 218
- },
- {
- "epoch": 0.16641337386018237,
- "grad_norm": 1.6797802448272705,
- "learning_rate": 4.999649088941951e-06,
- "loss": 0.38348832726478577,
- "mean_token_accuracy": 0.8344278931617737,
- "num_tokens": 2000003.0,
- "step": 219
- },
- {
- "epoch": 0.16717325227963525,
- "grad_norm": 3.036963939666748,
- "learning_rate": 4.999613121486222e-06,
- "loss": 0.6062780618667603,
- "mean_token_accuracy": 0.8217900991439819,
- "num_tokens": 2004813.0,
- "step": 220
- },
- {
- "epoch": 0.16793313069908813,
- "grad_norm": 2.0343217849731445,
- "learning_rate": 4.999575399705782e-06,
- "loss": 0.5052450895309448,
- "mean_token_accuracy": 0.8368623852729797,
- "num_tokens": 2013565.0,
- "step": 221
- },
- {
- "epoch": 0.16869300911854104,
- "grad_norm": 2.1162009239196777,
- "learning_rate": 4.9995359236271094e-06,
- "loss": 0.5169756412506104,
- "mean_token_accuracy": 0.8339958190917969,
- "num_tokens": 2025763.0,
- "step": 222
- },
- {
- "epoch": 0.16945288753799392,
- "grad_norm": 2.055333375930786,
- "learning_rate": 4.9994946932779076e-06,
- "loss": 0.6327048540115356,
- "mean_token_accuracy": 0.8078711032867432,
- "num_tokens": 2037005.0,
- "step": 223
- },
- {
- "epoch": 0.1702127659574468,
- "grad_norm": 3.334620475769043,
- "learning_rate": 4.999451708687114e-06,
- "loss": 0.5688358545303345,
- "mean_token_accuracy": 0.8015589714050293,
- "num_tokens": 2041473.0,
- "step": 224
- },
- {
- "epoch": 0.17097264437689969,
- "grad_norm": 2.3734676837921143,
- "learning_rate": 4.999406969884897e-06,
- "loss": 0.5673821568489075,
- "mean_token_accuracy": 0.8054057359695435,
- "num_tokens": 2049397.0,
- "step": 225
- },
- {
- "epoch": 0.1717325227963526,
- "grad_norm": 1.807358980178833,
- "learning_rate": 4.999360476902656e-06,
- "loss": 0.4376158118247986,
- "mean_token_accuracy": 0.8456039428710938,
- "num_tokens": 2058721.0,
- "step": 226
- },
- {
- "epoch": 0.17249240121580547,
- "grad_norm": 3.231638193130493,
- "learning_rate": 4.999312229773022e-06,
- "loss": 0.5592809915542603,
- "mean_token_accuracy": 0.8170154094696045,
- "num_tokens": 2063455.0,
- "step": 227
- },
- {
- "epoch": 0.17325227963525835,
- "grad_norm": 2.2717151641845703,
- "learning_rate": 4.999262228529855e-06,
- "loss": 0.6144396066665649,
- "mean_token_accuracy": 0.7948470115661621,
- "num_tokens": 2071686.0,
- "step": 228
- },
- {
- "epoch": 0.17401215805471124,
- "grad_norm": 1.4171342849731445,
- "learning_rate": 4.99921047320825e-06,
- "loss": 0.43680912256240845,
- "mean_token_accuracy": 0.84850013256073,
- "num_tokens": 2086999.0,
- "step": 229
- },
- {
- "epoch": 0.17477203647416414,
- "grad_norm": 3.162736654281616,
- "learning_rate": 4.99915696384453e-06,
- "loss": 0.6025407910346985,
- "mean_token_accuracy": 0.8042335510253906,
- "num_tokens": 2092001.0,
- "step": 230
- },
- {
- "epoch": 0.17553191489361702,
- "grad_norm": 1.8672804832458496,
- "learning_rate": 4.99910170047625e-06,
- "loss": 0.5843087434768677,
- "mean_token_accuracy": 0.8016980886459351,
- "num_tokens": 2103372.0,
- "step": 231
- },
- {
- "epoch": 0.1762917933130699,
- "grad_norm": 2.967587471008301,
- "learning_rate": 4.999044683142196e-06,
- "loss": 0.5123642086982727,
- "mean_token_accuracy": 0.8216149806976318,
- "num_tokens": 2108008.0,
- "step": 232
- },
- {
- "epoch": 0.1770516717325228,
- "grad_norm": 1.9651981592178345,
- "learning_rate": 4.998985911882383e-06,
- "loss": 0.5868178606033325,
- "mean_token_accuracy": 0.7904198169708252,
- "num_tokens": 2119009.0,
- "step": 233
- },
- {
- "epoch": 0.1778115501519757,
- "grad_norm": 2.7785449028015137,
- "learning_rate": 4.998925386738063e-06,
- "loss": 0.5075510144233704,
- "mean_token_accuracy": 0.8280210494995117,
- "num_tokens": 2124915.0,
- "step": 234
- },
- {
- "epoch": 0.17857142857142858,
- "grad_norm": 2.957470417022705,
- "learning_rate": 4.998863107751711e-06,
- "loss": 0.5351958274841309,
- "mean_token_accuracy": 0.846825122833252,
- "num_tokens": 2129905.0,
- "step": 235
- },
- {
- "epoch": 0.17933130699088146,
- "grad_norm": 3.207671880722046,
- "learning_rate": 4.99879907496704e-06,
- "loss": 0.6209091544151306,
- "mean_token_accuracy": 0.789960503578186,
- "num_tokens": 2135027.0,
- "step": 236
- },
- {
- "epoch": 0.18009118541033434,
- "grad_norm": 2.018953800201416,
- "learning_rate": 4.998733288428987e-06,
- "loss": 0.601510763168335,
- "mean_token_accuracy": 0.8136930465698242,
- "num_tokens": 2147016.0,
- "step": 237
- },
- {
- "epoch": 0.18085106382978725,
- "grad_norm": 2.437281847000122,
- "learning_rate": 4.998665748183727e-06,
- "loss": 0.5813639163970947,
- "mean_token_accuracy": 0.8116716146469116,
- "num_tokens": 2155386.0,
- "step": 238
- },
- {
- "epoch": 0.18161094224924013,
- "grad_norm": 1.5708180665969849,
- "learning_rate": 4.998596454278661e-06,
- "loss": 0.5252395272254944,
- "mean_token_accuracy": 0.8193864822387695,
- "num_tokens": 2170295.0,
- "step": 239
- },
- {
- "epoch": 0.182370820668693,
- "grad_norm": 1.9921495914459229,
- "learning_rate": 4.998525406762422e-06,
- "loss": 0.5335029363632202,
- "mean_token_accuracy": 0.8120872974395752,
- "num_tokens": 2180012.0,
- "step": 240
- },
- {
- "epoch": 0.1831306990881459,
- "grad_norm": 2.6562681198120117,
- "learning_rate": 4.998452605684874e-06,
- "loss": 0.48021435737609863,
- "mean_token_accuracy": 0.8388714790344238,
- "num_tokens": 2185607.0,
- "step": 241
- },
- {
- "epoch": 0.1838905775075988,
- "grad_norm": 2.2535853385925293,
- "learning_rate": 4.998378051097111e-06,
- "loss": 0.5747300386428833,
- "mean_token_accuracy": 0.8004639148712158,
- "num_tokens": 2194105.0,
- "step": 242
- },
- {
- "epoch": 0.18465045592705168,
- "grad_norm": 1.6151788234710693,
- "learning_rate": 4.998301743051459e-06,
- "loss": 0.6190565824508667,
- "mean_token_accuracy": 0.7816627621650696,
- "num_tokens": 2210629.0,
- "step": 243
- },
- {
- "epoch": 0.18541033434650456,
- "grad_norm": 2.1088173389434814,
- "learning_rate": 4.9982236816014735e-06,
- "loss": 0.4715560972690582,
- "mean_token_accuracy": 0.8485721349716187,
- "num_tokens": 2218958.0,
- "step": 244
- },
- {
- "epoch": 0.18617021276595744,
- "grad_norm": 2.6168735027313232,
- "learning_rate": 4.998143866801941e-06,
- "loss": 0.6077103018760681,
- "mean_token_accuracy": 0.8057924509048462,
- "num_tokens": 2226368.0,
- "step": 245
- },
- {
- "epoch": 0.18693009118541035,
- "grad_norm": 2.5988616943359375,
- "learning_rate": 4.99806229870888e-06,
- "loss": 0.5021637678146362,
- "mean_token_accuracy": 0.8361666202545166,
- "num_tokens": 2232485.0,
- "step": 246
- },
- {
- "epoch": 0.18768996960486323,
- "grad_norm": 2.015887498855591,
- "learning_rate": 4.9979789773795365e-06,
- "loss": 0.4309737980365753,
- "mean_token_accuracy": 0.8508044481277466,
- "num_tokens": 2240819.0,
- "step": 247
- },
- {
- "epoch": 0.1884498480243161,
- "grad_norm": 2.3115265369415283,
- "learning_rate": 4.997893902872389e-06,
- "loss": 0.5776500701904297,
- "mean_token_accuracy": 0.8079549074172974,
- "num_tokens": 2249460.0,
- "step": 248
- },
- {
- "epoch": 0.189209726443769,
- "grad_norm": 1.7387021780014038,
- "learning_rate": 4.997807075247147e-06,
- "loss": 0.430944561958313,
- "mean_token_accuracy": 0.8483544588088989,
- "num_tokens": 2259124.0,
- "step": 249
- },
- {
- "epoch": 0.1899696048632219,
- "grad_norm": 1.6378381252288818,
- "learning_rate": 4.997718494564747e-06,
- "loss": 0.4123363792896271,
- "mean_token_accuracy": 0.8557409644126892,
- "num_tokens": 2269899.0,
- "step": 250
- },
- {
- "epoch": 0.19072948328267478,
- "grad_norm": 1.336282730102539,
- "learning_rate": 4.997628160887361e-06,
- "loss": 0.502329409122467,
- "mean_token_accuracy": 0.8186938166618347,
- "num_tokens": 2292821.0,
- "step": 251
- },
- {
- "epoch": 0.19148936170212766,
- "grad_norm": 3.3335583209991455,
- "learning_rate": 4.997536074278388e-06,
- "loss": 0.584446907043457,
- "mean_token_accuracy": 0.8062717318534851,
- "num_tokens": 2297175.0,
- "step": 252
- },
- {
- "epoch": 0.19224924012158054,
- "grad_norm": 2.246727228164673,
- "learning_rate": 4.9974422348024565e-06,
- "loss": 0.5683060884475708,
- "mean_token_accuracy": 0.8193703293800354,
- "num_tokens": 2305456.0,
- "step": 253
- },
- {
- "epoch": 0.19300911854103345,
- "grad_norm": 2.3520865440368652,
- "learning_rate": 4.997346642525429e-06,
- "loss": 0.4724946618080139,
- "mean_token_accuracy": 0.8426719307899475,
- "num_tokens": 2312241.0,
- "step": 254
- },
- {
- "epoch": 0.19376899696048633,
- "grad_norm": 2.7115702629089355,
- "learning_rate": 4.9972492975143936e-06,
- "loss": 0.5019032955169678,
- "mean_token_accuracy": 0.8253573179244995,
- "num_tokens": 2318094.0,
- "step": 255
- },
- {
- "epoch": 0.1945288753799392,
- "grad_norm": 1.705528974533081,
- "learning_rate": 4.997150199837671e-06,
- "loss": 0.45588475465774536,
- "mean_token_accuracy": 0.836666464805603,
- "num_tokens": 2329025.0,
- "step": 256
- },
- {
- "epoch": 0.1952887537993921,
- "grad_norm": 2.161400318145752,
- "learning_rate": 4.997049349564814e-06,
- "loss": 0.5170183777809143,
- "mean_token_accuracy": 0.8287534117698669,
- "num_tokens": 2337448.0,
- "step": 257
- },
- {
- "epoch": 0.196048632218845,
- "grad_norm": 2.629669189453125,
- "learning_rate": 4.996946746766602e-06,
- "loss": 0.44650501012802124,
- "mean_token_accuracy": 0.850114107131958,
- "num_tokens": 2343207.0,
- "step": 258
- },
- {
- "epoch": 0.19680851063829788,
- "grad_norm": 1.6735503673553467,
- "learning_rate": 4.996842391515045e-06,
- "loss": 0.5247820019721985,
- "mean_token_accuracy": 0.8285071849822998,
- "num_tokens": 2356801.0,
- "step": 259
- },
- {
- "epoch": 0.19756838905775076,
- "grad_norm": 1.2753115892410278,
- "learning_rate": 4.996736283883382e-06,
- "loss": 0.41870927810668945,
- "mean_token_accuracy": 0.8448047637939453,
- "num_tokens": 2377306.0,
- "step": 260
- },
- {
- "epoch": 0.19832826747720364,
- "grad_norm": 2.6947314739227295,
- "learning_rate": 4.9966284239460875e-06,
- "loss": 0.5059205889701843,
- "mean_token_accuracy": 0.8430814743041992,
- "num_tokens": 2383352.0,
- "step": 261
- },
- {
- "epoch": 0.19908814589665655,
- "grad_norm": 2.0509963035583496,
- "learning_rate": 4.996518811778858e-06,
- "loss": 0.4565388560295105,
- "mean_token_accuracy": 0.8453130722045898,
- "num_tokens": 2391149.0,
- "step": 262
- },
- {
- "epoch": 0.19984802431610943,
- "grad_norm": 2.1856348514556885,
- "learning_rate": 4.996407447458626e-06,
- "loss": 0.531380832195282,
- "mean_token_accuracy": 0.8387004137039185,
- "num_tokens": 2399875.0,
- "step": 263
- },
- {
- "epoch": 0.2006079027355623,
- "grad_norm": 2.7348573207855225,
- "learning_rate": 4.99629433106355e-06,
- "loss": 0.5242817401885986,
- "mean_token_accuracy": 0.8177423477172852,
- "num_tokens": 2406586.0,
- "step": 264
- },
- {
- "epoch": 0.2013677811550152,
- "grad_norm": 1.76587975025177,
- "learning_rate": 4.99617946267302e-06,
- "loss": 0.49298471212387085,
- "mean_token_accuracy": 0.8271149396896362,
- "num_tokens": 2418683.0,
- "step": 265
- },
- {
- "epoch": 0.20212765957446807,
- "grad_norm": 2.8129730224609375,
- "learning_rate": 4.996062842367655e-06,
- "loss": 0.46420302987098694,
- "mean_token_accuracy": 0.8453244566917419,
- "num_tokens": 2422929.0,
- "step": 266
- },
- {
- "epoch": 0.20288753799392098,
- "grad_norm": 2.575744152069092,
- "learning_rate": 4.9959444702293025e-06,
- "loss": 0.43208545446395874,
- "mean_token_accuracy": 0.8494843244552612,
- "num_tokens": 2429567.0,
- "step": 267
- },
- {
- "epoch": 0.20364741641337386,
- "grad_norm": 2.7586750984191895,
- "learning_rate": 4.995824346341041e-06,
- "loss": 0.4390473961830139,
- "mean_token_accuracy": 0.8348895311355591,
- "num_tokens": 2434700.0,
- "step": 268
- },
- {
- "epoch": 0.20440729483282674,
- "grad_norm": 1.972145438194275,
- "learning_rate": 4.99570247078718e-06,
- "loss": 0.6219544410705566,
- "mean_token_accuracy": 0.7939999103546143,
- "num_tokens": 2447007.0,
- "step": 269
- },
- {
- "epoch": 0.20516717325227962,
- "grad_norm": 2.2963485717773438,
- "learning_rate": 4.995578843653255e-06,
- "loss": 0.5008970499038696,
- "mean_token_accuracy": 0.8255308866500854,
- "num_tokens": 2453936.0,
- "step": 270
- },
- {
- "epoch": 0.20592705167173253,
- "grad_norm": 1.8897721767425537,
- "learning_rate": 4.995453465026033e-06,
- "loss": 0.5436089038848877,
- "mean_token_accuracy": 0.819086492061615,
- "num_tokens": 2464494.0,
- "step": 271
- },
- {
- "epoch": 0.2066869300911854,
- "grad_norm": 2.319728374481201,
- "learning_rate": 4.995326334993508e-06,
- "loss": 0.5136368870735168,
- "mean_token_accuracy": 0.820817232131958,
- "num_tokens": 2470938.0,
- "step": 272
- },
- {
- "epoch": 0.2074468085106383,
- "grad_norm": 2.230414390563965,
- "learning_rate": 4.9951974536449055e-06,
- "loss": 0.5272846817970276,
- "mean_token_accuracy": 0.8203279972076416,
- "num_tokens": 2478629.0,
- "step": 273
- },
- {
- "epoch": 0.20820668693009117,
- "grad_norm": 3.401937484741211,
- "learning_rate": 4.9950668210706795e-06,
- "loss": 0.4389592111110687,
- "mean_token_accuracy": 0.8647899031639099,
- "num_tokens": 2482193.0,
- "step": 274
- },
- {
- "epoch": 0.20896656534954408,
- "grad_norm": 2.1278507709503174,
- "learning_rate": 4.994934437362513e-06,
- "loss": 0.598863422870636,
- "mean_token_accuracy": 0.7945119738578796,
- "num_tokens": 2492465.0,
- "step": 275
- },
- {
- "epoch": 0.20972644376899696,
- "grad_norm": 1.9259960651397705,
- "learning_rate": 4.994800302613318e-06,
- "loss": 0.49520939588546753,
- "mean_token_accuracy": 0.8371536135673523,
- "num_tokens": 2500825.0,
- "step": 276
- },
- {
- "epoch": 0.21048632218844984,
- "grad_norm": 2.346418857574463,
- "learning_rate": 4.994664416917236e-06,
- "loss": 0.5412614345550537,
- "mean_token_accuracy": 0.810661792755127,
- "num_tokens": 2509513.0,
- "step": 277
- },
- {
- "epoch": 0.21124620060790272,
- "grad_norm": 1.3092039823532104,
- "learning_rate": 4.994526780369636e-06,
- "loss": 0.46305379271507263,
- "mean_token_accuracy": 0.8358527421951294,
- "num_tokens": 2531405.0,
- "step": 278
- },
- {
- "epoch": 0.21200607902735563,
- "grad_norm": 2.924611806869507,
- "learning_rate": 4.9943873930671175e-06,
- "loss": 0.6134544610977173,
- "mean_token_accuracy": 0.7947378754615784,
- "num_tokens": 2536744.0,
- "step": 279
- },
- {
- "epoch": 0.2127659574468085,
- "grad_norm": 2.8290598392486572,
- "learning_rate": 4.994246255107506e-06,
- "loss": 0.465520441532135,
- "mean_token_accuracy": 0.8440108299255371,
- "num_tokens": 2541184.0,
- "step": 280
- },
- {
- "epoch": 0.2135258358662614,
- "grad_norm": 3.8081259727478027,
- "learning_rate": 4.994103366589859e-06,
- "loss": 0.43394139409065247,
- "mean_token_accuracy": 0.8579148054122925,
- "num_tokens": 2545395.0,
- "step": 281
- },
- {
- "epoch": 0.21428571428571427,
- "grad_norm": 1.7994529008865356,
- "learning_rate": 4.993958727614462e-06,
- "loss": 0.5076484680175781,
- "mean_token_accuracy": 0.8270803093910217,
- "num_tokens": 2556541.0,
- "step": 282
- },
- {
- "epoch": 0.21504559270516718,
- "grad_norm": 2.5582659244537354,
- "learning_rate": 4.993812338282826e-06,
- "loss": 0.4453684389591217,
- "mean_token_accuracy": 0.8488293886184692,
- "num_tokens": 2562949.0,
- "step": 283
- },
- {
- "epoch": 0.21580547112462006,
- "grad_norm": 1.6448938846588135,
- "learning_rate": 4.993664198697694e-06,
- "loss": 0.461971640586853,
- "mean_token_accuracy": 0.824763298034668,
- "num_tokens": 2576407.0,
- "step": 284
- },
- {
- "epoch": 0.21656534954407294,
- "grad_norm": 2.1264469623565674,
- "learning_rate": 4.993514308963037e-06,
- "loss": 0.6241602897644043,
- "mean_token_accuracy": 0.7916014790534973,
- "num_tokens": 2585695.0,
- "step": 285
- },
- {
- "epoch": 0.21732522796352582,
- "grad_norm": 3.629991292953491,
- "learning_rate": 4.993362669184051e-06,
- "loss": 0.610355019569397,
- "mean_token_accuracy": 0.7847568988800049,
- "num_tokens": 2589778.0,
- "step": 286
- },
- {
- "epoch": 0.21808510638297873,
- "grad_norm": 1.9070756435394287,
- "learning_rate": 4.993209279467164e-06,
- "loss": 0.5513623952865601,
- "mean_token_accuracy": 0.7911607027053833,
- "num_tokens": 2600920.0,
- "step": 287
- },
- {
- "epoch": 0.2188449848024316,
- "grad_norm": 1.761062741279602,
- "learning_rate": 4.993054139920031e-06,
- "loss": 0.4579957127571106,
- "mean_token_accuracy": 0.8189530372619629,
- "num_tokens": 2611856.0,
- "step": 288
- },
- {
- "epoch": 0.2196048632218845,
- "grad_norm": 1.7264713048934937,
- "learning_rate": 4.992897250651535e-06,
- "loss": 0.5871305465698242,
- "mean_token_accuracy": 0.7918527126312256,
- "num_tokens": 2624730.0,
- "step": 289
- },
- {
- "epoch": 0.22036474164133737,
- "grad_norm": 1.7455977201461792,
- "learning_rate": 4.992738611771787e-06,
- "loss": 0.5475119948387146,
- "mean_token_accuracy": 0.8226917386054993,
- "num_tokens": 2635705.0,
- "step": 290
- },
- {
- "epoch": 0.22112462006079028,
- "grad_norm": 2.095095157623291,
- "learning_rate": 4.992578223392124e-06,
- "loss": 0.5952225923538208,
- "mean_token_accuracy": 0.8078469038009644,
- "num_tokens": 2643954.0,
- "step": 291
- },
- {
- "epoch": 0.22188449848024316,
- "grad_norm": 2.994664192199707,
- "learning_rate": 4.992416085625115e-06,
- "loss": 0.5432442426681519,
- "mean_token_accuracy": 0.8329008221626282,
- "num_tokens": 2648800.0,
- "step": 292
- },
- {
- "epoch": 0.22264437689969604,
- "grad_norm": 2.796790361404419,
- "learning_rate": 4.992252198584554e-06,
- "loss": 0.5168961882591248,
- "mean_token_accuracy": 0.8393474817276001,
- "num_tokens": 2653546.0,
- "step": 293
- },
- {
- "epoch": 0.22340425531914893,
- "grad_norm": 1.8610522747039795,
- "learning_rate": 4.992086562385462e-06,
- "loss": 0.5728024244308472,
- "mean_token_accuracy": 0.797406792640686,
- "num_tokens": 2667483.0,
- "step": 294
- },
- {
- "epoch": 0.22416413373860183,
- "grad_norm": 1.695472002029419,
- "learning_rate": 4.9919191771440905e-06,
- "loss": 0.5460028648376465,
- "mean_token_accuracy": 0.8123016357421875,
- "num_tokens": 2683574.0,
- "step": 295
- },
- {
- "epoch": 0.22492401215805471,
- "grad_norm": 2.8627376556396484,
- "learning_rate": 4.9917500429779165e-06,
- "loss": 0.5566985011100769,
- "mean_token_accuracy": 0.815531313419342,
- "num_tokens": 2688985.0,
- "step": 296
- },
- {
- "epoch": 0.2256838905775076,
- "grad_norm": 2.73323655128479,
- "learning_rate": 4.991579160005644e-06,
- "loss": 0.48197102546691895,
- "mean_token_accuracy": 0.8471829295158386,
- "num_tokens": 2694799.0,
- "step": 297
- },
- {
- "epoch": 0.22644376899696048,
- "grad_norm": 1.8436161279678345,
- "learning_rate": 4.991406528347206e-06,
- "loss": 0.4528339207172394,
- "mean_token_accuracy": 0.8603188395500183,
- "num_tokens": 2707321.0,
- "step": 298
- },
- {
- "epoch": 0.22720364741641338,
- "grad_norm": 2.6231515407562256,
- "learning_rate": 4.9912321481237616e-06,
- "loss": 0.5916541814804077,
- "mean_token_accuracy": 0.8050242066383362,
- "num_tokens": 2714233.0,
- "step": 299
- },
- {
- "epoch": 0.22796352583586627,
- "grad_norm": 3.08776593208313,
- "learning_rate": 4.991056019457697e-06,
- "loss": 0.4860580563545227,
- "mean_token_accuracy": 0.8464088439941406,
- "num_tokens": 2718443.0,
- "step": 300
- },
- {
- "epoch": 0.22872340425531915,
- "grad_norm": 2.2537803649902344,
- "learning_rate": 4.990878142472628e-06,
- "loss": 0.5158311128616333,
- "mean_token_accuracy": 0.824694812297821,
- "num_tokens": 2726158.0,
- "step": 301
- },
- {
- "epoch": 0.22948328267477203,
- "grad_norm": 2.1122705936431885,
- "learning_rate": 4.990698517293394e-06,
- "loss": 0.495265394449234,
- "mean_token_accuracy": 0.8343238830566406,
- "num_tokens": 2735022.0,
- "step": 302
- },
- {
- "epoch": 0.23024316109422494,
- "grad_norm": 3.5503528118133545,
- "learning_rate": 4.9905171440460645e-06,
- "loss": 0.46063232421875,
- "mean_token_accuracy": 0.8420047760009766,
- "num_tokens": 2738550.0,
- "step": 303
- },
- {
- "epoch": 0.23100303951367782,
- "grad_norm": 3.9858486652374268,
- "learning_rate": 4.990334022857932e-06,
- "loss": 0.5832710266113281,
- "mean_token_accuracy": 0.8144199848175049,
- "num_tokens": 2741720.0,
- "step": 304
- },
- {
- "epoch": 0.2317629179331307,
- "grad_norm": 2.407231330871582,
- "learning_rate": 4.990149153857519e-06,
- "loss": 0.4692630171775818,
- "mean_token_accuracy": 0.8429223299026489,
- "num_tokens": 2748693.0,
- "step": 305
- },
- {
- "epoch": 0.23252279635258358,
- "grad_norm": 1.6996397972106934,
- "learning_rate": 4.989962537174573e-06,
- "loss": 0.49143946170806885,
- "mean_token_accuracy": 0.8340128064155579,
- "num_tokens": 2761254.0,
- "step": 306
- },
- {
- "epoch": 0.23328267477203649,
- "grad_norm": 3.746432065963745,
- "learning_rate": 4.989774172940071e-06,
- "loss": 0.6282026767730713,
- "mean_token_accuracy": 0.775698184967041,
- "num_tokens": 2765115.0,
- "step": 307
- },
- {
- "epoch": 0.23404255319148937,
- "grad_norm": 2.212872266769409,
- "learning_rate": 4.989584061286211e-06,
- "loss": 0.5193763971328735,
- "mean_token_accuracy": 0.8168246746063232,
- "num_tokens": 2772345.0,
- "step": 308
- },
- {
- "epoch": 0.23480243161094225,
- "grad_norm": 1.752297282218933,
- "learning_rate": 4.989392202346423e-06,
- "loss": 0.4437984824180603,
- "mean_token_accuracy": 0.8451256155967712,
- "num_tokens": 2783072.0,
- "step": 309
- },
- {
- "epoch": 0.23556231003039513,
- "grad_norm": 2.386019706726074,
- "learning_rate": 4.989198596255361e-06,
- "loss": 0.4090752899646759,
- "mean_token_accuracy": 0.8480085134506226,
- "num_tokens": 2788757.0,
- "step": 310
- },
- {
- "epoch": 0.23632218844984804,
- "grad_norm": 3.9981489181518555,
- "learning_rate": 4.989003243148904e-06,
- "loss": 0.5149132013320923,
- "mean_token_accuracy": 0.8179056644439697,
- "num_tokens": 2792096.0,
- "step": 311
- },
- {
- "epoch": 0.23708206686930092,
- "grad_norm": 1.8723100423812866,
- "learning_rate": 4.988806143164159e-06,
- "loss": 0.4531487822532654,
- "mean_token_accuracy": 0.8400167226791382,
- "num_tokens": 2802210.0,
- "step": 312
- },
- {
- "epoch": 0.2378419452887538,
- "grad_norm": 2.3415136337280273,
- "learning_rate": 4.988607296439459e-06,
- "loss": 0.5974439978599548,
- "mean_token_accuracy": 0.8035976886749268,
- "num_tokens": 2810088.0,
- "step": 313
- },
- {
- "epoch": 0.23860182370820668,
- "grad_norm": 1.5317577123641968,
- "learning_rate": 4.98840670311436e-06,
- "loss": 0.49247145652770996,
- "mean_token_accuracy": 0.8292540311813354,
- "num_tokens": 2824005.0,
- "step": 314
- },
- {
- "epoch": 0.2393617021276596,
- "grad_norm": 2.170772075653076,
- "learning_rate": 4.988204363329648e-06,
- "loss": 0.6359974145889282,
- "mean_token_accuracy": 0.7785564661026001,
- "num_tokens": 2834680.0,
- "step": 315
- },
- {
- "epoch": 0.24012158054711247,
- "grad_norm": 3.2655932903289795,
- "learning_rate": 4.988000277227334e-06,
- "loss": 0.5080196857452393,
- "mean_token_accuracy": 0.8295877575874329,
- "num_tokens": 2838735.0,
- "step": 316
- },
- {
- "epoch": 0.24088145896656535,
- "grad_norm": 3.406589984893799,
- "learning_rate": 4.987794444950651e-06,
- "loss": 0.3939085006713867,
- "mean_token_accuracy": 0.8700719475746155,
- "num_tokens": 2842127.0,
- "step": 317
- },
- {
- "epoch": 0.24164133738601823,
- "grad_norm": 1.8211106061935425,
- "learning_rate": 4.987586866644061e-06,
- "loss": 0.5270540118217468,
- "mean_token_accuracy": 0.826683521270752,
- "num_tokens": 2853656.0,
- "step": 318
- },
- {
- "epoch": 0.24240121580547114,
- "grad_norm": 1.8429969549179077,
- "learning_rate": 4.9873775424532515e-06,
- "loss": 0.4705049991607666,
- "mean_token_accuracy": 0.8355701565742493,
- "num_tokens": 2863513.0,
- "step": 319
- },
- {
- "epoch": 0.24316109422492402,
- "grad_norm": 2.2425320148468018,
- "learning_rate": 4.9871664725251314e-06,
- "loss": 0.485736608505249,
- "mean_token_accuracy": 0.835182785987854,
- "num_tokens": 2871556.0,
- "step": 320
- },
- {
- "epoch": 0.2439209726443769,
- "grad_norm": 1.6202056407928467,
- "learning_rate": 4.986953657007841e-06,
- "loss": 0.4437887370586395,
- "mean_token_accuracy": 0.8282591700553894,
- "num_tokens": 2884335.0,
- "step": 321
- },
- {
- "epoch": 0.24468085106382978,
- "grad_norm": 1.1027268171310425,
- "learning_rate": 4.98673909605074e-06,
- "loss": 0.3770800828933716,
- "mean_token_accuracy": 0.8325437307357788,
- "num_tokens": 2904286.0,
- "step": 322
- },
- {
- "epoch": 0.2454407294832827,
- "grad_norm": 2.3239076137542725,
- "learning_rate": 4.986522789804417e-06,
- "loss": 0.5387254953384399,
- "mean_token_accuracy": 0.806242823600769,
- "num_tokens": 2910975.0,
- "step": 323
- },
- {
- "epoch": 0.24620060790273557,
- "grad_norm": 2.243482828140259,
- "learning_rate": 4.986304738420684e-06,
- "loss": 0.4396553039550781,
- "mean_token_accuracy": 0.8561904430389404,
- "num_tokens": 2917087.0,
- "step": 324
- },
- {
- "epoch": 0.24696048632218845,
- "grad_norm": 2.537264347076416,
- "learning_rate": 4.986084942052577e-06,
- "loss": 0.395110160112381,
- "mean_token_accuracy": 0.8636915683746338,
- "num_tokens": 2921887.0,
- "step": 325
- },
- {
- "epoch": 0.24772036474164133,
- "grad_norm": 2.319399118423462,
- "learning_rate": 4.9858634008543574e-06,
- "loss": 0.581517219543457,
- "mean_token_accuracy": 0.8157487511634827,
- "num_tokens": 2928996.0,
- "step": 326
- },
- {
- "epoch": 0.24848024316109424,
- "grad_norm": 1.9787474870681763,
- "learning_rate": 4.985640114981513e-06,
- "loss": 0.5084106922149658,
- "mean_token_accuracy": 0.835221529006958,
- "num_tokens": 2940302.0,
- "step": 327
- },
- {
- "epoch": 0.24924012158054712,
- "grad_norm": 2.4783265590667725,
- "learning_rate": 4.985415084590752e-06,
- "loss": 0.6062222719192505,
- "mean_token_accuracy": 0.7885516285896301,
- "num_tokens": 2946386.0,
- "step": 328
- },
- {
- "epoch": 0.25,
- "grad_norm": 2.4081411361694336,
- "learning_rate": 4.985188309840012e-06,
- "loss": 0.5079880356788635,
- "mean_token_accuracy": 0.8313904404640198,
- "num_tokens": 2952323.0,
- "step": 329
- },
- {
- "epoch": 0.2507598784194529,
- "grad_norm": 2.64993953704834,
- "learning_rate": 4.984959790888451e-06,
- "loss": 0.5461447834968567,
- "mean_token_accuracy": 0.8125468492507935,
- "num_tokens": 2958119.0,
- "step": 330
- },
- {
- "epoch": 0.25151975683890576,
- "grad_norm": 2.549734115600586,
- "learning_rate": 4.984729527896451e-06,
- "loss": 0.5998573303222656,
- "mean_token_accuracy": 0.8076666593551636,
- "num_tokens": 2964947.0,
- "step": 331
- },
- {
- "epoch": 0.25227963525835867,
- "grad_norm": 3.2185161113739014,
- "learning_rate": 4.984497521025622e-06,
- "loss": 0.4232945442199707,
- "mean_token_accuracy": 0.8543803095817566,
- "num_tokens": 2968598.0,
- "step": 332
- },
- {
- "epoch": 0.2530395136778115,
- "grad_norm": 2.588994264602661,
- "learning_rate": 4.984263770438793e-06,
- "loss": 0.460967481136322,
- "mean_token_accuracy": 0.8416207432746887,
- "num_tokens": 2974510.0,
- "step": 333
- },
- {
- "epoch": 0.25379939209726443,
- "grad_norm": 2.1373162269592285,
- "learning_rate": 4.984028276300021e-06,
- "loss": 0.49382102489471436,
- "mean_token_accuracy": 0.8388048410415649,
- "num_tokens": 2981632.0,
- "step": 334
- },
- {
- "epoch": 0.25455927051671734,
- "grad_norm": 2.2524826526641846,
- "learning_rate": 4.983791038774585e-06,
- "loss": 0.4947671890258789,
- "mean_token_accuracy": 0.8066365122795105,
- "num_tokens": 2988736.0,
- "step": 335
- },
- {
- "epoch": 0.2553191489361702,
- "grad_norm": 1.7244199514389038,
- "learning_rate": 4.983552058028985e-06,
- "loss": 0.48096776008605957,
- "mean_token_accuracy": 0.830735445022583,
- "num_tokens": 3003576.0,
- "step": 336
- },
- {
- "epoch": 0.2560790273556231,
- "grad_norm": 3.0628933906555176,
- "learning_rate": 4.9833113342309495e-06,
- "loss": 0.6027032136917114,
- "mean_token_accuracy": 0.8008694648742676,
- "num_tokens": 3009549.0,
- "step": 337
- },
- {
- "epoch": 0.256838905775076,
- "grad_norm": 2.438674211502075,
- "learning_rate": 4.983068867549427e-06,
- "loss": 0.517090916633606,
- "mean_token_accuracy": 0.827893853187561,
- "num_tokens": 3015236.0,
- "step": 338
- },
- {
- "epoch": 0.25759878419452886,
- "grad_norm": 2.131535053253174,
- "learning_rate": 4.982824658154589e-06,
- "loss": 0.6656812429428101,
- "mean_token_accuracy": 0.7772425413131714,
- "num_tokens": 3028142.0,
- "step": 339
- },
- {
- "epoch": 0.25835866261398177,
- "grad_norm": 2.3206584453582764,
- "learning_rate": 4.9825787062178315e-06,
- "loss": 0.5757625699043274,
- "mean_token_accuracy": 0.8073873519897461,
- "num_tokens": 3040996.0,
- "step": 340
- },
- {
- "epoch": 0.2591185410334346,
- "grad_norm": 1.3905521631240845,
- "learning_rate": 4.982331011911774e-06,
- "loss": 0.4193805456161499,
- "mean_token_accuracy": 0.8399466872215271,
- "num_tokens": 3061931.0,
- "step": 341
- },
- {
- "epoch": 0.25987841945288753,
- "grad_norm": 2.184173345565796,
- "learning_rate": 4.982081575410256e-06,
- "loss": 0.4751223921775818,
- "mean_token_accuracy": 0.8409271240234375,
- "num_tokens": 3069081.0,
- "step": 342
- },
- {
- "epoch": 0.26063829787234044,
- "grad_norm": 3.538764238357544,
- "learning_rate": 4.9818303968883445e-06,
- "loss": 0.8119601011276245,
- "mean_token_accuracy": 0.7442739009857178,
- "num_tokens": 3073628.0,
- "step": 343
- },
- {
- "epoch": 0.2613981762917933,
- "grad_norm": 1.8063762187957764,
- "learning_rate": 4.981577476522323e-06,
- "loss": 0.5615730881690979,
- "mean_token_accuracy": 0.8207751512527466,
- "num_tokens": 3086596.0,
- "step": 344
- },
- {
- "epoch": 0.2621580547112462,
- "grad_norm": 2.4346961975097656,
- "learning_rate": 4.981322814489703e-06,
- "loss": 0.5266709327697754,
- "mean_token_accuracy": 0.8211277723312378,
- "num_tokens": 3092631.0,
- "step": 345
- },
- {
- "epoch": 0.2629179331306991,
- "grad_norm": 1.91289484500885,
- "learning_rate": 4.981066410969215e-06,
- "loss": 0.5047177672386169,
- "mean_token_accuracy": 0.8356877565383911,
- "num_tokens": 3101102.0,
- "step": 346
- },
- {
- "epoch": 0.26367781155015196,
- "grad_norm": 2.1495707035064697,
- "learning_rate": 4.980808266140813e-06,
- "loss": 0.47876280546188354,
- "mean_token_accuracy": 0.8364313244819641,
- "num_tokens": 3107998.0,
- "step": 347
- },
- {
- "epoch": 0.26443768996960487,
- "grad_norm": 2.5961992740631104,
- "learning_rate": 4.9805483801856744e-06,
- "loss": 0.5512958765029907,
- "mean_token_accuracy": 0.8181467652320862,
- "num_tokens": 3113848.0,
- "step": 348
- },
- {
- "epoch": 0.2651975683890577,
- "grad_norm": 3.2828900814056396,
- "learning_rate": 4.980286753286196e-06,
- "loss": 0.4217945635318756,
- "mean_token_accuracy": 0.8617103099822998,
- "num_tokens": 3117652.0,
- "step": 349
- },
- {
- "epoch": 0.26595744680851063,
- "grad_norm": 1.425554871559143,
- "learning_rate": 4.980023385625996e-06,
- "loss": 0.4042487144470215,
- "mean_token_accuracy": 0.8492785692214966,
- "num_tokens": 3132336.0,
- "step": 350
- },
- {
- "epoch": 0.26671732522796354,
- "grad_norm": 2.933504104614258,
- "learning_rate": 4.979758277389919e-06,
- "loss": 0.5406704545021057,
- "mean_token_accuracy": 0.8035423755645752,
- "num_tokens": 3137544.0,
- "step": 351
- },
- {
- "epoch": 0.2674772036474164,
- "grad_norm": 1.9958966970443726,
- "learning_rate": 4.9794914287640264e-06,
- "loss": 0.5857555270195007,
- "mean_token_accuracy": 0.7965140342712402,
- "num_tokens": 3149705.0,
- "step": 352
- },
- {
- "epoch": 0.2682370820668693,
- "grad_norm": 2.467694044113159,
- "learning_rate": 4.979222839935602e-06,
- "loss": 0.6404043436050415,
- "mean_token_accuracy": 0.7823755741119385,
- "num_tokens": 3158353.0,
- "step": 353
- },
- {
- "epoch": 0.2689969604863222,
- "grad_norm": 2.0102720260620117,
- "learning_rate": 4.9789525110931545e-06,
- "loss": 0.5681496858596802,
- "mean_token_accuracy": 0.8108169436454773,
- "num_tokens": 3167121.0,
- "step": 354
- },
- {
- "epoch": 0.26975683890577506,
- "grad_norm": 2.6017866134643555,
- "learning_rate": 4.978680442426409e-06,
- "loss": 0.6309828162193298,
- "mean_token_accuracy": 0.7742617130279541,
- "num_tokens": 3175012.0,
- "step": 355
- },
- {
- "epoch": 0.270516717325228,
- "grad_norm": 1.8799268007278442,
- "learning_rate": 4.978406634126315e-06,
- "loss": 0.524029016494751,
- "mean_token_accuracy": 0.8317689895629883,
- "num_tokens": 3185331.0,
- "step": 356
- },
- {
- "epoch": 0.2712765957446808,
- "grad_norm": 1.508332371711731,
- "learning_rate": 4.978131086385041e-06,
- "loss": 0.46656402945518494,
- "mean_token_accuracy": 0.8339117765426636,
- "num_tokens": 3198813.0,
- "step": 357
- },
- {
- "epoch": 0.27203647416413373,
- "grad_norm": 3.595707654953003,
- "learning_rate": 4.977853799395976e-06,
- "loss": 0.5101234912872314,
- "mean_token_accuracy": 0.8251723051071167,
- "num_tokens": 3206557.0,
- "step": 358
- },
- {
- "epoch": 0.27279635258358664,
- "grad_norm": 3.5317916870117188,
- "learning_rate": 4.977574773353732e-06,
- "loss": 0.5684665441513062,
- "mean_token_accuracy": 0.8124493360519409,
- "num_tokens": 3210912.0,
- "step": 359
- },
- {
- "epoch": 0.2735562310030395,
- "grad_norm": 2.8606204986572266,
- "learning_rate": 4.97729400845414e-06,
- "loss": 0.4746384620666504,
- "mean_token_accuracy": 0.8195606470108032,
- "num_tokens": 3215365.0,
- "step": 360
- },
- {
- "epoch": 0.2743161094224924,
- "grad_norm": 1.8214033842086792,
- "learning_rate": 4.977011504894253e-06,
- "loss": 0.4842769503593445,
- "mean_token_accuracy": 0.82928866147995,
- "num_tokens": 3224037.0,
- "step": 361
- },
- {
- "epoch": 0.2750759878419453,
- "grad_norm": 1.628746509552002,
- "learning_rate": 4.97672726287234e-06,
- "loss": 0.4397493302822113,
- "mean_token_accuracy": 0.8606528043746948,
- "num_tokens": 3235589.0,
- "step": 362
- },
- {
- "epoch": 0.27583586626139817,
- "grad_norm": 3.557973861694336,
- "learning_rate": 4.976441282587894e-06,
- "loss": 0.5732032060623169,
- "mean_token_accuracy": 0.8041545748710632,
- "num_tokens": 3239958.0,
- "step": 363
- },
- {
- "epoch": 0.2765957446808511,
- "grad_norm": 1.3467901945114136,
- "learning_rate": 4.9761535642416284e-06,
- "loss": 0.4525323510169983,
- "mean_token_accuracy": 0.8281061053276062,
- "num_tokens": 3257703.0,
- "step": 364
- },
- {
- "epoch": 0.2773556231003039,
- "grad_norm": 2.2649986743927,
- "learning_rate": 4.9758641080354745e-06,
- "loss": 0.5074734687805176,
- "mean_token_accuracy": 0.8447474241256714,
- "num_tokens": 3264334.0,
- "step": 365
- },
- {
- "epoch": 0.27811550151975684,
- "grad_norm": 2.8667566776275635,
- "learning_rate": 4.975572914172581e-06,
- "loss": 0.5759559869766235,
- "mean_token_accuracy": 0.7976793050765991,
- "num_tokens": 3269314.0,
- "step": 366
- },
- {
- "epoch": 0.27887537993920974,
- "grad_norm": 2.2514986991882324,
- "learning_rate": 4.975279982857324e-06,
- "loss": 0.5786465406417847,
- "mean_token_accuracy": 0.8058781623840332,
- "num_tokens": 3277324.0,
- "step": 367
- },
- {
- "epoch": 0.2796352583586626,
- "grad_norm": 1.3826723098754883,
- "learning_rate": 4.97498531429529e-06,
- "loss": 0.40801727771759033,
- "mean_token_accuracy": 0.8601310849189758,
- "num_tokens": 3290530.0,
- "step": 368
- },
- {
- "epoch": 0.2803951367781155,
- "grad_norm": 2.084092617034912,
- "learning_rate": 4.97468890869329e-06,
- "loss": 0.47076648473739624,
- "mean_token_accuracy": 0.8310186862945557,
- "num_tokens": 3298325.0,
- "step": 369
- },
- {
- "epoch": 0.2811550151975684,
- "grad_norm": 1.3467998504638672,
- "learning_rate": 4.974390766259353e-06,
- "loss": 0.44668465852737427,
- "mean_token_accuracy": 0.8275353908538818,
- "num_tokens": 3314302.0,
- "step": 370
- },
- {
- "epoch": 0.28191489361702127,
- "grad_norm": 2.5921075344085693,
- "learning_rate": 4.974090887202726e-06,
- "loss": 0.5343953967094421,
- "mean_token_accuracy": 0.8110706806182861,
- "num_tokens": 3320963.0,
- "step": 371
- },
- {
- "epoch": 0.2826747720364742,
- "grad_norm": 2.042781352996826,
- "learning_rate": 4.973789271733877e-06,
- "loss": 0.6293343305587769,
- "mean_token_accuracy": 0.7800243496894836,
- "num_tokens": 3332742.0,
- "step": 372
- },
- {
- "epoch": 0.28343465045592703,
- "grad_norm": 4.822193145751953,
- "learning_rate": 4.973485920064491e-06,
- "loss": 0.6256728768348694,
- "mean_token_accuracy": 0.7962433099746704,
- "num_tokens": 3335872.0,
- "step": 373
- },
- {
- "epoch": 0.28419452887537994,
- "grad_norm": 1.260988473892212,
- "learning_rate": 4.973180832407471e-06,
- "loss": 0.38731223344802856,
- "mean_token_accuracy": 0.8385066986083984,
- "num_tokens": 3351884.0,
- "step": 374
- },
- {
- "epoch": 0.28495440729483285,
- "grad_norm": 2.669966697692871,
- "learning_rate": 4.97287400897694e-06,
- "loss": 0.5594710111618042,
- "mean_token_accuracy": 0.8097212314605713,
- "num_tokens": 3358197.0,
- "step": 375
- },
- {
- "epoch": 0.2857142857142857,
- "grad_norm": 3.0344486236572266,
- "learning_rate": 4.972565449988238e-06,
- "loss": 0.34449583292007446,
- "mean_token_accuracy": 0.8813316822052002,
- "num_tokens": 3362133.0,
- "step": 376
- },
- {
- "epoch": 0.2864741641337386,
- "grad_norm": 2.562251091003418,
- "learning_rate": 4.972255155657925e-06,
- "loss": 0.5331522822380066,
- "mean_token_accuracy": 0.8212941288948059,
- "num_tokens": 3370346.0,
- "step": 377
- },
- {
- "epoch": 0.2872340425531915,
- "grad_norm": 2.7083740234375,
- "learning_rate": 4.9719431262037755e-06,
- "loss": 0.5403046011924744,
- "mean_token_accuracy": 0.8108335733413696,
- "num_tokens": 3375588.0,
- "step": 378
- },
- {
- "epoch": 0.28799392097264437,
- "grad_norm": 1.396430492401123,
- "learning_rate": 4.971629361844785e-06,
- "loss": 0.4041529893875122,
- "mean_token_accuracy": 0.8588063716888428,
- "num_tokens": 3390749.0,
- "step": 379
- },
- {
- "epoch": 0.2887537993920973,
- "grad_norm": 1.9872784614562988,
- "learning_rate": 4.971313862801166e-06,
- "loss": 0.4336993098258972,
- "mean_token_accuracy": 0.8511303663253784,
- "num_tokens": 3399064.0,
- "step": 380
- },
- {
- "epoch": 0.28951367781155013,
- "grad_norm": 1.9652575254440308,
- "learning_rate": 4.9709966292943455e-06,
- "loss": 0.4578358232975006,
- "mean_token_accuracy": 0.8229440450668335,
- "num_tokens": 3407229.0,
- "step": 381
- },
- {
- "epoch": 0.29027355623100304,
- "grad_norm": 1.6626898050308228,
- "learning_rate": 4.970677661546972e-06,
- "loss": 0.5427594184875488,
- "mean_token_accuracy": 0.815427303314209,
- "num_tokens": 3422321.0,
- "step": 382
- },
- {
- "epoch": 0.29103343465045595,
- "grad_norm": 3.5265562534332275,
- "learning_rate": 4.970356959782909e-06,
- "loss": 0.6661460995674133,
- "mean_token_accuracy": 0.7856965065002441,
- "num_tokens": 3427442.0,
- "step": 383
- },
- {
- "epoch": 0.2917933130699088,
- "grad_norm": 1.667205572128296,
- "learning_rate": 4.970034524227239e-06,
- "loss": 0.36256325244903564,
- "mean_token_accuracy": 0.8711205720901489,
- "num_tokens": 3436662.0,
- "step": 384
- },
- {
- "epoch": 0.2925531914893617,
- "grad_norm": 1.3389486074447632,
- "learning_rate": 4.969710355106256e-06,
- "loss": 0.4282698631286621,
- "mean_token_accuracy": 0.838951587677002,
- "num_tokens": 3450060.0,
- "step": 385
- },
- {
- "epoch": 0.2933130699088146,
- "grad_norm": 2.5163397789001465,
- "learning_rate": 4.969384452647477e-06,
- "loss": 0.5176984071731567,
- "mean_token_accuracy": 0.8235267996788025,
- "num_tokens": 3456990.0,
- "step": 386
- },
- {
- "epoch": 0.29407294832826747,
- "grad_norm": 1.7588495016098022,
- "learning_rate": 4.969056817079633e-06,
- "loss": 0.49710947275161743,
- "mean_token_accuracy": 0.818520724773407,
- "num_tokens": 3468098.0,
- "step": 387
- },
- {
- "epoch": 0.2948328267477204,
- "grad_norm": 2.6381046772003174,
- "learning_rate": 4.968727448632669e-06,
- "loss": 0.4425308108329773,
- "mean_token_accuracy": 0.8451643586158752,
- "num_tokens": 3472899.0,
- "step": 388
- },
- {
- "epoch": 0.29559270516717323,
- "grad_norm": 1.6345038414001465,
- "learning_rate": 4.968396347537751e-06,
- "loss": 0.4177059829235077,
- "mean_token_accuracy": 0.8498886227607727,
- "num_tokens": 3484826.0,
- "step": 389
- },
- {
- "epoch": 0.29635258358662614,
- "grad_norm": 3.0466468334198,
- "learning_rate": 4.968063514027258e-06,
- "loss": 0.4274463951587677,
- "mean_token_accuracy": 0.8387278318405151,
- "num_tokens": 3488610.0,
- "step": 390
- },
- {
- "epoch": 0.29711246200607905,
- "grad_norm": 2.6509406566619873,
- "learning_rate": 4.967728948334784e-06,
- "loss": 0.5401753783226013,
- "mean_token_accuracy": 0.8252490162849426,
- "num_tokens": 3493657.0,
- "step": 391
- },
- {
- "epoch": 0.2978723404255319,
- "grad_norm": 1.6372219324111938,
- "learning_rate": 4.967392650695141e-06,
- "loss": 0.3862472176551819,
- "mean_token_accuracy": 0.8555525541305542,
- "num_tokens": 3505588.0,
- "step": 392
- },
- {
- "epoch": 0.2986322188449848,
- "grad_norm": 2.1615452766418457,
- "learning_rate": 4.967054621344356e-06,
- "loss": 0.57850581407547,
- "mean_token_accuracy": 0.8222678899765015,
- "num_tokens": 3514396.0,
- "step": 393
- },
- {
- "epoch": 0.2993920972644377,
- "grad_norm": 1.8610916137695312,
- "learning_rate": 4.96671486051967e-06,
- "loss": 0.5440595149993896,
- "mean_token_accuracy": 0.8196715116500854,
- "num_tokens": 3523604.0,
- "step": 394
- },
- {
- "epoch": 0.30015197568389057,
- "grad_norm": 2.9585862159729004,
- "learning_rate": 4.966373368459542e-06,
- "loss": 0.6921588182449341,
- "mean_token_accuracy": 0.7816659808158875,
- "num_tokens": 3529849.0,
- "step": 395
- },
- {
- "epoch": 0.3009118541033435,
- "grad_norm": 1.9374035596847534,
- "learning_rate": 4.966030145403642e-06,
- "loss": 0.5494055151939392,
- "mean_token_accuracy": 0.8126792907714844,
- "num_tokens": 3539529.0,
- "step": 396
- },
- {
- "epoch": 0.30167173252279633,
- "grad_norm": 1.730530023574829,
- "learning_rate": 4.965685191592859e-06,
- "loss": 0.4271572232246399,
- "mean_token_accuracy": 0.8383668661117554,
- "num_tokens": 3550972.0,
- "step": 397
- },
- {
- "epoch": 0.30243161094224924,
- "grad_norm": 3.9635560512542725,
- "learning_rate": 4.9653385072692935e-06,
- "loss": 0.5576210021972656,
- "mean_token_accuracy": 0.799404501914978,
- "num_tokens": 3554147.0,
- "step": 398
- },
- {
- "epoch": 0.30319148936170215,
- "grad_norm": 2.5731968879699707,
- "learning_rate": 4.964990092676263e-06,
- "loss": 0.5478942394256592,
- "mean_token_accuracy": 0.8220961093902588,
- "num_tokens": 3559972.0,
- "step": 399
- },
- {
- "epoch": 0.303951367781155,
- "grad_norm": 2.2096588611602783,
- "learning_rate": 4.964639948058297e-06,
- "loss": 0.35461270809173584,
- "mean_token_accuracy": 0.8640927076339722,
- "num_tokens": 3565770.0,
- "step": 400
- },
- {
- "epoch": 0.3047112462006079,
- "grad_norm": 1.7874189615249634,
- "learning_rate": 4.964288073661142e-06,
- "loss": 0.38849619030952454,
- "mean_token_accuracy": 0.8443037271499634,
- "num_tokens": 3574514.0,
- "step": 401
- },
- {
- "epoch": 0.30547112462006076,
- "grad_norm": 1.5583146810531616,
- "learning_rate": 4.963934469731756e-06,
- "loss": 0.48909449577331543,
- "mean_token_accuracy": 0.8429768681526184,
- "num_tokens": 3585877.0,
- "step": 402
- },
- {
- "epoch": 0.30623100303951367,
- "grad_norm": 3.026599645614624,
- "learning_rate": 4.963579136518312e-06,
- "loss": 0.5138992071151733,
- "mean_token_accuracy": 0.8283728361129761,
- "num_tokens": 3590412.0,
- "step": 403
- },
- {
- "epoch": 0.3069908814589666,
- "grad_norm": 2.777505874633789,
- "learning_rate": 4.963222074270197e-06,
- "loss": 0.6241534948348999,
- "mean_token_accuracy": 0.8130464553833008,
- "num_tokens": 3596246.0,
- "step": 404
- },
- {
- "epoch": 0.30775075987841943,
- "grad_norm": 2.4772839546203613,
- "learning_rate": 4.962863283238011e-06,
- "loss": 0.5930814146995544,
- "mean_token_accuracy": 0.8036394715309143,
- "num_tokens": 3602878.0,
- "step": 405
- },
- {
- "epoch": 0.30851063829787234,
- "grad_norm": 1.5049982070922852,
- "learning_rate": 4.962502763673566e-06,
- "loss": 0.4903082549571991,
- "mean_token_accuracy": 0.8184912204742432,
- "num_tokens": 3617018.0,
- "step": 406
- },
- {
- "epoch": 0.30927051671732525,
- "grad_norm": 2.453155040740967,
- "learning_rate": 4.96214051582989e-06,
- "loss": 0.5138067603111267,
- "mean_token_accuracy": 0.8336835503578186,
- "num_tokens": 3624188.0,
- "step": 407
- },
- {
- "epoch": 0.3100303951367781,
- "grad_norm": 2.4038336277008057,
- "learning_rate": 4.961776539961222e-06,
- "loss": 0.5752760171890259,
- "mean_token_accuracy": 0.8054730892181396,
- "num_tokens": 3634152.0,
- "step": 408
- },
- {
- "epoch": 0.310790273556231,
- "grad_norm": 2.629068374633789,
- "learning_rate": 4.961410836323014e-06,
- "loss": 0.5580606460571289,
- "mean_token_accuracy": 0.8121089935302734,
- "num_tokens": 3639528.0,
- "step": 409
- },
- {
- "epoch": 0.31155015197568386,
- "grad_norm": 1.4245928525924683,
- "learning_rate": 4.961043405171931e-06,
- "loss": 0.5399882793426514,
- "mean_token_accuracy": 0.812280535697937,
- "num_tokens": 3655744.0,
- "step": 410
- },
- {
- "epoch": 0.3123100303951368,
- "grad_norm": 1.5236459970474243,
- "learning_rate": 4.9606742467658505e-06,
- "loss": 0.5234690308570862,
- "mean_token_accuracy": 0.8188928365707397,
- "num_tokens": 3675010.0,
- "step": 411
- },
- {
- "epoch": 0.3130699088145897,
- "grad_norm": 2.27961802482605,
- "learning_rate": 4.960303361363863e-06,
- "loss": 0.5502505898475647,
- "mean_token_accuracy": 0.8161963224411011,
- "num_tokens": 3682328.0,
- "step": 412
- },
- {
- "epoch": 0.31382978723404253,
- "grad_norm": 1.554518222808838,
- "learning_rate": 4.959930749226269e-06,
- "loss": 0.420867919921875,
- "mean_token_accuracy": 0.8499157428741455,
- "num_tokens": 3694980.0,
- "step": 413
- },
- {
- "epoch": 0.31458966565349544,
- "grad_norm": 2.609218120574951,
- "learning_rate": 4.9595564106145825e-06,
- "loss": 0.4706704318523407,
- "mean_token_accuracy": 0.8412490487098694,
- "num_tokens": 3700033.0,
- "step": 414
- },
- {
- "epoch": 0.31534954407294835,
- "grad_norm": 1.5303231477737427,
- "learning_rate": 4.959180345791528e-06,
- "loss": 0.4668654799461365,
- "mean_token_accuracy": 0.8125015497207642,
- "num_tokens": 3715012.0,
- "step": 415
- },
- {
- "epoch": 0.3161094224924012,
- "grad_norm": 1.2774665355682373,
- "learning_rate": 4.958802555021042e-06,
- "loss": 0.4339369237422943,
- "mean_token_accuracy": 0.8442851901054382,
- "num_tokens": 3733928.0,
- "step": 416
- },
- {
- "epoch": 0.3168693009118541,
- "grad_norm": 2.1240181922912598,
- "learning_rate": 4.958423038568274e-06,
- "loss": 0.4029104709625244,
- "mean_token_accuracy": 0.8627674579620361,
- "num_tokens": 3740202.0,
- "step": 417
- },
- {
- "epoch": 0.31762917933130697,
- "grad_norm": 2.00538969039917,
- "learning_rate": 4.958041796699583e-06,
- "loss": 0.5229607820510864,
- "mean_token_accuracy": 0.8282366394996643,
- "num_tokens": 3749308.0,
- "step": 418
- },
- {
- "epoch": 0.3183890577507599,
- "grad_norm": 2.6555092334747314,
- "learning_rate": 4.957658829682539e-06,
- "loss": 0.5344101190567017,
- "mean_token_accuracy": 0.8183202743530273,
- "num_tokens": 3754595.0,
- "step": 419
- },
- {
- "epoch": 0.3191489361702128,
- "grad_norm": 1.7468839883804321,
- "learning_rate": 4.9572741377859225e-06,
- "loss": 0.5667245984077454,
- "mean_token_accuracy": 0.8080123662948608,
- "num_tokens": 3765761.0,
- "step": 420
- },
- {
- "epoch": 0.31990881458966564,
- "grad_norm": 2.9612457752227783,
- "learning_rate": 4.956887721279726e-06,
- "loss": 0.5389559864997864,
- "mean_token_accuracy": 0.8019476532936096,
- "num_tokens": 3770844.0,
- "step": 421
- },
- {
- "epoch": 0.32066869300911854,
- "grad_norm": 1.842403769493103,
- "learning_rate": 4.95649958043515e-06,
- "loss": 0.38279837369918823,
- "mean_token_accuracy": 0.858866810798645,
- "num_tokens": 3778094.0,
- "step": 422
- },
- {
- "epoch": 0.32142857142857145,
- "grad_norm": 2.3108131885528564,
- "learning_rate": 4.956109715524609e-06,
- "loss": 0.5453893542289734,
- "mean_token_accuracy": 0.8085013031959534,
- "num_tokens": 3785015.0,
- "step": 423
- },
- {
- "epoch": 0.3221884498480243,
- "grad_norm": 3.0326945781707764,
- "learning_rate": 4.9557181268217225e-06,
- "loss": 0.5550523400306702,
- "mean_token_accuracy": 0.8125876188278198,
- "num_tokens": 3789830.0,
- "step": 424
- },
- {
- "epoch": 0.3229483282674772,
- "grad_norm": 1.8851977586746216,
- "learning_rate": 4.955324814601324e-06,
- "loss": 0.4902324974536896,
- "mean_token_accuracy": 0.8205406665802002,
- "num_tokens": 3799862.0,
- "step": 425
- },
- {
- "epoch": 0.32370820668693007,
- "grad_norm": 2.6018171310424805,
- "learning_rate": 4.954929779139455e-06,
- "loss": 0.5920133590698242,
- "mean_token_accuracy": 0.8340690732002258,
- "num_tokens": 3806617.0,
- "step": 426
- },
- {
- "epoch": 0.324468085106383,
- "grad_norm": 2.4283878803253174,
- "learning_rate": 4.954533020713367e-06,
- "loss": 0.5305854082107544,
- "mean_token_accuracy": 0.8137468099594116,
- "num_tokens": 3813843.0,
- "step": 427
- },
- {
- "epoch": 0.3252279635258359,
- "grad_norm": 2.667978525161743,
- "learning_rate": 4.954134539601519e-06,
- "loss": 0.5333638787269592,
- "mean_token_accuracy": 0.8402629494667053,
- "num_tokens": 3819450.0,
- "step": 428
- },
- {
- "epoch": 0.32598784194528874,
- "grad_norm": 1.7302523851394653,
- "learning_rate": 4.953734336083582e-06,
- "loss": 0.422895610332489,
- "mean_token_accuracy": 0.8709704875946045,
- "num_tokens": 3831027.0,
- "step": 429
- },
- {
- "epoch": 0.32674772036474165,
- "grad_norm": 2.427192211151123,
- "learning_rate": 4.953332410440434e-06,
- "loss": 0.6334598064422607,
- "mean_token_accuracy": 0.7817479968070984,
- "num_tokens": 3841776.0,
- "step": 430
- },
- {
- "epoch": 0.32750759878419455,
- "grad_norm": 1.460949182510376,
- "learning_rate": 4.952928762954161e-06,
- "loss": 0.3654777705669403,
- "mean_token_accuracy": 0.8780122995376587,
- "num_tokens": 3852213.0,
- "step": 431
- },
- {
- "epoch": 0.3282674772036474,
- "grad_norm": 1.9855005741119385,
- "learning_rate": 4.952523393908059e-06,
- "loss": 0.5117089748382568,
- "mean_token_accuracy": 0.811911404132843,
- "num_tokens": 3861176.0,
- "step": 432
- },
- {
- "epoch": 0.3290273556231003,
- "grad_norm": 2.2653207778930664,
- "learning_rate": 4.952116303586631e-06,
- "loss": 0.42514950037002563,
- "mean_token_accuracy": 0.8448518514633179,
- "num_tokens": 3867164.0,
- "step": 433
- },
- {
- "epoch": 0.32978723404255317,
- "grad_norm": 1.9780964851379395,
- "learning_rate": 4.951707492275589e-06,
- "loss": 0.5095293521881104,
- "mean_token_accuracy": 0.8262748718261719,
- "num_tokens": 3876406.0,
- "step": 434
- },
- {
- "epoch": 0.3305471124620061,
- "grad_norm": 2.9480233192443848,
- "learning_rate": 4.951296960261853e-06,
- "loss": 0.3494448959827423,
- "mean_token_accuracy": 0.8781307935714722,
- "num_tokens": 3880298.0,
- "step": 435
- },
- {
- "epoch": 0.331306990881459,
- "grad_norm": 2.335571527481079,
- "learning_rate": 4.95088470783355e-06,
- "loss": 0.5456914901733398,
- "mean_token_accuracy": 0.816297173500061,
- "num_tokens": 3886487.0,
- "step": 436
- },
- {
- "epoch": 0.33206686930091184,
- "grad_norm": 2.3046419620513916,
- "learning_rate": 4.950470735280013e-06,
- "loss": 0.4835948944091797,
- "mean_token_accuracy": 0.8539175391197205,
- "num_tokens": 3892706.0,
- "step": 437
- },
- {
- "epoch": 0.33282674772036475,
- "grad_norm": 2.44047474861145,
- "learning_rate": 4.950055042891786e-06,
- "loss": 0.5154092907905579,
- "mean_token_accuracy": 0.8579919338226318,
- "num_tokens": 3899532.0,
- "step": 438
- },
- {
- "epoch": 0.33358662613981765,
- "grad_norm": 4.826764106750488,
- "learning_rate": 4.949637630960618e-06,
- "loss": 0.5270259976387024,
- "mean_token_accuracy": 0.8172192573547363,
- "num_tokens": 3902260.0,
- "step": 439
- },
- {
- "epoch": 0.3343465045592705,
- "grad_norm": 2.001574754714966,
- "learning_rate": 4.949218499779462e-06,
- "loss": 0.5413002967834473,
- "mean_token_accuracy": 0.8162837028503418,
- "num_tokens": 3911706.0,
- "step": 440
- },
- {
- "epoch": 0.3351063829787234,
- "grad_norm": 1.7998944520950317,
- "learning_rate": 4.948797649642484e-06,
- "loss": 0.5131614208221436,
- "mean_token_accuracy": 0.8367440700531006,
- "num_tokens": 3923490.0,
- "step": 441
- },
- {
- "epoch": 0.33586626139817627,
- "grad_norm": 3.4566173553466797,
- "learning_rate": 4.94837508084505e-06,
- "loss": 0.7258909940719604,
- "mean_token_accuracy": 0.771377444267273,
- "num_tokens": 3928099.0,
- "step": 442
- },
- {
- "epoch": 0.3366261398176292,
- "grad_norm": 2.0040442943573,
- "learning_rate": 4.9479507936837364e-06,
- "loss": 0.482135534286499,
- "mean_token_accuracy": 0.8339327573776245,
- "num_tokens": 3937328.0,
- "step": 443
- },
- {
- "epoch": 0.3373860182370821,
- "grad_norm": 2.949502944946289,
- "learning_rate": 4.947524788456325e-06,
- "loss": 0.6474795341491699,
- "mean_token_accuracy": 0.7951677441596985,
- "num_tokens": 3942529.0,
- "step": 444
- },
- {
- "epoch": 0.33814589665653494,
- "grad_norm": 1.5528364181518555,
- "learning_rate": 4.947097065461801e-06,
- "loss": 0.48791584372520447,
- "mean_token_accuracy": 0.8425545692443848,
- "num_tokens": 3955200.0,
- "step": 445
- },
- {
- "epoch": 0.33890577507598785,
- "grad_norm": 1.8813284635543823,
- "learning_rate": 4.946667625000358e-06,
- "loss": 0.45922309160232544,
- "mean_token_accuracy": 0.8206527233123779,
- "num_tokens": 3962975.0,
- "step": 446
- },
- {
- "epoch": 0.33966565349544076,
- "grad_norm": 1.7157847881317139,
- "learning_rate": 4.946236467373392e-06,
- "loss": 0.5454182028770447,
- "mean_token_accuracy": 0.8049604892730713,
- "num_tokens": 3973956.0,
- "step": 447
- },
- {
- "epoch": 0.3404255319148936,
- "grad_norm": 2.008857250213623,
- "learning_rate": 4.945803592883509e-06,
- "loss": 0.5151860117912292,
- "mean_token_accuracy": 0.8262045383453369,
- "num_tokens": 3982853.0,
- "step": 448
- },
- {
- "epoch": 0.3411854103343465,
- "grad_norm": 1.6632496118545532,
- "learning_rate": 4.9453690018345144e-06,
- "loss": 0.42710691690444946,
- "mean_token_accuracy": 0.8521314859390259,
- "num_tokens": 3993838.0,
- "step": 449
- },
- {
- "epoch": 0.34194528875379937,
- "grad_norm": 1.365234375,
- "learning_rate": 4.944932694531423e-06,
- "loss": 0.5172526836395264,
- "mean_token_accuracy": 0.8277045488357544,
- "num_tokens": 4014179.0,
- "step": 450
- },
- {
- "epoch": 0.3427051671732523,
- "grad_norm": 1.7610243558883667,
- "learning_rate": 4.94449467128045e-06,
- "loss": 0.42104798555374146,
- "mean_token_accuracy": 0.8552065491676331,
- "num_tokens": 4023663.0,
- "step": 451
- },
- {
- "epoch": 0.3434650455927052,
- "grad_norm": 2.3732354640960693,
- "learning_rate": 4.944054932389018e-06,
- "loss": 0.5471175909042358,
- "mean_token_accuracy": 0.8487317562103271,
- "num_tokens": 4030100.0,
- "step": 452
- },
- {
- "epoch": 0.34422492401215804,
- "grad_norm": 1.5973623991012573,
- "learning_rate": 4.943613478165753e-06,
- "loss": 0.419813871383667,
- "mean_token_accuracy": 0.8484025001525879,
- "num_tokens": 4041124.0,
- "step": 453
- },
- {
- "epoch": 0.34498480243161095,
- "grad_norm": 2.966381549835205,
- "learning_rate": 4.943170308920484e-06,
- "loss": 0.5370652675628662,
- "mean_token_accuracy": 0.8439491987228394,
- "num_tokens": 4045675.0,
- "step": 454
- },
- {
- "epoch": 0.34574468085106386,
- "grad_norm": 2.5097248554229736,
- "learning_rate": 4.9427254249642445e-06,
- "loss": 0.5776349306106567,
- "mean_token_accuracy": 0.8060523867607117,
- "num_tokens": 4053250.0,
- "step": 455
- },
- {
- "epoch": 0.3465045592705167,
- "grad_norm": 1.6779125928878784,
- "learning_rate": 4.942278826609272e-06,
- "loss": 0.5245476961135864,
- "mean_token_accuracy": 0.8168526887893677,
- "num_tokens": 4064106.0,
- "step": 456
- },
- {
- "epoch": 0.3472644376899696,
- "grad_norm": 1.5945546627044678,
- "learning_rate": 4.9418305141690045e-06,
- "loss": 0.4972047209739685,
- "mean_token_accuracy": 0.8257735967636108,
- "num_tokens": 4077687.0,
- "step": 457
- },
- {
- "epoch": 0.34802431610942247,
- "grad_norm": 2.864778757095337,
- "learning_rate": 4.9413804879580865e-06,
- "loss": 0.5372499823570251,
- "mean_token_accuracy": 0.8423776626586914,
- "num_tokens": 4082632.0,
- "step": 458
- },
- {
- "epoch": 0.3487841945288754,
- "grad_norm": 1.4797078371047974,
- "learning_rate": 4.940928748292363e-06,
- "loss": 0.5903409719467163,
- "mean_token_accuracy": 0.8061295747756958,
- "num_tokens": 4104218.0,
- "step": 459
- },
- {
- "epoch": 0.3495440729483283,
- "grad_norm": 2.4376983642578125,
- "learning_rate": 4.940475295488882e-06,
- "loss": 0.4534894824028015,
- "mean_token_accuracy": 0.8395825028419495,
- "num_tokens": 4110530.0,
- "step": 460
- },
- {
- "epoch": 0.35030395136778114,
- "grad_norm": 1.2955626249313354,
- "learning_rate": 4.940020129865895e-06,
- "loss": 0.47155818343162537,
- "mean_token_accuracy": 0.8253582715988159,
- "num_tokens": 4128398.0,
- "step": 461
- },
- {
- "epoch": 0.35106382978723405,
- "grad_norm": 2.066575527191162,
- "learning_rate": 4.9395632517428546e-06,
- "loss": 0.5555641651153564,
- "mean_token_accuracy": 0.814624547958374,
- "num_tokens": 4137623.0,
- "step": 462
- },
- {
- "epoch": 0.3518237082066869,
- "grad_norm": 1.6407525539398193,
- "learning_rate": 4.939104661440415e-06,
- "loss": 0.4361790418624878,
- "mean_token_accuracy": 0.8544459342956543,
- "num_tokens": 4152803.0,
- "step": 463
- },
- {
- "epoch": 0.3525835866261398,
- "grad_norm": 2.1685116291046143,
- "learning_rate": 4.938644359280433e-06,
- "loss": 0.5347012877464294,
- "mean_token_accuracy": 0.853853702545166,
- "num_tokens": 4160778.0,
- "step": 464
- },
- {
- "epoch": 0.3533434650455927,
- "grad_norm": 1.8824869394302368,
- "learning_rate": 4.938182345585967e-06,
- "loss": 0.5512481927871704,
- "mean_token_accuracy": 0.7985891699790955,
- "num_tokens": 4170380.0,
- "step": 465
- },
- {
- "epoch": 0.3541033434650456,
- "grad_norm": 2.2229504585266113,
- "learning_rate": 4.937718620681273e-06,
- "loss": 0.516828179359436,
- "mean_token_accuracy": 0.8265621066093445,
- "num_tokens": 4178179.0,
- "step": 466
- },
- {
- "epoch": 0.3548632218844985,
- "grad_norm": 1.955990195274353,
- "learning_rate": 4.9372531848918145e-06,
- "loss": 0.5586158037185669,
- "mean_token_accuracy": 0.8367916345596313,
- "num_tokens": 4188626.0,
- "step": 467
- },
- {
- "epoch": 0.3556231003039514,
- "grad_norm": 1.9687023162841797,
- "learning_rate": 4.936786038544251e-06,
- "loss": 0.5517531633377075,
- "mean_token_accuracy": 0.8134098052978516,
- "num_tokens": 4198144.0,
- "step": 468
- },
- {
- "epoch": 0.35638297872340424,
- "grad_norm": 1.405516505241394,
- "learning_rate": 4.9363171819664434e-06,
- "loss": 0.5305492877960205,
- "mean_token_accuracy": 0.8014427423477173,
- "num_tokens": 4222818.0,
- "step": 469
- },
- {
- "epoch": 0.35714285714285715,
- "grad_norm": 2.6355695724487305,
- "learning_rate": 4.9358466154874535e-06,
- "loss": 0.5303391218185425,
- "mean_token_accuracy": 0.8028861284255981,
- "num_tokens": 4228318.0,
- "step": 470
- },
- {
- "epoch": 0.35790273556231,
- "grad_norm": 1.5133824348449707,
- "learning_rate": 4.935374339437543e-06,
- "loss": 0.5329189300537109,
- "mean_token_accuracy": 0.8479441404342651,
- "num_tokens": 4244527.0,
- "step": 471
- },
- {
- "epoch": 0.3586626139817629,
- "grad_norm": 3.4356725215911865,
- "learning_rate": 4.934900354148173e-06,
- "loss": 0.5431582927703857,
- "mean_token_accuracy": 0.8328983783721924,
- "num_tokens": 4248034.0,
- "step": 472
- },
- {
- "epoch": 0.3594224924012158,
- "grad_norm": 2.5789499282836914,
- "learning_rate": 4.934424659952006e-06,
- "loss": 0.4141455292701721,
- "mean_token_accuracy": 0.8658635020256042,
- "num_tokens": 4252953.0,
- "step": 473
- },
- {
- "epoch": 0.3601823708206687,
- "grad_norm": 1.145262598991394,
- "learning_rate": 4.933947257182901e-06,
- "loss": 0.40294092893600464,
- "mean_token_accuracy": 0.8565847277641296,
- "num_tokens": 4277813.0,
- "step": 474
- },
- {
- "epoch": 0.3609422492401216,
- "grad_norm": 1.7242133617401123,
- "learning_rate": 4.933468146175918e-06,
- "loss": 0.6036738753318787,
- "mean_token_accuracy": 0.8072597980499268,
- "num_tokens": 4291088.0,
- "step": 475
- },
- {
- "epoch": 0.3617021276595745,
- "grad_norm": 2.3490941524505615,
- "learning_rate": 4.932987327267317e-06,
- "loss": 0.49456146359443665,
- "mean_token_accuracy": 0.8372673988342285,
- "num_tokens": 4297376.0,
- "step": 476
- },
- {
- "epoch": 0.36246200607902734,
- "grad_norm": 1.3605526685714722,
- "learning_rate": 4.932504800794553e-06,
- "loss": 0.43595948815345764,
- "mean_token_accuracy": 0.8415953516960144,
- "num_tokens": 4312054.0,
- "step": 477
- },
- {
- "epoch": 0.36322188449848025,
- "grad_norm": 1.4525885581970215,
- "learning_rate": 4.9320205670962815e-06,
- "loss": 0.5390371680259705,
- "mean_token_accuracy": 0.8101649284362793,
- "num_tokens": 4328701.0,
- "step": 478
- },
- {
- "epoch": 0.3639817629179331,
- "grad_norm": 1.9862419366836548,
- "learning_rate": 4.931534626512359e-06,
- "loss": 0.45436930656433105,
- "mean_token_accuracy": 0.8352861404418945,
- "num_tokens": 4338372.0,
- "step": 479
- },
- {
- "epoch": 0.364741641337386,
- "grad_norm": 1.7804961204528809,
- "learning_rate": 4.931046979383836e-06,
- "loss": 0.4677754044532776,
- "mean_token_accuracy": 0.840467095375061,
- "num_tokens": 4347897.0,
- "step": 480
- },
- {
- "epoch": 0.3655015197568389,
- "grad_norm": 2.066632032394409,
- "learning_rate": 4.930557626052961e-06,
- "loss": 0.42418140172958374,
- "mean_token_accuracy": 0.8528275489807129,
- "num_tokens": 4354061.0,
- "step": 481
- },
- {
- "epoch": 0.3662613981762918,
- "grad_norm": 1.6155282258987427,
- "learning_rate": 4.930066566863182e-06,
- "loss": 0.5424284934997559,
- "mean_token_accuracy": 0.825040876865387,
- "num_tokens": 4370400.0,
- "step": 482
- },
- {
- "epoch": 0.3670212765957447,
- "grad_norm": 2.1452953815460205,
- "learning_rate": 4.929573802159143e-06,
- "loss": 0.5105804204940796,
- "mean_token_accuracy": 0.8284053802490234,
- "num_tokens": 4377579.0,
- "step": 483
- },
- {
- "epoch": 0.3677811550151976,
- "grad_norm": 1.8940945863723755,
- "learning_rate": 4.929079332286685e-06,
- "loss": 0.43478304147720337,
- "mean_token_accuracy": 0.8505665063858032,
- "num_tokens": 4385686.0,
- "step": 484
- },
- {
- "epoch": 0.36854103343465044,
- "grad_norm": 1.6785860061645508,
- "learning_rate": 4.928583157592846e-06,
- "loss": 0.40227848291397095,
- "mean_token_accuracy": 0.8623573780059814,
- "num_tokens": 4396128.0,
- "step": 485
- },
- {
- "epoch": 0.36930091185410335,
- "grad_norm": 1.6416733264923096,
- "learning_rate": 4.928085278425862e-06,
- "loss": 0.526267409324646,
- "mean_token_accuracy": 0.8284667730331421,
- "num_tokens": 4407963.0,
- "step": 486
- },
- {
- "epoch": 0.3700607902735562,
- "grad_norm": 1.8882389068603516,
- "learning_rate": 4.927585695135162e-06,
- "loss": 0.5555213093757629,
- "mean_token_accuracy": 0.8115293979644775,
- "num_tokens": 4418057.0,
- "step": 487
- },
- {
- "epoch": 0.3708206686930091,
- "grad_norm": 2.300248384475708,
- "learning_rate": 4.9270844080713735e-06,
- "loss": 0.5812339186668396,
- "mean_token_accuracy": 0.800270676612854,
- "num_tokens": 4425358.0,
- "step": 488
- },
- {
- "epoch": 0.371580547112462,
- "grad_norm": 1.6802922487258911,
- "learning_rate": 4.926581417586319e-06,
- "loss": 0.5134941935539246,
- "mean_token_accuracy": 0.8247408866882324,
- "num_tokens": 4437702.0,
- "step": 489
- },
- {
- "epoch": 0.3723404255319149,
- "grad_norm": 1.7620291709899902,
- "learning_rate": 4.926076724033016e-06,
- "loss": 0.5233973264694214,
- "mean_token_accuracy": 0.8102161884307861,
- "num_tokens": 4448584.0,
- "step": 490
- },
- {
- "epoch": 0.3731003039513678,
- "grad_norm": 1.6911998987197876,
- "learning_rate": 4.925570327765678e-06,
- "loss": 0.5337274074554443,
- "mean_token_accuracy": 0.845306396484375,
- "num_tokens": 4462651.0,
- "step": 491
- },
- {
- "epoch": 0.3738601823708207,
- "grad_norm": 1.7991242408752441,
- "learning_rate": 4.9250622291397144e-06,
- "loss": 0.31018948554992676,
- "mean_token_accuracy": 0.8857606053352356,
- "num_tokens": 4469971.0,
- "step": 492
- },
- {
- "epoch": 0.37462006079027355,
- "grad_norm": 4.9776835441589355,
- "learning_rate": 4.924552428511727e-06,
- "loss": 0.44114983081817627,
- "mean_token_accuracy": 0.8429906368255615,
- "num_tokens": 4478275.0,
- "step": 493
- },
- {
- "epoch": 0.37537993920972645,
- "grad_norm": 1.8007272481918335,
- "learning_rate": 4.924040926239515e-06,
- "loss": 0.574328601360321,
- "mean_token_accuracy": 0.7669196128845215,
- "num_tokens": 4491551.0,
- "step": 494
- },
- {
- "epoch": 0.3761398176291793,
- "grad_norm": 2.021300792694092,
- "learning_rate": 4.92352772268207e-06,
- "loss": 0.45636120438575745,
- "mean_token_accuracy": 0.840438723564148,
- "num_tokens": 4498658.0,
- "step": 495
- },
- {
- "epoch": 0.3768996960486322,
- "grad_norm": 2.369748592376709,
- "learning_rate": 4.923012818199576e-06,
- "loss": 0.5206376910209656,
- "mean_token_accuracy": 0.8521823287010193,
- "num_tokens": 4504648.0,
- "step": 496
- },
- {
- "epoch": 0.3776595744680851,
- "grad_norm": 2.733485221862793,
- "learning_rate": 4.922496213153416e-06,
- "loss": 0.5067723989486694,
- "mean_token_accuracy": 0.8168281316757202,
- "num_tokens": 4509990.0,
- "step": 497
- },
- {
- "epoch": 0.378419452887538,
- "grad_norm": 2.3751676082611084,
- "learning_rate": 4.921977907906161e-06,
- "loss": 0.49757206439971924,
- "mean_token_accuracy": 0.8325017690658569,
- "num_tokens": 4518373.0,
- "step": 498
- },
- {
- "epoch": 0.3791793313069909,
- "grad_norm": 2.1672775745391846,
- "learning_rate": 4.921457902821578e-06,
- "loss": 0.4237566590309143,
- "mean_token_accuracy": 0.8404698371887207,
- "num_tokens": 4524338.0,
- "step": 499
- },
- {
- "epoch": 0.3799392097264438,
- "grad_norm": 1.8374360799789429,
- "learning_rate": 4.9209361982646275e-06,
- "loss": 0.4995468854904175,
- "mean_token_accuracy": 0.8299649953842163,
- "num_tokens": 4533396.0,
- "step": 500
- },
- {
- "epoch": 0.38069908814589665,
- "grad_norm": 2.083967924118042,
- "learning_rate": 4.920412794601461e-06,
- "loss": 0.489935040473938,
- "mean_token_accuracy": 0.8315291404724121,
- "num_tokens": 4540941.0,
- "step": 501
- },
- {
- "epoch": 0.38145896656534956,
- "grad_norm": 2.2075610160827637,
- "learning_rate": 4.919887692199423e-06,
- "loss": 0.5233147740364075,
- "mean_token_accuracy": 0.804171085357666,
- "num_tokens": 4548215.0,
- "step": 502
- },
- {
- "epoch": 0.3822188449848024,
- "grad_norm": 2.076775312423706,
- "learning_rate": 4.9193608914270515e-06,
- "loss": 0.5785550475120544,
- "mean_token_accuracy": 0.7993186116218567,
- "num_tokens": 4558204.0,
- "step": 503
- },
- {
- "epoch": 0.3829787234042553,
- "grad_norm": 2.238546133041382,
- "learning_rate": 4.918832392654075e-06,
- "loss": 0.5287384390830994,
- "mean_token_accuracy": 0.8214945793151855,
- "num_tokens": 4565407.0,
- "step": 504
- },
- {
- "epoch": 0.3837386018237082,
- "grad_norm": 1.6783074140548706,
- "learning_rate": 4.9183021962514145e-06,
- "loss": 0.6063359379768372,
- "mean_token_accuracy": 0.7914625406265259,
- "num_tokens": 4580991.0,
- "step": 505
- },
- {
- "epoch": 0.3844984802431611,
- "grad_norm": 1.6287449598312378,
- "learning_rate": 4.917770302591183e-06,
- "loss": 0.3598247766494751,
- "mean_token_accuracy": 0.8706809878349304,
- "num_tokens": 4590579.0,
- "step": 506
- },
- {
- "epoch": 0.385258358662614,
- "grad_norm": 1.5432041883468628,
- "learning_rate": 4.917236712046682e-06,
- "loss": 0.5267890095710754,
- "mean_token_accuracy": 0.8032117486000061,
- "num_tokens": 4608380.0,
- "step": 507
- },
- {
- "epoch": 0.3860182370820669,
- "grad_norm": 1.7664037942886353,
- "learning_rate": 4.9167014249924075e-06,
- "loss": 0.3552354574203491,
- "mean_token_accuracy": 0.8569793701171875,
- "num_tokens": 4616426.0,
- "step": 508
- },
- {
- "epoch": 0.38677811550151975,
- "grad_norm": 2.1147472858428955,
- "learning_rate": 4.916164441804044e-06,
- "loss": 0.5212404727935791,
- "mean_token_accuracy": 0.8196578025817871,
- "num_tokens": 4623908.0,
- "step": 509
- },
- {
- "epoch": 0.38753799392097266,
- "grad_norm": 2.1092333793640137,
- "learning_rate": 4.915625762858467e-06,
- "loss": 0.5197038650512695,
- "mean_token_accuracy": 0.8245604634284973,
- "num_tokens": 4630956.0,
- "step": 510
- },
- {
- "epoch": 0.3882978723404255,
- "grad_norm": 1.23331880569458,
- "learning_rate": 4.915085388533743e-06,
- "loss": 0.4759839177131653,
- "mean_token_accuracy": 0.8192248344421387,
- "num_tokens": 4651269.0,
- "step": 511
- },
- {
- "epoch": 0.3890577507598784,
- "grad_norm": 2.424199104309082,
- "learning_rate": 4.914543319209126e-06,
- "loss": 0.5576270818710327,
- "mean_token_accuracy": 0.8203302621841431,
- "num_tokens": 4657296.0,
- "step": 512
- },
- {
- "epoch": 0.3898176291793313,
- "grad_norm": 2.725156307220459,
- "learning_rate": 4.913999555265062e-06,
- "loss": 0.4337949752807617,
- "mean_token_accuracy": 0.8382406234741211,
- "num_tokens": 4661850.0,
- "step": 513
- },
- {
- "epoch": 0.3905775075987842,
- "grad_norm": 2.3120534420013428,
- "learning_rate": 4.913454097083185e-06,
- "loss": 0.4941597580909729,
- "mean_token_accuracy": 0.8302834033966064,
- "num_tokens": 4667769.0,
- "step": 514
- },
- {
- "epoch": 0.3913373860182371,
- "grad_norm": 2.3111207485198975,
- "learning_rate": 4.912906945046319e-06,
- "loss": 0.5253715515136719,
- "mean_token_accuracy": 0.84515380859375,
- "num_tokens": 4674537.0,
- "step": 515
- },
- {
- "epoch": 0.39209726443769,
- "grad_norm": 1.4117841720581055,
- "learning_rate": 4.912358099538476e-06,
- "loss": 0.4521017074584961,
- "mean_token_accuracy": 0.8208256959915161,
- "num_tokens": 4690605.0,
- "step": 516
- },
- {
- "epoch": 0.39285714285714285,
- "grad_norm": 2.3742799758911133,
- "learning_rate": 4.911807560944858e-06,
- "loss": 0.41572901606559753,
- "mean_token_accuracy": 0.8550551533699036,
- "num_tokens": 4706437.0,
- "step": 517
- },
- {
- "epoch": 0.39361702127659576,
- "grad_norm": 2.4052202701568604,
- "learning_rate": 4.911255329651852e-06,
- "loss": 0.6003736257553101,
- "mean_token_accuracy": 0.8247885704040527,
- "num_tokens": 4712746.0,
- "step": 518
- },
- {
- "epoch": 0.3943768996960486,
- "grad_norm": 1.9335490465164185,
- "learning_rate": 4.910701406047037e-06,
- "loss": 0.5457713603973389,
- "mean_token_accuracy": 0.787429690361023,
- "num_tokens": 4731937.0,
- "step": 519
- },
- {
- "epoch": 0.3951367781155015,
- "grad_norm": 2.257706880569458,
- "learning_rate": 4.910145790519177e-06,
- "loss": 0.5300652980804443,
- "mean_token_accuracy": 0.8192912936210632,
- "num_tokens": 4739422.0,
- "step": 520
- },
- {
- "epoch": 0.3958966565349544,
- "grad_norm": 1.2099462747573853,
- "learning_rate": 4.9095884834582256e-06,
- "loss": 0.45872747898101807,
- "mean_token_accuracy": 0.8362667560577393,
- "num_tokens": 4757113.0,
- "step": 521
- },
- {
- "epoch": 0.3966565349544073,
- "grad_norm": 2.7991135120391846,
- "learning_rate": 4.909029485255321e-06,
- "loss": 0.49039560556411743,
- "mean_token_accuracy": 0.8260016441345215,
- "num_tokens": 4761709.0,
- "step": 522
- },
- {
- "epoch": 0.3974164133738602,
- "grad_norm": 2.2360129356384277,
- "learning_rate": 4.90846879630279e-06,
- "loss": 0.49556830525398254,
- "mean_token_accuracy": 0.827864408493042,
- "num_tokens": 4769048.0,
- "step": 523
- },
- {
- "epoch": 0.3981762917933131,
- "grad_norm": 2.5953688621520996,
- "learning_rate": 4.907906416994146e-06,
- "loss": 0.387208491563797,
- "mean_token_accuracy": 0.8467001914978027,
- "num_tokens": 4774637.0,
- "step": 524
- },
- {
- "epoch": 0.39893617021276595,
- "grad_norm": 2.1046814918518066,
- "learning_rate": 4.907342347724088e-06,
- "loss": 0.5477259755134583,
- "mean_token_accuracy": 0.8060322999954224,
- "num_tokens": 4782774.0,
- "step": 525
- },
- {
- "epoch": 0.39969604863221886,
- "grad_norm": 2.5622646808624268,
- "learning_rate": 4.906776588888502e-06,
- "loss": 0.5684159398078918,
- "mean_token_accuracy": 0.8095303177833557,
- "num_tokens": 4788766.0,
- "step": 526
- },
- {
- "epoch": 0.4004559270516717,
- "grad_norm": 1.9027913808822632,
- "learning_rate": 4.906209140884459e-06,
- "loss": 0.535524845123291,
- "mean_token_accuracy": 0.815237820148468,
- "num_tokens": 4798492.0,
- "step": 527
- },
- {
- "epoch": 0.4012158054711246,
- "grad_norm": 2.1447622776031494,
- "learning_rate": 4.905640004110216e-06,
- "loss": 0.5628632307052612,
- "mean_token_accuracy": 0.8085395097732544,
- "num_tokens": 4805737.0,
- "step": 528
- },
- {
- "epoch": 0.40197568389057753,
- "grad_norm": 1.6754741668701172,
- "learning_rate": 4.905069178965215e-06,
- "loss": 0.5046736598014832,
- "mean_token_accuracy": 0.8247535228729248,
- "num_tokens": 4816912.0,
- "step": 529
- },
- {
- "epoch": 0.4027355623100304,
- "grad_norm": 2.271230459213257,
- "learning_rate": 4.904496665850083e-06,
- "loss": 0.6086187958717346,
- "mean_token_accuracy": 0.7935276627540588,
- "num_tokens": 4824577.0,
- "step": 530
- },
- {
- "epoch": 0.4034954407294833,
- "grad_norm": 2.107595205307007,
- "learning_rate": 4.903922465166633e-06,
- "loss": 0.5431341528892517,
- "mean_token_accuracy": 0.8129537105560303,
- "num_tokens": 4831772.0,
- "step": 531
- },
- {
- "epoch": 0.40425531914893614,
- "grad_norm": 1.3860732316970825,
- "learning_rate": 4.903346577317859e-06,
- "loss": 0.45816320180892944,
- "mean_token_accuracy": 0.8328287601470947,
- "num_tokens": 4850302.0,
- "step": 532
- },
- {
- "epoch": 0.40501519756838905,
- "grad_norm": 1.9186837673187256,
- "learning_rate": 4.902769002707942e-06,
- "loss": 0.3294633626937866,
- "mean_token_accuracy": 0.8853933811187744,
- "num_tokens": 4856624.0,
- "step": 533
- },
- {
- "epoch": 0.40577507598784196,
- "grad_norm": 1.516194462776184,
- "learning_rate": 4.902189741742247e-06,
- "loss": 0.45482105016708374,
- "mean_token_accuracy": 0.8370342254638672,
- "num_tokens": 4870395.0,
- "step": 534
- },
- {
- "epoch": 0.4065349544072948,
- "grad_norm": 2.3235628604888916,
- "learning_rate": 4.901608794827321e-06,
- "loss": 0.40688639879226685,
- "mean_token_accuracy": 0.8643521666526794,
- "num_tokens": 4875645.0,
- "step": 535
- },
- {
- "epoch": 0.4072948328267477,
- "grad_norm": 2.29286527633667,
- "learning_rate": 4.9010261623708945e-06,
- "loss": 0.45482826232910156,
- "mean_token_accuracy": 0.8429383039474487,
- "num_tokens": 4881772.0,
- "step": 536
- },
- {
- "epoch": 0.40805471124620063,
- "grad_norm": 1.5907070636749268,
- "learning_rate": 4.900441844781882e-06,
- "loss": 0.5266948342323303,
- "mean_token_accuracy": 0.8348641395568848,
- "num_tokens": 4894289.0,
- "step": 537
- },
- {
- "epoch": 0.4088145896656535,
- "grad_norm": 2.1816294193267822,
- "learning_rate": 4.89985584247038e-06,
- "loss": 0.4797617793083191,
- "mean_token_accuracy": 0.8549500703811646,
- "num_tokens": 4901106.0,
- "step": 538
- },
- {
- "epoch": 0.4095744680851064,
- "grad_norm": 1.7347146272659302,
- "learning_rate": 4.899268155847667e-06,
- "loss": 0.4754739999771118,
- "mean_token_accuracy": 0.8278418183326721,
- "num_tokens": 4912131.0,
- "step": 539
- },
- {
- "epoch": 0.41033434650455924,
- "grad_norm": 2.0694527626037598,
- "learning_rate": 4.898678785326205e-06,
- "loss": 0.5071008801460266,
- "mean_token_accuracy": 0.8157946467399597,
- "num_tokens": 4921141.0,
- "step": 540
- },
- {
- "epoch": 0.41109422492401215,
- "grad_norm": 2.570047616958618,
- "learning_rate": 4.898087731319637e-06,
- "loss": 0.43639278411865234,
- "mean_token_accuracy": 0.8682913780212402,
- "num_tokens": 4926182.0,
- "step": 541
- },
- {
- "epoch": 0.41185410334346506,
- "grad_norm": 4.064006805419922,
- "learning_rate": 4.8974949942427854e-06,
- "loss": 0.539260745048523,
- "mean_token_accuracy": 0.8225528001785278,
- "num_tokens": 4929449.0,
- "step": 542
- },
- {
- "epoch": 0.4126139817629179,
- "grad_norm": 1.7644332647323608,
- "learning_rate": 4.896900574511657e-06,
- "loss": 0.472618043422699,
- "mean_token_accuracy": 0.8332902193069458,
- "num_tokens": 4939443.0,
- "step": 543
- },
- {
- "epoch": 0.4133738601823708,
- "grad_norm": 2.879918336868286,
- "learning_rate": 4.89630447254344e-06,
- "loss": 0.6360667943954468,
- "mean_token_accuracy": 0.8215296268463135,
- "num_tokens": 4950838.0,
- "step": 544
- },
- {
- "epoch": 0.41413373860182373,
- "grad_norm": 1.4575570821762085,
- "learning_rate": 4.8957066887565005e-06,
- "loss": 0.45617997646331787,
- "mean_token_accuracy": 0.8373187184333801,
- "num_tokens": 4965222.0,
- "step": 545
- },
- {
- "epoch": 0.4148936170212766,
- "grad_norm": 2.4829535484313965,
- "learning_rate": 4.895107223570386e-06,
- "loss": 0.42285341024398804,
- "mean_token_accuracy": 0.8686380386352539,
- "num_tokens": 4970724.0,
- "step": 546
- },
- {
- "epoch": 0.4156534954407295,
- "grad_norm": 2.639474630355835,
- "learning_rate": 4.894506077405824e-06,
- "loss": 0.5906289219856262,
- "mean_token_accuracy": 0.8174435496330261,
- "num_tokens": 4976766.0,
- "step": 547
- },
- {
- "epoch": 0.41641337386018235,
- "grad_norm": 2.7960562705993652,
- "learning_rate": 4.893903250684723e-06,
- "loss": 0.4518949091434479,
- "mean_token_accuracy": 0.8387585282325745,
- "num_tokens": 4980991.0,
- "step": 548
- },
- {
- "epoch": 0.41717325227963525,
- "grad_norm": 2.184176206588745,
- "learning_rate": 4.893298743830168e-06,
- "loss": 0.5223842859268188,
- "mean_token_accuracy": 0.8170937299728394,
- "num_tokens": 4987781.0,
- "step": 549
- },
- {
- "epoch": 0.41793313069908816,
- "grad_norm": 2.2393438816070557,
- "learning_rate": 4.892692557266429e-06,
- "loss": 0.5238431692123413,
- "mean_token_accuracy": 0.8217905759811401,
- "num_tokens": 4994321.0,
- "step": 550
- },
- {
- "epoch": 0.418693009118541,
- "grad_norm": 3.579047441482544,
- "learning_rate": 4.8920846914189465e-06,
- "loss": 0.5367584228515625,
- "mean_token_accuracy": 0.8312011361122131,
- "num_tokens": 4997951.0,
- "step": 551
- },
- {
- "epoch": 0.4194528875379939,
- "grad_norm": 1.6330240964889526,
- "learning_rate": 4.891475146714348e-06,
- "loss": 0.6054705381393433,
- "mean_token_accuracy": 0.7938206791877747,
- "num_tokens": 5012726.0,
- "step": 552
- },
- {
- "epoch": 0.42021276595744683,
- "grad_norm": 1.5775716304779053,
- "learning_rate": 4.8908639235804324e-06,
- "loss": 0.4774656891822815,
- "mean_token_accuracy": 0.828762948513031,
- "num_tokens": 5026751.0,
- "step": 553
- },
- {
- "epoch": 0.4209726443768997,
- "grad_norm": 1.5719101428985596,
- "learning_rate": 4.890251022446181e-06,
- "loss": 0.549429178237915,
- "mean_token_accuracy": 0.8110791444778442,
- "num_tokens": 5041861.0,
- "step": 554
- },
- {
- "epoch": 0.4217325227963526,
- "grad_norm": 1.8585275411605835,
- "learning_rate": 4.889636443741752e-06,
- "loss": 0.4448118805885315,
- "mean_token_accuracy": 0.8462690711021423,
- "num_tokens": 5052690.0,
- "step": 555
- },
- {
- "epoch": 0.42249240121580545,
- "grad_norm": 2.189202070236206,
- "learning_rate": 4.88902018789848e-06,
- "loss": 0.4296762943267822,
- "mean_token_accuracy": 0.8488791584968567,
- "num_tokens": 5058964.0,
- "step": 556
- },
- {
- "epoch": 0.42325227963525835,
- "grad_norm": 1.9328460693359375,
- "learning_rate": 4.888402255348877e-06,
- "loss": 0.5369474291801453,
- "mean_token_accuracy": 0.8184729814529419,
- "num_tokens": 5068465.0,
- "step": 557
- },
- {
- "epoch": 0.42401215805471126,
- "grad_norm": 1.6233323812484741,
- "learning_rate": 4.887782646526631e-06,
- "loss": 0.5284391641616821,
- "mean_token_accuracy": 0.8276044726371765,
- "num_tokens": 5081052.0,
- "step": 558
- },
- {
- "epoch": 0.4247720364741641,
- "grad_norm": 2.222813844680786,
- "learning_rate": 4.887161361866608e-06,
- "loss": 0.5679137706756592,
- "mean_token_accuracy": 0.8012375831604004,
- "num_tokens": 5090001.0,
- "step": 559
- },
- {
- "epoch": 0.425531914893617,
- "grad_norm": 2.1062207221984863,
- "learning_rate": 4.8865384018048494e-06,
- "loss": 0.5554201602935791,
- "mean_token_accuracy": 0.8128066062927246,
- "num_tokens": 5097644.0,
- "step": 560
- },
- {
- "epoch": 0.42629179331306993,
- "grad_norm": 1.5380984544754028,
- "learning_rate": 4.8859137667785735e-06,
- "loss": 0.4948265850543976,
- "mean_token_accuracy": 0.8258291482925415,
- "num_tokens": 5110069.0,
- "step": 561
- },
- {
- "epoch": 0.4270516717325228,
- "grad_norm": 2.0290257930755615,
- "learning_rate": 4.8852874572261715e-06,
- "loss": 0.4969530403614044,
- "mean_token_accuracy": 0.8297134637832642,
- "num_tokens": 5117452.0,
- "step": 562
- },
- {
- "epoch": 0.4278115501519757,
- "grad_norm": 1.5651452541351318,
- "learning_rate": 4.884659473587213e-06,
- "loss": 0.5353102087974548,
- "mean_token_accuracy": 0.8161719441413879,
- "num_tokens": 5133756.0,
- "step": 563
- },
- {
- "epoch": 0.42857142857142855,
- "grad_norm": 2.2470998764038086,
- "learning_rate": 4.884029816302441e-06,
- "loss": 0.5104288458824158,
- "mean_token_accuracy": 0.8081635236740112,
- "num_tokens": 5140278.0,
- "step": 564
- },
- {
- "epoch": 0.42933130699088146,
- "grad_norm": 1.726891279220581,
- "learning_rate": 4.883398485813772e-06,
- "loss": 0.4508771002292633,
- "mean_token_accuracy": 0.8548800349235535,
- "num_tokens": 5150115.0,
- "step": 565
- },
- {
- "epoch": 0.43009118541033436,
- "grad_norm": 1.4779289960861206,
- "learning_rate": 4.8827654825642984e-06,
- "loss": 0.46861088275909424,
- "mean_token_accuracy": 0.8209476470947266,
- "num_tokens": 5163225.0,
- "step": 566
- },
- {
- "epoch": 0.4308510638297872,
- "grad_norm": 1.2361034154891968,
- "learning_rate": 4.882130806998287e-06,
- "loss": 0.4591076672077179,
- "mean_token_accuracy": 0.803041934967041,
- "num_tokens": 5180342.0,
- "step": 567
- },
- {
- "epoch": 0.4316109422492401,
- "grad_norm": 1.882467269897461,
- "learning_rate": 4.881494459561177e-06,
- "loss": 0.579258143901825,
- "mean_token_accuracy": 0.8007112741470337,
- "num_tokens": 5189595.0,
- "step": 568
- },
- {
- "epoch": 0.43237082066869303,
- "grad_norm": 1.095462441444397,
- "learning_rate": 4.880856440699582e-06,
- "loss": 0.3806574046611786,
- "mean_token_accuracy": 0.8650111556053162,
- "num_tokens": 5211642.0,
- "step": 569
- },
- {
- "epoch": 0.4331306990881459,
- "grad_norm": 1.6469846963882446,
- "learning_rate": 4.880216750861288e-06,
- "loss": 0.544589638710022,
- "mean_token_accuracy": 0.8060122728347778,
- "num_tokens": 5224137.0,
- "step": 570
- },
- {
- "epoch": 0.4338905775075988,
- "grad_norm": 1.8561251163482666,
- "learning_rate": 4.879575390495254e-06,
- "loss": 0.4094924330711365,
- "mean_token_accuracy": 0.8591406345367432,
- "num_tokens": 5231588.0,
- "step": 571
- },
- {
- "epoch": 0.43465045592705165,
- "grad_norm": 3.01326847076416,
- "learning_rate": 4.878932360051611e-06,
- "loss": 0.6139192581176758,
- "mean_token_accuracy": 0.8108739852905273,
- "num_tokens": 5236853.0,
- "step": 572
- },
- {
- "epoch": 0.43541033434650456,
- "grad_norm": 2.1753034591674805,
- "learning_rate": 4.878287659981663e-06,
- "loss": 0.49082931876182556,
- "mean_token_accuracy": 0.862828254699707,
- "num_tokens": 5243264.0,
- "step": 573
- },
- {
- "epoch": 0.43617021276595747,
- "grad_norm": 1.4437755346298218,
- "learning_rate": 4.8776412907378845e-06,
- "loss": 0.5608728528022766,
- "mean_token_accuracy": 0.8271626234054565,
- "num_tokens": 5261757.0,
- "step": 574
- },
- {
- "epoch": 0.4369300911854103,
- "grad_norm": 1.786683440208435,
- "learning_rate": 4.876993252773923e-06,
- "loss": 0.4377627968788147,
- "mean_token_accuracy": 0.844936192035675,
- "num_tokens": 5271038.0,
- "step": 575
- },
- {
- "epoch": 0.4376899696048632,
- "grad_norm": 1.3425915241241455,
- "learning_rate": 4.876343546544596e-06,
- "loss": 0.44762521982192993,
- "mean_token_accuracy": 0.8397793769836426,
- "num_tokens": 5285555.0,
- "step": 576
- },
- {
- "epoch": 0.43844984802431614,
- "grad_norm": 2.1549675464630127,
- "learning_rate": 4.8756921725058935e-06,
- "loss": 0.5332942008972168,
- "mean_token_accuracy": 0.820149302482605,
- "num_tokens": 5294595.0,
- "step": 577
- },
- {
- "epoch": 0.439209726443769,
- "grad_norm": 1.5254042148590088,
- "learning_rate": 4.875039131114975e-06,
- "loss": 0.3646543622016907,
- "mean_token_accuracy": 0.8442583084106445,
- "num_tokens": 5304955.0,
- "step": 578
- },
- {
- "epoch": 0.4399696048632219,
- "grad_norm": 1.5751557350158691,
- "learning_rate": 4.8743844228301676e-06,
- "loss": 0.4854734539985657,
- "mean_token_accuracy": 0.8317523002624512,
- "num_tokens": 5317351.0,
- "step": 579
- },
- {
- "epoch": 0.44072948328267475,
- "grad_norm": 1.6950466632843018,
- "learning_rate": 4.873728048110973e-06,
- "loss": 0.5907570719718933,
- "mean_token_accuracy": 0.7946986556053162,
- "num_tokens": 5332542.0,
- "step": 580
- },
- {
- "epoch": 0.44148936170212766,
- "grad_norm": 2.1180708408355713,
- "learning_rate": 4.873070007418059e-06,
- "loss": 0.5220296382904053,
- "mean_token_accuracy": 0.8037363290786743,
- "num_tokens": 5341722.0,
- "step": 581
- },
- {
- "epoch": 0.44224924012158057,
- "grad_norm": 1.3643816709518433,
- "learning_rate": 4.872410301213265e-06,
- "loss": 0.4865502417087555,
- "mean_token_accuracy": 0.8377852439880371,
- "num_tokens": 5359359.0,
- "step": 582
- },
- {
- "epoch": 0.4430091185410334,
- "grad_norm": 1.483280897140503,
- "learning_rate": 4.871748929959598e-06,
- "loss": 0.36856764554977417,
- "mean_token_accuracy": 0.8709549903869629,
- "num_tokens": 5369749.0,
- "step": 583
- },
- {
- "epoch": 0.44376899696048633,
- "grad_norm": 1.6891541481018066,
- "learning_rate": 4.871085894121234e-06,
- "loss": 0.5768930912017822,
- "mean_token_accuracy": 0.8030461668968201,
- "num_tokens": 5383912.0,
- "step": 584
- },
- {
- "epoch": 0.44452887537993924,
- "grad_norm": 2.1318740844726562,
- "learning_rate": 4.870421194163515e-06,
- "loss": 0.4337100386619568,
- "mean_token_accuracy": 0.8562518358230591,
- "num_tokens": 5389412.0,
- "step": 585
- },
- {
- "epoch": 0.4452887537993921,
- "grad_norm": 2.540255546569824,
- "learning_rate": 4.869754830552956e-06,
- "loss": 0.4708256125450134,
- "mean_token_accuracy": 0.8446552753448486,
- "num_tokens": 5394762.0,
- "step": 586
- },
- {
- "epoch": 0.446048632218845,
- "grad_norm": 2.048015594482422,
- "learning_rate": 4.869086803757235e-06,
- "loss": 0.5265800952911377,
- "mean_token_accuracy": 0.8181137442588806,
- "num_tokens": 5402379.0,
- "step": 587
- },
- {
- "epoch": 0.44680851063829785,
- "grad_norm": 2.9821012020111084,
- "learning_rate": 4.868417114245199e-06,
- "loss": 0.6299797296524048,
- "mean_token_accuracy": 0.8237329125404358,
- "num_tokens": 5408229.0,
- "step": 588
- },
- {
- "epoch": 0.44756838905775076,
- "grad_norm": 1.7807202339172363,
- "learning_rate": 4.867745762486862e-06,
- "loss": 0.5176759958267212,
- "mean_token_accuracy": 0.8184244632720947,
- "num_tokens": 5418383.0,
- "step": 589
- },
- {
- "epoch": 0.44832826747720367,
- "grad_norm": 1.5466399192810059,
- "learning_rate": 4.8670727489534035e-06,
- "loss": 0.5137228965759277,
- "mean_token_accuracy": 0.8365053534507751,
- "num_tokens": 5432127.0,
- "step": 590
- },
- {
- "epoch": 0.4490881458966565,
- "grad_norm": 2.9521141052246094,
- "learning_rate": 4.866398074117173e-06,
- "loss": 0.4056887924671173,
- "mean_token_accuracy": 0.8561501502990723,
- "num_tokens": 5436062.0,
- "step": 591
- },
- {
- "epoch": 0.44984802431610943,
- "grad_norm": 2.058743953704834,
- "learning_rate": 4.86572173845168e-06,
- "loss": 0.6124799251556396,
- "mean_token_accuracy": 0.8007957339286804,
- "num_tokens": 5444989.0,
- "step": 592
- },
- {
- "epoch": 0.4506079027355623,
- "grad_norm": 2.1243767738342285,
- "learning_rate": 4.865043742431605e-06,
- "loss": 0.5659694671630859,
- "mean_token_accuracy": 0.8084750175476074,
- "num_tokens": 5453865.0,
- "step": 593
- },
- {
- "epoch": 0.4513677811550152,
- "grad_norm": 1.6732314825057983,
- "learning_rate": 4.864364086532792e-06,
- "loss": 0.47879064083099365,
- "mean_token_accuracy": 0.8346436023712158,
- "num_tokens": 5466398.0,
- "step": 594
- },
- {
- "epoch": 0.4521276595744681,
- "grad_norm": 1.3793858289718628,
- "learning_rate": 4.863682771232249e-06,
- "loss": 0.45989373326301575,
- "mean_token_accuracy": 0.8254791498184204,
- "num_tokens": 5482121.0,
- "step": 595
- },
- {
- "epoch": 0.45288753799392095,
- "grad_norm": 1.9812315702438354,
- "learning_rate": 4.862999797008149e-06,
- "loss": 0.5778874754905701,
- "mean_token_accuracy": 0.8041508197784424,
- "num_tokens": 5493000.0,
- "step": 596
- },
- {
- "epoch": 0.45364741641337386,
- "grad_norm": 3.3065083026885986,
- "learning_rate": 4.862315164339829e-06,
- "loss": 0.4623975157737732,
- "mean_token_accuracy": 0.8426318168640137,
- "num_tokens": 5496723.0,
- "step": 597
- },
- {
- "epoch": 0.45440729483282677,
- "grad_norm": 3.167119026184082,
- "learning_rate": 4.861628873707792e-06,
- "loss": 0.6984533667564392,
- "mean_token_accuracy": 0.772136926651001,
- "num_tokens": 5501161.0,
- "step": 598
- },
- {
- "epoch": 0.4551671732522796,
- "grad_norm": 2.2130985260009766,
- "learning_rate": 4.860940925593703e-06,
- "loss": 0.4823192059993744,
- "mean_token_accuracy": 0.8462972640991211,
- "num_tokens": 5509544.0,
- "step": 599
- },
- {
- "epoch": 0.45592705167173253,
- "grad_norm": 3.029191732406616,
- "learning_rate": 4.86025132048039e-06,
- "loss": 0.523664116859436,
- "mean_token_accuracy": 0.8229140043258667,
- "num_tokens": 5514586.0,
- "step": 600
- },
- {
- "epoch": 0.4566869300911854,
- "grad_norm": 1.6983962059020996,
- "learning_rate": 4.859560058851844e-06,
- "loss": 0.4832698106765747,
- "mean_token_accuracy": 0.8403248190879822,
- "num_tokens": 5525773.0,
- "step": 601
- },
- {
- "epoch": 0.4574468085106383,
- "grad_norm": 3.0504038333892822,
- "learning_rate": 4.8588671411932195e-06,
- "loss": 0.5158926248550415,
- "mean_token_accuracy": 0.8098392486572266,
- "num_tokens": 5529739.0,
- "step": 602
- },
- {
- "epoch": 0.4582066869300912,
- "grad_norm": 2.584836483001709,
- "learning_rate": 4.858172567990832e-06,
- "loss": 0.5724587440490723,
- "mean_token_accuracy": 0.8128519058227539,
- "num_tokens": 5535763.0,
- "step": 603
- },
- {
- "epoch": 0.45896656534954405,
- "grad_norm": 2.0514042377471924,
- "learning_rate": 4.857476339732162e-06,
- "loss": 0.4337679445743561,
- "mean_token_accuracy": 0.8405929207801819,
- "num_tokens": 5543075.0,
- "step": 604
- },
- {
- "epoch": 0.45972644376899696,
- "grad_norm": 2.2949347496032715,
- "learning_rate": 4.856778456905846e-06,
- "loss": 0.46532145142555237,
- "mean_token_accuracy": 0.8345137238502502,
- "num_tokens": 5549035.0,
- "step": 605
- },
- {
- "epoch": 0.46048632218844987,
- "grad_norm": 2.2067551612854004,
- "learning_rate": 4.856078920001689e-06,
- "loss": 0.5855136513710022,
- "mean_token_accuracy": 0.8043795228004456,
- "num_tokens": 5555545.0,
- "step": 606
- },
- {
- "epoch": 0.4612462006079027,
- "grad_norm": 2.101945161819458,
- "learning_rate": 4.855377729510648e-06,
- "loss": 0.6071814298629761,
- "mean_token_accuracy": 0.7973253130912781,
- "num_tokens": 5563615.0,
- "step": 607
- },
- {
- "epoch": 0.46200607902735563,
- "grad_norm": 2.5958821773529053,
- "learning_rate": 4.8546748859248504e-06,
- "loss": 0.6278061866760254,
- "mean_token_accuracy": 0.7864972352981567,
- "num_tokens": 5570078.0,
- "step": 608
- },
- {
- "epoch": 0.4627659574468085,
- "grad_norm": 2.778101921081543,
- "learning_rate": 4.853970389737576e-06,
- "loss": 0.35521194338798523,
- "mean_token_accuracy": 0.8752605319023132,
- "num_tokens": 5573995.0,
- "step": 609
- },
- {
- "epoch": 0.4635258358662614,
- "grad_norm": 2.600534677505493,
- "learning_rate": 4.8532642414432675e-06,
- "loss": 0.6541563868522644,
- "mean_token_accuracy": 0.7843613028526306,
- "num_tokens": 5580333.0,
- "step": 610
- },
- {
- "epoch": 0.4642857142857143,
- "grad_norm": 1.778337836265564,
- "learning_rate": 4.852556441537528e-06,
- "loss": 0.3561405837535858,
- "mean_token_accuracy": 0.8579353094100952,
- "num_tokens": 5588430.0,
- "step": 611
- },
- {
- "epoch": 0.46504559270516715,
- "grad_norm": 1.5653862953186035,
- "learning_rate": 4.851846990517118e-06,
- "loss": 0.6067906618118286,
- "mean_token_accuracy": 0.7919317483901978,
- "num_tokens": 5601700.0,
- "step": 612
- },
- {
- "epoch": 0.46580547112462006,
- "grad_norm": 1.6097723245620728,
- "learning_rate": 4.851135888879958e-06,
- "loss": 0.446664422750473,
- "mean_token_accuracy": 0.8441969156265259,
- "num_tokens": 5612063.0,
- "step": 613
- },
- {
- "epoch": 0.46656534954407297,
- "grad_norm": 1.961207389831543,
- "learning_rate": 4.850423137125126e-06,
- "loss": 0.5508605241775513,
- "mean_token_accuracy": 0.8240450024604797,
- "num_tokens": 5620245.0,
- "step": 614
- },
- {
- "epoch": 0.4673252279635258,
- "grad_norm": 2.2189085483551025,
- "learning_rate": 4.8497087357528585e-06,
- "loss": 0.6805076599121094,
- "mean_token_accuracy": 0.771978497505188,
- "num_tokens": 5629590.0,
- "step": 615
- },
- {
- "epoch": 0.46808510638297873,
- "grad_norm": 2.5176279544830322,
- "learning_rate": 4.8489926852645505e-06,
- "loss": 0.4512156844139099,
- "mean_token_accuracy": 0.836459755897522,
- "num_tokens": 5635259.0,
- "step": 616
- },
- {
- "epoch": 0.4688449848024316,
- "grad_norm": 1.5327287912368774,
- "learning_rate": 4.848274986162754e-06,
- "loss": 0.4884302616119385,
- "mean_token_accuracy": 0.8194037079811096,
- "num_tokens": 5649993.0,
- "step": 617
- },
- {
- "epoch": 0.4696048632218845,
- "grad_norm": 2.184554100036621,
- "learning_rate": 4.847555638951177e-06,
- "loss": 0.5141451358795166,
- "mean_token_accuracy": 0.8245922327041626,
- "num_tokens": 5657375.0,
- "step": 618
- },
- {
- "epoch": 0.4703647416413374,
- "grad_norm": 1.6143407821655273,
- "learning_rate": 4.846834644134686e-06,
- "loss": 0.4276641607284546,
- "mean_token_accuracy": 0.8481845855712891,
- "num_tokens": 5667941.0,
- "step": 619
- },
- {
- "epoch": 0.47112462006079026,
- "grad_norm": 2.3747270107269287,
- "learning_rate": 4.846112002219301e-06,
- "loss": 0.5608246922492981,
- "mean_token_accuracy": 0.8073011040687561,
- "num_tokens": 5675042.0,
- "step": 620
- },
- {
- "epoch": 0.47188449848024316,
- "grad_norm": 2.390404224395752,
- "learning_rate": 4.845387713712203e-06,
- "loss": 0.46616724133491516,
- "mean_token_accuracy": 0.8468319177627563,
- "num_tokens": 5680207.0,
- "step": 621
- },
- {
- "epoch": 0.4726443768996961,
- "grad_norm": 1.7245099544525146,
- "learning_rate": 4.844661779121723e-06,
- "loss": 0.5652435421943665,
- "mean_token_accuracy": 0.8010749816894531,
- "num_tokens": 5693759.0,
- "step": 622
- },
- {
- "epoch": 0.4734042553191489,
- "grad_norm": 2.6923108100891113,
- "learning_rate": 4.843934198957351e-06,
- "loss": 0.6254661679267883,
- "mean_token_accuracy": 0.8236024975776672,
- "num_tokens": 5699916.0,
- "step": 623
- },
- {
- "epoch": 0.47416413373860183,
- "grad_norm": 2.516901969909668,
- "learning_rate": 4.84320497372973e-06,
- "loss": 0.6334252953529358,
- "mean_token_accuracy": 0.7803834676742554,
- "num_tokens": 5706554.0,
- "step": 624
- },
- {
- "epoch": 0.4749240121580547,
- "grad_norm": 2.3744447231292725,
- "learning_rate": 4.842474103950658e-06,
- "loss": 0.4221811890602112,
- "mean_token_accuracy": 0.8639545440673828,
- "num_tokens": 5711756.0,
- "step": 625
- },
- {
- "epoch": 0.4756838905775076,
- "grad_norm": 3.2373476028442383,
- "learning_rate": 4.841741590133089e-06,
- "loss": 0.6637828946113586,
- "mean_token_accuracy": 0.7968347072601318,
- "num_tokens": 5716458.0,
- "step": 626
- },
- {
- "epoch": 0.4764437689969605,
- "grad_norm": 2.153888463973999,
- "learning_rate": 4.841007432791129e-06,
- "loss": 0.4877486228942871,
- "mean_token_accuracy": 0.8345249891281128,
- "num_tokens": 5723155.0,
- "step": 627
- },
- {
- "epoch": 0.47720364741641336,
- "grad_norm": 2.120497703552246,
- "learning_rate": 4.8402716324400375e-06,
- "loss": 0.37323033809661865,
- "mean_token_accuracy": 0.8734050393104553,
- "num_tokens": 5729171.0,
- "step": 628
- },
- {
- "epoch": 0.47796352583586627,
- "grad_norm": 1.5294172763824463,
- "learning_rate": 4.839534189596228e-06,
- "loss": 0.4057067334651947,
- "mean_token_accuracy": 0.8523319959640503,
- "num_tokens": 5740112.0,
- "step": 629
- },
- {
- "epoch": 0.4787234042553192,
- "grad_norm": 2.1913886070251465,
- "learning_rate": 4.8387951047772656e-06,
- "loss": 0.4835960865020752,
- "mean_token_accuracy": 0.8438145518302917,
- "num_tokens": 5746838.0,
- "step": 630
- },
- {
- "epoch": 0.479483282674772,
- "grad_norm": 1.482897162437439,
- "learning_rate": 4.838054378501868e-06,
- "loss": 0.46967992186546326,
- "mean_token_accuracy": 0.8315759897232056,
- "num_tokens": 5760428.0,
- "step": 631
- },
- {
- "epoch": 0.48024316109422494,
- "grad_norm": 1.38850998878479,
- "learning_rate": 4.837312011289907e-06,
- "loss": 0.41845446825027466,
- "mean_token_accuracy": 0.8557186126708984,
- "num_tokens": 5773437.0,
- "step": 632
- },
- {
- "epoch": 0.4810030395136778,
- "grad_norm": 3.8337457180023193,
- "learning_rate": 4.836568003662403e-06,
- "loss": 0.5102912187576294,
- "mean_token_accuracy": 0.830644965171814,
- "num_tokens": 5776367.0,
- "step": 633
- },
- {
- "epoch": 0.4817629179331307,
- "grad_norm": 1.2084007263183594,
- "learning_rate": 4.8358223561415304e-06,
- "loss": 0.3835333585739136,
- "mean_token_accuracy": 0.8639016151428223,
- "num_tokens": 5792246.0,
- "step": 634
- },
- {
- "epoch": 0.4825227963525836,
- "grad_norm": 1.939408540725708,
- "learning_rate": 4.835075069250613e-06,
- "loss": 0.4044850468635559,
- "mean_token_accuracy": 0.8488376140594482,
- "num_tokens": 5799853.0,
- "step": 635
- },
- {
- "epoch": 0.48328267477203646,
- "grad_norm": 1.345870852470398,
- "learning_rate": 4.8343261435141245e-06,
- "loss": 0.46660199761390686,
- "mean_token_accuracy": 0.8371681571006775,
- "num_tokens": 5817478.0,
- "step": 636
- },
- {
- "epoch": 0.48404255319148937,
- "grad_norm": 1.6531339883804321,
- "learning_rate": 4.833575579457691e-06,
- "loss": 0.3886989951133728,
- "mean_token_accuracy": 0.8763507008552551,
- "num_tokens": 5825739.0,
- "step": 637
- },
- {
- "epoch": 0.4848024316109423,
- "grad_norm": 1.6443969011306763,
- "learning_rate": 4.832823377608088e-06,
- "loss": 0.4070289731025696,
- "mean_token_accuracy": 0.8586630821228027,
- "num_tokens": 5837917.0,
- "step": 638
- },
- {
- "epoch": 0.48556231003039513,
- "grad_norm": 2.005136013031006,
- "learning_rate": 4.832069538493237e-06,
- "loss": 0.40616685152053833,
- "mean_token_accuracy": 0.8571510314941406,
- "num_tokens": 5845250.0,
- "step": 639
- },
- {
- "epoch": 0.48632218844984804,
- "grad_norm": 1.5244266986846924,
- "learning_rate": 4.831314062642213e-06,
- "loss": 0.49530288577079773,
- "mean_token_accuracy": 0.8328841924667358,
- "num_tokens": 5857407.0,
- "step": 640
- },
- {
- "epoch": 0.4870820668693009,
- "grad_norm": 1.9876971244812012,
- "learning_rate": 4.830556950585239e-06,
- "loss": 0.4583776593208313,
- "mean_token_accuracy": 0.8427221179008484,
- "num_tokens": 5865391.0,
- "step": 641
- },
- {
- "epoch": 0.4878419452887538,
- "grad_norm": 3.023336172103882,
- "learning_rate": 4.829798202853683e-06,
- "loss": 0.6134771108627319,
- "mean_token_accuracy": 0.7981935739517212,
- "num_tokens": 5870729.0,
- "step": 642
- },
- {
- "epoch": 0.4886018237082067,
- "grad_norm": 1.8889515399932861,
- "learning_rate": 4.829037819980065e-06,
- "loss": 0.4420135021209717,
- "mean_token_accuracy": 0.8480775356292725,
- "num_tokens": 5878982.0,
- "step": 643
- },
- {
- "epoch": 0.48936170212765956,
- "grad_norm": 2.2408435344696045,
- "learning_rate": 4.828275802498051e-06,
- "loss": 0.525706946849823,
- "mean_token_accuracy": 0.8271557092666626,
- "num_tokens": 5885097.0,
- "step": 644
- },
- {
- "epoch": 0.49012158054711247,
- "grad_norm": 1.9734224081039429,
- "learning_rate": 4.827512150942454e-06,
- "loss": 0.44246578216552734,
- "mean_token_accuracy": 0.8456668257713318,
- "num_tokens": 5893941.0,
- "step": 645
- },
- {
- "epoch": 0.4908814589665654,
- "grad_norm": 1.9618173837661743,
- "learning_rate": 4.8267468658492335e-06,
- "loss": 0.5119768381118774,
- "mean_token_accuracy": 0.8355510830879211,
- "num_tokens": 5902829.0,
- "step": 646
- },
- {
- "epoch": 0.49164133738601823,
- "grad_norm": 1.7181587219238281,
- "learning_rate": 4.825979947755496e-06,
- "loss": 0.5666520595550537,
- "mean_token_accuracy": 0.7951971888542175,
- "num_tokens": 5915212.0,
- "step": 647
- },
- {
- "epoch": 0.49240121580547114,
- "grad_norm": 3.0121164321899414,
- "learning_rate": 4.8252113971994955e-06,
- "loss": 0.628632128238678,
- "mean_token_accuracy": 0.8041050434112549,
- "num_tokens": 5921410.0,
- "step": 648
- },
- {
- "epoch": 0.493161094224924,
- "grad_norm": 2.9980475902557373,
- "learning_rate": 4.824441214720629e-06,
- "loss": 0.4507424831390381,
- "mean_token_accuracy": 0.8636263608932495,
- "num_tokens": 5925179.0,
- "step": 649
- },
- {
- "epoch": 0.4939209726443769,
- "grad_norm": 2.0096445083618164,
- "learning_rate": 4.823669400859441e-06,
- "loss": 0.602759838104248,
- "mean_token_accuracy": 0.8104915618896484,
- "num_tokens": 5934160.0,
- "step": 650
- },
- {
- "epoch": 0.4946808510638298,
- "grad_norm": 1.1186442375183105,
- "learning_rate": 4.8228959561576195e-06,
- "loss": 0.41168469190597534,
- "mean_token_accuracy": 0.8461419939994812,
- "num_tokens": 5954163.0,
- "step": 651
- },
- {
- "epoch": 0.49544072948328266,
- "grad_norm": 1.855465054512024,
- "learning_rate": 4.822120881157998e-06,
- "loss": 0.5049735307693481,
- "mean_token_accuracy": 0.8225747346878052,
- "num_tokens": 5963840.0,
- "step": 652
- },
- {
- "epoch": 0.49620060790273557,
- "grad_norm": 3.550563335418701,
- "learning_rate": 4.821344176404554e-06,
- "loss": 0.49025264382362366,
- "mean_token_accuracy": 0.8265978693962097,
- "num_tokens": 5967358.0,
- "step": 653
- },
- {
- "epoch": 0.4969604863221885,
- "grad_norm": 3.063910484313965,
- "learning_rate": 4.820565842442408e-06,
- "loss": 0.5652767419815063,
- "mean_token_accuracy": 0.811700701713562,
- "num_tokens": 5971858.0,
- "step": 654
- },
- {
- "epoch": 0.49772036474164133,
- "grad_norm": 2.4613308906555176,
- "learning_rate": 4.819785879817827e-06,
- "loss": 0.5296125411987305,
- "mean_token_accuracy": 0.8336488008499146,
- "num_tokens": 5977442.0,
- "step": 655
- },
- {
- "epoch": 0.49848024316109424,
- "grad_norm": 2.342519760131836,
- "learning_rate": 4.819004289078217e-06,
- "loss": 0.5753380060195923,
- "mean_token_accuracy": 0.7922406792640686,
- "num_tokens": 5984531.0,
- "step": 656
- },
- {
- "epoch": 0.4992401215805471,
- "grad_norm": 2.0410680770874023,
- "learning_rate": 4.818221070772129e-06,
- "loss": 0.5433275699615479,
- "mean_token_accuracy": 0.8043830990791321,
- "num_tokens": 5992642.0,
- "step": 657
- },
- {
- "epoch": 0.5,
- "grad_norm": 1.4999698400497437,
- "learning_rate": 4.8174362254492555e-06,
- "loss": 0.5248899459838867,
- "mean_token_accuracy": 0.8107168674468994,
- "num_tokens": 6005543.0,
- "step": 658
- },
- {
- "epoch": 0.5007598784194529,
- "grad_norm": 1.9494401216506958,
- "learning_rate": 4.816649753660431e-06,
- "loss": 0.41291385889053345,
- "mean_token_accuracy": 0.8650569915771484,
- "num_tokens": 6012185.0,
- "step": 659
- },
- {
- "epoch": 0.5015197568389058,
- "grad_norm": 2.7514095306396484,
- "learning_rate": 4.815861655957632e-06,
- "loss": 0.4244142770767212,
- "mean_token_accuracy": 0.8485112190246582,
- "num_tokens": 6016809.0,
- "step": 660
- },
- {
- "epoch": 0.5022796352583586,
- "grad_norm": 1.4354928731918335,
- "learning_rate": 4.815071932893976e-06,
- "loss": 0.4332060217857361,
- "mean_token_accuracy": 0.8386815786361694,
- "num_tokens": 6034795.0,
- "step": 661
- },
- {
- "epoch": 0.5030395136778115,
- "grad_norm": 1.3113417625427246,
- "learning_rate": 4.81428058502372e-06,
- "loss": 0.5415540933609009,
- "mean_token_accuracy": 0.8115285038948059,
- "num_tokens": 6053624.0,
- "step": 662
- },
- {
- "epoch": 0.5037993920972644,
- "grad_norm": 1.820868730545044,
- "learning_rate": 4.813487612902265e-06,
- "loss": 0.5360245108604431,
- "mean_token_accuracy": 0.8313555717468262,
- "num_tokens": 6063399.0,
- "step": 663
- },
- {
- "epoch": 0.5045592705167173,
- "grad_norm": 2.347001552581787,
- "learning_rate": 4.812693017086145e-06,
- "loss": 0.4926982820034027,
- "mean_token_accuracy": 0.8137006759643555,
- "num_tokens": 6070111.0,
- "step": 664
- },
- {
- "epoch": 0.5053191489361702,
- "grad_norm": 1.8830888271331787,
- "learning_rate": 4.811896798133042e-06,
- "loss": 0.5419014692306519,
- "mean_token_accuracy": 0.8027454614639282,
- "num_tokens": 6081090.0,
- "step": 665
- },
- {
- "epoch": 0.506079027355623,
- "grad_norm": 2.3258056640625,
- "learning_rate": 4.811098956601772e-06,
- "loss": 0.4629337787628174,
- "mean_token_accuracy": 0.8416580557823181,
- "num_tokens": 6087921.0,
- "step": 666
- },
- {
- "epoch": 0.506838905775076,
- "grad_norm": 1.9578291177749634,
- "learning_rate": 4.810299493052289e-06,
- "loss": 0.40305402874946594,
- "mean_token_accuracy": 0.8529061079025269,
- "num_tokens": 6100034.0,
- "step": 667
- },
- {
- "epoch": 0.5075987841945289,
- "grad_norm": 2.800635576248169,
- "learning_rate": 4.809498408045691e-06,
- "loss": 0.5087342262268066,
- "mean_token_accuracy": 0.8214689493179321,
- "num_tokens": 6104742.0,
- "step": 668
- },
- {
- "epoch": 0.5083586626139818,
- "grad_norm": 1.5318149328231812,
- "learning_rate": 4.808695702144206e-06,
- "loss": 0.4733222723007202,
- "mean_token_accuracy": 0.837577223777771,
- "num_tokens": 6117242.0,
- "step": 669
- },
- {
- "epoch": 0.5091185410334347,
- "grad_norm": 1.2368661165237427,
- "learning_rate": 4.807891375911207e-06,
- "loss": 0.3929097056388855,
- "mean_token_accuracy": 0.8331400752067566,
- "num_tokens": 6133509.0,
- "step": 670
- },
- {
- "epoch": 0.5098784194528876,
- "grad_norm": 2.4711415767669678,
- "learning_rate": 4.8070854299112e-06,
- "loss": 0.6294851303100586,
- "mean_token_accuracy": 0.7956781983375549,
- "num_tokens": 6140294.0,
- "step": 671
- },
- {
- "epoch": 0.5106382978723404,
- "grad_norm": 2.590961217880249,
- "learning_rate": 4.806277864709828e-06,
- "loss": 0.580160915851593,
- "mean_token_accuracy": 0.809589684009552,
- "num_tokens": 6145803.0,
- "step": 672
- },
- {
- "epoch": 0.5113981762917933,
- "grad_norm": 2.4653842449188232,
- "learning_rate": 4.805468680873874e-06,
- "loss": 0.5262120366096497,
- "mean_token_accuracy": 0.822458803653717,
- "num_tokens": 6151236.0,
- "step": 673
- },
- {
- "epoch": 0.5121580547112462,
- "grad_norm": 2.860720157623291,
- "learning_rate": 4.804657878971252e-06,
- "loss": 0.4007391035556793,
- "mean_token_accuracy": 0.8637382984161377,
- "num_tokens": 6155310.0,
- "step": 674
- },
- {
- "epoch": 0.5129179331306991,
- "grad_norm": 2.520282030105591,
- "learning_rate": 4.803845459571014e-06,
- "loss": 0.45798182487487793,
- "mean_token_accuracy": 0.8270114660263062,
- "num_tokens": 6160326.0,
- "step": 675
- },
- {
- "epoch": 0.513677811550152,
- "grad_norm": 2.7290921211242676,
- "learning_rate": 4.803031423243349e-06,
- "loss": 0.5745848417282104,
- "mean_token_accuracy": 0.8401234745979309,
- "num_tokens": 6165709.0,
- "step": 676
- },
- {
- "epoch": 0.5144376899696048,
- "grad_norm": 1.6678650379180908,
- "learning_rate": 4.802215770559578e-06,
- "loss": 0.5257721543312073,
- "mean_token_accuracy": 0.8241991996765137,
- "num_tokens": 6177875.0,
- "step": 677
- },
- {
- "epoch": 0.5151975683890577,
- "grad_norm": 2.1720468997955322,
- "learning_rate": 4.801398502092156e-06,
- "loss": 0.45342206954956055,
- "mean_token_accuracy": 0.8463799953460693,
- "num_tokens": 6185415.0,
- "step": 678
- },
- {
- "epoch": 0.5159574468085106,
- "grad_norm": 2.282259702682495,
- "learning_rate": 4.800579618414677e-06,
- "loss": 0.4864169955253601,
- "mean_token_accuracy": 0.8300632238388062,
- "num_tokens": 6191832.0,
- "step": 679
- },
- {
- "epoch": 0.5167173252279635,
- "grad_norm": 2.0092248916625977,
- "learning_rate": 4.799759120101861e-06,
- "loss": 0.5781463980674744,
- "mean_token_accuracy": 0.8267031908035278,
- "num_tokens": 6199440.0,
- "step": 680
- },
- {
- "epoch": 0.5174772036474165,
- "grad_norm": 1.396580696105957,
- "learning_rate": 4.798937007729568e-06,
- "loss": 0.49689239263534546,
- "mean_token_accuracy": 0.8257499933242798,
- "num_tokens": 6213840.0,
- "step": 681
- },
- {
- "epoch": 0.5182370820668692,
- "grad_norm": 1.9060769081115723,
- "learning_rate": 4.798113281874788e-06,
- "loss": 0.48969539999961853,
- "mean_token_accuracy": 0.8171790838241577,
- "num_tokens": 6223006.0,
- "step": 682
- },
- {
- "epoch": 0.5189969604863222,
- "grad_norm": 1.6255282163619995,
- "learning_rate": 4.797287943115642e-06,
- "loss": 0.5532330870628357,
- "mean_token_accuracy": 0.8173393607139587,
- "num_tokens": 6234857.0,
- "step": 683
- },
- {
- "epoch": 0.5197568389057751,
- "grad_norm": 1.6923905611038208,
- "learning_rate": 4.796460992031386e-06,
- "loss": 0.4880887269973755,
- "mean_token_accuracy": 0.834983229637146,
- "num_tokens": 6245252.0,
- "step": 684
- },
- {
- "epoch": 0.520516717325228,
- "grad_norm": 2.13161301612854,
- "learning_rate": 4.7956324292024045e-06,
- "loss": 0.5687593817710876,
- "mean_token_accuracy": 0.7996571063995361,
- "num_tokens": 6253726.0,
- "step": 685
- },
- {
- "epoch": 0.5212765957446809,
- "grad_norm": 2.509375810623169,
- "learning_rate": 4.794802255210217e-06,
- "loss": 0.5396929979324341,
- "mean_token_accuracy": 0.8007107973098755,
- "num_tokens": 6259238.0,
- "step": 686
- },
- {
- "epoch": 0.5220364741641338,
- "grad_norm": 2.393710136413574,
- "learning_rate": 4.793970470637469e-06,
- "loss": 0.6165191531181335,
- "mean_token_accuracy": 0.7891418933868408,
- "num_tokens": 6266325.0,
- "step": 687
- },
- {
- "epoch": 0.5227963525835866,
- "grad_norm": 1.511647343635559,
- "learning_rate": 4.7931370760679415e-06,
- "loss": 0.4773876965045929,
- "mean_token_accuracy": 0.8381044864654541,
- "num_tokens": 6277447.0,
- "step": 688
- },
- {
- "epoch": 0.5235562310030395,
- "grad_norm": 2.206587314605713,
- "learning_rate": 4.792302072086542e-06,
- "loss": 0.5482058525085449,
- "mean_token_accuracy": 0.8239108920097351,
- "num_tokens": 6285163.0,
- "step": 689
- },
- {
- "epoch": 0.5243161094224924,
- "grad_norm": 3.018146514892578,
- "learning_rate": 4.7914654592793065e-06,
- "loss": 0.4880615472793579,
- "mean_token_accuracy": 0.8361308574676514,
- "num_tokens": 6289386.0,
- "step": 690
- },
- {
- "epoch": 0.5250759878419453,
- "grad_norm": 1.6469231843948364,
- "learning_rate": 4.790627238233405e-06,
- "loss": 0.4164774715900421,
- "mean_token_accuracy": 0.8496290445327759,
- "num_tokens": 6298915.0,
- "step": 691
- },
- {
- "epoch": 0.5258358662613982,
- "grad_norm": 2.352505922317505,
- "learning_rate": 4.789787409537131e-06,
- "loss": 0.5366303324699402,
- "mean_token_accuracy": 0.8350417613983154,
- "num_tokens": 6306130.0,
- "step": 692
- },
- {
- "epoch": 0.526595744680851,
- "grad_norm": 1.7463021278381348,
- "learning_rate": 4.7889459737799105e-06,
- "loss": 0.4389137923717499,
- "mean_token_accuracy": 0.8463300466537476,
- "num_tokens": 6315503.0,
- "step": 693
- },
- {
- "epoch": 0.5273556231003039,
- "grad_norm": 2.257706642150879,
- "learning_rate": 4.788102931552294e-06,
- "loss": 0.5309344530105591,
- "mean_token_accuracy": 0.8164352178573608,
- "num_tokens": 6321852.0,
- "step": 694
- },
- {
- "epoch": 0.5281155015197568,
- "grad_norm": 2.392732620239258,
- "learning_rate": 4.787258283445962e-06,
- "loss": 0.3956204056739807,
- "mean_token_accuracy": 0.8671456575393677,
- "num_tokens": 6327380.0,
- "step": 695
- },
- {
- "epoch": 0.5288753799392097,
- "grad_norm": 2.210514545440674,
- "learning_rate": 4.786412030053721e-06,
- "loss": 0.4842875003814697,
- "mean_token_accuracy": 0.8508446216583252,
- "num_tokens": 6334898.0,
- "step": 696
- },
- {
- "epoch": 0.5296352583586627,
- "grad_norm": 1.8678946495056152,
- "learning_rate": 4.785564171969503e-06,
- "loss": 0.47399595379829407,
- "mean_token_accuracy": 0.8514996767044067,
- "num_tokens": 6346374.0,
- "step": 697
- },
- {
- "epoch": 0.5303951367781155,
- "grad_norm": 2.604079484939575,
- "learning_rate": 4.784714709788368e-06,
- "loss": 0.5950228571891785,
- "mean_token_accuracy": 0.7983481884002686,
- "num_tokens": 6351648.0,
- "step": 698
- },
- {
- "epoch": 0.5311550151975684,
- "grad_norm": 1.662381649017334,
- "learning_rate": 4.783863644106502e-06,
- "loss": 0.41616758704185486,
- "mean_token_accuracy": 0.8554803133010864,
- "num_tokens": 6360506.0,
- "step": 699
- },
- {
- "epoch": 0.5319148936170213,
- "grad_norm": 1.6300342082977295,
- "learning_rate": 4.783010975521216e-06,
- "loss": 0.43029269576072693,
- "mean_token_accuracy": 0.8443028926849365,
- "num_tokens": 6370675.0,
- "step": 700
- },
- {
- "epoch": 0.5326747720364742,
- "grad_norm": 1.731873869895935,
- "learning_rate": 4.782156704630944e-06,
- "loss": 0.4383814334869385,
- "mean_token_accuracy": 0.8443183898925781,
- "num_tokens": 6381803.0,
- "step": 701
- },
- {
- "epoch": 0.5334346504559271,
- "grad_norm": 3.1788413524627686,
- "learning_rate": 4.7813008320352475e-06,
- "loss": 0.32194480299949646,
- "mean_token_accuracy": 0.8870962858200073,
- "num_tokens": 6389263.0,
- "step": 702
- },
- {
- "epoch": 0.53419452887538,
- "grad_norm": 2.099513530731201,
- "learning_rate": 4.78044335833481e-06,
- "loss": 0.36962923407554626,
- "mean_token_accuracy": 0.8661133646965027,
- "num_tokens": 6395589.0,
- "step": 703
- },
- {
- "epoch": 0.5349544072948328,
- "grad_norm": 1.4859435558319092,
- "learning_rate": 4.77958428413144e-06,
- "loss": 0.4619954824447632,
- "mean_token_accuracy": 0.8438555002212524,
- "num_tokens": 6407470.0,
- "step": 704
- },
- {
- "epoch": 0.5357142857142857,
- "grad_norm": 1.2561073303222656,
- "learning_rate": 4.7787236100280685e-06,
- "loss": 0.3770977258682251,
- "mean_token_accuracy": 0.8515733480453491,
- "num_tokens": 6422888.0,
- "step": 705
- },
- {
- "epoch": 0.5364741641337386,
- "grad_norm": 1.4455817937850952,
- "learning_rate": 4.777861336628751e-06,
- "loss": 0.46481069922447205,
- "mean_token_accuracy": 0.8502002954483032,
- "num_tokens": 6441266.0,
- "step": 706
- },
- {
- "epoch": 0.5372340425531915,
- "grad_norm": 1.1387295722961426,
- "learning_rate": 4.7769974645386616e-06,
- "loss": 0.36964765191078186,
- "mean_token_accuracy": 0.8719524145126343,
- "num_tokens": 6463686.0,
- "step": 707
- },
- {
- "epoch": 0.5379939209726444,
- "grad_norm": 1.7179663181304932,
- "learning_rate": 4.776131994364102e-06,
- "loss": 0.4231719970703125,
- "mean_token_accuracy": 0.8416585922241211,
- "num_tokens": 6472956.0,
- "step": 708
- },
- {
- "epoch": 0.5387537993920972,
- "grad_norm": 1.6328502893447876,
- "learning_rate": 4.775264926712489e-06,
- "loss": 0.5836569666862488,
- "mean_token_accuracy": 0.8039724230766296,
- "num_tokens": 6485773.0,
- "step": 709
- },
- {
- "epoch": 0.5395136778115501,
- "grad_norm": 1.8515360355377197,
- "learning_rate": 4.774396262192368e-06,
- "loss": 0.5477553009986877,
- "mean_token_accuracy": 0.8136521577835083,
- "num_tokens": 6496379.0,
- "step": 710
- },
- {
- "epoch": 0.540273556231003,
- "grad_norm": 1.741858959197998,
- "learning_rate": 4.7735260014133986e-06,
- "loss": 0.4663267731666565,
- "mean_token_accuracy": 0.8473691940307617,
- "num_tokens": 6507652.0,
- "step": 711
- },
- {
- "epoch": 0.541033434650456,
- "grad_norm": 1.7516659498214722,
- "learning_rate": 4.772654144986364e-06,
- "loss": 0.374914288520813,
- "mean_token_accuracy": 0.8600220680236816,
- "num_tokens": 6519030.0,
- "step": 712
- },
- {
- "epoch": 0.5417933130699089,
- "grad_norm": 2.662343978881836,
- "learning_rate": 4.7717806935231665e-06,
- "loss": 0.4206875264644623,
- "mean_token_accuracy": 0.8544126749038696,
- "num_tokens": 6523669.0,
- "step": 713
- },
- {
- "epoch": 0.5425531914893617,
- "grad_norm": 1.4088834524154663,
- "learning_rate": 4.770905647636828e-06,
- "loss": 0.5824331045150757,
- "mean_token_accuracy": 0.7857901453971863,
- "num_tokens": 6540560.0,
- "step": 714
- },
- {
- "epoch": 0.5433130699088146,
- "grad_norm": 2.173656940460205,
- "learning_rate": 4.77002900794149e-06,
- "loss": 0.555023729801178,
- "mean_token_accuracy": 0.8067290782928467,
- "num_tokens": 6548946.0,
- "step": 715
- },
- {
- "epoch": 0.5440729483282675,
- "grad_norm": 2.121018648147583,
- "learning_rate": 4.769150775052411e-06,
- "loss": 0.559730052947998,
- "mean_token_accuracy": 0.8166372776031494,
- "num_tokens": 6556065.0,
- "step": 716
- },
- {
- "epoch": 0.5448328267477204,
- "grad_norm": 3.335866928100586,
- "learning_rate": 4.768270949585968e-06,
- "loss": 0.6442267894744873,
- "mean_token_accuracy": 0.7858607769012451,
- "num_tokens": 6560615.0,
- "step": 717
- },
- {
- "epoch": 0.5455927051671733,
- "grad_norm": 2.3813695907592773,
- "learning_rate": 4.767389532159659e-06,
- "loss": 0.4027421474456787,
- "mean_token_accuracy": 0.8635619282722473,
- "num_tokens": 6565841.0,
- "step": 718
- },
- {
- "epoch": 0.5463525835866262,
- "grad_norm": 2.0657708644866943,
- "learning_rate": 4.766506523392095e-06,
- "loss": 0.38899827003479004,
- "mean_token_accuracy": 0.8660480380058289,
- "num_tokens": 6572362.0,
- "step": 719
- },
- {
- "epoch": 0.547112462006079,
- "grad_norm": 1.093705415725708,
- "learning_rate": 4.765621923903005e-06,
- "loss": 0.45967352390289307,
- "mean_token_accuracy": 0.8338102102279663,
- "num_tokens": 6595998.0,
- "step": 720
- },
- {
- "epoch": 0.5478723404255319,
- "grad_norm": 2.942065954208374,
- "learning_rate": 4.764735734313236e-06,
- "loss": 0.42910510301589966,
- "mean_token_accuracy": 0.8406122922897339,
- "num_tokens": 6601075.0,
- "step": 721
- },
- {
- "epoch": 0.5486322188449848,
- "grad_norm": 2.049011707305908,
- "learning_rate": 4.763847955244749e-06,
- "loss": 0.5584231615066528,
- "mean_token_accuracy": 0.8171684741973877,
- "num_tokens": 6609310.0,
- "step": 722
- },
- {
- "epoch": 0.5493920972644377,
- "grad_norm": 2.485543966293335,
- "learning_rate": 4.762958587320623e-06,
- "loss": 0.5396170020103455,
- "mean_token_accuracy": 0.8158525824546814,
- "num_tokens": 6616185.0,
- "step": 723
- },
- {
- "epoch": 0.5501519756838906,
- "grad_norm": 1.87015962600708,
- "learning_rate": 4.762067631165049e-06,
- "loss": 0.49739527702331543,
- "mean_token_accuracy": 0.8303765654563904,
- "num_tokens": 6625629.0,
- "step": 724
- },
- {
- "epoch": 0.5509118541033434,
- "grad_norm": 4.239654541015625,
- "learning_rate": 4.761175087403336e-06,
- "loss": 0.6029239296913147,
- "mean_token_accuracy": 0.8123486042022705,
- "num_tokens": 6629194.0,
- "step": 725
- },
- {
- "epoch": 0.5516717325227963,
- "grad_norm": 2.0134730339050293,
- "learning_rate": 4.760280956661904e-06,
- "loss": 0.4777873754501343,
- "mean_token_accuracy": 0.8283513784408569,
- "num_tokens": 6636929.0,
- "step": 726
- },
- {
- "epoch": 0.5524316109422492,
- "grad_norm": 1.991780400276184,
- "learning_rate": 4.75938523956829e-06,
- "loss": 0.4631248116493225,
- "mean_token_accuracy": 0.8275107741355896,
- "num_tokens": 6645135.0,
- "step": 727
- },
- {
- "epoch": 0.5531914893617021,
- "grad_norm": 1.423792839050293,
- "learning_rate": 4.75848793675114e-06,
- "loss": 0.49630722403526306,
- "mean_token_accuracy": 0.8388000130653381,
- "num_tokens": 6662690.0,
- "step": 728
- },
- {
- "epoch": 0.5539513677811551,
- "grad_norm": 2.345294952392578,
- "learning_rate": 4.757589048840219e-06,
- "loss": 0.37830638885498047,
- "mean_token_accuracy": 0.8782080411911011,
- "num_tokens": 6667285.0,
- "step": 729
- },
- {
- "epoch": 0.5547112462006079,
- "grad_norm": 2.7452144622802734,
- "learning_rate": 4.756688576466398e-06,
- "loss": 0.51595538854599,
- "mean_token_accuracy": 0.8441770672798157,
- "num_tokens": 6672324.0,
- "step": 730
- },
- {
- "epoch": 0.5554711246200608,
- "grad_norm": 1.5247859954833984,
- "learning_rate": 4.755786520261666e-06,
- "loss": 0.48365193605422974,
- "mean_token_accuracy": 0.8276445269584656,
- "num_tokens": 6685296.0,
- "step": 731
- },
- {
- "epoch": 0.5562310030395137,
- "grad_norm": 1.4018276929855347,
- "learning_rate": 4.75488288085912e-06,
- "loss": 0.3876481354236603,
- "mean_token_accuracy": 0.8612343072891235,
- "num_tokens": 6697515.0,
- "step": 732
- },
- {
- "epoch": 0.5569908814589666,
- "grad_norm": 2.9570324420928955,
- "learning_rate": 4.753977658892967e-06,
- "loss": 0.5468149185180664,
- "mean_token_accuracy": 0.8054271340370178,
- "num_tokens": 6702194.0,
- "step": 733
- },
- {
- "epoch": 0.5577507598784195,
- "grad_norm": 1.9282715320587158,
- "learning_rate": 4.753070854998529e-06,
- "loss": 0.4758574962615967,
- "mean_token_accuracy": 0.8379775285720825,
- "num_tokens": 6709938.0,
- "step": 734
- },
- {
- "epoch": 0.5585106382978723,
- "grad_norm": 1.981264591217041,
- "learning_rate": 4.752162469812234e-06,
- "loss": 0.48461222648620605,
- "mean_token_accuracy": 0.833509087562561,
- "num_tokens": 6718125.0,
- "step": 735
- },
- {
- "epoch": 0.5592705167173252,
- "grad_norm": 1.1643427610397339,
- "learning_rate": 4.751252503971624e-06,
- "loss": 0.410121887922287,
- "mean_token_accuracy": 0.8221402764320374,
- "num_tokens": 6735125.0,
- "step": 736
- },
- {
- "epoch": 0.5600303951367781,
- "grad_norm": 1.786566972732544,
- "learning_rate": 4.750340958115346e-06,
- "loss": 0.5964341163635254,
- "mean_token_accuracy": 0.8038164377212524,
- "num_tokens": 6747369.0,
- "step": 737
- },
- {
- "epoch": 0.560790273556231,
- "grad_norm": 1.7256991863250732,
- "learning_rate": 4.749427832883158e-06,
- "loss": 0.48737066984176636,
- "mean_token_accuracy": 0.830894947052002,
- "num_tokens": 6758115.0,
- "step": 738
- },
- {
- "epoch": 0.5615501519756839,
- "grad_norm": 1.997747540473938,
- "learning_rate": 4.748513128915928e-06,
- "loss": 0.5238886475563049,
- "mean_token_accuracy": 0.8066858053207397,
- "num_tokens": 6766111.0,
- "step": 739
- },
- {
- "epoch": 0.5623100303951368,
- "grad_norm": 2.127016305923462,
- "learning_rate": 4.747596846855629e-06,
- "loss": 0.5045586228370667,
- "mean_token_accuracy": 0.821424126625061,
- "num_tokens": 6772893.0,
- "step": 740
- },
- {
- "epoch": 0.5630699088145896,
- "grad_norm": 1.7664796113967896,
- "learning_rate": 4.7466789873453446e-06,
- "loss": 0.42954835295677185,
- "mean_token_accuracy": 0.8533384799957275,
- "num_tokens": 6785133.0,
- "step": 741
- },
- {
- "epoch": 0.5638297872340425,
- "grad_norm": 1.4987404346466064,
- "learning_rate": 4.7457595510292615e-06,
- "loss": 0.5378558039665222,
- "mean_token_accuracy": 0.8184819221496582,
- "num_tokens": 6799563.0,
- "step": 742
- },
- {
- "epoch": 0.5645896656534954,
- "grad_norm": 1.4444655179977417,
- "learning_rate": 4.744838538552678e-06,
- "loss": 0.42193782329559326,
- "mean_token_accuracy": 0.837514340877533,
- "num_tokens": 6812470.0,
- "step": 743
- },
- {
- "epoch": 0.5653495440729484,
- "grad_norm": 3.867751121520996,
- "learning_rate": 4.7439159505619946e-06,
- "loss": 0.4457814693450928,
- "mean_token_accuracy": 0.8630104660987854,
- "num_tokens": 6815652.0,
- "step": 744
- },
- {
- "epoch": 0.5661094224924013,
- "grad_norm": 2.1250710487365723,
- "learning_rate": 4.74299178770472e-06,
- "loss": 0.5638922452926636,
- "mean_token_accuracy": 0.7969781160354614,
- "num_tokens": 6824566.0,
- "step": 745
- },
- {
- "epoch": 0.5668693009118541,
- "grad_norm": 2.547072410583496,
- "learning_rate": 4.742066050629465e-06,
- "loss": 0.5516207814216614,
- "mean_token_accuracy": 0.8160669803619385,
- "num_tokens": 6830589.0,
- "step": 746
- },
- {
- "epoch": 0.567629179331307,
- "grad_norm": 1.2975233793258667,
- "learning_rate": 4.741138739985951e-06,
- "loss": 0.3823344111442566,
- "mean_token_accuracy": 0.8668368458747864,
- "num_tokens": 6842707.0,
- "step": 747
- },
- {
- "epoch": 0.5683890577507599,
- "grad_norm": 1.3410450220108032,
- "learning_rate": 4.740209856424998e-06,
- "loss": 0.5148671269416809,
- "mean_token_accuracy": 0.8188045024871826,
- "num_tokens": 6857624.0,
- "step": 748
- },
- {
- "epoch": 0.5691489361702128,
- "grad_norm": 1.219467282295227,
- "learning_rate": 4.7392794005985324e-06,
- "loss": 0.3998957872390747,
- "mean_token_accuracy": 0.855175256729126,
- "num_tokens": 6875064.0,
- "step": 749
- },
- {
- "epoch": 0.5699088145896657,
- "grad_norm": 1.3530343770980835,
- "learning_rate": 4.738347373159585e-06,
- "loss": 0.5359633564949036,
- "mean_token_accuracy": 0.8178457021713257,
- "num_tokens": 6890911.0,
- "step": 750
- },
- {
- "epoch": 0.5706686930091185,
- "grad_norm": 2.146988868713379,
- "learning_rate": 4.737413774762287e-06,
- "loss": 0.4460008144378662,
- "mean_token_accuracy": 0.8172903060913086,
- "num_tokens": 6896959.0,
- "step": 751
- },
- {
- "epoch": 0.5714285714285714,
- "grad_norm": 1.456023097038269,
- "learning_rate": 4.736478606061876e-06,
- "loss": 0.43616920709609985,
- "mean_token_accuracy": 0.8465108871459961,
- "num_tokens": 6908904.0,
- "step": 752
- },
- {
- "epoch": 0.5721884498480243,
- "grad_norm": 2.9696967601776123,
- "learning_rate": 4.735541867714687e-06,
- "loss": 0.43464532494544983,
- "mean_token_accuracy": 0.8608652353286743,
- "num_tokens": 6913026.0,
- "step": 753
- },
- {
- "epoch": 0.5729483282674772,
- "grad_norm": 2.2990667819976807,
- "learning_rate": 4.73460356037816e-06,
- "loss": 0.6619116067886353,
- "mean_token_accuracy": 0.7821142673492432,
- "num_tokens": 6920588.0,
- "step": 754
- },
- {
- "epoch": 0.5737082066869301,
- "grad_norm": 2.054746389389038,
- "learning_rate": 4.733663684710835e-06,
- "loss": 0.5304250717163086,
- "mean_token_accuracy": 0.8265531063079834,
- "num_tokens": 6928910.0,
- "step": 755
- },
- {
- "epoch": 0.574468085106383,
- "grad_norm": 2.0050594806671143,
- "learning_rate": 4.732722241372354e-06,
- "loss": 0.6393026113510132,
- "mean_token_accuracy": 0.796819806098938,
- "num_tokens": 6940217.0,
- "step": 756
- },
- {
- "epoch": 0.5752279635258358,
- "grad_norm": 1.4285320043563843,
- "learning_rate": 4.731779231023456e-06,
- "loss": 0.5432837009429932,
- "mean_token_accuracy": 0.8104778528213501,
- "num_tokens": 6959101.0,
- "step": 757
- },
- {
- "epoch": 0.5759878419452887,
- "grad_norm": 2.3941943645477295,
- "learning_rate": 4.730834654325984e-06,
- "loss": 0.46550673246383667,
- "mean_token_accuracy": 0.8444503545761108,
- "num_tokens": 6965036.0,
- "step": 758
- },
- {
- "epoch": 0.5767477203647416,
- "grad_norm": 2.3850574493408203,
- "learning_rate": 4.729888511942877e-06,
- "loss": 0.4916389584541321,
- "mean_token_accuracy": 0.8228527307510376,
- "num_tokens": 6971184.0,
- "step": 759
- },
- {
- "epoch": 0.5775075987841946,
- "grad_norm": 1.627480149269104,
- "learning_rate": 4.728940804538176e-06,
- "loss": 0.5863215923309326,
- "mean_token_accuracy": 0.7995302677154541,
- "num_tokens": 6982569.0,
- "step": 760
- },
- {
- "epoch": 0.5782674772036475,
- "grad_norm": 1.1723195314407349,
- "learning_rate": 4.727991532777016e-06,
- "loss": 0.36908864974975586,
- "mean_token_accuracy": 0.8355655670166016,
- "num_tokens": 6998659.0,
- "step": 761
- },
- {
- "epoch": 0.5790273556231003,
- "grad_norm": 1.5324925184249878,
- "learning_rate": 4.727040697325634e-06,
- "loss": 0.557658851146698,
- "mean_token_accuracy": 0.8141458034515381,
- "num_tokens": 7012969.0,
- "step": 762
- },
- {
- "epoch": 0.5797872340425532,
- "grad_norm": 2.4106390476226807,
- "learning_rate": 4.726088298851362e-06,
- "loss": 0.5004243850708008,
- "mean_token_accuracy": 0.8376860618591309,
- "num_tokens": 7018301.0,
- "step": 763
- },
- {
- "epoch": 0.5805471124620061,
- "grad_norm": 2.2594921588897705,
- "learning_rate": 4.725134338022631e-06,
- "loss": 0.6067016124725342,
- "mean_token_accuracy": 0.8100241422653198,
- "num_tokens": 7025201.0,
- "step": 764
- },
- {
- "epoch": 0.581306990881459,
- "grad_norm": 1.4649826288223267,
- "learning_rate": 4.724178815508967e-06,
- "loss": 0.36200693249702454,
- "mean_token_accuracy": 0.8621826171875,
- "num_tokens": 7035112.0,
- "step": 765
- },
- {
- "epoch": 0.5820668693009119,
- "grad_norm": 2.3634560108184814,
- "learning_rate": 4.723221731980993e-06,
- "loss": 0.41862213611602783,
- "mean_token_accuracy": 0.8541463613510132,
- "num_tokens": 7040339.0,
- "step": 766
- },
- {
- "epoch": 0.5828267477203647,
- "grad_norm": 2.7798104286193848,
- "learning_rate": 4.722263088110426e-06,
- "loss": 0.4647108018398285,
- "mean_token_accuracy": 0.8505672216415405,
- "num_tokens": 7044880.0,
- "step": 767
- },
- {
- "epoch": 0.5835866261398176,
- "grad_norm": 2.070528507232666,
- "learning_rate": 4.721302884570079e-06,
- "loss": 0.5147565007209778,
- "mean_token_accuracy": 0.8113877773284912,
- "num_tokens": 7052433.0,
- "step": 768
- },
- {
- "epoch": 0.5843465045592705,
- "grad_norm": 2.1953284740448,
- "learning_rate": 4.720341122033862e-06,
- "loss": 0.5075466632843018,
- "mean_token_accuracy": 0.8474211096763611,
- "num_tokens": 7058686.0,
- "step": 769
- },
- {
- "epoch": 0.5851063829787234,
- "grad_norm": 1.9287755489349365,
- "learning_rate": 4.719377801176774e-06,
- "loss": 0.5382202863693237,
- "mean_token_accuracy": 0.8148090243339539,
- "num_tokens": 7067538.0,
- "step": 770
- },
- {
- "epoch": 0.5858662613981763,
- "grad_norm": 1.5574456453323364,
- "learning_rate": 4.718412922674913e-06,
- "loss": 0.43406790494918823,
- "mean_token_accuracy": 0.8477081060409546,
- "num_tokens": 7077853.0,
- "step": 771
- },
- {
- "epoch": 0.5866261398176292,
- "grad_norm": 1.5490336418151855,
- "learning_rate": 4.717446487205466e-06,
- "loss": 0.43164271116256714,
- "mean_token_accuracy": 0.8504570126533508,
- "num_tokens": 7091728.0,
- "step": 772
- },
- {
- "epoch": 0.587386018237082,
- "grad_norm": 1.6945984363555908,
- "learning_rate": 4.716478495446717e-06,
- "loss": 0.5153743624687195,
- "mean_token_accuracy": 0.8213579058647156,
- "num_tokens": 7108680.0,
- "step": 773
- },
- {
- "epoch": 0.5881458966565349,
- "grad_norm": 2.2633883953094482,
- "learning_rate": 4.715508948078037e-06,
- "loss": 0.45254790782928467,
- "mean_token_accuracy": 0.8392219543457031,
- "num_tokens": 7115546.0,
- "step": 774
- },
- {
- "epoch": 0.5889057750759878,
- "grad_norm": 1.5731090307235718,
- "learning_rate": 4.714537845779894e-06,
- "loss": 0.38678881525993347,
- "mean_token_accuracy": 0.8800252676010132,
- "num_tokens": 7126360.0,
- "step": 775
- },
- {
- "epoch": 0.5896656534954408,
- "grad_norm": 2.4873392581939697,
- "learning_rate": 4.7135651892338445e-06,
- "loss": 0.5190927386283875,
- "mean_token_accuracy": 0.8145407438278198,
- "num_tokens": 7135705.0,
- "step": 776
- },
- {
- "epoch": 0.5904255319148937,
- "grad_norm": 1.2931004762649536,
- "learning_rate": 4.712590979122534e-06,
- "loss": 0.3686544895172119,
- "mean_token_accuracy": 0.8720537424087524,
- "num_tokens": 7150688.0,
- "step": 777
- },
- {
- "epoch": 0.5911854103343465,
- "grad_norm": 1.6353671550750732,
- "learning_rate": 4.7116152161297045e-06,
- "loss": 0.49065062403678894,
- "mean_token_accuracy": 0.8203760385513306,
- "num_tokens": 7161040.0,
- "step": 778
- },
- {
- "epoch": 0.5919452887537994,
- "grad_norm": 1.2345483303070068,
- "learning_rate": 4.710637900940181e-06,
- "loss": 0.4004976451396942,
- "mean_token_accuracy": 0.8302007913589478,
- "num_tokens": 7178074.0,
- "step": 779
- },
- {
- "epoch": 0.5927051671732523,
- "grad_norm": 2.2506837844848633,
- "learning_rate": 4.7096590342398825e-06,
- "loss": 0.45142874121665955,
- "mean_token_accuracy": 0.8481036424636841,
- "num_tokens": 7184153.0,
- "step": 780
- },
- {
- "epoch": 0.5934650455927052,
- "grad_norm": 1.420479416847229,
- "learning_rate": 4.708678616715815e-06,
- "loss": 0.4802100360393524,
- "mean_token_accuracy": 0.8586992025375366,
- "num_tokens": 7202810.0,
- "step": 781
- },
- {
- "epoch": 0.5942249240121581,
- "grad_norm": 3.457632303237915,
- "learning_rate": 4.707696649056073e-06,
- "loss": 0.5265094041824341,
- "mean_token_accuracy": 0.8260114192962646,
- "num_tokens": 7206396.0,
- "step": 782
- },
- {
- "epoch": 0.5949848024316109,
- "grad_norm": 1.1592093706130981,
- "learning_rate": 4.706713131949839e-06,
- "loss": 0.3708173632621765,
- "mean_token_accuracy": 0.8476542234420776,
- "num_tokens": 7225034.0,
- "step": 783
- },
- {
- "epoch": 0.5957446808510638,
- "grad_norm": 1.6761400699615479,
- "learning_rate": 4.705728066087384e-06,
- "loss": 0.4137252867221832,
- "mean_token_accuracy": 0.8462049961090088,
- "num_tokens": 7237101.0,
- "step": 784
- },
- {
- "epoch": 0.5965045592705167,
- "grad_norm": 2.320185422897339,
- "learning_rate": 4.704741452160064e-06,
- "loss": 0.5157154202461243,
- "mean_token_accuracy": 0.8391785621643066,
- "num_tokens": 7243826.0,
- "step": 785
- },
- {
- "epoch": 0.5972644376899696,
- "grad_norm": 2.079423427581787,
- "learning_rate": 4.703753290860323e-06,
- "loss": 0.4734993278980255,
- "mean_token_accuracy": 0.8353281021118164,
- "num_tokens": 7250175.0,
- "step": 786
- },
- {
- "epoch": 0.5980243161094225,
- "grad_norm": 1.8215159177780151,
- "learning_rate": 4.702763582881692e-06,
- "loss": 0.520193338394165,
- "mean_token_accuracy": 0.844062864780426,
- "num_tokens": 7258868.0,
- "step": 787
- },
- {
- "epoch": 0.5987841945288754,
- "grad_norm": 1.3823071718215942,
- "learning_rate": 4.701772328918784e-06,
- "loss": 0.4177844822406769,
- "mean_token_accuracy": 0.8363165259361267,
- "num_tokens": 7271744.0,
- "step": 788
- },
- {
- "epoch": 0.5995440729483282,
- "grad_norm": 2.4749298095703125,
- "learning_rate": 4.700779529667301e-06,
- "loss": 0.5115069150924683,
- "mean_token_accuracy": 0.8473520278930664,
- "num_tokens": 7277040.0,
- "step": 789
- },
- {
- "epoch": 0.6003039513677811,
- "grad_norm": 1.7072296142578125,
- "learning_rate": 4.699785185824026e-06,
- "loss": 0.5265800952911377,
- "mean_token_accuracy": 0.8161447048187256,
- "num_tokens": 7288288.0,
- "step": 790
- },
- {
- "epoch": 0.601063829787234,
- "grad_norm": 1.6479384899139404,
- "learning_rate": 4.69878929808683e-06,
- "loss": 0.4445168972015381,
- "mean_token_accuracy": 0.8381255865097046,
- "num_tokens": 7298640.0,
- "step": 791
- },
- {
- "epoch": 0.601823708206687,
- "grad_norm": 1.9095896482467651,
- "learning_rate": 4.6977918671546635e-06,
- "loss": 0.5841238498687744,
- "mean_token_accuracy": 0.7971454858779907,
- "num_tokens": 7307220.0,
- "step": 792
- },
- {
- "epoch": 0.6025835866261399,
- "grad_norm": 1.9614146947860718,
- "learning_rate": 4.696792893727562e-06,
- "loss": 0.34684082865715027,
- "mean_token_accuracy": 0.8739526271820068,
- "num_tokens": 7313875.0,
- "step": 793
- },
- {
- "epoch": 0.6033434650455927,
- "grad_norm": 2.015570640563965,
- "learning_rate": 4.695792378506645e-06,
- "loss": 0.42779117822647095,
- "mean_token_accuracy": 0.8625012636184692,
- "num_tokens": 7321439.0,
- "step": 794
- },
- {
- "epoch": 0.6041033434650456,
- "grad_norm": 2.8581228256225586,
- "learning_rate": 4.694790322194111e-06,
- "loss": 0.6519991159439087,
- "mean_token_accuracy": 0.7629562616348267,
- "num_tokens": 7326916.0,
- "step": 795
- },
- {
- "epoch": 0.6048632218844985,
- "grad_norm": 2.482715368270874,
- "learning_rate": 4.693786725493242e-06,
- "loss": 0.532963216304779,
- "mean_token_accuracy": 0.832184910774231,
- "num_tokens": 7333311.0,
- "step": 796
- },
- {
- "epoch": 0.6056231003039514,
- "grad_norm": 1.6076741218566895,
- "learning_rate": 4.692781589108402e-06,
- "loss": 0.43381205201148987,
- "mean_token_accuracy": 0.8402494192123413,
- "num_tokens": 7343731.0,
- "step": 797
- },
- {
- "epoch": 0.6063829787234043,
- "grad_norm": 2.2133216857910156,
- "learning_rate": 4.691774913745033e-06,
- "loss": 0.4380851089954376,
- "mean_token_accuracy": 0.8600908517837524,
- "num_tokens": 7350224.0,
- "step": 798
- },
- {
- "epoch": 0.6071428571428571,
- "grad_norm": 2.046280860900879,
- "learning_rate": 4.690766700109659e-06,
- "loss": 0.3821919560432434,
- "mean_token_accuracy": 0.8691814541816711,
- "num_tokens": 7356717.0,
- "step": 799
- },
- {
- "epoch": 0.60790273556231,
- "grad_norm": 1.8482693433761597,
- "learning_rate": 4.689756948909884e-06,
- "loss": 0.5217651128768921,
- "mean_token_accuracy": 0.803473711013794,
- "num_tokens": 7365806.0,
- "step": 800
- },
- {
- "epoch": 0.6086626139817629,
- "grad_norm": 2.192134141921997,
- "learning_rate": 4.688745660854388e-06,
- "loss": 0.573980987071991,
- "mean_token_accuracy": 0.8198676109313965,
- "num_tokens": 7380281.0,
- "step": 801
- },
- {
- "epoch": 0.6094224924012158,
- "grad_norm": 2.363626718521118,
- "learning_rate": 4.687732836652935e-06,
- "loss": 0.5204599499702454,
- "mean_token_accuracy": 0.8373252153396606,
- "num_tokens": 7386938.0,
- "step": 802
- },
- {
- "epoch": 0.6101823708206687,
- "grad_norm": 1.9320523738861084,
- "learning_rate": 4.686718477016361e-06,
- "loss": 0.47316622734069824,
- "mean_token_accuracy": 0.830596923828125,
- "num_tokens": 7395069.0,
- "step": 803
- },
- {
- "epoch": 0.6109422492401215,
- "grad_norm": 2.6573057174682617,
- "learning_rate": 4.6857025826565845e-06,
- "loss": 0.5495861768722534,
- "mean_token_accuracy": 0.8187421560287476,
- "num_tokens": 7400563.0,
- "step": 804
- },
- {
- "epoch": 0.6117021276595744,
- "grad_norm": 2.0893123149871826,
- "learning_rate": 4.684685154286599e-06,
- "loss": 0.5362675786018372,
- "mean_token_accuracy": 0.8394701480865479,
- "num_tokens": 7406973.0,
- "step": 805
- },
- {
- "epoch": 0.6124620060790273,
- "grad_norm": 2.455130100250244,
- "learning_rate": 4.683666192620474e-06,
- "loss": 0.5405995845794678,
- "mean_token_accuracy": 0.8079100847244263,
- "num_tokens": 7412931.0,
- "step": 806
- },
- {
- "epoch": 0.6132218844984803,
- "grad_norm": 2.311915636062622,
- "learning_rate": 4.682645698373357e-06,
- "loss": 0.5395106077194214,
- "mean_token_accuracy": 0.8156260251998901,
- "num_tokens": 7419699.0,
- "step": 807
- },
- {
- "epoch": 0.6139817629179332,
- "grad_norm": 1.686838984489441,
- "learning_rate": 4.6816236722614694e-06,
- "loss": 0.6034521460533142,
- "mean_token_accuracy": 0.7855954170227051,
- "num_tokens": 7431899.0,
- "step": 808
- },
- {
- "epoch": 0.6147416413373861,
- "grad_norm": 1.682759165763855,
- "learning_rate": 4.680600115002109e-06,
- "loss": 0.48593831062316895,
- "mean_token_accuracy": 0.8229435682296753,
- "num_tokens": 7443187.0,
- "step": 809
- },
- {
- "epoch": 0.6155015197568389,
- "grad_norm": 2.064589738845825,
- "learning_rate": 4.679575027313649e-06,
- "loss": 0.5098468661308289,
- "mean_token_accuracy": 0.8234638571739197,
- "num_tokens": 7450868.0,
- "step": 810
- },
- {
- "epoch": 0.6162613981762918,
- "grad_norm": 2.2063486576080322,
- "learning_rate": 4.6785484099155324e-06,
- "loss": 0.5138497352600098,
- "mean_token_accuracy": 0.8152111172676086,
- "num_tokens": 7457176.0,
- "step": 811
- },
- {
- "epoch": 0.6170212765957447,
- "grad_norm": 1.6258726119995117,
- "learning_rate": 4.67752026352828e-06,
- "loss": 0.4064181447029114,
- "mean_token_accuracy": 0.8720619678497314,
- "num_tokens": 7466557.0,
- "step": 812
- },
- {
- "epoch": 0.6177811550151976,
- "grad_norm": 2.3309383392333984,
- "learning_rate": 4.676490588873486e-06,
- "loss": 0.5180112719535828,
- "mean_token_accuracy": 0.8233879804611206,
- "num_tokens": 7472650.0,
- "step": 813
- },
- {
- "epoch": 0.6185410334346505,
- "grad_norm": 1.4545246362686157,
- "learning_rate": 4.675459386673815e-06,
- "loss": 0.37917959690093994,
- "mean_token_accuracy": 0.8598103523254395,
- "num_tokens": 7485171.0,
- "step": 814
- },
- {
- "epoch": 0.6193009118541033,
- "grad_norm": 2.654231071472168,
- "learning_rate": 4.674426657653003e-06,
- "loss": 0.554074227809906,
- "mean_token_accuracy": 0.8026446104049683,
- "num_tokens": 7490787.0,
- "step": 815
- },
- {
- "epoch": 0.6200607902735562,
- "grad_norm": 1.5543994903564453,
- "learning_rate": 4.67339240253586e-06,
- "loss": 0.6335440278053284,
- "mean_token_accuracy": 0.783241868019104,
- "num_tokens": 7505975.0,
- "step": 816
- },
- {
- "epoch": 0.6208206686930091,
- "grad_norm": 2.079998016357422,
- "learning_rate": 4.672356622048266e-06,
- "loss": 0.5169394016265869,
- "mean_token_accuracy": 0.8088761568069458,
- "num_tokens": 7513470.0,
- "step": 817
- },
- {
- "epoch": 0.621580547112462,
- "grad_norm": 1.5971896648406982,
- "learning_rate": 4.671319316917172e-06,
- "loss": 0.44588586688041687,
- "mean_token_accuracy": 0.8518649339675903,
- "num_tokens": 7524352.0,
- "step": 818
- },
- {
- "epoch": 0.6223404255319149,
- "grad_norm": 2.477579116821289,
- "learning_rate": 4.670280487870599e-06,
- "loss": 0.5713893175125122,
- "mean_token_accuracy": 0.8116940259933472,
- "num_tokens": 7530359.0,
- "step": 819
- },
- {
- "epoch": 0.6231003039513677,
- "grad_norm": 2.066211700439453,
- "learning_rate": 4.669240135637635e-06,
- "loss": 0.5295331478118896,
- "mean_token_accuracy": 0.819536566734314,
- "num_tokens": 7536963.0,
- "step": 820
- },
- {
- "epoch": 0.6238601823708206,
- "grad_norm": 2.1217997074127197,
- "learning_rate": 4.668198260948442e-06,
- "loss": 0.6146406531333923,
- "mean_token_accuracy": 0.7932635545730591,
- "num_tokens": 7545800.0,
- "step": 821
- },
- {
- "epoch": 0.6246200607902735,
- "grad_norm": 2.0173542499542236,
- "learning_rate": 4.667154864534245e-06,
- "loss": 0.6240535974502563,
- "mean_token_accuracy": 0.7883644104003906,
- "num_tokens": 7556165.0,
- "step": 822
- },
- {
- "epoch": 0.6253799392097265,
- "grad_norm": 2.014526128768921,
- "learning_rate": 4.666109947127343e-06,
- "loss": 0.40367332100868225,
- "mean_token_accuracy": 0.8653522729873657,
- "num_tokens": 7562665.0,
- "step": 823
- },
- {
- "epoch": 0.6261398176291794,
- "grad_norm": 2.5078861713409424,
- "learning_rate": 4.665063509461098e-06,
- "loss": 0.5903617739677429,
- "mean_token_accuracy": 0.7902897596359253,
- "num_tokens": 7568922.0,
- "step": 824
- },
- {
- "epoch": 0.6268996960486323,
- "grad_norm": 2.454622745513916,
- "learning_rate": 4.664015552269938e-06,
- "loss": 0.5238361358642578,
- "mean_token_accuracy": 0.838546872138977,
- "num_tokens": 7575965.0,
- "step": 825
- },
- {
- "epoch": 0.6276595744680851,
- "grad_norm": 2.920919418334961,
- "learning_rate": 4.662966076289363e-06,
- "loss": 0.5028782486915588,
- "mean_token_accuracy": 0.8311152458190918,
- "num_tokens": 7580193.0,
- "step": 826
- },
- {
- "epoch": 0.628419452887538,
- "grad_norm": 1.545382022857666,
- "learning_rate": 4.661915082255932e-06,
- "loss": 0.4817378520965576,
- "mean_token_accuracy": 0.8373227119445801,
- "num_tokens": 7593024.0,
- "step": 827
- },
- {
- "epoch": 0.6291793313069909,
- "grad_norm": 1.5152469873428345,
- "learning_rate": 4.6608625709072766e-06,
- "loss": 0.4693033695220947,
- "mean_token_accuracy": 0.8150848150253296,
- "num_tokens": 7606459.0,
- "step": 828
- },
- {
- "epoch": 0.6299392097264438,
- "grad_norm": 2.1310224533081055,
- "learning_rate": 4.659808542982089e-06,
- "loss": 0.4653395414352417,
- "mean_token_accuracy": 0.8286294341087341,
- "num_tokens": 7613036.0,
- "step": 829
- },
- {
- "epoch": 0.6306990881458967,
- "grad_norm": 2.1949679851531982,
- "learning_rate": 4.658752999220125e-06,
- "loss": 0.3698633909225464,
- "mean_token_accuracy": 0.871590793132782,
- "num_tokens": 7618527.0,
- "step": 830
- },
- {
- "epoch": 0.6314589665653495,
- "grad_norm": 2.2770416736602783,
- "learning_rate": 4.657695940362207e-06,
- "loss": 0.5202419757843018,
- "mean_token_accuracy": 0.817577600479126,
- "num_tokens": 7624459.0,
- "step": 831
- },
- {
- "epoch": 0.6322188449848024,
- "grad_norm": 1.402042269706726,
- "learning_rate": 4.65663736715022e-06,
- "loss": 0.51531583070755,
- "mean_token_accuracy": 0.8228116631507874,
- "num_tokens": 7639371.0,
- "step": 832
- },
- {
- "epoch": 0.6329787234042553,
- "grad_norm": 3.3554883003234863,
- "learning_rate": 4.65557728032711e-06,
- "loss": 0.6771188378334045,
- "mean_token_accuracy": 0.7880028486251831,
- "num_tokens": 7643924.0,
- "step": 833
- },
- {
- "epoch": 0.6337386018237082,
- "grad_norm": 2.081040143966675,
- "learning_rate": 4.654515680636888e-06,
- "loss": 0.5712796449661255,
- "mean_token_accuracy": 0.8177868127822876,
- "num_tokens": 7651881.0,
- "step": 834
- },
- {
- "epoch": 0.6344984802431611,
- "grad_norm": 0.9128716588020325,
- "learning_rate": 4.653452568824625e-06,
- "loss": 0.3423936069011688,
- "mean_token_accuracy": 0.8782886266708374,
- "num_tokens": 7677829.0,
- "step": 835
- },
- {
- "epoch": 0.6352583586626139,
- "grad_norm": 3.49015736579895,
- "learning_rate": 4.652387945636454e-06,
- "loss": 0.34657734632492065,
- "mean_token_accuracy": 0.8770567178726196,
- "num_tokens": 7680796.0,
- "step": 836
- },
- {
- "epoch": 0.6360182370820668,
- "grad_norm": 2.026247501373291,
- "learning_rate": 4.651321811819568e-06,
- "loss": 0.5098431706428528,
- "mean_token_accuracy": 0.8216961622238159,
- "num_tokens": 7688746.0,
- "step": 837
- },
- {
- "epoch": 0.6367781155015197,
- "grad_norm": 2.444343090057373,
- "learning_rate": 4.650254168122222e-06,
- "loss": 0.5490090250968933,
- "mean_token_accuracy": 0.8092857599258423,
- "num_tokens": 7695220.0,
- "step": 838
- },
- {
- "epoch": 0.6375379939209727,
- "grad_norm": 2.0171122550964355,
- "learning_rate": 4.649185015293728e-06,
- "loss": 0.47221142053604126,
- "mean_token_accuracy": 0.8514408469200134,
- "num_tokens": 7702759.0,
- "step": 839
- },
- {
- "epoch": 0.6382978723404256,
- "grad_norm": 1.9800984859466553,
- "learning_rate": 4.64811435408446e-06,
- "loss": 0.5238803625106812,
- "mean_token_accuracy": 0.8479194641113281,
- "num_tokens": 7714017.0,
- "step": 840
- },
- {
- "epoch": 0.6390577507598785,
- "grad_norm": 3.0674357414245605,
- "learning_rate": 4.647042185245848e-06,
- "loss": 0.4668245315551758,
- "mean_token_accuracy": 0.8381714820861816,
- "num_tokens": 7717801.0,
- "step": 841
- },
- {
- "epoch": 0.6398176291793313,
- "grad_norm": 1.5672820806503296,
- "learning_rate": 4.645968509530381e-06,
- "loss": 0.4428741931915283,
- "mean_token_accuracy": 0.8416479825973511,
- "num_tokens": 7728342.0,
- "step": 842
- },
- {
- "epoch": 0.6405775075987842,
- "grad_norm": 2.3042354583740234,
- "learning_rate": 4.644893327691608e-06,
- "loss": 0.49937760829925537,
- "mean_token_accuracy": 0.827070951461792,
- "num_tokens": 7734576.0,
- "step": 843
- },
- {
- "epoch": 0.6413373860182371,
- "grad_norm": 2.057772159576416,
- "learning_rate": 4.6438166404841316e-06,
- "loss": 0.5912986993789673,
- "mean_token_accuracy": 0.805509090423584,
- "num_tokens": 7742481.0,
- "step": 844
- },
- {
- "epoch": 0.64209726443769,
- "grad_norm": 1.9688186645507812,
- "learning_rate": 4.6427384486636115e-06,
- "loss": 0.482401967048645,
- "mean_token_accuracy": 0.8358086347579956,
- "num_tokens": 7750002.0,
- "step": 845
- },
- {
- "epoch": 0.6428571428571429,
- "grad_norm": 2.6852948665618896,
- "learning_rate": 4.6416587529867665e-06,
- "loss": 0.5479315519332886,
- "mean_token_accuracy": 0.8091106414794922,
- "num_tokens": 7755578.0,
- "step": 846
- },
- {
- "epoch": 0.6436170212765957,
- "grad_norm": 2.0547337532043457,
- "learning_rate": 4.640577554211366e-06,
- "loss": 0.5327274203300476,
- "mean_token_accuracy": 0.8280376195907593,
- "num_tokens": 7763513.0,
- "step": 847
- },
- {
- "epoch": 0.6443768996960486,
- "grad_norm": 2.0328633785247803,
- "learning_rate": 4.63949485309624e-06,
- "loss": 0.4814409613609314,
- "mean_token_accuracy": 0.8527672290802002,
- "num_tokens": 7771131.0,
- "step": 848
- },
- {
- "epoch": 0.6451367781155015,
- "grad_norm": 1.5892863273620605,
- "learning_rate": 4.638410650401267e-06,
- "loss": 0.4492785334587097,
- "mean_token_accuracy": 0.846997857093811,
- "num_tokens": 7781572.0,
- "step": 849
- },
- {
- "epoch": 0.6458966565349544,
- "grad_norm": 1.8295910358428955,
- "learning_rate": 4.637324946887384e-06,
- "loss": 0.37088239192962646,
- "mean_token_accuracy": 0.8616628646850586,
- "num_tokens": 7788604.0,
- "step": 850
- },
- {
- "epoch": 0.6466565349544073,
- "grad_norm": 3.380040168762207,
- "learning_rate": 4.636237743316578e-06,
- "loss": 0.4737280607223511,
- "mean_token_accuracy": 0.855940580368042,
- "num_tokens": 7792504.0,
- "step": 851
- },
- {
- "epoch": 0.6474164133738601,
- "grad_norm": 2.8790009021759033,
- "learning_rate": 4.635149040451891e-06,
- "loss": 0.39790448546409607,
- "mean_token_accuracy": 0.8710698485374451,
- "num_tokens": 7796333.0,
- "step": 852
- },
- {
- "epoch": 0.648176291793313,
- "grad_norm": 1.914914608001709,
- "learning_rate": 4.634058839057417e-06,
- "loss": 0.2954312562942505,
- "mean_token_accuracy": 0.8880234956741333,
- "num_tokens": 7802456.0,
- "step": 853
- },
- {
- "epoch": 0.648936170212766,
- "grad_norm": 1.3709120750427246,
- "learning_rate": 4.632967139898301e-06,
- "loss": 0.43224576115608215,
- "mean_token_accuracy": 0.8446190357208252,
- "num_tokens": 7816770.0,
- "step": 854
- },
- {
- "epoch": 0.6496960486322189,
- "grad_norm": 1.6579312086105347,
- "learning_rate": 4.63187394374074e-06,
- "loss": 0.3535553514957428,
- "mean_token_accuracy": 0.8738704919815063,
- "num_tokens": 7824963.0,
- "step": 855
- },
- {
- "epoch": 0.6504559270516718,
- "grad_norm": 2.4055678844451904,
- "learning_rate": 4.63077925135198e-06,
- "loss": 0.5078744292259216,
- "mean_token_accuracy": 0.8430874347686768,
- "num_tokens": 7830962.0,
- "step": 856
- },
- {
- "epoch": 0.6512158054711246,
- "grad_norm": 2.5171499252319336,
- "learning_rate": 4.629683063500319e-06,
- "loss": 0.5172419548034668,
- "mean_token_accuracy": 0.8087141513824463,
- "num_tokens": 7836638.0,
- "step": 857
- },
- {
- "epoch": 0.6519756838905775,
- "grad_norm": 1.7588486671447754,
- "learning_rate": 4.628585380955104e-06,
- "loss": 0.5759496092796326,
- "mean_token_accuracy": 0.8043236136436462,
- "num_tokens": 7844654.0,
- "step": 858
- },
- {
- "epoch": 0.6527355623100304,
- "grad_norm": 1.5887070894241333,
- "learning_rate": 4.62748620448673e-06,
- "loss": 0.41849038004875183,
- "mean_token_accuracy": 0.8556643724441528,
- "num_tokens": 7855642.0,
- "step": 859
- },
- {
- "epoch": 0.6534954407294833,
- "grad_norm": 3.227942705154419,
- "learning_rate": 4.626385534866642e-06,
- "loss": 0.5279449224472046,
- "mean_token_accuracy": 0.8250958323478699,
- "num_tokens": 7859890.0,
- "step": 860
- },
- {
- "epoch": 0.6542553191489362,
- "grad_norm": 2.440467119216919,
- "learning_rate": 4.625283372867333e-06,
- "loss": 0.5294933319091797,
- "mean_token_accuracy": 0.8235013484954834,
- "num_tokens": 7866766.0,
- "step": 861
- },
- {
- "epoch": 0.6550151975683891,
- "grad_norm": 2.4106903076171875,
- "learning_rate": 4.624179719262342e-06,
- "loss": 0.5662813186645508,
- "mean_token_accuracy": 0.8061668872833252,
- "num_tokens": 7872809.0,
- "step": 862
- },
- {
- "epoch": 0.6557750759878419,
- "grad_norm": 3.5151145458221436,
- "learning_rate": 4.623074574826254e-06,
- "loss": 0.5471097230911255,
- "mean_token_accuracy": 0.8220691084861755,
- "num_tokens": 7876136.0,
- "step": 863
- },
- {
- "epoch": 0.6565349544072948,
- "grad_norm": 1.5319840908050537,
- "learning_rate": 4.621967940334705e-06,
- "loss": 0.4178982377052307,
- "mean_token_accuracy": 0.8517135977745056,
- "num_tokens": 7886113.0,
- "step": 864
- },
- {
- "epoch": 0.6572948328267477,
- "grad_norm": 1.63701331615448,
- "learning_rate": 4.620859816564371e-06,
- "loss": 0.4666512608528137,
- "mean_token_accuracy": 0.8223508596420288,
- "num_tokens": 7897982.0,
- "step": 865
- },
- {
- "epoch": 0.6580547112462006,
- "grad_norm": 2.1515414714813232,
- "learning_rate": 4.619750204292978e-06,
- "loss": 0.5359305143356323,
- "mean_token_accuracy": 0.8192868232727051,
- "num_tokens": 7904947.0,
- "step": 866
- },
- {
- "epoch": 0.6588145896656535,
- "grad_norm": 2.2140955924987793,
- "learning_rate": 4.618639104299294e-06,
- "loss": 0.5275633931159973,
- "mean_token_accuracy": 0.8120715618133545,
- "num_tokens": 7913913.0,
- "step": 867
- },
- {
- "epoch": 0.6595744680851063,
- "grad_norm": 1.3956893682479858,
- "learning_rate": 4.6175265173631304e-06,
- "loss": 0.4378768503665924,
- "mean_token_accuracy": 0.8479125499725342,
- "num_tokens": 7927979.0,
- "step": 868
- },
- {
- "epoch": 0.6603343465045592,
- "grad_norm": 2.98103928565979,
- "learning_rate": 4.616412444265344e-06,
- "loss": 0.42614591121673584,
- "mean_token_accuracy": 0.8595094680786133,
- "num_tokens": 7934293.0,
- "step": 869
- },
- {
- "epoch": 0.6610942249240122,
- "grad_norm": 2.554845094680786,
- "learning_rate": 4.6152968857878365e-06,
- "loss": 0.3698030412197113,
- "mean_token_accuracy": 0.8717041015625,
- "num_tokens": 7938547.0,
- "step": 870
- },
- {
- "epoch": 0.6618541033434651,
- "grad_norm": 3.0901825428009033,
- "learning_rate": 4.6141798427135475e-06,
- "loss": 0.5037497282028198,
- "mean_token_accuracy": 0.8354041576385498,
- "num_tokens": 7942829.0,
- "step": 871
- },
- {
- "epoch": 0.662613981762918,
- "grad_norm": 2.8692073822021484,
- "learning_rate": 4.6130613158264605e-06,
- "loss": 0.5418164134025574,
- "mean_token_accuracy": 0.8298909664154053,
- "num_tokens": 7949303.0,
- "step": 872
- },
- {
- "epoch": 0.6633738601823708,
- "grad_norm": 3.960404396057129,
- "learning_rate": 4.611941305911602e-06,
- "loss": 0.6284480094909668,
- "mean_token_accuracy": 0.837495744228363,
- "num_tokens": 7952486.0,
- "step": 873
- },
- {
- "epoch": 0.6641337386018237,
- "grad_norm": 2.6690115928649902,
- "learning_rate": 4.610819813755038e-06,
- "loss": 0.5214360952377319,
- "mean_token_accuracy": 0.8213508129119873,
- "num_tokens": 7957559.0,
- "step": 874
- },
- {
- "epoch": 0.6648936170212766,
- "grad_norm": 2.3376171588897705,
- "learning_rate": 4.609696840143875e-06,
- "loss": 0.46887528896331787,
- "mean_token_accuracy": 0.8438819646835327,
- "num_tokens": 7962826.0,
- "step": 875
- },
- {
- "epoch": 0.6656534954407295,
- "grad_norm": 2.2222683429718018,
- "learning_rate": 4.6085723858662575e-06,
- "loss": 0.5607719421386719,
- "mean_token_accuracy": 0.8128405809402466,
- "num_tokens": 7970131.0,
- "step": 876
- },
- {
- "epoch": 0.6664133738601824,
- "grad_norm": 2.069091558456421,
- "learning_rate": 4.607446451711372e-06,
- "loss": 0.506301760673523,
- "mean_token_accuracy": 0.8256827592849731,
- "num_tokens": 7977524.0,
- "step": 877
- },
- {
- "epoch": 0.6671732522796353,
- "grad_norm": 1.3724967241287231,
- "learning_rate": 4.606319038469443e-06,
- "loss": 0.43285101652145386,
- "mean_token_accuracy": 0.8525032997131348,
- "num_tokens": 7989174.0,
- "step": 878
- },
- {
- "epoch": 0.6679331306990881,
- "grad_norm": 2.278205156326294,
- "learning_rate": 4.605190146931731e-06,
- "loss": 0.4845905303955078,
- "mean_token_accuracy": 0.8284652829170227,
- "num_tokens": 7998524.0,
- "step": 879
- },
- {
- "epoch": 0.668693009118541,
- "grad_norm": 1.3871766328811646,
- "learning_rate": 4.604059777890537e-06,
- "loss": 0.5736679434776306,
- "mean_token_accuracy": 0.8223285675048828,
- "num_tokens": 8015776.0,
- "step": 880
- },
- {
- "epoch": 0.6694528875379939,
- "grad_norm": 1.926164984703064,
- "learning_rate": 4.602927932139197e-06,
- "loss": 0.4133230447769165,
- "mean_token_accuracy": 0.8653768301010132,
- "num_tokens": 8022979.0,
- "step": 881
- },
- {
- "epoch": 0.6702127659574468,
- "grad_norm": 2.109272003173828,
- "learning_rate": 4.601794610472083e-06,
- "loss": 0.7005600929260254,
- "mean_token_accuracy": 0.7777010202407837,
- "num_tokens": 8032618.0,
- "step": 882
- },
- {
- "epoch": 0.6709726443768997,
- "grad_norm": 2.077977418899536,
- "learning_rate": 4.6006598136846056e-06,
- "loss": 0.5278208255767822,
- "mean_token_accuracy": 0.8230358958244324,
- "num_tokens": 8040534.0,
- "step": 883
- },
- {
- "epoch": 0.6717325227963525,
- "grad_norm": 1.678581714630127,
- "learning_rate": 4.599523542573207e-06,
- "loss": 0.4955351650714874,
- "mean_token_accuracy": 0.8270003795623779,
- "num_tokens": 8052249.0,
- "step": 884
- },
- {
- "epoch": 0.6724924012158054,
- "grad_norm": 2.0751662254333496,
- "learning_rate": 4.598385797935368e-06,
- "loss": 0.5266247987747192,
- "mean_token_accuracy": 0.8263581991195679,
- "num_tokens": 8060600.0,
- "step": 885
- },
- {
- "epoch": 0.6732522796352584,
- "grad_norm": 2.418405771255493,
- "learning_rate": 4.5972465805696e-06,
- "loss": 0.4481425881385803,
- "mean_token_accuracy": 0.846164345741272,
- "num_tokens": 8066025.0,
- "step": 886
- },
- {
- "epoch": 0.6740121580547113,
- "grad_norm": 2.3936474323272705,
- "learning_rate": 4.596105891275449e-06,
- "loss": 0.4553404450416565,
- "mean_token_accuracy": 0.8412896394729614,
- "num_tokens": 8071544.0,
- "step": 887
- },
- {
- "epoch": 0.6747720364741642,
- "grad_norm": 2.2024407386779785,
- "learning_rate": 4.594963730853497e-06,
- "loss": 0.6218541860580444,
- "mean_token_accuracy": 0.7890232801437378,
- "num_tokens": 8079061.0,
- "step": 888
- },
- {
- "epoch": 0.675531914893617,
- "grad_norm": 2.51015567779541,
- "learning_rate": 4.593820100105355e-06,
- "loss": 0.5149124264717102,
- "mean_token_accuracy": 0.8241918087005615,
- "num_tokens": 8084293.0,
- "step": 889
- },
- {
- "epoch": 0.6762917933130699,
- "grad_norm": 1.8748939037322998,
- "learning_rate": 4.5926749998336665e-06,
- "loss": 0.50836181640625,
- "mean_token_accuracy": 0.8067223429679871,
- "num_tokens": 8092511.0,
- "step": 890
- },
- {
- "epoch": 0.6770516717325228,
- "grad_norm": 1.801193118095398,
- "learning_rate": 4.5915284308421075e-06,
- "loss": 0.4372861683368683,
- "mean_token_accuracy": 0.8510604500770569,
- "num_tokens": 8101174.0,
- "step": 891
- },
- {
- "epoch": 0.6778115501519757,
- "grad_norm": 2.6476457118988037,
- "learning_rate": 4.590380393935383e-06,
- "loss": 0.38700711727142334,
- "mean_token_accuracy": 0.8659796714782715,
- "num_tokens": 8105398.0,
- "step": 892
- },
- {
- "epoch": 0.6785714285714286,
- "grad_norm": 1.1147183179855347,
- "learning_rate": 4.589230889919232e-06,
- "loss": 0.38546115159988403,
- "mean_token_accuracy": 0.8570581674575806,
- "num_tokens": 8127394.0,
- "step": 893
- },
- {
- "epoch": 0.6793313069908815,
- "grad_norm": 2.908905506134033,
- "learning_rate": 4.588079919600419e-06,
- "loss": 0.5108504295349121,
- "mean_token_accuracy": 0.8121406435966492,
- "num_tokens": 8131801.0,
- "step": 894
- },
- {
- "epoch": 0.6800911854103343,
- "grad_norm": 3.1522326469421387,
- "learning_rate": 4.586927483786739e-06,
- "loss": 0.44059112668037415,
- "mean_token_accuracy": 0.8448011875152588,
- "num_tokens": 8154416.0,
- "step": 895
- },
- {
- "epoch": 0.6808510638297872,
- "grad_norm": 1.5142440795898438,
- "learning_rate": 4.585773583287017e-06,
- "loss": 0.513217568397522,
- "mean_token_accuracy": 0.8386049270629883,
- "num_tokens": 8171156.0,
- "step": 896
- },
- {
- "epoch": 0.6816109422492401,
- "grad_norm": 2.597881317138672,
- "learning_rate": 4.584618218911104e-06,
- "loss": 0.4937712550163269,
- "mean_token_accuracy": 0.8223681449890137,
- "num_tokens": 8176124.0,
- "step": 897
- },
- {
- "epoch": 0.682370820668693,
- "grad_norm": 1.8185619115829468,
- "learning_rate": 4.583461391469879e-06,
- "loss": 0.519811749458313,
- "mean_token_accuracy": 0.8169777393341064,
- "num_tokens": 8185136.0,
- "step": 898
- },
- {
- "epoch": 0.6831306990881459,
- "grad_norm": 3.2061994075775146,
- "learning_rate": 4.582303101775249e-06,
- "loss": 0.4655115008354187,
- "mean_token_accuracy": 0.8425977230072021,
- "num_tokens": 8188864.0,
- "step": 899
- },
- {
- "epoch": 0.6838905775075987,
- "grad_norm": 1.3485229015350342,
- "learning_rate": 4.581143350640146e-06,
- "loss": 0.5014470815658569,
- "mean_token_accuracy": 0.8273109197616577,
- "num_tokens": 8203460.0,
- "step": 900
- },
- {
- "epoch": 0.6846504559270516,
- "grad_norm": 1.3264713287353516,
- "learning_rate": 4.579982138878527e-06,
- "loss": 0.5073703527450562,
- "mean_token_accuracy": 0.8259357213973999,
- "num_tokens": 8219348.0,
- "step": 901
- },
- {
- "epoch": 0.6854103343465046,
- "grad_norm": 2.4436347484588623,
- "learning_rate": 4.578819467305375e-06,
- "loss": 0.47020310163497925,
- "mean_token_accuracy": 0.8567265272140503,
- "num_tokens": 8224427.0,
- "step": 902
- },
- {
- "epoch": 0.6861702127659575,
- "grad_norm": 1.921749234199524,
- "learning_rate": 4.5776553367367e-06,
- "loss": 0.622514009475708,
- "mean_token_accuracy": 0.7863982319831848,
- "num_tokens": 8233151.0,
- "step": 903
- },
- {
- "epoch": 0.6869300911854104,
- "grad_norm": 1.8815616369247437,
- "learning_rate": 4.576489747989532e-06,
- "loss": 0.4910545349121094,
- "mean_token_accuracy": 0.8147122859954834,
- "num_tokens": 8240762.0,
- "step": 904
- },
- {
- "epoch": 0.6876899696048632,
- "grad_norm": 1.2366989850997925,
- "learning_rate": 4.575322701881926e-06,
- "loss": 0.3947566747665405,
- "mean_token_accuracy": 0.873993992805481,
- "num_tokens": 8259381.0,
- "step": 905
- },
- {
- "epoch": 0.6884498480243161,
- "grad_norm": 1.5767735242843628,
- "learning_rate": 4.57415419923296e-06,
- "loss": 0.57136070728302,
- "mean_token_accuracy": 0.8028088808059692,
- "num_tokens": 8273296.0,
- "step": 906
- },
- {
- "epoch": 0.689209726443769,
- "grad_norm": 2.378675699234009,
- "learning_rate": 4.572984240862733e-06,
- "loss": 0.5894849896430969,
- "mean_token_accuracy": 0.7977708578109741,
- "num_tokens": 8280083.0,
- "step": 907
- },
- {
- "epoch": 0.6899696048632219,
- "grad_norm": 2.0401132106781006,
- "learning_rate": 4.57181282759237e-06,
- "loss": 0.5524613261222839,
- "mean_token_accuracy": 0.8138598203659058,
- "num_tokens": 8288236.0,
- "step": 908
- },
- {
- "epoch": 0.6907294832826748,
- "grad_norm": 2.293701648712158,
- "learning_rate": 4.570639960244011e-06,
- "loss": 0.5154546499252319,
- "mean_token_accuracy": 0.8234660625457764,
- "num_tokens": 8294493.0,
- "step": 909
- },
- {
- "epoch": 0.6914893617021277,
- "grad_norm": 1.9286527633666992,
- "learning_rate": 4.56946563964082e-06,
- "loss": 0.5364264845848083,
- "mean_token_accuracy": 0.8147368431091309,
- "num_tokens": 8303441.0,
- "step": 910
- },
- {
- "epoch": 0.6922492401215805,
- "grad_norm": 1.2571251392364502,
- "learning_rate": 4.5682898666069815e-06,
- "loss": 0.43535223603248596,
- "mean_token_accuracy": 0.859239935874939,
- "num_tokens": 8321548.0,
- "step": 911
- },
- {
- "epoch": 0.6930091185410334,
- "grad_norm": 1.2224860191345215,
- "learning_rate": 4.567112641967697e-06,
- "loss": 0.40205076336860657,
- "mean_token_accuracy": 0.8724711537361145,
- "num_tokens": 8335205.0,
- "step": 912
- },
- {
- "epoch": 0.6937689969604863,
- "grad_norm": 1.2064491510391235,
- "learning_rate": 4.5659339665491894e-06,
- "loss": 0.37790587544441223,
- "mean_token_accuracy": 0.8464339971542358,
- "num_tokens": 8350926.0,
- "step": 913
- },
- {
- "epoch": 0.6945288753799392,
- "grad_norm": 2.1755270957946777,
- "learning_rate": 4.5647538411786965e-06,
- "loss": 0.42034298181533813,
- "mean_token_accuracy": 0.84148108959198,
- "num_tokens": 8356739.0,
- "step": 914
- },
- {
- "epoch": 0.6952887537993921,
- "grad_norm": 1.234864592552185,
- "learning_rate": 4.563572266684478e-06,
- "loss": 0.5062938332557678,
- "mean_token_accuracy": 0.8132052421569824,
- "num_tokens": 8373660.0,
- "step": 915
- },
- {
- "epoch": 0.6960486322188449,
- "grad_norm": 2.4250621795654297,
- "learning_rate": 4.562389243895807e-06,
- "loss": 0.4907791018486023,
- "mean_token_accuracy": 0.8337979912757874,
- "num_tokens": 8378661.0,
- "step": 916
- },
- {
- "epoch": 0.6968085106382979,
- "grad_norm": 1.5018314123153687,
- "learning_rate": 4.561204773642974e-06,
- "loss": 0.41041281819343567,
- "mean_token_accuracy": 0.8569784164428711,
- "num_tokens": 8390322.0,
- "step": 917
- },
- {
- "epoch": 0.6975683890577508,
- "grad_norm": 2.797269344329834,
- "learning_rate": 4.5600188567572874e-06,
- "loss": 0.3146931529045105,
- "mean_token_accuracy": 0.8913302421569824,
- "num_tokens": 8393567.0,
- "step": 918
- },
- {
- "epoch": 0.6983282674772037,
- "grad_norm": 1.4002827405929565,
- "learning_rate": 4.558831494071069e-06,
- "loss": 0.4275597333908081,
- "mean_token_accuracy": 0.8504893779754639,
- "num_tokens": 8407119.0,
- "step": 919
- },
- {
- "epoch": 0.6990881458966566,
- "grad_norm": 1.7045831680297852,
- "learning_rate": 4.557642686417654e-06,
- "loss": 0.49593430757522583,
- "mean_token_accuracy": 0.8185091018676758,
- "num_tokens": 8417408.0,
- "step": 920
- },
- {
- "epoch": 0.6998480243161094,
- "grad_norm": 2.8818066120147705,
- "learning_rate": 4.556452434631396e-06,
- "loss": 0.637908935546875,
- "mean_token_accuracy": 0.7883946895599365,
- "num_tokens": 8422319.0,
- "step": 921
- },
- {
- "epoch": 0.7006079027355623,
- "grad_norm": 2.3587265014648438,
- "learning_rate": 4.555260739547657e-06,
- "loss": 0.38749319314956665,
- "mean_token_accuracy": 0.8774704933166504,
- "num_tokens": 8427315.0,
- "step": 922
- },
- {
- "epoch": 0.7013677811550152,
- "grad_norm": 1.6648749113082886,
- "learning_rate": 4.554067602002815e-06,
- "loss": 0.4044865369796753,
- "mean_token_accuracy": 0.8524141311645508,
- "num_tokens": 8438662.0,
- "step": 923
- },
- {
- "epoch": 0.7021276595744681,
- "grad_norm": 3.467787742614746,
- "learning_rate": 4.55287302283426e-06,
- "loss": 0.591016411781311,
- "mean_token_accuracy": 0.81184983253479,
- "num_tokens": 8442237.0,
- "step": 924
- },
- {
- "epoch": 0.702887537993921,
- "grad_norm": 2.1458635330200195,
- "learning_rate": 4.551677002880395e-06,
- "loss": 0.5017476677894592,
- "mean_token_accuracy": 0.822914183139801,
- "num_tokens": 8449494.0,
- "step": 925
- },
- {
- "epoch": 0.7036474164133738,
- "grad_norm": 2.521714448928833,
- "learning_rate": 4.550479542980632e-06,
- "loss": 0.531912088394165,
- "mean_token_accuracy": 0.8225687742233276,
- "num_tokens": 8454983.0,
- "step": 926
- },
- {
- "epoch": 0.7044072948328267,
- "grad_norm": 3.5248100757598877,
- "learning_rate": 4.549280643975394e-06,
- "loss": 0.4631815254688263,
- "mean_token_accuracy": 0.8443771600723267,
- "num_tokens": 8458504.0,
- "step": 927
- },
- {
- "epoch": 0.7051671732522796,
- "grad_norm": 2.5105819702148438,
- "learning_rate": 4.548080306706114e-06,
- "loss": 0.30487123131752014,
- "mean_token_accuracy": 0.9018767476081848,
- "num_tokens": 8462589.0,
- "step": 928
- },
- {
- "epoch": 0.7059270516717325,
- "grad_norm": 1.3367713689804077,
- "learning_rate": 4.5468785320152365e-06,
- "loss": 0.4355026185512543,
- "mean_token_accuracy": 0.8323584794998169,
- "num_tokens": 8478450.0,
- "step": 929
- },
- {
- "epoch": 0.7066869300911854,
- "grad_norm": 2.2506282329559326,
- "learning_rate": 4.545675320746212e-06,
- "loss": 0.5082957744598389,
- "mean_token_accuracy": 0.823430597782135,
- "num_tokens": 8485991.0,
- "step": 930
- },
- {
- "epoch": 0.7074468085106383,
- "grad_norm": 1.7164632081985474,
- "learning_rate": 4.544470673743502e-06,
- "loss": 0.3960164785385132,
- "mean_token_accuracy": 0.8592486381530762,
- "num_tokens": 8495217.0,
- "step": 931
- },
- {
- "epoch": 0.7082066869300911,
- "grad_norm": 1.5864969491958618,
- "learning_rate": 4.543264591852572e-06,
- "loss": 0.49114471673965454,
- "mean_token_accuracy": 0.8330780267715454,
- "num_tokens": 8508904.0,
- "step": 932
- },
- {
- "epoch": 0.708966565349544,
- "grad_norm": 2.1707003116607666,
- "learning_rate": 4.542057075919898e-06,
- "loss": 0.49895772337913513,
- "mean_token_accuracy": 0.8327431082725525,
- "num_tokens": 8515792.0,
- "step": 933
- },
- {
- "epoch": 0.709726443768997,
- "grad_norm": 1.9002083539962769,
- "learning_rate": 4.54084812679296e-06,
- "loss": 0.4548531472682953,
- "mean_token_accuracy": 0.834532618522644,
- "num_tokens": 8524006.0,
- "step": 934
- },
- {
- "epoch": 0.7104863221884499,
- "grad_norm": 1.8505141735076904,
- "learning_rate": 4.539637745320247e-06,
- "loss": 0.35716521739959717,
- "mean_token_accuracy": 0.872222900390625,
- "num_tokens": 8533647.0,
- "step": 935
- },
- {
- "epoch": 0.7112462006079028,
- "grad_norm": 2.092620849609375,
- "learning_rate": 4.53842593235125e-06,
- "loss": 0.4673694372177124,
- "mean_token_accuracy": 0.8460999131202698,
- "num_tokens": 8540734.0,
- "step": 936
- },
- {
- "epoch": 0.7120060790273556,
- "grad_norm": 2.689514636993408,
- "learning_rate": 4.537212688736466e-06,
- "loss": 0.45461273193359375,
- "mean_token_accuracy": 0.8450704216957092,
- "num_tokens": 8544948.0,
- "step": 937
- },
- {
- "epoch": 0.7127659574468085,
- "grad_norm": 2.4507734775543213,
- "learning_rate": 4.535998015327396e-06,
- "loss": 0.4571906626224518,
- "mean_token_accuracy": 0.8429360389709473,
- "num_tokens": 8550445.0,
- "step": 938
- },
- {
- "epoch": 0.7135258358662614,
- "grad_norm": 1.8960013389587402,
- "learning_rate": 4.534781912976546e-06,
- "loss": 0.4461391568183899,
- "mean_token_accuracy": 0.8487973213195801,
- "num_tokens": 8557630.0,
- "step": 939
- },
- {
- "epoch": 0.7142857142857143,
- "grad_norm": 1.602611780166626,
- "learning_rate": 4.533564382537421e-06,
- "loss": 0.5277102589607239,
- "mean_token_accuracy": 0.8330916166305542,
- "num_tokens": 8570397.0,
- "step": 940
- },
- {
- "epoch": 0.7150455927051672,
- "grad_norm": 1.8936395645141602,
- "learning_rate": 4.532345424864533e-06,
- "loss": 0.38619571924209595,
- "mean_token_accuracy": 0.8514572381973267,
- "num_tokens": 8582673.0,
- "step": 941
- },
- {
- "epoch": 0.71580547112462,
- "grad_norm": 1.3898619413375854,
- "learning_rate": 4.531125040813392e-06,
- "loss": 0.4825032949447632,
- "mean_token_accuracy": 0.833012580871582,
- "num_tokens": 8597239.0,
- "step": 942
- },
- {
- "epoch": 0.7165653495440729,
- "grad_norm": 2.128230571746826,
- "learning_rate": 4.529903231240511e-06,
- "loss": 0.4862118065357208,
- "mean_token_accuracy": 0.8210917711257935,
- "num_tokens": 8605877.0,
- "step": 943
- },
- {
- "epoch": 0.7173252279635258,
- "grad_norm": 1.6552259922027588,
- "learning_rate": 4.528679997003403e-06,
- "loss": 0.5092059373855591,
- "mean_token_accuracy": 0.8247389793395996,
- "num_tokens": 8617060.0,
- "step": 944
- },
- {
- "epoch": 0.7180851063829787,
- "grad_norm": 2.1174771785736084,
- "learning_rate": 4.52745533896058e-06,
- "loss": 0.39110174775123596,
- "mean_token_accuracy": 0.8672944903373718,
- "num_tokens": 8623306.0,
- "step": 945
- },
- {
- "epoch": 0.7188449848024316,
- "grad_norm": 2.8648383617401123,
- "learning_rate": 4.526229257971556e-06,
- "loss": 0.49864327907562256,
- "mean_token_accuracy": 0.8305130004882812,
- "num_tokens": 8627466.0,
- "step": 946
- },
- {
- "epoch": 0.7196048632218845,
- "grad_norm": 2.155514717102051,
- "learning_rate": 4.52500175489684e-06,
- "loss": 0.5070191025733948,
- "mean_token_accuracy": 0.8311188817024231,
- "num_tokens": 8634759.0,
- "step": 947
- },
- {
- "epoch": 0.7203647416413373,
- "grad_norm": 1.8432683944702148,
- "learning_rate": 4.523772830597942e-06,
- "loss": 0.5569252371788025,
- "mean_token_accuracy": 0.8070821762084961,
- "num_tokens": 8644160.0,
- "step": 948
- },
- {
- "epoch": 0.7211246200607903,
- "grad_norm": 2.8912241458892822,
- "learning_rate": 4.522542485937369e-06,
- "loss": 0.4799427390098572,
- "mean_token_accuracy": 0.8443552851676941,
- "num_tokens": 8648377.0,
- "step": 949
- },
- {
- "epoch": 0.7218844984802432,
- "grad_norm": 3.3449625968933105,
- "learning_rate": 4.521310721778622e-06,
- "loss": 0.44043463468551636,
- "mean_token_accuracy": 0.8521315455436707,
- "num_tokens": 8651846.0,
- "step": 950
- },
- {
- "epoch": 0.7226443768996961,
- "grad_norm": 1.4127917289733887,
- "learning_rate": 4.520077538986203e-06,
- "loss": 0.4700999855995178,
- "mean_token_accuracy": 0.8377952575683594,
- "num_tokens": 8665199.0,
- "step": 951
- },
- {
- "epoch": 0.723404255319149,
- "grad_norm": 2.1607301235198975,
- "learning_rate": 4.518842938425606e-06,
- "loss": 0.4374256730079651,
- "mean_token_accuracy": 0.8448896408081055,
- "num_tokens": 8672158.0,
- "step": 952
- },
- {
- "epoch": 0.7241641337386018,
- "grad_norm": 1.3442779779434204,
- "learning_rate": 4.51760692096332e-06,
- "loss": 0.38948923349380493,
- "mean_token_accuracy": 0.8598923683166504,
- "num_tokens": 8684532.0,
- "step": 953
- },
- {
- "epoch": 0.7249240121580547,
- "grad_norm": 2.0003178119659424,
- "learning_rate": 4.516369487466832e-06,
- "loss": 0.3797217011451721,
- "mean_token_accuracy": 0.8652102947235107,
- "num_tokens": 8691460.0,
- "step": 954
- },
- {
- "epoch": 0.7256838905775076,
- "grad_norm": 1.8196535110473633,
- "learning_rate": 4.5151306388046175e-06,
- "loss": 0.5676811933517456,
- "mean_token_accuracy": 0.818500816822052,
- "num_tokens": 8701624.0,
- "step": 955
- },
- {
- "epoch": 0.7264437689969605,
- "grad_norm": 2.1962296962738037,
- "learning_rate": 4.513890375846152e-06,
- "loss": 0.45399484038352966,
- "mean_token_accuracy": 0.8463879227638245,
- "num_tokens": 8707410.0,
- "step": 956
- },
- {
- "epoch": 0.7272036474164134,
- "grad_norm": 1.8798872232437134,
- "learning_rate": 4.512648699461897e-06,
- "loss": 0.5679811239242554,
- "mean_token_accuracy": 0.8089900016784668,
- "num_tokens": 8715630.0,
- "step": 957
- },
- {
- "epoch": 0.7279635258358662,
- "grad_norm": 2.3540258407592773,
- "learning_rate": 4.511405610523309e-06,
- "loss": 0.5282865762710571,
- "mean_token_accuracy": 0.8196114301681519,
- "num_tokens": 8721934.0,
- "step": 958
- },
- {
- "epoch": 0.7287234042553191,
- "grad_norm": 2.5630908012390137,
- "learning_rate": 4.510161109902837e-06,
- "loss": 0.39442378282546997,
- "mean_token_accuracy": 0.8400980830192566,
- "num_tokens": 8726511.0,
- "step": 959
- },
- {
- "epoch": 0.729483282674772,
- "grad_norm": 1.9829226732254028,
- "learning_rate": 4.508915198473919e-06,
- "loss": 0.4611976742744446,
- "mean_token_accuracy": 0.8439624309539795,
- "num_tokens": 8733460.0,
- "step": 960
- },
- {
- "epoch": 0.7302431610942249,
- "grad_norm": 3.0291950702667236,
- "learning_rate": 4.507667877110982e-06,
- "loss": 0.5158340930938721,
- "mean_token_accuracy": 0.8300060033798218,
- "num_tokens": 8737629.0,
- "step": 961
- },
- {
- "epoch": 0.7310030395136778,
- "grad_norm": 1.9208252429962158,
- "learning_rate": 4.506419146689445e-06,
- "loss": 0.3807099163532257,
- "mean_token_accuracy": 0.871469259262085,
- "num_tokens": 8744615.0,
- "step": 962
- },
- {
- "epoch": 0.7317629179331308,
- "grad_norm": 3.051565408706665,
- "learning_rate": 4.505169008085717e-06,
- "loss": 0.38461726903915405,
- "mean_token_accuracy": 0.874465823173523,
- "num_tokens": 8748154.0,
- "step": 963
- },
- {
- "epoch": 0.7325227963525835,
- "grad_norm": 1.375466227531433,
- "learning_rate": 4.503917462177192e-06,
- "loss": 0.42490679025650024,
- "mean_token_accuracy": 0.8457326889038086,
- "num_tokens": 8760965.0,
- "step": 964
- },
- {
- "epoch": 0.7332826747720365,
- "grad_norm": 2.216681957244873,
- "learning_rate": 4.5026645098422515e-06,
- "loss": 0.43149900436401367,
- "mean_token_accuracy": 0.8527278900146484,
- "num_tokens": 8766996.0,
- "step": 965
- },
- {
- "epoch": 0.7340425531914894,
- "grad_norm": 1.9422595500946045,
- "learning_rate": 4.5014101519602684e-06,
- "loss": 0.4964504539966583,
- "mean_token_accuracy": 0.8137556314468384,
- "num_tokens": 8774411.0,
- "step": 966
- },
- {
- "epoch": 0.7348024316109423,
- "grad_norm": 2.058887004852295,
- "learning_rate": 4.500154389411598e-06,
- "loss": 0.4977570176124573,
- "mean_token_accuracy": 0.8254626989364624,
- "num_tokens": 8782220.0,
- "step": 967
- },
- {
- "epoch": 0.7355623100303952,
- "grad_norm": 2.9977786540985107,
- "learning_rate": 4.498897223077582e-06,
- "loss": 0.4061415195465088,
- "mean_token_accuracy": 0.8752427101135254,
- "num_tokens": 8786120.0,
- "step": 968
- },
- {
- "epoch": 0.736322188449848,
- "grad_norm": 2.2636303901672363,
- "learning_rate": 4.49763865384055e-06,
- "loss": 0.5062161087989807,
- "mean_token_accuracy": 0.8171653747558594,
- "num_tokens": 8792459.0,
- "step": 969
- },
- {
- "epoch": 0.7370820668693009,
- "grad_norm": 1.8850842714309692,
- "learning_rate": 4.496378682583813e-06,
- "loss": 0.5014280676841736,
- "mean_token_accuracy": 0.8547511100769043,
- "num_tokens": 8800675.0,
- "step": 970
- },
- {
- "epoch": 0.7378419452887538,
- "grad_norm": 1.191985011100769,
- "learning_rate": 4.495117310191667e-06,
- "loss": 0.4713883101940155,
- "mean_token_accuracy": 0.8213596343994141,
- "num_tokens": 8820740.0,
- "step": 971
- },
- {
- "epoch": 0.7386018237082067,
- "grad_norm": 1.823000192642212,
- "learning_rate": 4.493854537549393e-06,
- "loss": 0.46332645416259766,
- "mean_token_accuracy": 0.8359860777854919,
- "num_tokens": 8828884.0,
- "step": 972
- },
- {
- "epoch": 0.7393617021276596,
- "grad_norm": 2.590446949005127,
- "learning_rate": 4.492590365543253e-06,
- "loss": 0.49074703454971313,
- "mean_token_accuracy": 0.8433758020401001,
- "num_tokens": 8833859.0,
- "step": 973
- },
- {
- "epoch": 0.7401215805471124,
- "grad_norm": 2.2762670516967773,
- "learning_rate": 4.491324795060491e-06,
- "loss": 0.39465656876564026,
- "mean_token_accuracy": 0.8734766244888306,
- "num_tokens": 8839350.0,
- "step": 974
- },
- {
- "epoch": 0.7408814589665653,
- "grad_norm": 2.698725461959839,
- "learning_rate": 4.490057826989333e-06,
- "loss": 0.5552085041999817,
- "mean_token_accuracy": 0.8132266998291016,
- "num_tokens": 8844373.0,
- "step": 975
- },
- {
- "epoch": 0.7416413373860182,
- "grad_norm": 2.704606294631958,
- "learning_rate": 4.488789462218988e-06,
- "loss": 0.3447791635990143,
- "mean_token_accuracy": 0.8736170530319214,
- "num_tokens": 8848236.0,
- "step": 976
- },
- {
- "epoch": 0.7424012158054711,
- "grad_norm": 3.1260716915130615,
- "learning_rate": 4.487519701639641e-06,
- "loss": 0.5945233702659607,
- "mean_token_accuracy": 0.7997599840164185,
- "num_tokens": 8852935.0,
- "step": 977
- },
- {
- "epoch": 0.743161094224924,
- "grad_norm": 1.6895452737808228,
- "learning_rate": 4.486248546142459e-06,
- "loss": 0.4823892116546631,
- "mean_token_accuracy": 0.8279662132263184,
- "num_tokens": 8861743.0,
- "step": 978
- },
- {
- "epoch": 0.743920972644377,
- "grad_norm": 1.9161452054977417,
- "learning_rate": 4.4849759966195885e-06,
- "loss": 0.5266581773757935,
- "mean_token_accuracy": 0.8218623399734497,
- "num_tokens": 8870601.0,
- "step": 979
- },
- {
- "epoch": 0.7446808510638298,
- "grad_norm": 1.6894301176071167,
- "learning_rate": 4.483702053964154e-06,
- "loss": 0.4186219573020935,
- "mean_token_accuracy": 0.8471781015396118,
- "num_tokens": 8885617.0,
- "step": 980
- },
- {
- "epoch": 0.7454407294832827,
- "grad_norm": 1.6319992542266846,
- "learning_rate": 4.482426719070258e-06,
- "loss": 0.541317880153656,
- "mean_token_accuracy": 0.8216162323951721,
- "num_tokens": 8897595.0,
- "step": 981
- },
- {
- "epoch": 0.7462006079027356,
- "grad_norm": 5.102413177490234,
- "learning_rate": 4.4811499928329775e-06,
- "loss": 0.3928517699241638,
- "mean_token_accuracy": 0.858033299446106,
- "num_tokens": 8901682.0,
- "step": 982
- },
- {
- "epoch": 0.7469604863221885,
- "grad_norm": 2.213860273361206,
- "learning_rate": 4.479871876148368e-06,
- "loss": 0.4276347756385803,
- "mean_token_accuracy": 0.8529798984527588,
- "num_tokens": 8908088.0,
- "step": 983
- },
- {
- "epoch": 0.7477203647416414,
- "grad_norm": 1.2180038690567017,
- "learning_rate": 4.478592369913464e-06,
- "loss": 0.3941590189933777,
- "mean_token_accuracy": 0.8608149290084839,
- "num_tokens": 8925876.0,
- "step": 984
- },
- {
- "epoch": 0.7484802431610942,
- "grad_norm": 2.849802255630493,
- "learning_rate": 4.477311475026271e-06,
- "loss": 0.42190325260162354,
- "mean_token_accuracy": 0.860505223274231,
- "num_tokens": 8930190.0,
- "step": 985
- },
- {
- "epoch": 0.7492401215805471,
- "grad_norm": 1.704128384590149,
- "learning_rate": 4.476029192385769e-06,
- "loss": 0.4786282777786255,
- "mean_token_accuracy": 0.8302322626113892,
- "num_tokens": 8938340.0,
- "step": 986
- },
- {
- "epoch": 0.75,
- "grad_norm": 2.06322979927063,
- "learning_rate": 4.474745522891915e-06,
- "loss": 0.4648786187171936,
- "mean_token_accuracy": 0.8366481065750122,
- "num_tokens": 8944633.0,
- "step": 987
- },
- {
- "epoch": 0.7507598784194529,
- "grad_norm": 2.0745396614074707,
- "learning_rate": 4.473460467445637e-06,
- "loss": 0.5744885206222534,
- "mean_token_accuracy": 0.8357284069061279,
- "num_tokens": 8954457.0,
- "step": 988
- },
- {
- "epoch": 0.7515197568389058,
- "grad_norm": 1.9281407594680786,
- "learning_rate": 4.472174026948836e-06,
- "loss": 0.528974175453186,
- "mean_token_accuracy": 0.8083580732345581,
- "num_tokens": 8962701.0,
- "step": 989
- },
- {
- "epoch": 0.7522796352583586,
- "grad_norm": 3.012381076812744,
- "learning_rate": 4.470886202304385e-06,
- "loss": 0.48754751682281494,
- "mean_token_accuracy": 0.8368391990661621,
- "num_tokens": 8967272.0,
- "step": 990
- },
- {
- "epoch": 0.7530395136778115,
- "grad_norm": 1.691826581954956,
- "learning_rate": 4.469596994416131e-06,
- "loss": 0.484740674495697,
- "mean_token_accuracy": 0.8500643968582153,
- "num_tokens": 8976615.0,
- "step": 991
- },
- {
- "epoch": 0.7537993920972644,
- "grad_norm": 2.4961965084075928,
- "learning_rate": 4.468306404188887e-06,
- "loss": 0.50777268409729,
- "mean_token_accuracy": 0.8168395757675171,
- "num_tokens": 8983235.0,
- "step": 992
- },
- {
- "epoch": 0.7545592705167173,
- "grad_norm": 1.512007713317871,
- "learning_rate": 4.467014432528441e-06,
- "loss": 0.4583340287208557,
- "mean_token_accuracy": 0.8465162515640259,
- "num_tokens": 8993815.0,
- "step": 993
- },
- {
- "epoch": 0.7553191489361702,
- "grad_norm": 1.9362257719039917,
- "learning_rate": 4.465721080341547e-06,
- "loss": 0.6027892827987671,
- "mean_token_accuracy": 0.8052380084991455,
- "num_tokens": 9002697.0,
- "step": 994
- },
- {
- "epoch": 0.756079027355623,
- "grad_norm": 2.473632335662842,
- "learning_rate": 4.4644263485359316e-06,
- "loss": 0.5394320487976074,
- "mean_token_accuracy": 0.834665834903717,
- "num_tokens": 9007428.0,
- "step": 995
- },
- {
- "epoch": 0.756838905775076,
- "grad_norm": 2.2527434825897217,
- "learning_rate": 4.463130238020284e-06,
- "loss": 0.5485198497772217,
- "mean_token_accuracy": 0.8090173006057739,
- "num_tokens": 9013570.0,
- "step": 996
- },
- {
- "epoch": 0.7575987841945289,
- "grad_norm": 1.4130940437316895,
- "learning_rate": 4.4618327497042676e-06,
- "loss": 0.37994423508644104,
- "mean_token_accuracy": 0.8625167012214661,
- "num_tokens": 9025485.0,
- "step": 997
- },
- {
- "epoch": 0.7583586626139818,
- "grad_norm": 2.685115098953247,
- "learning_rate": 4.460533884498509e-06,
- "loss": 0.447973370552063,
- "mean_token_accuracy": 0.8564165234565735,
- "num_tokens": 9030355.0,
- "step": 998
- },
- {
- "epoch": 0.7591185410334347,
- "grad_norm": 3.2743139266967773,
- "learning_rate": 4.4592336433146e-06,
- "loss": 0.45275989174842834,
- "mean_token_accuracy": 0.8462578058242798,
- "num_tokens": 9034406.0,
- "step": 999
- },
- {
- "epoch": 0.7598784194528876,
- "grad_norm": 1.9383049011230469,
- "learning_rate": 4.457932027065102e-06,
- "loss": 0.5387729406356812,
- "mean_token_accuracy": 0.8357330560684204,
- "num_tokens": 9041502.0,
- "step": 1000
- },
- {
- "epoch": 0.7606382978723404,
- "grad_norm": 2.7348275184631348,
- "learning_rate": 4.456629036663537e-06,
- "loss": 0.4448447823524475,
- "mean_token_accuracy": 0.8453642129898071,
- "num_tokens": 9046088.0,
- "step": 1001
- },
- {
- "epoch": 0.7613981762917933,
- "grad_norm": 1.8477401733398438,
- "learning_rate": 4.455324673024396e-06,
- "loss": 0.5766505002975464,
- "mean_token_accuracy": 0.8074213862419128,
- "num_tokens": 9055678.0,
- "step": 1002
- },
- {
- "epoch": 0.7621580547112462,
- "grad_norm": 3.134481430053711,
- "learning_rate": 4.4540189370631315e-06,
- "loss": 0.5690872669219971,
- "mean_token_accuracy": 0.8414670825004578,
- "num_tokens": 9062006.0,
- "step": 1003
- },
- {
- "epoch": 0.7629179331306991,
- "grad_norm": 1.7933398485183716,
- "learning_rate": 4.452711829696158e-06,
- "loss": 0.4898291826248169,
- "mean_token_accuracy": 0.8259007930755615,
- "num_tokens": 9070754.0,
- "step": 1004
- },
- {
- "epoch": 0.763677811550152,
- "grad_norm": 1.2552275657653809,
- "learning_rate": 4.451403351840855e-06,
- "loss": 0.4280198812484741,
- "mean_token_accuracy": 0.8409112691879272,
- "num_tokens": 9085306.0,
- "step": 1005
- },
- {
- "epoch": 0.7644376899696048,
- "grad_norm": 1.6749331951141357,
- "learning_rate": 4.450093504415562e-06,
- "loss": 0.3723178505897522,
- "mean_token_accuracy": 0.8545734882354736,
- "num_tokens": 9102453.0,
- "step": 1006
- },
- {
- "epoch": 0.7651975683890577,
- "grad_norm": 2.7514500617980957,
- "learning_rate": 4.44878228833958e-06,
- "loss": 0.5463190674781799,
- "mean_token_accuracy": 0.8121639490127563,
- "num_tokens": 9108342.0,
- "step": 1007
- },
- {
- "epoch": 0.7659574468085106,
- "grad_norm": 1.3322733640670776,
- "learning_rate": 4.447469704533172e-06,
- "loss": 0.573723316192627,
- "mean_token_accuracy": 0.8065711259841919,
- "num_tokens": 9123712.0,
- "step": 1008
- },
- {
- "epoch": 0.7667173252279635,
- "grad_norm": 2.6893765926361084,
- "learning_rate": 4.446155753917559e-06,
- "loss": 0.6856257915496826,
- "mean_token_accuracy": 0.7718256711959839,
- "num_tokens": 9130728.0,
- "step": 1009
- },
- {
- "epoch": 0.7674772036474165,
- "grad_norm": 1.792765498161316,
- "learning_rate": 4.444840437414923e-06,
- "loss": 0.48203110694885254,
- "mean_token_accuracy": 0.8419194221496582,
- "num_tokens": 9137983.0,
- "step": 1010
- },
- {
- "epoch": 0.7682370820668692,
- "grad_norm": 1.4957399368286133,
- "learning_rate": 4.443523755948401e-06,
- "loss": 0.4372181296348572,
- "mean_token_accuracy": 0.8491764664649963,
- "num_tokens": 9148081.0,
- "step": 1011
- },
- {
- "epoch": 0.7689969604863222,
- "grad_norm": 1.7294867038726807,
- "learning_rate": 4.442205710442095e-06,
- "loss": 0.54277503490448,
- "mean_token_accuracy": 0.8196806907653809,
- "num_tokens": 9158407.0,
- "step": 1012
- },
- {
- "epoch": 0.7697568389057751,
- "grad_norm": 2.2091221809387207,
- "learning_rate": 4.4408863018210564e-06,
- "loss": 0.4888187646865845,
- "mean_token_accuracy": 0.8384175300598145,
- "num_tokens": 9164754.0,
- "step": 1013
- },
- {
- "epoch": 0.770516717325228,
- "grad_norm": 1.7615830898284912,
- "learning_rate": 4.439565531011299e-06,
- "loss": 0.4640008211135864,
- "mean_token_accuracy": 0.8424701690673828,
- "num_tokens": 9172715.0,
- "step": 1014
- },
- {
- "epoch": 0.7712765957446809,
- "grad_norm": 1.6796128749847412,
- "learning_rate": 4.43824339893979e-06,
- "loss": 0.5227609276771545,
- "mean_token_accuracy": 0.8135923743247986,
- "num_tokens": 9183214.0,
- "step": 1015
- },
- {
- "epoch": 0.7720364741641338,
- "grad_norm": 2.1485698223114014,
- "learning_rate": 4.436919906534452e-06,
- "loss": 0.4857056140899658,
- "mean_token_accuracy": 0.8323013782501221,
- "num_tokens": 9190360.0,
- "step": 1016
- },
- {
- "epoch": 0.7727963525835866,
- "grad_norm": 2.7842206954956055,
- "learning_rate": 4.4355950547241645e-06,
- "loss": 0.46406883001327515,
- "mean_token_accuracy": 0.859869122505188,
- "num_tokens": 9194523.0,
- "step": 1017
- },
- {
- "epoch": 0.7735562310030395,
- "grad_norm": 2.3774640560150146,
- "learning_rate": 4.434268844438758e-06,
- "loss": 0.5625549554824829,
- "mean_token_accuracy": 0.8188897371292114,
- "num_tokens": 9201155.0,
- "step": 1018
- },
- {
- "epoch": 0.7743161094224924,
- "grad_norm": 2.004427909851074,
- "learning_rate": 4.432941276609018e-06,
- "loss": 0.5164387226104736,
- "mean_token_accuracy": 0.829569935798645,
- "num_tokens": 9209269.0,
- "step": 1019
- },
- {
- "epoch": 0.7750759878419453,
- "grad_norm": 1.7218989133834839,
- "learning_rate": 4.431612352166684e-06,
- "loss": 0.481005996465683,
- "mean_token_accuracy": 0.8359906673431396,
- "num_tokens": 9220860.0,
- "step": 1020
- },
- {
- "epoch": 0.7758358662613982,
- "grad_norm": 2.197108507156372,
- "learning_rate": 4.4302820720444454e-06,
- "loss": 0.440413236618042,
- "mean_token_accuracy": 0.8412867784500122,
- "num_tokens": 9226414.0,
- "step": 1021
- },
- {
- "epoch": 0.776595744680851,
- "grad_norm": 2.6995162963867188,
- "learning_rate": 4.428950437175944e-06,
- "loss": 0.3884299397468567,
- "mean_token_accuracy": 0.8696021437644958,
- "num_tokens": 9230898.0,
- "step": 1022
- },
- {
- "epoch": 0.7773556231003039,
- "grad_norm": 2.1671667098999023,
- "learning_rate": 4.427617448495772e-06,
- "loss": 0.5747478008270264,
- "mean_token_accuracy": 0.7842930555343628,
- "num_tokens": 9238479.0,
- "step": 1023
- },
- {
- "epoch": 0.7781155015197568,
- "grad_norm": 1.6299028396606445,
- "learning_rate": 4.426283106939474e-06,
- "loss": 0.39478403329849243,
- "mean_token_accuracy": 0.8685503602027893,
- "num_tokens": 9248263.0,
- "step": 1024
- },
- {
- "epoch": 0.7788753799392097,
- "grad_norm": 2.2621798515319824,
- "learning_rate": 4.424947413443539e-06,
- "loss": 0.4582178592681885,
- "mean_token_accuracy": 0.8312377333641052,
- "num_tokens": 9254168.0,
- "step": 1025
- },
- {
- "epoch": 0.7796352583586627,
- "grad_norm": 2.121091365814209,
- "learning_rate": 4.423610368945411e-06,
- "loss": 0.5315121412277222,
- "mean_token_accuracy": 0.8121483325958252,
- "num_tokens": 9261808.0,
- "step": 1026
- },
- {
- "epoch": 0.7803951367781155,
- "grad_norm": 1.8558297157287598,
- "learning_rate": 4.422271974383479e-06,
- "loss": 0.4299176037311554,
- "mean_token_accuracy": 0.8452648520469666,
- "num_tokens": 9269264.0,
- "step": 1027
- },
- {
- "epoch": 0.7811550151975684,
- "grad_norm": 1.9089949131011963,
- "learning_rate": 4.420932230697079e-06,
- "loss": 0.43876272439956665,
- "mean_token_accuracy": 0.8434094190597534,
- "num_tokens": 9277381.0,
- "step": 1028
- },
- {
- "epoch": 0.7819148936170213,
- "grad_norm": 1.8619649410247803,
- "learning_rate": 4.419591138826495e-06,
- "loss": 0.48798668384552,
- "mean_token_accuracy": 0.8281317353248596,
- "num_tokens": 9285413.0,
- "step": 1029
- },
- {
- "epoch": 0.7826747720364742,
- "grad_norm": 1.3273087739944458,
- "learning_rate": 4.418248699712955e-06,
- "loss": 0.4611460864543915,
- "mean_token_accuracy": 0.8233213424682617,
- "num_tokens": 9300805.0,
- "step": 1030
- },
- {
- "epoch": 0.7834346504559271,
- "grad_norm": 1.0473746061325073,
- "learning_rate": 4.416904914298637e-06,
- "loss": 0.36537665128707886,
- "mean_token_accuracy": 0.8671857118606567,
- "num_tokens": 9320035.0,
- "step": 1031
- },
- {
- "epoch": 0.78419452887538,
- "grad_norm": 1.9130918979644775,
- "learning_rate": 4.415559783526661e-06,
- "loss": 0.4916655123233795,
- "mean_token_accuracy": 0.8266351222991943,
- "num_tokens": 9326795.0,
- "step": 1032
- },
- {
- "epoch": 0.7849544072948328,
- "grad_norm": 2.0001816749572754,
- "learning_rate": 4.414213308341092e-06,
- "loss": 0.5711008310317993,
- "mean_token_accuracy": 0.8093076348304749,
- "num_tokens": 9335625.0,
- "step": 1033
- },
- {
- "epoch": 0.7857142857142857,
- "grad_norm": 3.933542251586914,
- "learning_rate": 4.412865489686936e-06,
- "loss": 0.621616542339325,
- "mean_token_accuracy": 0.7938898801803589,
- "num_tokens": 9339080.0,
- "step": 1034
- },
- {
- "epoch": 0.7864741641337386,
- "grad_norm": 2.061558961868286,
- "learning_rate": 4.411516328510145e-06,
- "loss": 0.583686113357544,
- "mean_token_accuracy": 0.8216883540153503,
- "num_tokens": 9348581.0,
- "step": 1035
- },
- {
- "epoch": 0.7872340425531915,
- "grad_norm": 1.9401264190673828,
- "learning_rate": 4.410165825757613e-06,
- "loss": 0.4905240535736084,
- "mean_token_accuracy": 0.8229951858520508,
- "num_tokens": 9356032.0,
- "step": 1036
- },
- {
- "epoch": 0.7879939209726444,
- "grad_norm": 3.620547294616699,
- "learning_rate": 4.408813982377175e-06,
- "loss": 0.4269888997077942,
- "mean_token_accuracy": 0.8713940978050232,
- "num_tokens": 9359061.0,
- "step": 1037
- },
- {
- "epoch": 0.7887537993920972,
- "grad_norm": 1.2027851343154907,
- "learning_rate": 4.407460799317605e-06,
- "loss": 0.39972418546676636,
- "mean_token_accuracy": 0.8610097765922546,
- "num_tokens": 9377068.0,
- "step": 1038
- },
- {
- "epoch": 0.7895136778115501,
- "grad_norm": 2.566753387451172,
- "learning_rate": 4.40610627752862e-06,
- "loss": 0.45267152786254883,
- "mean_token_accuracy": 0.83243328332901,
- "num_tokens": 9383604.0,
- "step": 1039
- },
- {
- "epoch": 0.790273556231003,
- "grad_norm": 2.940094470977783,
- "learning_rate": 4.404750417960876e-06,
- "loss": 0.42862242460250854,
- "mean_token_accuracy": 0.8582849502563477,
- "num_tokens": 9387541.0,
- "step": 1040
- },
- {
- "epoch": 0.791033434650456,
- "grad_norm": 2.0223944187164307,
- "learning_rate": 4.403393221565966e-06,
- "loss": 0.4349963665008545,
- "mean_token_accuracy": 0.8453047871589661,
- "num_tokens": 9394382.0,
- "step": 1041
- },
- {
- "epoch": 0.7917933130699089,
- "grad_norm": 2.9399030208587646,
- "learning_rate": 4.402034689296425e-06,
- "loss": 0.32197174429893494,
- "mean_token_accuracy": 0.8953392505645752,
- "num_tokens": 9397741.0,
- "step": 1042
- },
- {
- "epoch": 0.7925531914893617,
- "grad_norm": 2.819016456604004,
- "learning_rate": 4.400674822105721e-06,
- "loss": 0.6790289878845215,
- "mean_token_accuracy": 0.8135063648223877,
- "num_tokens": 9403509.0,
- "step": 1043
- },
- {
- "epoch": 0.7933130699088146,
- "grad_norm": 1.3225977420806885,
- "learning_rate": 4.399313620948262e-06,
- "loss": 0.42203834652900696,
- "mean_token_accuracy": 0.8399381637573242,
- "num_tokens": 9418870.0,
- "step": 1044
- },
- {
- "epoch": 0.7940729483282675,
- "grad_norm": 1.7822176218032837,
- "learning_rate": 4.397951086779392e-06,
- "loss": 0.4666554927825928,
- "mean_token_accuracy": 0.8364764451980591,
- "num_tokens": 9427640.0,
- "step": 1045
- },
- {
- "epoch": 0.7948328267477204,
- "grad_norm": 3.186439037322998,
- "learning_rate": 4.396587220555389e-06,
- "loss": 0.6048363447189331,
- "mean_token_accuracy": 0.7806557416915894,
- "num_tokens": 9431927.0,
- "step": 1046
- },
- {
- "epoch": 0.7955927051671733,
- "grad_norm": 3.0804805755615234,
- "learning_rate": 4.395222023233467e-06,
- "loss": 0.445969820022583,
- "mean_token_accuracy": 0.850671112537384,
- "num_tokens": 9436136.0,
- "step": 1047
- },
- {
- "epoch": 0.7963525835866262,
- "grad_norm": 1.675968885421753,
- "learning_rate": 4.393855495771774e-06,
- "loss": 0.4311422109603882,
- "mean_token_accuracy": 0.8449079990386963,
- "num_tokens": 9445189.0,
- "step": 1048
- },
- {
- "epoch": 0.797112462006079,
- "grad_norm": 2.342410087585449,
- "learning_rate": 4.3924876391293915e-06,
- "loss": 0.5733606219291687,
- "mean_token_accuracy": 0.8156592845916748,
- "num_tokens": 9451939.0,
- "step": 1049
- },
- {
- "epoch": 0.7978723404255319,
- "grad_norm": 1.5967470407485962,
- "learning_rate": 4.391118454266335e-06,
- "loss": 0.46664729714393616,
- "mean_token_accuracy": 0.8091695308685303,
- "num_tokens": 9463968.0,
- "step": 1050
- },
- {
- "epoch": 0.7986322188449848,
- "grad_norm": 1.5777863264083862,
- "learning_rate": 4.389747942143549e-06,
- "loss": 0.46028903126716614,
- "mean_token_accuracy": 0.8347330093383789,
- "num_tokens": 9475561.0,
- "step": 1051
- },
- {
- "epoch": 0.7993920972644377,
- "grad_norm": 2.7630488872528076,
- "learning_rate": 4.388376103722914e-06,
- "loss": 0.5618188977241516,
- "mean_token_accuracy": 0.8273467421531677,
- "num_tokens": 9480661.0,
- "step": 1052
- },
- {
- "epoch": 0.8001519756838906,
- "grad_norm": 2.093397378921509,
- "learning_rate": 4.387002939967237e-06,
- "loss": 0.2998353838920593,
- "mean_token_accuracy": 0.8905231952667236,
- "num_tokens": 9485924.0,
- "step": 1053
- },
- {
- "epoch": 0.8009118541033434,
- "grad_norm": 1.4385871887207031,
- "learning_rate": 4.38562845184026e-06,
- "loss": 0.4944111704826355,
- "mean_token_accuracy": 0.8403056263923645,
- "num_tokens": 9500128.0,
- "step": 1054
- },
- {
- "epoch": 0.8016717325227963,
- "grad_norm": 1.6393156051635742,
- "learning_rate": 4.384252640306649e-06,
- "loss": 0.5727907419204712,
- "mean_token_accuracy": 0.7849414348602295,
- "num_tokens": 9511569.0,
- "step": 1055
- },
- {
- "epoch": 0.8024316109422492,
- "grad_norm": 2.3909664154052734,
- "learning_rate": 4.382875506332002e-06,
- "loss": 0.4760419726371765,
- "mean_token_accuracy": 0.8408266305923462,
- "num_tokens": 9517244.0,
- "step": 1056
- },
- {
- "epoch": 0.8031914893617021,
- "grad_norm": 1.7288594245910645,
- "learning_rate": 4.381497050882845e-06,
- "loss": 0.5375926494598389,
- "mean_token_accuracy": 0.8138614892959595,
- "num_tokens": 9528736.0,
- "step": 1057
- },
- {
- "epoch": 0.8039513677811551,
- "grad_norm": 2.093407392501831,
- "learning_rate": 4.380117274926632e-06,
- "loss": 0.46659404039382935,
- "mean_token_accuracy": 0.8450702428817749,
- "num_tokens": 9536200.0,
- "step": 1058
- },
- {
- "epoch": 0.8047112462006079,
- "grad_norm": 1.6835898160934448,
- "learning_rate": 4.3787361794317405e-06,
- "loss": 0.43157699704170227,
- "mean_token_accuracy": 0.8279973268508911,
- "num_tokens": 9546314.0,
- "step": 1059
- },
- {
- "epoch": 0.8054711246200608,
- "grad_norm": 1.983067512512207,
- "learning_rate": 4.377353765367479e-06,
- "loss": 0.5021739602088928,
- "mean_token_accuracy": 0.8274815082550049,
- "num_tokens": 9554375.0,
- "step": 1060
- },
- {
- "epoch": 0.8062310030395137,
- "grad_norm": 2.0472030639648438,
- "learning_rate": 4.375970033704078e-06,
- "loss": 0.34298190474510193,
- "mean_token_accuracy": 0.8900876045227051,
- "num_tokens": 9560230.0,
- "step": 1061
- },
- {
- "epoch": 0.8069908814589666,
- "grad_norm": 1.9613717794418335,
- "learning_rate": 4.374584985412692e-06,
- "loss": 0.3826758861541748,
- "mean_token_accuracy": 0.839923620223999,
- "num_tokens": 9566809.0,
- "step": 1062
- },
- {
- "epoch": 0.8077507598784195,
- "grad_norm": 1.991289496421814,
- "learning_rate": 4.373198621465405e-06,
- "loss": 0.5492525100708008,
- "mean_token_accuracy": 0.8153272867202759,
- "num_tokens": 9576810.0,
- "step": 1063
- },
- {
- "epoch": 0.8085106382978723,
- "grad_norm": 2.421370506286621,
- "learning_rate": 4.3718109428352155e-06,
- "loss": 0.5240297317504883,
- "mean_token_accuracy": 0.8087242245674133,
- "num_tokens": 9582906.0,
- "step": 1064
- },
- {
- "epoch": 0.8092705167173252,
- "grad_norm": 3.697765588760376,
- "learning_rate": 4.370421950496055e-06,
- "loss": 0.6096476912498474,
- "mean_token_accuracy": 0.787585973739624,
- "num_tokens": 9586920.0,
- "step": 1065
- },
- {
- "epoch": 0.8100303951367781,
- "grad_norm": 2.0767786502838135,
- "learning_rate": 4.369031645422768e-06,
- "loss": 0.41120079159736633,
- "mean_token_accuracy": 0.8513731956481934,
- "num_tokens": 9593902.0,
- "step": 1066
- },
- {
- "epoch": 0.810790273556231,
- "grad_norm": 2.5968732833862305,
- "learning_rate": 4.367640028591126e-06,
- "loss": 0.3364982008934021,
- "mean_token_accuracy": 0.8786963224411011,
- "num_tokens": 9597745.0,
- "step": 1067
- },
- {
- "epoch": 0.8115501519756839,
- "grad_norm": 2.165742874145508,
- "learning_rate": 4.366247100977818e-06,
- "loss": 0.406129390001297,
- "mean_token_accuracy": 0.868243932723999,
- "num_tokens": 9603496.0,
- "step": 1068
- },
- {
- "epoch": 0.8123100303951368,
- "grad_norm": 2.0493404865264893,
- "learning_rate": 4.364852863560456e-06,
- "loss": 0.5356296300888062,
- "mean_token_accuracy": 0.8191947340965271,
- "num_tokens": 9610898.0,
- "step": 1069
- },
- {
- "epoch": 0.8130699088145896,
- "grad_norm": 2.3224308490753174,
- "learning_rate": 4.363457317317568e-06,
- "loss": 0.41461923718452454,
- "mean_token_accuracy": 0.8537945747375488,
- "num_tokens": 9616626.0,
- "step": 1070
- },
- {
- "epoch": 0.8138297872340425,
- "grad_norm": 1.7387986183166504,
- "learning_rate": 4.362060463228603e-06,
- "loss": 0.5134786367416382,
- "mean_token_accuracy": 0.8511737585067749,
- "num_tokens": 9626223.0,
- "step": 1071
- },
- {
- "epoch": 0.8145896656534954,
- "grad_norm": 3.0270655155181885,
- "learning_rate": 4.360662302273926e-06,
- "loss": 0.3410695791244507,
- "mean_token_accuracy": 0.8746449947357178,
- "num_tokens": 9629455.0,
- "step": 1072
- },
- {
- "epoch": 0.8153495440729484,
- "grad_norm": 1.7727062702178955,
- "learning_rate": 4.35926283543482e-06,
- "loss": 0.4610968828201294,
- "mean_token_accuracy": 0.8444793224334717,
- "num_tokens": 9638070.0,
- "step": 1073
- },
- {
- "epoch": 0.8161094224924013,
- "grad_norm": 3.6333565711975098,
- "learning_rate": 4.357862063693486e-06,
- "loss": 0.3881273865699768,
- "mean_token_accuracy": 0.8757344484329224,
- "num_tokens": 9641028.0,
- "step": 1074
- },
- {
- "epoch": 0.8168693009118541,
- "grad_norm": 3.024042844772339,
- "learning_rate": 4.356459988033039e-06,
- "loss": 0.3853808641433716,
- "mean_token_accuracy": 0.8602254390716553,
- "num_tokens": 9645730.0,
- "step": 1075
- },
- {
- "epoch": 0.817629179331307,
- "grad_norm": 2.3359482288360596,
- "learning_rate": 4.355056609437509e-06,
- "loss": 0.4852045476436615,
- "mean_token_accuracy": 0.8502728343009949,
- "num_tokens": 9650975.0,
- "step": 1076
- },
- {
- "epoch": 0.8183890577507599,
- "grad_norm": 2.2390685081481934,
- "learning_rate": 4.353651928891842e-06,
- "loss": 0.5287341475486755,
- "mean_token_accuracy": 0.8247801065444946,
- "num_tokens": 9657471.0,
- "step": 1077
- },
- {
- "epoch": 0.8191489361702128,
- "grad_norm": 2.3809144496917725,
- "learning_rate": 4.352245947381897e-06,
- "loss": 0.5218510627746582,
- "mean_token_accuracy": 0.8149170875549316,
- "num_tokens": 9664108.0,
- "step": 1078
- },
- {
- "epoch": 0.8199088145896657,
- "grad_norm": 1.7072309255599976,
- "learning_rate": 4.3508386658944455e-06,
- "loss": 0.46481168270111084,
- "mean_token_accuracy": 0.834963321685791,
- "num_tokens": 9673175.0,
- "step": 1079
- },
- {
- "epoch": 0.8206686930091185,
- "grad_norm": 1.7383702993392944,
- "learning_rate": 4.349430085417171e-06,
- "loss": 0.4505952000617981,
- "mean_token_accuracy": 0.8507769107818604,
- "num_tokens": 9682800.0,
- "step": 1080
- },
- {
- "epoch": 0.8214285714285714,
- "grad_norm": 2.4308547973632812,
- "learning_rate": 4.348020206938672e-06,
- "loss": 0.4832455515861511,
- "mean_token_accuracy": 0.8538393974304199,
- "num_tokens": 9688123.0,
- "step": 1081
- },
- {
- "epoch": 0.8221884498480243,
- "grad_norm": 2.2686192989349365,
- "learning_rate": 4.3466090314484526e-06,
- "loss": 0.5112563371658325,
- "mean_token_accuracy": 0.8308460712432861,
- "num_tokens": 9694299.0,
- "step": 1082
- },
- {
- "epoch": 0.8229483282674772,
- "grad_norm": 2.806093454360962,
- "learning_rate": 4.345196559936931e-06,
- "loss": 0.4818246364593506,
- "mean_token_accuracy": 0.86617112159729,
- "num_tokens": 9698471.0,
- "step": 1083
- },
- {
- "epoch": 0.8237082066869301,
- "grad_norm": 1.7340706586837769,
- "learning_rate": 4.343782793395435e-06,
- "loss": 0.38246971368789673,
- "mean_token_accuracy": 0.8675198554992676,
- "num_tokens": 9706444.0,
- "step": 1084
- },
- {
- "epoch": 0.824468085106383,
- "grad_norm": 1.664942741394043,
- "learning_rate": 4.3423677328162e-06,
- "loss": 0.498797208070755,
- "mean_token_accuracy": 0.8447319865226746,
- "num_tokens": 9716765.0,
- "step": 1085
- },
- {
- "epoch": 0.8252279635258358,
- "grad_norm": 1.3608235120773315,
- "learning_rate": 4.340951379192369e-06,
- "loss": 0.41961491107940674,
- "mean_token_accuracy": 0.8339346647262573,
- "num_tokens": 9729564.0,
- "step": 1086
- },
- {
- "epoch": 0.8259878419452887,
- "grad_norm": 1.642503261566162,
- "learning_rate": 4.3395337335179945e-06,
- "loss": 0.5477945804595947,
- "mean_token_accuracy": 0.8117889761924744,
- "num_tokens": 9741217.0,
- "step": 1087
- },
- {
- "epoch": 0.8267477203647416,
- "grad_norm": 3.0345044136047363,
- "learning_rate": 4.338114796788035e-06,
- "loss": 0.5024623870849609,
- "mean_token_accuracy": 0.8333141207695007,
- "num_tokens": 9744941.0,
- "step": 1088
- },
- {
- "epoch": 0.8275075987841946,
- "grad_norm": 1.3096630573272705,
- "learning_rate": 4.336694569998354e-06,
- "loss": 0.44169723987579346,
- "mean_token_accuracy": 0.859926700592041,
- "num_tokens": 9757854.0,
- "step": 1089
- },
- {
- "epoch": 0.8282674772036475,
- "grad_norm": 2.203279495239258,
- "learning_rate": 4.3352730541457215e-06,
- "loss": 0.5283265113830566,
- "mean_token_accuracy": 0.8053759932518005,
- "num_tokens": 9764096.0,
- "step": 1090
- },
- {
- "epoch": 0.8290273556231003,
- "grad_norm": 1.3774312734603882,
- "learning_rate": 4.333850250227814e-06,
- "loss": 0.4584103226661682,
- "mean_token_accuracy": 0.8342611193656921,
- "num_tokens": 9777768.0,
- "step": 1091
- },
- {
- "epoch": 0.8297872340425532,
- "grad_norm": 1.822637915611267,
- "learning_rate": 4.332426159243206e-06,
- "loss": 0.5432791709899902,
- "mean_token_accuracy": 0.8136210441589355,
- "num_tokens": 9791276.0,
- "step": 1092
- },
- {
- "epoch": 0.8305471124620061,
- "grad_norm": 3.0190067291259766,
- "learning_rate": 4.331000782191384e-06,
- "loss": 0.5018150806427002,
- "mean_token_accuracy": 0.8234807252883911,
- "num_tokens": 9794902.0,
- "step": 1093
- },
- {
- "epoch": 0.831306990881459,
- "grad_norm": 2.09987735748291,
- "learning_rate": 4.329574120072728e-06,
- "loss": 0.4270891547203064,
- "mean_token_accuracy": 0.8544977903366089,
- "num_tokens": 9800903.0,
- "step": 1094
- },
- {
- "epoch": 0.8320668693009119,
- "grad_norm": 1.969549536705017,
- "learning_rate": 4.328146173888528e-06,
- "loss": 0.45801427960395813,
- "mean_token_accuracy": 0.8334714770317078,
- "num_tokens": 9808719.0,
- "step": 1095
- },
- {
- "epoch": 0.8328267477203647,
- "grad_norm": 1.4565571546554565,
- "learning_rate": 4.32671694464097e-06,
- "loss": 0.34864288568496704,
- "mean_token_accuracy": 0.8689061999320984,
- "num_tokens": 9818262.0,
- "step": 1096
- },
- {
- "epoch": 0.8335866261398176,
- "grad_norm": 1.2163832187652588,
- "learning_rate": 4.3252864333331424e-06,
- "loss": 0.37953704595565796,
- "mean_token_accuracy": 0.866554856300354,
- "num_tokens": 9833942.0,
- "step": 1097
- },
- {
- "epoch": 0.8343465045592705,
- "grad_norm": 1.6112010478973389,
- "learning_rate": 4.323854640969033e-06,
- "loss": 0.5442801713943481,
- "mean_token_accuracy": 0.8190416097640991,
- "num_tokens": 9844765.0,
- "step": 1098
- },
- {
- "epoch": 0.8351063829787234,
- "grad_norm": 1.8190315961837769,
- "learning_rate": 4.322421568553529e-06,
- "loss": 0.48271381855010986,
- "mean_token_accuracy": 0.8203652501106262,
- "num_tokens": 9852625.0,
- "step": 1099
- },
- {
- "epoch": 0.8358662613981763,
- "grad_norm": 2.7897756099700928,
- "learning_rate": 4.320987217092416e-06,
- "loss": 0.4086323380470276,
- "mean_token_accuracy": 0.8504934310913086,
- "num_tokens": 9856888.0,
- "step": 1100
- },
- {
- "epoch": 0.8366261398176292,
- "grad_norm": 1.7035977840423584,
- "learning_rate": 4.319551587592377e-06,
- "loss": 0.6325064301490784,
- "mean_token_accuracy": 0.788190484046936,
- "num_tokens": 9869419.0,
- "step": 1101
- },
- {
- "epoch": 0.837386018237082,
- "grad_norm": 2.609731912612915,
- "learning_rate": 4.318114681060989e-06,
- "loss": 0.519314706325531,
- "mean_token_accuracy": 0.8469992280006409,
- "num_tokens": 9874553.0,
- "step": 1102
- },
- {
- "epoch": 0.8381458966565349,
- "grad_norm": 1.2519766092300415,
- "learning_rate": 4.316676498506735e-06,
- "loss": 0.3566005825996399,
- "mean_token_accuracy": 0.8588439226150513,
- "num_tokens": 9886498.0,
- "step": 1103
- },
- {
- "epoch": 0.8389057750759878,
- "grad_norm": 1.430892825126648,
- "learning_rate": 4.3152370409389795e-06,
- "loss": 0.5250182747840881,
- "mean_token_accuracy": 0.8164948225021362,
- "num_tokens": 9900256.0,
- "step": 1104
- },
- {
- "epoch": 0.8396656534954408,
- "grad_norm": 3.1245436668395996,
- "learning_rate": 4.3137963093679945e-06,
- "loss": 0.3173971176147461,
- "mean_token_accuracy": 0.8835347890853882,
- "num_tokens": 9903899.0,
- "step": 1105
- },
- {
- "epoch": 0.8404255319148937,
- "grad_norm": 3.131812572479248,
- "learning_rate": 4.3123543048049395e-06,
- "loss": 0.6567763090133667,
- "mean_token_accuracy": 0.8233605027198792,
- "num_tokens": 9908798.0,
- "step": 1106
- },
- {
- "epoch": 0.8411854103343465,
- "grad_norm": 1.3551725149154663,
- "learning_rate": 4.310911028261867e-06,
- "loss": 0.3993729054927826,
- "mean_token_accuracy": 0.8529655933380127,
- "num_tokens": 9922577.0,
- "step": 1107
- },
- {
- "epoch": 0.8419452887537994,
- "grad_norm": 2.572533130645752,
- "learning_rate": 4.309466480751726e-06,
- "loss": 0.40906503796577454,
- "mean_token_accuracy": 0.8630726933479309,
- "num_tokens": 9926890.0,
- "step": 1108
- },
- {
- "epoch": 0.8427051671732523,
- "grad_norm": 1.9146469831466675,
- "learning_rate": 4.308020663288356e-06,
- "loss": 0.48423194885253906,
- "mean_token_accuracy": 0.8370280861854553,
- "num_tokens": 9934293.0,
- "step": 1109
- },
- {
- "epoch": 0.8434650455927052,
- "grad_norm": 1.6178001165390015,
- "learning_rate": 4.306573576886485e-06,
- "loss": 0.4262213408946991,
- "mean_token_accuracy": 0.839401125907898,
- "num_tokens": 9944513.0,
- "step": 1110
- },
- {
- "epoch": 0.8442249240121581,
- "grad_norm": 2.4444572925567627,
- "learning_rate": 4.305125222561736e-06,
- "loss": 0.5199950933456421,
- "mean_token_accuracy": 0.8507720232009888,
- "num_tokens": 9949512.0,
- "step": 1111
- },
- {
- "epoch": 0.8449848024316109,
- "grad_norm": 1.7983134984970093,
- "learning_rate": 4.303675601330618e-06,
- "loss": 0.36155956983566284,
- "mean_token_accuracy": 0.8568712472915649,
- "num_tokens": 9956402.0,
- "step": 1112
- },
- {
- "epoch": 0.8457446808510638,
- "grad_norm": 2.391096353530884,
- "learning_rate": 4.302224714210532e-06,
- "loss": 0.5391949415206909,
- "mean_token_accuracy": 0.8183057308197021,
- "num_tokens": 9961606.0,
- "step": 1113
- },
- {
- "epoch": 0.8465045592705167,
- "grad_norm": 1.8520214557647705,
- "learning_rate": 4.3007725622197675e-06,
- "loss": 0.5758882761001587,
- "mean_token_accuracy": 0.7924330234527588,
- "num_tokens": 9971473.0,
- "step": 1114
- },
- {
- "epoch": 0.8472644376899696,
- "grad_norm": 2.436640739440918,
- "learning_rate": 4.2993191463775e-06,
- "loss": 0.3837985396385193,
- "mean_token_accuracy": 0.8620110750198364,
- "num_tokens": 9976333.0,
- "step": 1115
- },
- {
- "epoch": 0.8480243161094225,
- "grad_norm": 1.7287120819091797,
- "learning_rate": 4.29786446770379e-06,
- "loss": 0.40066856145858765,
- "mean_token_accuracy": 0.8618333339691162,
- "num_tokens": 9985617.0,
- "step": 1116
- },
- {
- "epoch": 0.8487841945288754,
- "grad_norm": 2.0310518741607666,
- "learning_rate": 4.296408527219592e-06,
- "loss": 0.5465943217277527,
- "mean_token_accuracy": 0.812044620513916,
- "num_tokens": 9995363.0,
- "step": 1117
- },
- {
- "epoch": 0.8495440729483282,
- "grad_norm": 1.4858589172363281,
- "learning_rate": 4.294951325946737e-06,
- "loss": 0.45840176939964294,
- "mean_token_accuracy": 0.8432979583740234,
- "num_tokens": 10006400.0,
- "step": 1118
- },
- {
- "epoch": 0.8503039513677811,
- "grad_norm": 1.6153514385223389,
- "learning_rate": 4.293492864907947e-06,
- "loss": 0.5225611925125122,
- "mean_token_accuracy": 0.8180211186408997,
- "num_tokens": 10018352.0,
- "step": 1119
- },
- {
- "epoch": 0.851063829787234,
- "grad_norm": 2.1178412437438965,
- "learning_rate": 4.2920331451268246e-06,
- "loss": 0.5580621361732483,
- "mean_token_accuracy": 0.8211709260940552,
- "num_tokens": 10025614.0,
- "step": 1120
- },
- {
- "epoch": 0.851823708206687,
- "grad_norm": 2.036839246749878,
- "learning_rate": 4.2905721676278585e-06,
- "loss": 0.4658433198928833,
- "mean_token_accuracy": 0.8380423784255981,
- "num_tokens": 10032489.0,
- "step": 1121
- },
- {
- "epoch": 0.8525835866261399,
- "grad_norm": 2.0056262016296387,
- "learning_rate": 4.28910993343642e-06,
- "loss": 0.47023308277130127,
- "mean_token_accuracy": 0.8340359926223755,
- "num_tokens": 10040050.0,
- "step": 1122
- },
- {
- "epoch": 0.8533434650455927,
- "grad_norm": 2.540024518966675,
- "learning_rate": 4.2876464435787576e-06,
- "loss": 0.502303957939148,
- "mean_token_accuracy": 0.8288739919662476,
- "num_tokens": 10045042.0,
- "step": 1123
- },
- {
- "epoch": 0.8541033434650456,
- "grad_norm": 1.7894693613052368,
- "learning_rate": 4.286181699082008e-06,
- "loss": 0.4732973575592041,
- "mean_token_accuracy": 0.8340568542480469,
- "num_tokens": 10054424.0,
- "step": 1124
- },
- {
- "epoch": 0.8548632218844985,
- "grad_norm": 1.5601223707199097,
- "learning_rate": 4.284715700974186e-06,
- "loss": 0.472471684217453,
- "mean_token_accuracy": 0.8274722695350647,
- "num_tokens": 10065523.0,
- "step": 1125
- },
- {
- "epoch": 0.8556231003039514,
- "grad_norm": 1.7326055765151978,
- "learning_rate": 4.283248450284182e-06,
- "loss": 0.5924872159957886,
- "mean_token_accuracy": 0.7943467497825623,
- "num_tokens": 10076839.0,
- "step": 1126
- },
- {
- "epoch": 0.8563829787234043,
- "grad_norm": 1.5165479183197021,
- "learning_rate": 4.281779948041772e-06,
- "loss": 0.44768425822257996,
- "mean_token_accuracy": 0.8394696712493896,
- "num_tokens": 10088168.0,
- "step": 1127
- },
- {
- "epoch": 0.8571428571428571,
- "grad_norm": 1.5448920726776123,
- "learning_rate": 4.280310195277606e-06,
- "loss": 0.4458175003528595,
- "mean_token_accuracy": 0.835773229598999,
- "num_tokens": 10100306.0,
- "step": 1128
- },
- {
- "epoch": 0.85790273556231,
- "grad_norm": 1.6311609745025635,
- "learning_rate": 4.278839193023214e-06,
- "loss": 0.4158072769641876,
- "mean_token_accuracy": 0.8482539653778076,
- "num_tokens": 10110581.0,
- "step": 1129
- },
- {
- "epoch": 0.8586626139817629,
- "grad_norm": 1.6714754104614258,
- "learning_rate": 4.277366942311001e-06,
- "loss": 0.3686875104904175,
- "mean_token_accuracy": 0.8681533336639404,
- "num_tokens": 10118799.0,
- "step": 1130
- },
- {
- "epoch": 0.8594224924012158,
- "grad_norm": 2.1604413986206055,
- "learning_rate": 4.2758934441742494e-06,
- "loss": 0.37267982959747314,
- "mean_token_accuracy": 0.8520427346229553,
- "num_tokens": 10124734.0,
- "step": 1131
- },
- {
- "epoch": 0.8601823708206687,
- "grad_norm": 2.123013973236084,
- "learning_rate": 4.274418699647117e-06,
- "loss": 0.49963313341140747,
- "mean_token_accuracy": 0.8248758912086487,
- "num_tokens": 10131965.0,
- "step": 1132
- },
- {
- "epoch": 0.8609422492401215,
- "grad_norm": 1.4308786392211914,
- "learning_rate": 4.272942709764638e-06,
- "loss": 0.48666873574256897,
- "mean_token_accuracy": 0.8304717540740967,
- "num_tokens": 10145164.0,
- "step": 1133
- },
- {
- "epoch": 0.8617021276595744,
- "grad_norm": 1.7952618598937988,
- "learning_rate": 4.271465475562716e-06,
- "loss": 0.5536223649978638,
- "mean_token_accuracy": 0.8093959093093872,
- "num_tokens": 10154083.0,
- "step": 1134
- },
- {
- "epoch": 0.8624620060790273,
- "grad_norm": 2.0622456073760986,
- "learning_rate": 4.269986998078132e-06,
- "loss": 0.5173629522323608,
- "mean_token_accuracy": 0.8285619020462036,
- "num_tokens": 10161889.0,
- "step": 1135
- },
- {
- "epoch": 0.8632218844984803,
- "grad_norm": 2.0707509517669678,
- "learning_rate": 4.268507278348539e-06,
- "loss": 0.5871608257293701,
- "mean_token_accuracy": 0.7827386856079102,
- "num_tokens": 10170726.0,
- "step": 1136
- },
- {
- "epoch": 0.8639817629179332,
- "grad_norm": 2.054368257522583,
- "learning_rate": 4.2670263174124615e-06,
- "loss": 0.5788969993591309,
- "mean_token_accuracy": 0.7967237234115601,
- "num_tokens": 10178474.0,
- "step": 1137
- },
- {
- "epoch": 0.8647416413373861,
- "grad_norm": 1.901846170425415,
- "learning_rate": 4.265544116309294e-06,
- "loss": 0.5405587553977966,
- "mean_token_accuracy": 0.8151819705963135,
- "num_tokens": 10187013.0,
- "step": 1138
- },
- {
- "epoch": 0.8655015197568389,
- "grad_norm": 2.901285409927368,
- "learning_rate": 4.264060676079302e-06,
- "loss": 0.44101861119270325,
- "mean_token_accuracy": 0.8433429002761841,
- "num_tokens": 10191517.0,
- "step": 1139
- },
- {
- "epoch": 0.8662613981762918,
- "grad_norm": 2.4168388843536377,
- "learning_rate": 4.262575997763622e-06,
- "loss": 0.4686204195022583,
- "mean_token_accuracy": 0.8505309820175171,
- "num_tokens": 10196948.0,
- "step": 1140
- },
- {
- "epoch": 0.8670212765957447,
- "grad_norm": 1.9588396549224854,
- "learning_rate": 4.2610900824042575e-06,
- "loss": 0.47056013345718384,
- "mean_token_accuracy": 0.8280024528503418,
- "num_tokens": 10204292.0,
- "step": 1141
- },
- {
- "epoch": 0.8677811550151976,
- "grad_norm": 2.569150924682617,
- "learning_rate": 4.2596029310440826e-06,
- "loss": 0.573108434677124,
- "mean_token_accuracy": 0.8108246326446533,
- "num_tokens": 10209571.0,
- "step": 1142
- },
- {
- "epoch": 0.8685410334346505,
- "grad_norm": 2.038032293319702,
- "learning_rate": 4.258114544726835e-06,
- "loss": 0.40545332431793213,
- "mean_token_accuracy": 0.8611703515052795,
- "num_tokens": 10215716.0,
- "step": 1143
- },
- {
- "epoch": 0.8693009118541033,
- "grad_norm": 1.9884231090545654,
- "learning_rate": 4.256624924497124e-06,
- "loss": 0.40085992217063904,
- "mean_token_accuracy": 0.8615031242370605,
- "num_tokens": 10222775.0,
- "step": 1144
- },
- {
- "epoch": 0.8700607902735562,
- "grad_norm": 1.912842035293579,
- "learning_rate": 4.25513407140042e-06,
- "loss": 0.41022324562072754,
- "mean_token_accuracy": 0.8459607362747192,
- "num_tokens": 10229589.0,
- "step": 1145
- },
- {
- "epoch": 0.8708206686930091,
- "grad_norm": 1.9190576076507568,
- "learning_rate": 4.253641986483063e-06,
- "loss": 0.5541447401046753,
- "mean_token_accuracy": 0.8256468772888184,
- "num_tokens": 10240633.0,
- "step": 1146
- },
- {
- "epoch": 0.871580547112462,
- "grad_norm": 1.3742294311523438,
- "learning_rate": 4.2521486707922545e-06,
- "loss": 0.3680543899536133,
- "mean_token_accuracy": 0.8654477596282959,
- "num_tokens": 10251252.0,
- "step": 1147
- },
- {
- "epoch": 0.8723404255319149,
- "grad_norm": 1.4438525438308716,
- "learning_rate": 4.250654125376062e-06,
- "loss": 0.45830875635147095,
- "mean_token_accuracy": 0.8433834314346313,
- "num_tokens": 10263980.0,
- "step": 1148
- },
- {
- "epoch": 0.8731003039513677,
- "grad_norm": 2.1273653507232666,
- "learning_rate": 4.249158351283414e-06,
- "loss": 0.4129376709461212,
- "mean_token_accuracy": 0.861556351184845,
- "num_tokens": 10270426.0,
- "step": 1149
- },
- {
- "epoch": 0.8738601823708206,
- "grad_norm": 2.598440647125244,
- "learning_rate": 4.247661349564103e-06,
- "loss": 0.418030709028244,
- "mean_token_accuracy": 0.86553955078125,
- "num_tokens": 10275493.0,
- "step": 1150
- },
- {
- "epoch": 0.8746200607902735,
- "grad_norm": 1.6852490901947021,
- "learning_rate": 4.246163121268782e-06,
- "loss": 0.6403408050537109,
- "mean_token_accuracy": 0.7966094017028809,
- "num_tokens": 10287989.0,
- "step": 1151
- },
- {
- "epoch": 0.8753799392097265,
- "grad_norm": 2.5013794898986816,
- "learning_rate": 4.244663667448965e-06,
- "loss": 0.49922505021095276,
- "mean_token_accuracy": 0.8318735361099243,
- "num_tokens": 10293360.0,
- "step": 1152
- },
- {
- "epoch": 0.8761398176291794,
- "grad_norm": 1.2022709846496582,
- "learning_rate": 4.243162989157027e-06,
- "loss": 0.4414965510368347,
- "mean_token_accuracy": 0.8338693380355835,
- "num_tokens": 10310558.0,
- "step": 1153
- },
- {
- "epoch": 0.8768996960486323,
- "grad_norm": 1.9903281927108765,
- "learning_rate": 4.241661087446202e-06,
- "loss": 0.4277610778808594,
- "mean_token_accuracy": 0.8560749292373657,
- "num_tokens": 10316983.0,
- "step": 1154
- },
- {
- "epoch": 0.8776595744680851,
- "grad_norm": 2.104923725128174,
- "learning_rate": 4.240157963370583e-06,
- "loss": 0.44431713223457336,
- "mean_token_accuracy": 0.8785282969474792,
- "num_tokens": 10323294.0,
- "step": 1155
- },
- {
- "epoch": 0.878419452887538,
- "grad_norm": 2.8364813327789307,
- "learning_rate": 4.2386536179851175e-06,
- "loss": 0.49948397278785706,
- "mean_token_accuracy": 0.8305255174636841,
- "num_tokens": 10327662.0,
- "step": 1156
- },
- {
- "epoch": 0.8791793313069909,
- "grad_norm": 1.9493682384490967,
- "learning_rate": 4.2371480523456156e-06,
- "loss": 0.45867404341697693,
- "mean_token_accuracy": 0.8373264074325562,
- "num_tokens": 10335699.0,
- "step": 1157
- },
- {
- "epoch": 0.8799392097264438,
- "grad_norm": 2.268616199493408,
- "learning_rate": 4.235641267508741e-06,
- "loss": 0.4547857940196991,
- "mean_token_accuracy": 0.8252766132354736,
- "num_tokens": 10342464.0,
- "step": 1158
- },
- {
- "epoch": 0.8806990881458967,
- "grad_norm": 2.1334283351898193,
- "learning_rate": 4.234133264532012e-06,
- "loss": 0.39503124356269836,
- "mean_token_accuracy": 0.8648351430892944,
- "num_tokens": 10347514.0,
- "step": 1159
- },
- {
- "epoch": 0.8814589665653495,
- "grad_norm": 1.2775357961654663,
- "learning_rate": 4.232624044473805e-06,
- "loss": 0.39945733547210693,
- "mean_token_accuracy": 0.8369829654693604,
- "num_tokens": 10363316.0,
- "step": 1160
- },
- {
- "epoch": 0.8822188449848024,
- "grad_norm": 2.458413600921631,
- "learning_rate": 4.231113608393348e-06,
- "loss": 0.5020045638084412,
- "mean_token_accuracy": 0.8295938968658447,
- "num_tokens": 10368401.0,
- "step": 1161
- },
- {
- "epoch": 0.8829787234042553,
- "grad_norm": 1.7464948892593384,
- "learning_rate": 4.229601957350722e-06,
- "loss": 0.5335392951965332,
- "mean_token_accuracy": 0.8134858012199402,
- "num_tokens": 10378337.0,
- "step": 1162
- },
- {
- "epoch": 0.8837386018237082,
- "grad_norm": 3.1152119636535645,
- "learning_rate": 4.228089092406863e-06,
- "loss": 0.4811682105064392,
- "mean_token_accuracy": 0.8460187315940857,
- "num_tokens": 10382362.0,
- "step": 1163
- },
- {
- "epoch": 0.8844984802431611,
- "grad_norm": 2.190847158432007,
- "learning_rate": 4.226575014623557e-06,
- "loss": 0.4428049921989441,
- "mean_token_accuracy": 0.8382467031478882,
- "num_tokens": 10388211.0,
- "step": 1164
- },
- {
- "epoch": 0.8852583586626139,
- "grad_norm": 1.860153079032898,
- "learning_rate": 4.225059725063444e-06,
- "loss": 0.5265918970108032,
- "mean_token_accuracy": 0.8181334733963013,
- "num_tokens": 10398873.0,
- "step": 1165
- },
- {
- "epoch": 0.8860182370820668,
- "grad_norm": 1.3372713327407837,
- "learning_rate": 4.22354322479001e-06,
- "loss": 0.43202850222587585,
- "mean_token_accuracy": 0.8432420492172241,
- "num_tokens": 10413158.0,
- "step": 1166
- },
- {
- "epoch": 0.8867781155015197,
- "grad_norm": 1.3653379678726196,
- "learning_rate": 4.222025514867596e-06,
- "loss": 0.43780991435050964,
- "mean_token_accuracy": 0.8441485166549683,
- "num_tokens": 10428137.0,
- "step": 1167
- },
- {
- "epoch": 0.8875379939209727,
- "grad_norm": 3.0230672359466553,
- "learning_rate": 4.220506596361387e-06,
- "loss": 0.6039337515830994,
- "mean_token_accuracy": 0.8274872303009033,
- "num_tokens": 10432586.0,
- "step": 1168
- },
- {
- "epoch": 0.8882978723404256,
- "grad_norm": 2.2180392742156982,
- "learning_rate": 4.218986470337419e-06,
- "loss": 0.5453792810440063,
- "mean_token_accuracy": 0.8127184510231018,
- "num_tokens": 10439471.0,
- "step": 1169
- },
- {
- "epoch": 0.8890577507598785,
- "grad_norm": 1.8519103527069092,
- "learning_rate": 4.217465137862575e-06,
- "loss": 0.5145469903945923,
- "mean_token_accuracy": 0.8178654909133911,
- "num_tokens": 10450471.0,
- "step": 1170
- },
- {
- "epoch": 0.8898176291793313,
- "grad_norm": 2.034008026123047,
- "learning_rate": 4.215942600004586e-06,
- "loss": 0.44061461091041565,
- "mean_token_accuracy": 0.8572084307670593,
- "num_tokens": 10457382.0,
- "step": 1171
- },
- {
- "epoch": 0.8905775075987842,
- "grad_norm": 3.4304304122924805,
- "learning_rate": 4.214418857832025e-06,
- "loss": 0.44397830963134766,
- "mean_token_accuracy": 0.842149019241333,
- "num_tokens": 10460650.0,
- "step": 1172
- },
- {
- "epoch": 0.8913373860182371,
- "grad_norm": 1.9021750688552856,
- "learning_rate": 4.212893912414316e-06,
- "loss": 0.3769867420196533,
- "mean_token_accuracy": 0.8806171417236328,
- "num_tokens": 10468214.0,
- "step": 1173
- },
- {
- "epoch": 0.89209726443769,
- "grad_norm": 1.9704062938690186,
- "learning_rate": 4.211367764821722e-06,
- "loss": 0.5501819849014282,
- "mean_token_accuracy": 0.8176811337471008,
- "num_tokens": 10476739.0,
- "step": 1174
- },
- {
- "epoch": 0.8928571428571429,
- "grad_norm": 1.4350415468215942,
- "learning_rate": 4.209840416125353e-06,
- "loss": 0.41897401213645935,
- "mean_token_accuracy": 0.8498011827468872,
- "num_tokens": 10491769.0,
- "step": 1175
- },
- {
- "epoch": 0.8936170212765957,
- "grad_norm": 3.8237783908843994,
- "learning_rate": 4.208311867397162e-06,
- "loss": 0.5296977162361145,
- "mean_token_accuracy": 0.8168715834617615,
- "num_tokens": 10494958.0,
- "step": 1176
- },
- {
- "epoch": 0.8943768996960486,
- "grad_norm": 2.04784893989563,
- "learning_rate": 4.206782119709942e-06,
- "loss": 0.476105272769928,
- "mean_token_accuracy": 0.834011435508728,
- "num_tokens": 10502077.0,
- "step": 1177
- },
- {
- "epoch": 0.8951367781155015,
- "grad_norm": 1.8839610815048218,
- "learning_rate": 4.205251174137329e-06,
- "loss": 0.49628815054893494,
- "mean_token_accuracy": 0.8212119936943054,
- "num_tokens": 10510077.0,
- "step": 1178
- },
- {
- "epoch": 0.8958966565349544,
- "grad_norm": 1.2100634574890137,
- "learning_rate": 4.2037190317538e-06,
- "loss": 0.4931519329547882,
- "mean_token_accuracy": 0.8170043230056763,
- "num_tokens": 10528373.0,
- "step": 1179
- },
- {
- "epoch": 0.8966565349544073,
- "grad_norm": 1.884637713432312,
- "learning_rate": 4.202185693634671e-06,
- "loss": 0.4913347363471985,
- "mean_token_accuracy": 0.8234949707984924,
- "num_tokens": 10537108.0,
- "step": 1180
- },
- {
- "epoch": 0.8974164133738601,
- "grad_norm": 1.5062434673309326,
- "learning_rate": 4.200651160856099e-06,
- "loss": 0.4160492420196533,
- "mean_token_accuracy": 0.845937192440033,
- "num_tokens": 10547577.0,
- "step": 1181
- },
- {
- "epoch": 0.898176291793313,
- "grad_norm": 2.331169605255127,
- "learning_rate": 4.1991154344950755e-06,
- "loss": 0.6532632112503052,
- "mean_token_accuracy": 0.7743191123008728,
- "num_tokens": 10556328.0,
- "step": 1182
- },
- {
- "epoch": 0.898936170212766,
- "grad_norm": 1.3538362979888916,
- "learning_rate": 4.197578515629435e-06,
- "loss": 0.4437566101551056,
- "mean_token_accuracy": 0.8427901268005371,
- "num_tokens": 10570026.0,
- "step": 1183
- },
- {
- "epoch": 0.8996960486322189,
- "grad_norm": 2.3828957080841064,
- "learning_rate": 4.196040405337846e-06,
- "loss": 0.6185290217399597,
- "mean_token_accuracy": 0.7969824075698853,
- "num_tokens": 10576465.0,
- "step": 1184
- },
- {
- "epoch": 0.9004559270516718,
- "grad_norm": 2.4759042263031006,
- "learning_rate": 4.194501104699813e-06,
- "loss": 0.46489226818084717,
- "mean_token_accuracy": 0.8472316265106201,
- "num_tokens": 10582034.0,
- "step": 1185
- },
- {
- "epoch": 0.9012158054711246,
- "grad_norm": 1.9215164184570312,
- "learning_rate": 4.192960614795676e-06,
- "loss": 0.48001551628112793,
- "mean_token_accuracy": 0.8371596336364746,
- "num_tokens": 10590556.0,
- "step": 1186
- },
- {
- "epoch": 0.9019756838905775,
- "grad_norm": 2.2717080116271973,
- "learning_rate": 4.19141893670661e-06,
- "loss": 0.40083563327789307,
- "mean_token_accuracy": 0.8464195728302002,
- "num_tokens": 10595661.0,
- "step": 1187
- },
- {
- "epoch": 0.9027355623100304,
- "grad_norm": 2.187122344970703,
- "learning_rate": 4.189876071514624e-06,
- "loss": 0.4942901134490967,
- "mean_token_accuracy": 0.8186990022659302,
- "num_tokens": 10603366.0,
- "step": 1188
- },
- {
- "epoch": 0.9034954407294833,
- "grad_norm": 1.542414665222168,
- "learning_rate": 4.188332020302561e-06,
- "loss": 0.4731982946395874,
- "mean_token_accuracy": 0.8487229347229004,
- "num_tokens": 10616203.0,
- "step": 1189
- },
- {
- "epoch": 0.9042553191489362,
- "grad_norm": 0.9957579970359802,
- "learning_rate": 4.186786784154096e-06,
- "loss": 0.33211836218833923,
- "mean_token_accuracy": 0.870644748210907,
- "num_tokens": 10633294.0,
- "step": 1190
- },
- {
- "epoch": 0.9050151975683891,
- "grad_norm": 2.593867540359497,
- "learning_rate": 4.1852403641537344e-06,
- "loss": 0.6825464963912964,
- "mean_token_accuracy": 0.7716869115829468,
- "num_tokens": 10640615.0,
- "step": 1191
- },
- {
- "epoch": 0.9057750759878419,
- "grad_norm": 2.0424516201019287,
- "learning_rate": 4.183692761386813e-06,
- "loss": 0.5672709941864014,
- "mean_token_accuracy": 0.7973801493644714,
- "num_tokens": 10649845.0,
- "step": 1192
- },
- {
- "epoch": 0.9065349544072948,
- "grad_norm": 1.429018259048462,
- "learning_rate": 4.1821439769395e-06,
- "loss": 0.5427846908569336,
- "mean_token_accuracy": 0.8200292587280273,
- "num_tokens": 10665898.0,
- "step": 1193
- },
- {
- "epoch": 0.9072948328267477,
- "grad_norm": 1.9764264822006226,
- "learning_rate": 4.180594011898791e-06,
- "loss": 0.4784567356109619,
- "mean_token_accuracy": 0.82924485206604,
- "num_tokens": 10673595.0,
- "step": 1194
- },
- {
- "epoch": 0.9080547112462006,
- "grad_norm": 1.4004309177398682,
- "learning_rate": 4.1790428673525104e-06,
- "loss": 0.4791432023048401,
- "mean_token_accuracy": 0.8334879875183105,
- "num_tokens": 10687892.0,
- "step": 1195
- },
- {
- "epoch": 0.9088145896656535,
- "grad_norm": 2.2207727432250977,
- "learning_rate": 4.177490544389313e-06,
- "loss": 0.5089365243911743,
- "mean_token_accuracy": 0.8270776271820068,
- "num_tokens": 10694911.0,
- "step": 1196
- },
- {
- "epoch": 0.9095744680851063,
- "grad_norm": 2.2890450954437256,
- "learning_rate": 4.175937044098678e-06,
- "loss": 0.5152267813682556,
- "mean_token_accuracy": 0.8527299165725708,
- "num_tokens": 10700512.0,
- "step": 1197
- },
- {
- "epoch": 0.9103343465045592,
- "grad_norm": 1.7938050031661987,
- "learning_rate": 4.1743823675709115e-06,
- "loss": 0.3507300615310669,
- "mean_token_accuracy": 0.8694599866867065,
- "num_tokens": 10707953.0,
- "step": 1198
- },
- {
- "epoch": 0.9110942249240122,
- "grad_norm": 1.4368808269500732,
- "learning_rate": 4.172826515897146e-06,
- "loss": 0.407418429851532,
- "mean_token_accuracy": 0.8432893753051758,
- "num_tokens": 10717485.0,
- "step": 1199
- },
- {
- "epoch": 0.9118541033434651,
- "grad_norm": 1.735339879989624,
- "learning_rate": 4.171269490169337e-06,
- "loss": 0.46996885538101196,
- "mean_token_accuracy": 0.8331948518753052,
- "num_tokens": 10726160.0,
- "step": 1200
- },
- {
- "epoch": 0.912613981762918,
- "grad_norm": 1.7859221696853638,
- "learning_rate": 4.1697112914802665e-06,
- "loss": 0.5325199365615845,
- "mean_token_accuracy": 0.8179605007171631,
- "num_tokens": 10736284.0,
- "step": 1201
- },
- {
- "epoch": 0.9133738601823708,
- "grad_norm": 2.6394896507263184,
- "learning_rate": 4.168151920923536e-06,
- "loss": 0.4039744734764099,
- "mean_token_accuracy": 0.8545527458190918,
- "num_tokens": 10740673.0,
- "step": 1202
- },
- {
- "epoch": 0.9141337386018237,
- "grad_norm": 1.910988211631775,
- "learning_rate": 4.1665913795935755e-06,
- "loss": 0.5190291404724121,
- "mean_token_accuracy": 0.8203921318054199,
- "num_tokens": 10751946.0,
- "step": 1203
- },
- {
- "epoch": 0.9148936170212766,
- "grad_norm": 3.0006964206695557,
- "learning_rate": 4.16502966858563e-06,
- "loss": 0.5856777429580688,
- "mean_token_accuracy": 0.8061224222183228,
- "num_tokens": 10756795.0,
- "step": 1204
- },
- {
- "epoch": 0.9156534954407295,
- "grad_norm": 1.7396167516708374,
- "learning_rate": 4.163466788995768e-06,
- "loss": 0.54935222864151,
- "mean_token_accuracy": 0.8052443265914917,
- "num_tokens": 10767202.0,
- "step": 1205
- },
- {
- "epoch": 0.9164133738601824,
- "grad_norm": 2.143735885620117,
- "learning_rate": 4.161902741920881e-06,
- "loss": 0.5020298361778259,
- "mean_token_accuracy": 0.8249630928039551,
- "num_tokens": 10774329.0,
- "step": 1206
- },
- {
- "epoch": 0.9171732522796353,
- "grad_norm": 2.8871893882751465,
- "learning_rate": 4.160337528458676e-06,
- "loss": 0.5154489278793335,
- "mean_token_accuracy": 0.8276848793029785,
- "num_tokens": 10778929.0,
- "step": 1207
- },
- {
- "epoch": 0.9179331306990881,
- "grad_norm": 1.4642788171768188,
- "learning_rate": 4.15877114970768e-06,
- "loss": 0.5033774375915527,
- "mean_token_accuracy": 0.8296241164207458,
- "num_tokens": 10790928.0,
- "step": 1208
- },
- {
- "epoch": 0.918693009118541,
- "grad_norm": 1.8313497304916382,
- "learning_rate": 4.1572036067672386e-06,
- "loss": 0.5674909353256226,
- "mean_token_accuracy": 0.7975562214851379,
- "num_tokens": 10801372.0,
- "step": 1209
- },
- {
- "epoch": 0.9194528875379939,
- "grad_norm": 2.005958080291748,
- "learning_rate": 4.155634900737513e-06,
- "loss": 0.5557019114494324,
- "mean_token_accuracy": 0.8141391277313232,
- "num_tokens": 10809150.0,
- "step": 1210
- },
- {
- "epoch": 0.9202127659574468,
- "grad_norm": 2.333519697189331,
- "learning_rate": 4.154065032719482e-06,
- "loss": 0.6990420818328857,
- "mean_token_accuracy": 0.7565394043922424,
- "num_tokens": 10816612.0,
- "step": 1211
- },
- {
- "epoch": 0.9209726443768997,
- "grad_norm": 1.4472655057907104,
- "learning_rate": 4.152494003814939e-06,
- "loss": 0.541398286819458,
- "mean_token_accuracy": 0.8027358055114746,
- "num_tokens": 10833840.0,
- "step": 1212
- },
- {
- "epoch": 0.9217325227963525,
- "grad_norm": 1.6183619499206543,
- "learning_rate": 4.150921815126493e-06,
- "loss": 0.6096762418746948,
- "mean_token_accuracy": 0.7994354963302612,
- "num_tokens": 10846367.0,
- "step": 1213
- },
- {
- "epoch": 0.9224924012158054,
- "grad_norm": 2.614919900894165,
- "learning_rate": 4.149348467757566e-06,
- "loss": 0.41846764087677,
- "mean_token_accuracy": 0.8555068969726562,
- "num_tokens": 10850836.0,
- "step": 1214
- },
- {
- "epoch": 0.9232522796352584,
- "grad_norm": 1.4419831037521362,
- "learning_rate": 4.147773962812393e-06,
- "loss": 0.4139535427093506,
- "mean_token_accuracy": 0.845671534538269,
- "num_tokens": 10864228.0,
- "step": 1215
- },
- {
- "epoch": 0.9240121580547113,
- "grad_norm": 2.3868865966796875,
- "learning_rate": 4.146198301396025e-06,
- "loss": 0.3357275128364563,
- "mean_token_accuracy": 0.8829520344734192,
- "num_tokens": 10868920.0,
- "step": 1216
- },
- {
- "epoch": 0.9247720364741642,
- "grad_norm": 1.7685474157333374,
- "learning_rate": 4.14462148461432e-06,
- "loss": 0.45333072543144226,
- "mean_token_accuracy": 0.8505891561508179,
- "num_tokens": 10877286.0,
- "step": 1217
- },
- {
- "epoch": 0.925531914893617,
- "grad_norm": 1.7627625465393066,
- "learning_rate": 4.143043513573949e-06,
- "loss": 0.5028705596923828,
- "mean_token_accuracy": 0.825471043586731,
- "num_tokens": 10887047.0,
- "step": 1218
- },
- {
- "epoch": 0.9262917933130699,
- "grad_norm": 1.3168725967407227,
- "learning_rate": 4.141464389382392e-06,
- "loss": 0.5494637489318848,
- "mean_token_accuracy": 0.8121747970581055,
- "num_tokens": 10903599.0,
- "step": 1219
- },
- {
- "epoch": 0.9270516717325228,
- "grad_norm": 2.5180399417877197,
- "learning_rate": 4.13988411314794e-06,
- "loss": 0.6134277582168579,
- "mean_token_accuracy": 0.7983006834983826,
- "num_tokens": 10909791.0,
- "step": 1220
- },
- {
- "epoch": 0.9278115501519757,
- "grad_norm": 1.1889166831970215,
- "learning_rate": 4.13830268597969e-06,
- "loss": 0.36713096499443054,
- "mean_token_accuracy": 0.8416121006011963,
- "num_tokens": 10925794.0,
- "step": 1221
- },
- {
- "epoch": 0.9285714285714286,
- "grad_norm": 2.142422676086426,
- "learning_rate": 4.136720108987552e-06,
- "loss": 0.4427933096885681,
- "mean_token_accuracy": 0.8427745699882507,
- "num_tokens": 10931622.0,
- "step": 1222
- },
- {
- "epoch": 0.9293313069908815,
- "grad_norm": 1.908564567565918,
- "learning_rate": 4.1351363832822364e-06,
- "loss": 0.5088109374046326,
- "mean_token_accuracy": 0.8309272527694702,
- "num_tokens": 10940843.0,
- "step": 1223
- },
- {
- "epoch": 0.9300911854103343,
- "grad_norm": 1.2862322330474854,
- "learning_rate": 4.133551509975264e-06,
- "loss": 0.3963761329650879,
- "mean_token_accuracy": 0.8602159023284912,
- "num_tokens": 10954481.0,
- "step": 1224
- },
- {
- "epoch": 0.9308510638297872,
- "grad_norm": 1.5876200199127197,
- "learning_rate": 4.13196549017896e-06,
- "loss": 0.4311184287071228,
- "mean_token_accuracy": 0.8460899591445923,
- "num_tokens": 10963501.0,
- "step": 1225
- },
- {
- "epoch": 0.9316109422492401,
- "grad_norm": 2.459878444671631,
- "learning_rate": 4.130378325006453e-06,
- "loss": 0.5016295313835144,
- "mean_token_accuracy": 0.8125218152999878,
- "num_tokens": 10968850.0,
- "step": 1226
- },
- {
- "epoch": 0.932370820668693,
- "grad_norm": 2.059718370437622,
- "learning_rate": 4.128790015571679e-06,
- "loss": 0.48982277512550354,
- "mean_token_accuracy": 0.8327049016952515,
- "num_tokens": 10976642.0,
- "step": 1227
- },
- {
- "epoch": 0.9331306990881459,
- "grad_norm": 1.3719185590744019,
- "learning_rate": 4.127200562989372e-06,
- "loss": 0.38778752088546753,
- "mean_token_accuracy": 0.8623501062393188,
- "num_tokens": 10988703.0,
- "step": 1228
- },
- {
- "epoch": 0.9338905775075987,
- "grad_norm": 1.302140712738037,
- "learning_rate": 4.125609968375073e-06,
- "loss": 0.4887842535972595,
- "mean_token_accuracy": 0.8322232961654663,
- "num_tokens": 11005981.0,
- "step": 1229
- },
- {
- "epoch": 0.9346504559270516,
- "grad_norm": 1.819624423980713,
- "learning_rate": 4.12401823284512e-06,
- "loss": 0.49825209379196167,
- "mean_token_accuracy": 0.8278916478157043,
- "num_tokens": 11014145.0,
- "step": 1230
- },
- {
- "epoch": 0.9354103343465046,
- "grad_norm": 1.2762807607650757,
- "learning_rate": 4.122425357516658e-06,
- "loss": 0.433994323015213,
- "mean_token_accuracy": 0.853028416633606,
- "num_tokens": 11029232.0,
- "step": 1231
- },
- {
- "epoch": 0.9361702127659575,
- "grad_norm": 2.2171671390533447,
- "learning_rate": 4.1208313435076255e-06,
- "loss": 0.38436949253082275,
- "mean_token_accuracy": 0.8616260290145874,
- "num_tokens": 11034743.0,
- "step": 1232
- },
- {
- "epoch": 0.9369300911854104,
- "grad_norm": 1.355879545211792,
- "learning_rate": 4.119236191936764e-06,
- "loss": 0.5378084182739258,
- "mean_token_accuracy": 0.8256701231002808,
- "num_tokens": 11048149.0,
- "step": 1233
- },
- {
- "epoch": 0.9376899696048632,
- "grad_norm": 2.66812801361084,
- "learning_rate": 4.117639903923611e-06,
- "loss": 0.5236451625823975,
- "mean_token_accuracy": 0.8431973457336426,
- "num_tokens": 11052295.0,
- "step": 1234
- },
- {
- "epoch": 0.9384498480243161,
- "grad_norm": 1.5740545988082886,
- "learning_rate": 4.116042480588505e-06,
- "loss": 0.44322824478149414,
- "mean_token_accuracy": 0.8436908721923828,
- "num_tokens": 11062066.0,
- "step": 1235
- },
- {
- "epoch": 0.939209726443769,
- "grad_norm": 1.230706810951233,
- "learning_rate": 4.114443923052577e-06,
- "loss": 0.3325323462486267,
- "mean_token_accuracy": 0.8674666881561279,
- "num_tokens": 11074300.0,
- "step": 1236
- },
- {
- "epoch": 0.9399696048632219,
- "grad_norm": 1.9870070219039917,
- "learning_rate": 4.112844232437757e-06,
- "loss": 0.5711548328399658,
- "mean_token_accuracy": 0.8081738948822021,
- "num_tokens": 11082297.0,
- "step": 1237
- },
- {
- "epoch": 0.9407294832826748,
- "grad_norm": 1.3020970821380615,
- "learning_rate": 4.11124340986677e-06,
- "loss": 0.4187922477722168,
- "mean_token_accuracy": 0.8566171526908875,
- "num_tokens": 11096810.0,
- "step": 1238
- },
- {
- "epoch": 0.9414893617021277,
- "grad_norm": 2.1399197578430176,
- "learning_rate": 4.109641456463135e-06,
- "loss": 0.5293116569519043,
- "mean_token_accuracy": 0.8176157474517822,
- "num_tokens": 11102761.0,
- "step": 1239
- },
- {
- "epoch": 0.9422492401215805,
- "grad_norm": 1.3503763675689697,
- "learning_rate": 4.108038373351163e-06,
- "loss": 0.4907652735710144,
- "mean_token_accuracy": 0.8204987049102783,
- "num_tokens": 11118480.0,
- "step": 1240
- },
- {
- "epoch": 0.9430091185410334,
- "grad_norm": 1.9571399688720703,
- "learning_rate": 4.106434161655962e-06,
- "loss": 0.4709656536579132,
- "mean_token_accuracy": 0.8371885418891907,
- "num_tokens": 11126265.0,
- "step": 1241
- },
- {
- "epoch": 0.9437689969604863,
- "grad_norm": 2.1277313232421875,
- "learning_rate": 4.104828822503427e-06,
- "loss": 0.4010283350944519,
- "mean_token_accuracy": 0.8586333990097046,
- "num_tokens": 11133022.0,
- "step": 1242
- },
- {
- "epoch": 0.9445288753799392,
- "grad_norm": 1.6745036840438843,
- "learning_rate": 4.103222357020248e-06,
- "loss": 0.562545657157898,
- "mean_token_accuracy": 0.8052060604095459,
- "num_tokens": 11145255.0,
- "step": 1243
- },
- {
- "epoch": 0.9452887537993921,
- "grad_norm": 2.3616299629211426,
- "learning_rate": 4.101614766333904e-06,
- "loss": 0.5878340601921082,
- "mean_token_accuracy": 0.796745777130127,
- "num_tokens": 11152020.0,
- "step": 1244
- },
- {
- "epoch": 0.9460486322188449,
- "grad_norm": 1.6182078123092651,
- "learning_rate": 4.100006051572664e-06,
- "loss": 0.5357589721679688,
- "mean_token_accuracy": 0.8089962005615234,
- "num_tokens": 11163112.0,
- "step": 1245
- },
- {
- "epoch": 0.9468085106382979,
- "grad_norm": 1.911770224571228,
- "learning_rate": 4.098396213865587e-06,
- "loss": 0.49805426597595215,
- "mean_token_accuracy": 0.8289647102355957,
- "num_tokens": 11171768.0,
- "step": 1246
- },
- {
- "epoch": 0.9475683890577508,
- "grad_norm": 1.649155616760254,
- "learning_rate": 4.096785254342518e-06,
- "loss": 0.5756166577339172,
- "mean_token_accuracy": 0.807680606842041,
- "num_tokens": 11183527.0,
- "step": 1247
- },
- {
- "epoch": 0.9483282674772037,
- "grad_norm": 1.8922761678695679,
- "learning_rate": 4.095173174134091e-06,
- "loss": 0.44688963890075684,
- "mean_token_accuracy": 0.8375608921051025,
- "num_tokens": 11191494.0,
- "step": 1248
- },
- {
- "epoch": 0.9490881458966566,
- "grad_norm": 2.9044547080993652,
- "learning_rate": 4.093559974371725e-06,
- "loss": 0.48609739542007446,
- "mean_token_accuracy": 0.8404892086982727,
- "num_tokens": 11195837.0,
- "step": 1249
- },
- {
- "epoch": 0.9498480243161094,
- "grad_norm": 2.287506580352783,
- "learning_rate": 4.091945656187626e-06,
- "loss": 0.5260225534439087,
- "mean_token_accuracy": 0.8181945085525513,
- "num_tokens": 11202174.0,
- "step": 1250
- },
- {
- "epoch": 0.9506079027355623,
- "grad_norm": 1.7908886671066284,
- "learning_rate": 4.090330220714785e-06,
- "loss": 0.4207724928855896,
- "mean_token_accuracy": 0.8616912364959717,
- "num_tokens": 11209995.0,
- "step": 1251
- },
- {
- "epoch": 0.9513677811550152,
- "grad_norm": 2.905418634414673,
- "learning_rate": 4.0887136690869774e-06,
- "loss": 0.4209241271018982,
- "mean_token_accuracy": 0.8561323285102844,
- "num_tokens": 11213799.0,
- "step": 1252
- },
- {
- "epoch": 0.9521276595744681,
- "grad_norm": 2.814150333404541,
- "learning_rate": 4.08709600243876e-06,
- "loss": 0.36855608224868774,
- "mean_token_accuracy": 0.8764539361000061,
- "num_tokens": 11217643.0,
- "step": 1253
- },
- {
- "epoch": 0.952887537993921,
- "grad_norm": 1.9385707378387451,
- "learning_rate": 4.0854772219054735e-06,
- "loss": 0.531031608581543,
- "mean_token_accuracy": 0.80600905418396,
- "num_tokens": 11225871.0,
- "step": 1254
- },
- {
- "epoch": 0.9536474164133738,
- "grad_norm": 2.103058099746704,
- "learning_rate": 4.083857328623243e-06,
- "loss": 0.4576364755630493,
- "mean_token_accuracy": 0.8447524905204773,
- "num_tokens": 11231829.0,
- "step": 1255
- },
- {
- "epoch": 0.9544072948328267,
- "grad_norm": 1.7518818378448486,
- "learning_rate": 4.082236323728969e-06,
- "loss": 0.5386767983436584,
- "mean_token_accuracy": 0.8055596351623535,
- "num_tokens": 11240977.0,
- "step": 1256
- },
- {
- "epoch": 0.9551671732522796,
- "grad_norm": 1.8434966802597046,
- "learning_rate": 4.0806142083603365e-06,
- "loss": 0.5415925979614258,
- "mean_token_accuracy": 0.809962272644043,
- "num_tokens": 11249616.0,
- "step": 1257
- },
- {
- "epoch": 0.9559270516717325,
- "grad_norm": 1.7341015338897705,
- "learning_rate": 4.078990983655807e-06,
- "loss": 0.4621101915836334,
- "mean_token_accuracy": 0.8330386877059937,
- "num_tokens": 11258616.0,
- "step": 1258
- },
- {
- "epoch": 0.9566869300911854,
- "grad_norm": 1.8589727878570557,
- "learning_rate": 4.077366650754624e-06,
- "loss": 0.4031238555908203,
- "mean_token_accuracy": 0.842434287071228,
- "num_tokens": 11266006.0,
- "step": 1259
- },
- {
- "epoch": 0.9574468085106383,
- "grad_norm": 1.657175898551941,
- "learning_rate": 4.075741210796806e-06,
- "loss": 0.41686388850212097,
- "mean_token_accuracy": 0.8443650007247925,
- "num_tokens": 11275601.0,
- "step": 1260
- },
- {
- "epoch": 0.9582066869300911,
- "grad_norm": 2.4303717613220215,
- "learning_rate": 4.07411466492315e-06,
- "loss": 0.4554435610771179,
- "mean_token_accuracy": 0.853043794631958,
- "num_tokens": 11280650.0,
- "step": 1261
- },
- {
- "epoch": 0.958966565349544,
- "grad_norm": 2.3653745651245117,
- "learning_rate": 4.072487014275228e-06,
- "loss": 0.4304995536804199,
- "mean_token_accuracy": 0.8462260961532593,
- "num_tokens": 11285637.0,
- "step": 1262
- },
- {
- "epoch": 0.959726443768997,
- "grad_norm": 1.6689718961715698,
- "learning_rate": 4.070858259995388e-06,
- "loss": 0.5290807485580444,
- "mean_token_accuracy": 0.8176917433738708,
- "num_tokens": 11299110.0,
- "step": 1263
- },
- {
- "epoch": 0.9604863221884499,
- "grad_norm": 2.103879451751709,
- "learning_rate": 4.069228403226751e-06,
- "loss": 0.4620879888534546,
- "mean_token_accuracy": 0.835270345211029,
- "num_tokens": 11305564.0,
- "step": 1264
- },
- {
- "epoch": 0.9612462006079028,
- "grad_norm": 2.139012575149536,
- "learning_rate": 4.067597445113216e-06,
- "loss": 0.5143396258354187,
- "mean_token_accuracy": 0.8191739320755005,
- "num_tokens": 11311870.0,
- "step": 1265
- },
- {
- "epoch": 0.9620060790273556,
- "grad_norm": 1.3971210718154907,
- "learning_rate": 4.06596538679945e-06,
- "loss": 0.472080260515213,
- "mean_token_accuracy": 0.8321092128753662,
- "num_tokens": 11323970.0,
- "step": 1266
- },
- {
- "epoch": 0.9627659574468085,
- "grad_norm": 1.4965174198150635,
- "learning_rate": 4.064332229430895e-06,
- "loss": 0.359701007604599,
- "mean_token_accuracy": 0.8903120160102844,
- "num_tokens": 11333412.0,
- "step": 1267
- },
- {
- "epoch": 0.9635258358662614,
- "grad_norm": 1.1898726224899292,
- "learning_rate": 4.062697974153764e-06,
- "loss": 0.3423798084259033,
- "mean_token_accuracy": 0.8661491870880127,
- "num_tokens": 11347657.0,
- "step": 1268
- },
- {
- "epoch": 0.9642857142857143,
- "grad_norm": 1.4952168464660645,
- "learning_rate": 4.06106262211504e-06,
- "loss": 0.4214417338371277,
- "mean_token_accuracy": 0.8362159729003906,
- "num_tokens": 11357786.0,
- "step": 1269
- },
- {
- "epoch": 0.9650455927051672,
- "grad_norm": 1.7949583530426025,
- "learning_rate": 4.059426174462476e-06,
- "loss": 0.59087735414505,
- "mean_token_accuracy": 0.7965556979179382,
- "num_tokens": 11370561.0,
- "step": 1270
- },
- {
- "epoch": 0.96580547112462,
- "grad_norm": 1.8973214626312256,
- "learning_rate": 4.057788632344594e-06,
- "loss": 0.47525322437286377,
- "mean_token_accuracy": 0.8317365050315857,
- "num_tokens": 11378507.0,
- "step": 1271
- },
- {
- "epoch": 0.9665653495440729,
- "grad_norm": 1.8665250539779663,
- "learning_rate": 4.056149996910683e-06,
- "loss": 0.3537125587463379,
- "mean_token_accuracy": 0.8921569585800171,
- "num_tokens": 11385186.0,
- "step": 1272
- },
- {
- "epoch": 0.9673252279635258,
- "grad_norm": 1.5072317123413086,
- "learning_rate": 4.054510269310803e-06,
- "loss": 0.5145624876022339,
- "mean_token_accuracy": 0.8265488147735596,
- "num_tokens": 11397125.0,
- "step": 1273
- },
- {
- "epoch": 0.9680851063829787,
- "grad_norm": 1.520525574684143,
- "learning_rate": 4.052869450695776e-06,
- "loss": 0.44322293996810913,
- "mean_token_accuracy": 0.8403642177581787,
- "num_tokens": 11409919.0,
- "step": 1274
- },
- {
- "epoch": 0.9688449848024316,
- "grad_norm": 1.3764475584030151,
- "learning_rate": 4.051227542217192e-06,
- "loss": 0.5774400234222412,
- "mean_token_accuracy": 0.804118275642395,
- "num_tokens": 11425900.0,
- "step": 1275
- },
- {
- "epoch": 0.9696048632218845,
- "grad_norm": 1.3922648429870605,
- "learning_rate": 4.049584545027406e-06,
- "loss": 0.42727944254875183,
- "mean_token_accuracy": 0.8654505014419556,
- "num_tokens": 11438787.0,
- "step": 1276
- },
- {
- "epoch": 0.9703647416413373,
- "grad_norm": 1.8505840301513672,
- "learning_rate": 4.047940460279537e-06,
- "loss": 0.490803062915802,
- "mean_token_accuracy": 0.8340574502944946,
- "num_tokens": 11447997.0,
- "step": 1277
- },
- {
- "epoch": 0.9711246200607903,
- "grad_norm": 2.28271222114563,
- "learning_rate": 4.046295289127466e-06,
- "loss": 0.588828444480896,
- "mean_token_accuracy": 0.833497166633606,
- "num_tokens": 11454072.0,
- "step": 1278
- },
- {
- "epoch": 0.9718844984802432,
- "grad_norm": 2.4242560863494873,
- "learning_rate": 4.044649032725836e-06,
- "loss": 0.5128831267356873,
- "mean_token_accuracy": 0.8225122690200806,
- "num_tokens": 11460211.0,
- "step": 1279
- },
- {
- "epoch": 0.9726443768996961,
- "grad_norm": 2.1738455295562744,
- "learning_rate": 4.0430016922300566e-06,
- "loss": 0.441631942987442,
- "mean_token_accuracy": 0.841723620891571,
- "num_tokens": 11466814.0,
- "step": 1280
- },
- {
- "epoch": 0.973404255319149,
- "grad_norm": 2.541599988937378,
- "learning_rate": 4.0413532687962926e-06,
- "loss": 0.5062629580497742,
- "mean_token_accuracy": 0.8013502359390259,
- "num_tokens": 11472371.0,
- "step": 1281
- },
- {
- "epoch": 0.9741641337386018,
- "grad_norm": 2.8011014461517334,
- "learning_rate": 4.039703763581472e-06,
- "loss": 0.5061966776847839,
- "mean_token_accuracy": 0.829810380935669,
- "num_tokens": 11476672.0,
- "step": 1282
- },
- {
- "epoch": 0.9749240121580547,
- "grad_norm": 2.4505462646484375,
- "learning_rate": 4.038053177743279e-06,
- "loss": 0.43407535552978516,
- "mean_token_accuracy": 0.8428469896316528,
- "num_tokens": 11481297.0,
- "step": 1283
- },
- {
- "epoch": 0.9756838905775076,
- "grad_norm": 2.1618378162384033,
- "learning_rate": 4.036401512440161e-06,
- "loss": 0.6056663393974304,
- "mean_token_accuracy": 0.7977457642555237,
- "num_tokens": 11488657.0,
- "step": 1284
- },
- {
- "epoch": 0.9764437689969605,
- "grad_norm": 1.9192147254943848,
- "learning_rate": 4.034748768831319e-06,
- "loss": 0.524390697479248,
- "mean_token_accuracy": 0.8120636940002441,
- "num_tokens": 11496485.0,
- "step": 1285
- },
- {
- "epoch": 0.9772036474164134,
- "grad_norm": 2.766435384750366,
- "learning_rate": 4.033094948076713e-06,
- "loss": 0.5494908690452576,
- "mean_token_accuracy": 0.8141890168190002,
- "num_tokens": 11501341.0,
- "step": 1286
- },
- {
- "epoch": 0.9779635258358662,
- "grad_norm": 1.3519539833068848,
- "learning_rate": 4.031440051337056e-06,
- "loss": 0.4339691400527954,
- "mean_token_accuracy": 0.8400131464004517,
- "num_tokens": 11512843.0,
- "step": 1287
- },
- {
- "epoch": 0.9787234042553191,
- "grad_norm": 1.2492141723632812,
- "learning_rate": 4.02978407977382e-06,
- "loss": 0.4433518052101135,
- "mean_token_accuracy": 0.8432940244674683,
- "num_tokens": 11530227.0,
- "step": 1288
- },
- {
- "epoch": 0.979483282674772,
- "grad_norm": 1.6597715616226196,
- "learning_rate": 4.02812703454923e-06,
- "loss": 0.602222204208374,
- "mean_token_accuracy": 0.786965548992157,
- "num_tokens": 11543955.0,
- "step": 1289
- },
- {
- "epoch": 0.9802431610942249,
- "grad_norm": 1.6621816158294678,
- "learning_rate": 4.026468916826262e-06,
- "loss": 0.35662174224853516,
- "mean_token_accuracy": 0.8716133832931519,
- "num_tokens": 11552064.0,
- "step": 1290
- },
- {
- "epoch": 0.9810030395136778,
- "grad_norm": 4.539844989776611,
- "learning_rate": 4.024809727768648e-06,
- "loss": 0.543423593044281,
- "mean_token_accuracy": 0.8293194770812988,
- "num_tokens": 11555595.0,
- "step": 1291
- },
- {
- "epoch": 0.9817629179331308,
- "grad_norm": 1.4026556015014648,
- "learning_rate": 4.023149468540871e-06,
- "loss": 0.4301237165927887,
- "mean_token_accuracy": 0.8358224630355835,
- "num_tokens": 11572275.0,
- "step": 1292
- },
- {
- "epoch": 0.9825227963525835,
- "grad_norm": 1.611262321472168,
- "learning_rate": 4.021488140308165e-06,
- "loss": 0.5378580689430237,
- "mean_token_accuracy": 0.8173760771751404,
- "num_tokens": 11584299.0,
- "step": 1293
- },
- {
- "epoch": 0.9832826747720365,
- "grad_norm": 4.138631820678711,
- "learning_rate": 4.019825744236514e-06,
- "loss": 0.40272149443626404,
- "mean_token_accuracy": 0.8648844957351685,
- "num_tokens": 11586705.0,
- "step": 1294
- },
- {
- "epoch": 0.9840425531914894,
- "grad_norm": 3.177703619003296,
- "learning_rate": 4.018162281492651e-06,
- "loss": 0.5320103168487549,
- "mean_token_accuracy": 0.8250276446342468,
- "num_tokens": 11590689.0,
- "step": 1295
- },
- {
- "epoch": 0.9848024316109423,
- "grad_norm": 2.727597713470459,
- "learning_rate": 4.016497753244058e-06,
- "loss": 0.5662774443626404,
- "mean_token_accuracy": 0.8074625730514526,
- "num_tokens": 11596092.0,
- "step": 1296
- },
- {
- "epoch": 0.9855623100303952,
- "grad_norm": 1.485139012336731,
- "learning_rate": 4.014832160658966e-06,
- "loss": 0.5414972305297852,
- "mean_token_accuracy": 0.8082696199417114,
- "num_tokens": 11613785.0,
- "step": 1297
- },
- {
- "epoch": 0.986322188449848,
- "grad_norm": 2.4025990962982178,
- "learning_rate": 4.013165504906352e-06,
- "loss": 0.6556503772735596,
- "mean_token_accuracy": 0.7785214781761169,
- "num_tokens": 11620421.0,
- "step": 1298
- },
- {
- "epoch": 0.9870820668693009,
- "grad_norm": 1.878273606300354,
- "learning_rate": 4.011497787155938e-06,
- "loss": 0.4221133887767792,
- "mean_token_accuracy": 0.850035548210144,
- "num_tokens": 11627998.0,
- "step": 1299
- },
- {
- "epoch": 0.9878419452887538,
- "grad_norm": 2.0430715084075928,
- "learning_rate": 4.009829008578192e-06,
- "loss": 0.5205984711647034,
- "mean_token_accuracy": 0.819183349609375,
- "num_tokens": 11636279.0,
- "step": 1300
- },
- {
- "epoch": 0.9886018237082067,
- "grad_norm": 3.4769439697265625,
- "learning_rate": 4.00815917034433e-06,
- "loss": 0.5449948310852051,
- "mean_token_accuracy": 0.8240023851394653,
- "num_tokens": 11639638.0,
- "step": 1301
- },
- {
- "epoch": 0.9893617021276596,
- "grad_norm": 2.4783987998962402,
- "learning_rate": 4.006488273626307e-06,
- "loss": 0.4316832423210144,
- "mean_token_accuracy": 0.8474695086479187,
- "num_tokens": 11645463.0,
- "step": 1302
- },
- {
- "epoch": 0.9901215805471124,
- "grad_norm": 1.881475567817688,
- "learning_rate": 4.004816319596822e-06,
- "loss": 0.5157331824302673,
- "mean_token_accuracy": 0.826042652130127,
- "num_tokens": 11653955.0,
- "step": 1303
- },
- {
- "epoch": 0.9908814589665653,
- "grad_norm": 2.6569254398345947,
- "learning_rate": 4.003143309429317e-06,
- "loss": 0.46492767333984375,
- "mean_token_accuracy": 0.8320850133895874,
- "num_tokens": 11659357.0,
- "step": 1304
- },
- {
- "epoch": 0.9916413373860182,
- "grad_norm": 2.4917593002319336,
- "learning_rate": 4.0014692442979756e-06,
- "loss": 0.459585040807724,
- "mean_token_accuracy": 0.8457611799240112,
- "num_tokens": 11664207.0,
- "step": 1305
- },
- {
- "epoch": 0.9924012158054711,
- "grad_norm": 2.6885526180267334,
- "learning_rate": 3.999794125377721e-06,
- "loss": 0.4677402973175049,
- "mean_token_accuracy": 0.8307361602783203,
- "num_tokens": 11668879.0,
- "step": 1306
- },
- {
- "epoch": 0.993161094224924,
- "grad_norm": 1.9737319946289062,
- "learning_rate": 3.998117953844215e-06,
- "loss": 0.44684839248657227,
- "mean_token_accuracy": 0.8367687463760376,
- "num_tokens": 11676081.0,
- "step": 1307
- },
- {
- "epoch": 0.993920972644377,
- "grad_norm": 1.4333021640777588,
- "learning_rate": 3.996440730873861e-06,
- "loss": 0.526146650314331,
- "mean_token_accuracy": 0.816251814365387,
- "num_tokens": 11689333.0,
- "step": 1308
- },
- {
- "epoch": 0.9946808510638298,
- "grad_norm": 1.3689230680465698,
- "learning_rate": 3.9947624576437975e-06,
- "loss": 0.40214329957962036,
- "mean_token_accuracy": 0.8610327839851379,
- "num_tokens": 11701540.0,
- "step": 1309
- },
- {
- "epoch": 0.9954407294832827,
- "grad_norm": 1.2435375452041626,
- "learning_rate": 3.9930831353319025e-06,
- "loss": 0.4532913267612457,
- "mean_token_accuracy": 0.8415389060974121,
- "num_tokens": 11717920.0,
- "step": 1310
- },
- {
- "epoch": 0.9962006079027356,
- "grad_norm": 1.9968011379241943,
- "learning_rate": 3.9914027651167866e-06,
- "loss": 0.46954160928726196,
- "mean_token_accuracy": 0.8351103663444519,
- "num_tokens": 11724999.0,
- "step": 1311
- },
- {
- "epoch": 0.9969604863221885,
- "grad_norm": 1.9521311521530151,
- "learning_rate": 3.989721348177801e-06,
- "loss": 0.5068016052246094,
- "mean_token_accuracy": 0.8220845460891724,
- "num_tokens": 11732569.0,
- "step": 1312
- },
- {
- "epoch": 0.9977203647416414,
- "grad_norm": 2.7332582473754883,
- "learning_rate": 3.988038885695028e-06,
- "loss": 0.4154692590236664,
- "mean_token_accuracy": 0.8493857383728027,
- "num_tokens": 11736759.0,
- "step": 1313
- },
- {
- "epoch": 0.9984802431610942,
- "grad_norm": 1.8656952381134033,
- "learning_rate": 3.986355378849284e-06,
- "loss": 0.4151354134082794,
- "mean_token_accuracy": 0.83440101146698,
- "num_tokens": 11743827.0,
- "step": 1314
- },
- {
- "epoch": 0.9992401215805471,
- "grad_norm": 1.304006576538086,
- "learning_rate": 3.984670828822118e-06,
- "loss": 0.4926128089427948,
- "mean_token_accuracy": 0.8603005409240723,
- "num_tokens": 11757707.0,
- "step": 1315
- },
- {
- "epoch": 1.0,
- "grad_norm": 1.497079610824585,
- "learning_rate": 3.982985236795815e-06,
- "loss": 0.43342477083206177,
- "mean_token_accuracy": 0.8550825119018555,
- "num_tokens": 11769678.0,
- "step": 1316
- },
- {
- "epoch": 1.000759878419453,
- "grad_norm": 2.870274543762207,
- "learning_rate": 3.981298603953385e-06,
- "loss": 0.3723528981208801,
- "mean_token_accuracy": 0.8745899796485901,
- "num_tokens": 11773290.0,
- "step": 1317
- },
- {
- "epoch": 1.0015197568389058,
- "grad_norm": 1.3442503213882446,
- "learning_rate": 3.979610931478574e-06,
- "loss": 0.34688329696655273,
- "mean_token_accuracy": 0.8749074935913086,
- "num_tokens": 11786400.0,
- "step": 1318
- },
- {
- "epoch": 1.0022796352583587,
- "grad_norm": 1.7272238731384277,
- "learning_rate": 3.977922220555855e-06,
- "loss": 0.28274932503700256,
- "mean_token_accuracy": 0.896713137626648,
- "num_tokens": 11793059.0,
- "step": 1319
- },
- {
- "epoch": 1.0030395136778116,
- "grad_norm": 1.7362451553344727,
- "learning_rate": 3.976232472370431e-06,
- "loss": 0.5494794845581055,
- "mean_token_accuracy": 0.8341718912124634,
- "num_tokens": 11802593.0,
- "step": 1320
- },
- {
- "epoch": 1.0037993920972645,
- "grad_norm": 1.3316494226455688,
- "learning_rate": 3.97454168810823e-06,
- "loss": 0.41505366563796997,
- "mean_token_accuracy": 0.8581969738006592,
- "num_tokens": 11813925.0,
- "step": 1321
- },
- {
- "epoch": 1.0045592705167172,
- "grad_norm": 1.6152615547180176,
- "learning_rate": 3.972849868955913e-06,
- "loss": 0.44761013984680176,
- "mean_token_accuracy": 0.8413045406341553,
- "num_tokens": 11825709.0,
- "step": 1322
- },
- {
- "epoch": 1.0053191489361701,
- "grad_norm": 2.1172471046447754,
- "learning_rate": 3.97115701610086e-06,
- "loss": 0.3903353810310364,
- "mean_token_accuracy": 0.8662760257720947,
- "num_tokens": 11832070.0,
- "step": 1323
- },
- {
- "epoch": 1.006079027355623,
- "grad_norm": 1.5923868417739868,
- "learning_rate": 3.969463130731183e-06,
- "loss": 0.4491051137447357,
- "mean_token_accuracy": 0.8677828311920166,
- "num_tokens": 11843154.0,
- "step": 1324
- },
- {
- "epoch": 1.006838905775076,
- "grad_norm": 1.6848995685577393,
- "learning_rate": 3.967768214035716e-06,
- "loss": 0.45765817165374756,
- "mean_token_accuracy": 0.8401060104370117,
- "num_tokens": 11854826.0,
- "step": 1325
- },
- {
- "epoch": 1.0075987841945289,
- "grad_norm": 2.3739020824432373,
- "learning_rate": 3.966072267204014e-06,
- "loss": 0.4482722580432892,
- "mean_token_accuracy": 0.8368916511535645,
- "num_tokens": 11860559.0,
- "step": 1326
- },
- {
- "epoch": 1.0083586626139818,
- "grad_norm": 1.5403034687042236,
- "learning_rate": 3.964375291426361e-06,
- "loss": 0.35589972138404846,
- "mean_token_accuracy": 0.8728118538856506,
- "num_tokens": 11871959.0,
- "step": 1327
- },
- {
- "epoch": 1.0091185410334347,
- "grad_norm": 1.6750119924545288,
- "learning_rate": 3.962677287893758e-06,
- "loss": 0.35873427987098694,
- "mean_token_accuracy": 0.9027186632156372,
- "num_tokens": 11881818.0,
- "step": 1328
- },
- {
- "epoch": 1.0098784194528876,
- "grad_norm": 1.5489170551300049,
- "learning_rate": 3.9609782577979305e-06,
- "loss": 0.3634672462940216,
- "mean_token_accuracy": 0.8582607507705688,
- "num_tokens": 11891084.0,
- "step": 1329
- },
- {
- "epoch": 1.0106382978723405,
- "grad_norm": 2.43859601020813,
- "learning_rate": 3.959278202331323e-06,
- "loss": 0.3640799820423126,
- "mean_token_accuracy": 0.88062584400177,
- "num_tokens": 11896032.0,
- "step": 1330
- },
- {
- "epoch": 1.0113981762917934,
- "grad_norm": 3.612184524536133,
- "learning_rate": 3.9575771226870986e-06,
- "loss": 0.3733130097389221,
- "mean_token_accuracy": 0.8946067094802856,
- "num_tokens": 11899479.0,
- "step": 1331
- },
- {
- "epoch": 1.012158054711246,
- "grad_norm": 1.541355848312378,
- "learning_rate": 3.955875020059141e-06,
- "loss": 0.320593923330307,
- "mean_token_accuracy": 0.9057406783103943,
- "num_tokens": 11910179.0,
- "step": 1332
- },
- {
- "epoch": 1.012917933130699,
- "grad_norm": 2.0565030574798584,
- "learning_rate": 3.954171895642052e-06,
- "loss": 0.3341682553291321,
- "mean_token_accuracy": 0.8829344511032104,
- "num_tokens": 11916489.0,
- "step": 1333
- },
- {
- "epoch": 1.013677811550152,
- "grad_norm": 2.9732539653778076,
- "learning_rate": 3.9524677506311505e-06,
- "loss": 0.38488566875457764,
- "mean_token_accuracy": 0.8752974271774292,
- "num_tokens": 11920682.0,
- "step": 1334
- },
- {
- "epoch": 1.0144376899696048,
- "grad_norm": 2.7697458267211914,
- "learning_rate": 3.950762586222469e-06,
- "loss": 0.39864760637283325,
- "mean_token_accuracy": 0.8593167662620544,
- "num_tokens": 11925233.0,
- "step": 1335
- },
- {
- "epoch": 1.0151975683890577,
- "grad_norm": 2.2302119731903076,
- "learning_rate": 3.949056403612758e-06,
- "loss": 0.3985682725906372,
- "mean_token_accuracy": 0.8677899837493896,
- "num_tokens": 11932000.0,
- "step": 1336
- },
- {
- "epoch": 1.0159574468085106,
- "grad_norm": 2.360572576522827,
- "learning_rate": 3.947349203999485e-06,
- "loss": 0.36940714716911316,
- "mean_token_accuracy": 0.8760676383972168,
- "num_tokens": 11937569.0,
- "step": 1337
- },
- {
- "epoch": 1.0167173252279635,
- "grad_norm": 1.3383921384811401,
- "learning_rate": 3.945640988580824e-06,
- "loss": 0.40628793835639954,
- "mean_token_accuracy": 0.866442084312439,
- "num_tokens": 11955679.0,
- "step": 1338
- },
- {
- "epoch": 1.0174772036474165,
- "grad_norm": 2.1502623558044434,
- "learning_rate": 3.943931758555669e-06,
- "loss": 0.4493565559387207,
- "mean_token_accuracy": 0.8307522535324097,
- "num_tokens": 11962734.0,
- "step": 1339
- },
- {
- "epoch": 1.0182370820668694,
- "grad_norm": 2.4737331867218018,
- "learning_rate": 3.942221515123624e-06,
- "loss": 0.28508758544921875,
- "mean_token_accuracy": 0.8967142105102539,
- "num_tokens": 11967783.0,
- "step": 1340
- },
- {
- "epoch": 1.0189969604863223,
- "grad_norm": 2.4525370597839355,
- "learning_rate": 3.940510259485002e-06,
- "loss": 0.40227818489074707,
- "mean_token_accuracy": 0.8618967533111572,
- "num_tokens": 11972918.0,
- "step": 1341
- },
- {
- "epoch": 1.0197568389057752,
- "grad_norm": 1.7299731969833374,
- "learning_rate": 3.938797992840828e-06,
- "loss": 0.26339593529701233,
- "mean_token_accuracy": 0.9004406929016113,
- "num_tokens": 11981250.0,
- "step": 1342
- },
- {
- "epoch": 1.0205167173252279,
- "grad_norm": 2.8756747245788574,
- "learning_rate": 3.937084716392839e-06,
- "loss": 0.47792482376098633,
- "mean_token_accuracy": 0.8440839052200317,
- "num_tokens": 11986356.0,
- "step": 1343
- },
- {
- "epoch": 1.0212765957446808,
- "grad_norm": 2.104473114013672,
- "learning_rate": 3.935370431343475e-06,
- "loss": 0.36723971366882324,
- "mean_token_accuracy": 0.8831232786178589,
- "num_tokens": 11994495.0,
- "step": 1344
- },
- {
- "epoch": 1.0220364741641337,
- "grad_norm": 1.9173074960708618,
- "learning_rate": 3.933655138895889e-06,
- "loss": 0.409319669008255,
- "mean_token_accuracy": 0.8632645606994629,
- "num_tokens": 12002060.0,
- "step": 1345
- },
- {
- "epoch": 1.0227963525835866,
- "grad_norm": 2.958311080932617,
- "learning_rate": 3.9319388402539395e-06,
- "loss": 0.5390093922615051,
- "mean_token_accuracy": 0.8204828500747681,
- "num_tokens": 12007588.0,
- "step": 1346
- },
- {
- "epoch": 1.0235562310030395,
- "grad_norm": 1.6470831632614136,
- "learning_rate": 3.930221536622192e-06,
- "loss": 0.4524633288383484,
- "mean_token_accuracy": 0.8516575694084167,
- "num_tokens": 12018831.0,
- "step": 1347
- },
- {
- "epoch": 1.0243161094224924,
- "grad_norm": 1.3160780668258667,
- "learning_rate": 3.928503229205913e-06,
- "loss": 0.4180558919906616,
- "mean_token_accuracy": 0.8495022058486938,
- "num_tokens": 12033947.0,
- "step": 1348
- },
- {
- "epoch": 1.0250759878419453,
- "grad_norm": 1.9686089754104614,
- "learning_rate": 3.92678391921108e-06,
- "loss": 0.41927334666252136,
- "mean_token_accuracy": 0.8462997674942017,
- "num_tokens": 12042005.0,
- "step": 1349
- },
- {
- "epoch": 1.0258358662613982,
- "grad_norm": 2.351778507232666,
- "learning_rate": 3.92506360784437e-06,
- "loss": 0.2946245074272156,
- "mean_token_accuracy": 0.9170923233032227,
- "num_tokens": 12046579.0,
- "step": 1350
- },
- {
- "epoch": 1.0265957446808511,
- "grad_norm": 2.0636913776397705,
- "learning_rate": 3.923342296313162e-06,
- "loss": 0.3422774076461792,
- "mean_token_accuracy": 0.8809213638305664,
- "num_tokens": 12053214.0,
- "step": 1351
- },
- {
- "epoch": 1.027355623100304,
- "grad_norm": 1.7272592782974243,
- "learning_rate": 3.92161998582554e-06,
- "loss": 0.5864541530609131,
- "mean_token_accuracy": 0.7986117601394653,
- "num_tokens": 12068522.0,
- "step": 1352
- },
- {
- "epoch": 1.028115501519757,
- "grad_norm": 0.8980231881141663,
- "learning_rate": 3.919896677590289e-06,
- "loss": 0.2964550256729126,
- "mean_token_accuracy": 0.8911845088005066,
- "num_tokens": 12093834.0,
- "step": 1353
- },
- {
- "epoch": 1.0288753799392096,
- "grad_norm": 1.6031712293624878,
- "learning_rate": 3.918172372816892e-06,
- "loss": 0.37254488468170166,
- "mean_token_accuracy": 0.8615843057632446,
- "num_tokens": 12104393.0,
- "step": 1354
- },
- {
- "epoch": 1.0296352583586625,
- "grad_norm": 1.282134771347046,
- "learning_rate": 3.916447072715531e-06,
- "loss": 0.3522927761077881,
- "mean_token_accuracy": 0.8713657259941101,
- "num_tokens": 12118671.0,
- "step": 1355
- },
- {
- "epoch": 1.0303951367781155,
- "grad_norm": 2.1986680030822754,
- "learning_rate": 3.914720778497091e-06,
- "loss": 0.3716316223144531,
- "mean_token_accuracy": 0.8661249279975891,
- "num_tokens": 12125178.0,
- "step": 1356
- },
- {
- "epoch": 1.0311550151975684,
- "grad_norm": 1.5937882661819458,
- "learning_rate": 3.91299349137315e-06,
- "loss": 0.48067355155944824,
- "mean_token_accuracy": 0.8284252882003784,
- "num_tokens": 12136785.0,
- "step": 1357
- },
- {
- "epoch": 1.0319148936170213,
- "grad_norm": 1.6743099689483643,
- "learning_rate": 3.9112652125559845e-06,
- "loss": 0.4461551308631897,
- "mean_token_accuracy": 0.8381845355033875,
- "num_tokens": 12150066.0,
- "step": 1358
- },
- {
- "epoch": 1.0326747720364742,
- "grad_norm": 2.2346715927124023,
- "learning_rate": 3.909535943258567e-06,
- "loss": 0.3148220181465149,
- "mean_token_accuracy": 0.8797591924667358,
- "num_tokens": 12155506.0,
- "step": 1359
- },
- {
- "epoch": 1.033434650455927,
- "grad_norm": 1.9608992338180542,
- "learning_rate": 3.907805684694567e-06,
- "loss": 0.32598960399627686,
- "mean_token_accuracy": 0.8819410800933838,
- "num_tokens": 12163261.0,
- "step": 1360
- },
- {
- "epoch": 1.03419452887538,
- "grad_norm": 2.413477897644043,
- "learning_rate": 3.906074438078343e-06,
- "loss": 0.38179588317871094,
- "mean_token_accuracy": 0.8739585876464844,
- "num_tokens": 12169254.0,
- "step": 1361
- },
- {
- "epoch": 1.034954407294833,
- "grad_norm": 2.0258278846740723,
- "learning_rate": 3.904342204624955e-06,
- "loss": 0.33240315318107605,
- "mean_token_accuracy": 0.8808181285858154,
- "num_tokens": 12175379.0,
- "step": 1362
- },
- {
- "epoch": 1.0357142857142858,
- "grad_norm": 2.4111437797546387,
- "learning_rate": 3.9026089855501475e-06,
- "loss": 0.412802517414093,
- "mean_token_accuracy": 0.8504396677017212,
- "num_tokens": 12182007.0,
- "step": 1363
- },
- {
- "epoch": 1.0364741641337385,
- "grad_norm": 2.0424840450286865,
- "learning_rate": 3.900874782070362e-06,
- "loss": 0.2914797067642212,
- "mean_token_accuracy": 0.8731886148452759,
- "num_tokens": 12187743.0,
- "step": 1364
- },
- {
- "epoch": 1.0372340425531914,
- "grad_norm": 2.9248716831207275,
- "learning_rate": 3.899139595402729e-06,
- "loss": 0.34071338176727295,
- "mean_token_accuracy": 0.8736443519592285,
- "num_tokens": 12191830.0,
- "step": 1365
- },
- {
- "epoch": 1.0379939209726443,
- "grad_norm": 2.240220785140991,
- "learning_rate": 3.8974034267650695e-06,
- "loss": 0.23049014806747437,
- "mean_token_accuracy": 0.9000070691108704,
- "num_tokens": 12196460.0,
- "step": 1366
- },
- {
- "epoch": 1.0387537993920972,
- "grad_norm": 1.5038460493087769,
- "learning_rate": 3.895666277375892e-06,
- "loss": 0.32255327701568604,
- "mean_token_accuracy": 0.873004674911499,
- "num_tokens": 12206230.0,
- "step": 1367
- },
- {
- "epoch": 1.0395136778115501,
- "grad_norm": 1.2339142560958862,
- "learning_rate": 3.893928148454398e-06,
- "loss": 0.4069131314754486,
- "mean_token_accuracy": 0.8461740016937256,
- "num_tokens": 12226502.0,
- "step": 1368
- },
- {
- "epoch": 1.040273556231003,
- "grad_norm": 2.531553268432617,
- "learning_rate": 3.89218904122047e-06,
- "loss": 0.43681037425994873,
- "mean_token_accuracy": 0.8497104048728943,
- "num_tokens": 12232241.0,
- "step": 1369
- },
- {
- "epoch": 1.041033434650456,
- "grad_norm": 3.8404815196990967,
- "learning_rate": 3.890448956894682e-06,
- "loss": 0.3241814970970154,
- "mean_token_accuracy": 0.884732723236084,
- "num_tokens": 12235126.0,
- "step": 1370
- },
- {
- "epoch": 1.0417933130699089,
- "grad_norm": 2.9608030319213867,
- "learning_rate": 3.888707896698293e-06,
- "loss": 0.4641021490097046,
- "mean_token_accuracy": 0.8496800661087036,
- "num_tokens": 12240630.0,
- "step": 1371
- },
- {
- "epoch": 1.0425531914893618,
- "grad_norm": 2.1166417598724365,
- "learning_rate": 3.886965861853243e-06,
- "loss": 0.42038479447364807,
- "mean_token_accuracy": 0.8512747287750244,
- "num_tokens": 12247969.0,
- "step": 1372
- },
- {
- "epoch": 1.0433130699088147,
- "grad_norm": 2.5918161869049072,
- "learning_rate": 3.885222853582163e-06,
- "loss": 0.2871917188167572,
- "mean_token_accuracy": 0.9129709601402283,
- "num_tokens": 12252161.0,
- "step": 1373
- },
- {
- "epoch": 1.0440729483282676,
- "grad_norm": 2.4261348247528076,
- "learning_rate": 3.88347887310836e-06,
- "loss": 0.4003123342990875,
- "mean_token_accuracy": 0.8570356369018555,
- "num_tokens": 12258135.0,
- "step": 1374
- },
- {
- "epoch": 1.0448328267477203,
- "grad_norm": 1.3439548015594482,
- "learning_rate": 3.881733921655829e-06,
- "loss": 0.3278140425682068,
- "mean_token_accuracy": 0.8831373453140259,
- "num_tokens": 12272849.0,
- "step": 1375
- },
- {
- "epoch": 1.0455927051671732,
- "grad_norm": 1.527989387512207,
- "learning_rate": 3.879988000449243e-06,
- "loss": 0.33789363503456116,
- "mean_token_accuracy": 0.8825669884681702,
- "num_tokens": 12283281.0,
- "step": 1376
- },
- {
- "epoch": 1.046352583586626,
- "grad_norm": 1.6755503416061401,
- "learning_rate": 3.878241110713957e-06,
- "loss": 0.4816160798072815,
- "mean_token_accuracy": 0.8193758726119995,
- "num_tokens": 12295422.0,
- "step": 1377
- },
- {
- "epoch": 1.047112462006079,
- "grad_norm": 2.8110361099243164,
- "learning_rate": 3.876493253676004e-06,
- "loss": 0.38662949204444885,
- "mean_token_accuracy": 0.8611986637115479,
- "num_tokens": 12299806.0,
- "step": 1378
- },
- {
- "epoch": 1.047872340425532,
- "grad_norm": 1.86097252368927,
- "learning_rate": 3.8747444305621e-06,
- "loss": 0.27612629532814026,
- "mean_token_accuracy": 0.8984048366546631,
- "num_tokens": 12306599.0,
- "step": 1379
- },
- {
- "epoch": 1.0486322188449848,
- "grad_norm": 2.361828565597534,
- "learning_rate": 3.872994642599635e-06,
- "loss": 0.469953715801239,
- "mean_token_accuracy": 0.8464452028274536,
- "num_tokens": 12314249.0,
- "step": 1380
- },
- {
- "epoch": 1.0493920972644377,
- "grad_norm": 1.9524794816970825,
- "learning_rate": 3.871243891016676e-06,
- "loss": 0.5419625043869019,
- "mean_token_accuracy": 0.8468329906463623,
- "num_tokens": 12324987.0,
- "step": 1381
- },
- {
- "epoch": 1.0501519756838906,
- "grad_norm": 1.6931511163711548,
- "learning_rate": 3.869492177041971e-06,
- "loss": 0.3791416883468628,
- "mean_token_accuracy": 0.8692882061004639,
- "num_tokens": 12336864.0,
- "step": 1382
- },
- {
- "epoch": 1.0509118541033435,
- "grad_norm": 1.909692406654358,
- "learning_rate": 3.867739501904938e-06,
- "loss": 0.27974557876586914,
- "mean_token_accuracy": 0.9004636406898499,
- "num_tokens": 12343093.0,
- "step": 1383
- },
- {
- "epoch": 1.0516717325227964,
- "grad_norm": 1.415162205696106,
- "learning_rate": 3.8659858668356735e-06,
- "loss": 0.38928335905075073,
- "mean_token_accuracy": 0.8491984009742737,
- "num_tokens": 12356613.0,
- "step": 1384
- },
- {
- "epoch": 1.0524316109422491,
- "grad_norm": 1.8195741176605225,
- "learning_rate": 3.864231273064944e-06,
- "loss": 0.3798758089542389,
- "mean_token_accuracy": 0.8728072047233582,
- "num_tokens": 12364860.0,
- "step": 1385
- },
- {
- "epoch": 1.053191489361702,
- "grad_norm": 1.8481454849243164,
- "learning_rate": 3.862475721824193e-06,
- "loss": 0.269635945558548,
- "mean_token_accuracy": 0.899247407913208,
- "num_tokens": 12371841.0,
- "step": 1386
- },
- {
- "epoch": 1.053951367781155,
- "grad_norm": 1.7838784456253052,
- "learning_rate": 3.8607192143455325e-06,
- "loss": 0.36971768736839294,
- "mean_token_accuracy": 0.8833638429641724,
- "num_tokens": 12380685.0,
- "step": 1387
- },
- {
- "epoch": 1.0547112462006079,
- "grad_norm": 1.333358645439148,
- "learning_rate": 3.858961751861748e-06,
- "loss": 0.4039418399333954,
- "mean_token_accuracy": 0.8541078567504883,
- "num_tokens": 12394072.0,
- "step": 1388
- },
- {
- "epoch": 1.0554711246200608,
- "grad_norm": 2.1600265502929688,
- "learning_rate": 3.857203335606294e-06,
- "loss": 0.38211894035339355,
- "mean_token_accuracy": 0.8549972772598267,
- "num_tokens": 12400449.0,
- "step": 1389
- },
- {
- "epoch": 1.0562310030395137,
- "grad_norm": 2.914902687072754,
- "learning_rate": 3.855443966813295e-06,
- "loss": 0.2237374186515808,
- "mean_token_accuracy": 0.9253600835800171,
- "num_tokens": 12403758.0,
- "step": 1390
- },
- {
- "epoch": 1.0569908814589666,
- "grad_norm": 2.2361080646514893,
- "learning_rate": 3.853683646717543e-06,
- "loss": 0.3359566926956177,
- "mean_token_accuracy": 0.898173451423645,
- "num_tokens": 12410374.0,
- "step": 1391
- },
- {
- "epoch": 1.0577507598784195,
- "grad_norm": 2.3639304637908936,
- "learning_rate": 3.8519223765544985e-06,
- "loss": 0.3844943046569824,
- "mean_token_accuracy": 0.863599419593811,
- "num_tokens": 12416016.0,
- "step": 1392
- },
- {
- "epoch": 1.0585106382978724,
- "grad_norm": 2.202971935272217,
- "learning_rate": 3.85016015756029e-06,
- "loss": 0.3546281158924103,
- "mean_token_accuracy": 0.8907540440559387,
- "num_tokens": 12422026.0,
- "step": 1393
- },
- {
- "epoch": 1.0592705167173253,
- "grad_norm": 1.1279661655426025,
- "learning_rate": 3.848396990971709e-06,
- "loss": 0.31522464752197266,
- "mean_token_accuracy": 0.8662257194519043,
- "num_tokens": 12439964.0,
- "step": 1394
- },
- {
- "epoch": 1.0600303951367782,
- "grad_norm": 2.4731740951538086,
- "learning_rate": 3.846632878026214e-06,
- "loss": 0.456442266702652,
- "mean_token_accuracy": 0.8516958951950073,
- "num_tokens": 12446231.0,
- "step": 1395
- },
- {
- "epoch": 1.060790273556231,
- "grad_norm": 1.7631878852844238,
- "learning_rate": 3.844867819961928e-06,
- "loss": 0.487227201461792,
- "mean_token_accuracy": 0.8466947078704834,
- "num_tokens": 12459989.0,
- "step": 1396
- },
- {
- "epoch": 1.0615501519756838,
- "grad_norm": 2.4468278884887695,
- "learning_rate": 3.843101818017637e-06,
- "loss": 0.3367291986942291,
- "mean_token_accuracy": 0.8734689950942993,
- "num_tokens": 12465741.0,
- "step": 1397
- },
- {
- "epoch": 1.0623100303951367,
- "grad_norm": 1.9045145511627197,
- "learning_rate": 3.841334873432789e-06,
- "loss": 0.4652615487575531,
- "mean_token_accuracy": 0.8333107233047485,
- "num_tokens": 12474963.0,
- "step": 1398
- },
- {
- "epoch": 1.0630699088145896,
- "grad_norm": 1.6816917657852173,
- "learning_rate": 3.839566987447492e-06,
- "loss": 0.4144279956817627,
- "mean_token_accuracy": 0.8472539186477661,
- "num_tokens": 12485521.0,
- "step": 1399
- },
- {
- "epoch": 1.0638297872340425,
- "grad_norm": 1.8990092277526855,
- "learning_rate": 3.837798161302518e-06,
- "loss": 0.4040985405445099,
- "mean_token_accuracy": 0.8514704704284668,
- "num_tokens": 12493495.0,
- "step": 1400
- },
- {
- "epoch": 1.0645896656534954,
- "grad_norm": 2.27785325050354,
- "learning_rate": 3.836028396239297e-06,
- "loss": 0.43425723910331726,
- "mean_token_accuracy": 0.8795069456100464,
- "num_tokens": 12499789.0,
- "step": 1401
- },
- {
- "epoch": 1.0653495440729484,
- "grad_norm": 2.5130882263183594,
- "learning_rate": 3.8342576934999184e-06,
- "loss": 0.33892524242401123,
- "mean_token_accuracy": 0.8717449903488159,
- "num_tokens": 12504885.0,
- "step": 1402
- },
- {
- "epoch": 1.0661094224924013,
- "grad_norm": 2.650040864944458,
- "learning_rate": 3.832486054327131e-06,
- "loss": 0.4200317859649658,
- "mean_token_accuracy": 0.8616159558296204,
- "num_tokens": 12509783.0,
- "step": 1403
- },
- {
- "epoch": 1.0668693009118542,
- "grad_norm": 2.9176881313323975,
- "learning_rate": 3.830713479964335e-06,
- "loss": 0.37018489837646484,
- "mean_token_accuracy": 0.8676021695137024,
- "num_tokens": 12514441.0,
- "step": 1404
- },
- {
- "epoch": 1.067629179331307,
- "grad_norm": 1.6430318355560303,
- "learning_rate": 3.828939971655595e-06,
- "loss": 0.27539193630218506,
- "mean_token_accuracy": 0.9077831506729126,
- "num_tokens": 12523677.0,
- "step": 1405
- },
- {
- "epoch": 1.06838905775076,
- "grad_norm": 1.3683708906173706,
- "learning_rate": 3.827165530645627e-06,
- "loss": 0.4085099697113037,
- "mean_token_accuracy": 0.8579255938529968,
- "num_tokens": 12540104.0,
- "step": 1406
- },
- {
- "epoch": 1.0691489361702127,
- "grad_norm": 2.528465747833252,
- "learning_rate": 3.825390158179802e-06,
- "loss": 0.42462456226348877,
- "mean_token_accuracy": 0.852813720703125,
- "num_tokens": 12548239.0,
- "step": 1407
- },
- {
- "epoch": 1.0699088145896656,
- "grad_norm": 1.8288795948028564,
- "learning_rate": 3.823613855504144e-06,
- "loss": 0.412417471408844,
- "mean_token_accuracy": 0.8622130751609802,
- "num_tokens": 12557316.0,
- "step": 1408
- },
- {
- "epoch": 1.0706686930091185,
- "grad_norm": 2.341794490814209,
- "learning_rate": 3.82183662386533e-06,
- "loss": 0.2996668815612793,
- "mean_token_accuracy": 0.8964041471481323,
- "num_tokens": 12562377.0,
- "step": 1409
- },
- {
- "epoch": 1.0714285714285714,
- "grad_norm": 2.555877208709717,
- "learning_rate": 3.82005846451069e-06,
- "loss": 0.4184221625328064,
- "mean_token_accuracy": 0.8678828477859497,
- "num_tokens": 12568516.0,
- "step": 1410
- },
- {
- "epoch": 1.0721884498480243,
- "grad_norm": 2.081308126449585,
- "learning_rate": 3.8182793786882065e-06,
- "loss": 0.4376835823059082,
- "mean_token_accuracy": 0.8409077525138855,
- "num_tokens": 12576598.0,
- "step": 1411
- },
- {
- "epoch": 1.0729483282674772,
- "grad_norm": 2.0272316932678223,
- "learning_rate": 3.816499367646508e-06,
- "loss": 0.3630060851573944,
- "mean_token_accuracy": 0.8762413263320923,
- "num_tokens": 12584587.0,
- "step": 1412
- },
- {
- "epoch": 1.0737082066869301,
- "grad_norm": 2.6382484436035156,
- "learning_rate": 3.814718432634877e-06,
- "loss": 0.4244990348815918,
- "mean_token_accuracy": 0.8509312272071838,
- "num_tokens": 12590028.0,
- "step": 1413
- },
- {
- "epoch": 1.074468085106383,
- "grad_norm": 2.429800271987915,
- "learning_rate": 3.8129365749032398e-06,
- "loss": 0.36990004777908325,
- "mean_token_accuracy": 0.8749774098396301,
- "num_tokens": 12594984.0,
- "step": 1414
- },
- {
- "epoch": 1.075227963525836,
- "grad_norm": 3.5939090251922607,
- "learning_rate": 3.8111537957021736e-06,
- "loss": 0.4245661199092865,
- "mean_token_accuracy": 0.8481623530387878,
- "num_tokens": 12598494.0,
- "step": 1415
- },
- {
- "epoch": 1.0759878419452888,
- "grad_norm": 2.705955982208252,
- "learning_rate": 3.809370096282903e-06,
- "loss": 0.41851678490638733,
- "mean_token_accuracy": 0.8548051714897156,
- "num_tokens": 12603876.0,
- "step": 1416
- },
- {
- "epoch": 1.0767477203647418,
- "grad_norm": 1.7812079191207886,
- "learning_rate": 3.807585477897296e-06,
- "loss": 0.47113919258117676,
- "mean_token_accuracy": 0.8346904516220093,
- "num_tokens": 12613402.0,
- "step": 1417
- },
- {
- "epoch": 1.0775075987841944,
- "grad_norm": 1.4335212707519531,
- "learning_rate": 3.8057999417978654e-06,
- "loss": 0.3802063465118408,
- "mean_token_accuracy": 0.8563423156738281,
- "num_tokens": 12626865.0,
- "step": 1418
- },
- {
- "epoch": 1.0782674772036474,
- "grad_norm": 1.9171305894851685,
- "learning_rate": 3.8040134892377702e-06,
- "loss": 0.20898357033729553,
- "mean_token_accuracy": 0.9189738035202026,
- "num_tokens": 12632593.0,
- "step": 1419
- },
- {
- "epoch": 1.0790273556231003,
- "grad_norm": 1.4996821880340576,
- "learning_rate": 3.802226121470811e-06,
- "loss": 0.4203261137008667,
- "mean_token_accuracy": 0.8479211330413818,
- "num_tokens": 12646395.0,
- "step": 1420
- },
- {
- "epoch": 1.0797872340425532,
- "grad_norm": 2.2007253170013428,
- "learning_rate": 3.800437839751432e-06,
- "loss": 0.40370577573776245,
- "mean_token_accuracy": 0.8427679538726807,
- "num_tokens": 12653508.0,
- "step": 1421
- },
- {
- "epoch": 1.080547112462006,
- "grad_norm": 1.7266581058502197,
- "learning_rate": 3.7986486453347183e-06,
- "loss": 0.46750491857528687,
- "mean_token_accuracy": 0.8429205417633057,
- "num_tokens": 12666329.0,
- "step": 1422
- },
- {
- "epoch": 1.081306990881459,
- "grad_norm": 1.4716318845748901,
- "learning_rate": 3.796858539476394e-06,
- "loss": 0.3330317735671997,
- "mean_token_accuracy": 0.879012942314148,
- "num_tokens": 12676741.0,
- "step": 1423
- },
- {
- "epoch": 1.082066869300912,
- "grad_norm": 2.652127265930176,
- "learning_rate": 3.795067523432826e-06,
- "loss": 0.35365715622901917,
- "mean_token_accuracy": 0.8796792030334473,
- "num_tokens": 12681479.0,
- "step": 1424
- },
- {
- "epoch": 1.0828267477203648,
- "grad_norm": 1.2937829494476318,
- "learning_rate": 3.793275598461017e-06,
- "loss": 0.25272446870803833,
- "mean_token_accuracy": 0.9231734275817871,
- "num_tokens": 12694238.0,
- "step": 1425
- },
- {
- "epoch": 1.0835866261398177,
- "grad_norm": 1.3831220865249634,
- "learning_rate": 3.7914827658186104e-06,
- "loss": 0.4935331344604492,
- "mean_token_accuracy": 0.8417420387268066,
- "num_tokens": 12712857.0,
- "step": 1426
- },
- {
- "epoch": 1.0843465045592706,
- "grad_norm": 3.059525728225708,
- "learning_rate": 3.7896890267638832e-06,
- "loss": 0.2592190206050873,
- "mean_token_accuracy": 0.9040263295173645,
- "num_tokens": 12716766.0,
- "step": 1427
- },
- {
- "epoch": 1.0851063829787233,
- "grad_norm": 2.8399202823638916,
- "learning_rate": 3.787894382555752e-06,
- "loss": 0.32098138332366943,
- "mean_token_accuracy": 0.8838302493095398,
- "num_tokens": 12720774.0,
- "step": 1428
- },
- {
- "epoch": 1.0858662613981762,
- "grad_norm": 2.618479013442993,
- "learning_rate": 3.7860988344537664e-06,
- "loss": 0.425255686044693,
- "mean_token_accuracy": 0.8564130067825317,
- "num_tokens": 12726506.0,
- "step": 1429
- },
- {
- "epoch": 1.0866261398176291,
- "grad_norm": 1.3108669519424438,
- "learning_rate": 3.7843023837181126e-06,
- "loss": 0.40220165252685547,
- "mean_token_accuracy": 0.8588873147964478,
- "num_tokens": 12742814.0,
- "step": 1430
- },
- {
- "epoch": 1.087386018237082,
- "grad_norm": 2.2083566188812256,
- "learning_rate": 3.782505031609607e-06,
- "loss": 0.318379282951355,
- "mean_token_accuracy": 0.8887606859207153,
- "num_tokens": 12748388.0,
- "step": 1431
- },
- {
- "epoch": 1.088145896656535,
- "grad_norm": 1.922358751296997,
- "learning_rate": 3.7807067793897006e-06,
- "loss": 0.2519589364528656,
- "mean_token_accuracy": 0.8936764001846313,
- "num_tokens": 12754761.0,
- "step": 1432
- },
- {
- "epoch": 1.0889057750759878,
- "grad_norm": 1.7367439270019531,
- "learning_rate": 3.778907628320477e-06,
- "loss": 0.3970367908477783,
- "mean_token_accuracy": 0.858735203742981,
- "num_tokens": 12764016.0,
- "step": 1433
- },
- {
- "epoch": 1.0896656534954408,
- "grad_norm": 2.1931066513061523,
- "learning_rate": 3.77710757966465e-06,
- "loss": 0.5250554084777832,
- "mean_token_accuracy": 0.8356746435165405,
- "num_tokens": 12772272.0,
- "step": 1434
- },
- {
- "epoch": 1.0904255319148937,
- "grad_norm": 1.718337893486023,
- "learning_rate": 3.775306634685562e-06,
- "loss": 0.283231645822525,
- "mean_token_accuracy": 0.9009919166564941,
- "num_tokens": 12780706.0,
- "step": 1435
- },
- {
- "epoch": 1.0911854103343466,
- "grad_norm": 2.1985926628112793,
- "learning_rate": 3.773504794647187e-06,
- "loss": 0.3913170397281647,
- "mean_token_accuracy": 0.8909255266189575,
- "num_tokens": 12787052.0,
- "step": 1436
- },
- {
- "epoch": 1.0919452887537995,
- "grad_norm": 2.8687937259674072,
- "learning_rate": 3.771702060814123e-06,
- "loss": 0.3135771155357361,
- "mean_token_accuracy": 0.9016125202178955,
- "num_tokens": 12791854.0,
- "step": 1437
- },
- {
- "epoch": 1.0927051671732522,
- "grad_norm": 4.203946590423584,
- "learning_rate": 3.7698984344516e-06,
- "loss": 0.3642737865447998,
- "mean_token_accuracy": 0.8842349052429199,
- "num_tokens": 12794969.0,
- "step": 1438
- },
- {
- "epoch": 1.093465045592705,
- "grad_norm": 1.5134642124176025,
- "learning_rate": 3.7680939168254733e-06,
- "loss": 0.3732057213783264,
- "mean_token_accuracy": 0.8671083450317383,
- "num_tokens": 12808480.0,
- "step": 1439
- },
- {
- "epoch": 1.094224924012158,
- "grad_norm": 3.2103970050811768,
- "learning_rate": 3.7662885092022206e-06,
- "loss": 0.3556194603443146,
- "mean_token_accuracy": 0.8786529302597046,
- "num_tokens": 12812654.0,
- "step": 1440
- },
- {
- "epoch": 1.094984802431611,
- "grad_norm": 2.2774064540863037,
- "learning_rate": 3.7644822128489476e-06,
- "loss": 0.38409674167633057,
- "mean_token_accuracy": 0.866563081741333,
- "num_tokens": 12819854.0,
- "step": 1441
- },
- {
- "epoch": 1.0957446808510638,
- "grad_norm": 1.8250885009765625,
- "learning_rate": 3.7626750290333824e-06,
- "loss": 0.3812350034713745,
- "mean_token_accuracy": 0.8676212430000305,
- "num_tokens": 12830338.0,
- "step": 1442
- },
- {
- "epoch": 1.0965045592705167,
- "grad_norm": 1.8337891101837158,
- "learning_rate": 3.7608669590238765e-06,
- "loss": 0.3892471194267273,
- "mean_token_accuracy": 0.8616238832473755,
- "num_tokens": 12840340.0,
- "step": 1443
- },
- {
- "epoch": 1.0972644376899696,
- "grad_norm": 1.5300254821777344,
- "learning_rate": 3.7590580040894025e-06,
- "loss": 0.35288217663764954,
- "mean_token_accuracy": 0.8625509738922119,
- "num_tokens": 12853144.0,
- "step": 1444
- },
- {
- "epoch": 1.0980243161094225,
- "grad_norm": 2.152683734893799,
- "learning_rate": 3.7572481654995554e-06,
- "loss": 0.4004772901535034,
- "mean_token_accuracy": 0.858427107334137,
- "num_tokens": 12859970.0,
- "step": 1445
- },
- {
- "epoch": 1.0987841945288754,
- "grad_norm": 1.532832145690918,
- "learning_rate": 3.755437444524548e-06,
- "loss": 0.46820127964019775,
- "mean_token_accuracy": 0.8585472106933594,
- "num_tokens": 12875243.0,
- "step": 1446
- },
- {
- "epoch": 1.0995440729483283,
- "grad_norm": 1.6485342979431152,
- "learning_rate": 3.7536258424352164e-06,
- "loss": 0.46329325437545776,
- "mean_token_accuracy": 0.8376060724258423,
- "num_tokens": 12886383.0,
- "step": 1447
- },
- {
- "epoch": 1.1003039513677813,
- "grad_norm": 2.402256488800049,
- "learning_rate": 3.75181336050301e-06,
- "loss": 0.43916207551956177,
- "mean_token_accuracy": 0.8448786735534668,
- "num_tokens": 12892613.0,
- "step": 1448
- },
- {
- "epoch": 1.101063829787234,
- "grad_norm": 1.3893651962280273,
- "learning_rate": 3.7500000000000005e-06,
- "loss": 0.3919021785259247,
- "mean_token_accuracy": 0.8495820760726929,
- "num_tokens": 12905523.0,
- "step": 1449
- },
- {
- "epoch": 1.1018237082066868,
- "grad_norm": 1.5519827604293823,
- "learning_rate": 3.7481857621988734e-06,
- "loss": 0.4710700809955597,
- "mean_token_accuracy": 0.8387632369995117,
- "num_tokens": 12918236.0,
- "step": 1450
- },
- {
- "epoch": 1.1025835866261398,
- "grad_norm": 2.0141353607177734,
- "learning_rate": 3.74637064837293e-06,
- "loss": 0.30866751074790955,
- "mean_token_accuracy": 0.9059321880340576,
- "num_tokens": 12924391.0,
- "step": 1451
- },
- {
- "epoch": 1.1033434650455927,
- "grad_norm": 1.2201496362686157,
- "learning_rate": 3.7445546597960882e-06,
- "loss": 0.3938257396221161,
- "mean_token_accuracy": 0.8726630210876465,
- "num_tokens": 12943338.0,
- "step": 1452
- },
- {
- "epoch": 1.1041033434650456,
- "grad_norm": 2.29434871673584,
- "learning_rate": 3.742737797742878e-06,
- "loss": 0.4347776174545288,
- "mean_token_accuracy": 0.840569257736206,
- "num_tokens": 12950636.0,
- "step": 1453
- },
- {
- "epoch": 1.1048632218844985,
- "grad_norm": 2.3875105381011963,
- "learning_rate": 3.7409200634884425e-06,
- "loss": 0.48353564739227295,
- "mean_token_accuracy": 0.8207056522369385,
- "num_tokens": 12957635.0,
- "step": 1454
- },
- {
- "epoch": 1.1056231003039514,
- "grad_norm": 2.3539648056030273,
- "learning_rate": 3.7391014583085384e-06,
- "loss": 0.3532431721687317,
- "mean_token_accuracy": 0.8903788924217224,
- "num_tokens": 12963032.0,
- "step": 1455
- },
- {
- "epoch": 1.1063829787234043,
- "grad_norm": 1.5611135959625244,
- "learning_rate": 3.737281983479534e-06,
- "loss": 0.4734863042831421,
- "mean_token_accuracy": 0.8413879871368408,
- "num_tokens": 12977170.0,
- "step": 1456
- },
- {
- "epoch": 1.1071428571428572,
- "grad_norm": 1.474320411682129,
- "learning_rate": 3.735461640278404e-06,
- "loss": 0.41854286193847656,
- "mean_token_accuracy": 0.8499876856803894,
- "num_tokens": 12993750.0,
- "step": 1457
- },
- {
- "epoch": 1.1079027355623101,
- "grad_norm": 2.6873273849487305,
- "learning_rate": 3.733640429982738e-06,
- "loss": 0.47637903690338135,
- "mean_token_accuracy": 0.83599853515625,
- "num_tokens": 12999058.0,
- "step": 1458
- },
- {
- "epoch": 1.108662613981763,
- "grad_norm": 1.4575026035308838,
- "learning_rate": 3.731818353870729e-06,
- "loss": 0.38441652059555054,
- "mean_token_accuracy": 0.8582364320755005,
- "num_tokens": 13013864.0,
- "step": 1459
- },
- {
- "epoch": 1.1094224924012157,
- "grad_norm": 1.7722690105438232,
- "learning_rate": 3.729995413221183e-06,
- "loss": 0.4224998950958252,
- "mean_token_accuracy": 0.8511888384819031,
- "num_tokens": 13023714.0,
- "step": 1460
- },
- {
- "epoch": 1.1101823708206686,
- "grad_norm": 2.625760555267334,
- "learning_rate": 3.7281716093135068e-06,
- "loss": 0.3487582802772522,
- "mean_token_accuracy": 0.8834779262542725,
- "num_tokens": 13028608.0,
- "step": 1461
- },
- {
- "epoch": 1.1109422492401215,
- "grad_norm": 1.2554056644439697,
- "learning_rate": 3.726346943427719e-06,
- "loss": 0.33312469720840454,
- "mean_token_accuracy": 0.8704153299331665,
- "num_tokens": 13044901.0,
- "step": 1462
- },
- {
- "epoch": 1.1117021276595744,
- "grad_norm": 2.1109910011291504,
- "learning_rate": 3.7245214168444388e-06,
- "loss": 0.387290894985199,
- "mean_token_accuracy": 0.860816240310669,
- "num_tokens": 13051452.0,
- "step": 1463
- },
- {
- "epoch": 1.1124620060790273,
- "grad_norm": 3.159201145172119,
- "learning_rate": 3.722695030844891e-06,
- "loss": 0.37690871953964233,
- "mean_token_accuracy": 0.8717561960220337,
- "num_tokens": 13055131.0,
- "step": 1464
- },
- {
- "epoch": 1.1132218844984803,
- "grad_norm": 1.3810011148452759,
- "learning_rate": 3.7208677867109042e-06,
- "loss": 0.36598485708236694,
- "mean_token_accuracy": 0.8683375120162964,
- "num_tokens": 13069798.0,
- "step": 1465
- },
- {
- "epoch": 1.1139817629179332,
- "grad_norm": 2.500849485397339,
- "learning_rate": 3.7190396857249087e-06,
- "loss": 0.2781746983528137,
- "mean_token_accuracy": 0.9026005268096924,
- "num_tokens": 13075127.0,
- "step": 1466
- },
- {
- "epoch": 1.114741641337386,
- "grad_norm": 1.7445712089538574,
- "learning_rate": 3.7172107291699356e-06,
- "loss": 0.5055314302444458,
- "mean_token_accuracy": 0.8252174258232117,
- "num_tokens": 13084843.0,
- "step": 1467
- },
- {
- "epoch": 1.115501519756839,
- "grad_norm": 1.6386256217956543,
- "learning_rate": 3.7153809183296174e-06,
- "loss": 0.38478314876556396,
- "mean_token_accuracy": 0.8600847721099854,
- "num_tokens": 13096517.0,
- "step": 1468
- },
- {
- "epoch": 1.1162613981762919,
- "grad_norm": 2.3818395137786865,
- "learning_rate": 3.713550254488185e-06,
- "loss": 0.40308547019958496,
- "mean_token_accuracy": 0.8628184795379639,
- "num_tokens": 13102324.0,
- "step": 1469
- },
- {
- "epoch": 1.1170212765957448,
- "grad_norm": 1.73163640499115,
- "learning_rate": 3.7117187389304703e-06,
- "loss": 0.5035421848297119,
- "mean_token_accuracy": 0.8229597210884094,
- "num_tokens": 13113763.0,
- "step": 1470
- },
- {
- "epoch": 1.1177811550151975,
- "grad_norm": 3.147177219390869,
- "learning_rate": 3.7098863729418997e-06,
- "loss": 0.557449221611023,
- "mean_token_accuracy": 0.8266849517822266,
- "num_tokens": 13118849.0,
- "step": 1471
- },
- {
- "epoch": 1.1185410334346504,
- "grad_norm": 1.5061391592025757,
- "learning_rate": 3.7080531578085e-06,
- "loss": 0.3759554922580719,
- "mean_token_accuracy": 0.8541903495788574,
- "num_tokens": 13131337.0,
- "step": 1472
- },
- {
- "epoch": 1.1193009118541033,
- "grad_norm": 2.172346353530884,
- "learning_rate": 3.7062190948168906e-06,
- "loss": 0.41491609811782837,
- "mean_token_accuracy": 0.8531454801559448,
- "num_tokens": 13139767.0,
- "step": 1473
- },
- {
- "epoch": 1.1200607902735562,
- "grad_norm": 2.1527154445648193,
- "learning_rate": 3.7043841852542884e-06,
- "loss": 0.4309239387512207,
- "mean_token_accuracy": 0.8327745199203491,
- "num_tokens": 13147210.0,
- "step": 1474
- },
- {
- "epoch": 1.1208206686930091,
- "grad_norm": 1.8342832326889038,
- "learning_rate": 3.7025484304085035e-06,
- "loss": 0.34393298625946045,
- "mean_token_accuracy": 0.8948153257369995,
- "num_tokens": 13154831.0,
- "step": 1475
- },
- {
- "epoch": 1.121580547112462,
- "grad_norm": 2.509291172027588,
- "learning_rate": 3.7007118315679384e-06,
- "loss": 0.4479471445083618,
- "mean_token_accuracy": 0.8280234336853027,
- "num_tokens": 13161040.0,
- "step": 1476
- },
- {
- "epoch": 1.122340425531915,
- "grad_norm": 2.914710521697998,
- "learning_rate": 3.6988743900215895e-06,
- "loss": 0.3724832832813263,
- "mean_token_accuracy": 0.863893985748291,
- "num_tokens": 13164975.0,
- "step": 1477
- },
- {
- "epoch": 1.1231003039513678,
- "grad_norm": 3.274808645248413,
- "learning_rate": 3.6970361070590443e-06,
- "loss": 0.4088161885738373,
- "mean_token_accuracy": 0.8474822044372559,
- "num_tokens": 13168826.0,
- "step": 1478
- },
- {
- "epoch": 1.1238601823708207,
- "grad_norm": 2.861546277999878,
- "learning_rate": 3.695196983970481e-06,
- "loss": 0.45837992429733276,
- "mean_token_accuracy": 0.8579759001731873,
- "num_tokens": 13173794.0,
- "step": 1479
- },
- {
- "epoch": 1.1246200607902737,
- "grad_norm": 1.9491597414016724,
- "learning_rate": 3.6933570220466654e-06,
- "loss": 0.4333910346031189,
- "mean_token_accuracy": 0.8444236516952515,
- "num_tokens": 13181598.0,
- "step": 1480
- },
- {
- "epoch": 1.1253799392097266,
- "grad_norm": 1.329848051071167,
- "learning_rate": 3.6915162225789546e-06,
- "loss": 0.36404621601104736,
- "mean_token_accuracy": 0.8694117069244385,
- "num_tokens": 13196381.0,
- "step": 1481
- },
- {
- "epoch": 1.1261398176291793,
- "grad_norm": 1.8854197263717651,
- "learning_rate": 3.6896745868592924e-06,
- "loss": 0.4085756838321686,
- "mean_token_accuracy": 0.855188250541687,
- "num_tokens": 13205236.0,
- "step": 1482
- },
- {
- "epoch": 1.1268996960486322,
- "grad_norm": 3.01684832572937,
- "learning_rate": 3.6878321161802106e-06,
- "loss": 0.28105655312538147,
- "mean_token_accuracy": 0.9009426236152649,
- "num_tokens": 13209380.0,
- "step": 1483
- },
- {
- "epoch": 1.127659574468085,
- "grad_norm": 1.8051308393478394,
- "learning_rate": 3.685988811834823e-06,
- "loss": 0.3314531147480011,
- "mean_token_accuracy": 0.8805814385414124,
- "num_tokens": 13217714.0,
- "step": 1484
- },
- {
- "epoch": 1.128419452887538,
- "grad_norm": 1.61757493019104,
- "learning_rate": 3.684144675116836e-06,
- "loss": 0.4543863534927368,
- "mean_token_accuracy": 0.8400536775588989,
- "num_tokens": 13229330.0,
- "step": 1485
- },
- {
- "epoch": 1.1291793313069909,
- "grad_norm": 1.602686882019043,
- "learning_rate": 3.682299707320532e-06,
- "loss": 0.3653204143047333,
- "mean_token_accuracy": 0.8655825853347778,
- "num_tokens": 13242872.0,
- "step": 1486
- },
- {
- "epoch": 1.1299392097264438,
- "grad_norm": 2.3093113899230957,
- "learning_rate": 3.680453909740782e-06,
- "loss": 0.4383693039417267,
- "mean_token_accuracy": 0.839782178401947,
- "num_tokens": 13248976.0,
- "step": 1487
- },
- {
- "epoch": 1.1306990881458967,
- "grad_norm": 1.180559754371643,
- "learning_rate": 3.6786072836730376e-06,
- "loss": 0.5354755520820618,
- "mean_token_accuracy": 0.8151205778121948,
- "num_tokens": 13272896.0,
- "step": 1488
- },
- {
- "epoch": 1.1314589665653496,
- "grad_norm": 1.9554040431976318,
- "learning_rate": 3.6767598304133325e-06,
- "loss": 0.4485316872596741,
- "mean_token_accuracy": 0.8399936556816101,
- "num_tokens": 13280757.0,
- "step": 1489
- },
- {
- "epoch": 1.1322188449848025,
- "grad_norm": 2.236471176147461,
- "learning_rate": 3.674911551258279e-06,
- "loss": 0.45594364404678345,
- "mean_token_accuracy": 0.8552400469779968,
- "num_tokens": 13287328.0,
- "step": 1490
- },
- {
- "epoch": 1.1329787234042552,
- "grad_norm": 2.5228686332702637,
- "learning_rate": 3.673062447505072e-06,
- "loss": 0.4048641622066498,
- "mean_token_accuracy": 0.8617376685142517,
- "num_tokens": 13292716.0,
- "step": 1491
- },
- {
- "epoch": 1.1337386018237081,
- "grad_norm": 1.1274473667144775,
- "learning_rate": 3.6712125204514836e-06,
- "loss": 0.3848876357078552,
- "mean_token_accuracy": 0.8672975301742554,
- "num_tokens": 13313403.0,
- "step": 1492
- },
- {
- "epoch": 1.134498480243161,
- "grad_norm": 2.349541425704956,
- "learning_rate": 3.6693617713958633e-06,
- "loss": 0.3166058361530304,
- "mean_token_accuracy": 0.8896721601486206,
- "num_tokens": 13318720.0,
- "step": 1493
- },
- {
- "epoch": 1.135258358662614,
- "grad_norm": 2.2438278198242188,
- "learning_rate": 3.6675102016371387e-06,
- "loss": 0.5418218970298767,
- "mean_token_accuracy": 0.8256527185440063,
- "num_tokens": 13325360.0,
- "step": 1494
- },
- {
- "epoch": 1.1360182370820668,
- "grad_norm": 2.21268892288208,
- "learning_rate": 3.665657812474812e-06,
- "loss": 0.48603951930999756,
- "mean_token_accuracy": 0.8273470401763916,
- "num_tokens": 13333217.0,
- "step": 1495
- },
- {
- "epoch": 1.1367781155015197,
- "grad_norm": 2.6105997562408447,
- "learning_rate": 3.6638046052089614e-06,
- "loss": 0.31221291422843933,
- "mean_token_accuracy": 0.888375997543335,
- "num_tokens": 13338413.0,
- "step": 1496
- },
- {
- "epoch": 1.1375379939209727,
- "grad_norm": 3.655658483505249,
- "learning_rate": 3.661950581140239e-06,
- "loss": 0.3609023988246918,
- "mean_token_accuracy": 0.8838576078414917,
- "num_tokens": 13341499.0,
- "step": 1497
- },
- {
- "epoch": 1.1382978723404256,
- "grad_norm": 2.242009162902832,
- "learning_rate": 3.660095741569871e-06,
- "loss": 0.40022802352905273,
- "mean_token_accuracy": 0.8559960722923279,
- "num_tokens": 13347917.0,
- "step": 1498
- },
- {
- "epoch": 1.1390577507598785,
- "grad_norm": 1.7958979606628418,
- "learning_rate": 3.658240087799655e-06,
- "loss": 0.499157190322876,
- "mean_token_accuracy": 0.8423802256584167,
- "num_tokens": 13361570.0,
- "step": 1499
- },
- {
- "epoch": 1.1398176291793314,
- "grad_norm": 2.5406908988952637,
- "learning_rate": 3.6563836211319593e-06,
- "loss": 0.4090137481689453,
- "mean_token_accuracy": 0.8769663572311401,
- "num_tokens": 13367183.0,
- "step": 1500
- },
- {
- "epoch": 1.1405775075987843,
- "grad_norm": 1.9861716032028198,
- "learning_rate": 3.654526342869724e-06,
- "loss": 0.5125207304954529,
- "mean_token_accuracy": 0.8315266370773315,
- "num_tokens": 13376767.0,
- "step": 1501
- },
- {
- "epoch": 1.141337386018237,
- "grad_norm": 1.731188178062439,
- "learning_rate": 3.65266825431646e-06,
- "loss": 0.39452576637268066,
- "mean_token_accuracy": 0.8585706353187561,
- "num_tokens": 13388437.0,
- "step": 1502
- },
- {
- "epoch": 1.1420972644376899,
- "grad_norm": 1.5203773975372314,
- "learning_rate": 3.6508093567762425e-06,
- "loss": 0.39466819167137146,
- "mean_token_accuracy": 0.8584027886390686,
- "num_tokens": 13399727.0,
- "step": 1503
- },
- {
- "epoch": 1.1428571428571428,
- "grad_norm": 2.606462001800537,
- "learning_rate": 3.6489496515537204e-06,
- "loss": 0.4521079361438751,
- "mean_token_accuracy": 0.8413360118865967,
- "num_tokens": 13408426.0,
- "step": 1504
- },
- {
- "epoch": 1.1436170212765957,
- "grad_norm": 2.6207993030548096,
- "learning_rate": 3.647089139954104e-06,
- "loss": 0.4709353446960449,
- "mean_token_accuracy": 0.8397113084793091,
- "num_tokens": 13413506.0,
- "step": 1505
- },
- {
- "epoch": 1.1443768996960486,
- "grad_norm": 1.7214165925979614,
- "learning_rate": 3.6452278232831734e-06,
- "loss": 0.45506367087364197,
- "mean_token_accuracy": 0.8466023206710815,
- "num_tokens": 13424592.0,
- "step": 1506
- },
- {
- "epoch": 1.1451367781155015,
- "grad_norm": 1.7111759185791016,
- "learning_rate": 3.643365702847272e-06,
- "loss": 0.5016278624534607,
- "mean_token_accuracy": 0.8196234703063965,
- "num_tokens": 13434421.0,
- "step": 1507
- },
- {
- "epoch": 1.1458966565349544,
- "grad_norm": 1.7528148889541626,
- "learning_rate": 3.641502779953307e-06,
- "loss": 0.5020896196365356,
- "mean_token_accuracy": 0.826249361038208,
- "num_tokens": 13445286.0,
- "step": 1508
- },
- {
- "epoch": 1.1466565349544073,
- "grad_norm": 1.3470909595489502,
- "learning_rate": 3.639639055908751e-06,
- "loss": 0.45765724778175354,
- "mean_token_accuracy": 0.8380560278892517,
- "num_tokens": 13465030.0,
- "step": 1509
- },
- {
- "epoch": 1.1474164133738602,
- "grad_norm": 2.4846835136413574,
- "learning_rate": 3.6377745320216346e-06,
- "loss": 0.46488267183303833,
- "mean_token_accuracy": 0.8393925428390503,
- "num_tokens": 13470883.0,
- "step": 1510
- },
- {
- "epoch": 1.1481762917933132,
- "grad_norm": 1.770201563835144,
- "learning_rate": 3.635909209600555e-06,
- "loss": 0.5262179374694824,
- "mean_token_accuracy": 0.8201162815093994,
- "num_tokens": 13482558.0,
- "step": 1511
- },
- {
- "epoch": 1.148936170212766,
- "grad_norm": 1.5955098867416382,
- "learning_rate": 3.6340430899546656e-06,
- "loss": 0.430621862411499,
- "mean_token_accuracy": 0.8488553762435913,
- "num_tokens": 13493003.0,
- "step": 1512
- },
- {
- "epoch": 1.1496960486322187,
- "grad_norm": 2.846176862716675,
- "learning_rate": 3.632176174393682e-06,
- "loss": 0.23461638391017914,
- "mean_token_accuracy": 0.9218817353248596,
- "num_tokens": 13496566.0,
- "step": 1513
- },
- {
- "epoch": 1.1504559270516717,
- "grad_norm": 1.9606610536575317,
- "learning_rate": 3.630308464227877e-06,
- "loss": 0.4940161108970642,
- "mean_token_accuracy": 0.8474864959716797,
- "num_tokens": 13504843.0,
- "step": 1514
- },
- {
- "epoch": 1.1512158054711246,
- "grad_norm": 1.1588608026504517,
- "learning_rate": 3.628439960768082e-06,
- "loss": 0.32650992274284363,
- "mean_token_accuracy": 0.8797246217727661,
- "num_tokens": 13521513.0,
- "step": 1515
- },
- {
- "epoch": 1.1519756838905775,
- "grad_norm": 1.3566495180130005,
- "learning_rate": 3.6265706653256837e-06,
- "loss": 0.4359064996242523,
- "mean_token_accuracy": 0.8379859328269958,
- "num_tokens": 13540608.0,
- "step": 1516
- },
- {
- "epoch": 1.1527355623100304,
- "grad_norm": 1.4728609323501587,
- "learning_rate": 3.624700579212626e-06,
- "loss": 0.29939693212509155,
- "mean_token_accuracy": 0.8831408023834229,
- "num_tokens": 13550641.0,
- "step": 1517
- },
- {
- "epoch": 1.1534954407294833,
- "grad_norm": 2.162325382232666,
- "learning_rate": 3.6228297037414077e-06,
- "loss": 0.4097636938095093,
- "mean_token_accuracy": 0.8575425148010254,
- "num_tokens": 13556931.0,
- "step": 1518
- },
- {
- "epoch": 1.1542553191489362,
- "grad_norm": 1.754439353942871,
- "learning_rate": 3.6209580402250816e-06,
- "loss": 0.400202214717865,
- "mean_token_accuracy": 0.8569821119308472,
- "num_tokens": 13565491.0,
- "step": 1519
- },
- {
- "epoch": 1.155015197568389,
- "grad_norm": 1.5250083208084106,
- "learning_rate": 3.619085589977251e-06,
- "loss": 0.43330419063568115,
- "mean_token_accuracy": 0.8492985963821411,
- "num_tokens": 13577147.0,
- "step": 1520
- },
- {
- "epoch": 1.155775075987842,
- "grad_norm": 1.9108905792236328,
- "learning_rate": 3.617212354312076e-06,
- "loss": 0.30567464232444763,
- "mean_token_accuracy": 0.8850164413452148,
- "num_tokens": 13584366.0,
- "step": 1521
- },
- {
- "epoch": 1.156534954407295,
- "grad_norm": 2.2574243545532227,
- "learning_rate": 3.615338334544265e-06,
- "loss": 0.4391738772392273,
- "mean_token_accuracy": 0.839765727519989,
- "num_tokens": 13591816.0,
- "step": 1522
- },
- {
- "epoch": 1.1572948328267478,
- "grad_norm": 2.1235218048095703,
- "learning_rate": 3.6134635319890763e-06,
- "loss": 0.45043107867240906,
- "mean_token_accuracy": 0.8385299444198608,
- "num_tokens": 13599736.0,
- "step": 1523
- },
- {
- "epoch": 1.1580547112462005,
- "grad_norm": 2.2274110317230225,
- "learning_rate": 3.611587947962319e-06,
- "loss": 0.3623226284980774,
- "mean_token_accuracy": 0.8724044561386108,
- "num_tokens": 13605354.0,
- "step": 1524
- },
- {
- "epoch": 1.1588145896656534,
- "grad_norm": 3.414236545562744,
- "learning_rate": 3.6097115837803504e-06,
- "loss": 0.30060696601867676,
- "mean_token_accuracy": 0.8971061706542969,
- "num_tokens": 13608851.0,
- "step": 1525
- },
- {
- "epoch": 1.1595744680851063,
- "grad_norm": 2.496264696121216,
- "learning_rate": 3.6078344407600744e-06,
- "loss": 0.3567180037498474,
- "mean_token_accuracy": 0.8596180081367493,
- "num_tokens": 13614339.0,
- "step": 1526
- },
- {
- "epoch": 1.1603343465045592,
- "grad_norm": 2.0191843509674072,
- "learning_rate": 3.6059565202189433e-06,
- "loss": 0.43206095695495605,
- "mean_token_accuracy": 0.8464000821113586,
- "num_tokens": 13622395.0,
- "step": 1527
- },
- {
- "epoch": 1.1610942249240122,
- "grad_norm": 1.5475906133651733,
- "learning_rate": 3.604077823474954e-06,
- "loss": 0.4535648226737976,
- "mean_token_accuracy": 0.8391586542129517,
- "num_tokens": 13635356.0,
- "step": 1528
- },
- {
- "epoch": 1.161854103343465,
- "grad_norm": 2.1348211765289307,
- "learning_rate": 3.6021983518466468e-06,
- "loss": 0.2733963429927826,
- "mean_token_accuracy": 0.9007417559623718,
- "num_tokens": 13640641.0,
- "step": 1529
- },
- {
- "epoch": 1.162613981762918,
- "grad_norm": 2.8452792167663574,
- "learning_rate": 3.600318106653108e-06,
- "loss": 0.29591235518455505,
- "mean_token_accuracy": 0.8934413194656372,
- "num_tokens": 13644995.0,
- "step": 1530
- },
- {
- "epoch": 1.1633738601823709,
- "grad_norm": 2.342907190322876,
- "learning_rate": 3.5984370892139663e-06,
- "loss": 0.4675130248069763,
- "mean_token_accuracy": 0.8352028131484985,
- "num_tokens": 13652695.0,
- "step": 1531
- },
- {
- "epoch": 1.1641337386018238,
- "grad_norm": 2.3480238914489746,
- "learning_rate": 3.5965553008493924e-06,
- "loss": 0.3114515542984009,
- "mean_token_accuracy": 0.8845353126525879,
- "num_tokens": 13658101.0,
- "step": 1532
- },
- {
- "epoch": 1.1648936170212765,
- "grad_norm": 1.8608155250549316,
- "learning_rate": 3.594672742880097e-06,
- "loss": 0.3864145278930664,
- "mean_token_accuracy": 0.867354154586792,
- "num_tokens": 13666042.0,
- "step": 1533
- },
- {
- "epoch": 1.1656534954407296,
- "grad_norm": 1.4756088256835938,
- "learning_rate": 3.5927894166273324e-06,
- "loss": 0.3671600818634033,
- "mean_token_accuracy": 0.8695988655090332,
- "num_tokens": 13678253.0,
- "step": 1534
- },
- {
- "epoch": 1.1664133738601823,
- "grad_norm": 2.8831355571746826,
- "learning_rate": 3.5909053234128893e-06,
- "loss": 0.267184317111969,
- "mean_token_accuracy": 0.9008115530014038,
- "num_tokens": 13681790.0,
- "step": 1535
- },
- {
- "epoch": 1.1671732522796352,
- "grad_norm": 2.1984763145446777,
- "learning_rate": 3.5890204645590964e-06,
- "loss": 0.4431505799293518,
- "mean_token_accuracy": 0.8623673915863037,
- "num_tokens": 13688444.0,
- "step": 1536
- },
- {
- "epoch": 1.167933130699088,
- "grad_norm": 1.8271523714065552,
- "learning_rate": 3.5871348413888207e-06,
- "loss": 0.3861040771007538,
- "mean_token_accuracy": 0.8624277114868164,
- "num_tokens": 13696872.0,
- "step": 1537
- },
- {
- "epoch": 1.168693009118541,
- "grad_norm": 1.6313756704330444,
- "learning_rate": 3.585248455225466e-06,
- "loss": 0.3775154948234558,
- "mean_token_accuracy": 0.8624461889266968,
- "num_tokens": 13706167.0,
- "step": 1538
- },
- {
- "epoch": 1.169452887537994,
- "grad_norm": 2.4377901554107666,
- "learning_rate": 3.5833613073929684e-06,
- "loss": 0.2308957427740097,
- "mean_token_accuracy": 0.920600175857544,
- "num_tokens": 13710367.0,
- "step": 1539
- },
- {
- "epoch": 1.1702127659574468,
- "grad_norm": 2.2621750831604004,
- "learning_rate": 3.5814733992158025e-06,
- "loss": 0.33167219161987305,
- "mean_token_accuracy": 0.8963261842727661,
- "num_tokens": 13716384.0,
- "step": 1540
- },
- {
- "epoch": 1.1709726443768997,
- "grad_norm": 1.3178150653839111,
- "learning_rate": 3.579584732018975e-06,
- "loss": 0.3276631832122803,
- "mean_token_accuracy": 0.8853521347045898,
- "num_tokens": 13731031.0,
- "step": 1541
- },
- {
- "epoch": 1.1717325227963526,
- "grad_norm": 2.177750587463379,
- "learning_rate": 3.577695307128024e-06,
- "loss": 0.48177266120910645,
- "mean_token_accuracy": 0.830329418182373,
- "num_tokens": 13737925.0,
- "step": 1542
- },
- {
- "epoch": 1.1724924012158056,
- "grad_norm": 2.2268829345703125,
- "learning_rate": 3.5758051258690223e-06,
- "loss": 0.48843517899513245,
- "mean_token_accuracy": 0.8310644030570984,
- "num_tokens": 13746039.0,
- "step": 1543
- },
- {
- "epoch": 1.1732522796352582,
- "grad_norm": 1.498701572418213,
- "learning_rate": 3.5739141895685708e-06,
- "loss": 0.4542962312698364,
- "mean_token_accuracy": 0.8500330448150635,
- "num_tokens": 13765002.0,
- "step": 1544
- },
- {
- "epoch": 1.1740121580547112,
- "grad_norm": 1.786670446395874,
- "learning_rate": 3.5720224995538023e-06,
- "loss": 0.27367928624153137,
- "mean_token_accuracy": 0.8916142582893372,
- "num_tokens": 13774113.0,
- "step": 1545
- },
- {
- "epoch": 1.174772036474164,
- "grad_norm": 2.0311272144317627,
- "learning_rate": 3.5701300571523757e-06,
- "loss": 0.559987485408783,
- "mean_token_accuracy": 0.8266973495483398,
- "num_tokens": 13783912.0,
- "step": 1546
- },
- {
- "epoch": 1.175531914893617,
- "grad_norm": 1.8732186555862427,
- "learning_rate": 3.5682368636924825e-06,
- "loss": 0.5184751152992249,
- "mean_token_accuracy": 0.8450918197631836,
- "num_tokens": 13792728.0,
- "step": 1547
- },
- {
- "epoch": 1.1762917933130699,
- "grad_norm": 1.4410661458969116,
- "learning_rate": 3.566342920502837e-06,
- "loss": 0.383536696434021,
- "mean_token_accuracy": 0.8672217726707458,
- "num_tokens": 13813590.0,
- "step": 1548
- },
- {
- "epoch": 1.1770516717325228,
- "grad_norm": 3.06056547164917,
- "learning_rate": 3.564448228912682e-06,
- "loss": 0.3941686153411865,
- "mean_token_accuracy": 0.8696402311325073,
- "num_tokens": 13817704.0,
- "step": 1549
- },
- {
- "epoch": 1.1778115501519757,
- "grad_norm": 1.6150329113006592,
- "learning_rate": 3.562552790251785e-06,
- "loss": 0.41606605052948,
- "mean_token_accuracy": 0.8488572835922241,
- "num_tokens": 13831303.0,
- "step": 1550
- },
- {
- "epoch": 1.1785714285714286,
- "grad_norm": 2.1199934482574463,
- "learning_rate": 3.5606566058504377e-06,
- "loss": 0.3974752426147461,
- "mean_token_accuracy": 0.8686345219612122,
- "num_tokens": 13837613.0,
- "step": 1551
- },
- {
- "epoch": 1.1793313069908815,
- "grad_norm": 1.5683876276016235,
- "learning_rate": 3.558759677039455e-06,
- "loss": 0.35225993394851685,
- "mean_token_accuracy": 0.8710784316062927,
- "num_tokens": 13846779.0,
- "step": 1552
- },
- {
- "epoch": 1.1800911854103344,
- "grad_norm": 1.4644675254821777,
- "learning_rate": 3.5568620051501755e-06,
- "loss": 0.38400042057037354,
- "mean_token_accuracy": 0.8548328876495361,
- "num_tokens": 13860713.0,
- "step": 1553
- },
- {
- "epoch": 1.1808510638297873,
- "grad_norm": 1.461491346359253,
- "learning_rate": 3.5549635915144578e-06,
- "loss": 0.4572640061378479,
- "mean_token_accuracy": 0.8506045937538147,
- "num_tokens": 13877289.0,
- "step": 1554
- },
- {
- "epoch": 1.18161094224924,
- "grad_norm": 2.6364715099334717,
- "learning_rate": 3.553064437464682e-06,
- "loss": 0.3954341411590576,
- "mean_token_accuracy": 0.8561649322509766,
- "num_tokens": 13882064.0,
- "step": 1555
- },
- {
- "epoch": 1.182370820668693,
- "grad_norm": 2.027273654937744,
- "learning_rate": 3.551164544333745e-06,
- "loss": 0.47625732421875,
- "mean_token_accuracy": 0.8349384069442749,
- "num_tokens": 13890306.0,
- "step": 1556
- },
- {
- "epoch": 1.1831306990881458,
- "grad_norm": 2.8427743911743164,
- "learning_rate": 3.549263913455069e-06,
- "loss": 0.4273033142089844,
- "mean_token_accuracy": 0.8541387319564819,
- "num_tokens": 13894882.0,
- "step": 1557
- },
- {
- "epoch": 1.1838905775075987,
- "grad_norm": 1.6298975944519043,
- "learning_rate": 3.5473625461625884e-06,
- "loss": 0.4378639757633209,
- "mean_token_accuracy": 0.8634963631629944,
- "num_tokens": 13906152.0,
- "step": 1558
- },
- {
- "epoch": 1.1846504559270516,
- "grad_norm": 2.4098947048187256,
- "learning_rate": 3.5454604437907535e-06,
- "loss": 0.47236716747283936,
- "mean_token_accuracy": 0.8646864891052246,
- "num_tokens": 13911803.0,
- "step": 1559
- },
- {
- "epoch": 1.1854103343465046,
- "grad_norm": 1.5972497463226318,
- "learning_rate": 3.543557607674537e-06,
- "loss": 0.3001407980918884,
- "mean_token_accuracy": 0.8927055597305298,
- "num_tokens": 13921304.0,
- "step": 1560
- },
- {
- "epoch": 1.1861702127659575,
- "grad_norm": 2.1140005588531494,
- "learning_rate": 3.54165403914942e-06,
- "loss": 0.41898271441459656,
- "mean_token_accuracy": 0.8542245626449585,
- "num_tokens": 13929434.0,
- "step": 1561
- },
- {
- "epoch": 1.1869300911854104,
- "grad_norm": 1.8733803033828735,
- "learning_rate": 3.539749739551401e-06,
- "loss": 0.35469961166381836,
- "mean_token_accuracy": 0.8805290460586548,
- "num_tokens": 13937781.0,
- "step": 1562
- },
- {
- "epoch": 1.1876899696048633,
- "grad_norm": 2.2805802822113037,
- "learning_rate": 3.53784471021699e-06,
- "loss": 0.44496792554855347,
- "mean_token_accuracy": 0.8454172611236572,
- "num_tokens": 13944394.0,
- "step": 1563
- },
- {
- "epoch": 1.1884498480243162,
- "grad_norm": 0.9728449583053589,
- "learning_rate": 3.535938952483211e-06,
- "loss": 0.3156968355178833,
- "mean_token_accuracy": 0.8739837408065796,
- "num_tokens": 13966712.0,
- "step": 1564
- },
- {
- "epoch": 1.189209726443769,
- "grad_norm": 3.025338888168335,
- "learning_rate": 3.534032467687597e-06,
- "loss": 0.30036938190460205,
- "mean_token_accuracy": 0.9058252573013306,
- "num_tokens": 13970183.0,
- "step": 1565
- },
- {
- "epoch": 1.1899696048632218,
- "grad_norm": 2.0659425258636475,
- "learning_rate": 3.532125257168193e-06,
- "loss": 0.30619731545448303,
- "mean_token_accuracy": 0.9041587710380554,
- "num_tokens": 13976657.0,
- "step": 1566
- },
- {
- "epoch": 1.1907294832826747,
- "grad_norm": 3.2036776542663574,
- "learning_rate": 3.5302173222635526e-06,
- "loss": 0.4145944118499756,
- "mean_token_accuracy": 0.8502328395843506,
- "num_tokens": 13981198.0,
- "step": 1567
- },
- {
- "epoch": 1.1914893617021276,
- "grad_norm": 1.7767539024353027,
- "learning_rate": 3.5283086643127396e-06,
- "loss": 0.437128484249115,
- "mean_token_accuracy": 0.8965631723403931,
- "num_tokens": 13990259.0,
- "step": 1568
- },
- {
- "epoch": 1.1922492401215805,
- "grad_norm": 1.7777384519577026,
- "learning_rate": 3.5263992846553203e-06,
- "loss": 0.33831220865249634,
- "mean_token_accuracy": 0.8734279870986938,
- "num_tokens": 13999363.0,
- "step": 1569
- },
- {
- "epoch": 1.1930091185410334,
- "grad_norm": 1.6710708141326904,
- "learning_rate": 3.5244891846313733e-06,
- "loss": 0.4005590081214905,
- "mean_token_accuracy": 0.8820298314094543,
- "num_tokens": 14008719.0,
- "step": 1570
- },
- {
- "epoch": 1.1937689969604863,
- "grad_norm": 1.0378777980804443,
- "learning_rate": 3.5225783655814798e-06,
- "loss": 0.3174915313720703,
- "mean_token_accuracy": 0.8894162774085999,
- "num_tokens": 14025806.0,
- "step": 1571
- },
- {
- "epoch": 1.1945288753799392,
- "grad_norm": 1.2647521495819092,
- "learning_rate": 3.520666828846726e-06,
- "loss": 0.4173050820827484,
- "mean_token_accuracy": 0.8437265157699585,
- "num_tokens": 14046445.0,
- "step": 1572
- },
- {
- "epoch": 1.1952887537993921,
- "grad_norm": 2.8625528812408447,
- "learning_rate": 3.518754575768702e-06,
- "loss": 0.37182557582855225,
- "mean_token_accuracy": 0.8660947680473328,
- "num_tokens": 14051197.0,
- "step": 1573
- },
- {
- "epoch": 1.196048632218845,
- "grad_norm": 1.1213171482086182,
- "learning_rate": 3.516841607689501e-06,
- "loss": 0.332731157541275,
- "mean_token_accuracy": 0.8573278784751892,
- "num_tokens": 14070817.0,
- "step": 1574
- },
- {
- "epoch": 1.196808510638298,
- "grad_norm": 1.197508692741394,
- "learning_rate": 3.5149279259517165e-06,
- "loss": 0.34058472514152527,
- "mean_token_accuracy": 0.8603571653366089,
- "num_tokens": 14085301.0,
- "step": 1575
- },
- {
- "epoch": 1.1975683890577509,
- "grad_norm": 4.019949913024902,
- "learning_rate": 3.5130135318984454e-06,
- "loss": 0.3094622492790222,
- "mean_token_accuracy": 0.8905094861984253,
- "num_tokens": 14088107.0,
- "step": 1576
- },
- {
- "epoch": 1.1983282674772036,
- "grad_norm": 2.591181755065918,
- "learning_rate": 3.5110984268732827e-06,
- "loss": 0.3407078981399536,
- "mean_token_accuracy": 0.880385160446167,
- "num_tokens": 14092887.0,
- "step": 1577
- },
- {
- "epoch": 1.1990881458966565,
- "grad_norm": 1.3069331645965576,
- "learning_rate": 3.509182612220322e-06,
- "loss": 0.3761988878250122,
- "mean_token_accuracy": 0.862013041973114,
- "num_tokens": 14109216.0,
- "step": 1578
- },
- {
- "epoch": 1.1998480243161094,
- "grad_norm": 1.7802022695541382,
- "learning_rate": 3.507266089284157e-06,
- "loss": 0.3824652135372162,
- "mean_token_accuracy": 0.8707721829414368,
- "num_tokens": 14119645.0,
- "step": 1579
- },
- {
- "epoch": 1.2006079027355623,
- "grad_norm": 2.7937185764312744,
- "learning_rate": 3.5053488594098763e-06,
- "loss": 0.33828890323638916,
- "mean_token_accuracy": 0.8765541315078735,
- "num_tokens": 14124628.0,
- "step": 1580
- },
- {
- "epoch": 1.2013677811550152,
- "grad_norm": 1.892671823501587,
- "learning_rate": 3.5034309239430664e-06,
- "loss": 0.3476094603538513,
- "mean_token_accuracy": 0.9053795337677002,
- "num_tokens": 14131756.0,
- "step": 1581
- },
- {
- "epoch": 1.202127659574468,
- "grad_norm": 1.6857695579528809,
- "learning_rate": 3.501512284229807e-06,
- "loss": 0.5397108793258667,
- "mean_token_accuracy": 0.8173421025276184,
- "num_tokens": 14143024.0,
- "step": 1582
- },
- {
- "epoch": 1.202887537993921,
- "grad_norm": 2.501737117767334,
- "learning_rate": 3.4995929416166756e-06,
- "loss": 0.4192458391189575,
- "mean_token_accuracy": 0.8558136224746704,
- "num_tokens": 14149499.0,
- "step": 1583
- },
- {
- "epoch": 1.203647416413374,
- "grad_norm": 2.0133907794952393,
- "learning_rate": 3.4976728974507387e-06,
- "loss": 0.4791576564311981,
- "mean_token_accuracy": 0.8253597021102905,
- "num_tokens": 14158381.0,
- "step": 1584
- },
- {
- "epoch": 1.2044072948328268,
- "grad_norm": 2.984611988067627,
- "learning_rate": 3.4957521530795576e-06,
- "loss": 0.3040750026702881,
- "mean_token_accuracy": 0.8902391791343689,
- "num_tokens": 14162419.0,
- "step": 1585
- },
- {
- "epoch": 1.2051671732522795,
- "grad_norm": 1.518591284751892,
- "learning_rate": 3.493830709851185e-06,
- "loss": 0.35539618134498596,
- "mean_token_accuracy": 0.8737183809280396,
- "num_tokens": 14173048.0,
- "step": 1586
- },
- {
- "epoch": 1.2059270516717326,
- "grad_norm": 2.628758192062378,
- "learning_rate": 3.4919085691141636e-06,
- "loss": 0.33340200781822205,
- "mean_token_accuracy": 0.8705098628997803,
- "num_tokens": 14178255.0,
- "step": 1587
- },
- {
- "epoch": 1.2066869300911853,
- "grad_norm": 2.5565974712371826,
- "learning_rate": 3.4899857322175252e-06,
- "loss": 0.44939476251602173,
- "mean_token_accuracy": 0.8315504193305969,
- "num_tokens": 14183808.0,
- "step": 1588
- },
- {
- "epoch": 1.2074468085106382,
- "grad_norm": 1.7521045207977295,
- "learning_rate": 3.4880622005107916e-06,
- "loss": 0.3168621063232422,
- "mean_token_accuracy": 0.8824669122695923,
- "num_tokens": 14192186.0,
- "step": 1589
- },
- {
- "epoch": 1.2082066869300911,
- "grad_norm": 1.9816104173660278,
- "learning_rate": 3.486137975343971e-06,
- "loss": 0.3892582058906555,
- "mean_token_accuracy": 0.8524188995361328,
- "num_tokens": 14200512.0,
- "step": 1590
- },
- {
- "epoch": 1.208966565349544,
- "grad_norm": 1.459800124168396,
- "learning_rate": 3.484213058067559e-06,
- "loss": 0.45930033922195435,
- "mean_token_accuracy": 0.8408471345901489,
- "num_tokens": 14215232.0,
- "step": 1591
- },
- {
- "epoch": 1.209726443768997,
- "grad_norm": 2.015493154525757,
- "learning_rate": 3.482287450032536e-06,
- "loss": 0.5514016151428223,
- "mean_token_accuracy": 0.8456779718399048,
- "num_tokens": 14225402.0,
- "step": 1592
- },
- {
- "epoch": 1.2104863221884499,
- "grad_norm": 3.4511911869049072,
- "learning_rate": 3.4803611525903687e-06,
- "loss": 0.4772771894931793,
- "mean_token_accuracy": 0.8558698892593384,
- "num_tokens": 14229038.0,
- "step": 1593
- },
- {
- "epoch": 1.2112462006079028,
- "grad_norm": 2.2247982025146484,
- "learning_rate": 3.4784341670930067e-06,
- "loss": 0.4042825996875763,
- "mean_token_accuracy": 0.8635870218276978,
- "num_tokens": 14237057.0,
- "step": 1594
- },
- {
- "epoch": 1.2120060790273557,
- "grad_norm": 2.0534820556640625,
- "learning_rate": 3.4765064948928813e-06,
- "loss": 0.34057414531707764,
- "mean_token_accuracy": 0.8800770044326782,
- "num_tokens": 14243013.0,
- "step": 1595
- },
- {
- "epoch": 1.2127659574468086,
- "grad_norm": 2.594703197479248,
- "learning_rate": 3.474578137342909e-06,
- "loss": 0.4997410774230957,
- "mean_token_accuracy": 0.8302106261253357,
- "num_tokens": 14251210.0,
- "step": 1596
- },
- {
- "epoch": 1.2135258358662613,
- "grad_norm": 2.517833948135376,
- "learning_rate": 3.4726490957964836e-06,
- "loss": 0.3630390465259552,
- "mean_token_accuracy": 0.8679884672164917,
- "num_tokens": 14255893.0,
- "step": 1597
- },
- {
- "epoch": 1.2142857142857142,
- "grad_norm": 1.5177065134048462,
- "learning_rate": 3.4707193716074816e-06,
- "loss": 0.36218544840812683,
- "mean_token_accuracy": 0.879178524017334,
- "num_tokens": 14268143.0,
- "step": 1598
- },
- {
- "epoch": 1.215045592705167,
- "grad_norm": 2.215291738510132,
- "learning_rate": 3.4687889661302577e-06,
- "loss": 0.4166645407676697,
- "mean_token_accuracy": 0.8495793342590332,
- "num_tokens": 14276794.0,
- "step": 1599
- },
- {
- "epoch": 1.21580547112462,
- "grad_norm": 1.534294843673706,
- "learning_rate": 3.466857880719645e-06,
- "loss": 0.2635883092880249,
- "mean_token_accuracy": 0.8971712589263916,
- "num_tokens": 14287000.0,
- "step": 1600
- },
- {
- "epoch": 1.216565349544073,
- "grad_norm": 1.2338658571243286,
- "learning_rate": 3.464926116730953e-06,
- "loss": 0.339110404253006,
- "mean_token_accuracy": 0.895592987537384,
- "num_tokens": 14303217.0,
- "step": 1601
- },
- {
- "epoch": 1.2173252279635258,
- "grad_norm": 1.8717178106307983,
- "learning_rate": 3.462993675519968e-06,
- "loss": 0.41204726696014404,
- "mean_token_accuracy": 0.8560728430747986,
- "num_tokens": 14311372.0,
- "step": 1602
- },
- {
- "epoch": 1.2180851063829787,
- "grad_norm": 2.844160795211792,
- "learning_rate": 3.4610605584429526e-06,
- "loss": 0.4129520058631897,
- "mean_token_accuracy": 0.8555002212524414,
- "num_tokens": 14316244.0,
- "step": 1603
- },
- {
- "epoch": 1.2188449848024316,
- "grad_norm": 1.099926471710205,
- "learning_rate": 3.4591267668566412e-06,
- "loss": 0.35783132910728455,
- "mean_token_accuracy": 0.8693175315856934,
- "num_tokens": 14338414.0,
- "step": 1604
- },
- {
- "epoch": 1.2196048632218845,
- "grad_norm": 1.6448384523391724,
- "learning_rate": 3.457192302118244e-06,
- "loss": 0.42060258984565735,
- "mean_token_accuracy": 0.8557323217391968,
- "num_tokens": 14349143.0,
- "step": 1605
- },
- {
- "epoch": 1.2203647416413375,
- "grad_norm": 2.097529888153076,
- "learning_rate": 3.455257165585444e-06,
- "loss": 0.5227499008178711,
- "mean_token_accuracy": 0.828961968421936,
- "num_tokens": 14360032.0,
- "step": 1606
- },
- {
- "epoch": 1.2211246200607904,
- "grad_norm": 1.602988600730896,
- "learning_rate": 3.453321358616393e-06,
- "loss": 0.3537187874317169,
- "mean_token_accuracy": 0.8776708841323853,
- "num_tokens": 14370005.0,
- "step": 1607
- },
- {
- "epoch": 1.221884498480243,
- "grad_norm": 2.358971357345581,
- "learning_rate": 3.4513848825697145e-06,
- "loss": 0.3448919653892517,
- "mean_token_accuracy": 0.8887944221496582,
- "num_tokens": 14375718.0,
- "step": 1608
- },
- {
- "epoch": 1.222644376899696,
- "grad_norm": 1.72306227684021,
- "learning_rate": 3.4494477388045035e-06,
- "loss": 0.36985084414482117,
- "mean_token_accuracy": 0.859595537185669,
- "num_tokens": 14385016.0,
- "step": 1609
- },
- {
- "epoch": 1.2234042553191489,
- "grad_norm": 1.5494085550308228,
- "learning_rate": 3.4475099286803204e-06,
- "loss": 0.49003708362579346,
- "mean_token_accuracy": 0.8701964616775513,
- "num_tokens": 14399277.0,
- "step": 1610
- },
- {
- "epoch": 1.2241641337386018,
- "grad_norm": 2.6874046325683594,
- "learning_rate": 3.445571453557196e-06,
- "loss": 0.3424490690231323,
- "mean_token_accuracy": 0.8835943937301636,
- "num_tokens": 14404182.0,
- "step": 1611
- },
- {
- "epoch": 1.2249240121580547,
- "grad_norm": 2.2163190841674805,
- "learning_rate": 3.443632314795627e-06,
- "loss": 0.40944457054138184,
- "mean_token_accuracy": 0.8649888038635254,
- "num_tokens": 14410158.0,
- "step": 1612
- },
- {
- "epoch": 1.2256838905775076,
- "grad_norm": 2.7961158752441406,
- "learning_rate": 3.4416925137565756e-06,
- "loss": 0.17890746891498566,
- "mean_token_accuracy": 0.9439430832862854,
- "num_tokens": 14413285.0,
- "step": 1613
- },
- {
- "epoch": 1.2264437689969605,
- "grad_norm": 1.421451210975647,
- "learning_rate": 3.439752051801467e-06,
- "loss": 0.33948683738708496,
- "mean_token_accuracy": 0.8754585981369019,
- "num_tokens": 14424674.0,
- "step": 1614
- },
- {
- "epoch": 1.2272036474164134,
- "grad_norm": 2.105196237564087,
- "learning_rate": 3.4378109302921946e-06,
- "loss": 0.40009379386901855,
- "mean_token_accuracy": 0.8600341081619263,
- "num_tokens": 14432400.0,
- "step": 1615
- },
- {
- "epoch": 1.2279635258358663,
- "grad_norm": 2.004122734069824,
- "learning_rate": 3.4358691505911105e-06,
- "loss": 0.46013444662094116,
- "mean_token_accuracy": 0.8400925993919373,
- "num_tokens": 14440741.0,
- "step": 1616
- },
- {
- "epoch": 1.2287234042553192,
- "grad_norm": 1.8407535552978516,
- "learning_rate": 3.4339267140610317e-06,
- "loss": 0.38828906416893005,
- "mean_token_accuracy": 0.8582802414894104,
- "num_tokens": 14448698.0,
- "step": 1617
- },
- {
- "epoch": 1.2294832826747721,
- "grad_norm": 2.4285924434661865,
- "learning_rate": 3.4319836220652334e-06,
- "loss": 0.3109283447265625,
- "mean_token_accuracy": 0.8888344764709473,
- "num_tokens": 14453674.0,
- "step": 1618
- },
- {
- "epoch": 1.2302431610942248,
- "grad_norm": 1.6322550773620605,
- "learning_rate": 3.430039875967454e-06,
- "loss": 0.5222204327583313,
- "mean_token_accuracy": 0.825019121170044,
- "num_tokens": 14465736.0,
- "step": 1619
- },
- {
- "epoch": 1.2310030395136777,
- "grad_norm": 2.307573080062866,
- "learning_rate": 3.428095477131888e-06,
- "loss": 0.29477375745773315,
- "mean_token_accuracy": 0.8899064660072327,
- "num_tokens": 14471266.0,
- "step": 1620
- },
- {
- "epoch": 1.2317629179331306,
- "grad_norm": 1.8044531345367432,
- "learning_rate": 3.4261504269231904e-06,
- "loss": 0.4883342981338501,
- "mean_token_accuracy": 0.8310165405273438,
- "num_tokens": 14481679.0,
- "step": 1621
- },
- {
- "epoch": 1.2325227963525835,
- "grad_norm": 2.7585411071777344,
- "learning_rate": 3.4242047267064714e-06,
- "loss": 0.45369645953178406,
- "mean_token_accuracy": 0.8432134985923767,
- "num_tokens": 14487299.0,
- "step": 1622
- },
- {
- "epoch": 1.2332826747720365,
- "grad_norm": 2.687490701675415,
- "learning_rate": 3.4222583778472997e-06,
- "loss": 0.5627540349960327,
- "mean_token_accuracy": 0.8186438083648682,
- "num_tokens": 14494254.0,
- "step": 1623
- },
- {
- "epoch": 1.2340425531914894,
- "grad_norm": 2.622443199157715,
- "learning_rate": 3.4203113817116955e-06,
- "loss": 0.28697147965431213,
- "mean_token_accuracy": 0.8861737847328186,
- "num_tokens": 14498632.0,
- "step": 1624
- },
- {
- "epoch": 1.2348024316109423,
- "grad_norm": 2.6943359375,
- "learning_rate": 3.4183637396661372e-06,
- "loss": 0.25273287296295166,
- "mean_token_accuracy": 0.9104914665222168,
- "num_tokens": 14502797.0,
- "step": 1625
- },
- {
- "epoch": 1.2355623100303952,
- "grad_norm": 2.428189992904663,
- "learning_rate": 3.4164154530775552e-06,
- "loss": 0.4213451147079468,
- "mean_token_accuracy": 0.851524293422699,
- "num_tokens": 14508503.0,
- "step": 1626
- },
- {
- "epoch": 1.236322188449848,
- "grad_norm": 2.1722824573516846,
- "learning_rate": 3.4144665233133318e-06,
- "loss": 0.35238856077194214,
- "mean_token_accuracy": 0.8730837106704712,
- "num_tokens": 14516126.0,
- "step": 1627
- },
- {
- "epoch": 1.237082066869301,
- "grad_norm": 2.291365146636963,
- "learning_rate": 3.4125169517413005e-06,
- "loss": 0.43963465094566345,
- "mean_token_accuracy": 0.8525444865226746,
- "num_tokens": 14522507.0,
- "step": 1628
- },
- {
- "epoch": 1.237841945288754,
- "grad_norm": 1.6181648969650269,
- "learning_rate": 3.410566739729746e-06,
- "loss": 0.2799680233001709,
- "mean_token_accuracy": 0.8915654420852661,
- "num_tokens": 14531025.0,
- "step": 1629
- },
- {
- "epoch": 1.2386018237082066,
- "grad_norm": 1.4039218425750732,
- "learning_rate": 3.408615888647402e-06,
- "loss": 0.29756587743759155,
- "mean_token_accuracy": 0.8951715230941772,
- "num_tokens": 14543770.0,
- "step": 1630
- },
- {
- "epoch": 1.2393617021276595,
- "grad_norm": 2.148325204849243,
- "learning_rate": 3.4066643998634506e-06,
- "loss": 0.3983418345451355,
- "mean_token_accuracy": 0.8635951280593872,
- "num_tokens": 14550896.0,
- "step": 1631
- },
- {
- "epoch": 1.2401215805471124,
- "grad_norm": 1.5225859880447388,
- "learning_rate": 3.4047122747475227e-06,
- "loss": 0.3247569799423218,
- "mean_token_accuracy": 0.8727027177810669,
- "num_tokens": 14562181.0,
- "step": 1632
- },
- {
- "epoch": 1.2408814589665653,
- "grad_norm": 3.99835467338562,
- "learning_rate": 3.402759514669694e-06,
- "loss": 0.4317352771759033,
- "mean_token_accuracy": 0.8488142490386963,
- "num_tokens": 14565521.0,
- "step": 1633
- },
- {
- "epoch": 1.2416413373860182,
- "grad_norm": 1.7306902408599854,
- "learning_rate": 3.4008061210004872e-06,
- "loss": 0.389854371547699,
- "mean_token_accuracy": 0.8553084135055542,
- "num_tokens": 14574633.0,
- "step": 1634
- },
- {
- "epoch": 1.2424012158054711,
- "grad_norm": 2.3614673614501953,
- "learning_rate": 3.3988520951108683e-06,
- "loss": 0.3150152564048767,
- "mean_token_accuracy": 0.8865959644317627,
- "num_tokens": 14580240.0,
- "step": 1635
- },
- {
- "epoch": 1.243161094224924,
- "grad_norm": 1.5625747442245483,
- "learning_rate": 3.3968974383722497e-06,
- "loss": 0.43160033226013184,
- "mean_token_accuracy": 0.840155839920044,
- "num_tokens": 14594255.0,
- "step": 1636
- },
- {
- "epoch": 1.243920972644377,
- "grad_norm": 1.871620535850525,
- "learning_rate": 3.3949421521564825e-06,
- "loss": 0.49550193548202515,
- "mean_token_accuracy": 0.8315126299858093,
- "num_tokens": 14605416.0,
- "step": 1637
- },
- {
- "epoch": 1.2446808510638299,
- "grad_norm": 2.111304759979248,
- "learning_rate": 3.392986237835863e-06,
- "loss": 0.2794899046421051,
- "mean_token_accuracy": 0.9049773216247559,
- "num_tokens": 14611711.0,
- "step": 1638
- },
- {
- "epoch": 1.2454407294832828,
- "grad_norm": 3.7479894161224365,
- "learning_rate": 3.391029696783127e-06,
- "loss": 0.469397634267807,
- "mean_token_accuracy": 0.8352956771850586,
- "num_tokens": 14615536.0,
- "step": 1639
- },
- {
- "epoch": 1.2462006079027357,
- "grad_norm": 3.277726650238037,
- "learning_rate": 3.389072530371451e-06,
- "loss": 0.35431790351867676,
- "mean_token_accuracy": 0.8822286128997803,
- "num_tokens": 14619390.0,
- "step": 1640
- },
- {
- "epoch": 1.2469604863221884,
- "grad_norm": 1.9583072662353516,
- "learning_rate": 3.3871147399744482e-06,
- "loss": 0.3708694577217102,
- "mean_token_accuracy": 0.8720351457595825,
- "num_tokens": 14626573.0,
- "step": 1641
- },
- {
- "epoch": 1.2477203647416413,
- "grad_norm": 1.8734042644500732,
- "learning_rate": 3.385156326966173e-06,
- "loss": 0.48163774609565735,
- "mean_token_accuracy": 0.8479621410369873,
- "num_tokens": 14636382.0,
- "step": 1642
- },
- {
- "epoch": 1.2484802431610942,
- "grad_norm": 2.0085532665252686,
- "learning_rate": 3.383197292721114e-06,
- "loss": 0.4893198311328888,
- "mean_token_accuracy": 0.838238000869751,
- "num_tokens": 14645083.0,
- "step": 1643
- },
- {
- "epoch": 1.249240121580547,
- "grad_norm": 2.0874593257904053,
- "learning_rate": 3.3812376386141966e-06,
- "loss": 0.4610505700111389,
- "mean_token_accuracy": 0.8441368341445923,
- "num_tokens": 14654048.0,
- "step": 1644
- },
- {
- "epoch": 1.25,
- "grad_norm": 1.6887420415878296,
- "learning_rate": 3.379277366020782e-06,
- "loss": 0.3628596067428589,
- "mean_token_accuracy": 0.8838590383529663,
- "num_tokens": 14662317.0,
- "step": 1645
- },
- {
- "epoch": 1.250759878419453,
- "grad_norm": 2.389002561569214,
- "learning_rate": 3.3773164763166653e-06,
- "loss": 0.21903495490550995,
- "mean_token_accuracy": 0.9249413013458252,
- "num_tokens": 14666394.0,
- "step": 1646
- },
- {
- "epoch": 1.2515197568389058,
- "grad_norm": 1.7091087102890015,
- "learning_rate": 3.3753549708780736e-06,
- "loss": 0.37802332639694214,
- "mean_token_accuracy": 0.8644627332687378,
- "num_tokens": 14676214.0,
- "step": 1647
- },
- {
- "epoch": 1.2522796352583587,
- "grad_norm": 2.5717999935150146,
- "learning_rate": 3.3733928510816677e-06,
- "loss": 0.4236462116241455,
- "mean_token_accuracy": 0.8519910573959351,
- "num_tokens": 14681681.0,
- "step": 1648
- },
- {
- "epoch": 1.2530395136778116,
- "grad_norm": 1.958856463432312,
- "learning_rate": 3.3714301183045382e-06,
- "loss": 0.3923419415950775,
- "mean_token_accuracy": 0.8720202445983887,
- "num_tokens": 14690419.0,
- "step": 1649
- },
- {
- "epoch": 1.2537993920972643,
- "grad_norm": 1.5900038480758667,
- "learning_rate": 3.369466773924207e-06,
- "loss": 0.4182325601577759,
- "mean_token_accuracy": 0.8515387177467346,
- "num_tokens": 14699790.0,
- "step": 1650
- },
- {
- "epoch": 1.2545592705167175,
- "grad_norm": 1.260547161102295,
- "learning_rate": 3.3675028193186243e-06,
- "loss": 0.3915718197822571,
- "mean_token_accuracy": 0.8536830544471741,
- "num_tokens": 14717502.0,
- "step": 1651
- },
- {
- "epoch": 1.2553191489361701,
- "grad_norm": 1.8152283430099487,
- "learning_rate": 3.365538255866169e-06,
- "loss": 0.424524188041687,
- "mean_token_accuracy": 0.8434420824050903,
- "num_tokens": 14726591.0,
- "step": 1652
- },
- {
- "epoch": 1.256079027355623,
- "grad_norm": 1.3357285261154175,
- "learning_rate": 3.3635730849456484e-06,
- "loss": 0.2949739396572113,
- "mean_token_accuracy": 0.8868321180343628,
- "num_tokens": 14739911.0,
- "step": 1653
- },
- {
- "epoch": 1.256838905775076,
- "grad_norm": 1.1770358085632324,
- "learning_rate": 3.3616073079362925e-06,
- "loss": 0.29939576983451843,
- "mean_token_accuracy": 0.8923654556274414,
- "num_tokens": 14755521.0,
- "step": 1654
- },
- {
- "epoch": 1.2575987841945289,
- "grad_norm": 2.059162139892578,
- "learning_rate": 3.3596409262177633e-06,
- "loss": 0.4562555253505707,
- "mean_token_accuracy": 0.8585271239280701,
- "num_tokens": 14764173.0,
- "step": 1655
- },
- {
- "epoch": 1.2583586626139818,
- "grad_norm": 1.430752158164978,
- "learning_rate": 3.357673941170139e-06,
- "loss": 0.35301265120506287,
- "mean_token_accuracy": 0.8920517563819885,
- "num_tokens": 14775596.0,
- "step": 1656
- },
- {
- "epoch": 1.2591185410334347,
- "grad_norm": 1.6066302061080933,
- "learning_rate": 3.3557063541739283e-06,
- "loss": 0.41129636764526367,
- "mean_token_accuracy": 0.8512256145477295,
- "num_tokens": 14786289.0,
- "step": 1657
- },
- {
- "epoch": 1.2598784194528876,
- "grad_norm": 1.5471590757369995,
- "learning_rate": 3.353738166610058e-06,
- "loss": 0.3935067057609558,
- "mean_token_accuracy": 0.8514131903648376,
- "num_tokens": 14798672.0,
- "step": 1658
- },
- {
- "epoch": 1.2606382978723405,
- "grad_norm": 1.3455181121826172,
- "learning_rate": 3.35176937985988e-06,
- "loss": 0.3486790657043457,
- "mean_token_accuracy": 0.8644362688064575,
- "num_tokens": 14811603.0,
- "step": 1659
- },
- {
- "epoch": 1.2613981762917934,
- "grad_norm": 1.891432762145996,
- "learning_rate": 3.349799995305162e-06,
- "loss": 0.3325638175010681,
- "mean_token_accuracy": 0.8844645023345947,
- "num_tokens": 14819256.0,
- "step": 1660
- },
- {
- "epoch": 1.262158054711246,
- "grad_norm": 2.600614309310913,
- "learning_rate": 3.3478300143280946e-06,
- "loss": 0.30310919880867004,
- "mean_token_accuracy": 0.9103429317474365,
- "num_tokens": 14823706.0,
- "step": 1661
- },
- {
- "epoch": 1.2629179331306992,
- "grad_norm": 3.8636202812194824,
- "learning_rate": 3.3458594383112868e-06,
- "loss": 0.28377676010131836,
- "mean_token_accuracy": 0.9047091007232666,
- "num_tokens": 14826688.0,
- "step": 1662
- },
- {
- "epoch": 1.263677811550152,
- "grad_norm": 2.3100268840789795,
- "learning_rate": 3.343888268637765e-06,
- "loss": 0.4723394513130188,
- "mean_token_accuracy": 0.8306777477264404,
- "num_tokens": 14835471.0,
- "step": 1663
- },
- {
- "epoch": 1.2644376899696048,
- "grad_norm": 1.7582160234451294,
- "learning_rate": 3.341916506690971e-06,
- "loss": 0.48168784379959106,
- "mean_token_accuracy": 0.8281306028366089,
- "num_tokens": 14846513.0,
- "step": 1664
- },
- {
- "epoch": 1.2651975683890577,
- "grad_norm": 2.166055917739868,
- "learning_rate": 3.3399441538547638e-06,
- "loss": 0.4626024067401886,
- "mean_token_accuracy": 0.8377980589866638,
- "num_tokens": 14853408.0,
- "step": 1665
- },
- {
- "epoch": 1.2659574468085106,
- "grad_norm": 2.23038911819458,
- "learning_rate": 3.337971211513417e-06,
- "loss": 0.38434159755706787,
- "mean_token_accuracy": 0.8708412647247314,
- "num_tokens": 14859919.0,
- "step": 1666
- },
- {
- "epoch": 1.2667173252279635,
- "grad_norm": 2.092505693435669,
- "learning_rate": 3.3359976810516164e-06,
- "loss": 0.35072219371795654,
- "mean_token_accuracy": 0.8761640191078186,
- "num_tokens": 14865624.0,
- "step": 1667
- },
- {
- "epoch": 1.2674772036474165,
- "grad_norm": 1.8255130052566528,
- "learning_rate": 3.3340235638544633e-06,
- "loss": 0.4404270648956299,
- "mean_token_accuracy": 0.836356520652771,
- "num_tokens": 14874181.0,
- "step": 1668
- },
- {
- "epoch": 1.2682370820668694,
- "grad_norm": 1.9889036417007446,
- "learning_rate": 3.332048861307467e-06,
- "loss": 0.4199368357658386,
- "mean_token_accuracy": 0.8508217334747314,
- "num_tokens": 14882275.0,
- "step": 1669
- },
- {
- "epoch": 1.2689969604863223,
- "grad_norm": 4.050281047821045,
- "learning_rate": 3.330073574796551e-06,
- "loss": 0.4271625280380249,
- "mean_token_accuracy": 0.8471108675003052,
- "num_tokens": 14893633.0,
- "step": 1670
- },
- {
- "epoch": 1.2697568389057752,
- "grad_norm": 1.998838186264038,
- "learning_rate": 3.328097705708047e-06,
- "loss": 0.34743767976760864,
- "mean_token_accuracy": 0.8771528005599976,
- "num_tokens": 14899859.0,
- "step": 1671
- },
- {
- "epoch": 1.2705167173252279,
- "grad_norm": 1.7989062070846558,
- "learning_rate": 3.3261212554286977e-06,
- "loss": 0.5267184376716614,
- "mean_token_accuracy": 0.8323302268981934,
- "num_tokens": 14911131.0,
- "step": 1672
- },
- {
- "epoch": 1.2712765957446808,
- "grad_norm": 1.312070369720459,
- "learning_rate": 3.324144225345649e-06,
- "loss": 0.4675425887107849,
- "mean_token_accuracy": 0.8157106637954712,
- "num_tokens": 14928955.0,
- "step": 1673
- },
- {
- "epoch": 1.2720364741641337,
- "grad_norm": 2.0547919273376465,
- "learning_rate": 3.3221666168464584e-06,
- "loss": 0.33704331517219543,
- "mean_token_accuracy": 0.8621441125869751,
- "num_tokens": 14935536.0,
- "step": 1674
- },
- {
- "epoch": 1.2727963525835866,
- "grad_norm": 2.810413122177124,
- "learning_rate": 3.320188431319088e-06,
- "loss": 0.4007563292980194,
- "mean_token_accuracy": 0.8649672269821167,
- "num_tokens": 14940219.0,
- "step": 1675
- },
- {
- "epoch": 1.2735562310030395,
- "grad_norm": 1.3516674041748047,
- "learning_rate": 3.318209670151904e-06,
- "loss": 0.3457040786743164,
- "mean_token_accuracy": 0.8698287010192871,
- "num_tokens": 14952904.0,
- "step": 1676
- },
- {
- "epoch": 1.2743161094224924,
- "grad_norm": 2.440643310546875,
- "learning_rate": 3.3162303347336765e-06,
- "loss": 0.5195086002349854,
- "mean_token_accuracy": 0.8348199129104614,
- "num_tokens": 14958623.0,
- "step": 1677
- },
- {
- "epoch": 1.2750759878419453,
- "grad_norm": 1.3264343738555908,
- "learning_rate": 3.3142504264535808e-06,
- "loss": 0.2990425229072571,
- "mean_token_accuracy": 0.8961933851242065,
- "num_tokens": 14971494.0,
- "step": 1678
- },
- {
- "epoch": 1.2758358662613982,
- "grad_norm": 1.3106894493103027,
- "learning_rate": 3.3122699467011913e-06,
- "loss": 0.291853666305542,
- "mean_token_accuracy": 0.893449068069458,
- "num_tokens": 14985239.0,
- "step": 1679
- },
- {
- "epoch": 1.2765957446808511,
- "grad_norm": 2.5387396812438965,
- "learning_rate": 3.3102888968664857e-06,
- "loss": 0.4336916208267212,
- "mean_token_accuracy": 0.8447890877723694,
- "num_tokens": 14991453.0,
- "step": 1680
- },
- {
- "epoch": 1.2773556231003038,
- "grad_norm": 2.7052135467529297,
- "learning_rate": 3.308307278339842e-06,
- "loss": 0.3279378116130829,
- "mean_token_accuracy": 0.8935879468917847,
- "num_tokens": 14995428.0,
- "step": 1681
- },
- {
- "epoch": 1.278115501519757,
- "grad_norm": 1.6251261234283447,
- "learning_rate": 3.306325092512034e-06,
- "loss": 0.32066458463668823,
- "mean_token_accuracy": 0.8909799456596375,
- "num_tokens": 15004841.0,
- "step": 1682
- },
- {
- "epoch": 1.2788753799392096,
- "grad_norm": 2.3014605045318604,
- "learning_rate": 3.3043423407742374e-06,
- "loss": 0.3523373603820801,
- "mean_token_accuracy": 0.8810735940933228,
- "num_tokens": 15010742.0,
- "step": 1683
- },
- {
- "epoch": 1.2796352583586625,
- "grad_norm": 2.9563019275665283,
- "learning_rate": 3.3023590245180237e-06,
- "loss": 0.39715707302093506,
- "mean_token_accuracy": 0.8779881000518799,
- "num_tokens": 15015357.0,
- "step": 1684
- },
- {
- "epoch": 1.2803951367781155,
- "grad_norm": 1.5787957906723022,
- "learning_rate": 3.300375145135361e-06,
- "loss": 0.44630166888237,
- "mean_token_accuracy": 0.8400174975395203,
- "num_tokens": 15031360.0,
- "step": 1685
- },
- {
- "epoch": 1.2811550151975684,
- "grad_norm": 1.6753438711166382,
- "learning_rate": 3.2983907040186112e-06,
- "loss": 0.3235800862312317,
- "mean_token_accuracy": 0.8938044309616089,
- "num_tokens": 15040276.0,
- "step": 1686
- },
- {
- "epoch": 1.2819148936170213,
- "grad_norm": 1.7331148386001587,
- "learning_rate": 3.296405702560532e-06,
- "loss": 0.39061424136161804,
- "mean_token_accuracy": 0.8599754571914673,
- "num_tokens": 15049725.0,
- "step": 1687
- },
- {
- "epoch": 1.2826747720364742,
- "grad_norm": 2.2029430866241455,
- "learning_rate": 3.294420142154274e-06,
- "loss": 0.43598297238349915,
- "mean_token_accuracy": 0.8663698434829712,
- "num_tokens": 15058182.0,
- "step": 1688
- },
- {
- "epoch": 1.283434650455927,
- "grad_norm": 2.943964958190918,
- "learning_rate": 3.29243402419338e-06,
- "loss": 0.405210942029953,
- "mean_token_accuracy": 0.854996919631958,
- "num_tokens": 15062920.0,
- "step": 1689
- },
- {
- "epoch": 1.28419452887538,
- "grad_norm": 1.9343379735946655,
- "learning_rate": 3.2904473500717826e-06,
- "loss": 0.35011449456214905,
- "mean_token_accuracy": 0.8745867013931274,
- "num_tokens": 15070298.0,
- "step": 1690
- },
- {
- "epoch": 1.284954407294833,
- "grad_norm": 2.559859037399292,
- "learning_rate": 3.2884601211838087e-06,
- "loss": 0.38816407322883606,
- "mean_token_accuracy": 0.854763388633728,
- "num_tokens": 15075667.0,
- "step": 1691
- },
- {
- "epoch": 1.2857142857142856,
- "grad_norm": 1.4357839822769165,
- "learning_rate": 3.2864723389241697e-06,
- "loss": 0.4512745141983032,
- "mean_token_accuracy": 0.8398592472076416,
- "num_tokens": 15090291.0,
- "step": 1692
- },
- {
- "epoch": 1.2864741641337387,
- "grad_norm": 1.7643728256225586,
- "learning_rate": 3.284484004687969e-06,
- "loss": 0.3536742627620697,
- "mean_token_accuracy": 0.8726381063461304,
- "num_tokens": 15099325.0,
- "step": 1693
- },
- {
- "epoch": 1.2872340425531914,
- "grad_norm": 1.853173017501831,
- "learning_rate": 3.2824951198706958e-06,
- "loss": 0.36579740047454834,
- "mean_token_accuracy": 0.8988048434257507,
- "num_tokens": 15107090.0,
- "step": 1694
- },
- {
- "epoch": 1.2879939209726443,
- "grad_norm": 1.6526862382888794,
- "learning_rate": 3.280505685868226e-06,
- "loss": 0.3853636682033539,
- "mean_token_accuracy": 0.8743607997894287,
- "num_tokens": 15117818.0,
- "step": 1695
- },
- {
- "epoch": 1.2887537993920972,
- "grad_norm": 2.790398597717285,
- "learning_rate": 3.278515704076821e-06,
- "loss": 0.2707311511039734,
- "mean_token_accuracy": 0.9034668803215027,
- "num_tokens": 15121641.0,
- "step": 1696
- },
- {
- "epoch": 1.2895136778115501,
- "grad_norm": 1.69557523727417,
- "learning_rate": 3.276525175893126e-06,
- "loss": 0.3707970082759857,
- "mean_token_accuracy": 0.8617855906486511,
- "num_tokens": 15130414.0,
- "step": 1697
- },
- {
- "epoch": 1.290273556231003,
- "grad_norm": 1.1360478401184082,
- "learning_rate": 3.274534102714172e-06,
- "loss": 0.3368082344532013,
- "mean_token_accuracy": 0.8781654834747314,
- "num_tokens": 15148307.0,
- "step": 1698
- },
- {
- "epoch": 1.291033434650456,
- "grad_norm": 1.5894653797149658,
- "learning_rate": 3.272542485937369e-06,
- "loss": 0.3870658278465271,
- "mean_token_accuracy": 0.8830926418304443,
- "num_tokens": 15161841.0,
- "step": 1699
- },
- {
- "epoch": 1.2917933130699089,
- "grad_norm": 2.3735709190368652,
- "learning_rate": 3.270550326960511e-06,
- "loss": 0.3873991370201111,
- "mean_token_accuracy": 0.8729057908058167,
- "num_tokens": 15167733.0,
- "step": 1700
- },
- {
- "epoch": 1.2925531914893618,
- "grad_norm": 1.3739598989486694,
- "learning_rate": 3.268557627181772e-06,
- "loss": 0.30831626057624817,
- "mean_token_accuracy": 0.8695719242095947,
- "num_tokens": 15180861.0,
- "step": 1701
- },
- {
- "epoch": 1.2933130699088147,
- "grad_norm": 1.7526969909667969,
- "learning_rate": 3.2665643879997054e-06,
- "loss": 0.4716024398803711,
- "mean_token_accuracy": 0.8303275108337402,
- "num_tokens": 15191642.0,
- "step": 1702
- },
- {
- "epoch": 1.2940729483282674,
- "grad_norm": 2.7866084575653076,
- "learning_rate": 3.2645706108132426e-06,
- "loss": 0.33337634801864624,
- "mean_token_accuracy": 0.8790726065635681,
- "num_tokens": 15196038.0,
- "step": 1703
- },
- {
- "epoch": 1.2948328267477205,
- "grad_norm": 2.319765090942383,
- "learning_rate": 3.2625762970216944e-06,
- "loss": 0.3999716639518738,
- "mean_token_accuracy": 0.8693568706512451,
- "num_tokens": 15202075.0,
- "step": 1704
- },
- {
- "epoch": 1.2955927051671732,
- "grad_norm": 3.18292498588562,
- "learning_rate": 3.2605814480247454e-06,
- "loss": 0.4579541087150574,
- "mean_token_accuracy": 0.8516187071800232,
- "num_tokens": 15206886.0,
- "step": 1705
- },
- {
- "epoch": 1.296352583586626,
- "grad_norm": 2.1816933155059814,
- "learning_rate": 3.258586065222459e-06,
- "loss": 0.5198885202407837,
- "mean_token_accuracy": 0.8170592784881592,
- "num_tokens": 15214088.0,
- "step": 1706
- },
- {
- "epoch": 1.297112462006079,
- "grad_norm": 1.9076340198516846,
- "learning_rate": 3.2565901500152702e-06,
- "loss": 0.49752360582351685,
- "mean_token_accuracy": 0.8681992292404175,
- "num_tokens": 15226046.0,
- "step": 1707
- },
- {
- "epoch": 1.297872340425532,
- "grad_norm": 2.0223331451416016,
- "learning_rate": 3.2545937038039904e-06,
- "loss": 0.4515793025493622,
- "mean_token_accuracy": 0.8429619073867798,
- "num_tokens": 15234993.0,
- "step": 1708
- },
- {
- "epoch": 1.2986322188449848,
- "grad_norm": 2.5089669227600098,
- "learning_rate": 3.2525967279898017e-06,
- "loss": 0.43628376722335815,
- "mean_token_accuracy": 0.8493682146072388,
- "num_tokens": 15240575.0,
- "step": 1709
- },
- {
- "epoch": 1.2993920972644377,
- "grad_norm": 2.8347091674804688,
- "learning_rate": 3.2505992239742582e-06,
- "loss": 0.25112441182136536,
- "mean_token_accuracy": 0.908825159072876,
- "num_tokens": 15244085.0,
- "step": 1710
- },
- {
- "epoch": 1.3001519756838906,
- "grad_norm": 2.3157572746276855,
- "learning_rate": 3.2486011931592863e-06,
- "loss": 0.482818067073822,
- "mean_token_accuracy": 0.8305923938751221,
- "num_tokens": 15250377.0,
- "step": 1711
- },
- {
- "epoch": 1.3009118541033435,
- "grad_norm": 3.169052839279175,
- "learning_rate": 3.2466026369471804e-06,
- "loss": 0.3493242561817169,
- "mean_token_accuracy": 0.86913001537323,
- "num_tokens": 15255041.0,
- "step": 1712
- },
- {
- "epoch": 1.3016717325227964,
- "grad_norm": 1.4475083351135254,
- "learning_rate": 3.2446035567406033e-06,
- "loss": 0.4177290201187134,
- "mean_token_accuracy": 0.8497589826583862,
- "num_tokens": 15266946.0,
- "step": 1713
- },
- {
- "epoch": 1.3024316109422491,
- "grad_norm": 1.6473008394241333,
- "learning_rate": 3.2426039539425875e-06,
- "loss": 0.5272886753082275,
- "mean_token_accuracy": 0.8440133333206177,
- "num_tokens": 15279263.0,
- "step": 1714
- },
- {
- "epoch": 1.3031914893617023,
- "grad_norm": 2.3996543884277344,
- "learning_rate": 3.240603829956531e-06,
- "loss": 0.4272066652774811,
- "mean_token_accuracy": 0.8495640754699707,
- "num_tokens": 15285213.0,
- "step": 1715
- },
- {
- "epoch": 1.303951367781155,
- "grad_norm": 1.63034987449646,
- "learning_rate": 3.238603186186198e-06,
- "loss": 0.4034635126590729,
- "mean_token_accuracy": 0.8638584613800049,
- "num_tokens": 15295974.0,
- "step": 1716
- },
- {
- "epoch": 1.3047112462006079,
- "grad_norm": 2.153608798980713,
- "learning_rate": 3.2366020240357166e-06,
- "loss": 0.30712565779685974,
- "mean_token_accuracy": 0.8863866329193115,
- "num_tokens": 15302220.0,
- "step": 1717
- },
- {
- "epoch": 1.3054711246200608,
- "grad_norm": 2.9814558029174805,
- "learning_rate": 3.2346003449095803e-06,
- "loss": 0.3922840356826782,
- "mean_token_accuracy": 0.868030309677124,
- "num_tokens": 15306747.0,
- "step": 1718
- },
- {
- "epoch": 1.3062310030395137,
- "grad_norm": 3.3417985439300537,
- "learning_rate": 3.2325981502126434e-06,
- "loss": 0.30750396847724915,
- "mean_token_accuracy": 0.9065356850624084,
- "num_tokens": 15310309.0,
- "step": 1719
- },
- {
- "epoch": 1.3069908814589666,
- "grad_norm": 2.237682819366455,
- "learning_rate": 3.2305954413501252e-06,
- "loss": 0.35068294405937195,
- "mean_token_accuracy": 0.8887614011764526,
- "num_tokens": 15316463.0,
- "step": 1720
- },
- {
- "epoch": 1.3077507598784195,
- "grad_norm": 1.9526605606079102,
- "learning_rate": 3.228592219727602e-06,
- "loss": 0.42061835527420044,
- "mean_token_accuracy": 0.8456839323043823,
- "num_tokens": 15323984.0,
- "step": 1721
- },
- {
- "epoch": 1.3085106382978724,
- "grad_norm": 1.6454212665557861,
- "learning_rate": 3.226588486751012e-06,
- "loss": 0.5189976692199707,
- "mean_token_accuracy": 0.8187375068664551,
- "num_tokens": 15338807.0,
- "step": 1722
- },
- {
- "epoch": 1.3092705167173253,
- "grad_norm": 1.4521609544754028,
- "learning_rate": 3.2245842438266526e-06,
- "loss": 0.329673171043396,
- "mean_token_accuracy": 0.853867769241333,
- "num_tokens": 15350400.0,
- "step": 1723
- },
- {
- "epoch": 1.3100303951367782,
- "grad_norm": 1.8750989437103271,
- "learning_rate": 3.222579492361179e-06,
- "loss": 0.4635341167449951,
- "mean_token_accuracy": 0.8393422365188599,
- "num_tokens": 15360557.0,
- "step": 1724
- },
- {
- "epoch": 1.310790273556231,
- "grad_norm": 1.2728849649429321,
- "learning_rate": 3.220574233761603e-06,
- "loss": 0.3255572021007538,
- "mean_token_accuracy": 0.8989741802215576,
- "num_tokens": 15376548.0,
- "step": 1725
- },
- {
- "epoch": 1.3115501519756838,
- "grad_norm": 3.5155694484710693,
- "learning_rate": 3.2185684694352913e-06,
- "loss": 0.34204089641571045,
- "mean_token_accuracy": 0.8781906366348267,
- "num_tokens": 15380304.0,
- "step": 1726
- },
- {
- "epoch": 1.3123100303951367,
- "grad_norm": 2.059800148010254,
- "learning_rate": 3.216562200789968e-06,
- "loss": 0.36288338899612427,
- "mean_token_accuracy": 0.8595278263092041,
- "num_tokens": 15387653.0,
- "step": 1727
- },
- {
- "epoch": 1.3130699088145896,
- "grad_norm": 3.5388240814208984,
- "learning_rate": 3.214555429233707e-06,
- "loss": 0.5434849858283997,
- "mean_token_accuracy": 0.8074631690979004,
- "num_tokens": 15391662.0,
- "step": 1728
- },
- {
- "epoch": 1.3138297872340425,
- "grad_norm": 2.8595592975616455,
- "learning_rate": 3.2125481561749406e-06,
- "loss": 0.5113687515258789,
- "mean_token_accuracy": 0.8448649644851685,
- "num_tokens": 15397536.0,
- "step": 1729
- },
- {
- "epoch": 1.3145896656534954,
- "grad_norm": 2.50386905670166,
- "learning_rate": 3.210540383022449e-06,
- "loss": 0.5293697118759155,
- "mean_token_accuracy": 0.8096445798873901,
- "num_tokens": 15403478.0,
- "step": 1730
- },
- {
- "epoch": 1.3153495440729484,
- "grad_norm": 1.880035400390625,
- "learning_rate": 3.208532111185365e-06,
- "loss": 0.5344835519790649,
- "mean_token_accuracy": 0.8172965049743652,
- "num_tokens": 15413812.0,
- "step": 1731
- },
- {
- "epoch": 1.3161094224924013,
- "grad_norm": 1.3688768148422241,
- "learning_rate": 3.2065233420731717e-06,
- "loss": 0.2577427327632904,
- "mean_token_accuracy": 0.9142681360244751,
- "num_tokens": 15423583.0,
- "step": 1732
- },
- {
- "epoch": 1.3168693009118542,
- "grad_norm": 1.7945705652236938,
- "learning_rate": 3.2045140770956987e-06,
- "loss": 0.3983926773071289,
- "mean_token_accuracy": 0.8652000427246094,
- "num_tokens": 15432473.0,
- "step": 1733
- },
- {
- "epoch": 1.3176291793313069,
- "grad_norm": 1.8243350982666016,
- "learning_rate": 3.2025043176631283e-06,
- "loss": 0.48644185066223145,
- "mean_token_accuracy": 0.8319193124771118,
- "num_tokens": 15445463.0,
- "step": 1734
- },
- {
- "epoch": 1.31838905775076,
- "grad_norm": 2.000094175338745,
- "learning_rate": 3.2004940651859844e-06,
- "loss": 0.43567317724227905,
- "mean_token_accuracy": 0.8857482671737671,
- "num_tokens": 15452382.0,
- "step": 1735
- },
- {
- "epoch": 1.3191489361702127,
- "grad_norm": 2.379974365234375,
- "learning_rate": 3.198483321075141e-06,
- "loss": 0.5153506398200989,
- "mean_token_accuracy": 0.8295865654945374,
- "num_tokens": 15458740.0,
- "step": 1736
- },
- {
- "epoch": 1.3199088145896656,
- "grad_norm": 1.6564184427261353,
- "learning_rate": 3.196472086741815e-06,
- "loss": 0.508430540561676,
- "mean_token_accuracy": 0.8181540369987488,
- "num_tokens": 15471844.0,
- "step": 1737
- },
- {
- "epoch": 1.3206686930091185,
- "grad_norm": 2.006925344467163,
- "learning_rate": 3.194460363597569e-06,
- "loss": 0.34542378783226013,
- "mean_token_accuracy": 0.8827437162399292,
- "num_tokens": 15478414.0,
- "step": 1738
- },
- {
- "epoch": 1.3214285714285714,
- "grad_norm": 3.589045763015747,
- "learning_rate": 3.192448153054306e-06,
- "loss": 0.4385780096054077,
- "mean_token_accuracy": 0.8480287790298462,
- "num_tokens": 15482063.0,
- "step": 1739
- },
- {
- "epoch": 1.3221884498480243,
- "grad_norm": 1.9797427654266357,
- "learning_rate": 3.190435456524275e-06,
- "loss": 0.4330386519432068,
- "mean_token_accuracy": 0.8458058834075928,
- "num_tokens": 15489803.0,
- "step": 1740
- },
- {
- "epoch": 1.3229483282674772,
- "grad_norm": 1.4777411222457886,
- "learning_rate": 3.188422275420063e-06,
- "loss": 0.3997895419597626,
- "mean_token_accuracy": 0.8639512062072754,
- "num_tokens": 15501103.0,
- "step": 1741
- },
- {
- "epoch": 1.3237082066869301,
- "grad_norm": 2.882338523864746,
- "learning_rate": 3.186408611154597e-06,
- "loss": 0.2336438149213791,
- "mean_token_accuracy": 0.9176726937294006,
- "num_tokens": 15504854.0,
- "step": 1742
- },
- {
- "epoch": 1.324468085106383,
- "grad_norm": 2.353503704071045,
- "learning_rate": 3.184394465141146e-06,
- "loss": 0.4107069671154022,
- "mean_token_accuracy": 0.8677014112472534,
- "num_tokens": 15510662.0,
- "step": 1743
- },
- {
- "epoch": 1.325227963525836,
- "grad_norm": 2.6551976203918457,
- "learning_rate": 3.1823798387933134e-06,
- "loss": 0.3862302899360657,
- "mean_token_accuracy": 0.8819445371627808,
- "num_tokens": 15515681.0,
- "step": 1744
- },
- {
- "epoch": 1.3259878419452886,
- "grad_norm": 1.478572964668274,
- "learning_rate": 3.180364733525043e-06,
- "loss": 0.43972986936569214,
- "mean_token_accuracy": 0.832388162612915,
- "num_tokens": 15529542.0,
- "step": 1745
- },
- {
- "epoch": 1.3267477203647418,
- "grad_norm": 1.6003550291061401,
- "learning_rate": 3.178349150750612e-06,
- "loss": 0.3404902219772339,
- "mean_token_accuracy": 0.8764007091522217,
- "num_tokens": 15538865.0,
- "step": 1746
- },
- {
- "epoch": 1.3275075987841944,
- "grad_norm": 2.130689859390259,
- "learning_rate": 3.1763330918846347e-06,
- "loss": 0.383136510848999,
- "mean_token_accuracy": 0.8652247190475464,
- "num_tokens": 15545567.0,
- "step": 1747
- },
- {
- "epoch": 1.3282674772036474,
- "grad_norm": 2.395937442779541,
- "learning_rate": 3.1743165583420586e-06,
- "loss": 0.3870319128036499,
- "mean_token_accuracy": 0.8618065118789673,
- "num_tokens": 15551090.0,
- "step": 1748
- },
- {
- "epoch": 1.3290273556231003,
- "grad_norm": 2.0841057300567627,
- "learning_rate": 3.1722995515381644e-06,
- "loss": 0.4838739335536957,
- "mean_token_accuracy": 0.8548711538314819,
- "num_tokens": 15558913.0,
- "step": 1749
- },
- {
- "epoch": 1.3297872340425532,
- "grad_norm": 1.4237847328186035,
- "learning_rate": 3.1702820728885657e-06,
- "loss": 0.40350261330604553,
- "mean_token_accuracy": 0.858984649181366,
- "num_tokens": 15572045.0,
- "step": 1750
- },
- {
- "epoch": 1.330547112462006,
- "grad_norm": 2.2641282081604004,
- "learning_rate": 3.1682641238092064e-06,
- "loss": 0.5117636919021606,
- "mean_token_accuracy": 0.8078924417495728,
- "num_tokens": 15579753.0,
- "step": 1751
- },
- {
- "epoch": 1.331306990881459,
- "grad_norm": 1.0010309219360352,
- "learning_rate": 3.1662457057163603e-06,
- "loss": 0.3220978379249573,
- "mean_token_accuracy": 0.8786559104919434,
- "num_tokens": 15602823.0,
- "step": 1752
- },
- {
- "epoch": 1.332066869300912,
- "grad_norm": 2.441230535507202,
- "learning_rate": 3.164226820026632e-06,
- "loss": 0.37529727816581726,
- "mean_token_accuracy": 0.8886898756027222,
- "num_tokens": 15608473.0,
- "step": 1753
- },
- {
- "epoch": 1.3328267477203648,
- "grad_norm": 1.2960991859436035,
- "learning_rate": 3.162207468156952e-06,
- "loss": 0.3393767476081848,
- "mean_token_accuracy": 0.8766993284225464,
- "num_tokens": 15620893.0,
- "step": 1754
- },
- {
- "epoch": 1.3335866261398177,
- "grad_norm": 2.0806996822357178,
- "learning_rate": 3.16018765152458e-06,
- "loss": 0.38034507632255554,
- "mean_token_accuracy": 0.8854838609695435,
- "num_tokens": 15627068.0,
- "step": 1755
- },
- {
- "epoch": 1.3343465045592704,
- "grad_norm": 1.4316699504852295,
- "learning_rate": 3.1581673715471007e-06,
- "loss": 0.3665890693664551,
- "mean_token_accuracy": 0.870919406414032,
- "num_tokens": 15641070.0,
- "step": 1756
- },
- {
- "epoch": 1.3351063829787235,
- "grad_norm": 1.3466622829437256,
- "learning_rate": 3.1561466296424247e-06,
- "loss": 0.37387198209762573,
- "mean_token_accuracy": 0.8633951544761658,
- "num_tokens": 15653777.0,
- "step": 1757
- },
- {
- "epoch": 1.3358662613981762,
- "grad_norm": 1.8108628988265991,
- "learning_rate": 3.154125427228786e-06,
- "loss": 0.38428938388824463,
- "mean_token_accuracy": 0.85402512550354,
- "num_tokens": 15662494.0,
- "step": 1758
- },
- {
- "epoch": 1.3366261398176291,
- "grad_norm": 1.3221700191497803,
- "learning_rate": 3.152103765724743e-06,
- "loss": 0.42825520038604736,
- "mean_token_accuracy": 0.8435465097427368,
- "num_tokens": 15677552.0,
- "step": 1759
- },
- {
- "epoch": 1.337386018237082,
- "grad_norm": 2.6247692108154297,
- "learning_rate": 3.150081646549174e-06,
- "loss": 0.36186715960502625,
- "mean_token_accuracy": 0.8767328262329102,
- "num_tokens": 15682103.0,
- "step": 1760
- },
- {
- "epoch": 1.338145896656535,
- "grad_norm": 2.1469814777374268,
- "learning_rate": 3.1480590711212823e-06,
- "loss": 0.3734385669231415,
- "mean_token_accuracy": 0.8711104393005371,
- "num_tokens": 15689182.0,
- "step": 1761
- },
- {
- "epoch": 1.3389057750759878,
- "grad_norm": 2.1702585220336914,
- "learning_rate": 3.1460360408605866e-06,
- "loss": 0.2795315086841583,
- "mean_token_accuracy": 0.8892190456390381,
- "num_tokens": 15694272.0,
- "step": 1762
- },
- {
- "epoch": 1.3396656534954408,
- "grad_norm": 1.918797254562378,
- "learning_rate": 3.144012557186931e-06,
- "loss": 0.4363473057746887,
- "mean_token_accuracy": 0.8573931455612183,
- "num_tokens": 15703532.0,
- "step": 1763
- },
- {
- "epoch": 1.3404255319148937,
- "grad_norm": 2.5579960346221924,
- "learning_rate": 3.14198862152047e-06,
- "loss": 0.406247079372406,
- "mean_token_accuracy": 0.8617593050003052,
- "num_tokens": 15708652.0,
- "step": 1764
- },
- {
- "epoch": 1.3411854103343466,
- "grad_norm": 2.3617870807647705,
- "learning_rate": 3.1399642352816825e-06,
- "loss": 0.2839522659778595,
- "mean_token_accuracy": 0.8996064066886902,
- "num_tokens": 15713598.0,
- "step": 1765
- },
- {
- "epoch": 1.3419452887537995,
- "grad_norm": 1.248302936553955,
- "learning_rate": 3.1379393998913594e-06,
- "loss": 0.2922290861606598,
- "mean_token_accuracy": 0.8948773145675659,
- "num_tokens": 15726693.0,
- "step": 1766
- },
- {
- "epoch": 1.3427051671732522,
- "grad_norm": 2.143599510192871,
- "learning_rate": 3.135914116770609e-06,
- "loss": 0.32176223397254944,
- "mean_token_accuracy": 0.8808754682540894,
- "num_tokens": 15731901.0,
- "step": 1767
- },
- {
- "epoch": 1.3434650455927053,
- "grad_norm": 4.226369857788086,
- "learning_rate": 3.1338883873408517e-06,
- "loss": 0.4682556390762329,
- "mean_token_accuracy": 0.8566025495529175,
- "num_tokens": 15735029.0,
- "step": 1768
- },
- {
- "epoch": 1.344224924012158,
- "grad_norm": 1.8695988655090332,
- "learning_rate": 3.1318622130238237e-06,
- "loss": 0.4297192394733429,
- "mean_token_accuracy": 0.8419148921966553,
- "num_tokens": 15744310.0,
- "step": 1769
- },
- {
- "epoch": 1.344984802431611,
- "grad_norm": 2.4321305751800537,
- "learning_rate": 3.1298355952415714e-06,
- "loss": 0.36076444387435913,
- "mean_token_accuracy": 0.8826035261154175,
- "num_tokens": 15749337.0,
- "step": 1770
- },
- {
- "epoch": 1.3457446808510638,
- "grad_norm": 1.5500011444091797,
- "learning_rate": 3.127808535416454e-06,
- "loss": 0.48664039373397827,
- "mean_token_accuracy": 0.844344437122345,
- "num_tokens": 15761096.0,
- "step": 1771
- },
- {
- "epoch": 1.3465045592705167,
- "grad_norm": 2.1498289108276367,
- "learning_rate": 3.1257810349711388e-06,
- "loss": 0.4841752052307129,
- "mean_token_accuracy": 0.8324567079544067,
- "num_tokens": 15768646.0,
- "step": 1772
- },
- {
- "epoch": 1.3472644376899696,
- "grad_norm": 1.2995187044143677,
- "learning_rate": 3.1237530953286046e-06,
- "loss": 0.492019385099411,
- "mean_token_accuracy": 0.8285316228866577,
- "num_tokens": 15788401.0,
- "step": 1773
- },
- {
- "epoch": 1.3480243161094225,
- "grad_norm": 2.324819803237915,
- "learning_rate": 3.121724717912138e-06,
- "loss": 0.33166298270225525,
- "mean_token_accuracy": 0.8856451511383057,
- "num_tokens": 15794097.0,
- "step": 1774
- },
- {
- "epoch": 1.3487841945288754,
- "grad_norm": 1.9611430168151855,
- "learning_rate": 3.11969590414533e-06,
- "loss": 0.3974284827709198,
- "mean_token_accuracy": 0.8751305937767029,
- "num_tokens": 15801065.0,
- "step": 1775
- },
- {
- "epoch": 1.3495440729483283,
- "grad_norm": 1.7084417343139648,
- "learning_rate": 3.1176666554520827e-06,
- "loss": 0.38729435205459595,
- "mean_token_accuracy": 0.8680770397186279,
- "num_tokens": 15810353.0,
- "step": 1776
- },
- {
- "epoch": 1.3503039513677813,
- "grad_norm": 1.7616240978240967,
- "learning_rate": 3.1156369732566006e-06,
- "loss": 0.4271578788757324,
- "mean_token_accuracy": 0.843730092048645,
- "num_tokens": 15821889.0,
- "step": 1777
- },
- {
- "epoch": 1.351063829787234,
- "grad_norm": 2.030747413635254,
- "learning_rate": 3.113606858983391e-06,
- "loss": 0.361891508102417,
- "mean_token_accuracy": 0.8522407412528992,
- "num_tokens": 15830800.0,
- "step": 1778
- },
- {
- "epoch": 1.3518237082066868,
- "grad_norm": 1.4842649698257446,
- "learning_rate": 3.1115763140572686e-06,
- "loss": 0.466334730386734,
- "mean_token_accuracy": 0.8433995246887207,
- "num_tokens": 15849422.0,
- "step": 1779
- },
- {
- "epoch": 1.3525835866261398,
- "grad_norm": 1.6595379114151,
- "learning_rate": 3.109545339903347e-06,
- "loss": 0.4622533321380615,
- "mean_token_accuracy": 0.8526314496994019,
- "num_tokens": 15860431.0,
- "step": 1780
- },
- {
- "epoch": 1.3533434650455927,
- "grad_norm": 2.1235809326171875,
- "learning_rate": 3.107513937947041e-06,
- "loss": 0.42694270610809326,
- "mean_token_accuracy": 0.854864239692688,
- "num_tokens": 15869044.0,
- "step": 1781
- },
- {
- "epoch": 1.3541033434650456,
- "grad_norm": 1.5889263153076172,
- "learning_rate": 3.1054821096140675e-06,
- "loss": 0.41838499903678894,
- "mean_token_accuracy": 0.8671513795852661,
- "num_tokens": 15878598.0,
- "step": 1782
- },
- {
- "epoch": 1.3548632218844985,
- "grad_norm": 2.2261741161346436,
- "learning_rate": 3.1034498563304435e-06,
- "loss": 0.4045066237449646,
- "mean_token_accuracy": 0.843826949596405,
- "num_tokens": 15885167.0,
- "step": 1783
- },
- {
- "epoch": 1.3556231003039514,
- "grad_norm": 2.2569329738616943,
- "learning_rate": 3.1014171795224794e-06,
- "loss": 0.36677104234695435,
- "mean_token_accuracy": 0.8747833967208862,
- "num_tokens": 15891308.0,
- "step": 1784
- },
- {
- "epoch": 1.3563829787234043,
- "grad_norm": 2.1027088165283203,
- "learning_rate": 3.0993840806167884e-06,
- "loss": 0.437946081161499,
- "mean_token_accuracy": 0.8370785117149353,
- "num_tokens": 15898952.0,
- "step": 1785
- },
- {
- "epoch": 1.3571428571428572,
- "grad_norm": 1.8768929243087769,
- "learning_rate": 3.0973505610402767e-06,
- "loss": 0.4201734662055969,
- "mean_token_accuracy": 0.8474810123443604,
- "num_tokens": 15907340.0,
- "step": 1786
- },
- {
- "epoch": 1.35790273556231,
- "grad_norm": 1.7216229438781738,
- "learning_rate": 3.0953166222201474e-06,
- "loss": 0.4225231409072876,
- "mean_token_accuracy": 0.8437749147415161,
- "num_tokens": 15917852.0,
- "step": 1787
- },
- {
- "epoch": 1.358662613981763,
- "grad_norm": 2.6256966590881348,
- "learning_rate": 3.093282265583895e-06,
- "loss": 0.435439795255661,
- "mean_token_accuracy": 0.8452040553092957,
- "num_tokens": 15923739.0,
- "step": 1788
- },
- {
- "epoch": 1.3594224924012157,
- "grad_norm": 2.90028977394104,
- "learning_rate": 3.0912474925593124e-06,
- "loss": 0.3730456829071045,
- "mean_token_accuracy": 0.8766646385192871,
- "num_tokens": 15927943.0,
- "step": 1789
- },
- {
- "epoch": 1.3601823708206686,
- "grad_norm": 1.5966626405715942,
- "learning_rate": 3.0892123045744787e-06,
- "loss": 0.42150455713272095,
- "mean_token_accuracy": 0.854656457901001,
- "num_tokens": 15939922.0,
- "step": 1790
- },
- {
- "epoch": 1.3609422492401215,
- "grad_norm": 1.8069748878479004,
- "learning_rate": 3.0871767030577686e-06,
- "loss": 0.4954872131347656,
- "mean_token_accuracy": 0.8289790153503418,
- "num_tokens": 15950095.0,
- "step": 1791
- },
- {
- "epoch": 1.3617021276595744,
- "grad_norm": 2.0855250358581543,
- "learning_rate": 3.085140689437846e-06,
- "loss": 0.41999945044517517,
- "mean_token_accuracy": 0.8517382144927979,
- "num_tokens": 15957972.0,
- "step": 1792
- },
- {
- "epoch": 1.3624620060790273,
- "grad_norm": 2.108659267425537,
- "learning_rate": 3.0831042651436634e-06,
- "loss": 0.3668023645877838,
- "mean_token_accuracy": 0.8710855841636658,
- "num_tokens": 15965614.0,
- "step": 1793
- },
- {
- "epoch": 1.3632218844984803,
- "grad_norm": 1.3799632787704468,
- "learning_rate": 3.0810674316044602e-06,
- "loss": 0.351409375667572,
- "mean_token_accuracy": 0.870837390422821,
- "num_tokens": 15978854.0,
- "step": 1794
- },
- {
- "epoch": 1.3639817629179332,
- "grad_norm": 1.540397047996521,
- "learning_rate": 3.0790301902497664e-06,
- "loss": 0.403600811958313,
- "mean_token_accuracy": 0.8485002517700195,
- "num_tokens": 15993324.0,
- "step": 1795
- },
- {
- "epoch": 1.364741641337386,
- "grad_norm": 1.946882963180542,
- "learning_rate": 3.076992542509396e-06,
- "loss": 0.40118327736854553,
- "mean_token_accuracy": 0.8607497811317444,
- "num_tokens": 16001937.0,
- "step": 1796
- },
- {
- "epoch": 1.365501519756839,
- "grad_norm": 2.0464305877685547,
- "learning_rate": 3.0749544898134487e-06,
- "loss": 0.31742292642593384,
- "mean_token_accuracy": 0.8878391981124878,
- "num_tokens": 16009277.0,
- "step": 1797
- },
- {
- "epoch": 1.3662613981762917,
- "grad_norm": 2.091754913330078,
- "learning_rate": 3.072916033592307e-06,
- "loss": 0.31580421328544617,
- "mean_token_accuracy": 0.8875244855880737,
- "num_tokens": 16015756.0,
- "step": 1798
- },
- {
- "epoch": 1.3670212765957448,
- "grad_norm": 3.4449212551116943,
- "learning_rate": 3.0708771752766397e-06,
- "loss": 0.4692591726779938,
- "mean_token_accuracy": 0.8456202149391174,
- "num_tokens": 16019912.0,
- "step": 1799
- },
- {
- "epoch": 1.3677811550151975,
- "grad_norm": 1.600419521331787,
- "learning_rate": 3.068837916297396e-06,
- "loss": 0.40389442443847656,
- "mean_token_accuracy": 0.8378961086273193,
- "num_tokens": 16032637.0,
- "step": 1800
- },
- {
- "epoch": 1.3685410334346504,
- "grad_norm": 1.5282686948776245,
- "learning_rate": 3.0667982580858047e-06,
- "loss": 0.379841685295105,
- "mean_token_accuracy": 0.8752143383026123,
- "num_tokens": 16045205.0,
- "step": 1801
- },
- {
- "epoch": 1.3693009118541033,
- "grad_norm": 2.486079454421997,
- "learning_rate": 3.0647582020733773e-06,
- "loss": 0.41060030460357666,
- "mean_token_accuracy": 0.8575131893157959,
- "num_tokens": 16051189.0,
- "step": 1802
- },
- {
- "epoch": 1.3700607902735562,
- "grad_norm": 1.9458621740341187,
- "learning_rate": 3.062717749691904e-06,
- "loss": 0.4442213773727417,
- "mean_token_accuracy": 0.8451495170593262,
- "num_tokens": 16059700.0,
- "step": 1803
- },
- {
- "epoch": 1.3708206686930091,
- "grad_norm": 1.4333001375198364,
- "learning_rate": 3.0606769023734535e-06,
- "loss": 0.39132001996040344,
- "mean_token_accuracy": 0.8609901666641235,
- "num_tokens": 16072458.0,
- "step": 1804
- },
- {
- "epoch": 1.371580547112462,
- "grad_norm": 1.490355372428894,
- "learning_rate": 3.0586356615503693e-06,
- "loss": 0.4108564257621765,
- "mean_token_accuracy": 0.8871046304702759,
- "num_tokens": 16083142.0,
- "step": 1805
- },
- {
- "epoch": 1.372340425531915,
- "grad_norm": 1.7765129804611206,
- "learning_rate": 3.056594028655274e-06,
- "loss": 0.3850266635417938,
- "mean_token_accuracy": 0.8923365473747253,
- "num_tokens": 16092519.0,
- "step": 1806
- },
- {
- "epoch": 1.3731003039513678,
- "grad_norm": 1.955661416053772,
- "learning_rate": 3.0545520051210637e-06,
- "loss": 0.4665378928184509,
- "mean_token_accuracy": 0.837419867515564,
- "num_tokens": 16100618.0,
- "step": 1807
- },
- {
- "epoch": 1.3738601823708207,
- "grad_norm": 3.259265422821045,
- "learning_rate": 3.052509592380909e-06,
- "loss": 0.24722981452941895,
- "mean_token_accuracy": 0.9106054306030273,
- "num_tokens": 16103836.0,
- "step": 1808
- },
- {
- "epoch": 1.3746200607902734,
- "grad_norm": 1.7995736598968506,
- "learning_rate": 3.050466791868254e-06,
- "loss": 0.4982220530509949,
- "mean_token_accuracy": 0.8298169374465942,
- "num_tokens": 16114727.0,
- "step": 1809
- },
- {
- "epoch": 1.3753799392097266,
- "grad_norm": 1.9643093347549438,
- "learning_rate": 3.048423605016815e-06,
- "loss": 0.5076829195022583,
- "mean_token_accuracy": 0.8303098678588867,
- "num_tokens": 16129491.0,
- "step": 1810
- },
- {
- "epoch": 1.3761398176291793,
- "grad_norm": 3.505594491958618,
- "learning_rate": 3.0463800332605787e-06,
- "loss": 0.27466052770614624,
- "mean_token_accuracy": 0.9018045663833618,
- "num_tokens": 16132640.0,
- "step": 1811
- },
- {
- "epoch": 1.3768996960486322,
- "grad_norm": 1.798437237739563,
- "learning_rate": 3.0443360780338034e-06,
- "loss": 0.4004853069782257,
- "mean_token_accuracy": 0.8569544553756714,
- "num_tokens": 16143317.0,
- "step": 1812
- },
- {
- "epoch": 1.377659574468085,
- "grad_norm": 2.276740789413452,
- "learning_rate": 3.042291740771014e-06,
- "loss": 0.3823797106742859,
- "mean_token_accuracy": 0.8764113783836365,
- "num_tokens": 16148898.0,
- "step": 1813
- },
- {
- "epoch": 1.378419452887538,
- "grad_norm": 2.5051357746124268,
- "learning_rate": 3.0402470229070057e-06,
- "loss": 0.40365856885910034,
- "mean_token_accuracy": 0.8809891939163208,
- "num_tokens": 16153815.0,
- "step": 1814
- },
- {
- "epoch": 1.3791793313069909,
- "grad_norm": 1.2379236221313477,
- "learning_rate": 3.03820192587684e-06,
- "loss": 0.3955119848251343,
- "mean_token_accuracy": 0.8536627292633057,
- "num_tokens": 16167783.0,
- "step": 1815
- },
- {
- "epoch": 1.3799392097264438,
- "grad_norm": 2.2286343574523926,
- "learning_rate": 3.036156451115846e-06,
- "loss": 0.39647501707077026,
- "mean_token_accuracy": 0.8621993064880371,
- "num_tokens": 16174707.0,
- "step": 1816
- },
- {
- "epoch": 1.3806990881458967,
- "grad_norm": 1.884639024734497,
- "learning_rate": 3.034110600059616e-06,
- "loss": 0.31612110137939453,
- "mean_token_accuracy": 0.8942475318908691,
- "num_tokens": 16181919.0,
- "step": 1817
- },
- {
- "epoch": 1.3814589665653496,
- "grad_norm": 1.891312599182129,
- "learning_rate": 3.0320643741440052e-06,
- "loss": 0.46209126710891724,
- "mean_token_accuracy": 0.8374713659286499,
- "num_tokens": 16189276.0,
- "step": 1818
- },
- {
- "epoch": 1.3822188449848025,
- "grad_norm": 2.507478713989258,
- "learning_rate": 3.0300177748051375e-06,
- "loss": 0.37601593136787415,
- "mean_token_accuracy": 0.8633589148521423,
- "num_tokens": 16194346.0,
- "step": 1819
- },
- {
- "epoch": 1.3829787234042552,
- "grad_norm": 1.5046696662902832,
- "learning_rate": 3.0279708034793907e-06,
- "loss": 0.3284982144832611,
- "mean_token_accuracy": 0.8792630434036255,
- "num_tokens": 16205457.0,
- "step": 1820
- },
- {
- "epoch": 1.3837386018237083,
- "grad_norm": 2.4244449138641357,
- "learning_rate": 3.025923461603412e-06,
- "loss": 0.40939009189605713,
- "mean_token_accuracy": 0.8596426248550415,
- "num_tokens": 16211866.0,
- "step": 1821
- },
- {
- "epoch": 1.384498480243161,
- "grad_norm": 2.8656933307647705,
- "learning_rate": 3.0238757506141013e-06,
- "loss": 0.4397110044956207,
- "mean_token_accuracy": 0.8597331047058105,
- "num_tokens": 16216607.0,
- "step": 1822
- },
- {
- "epoch": 1.385258358662614,
- "grad_norm": 2.0718610286712646,
- "learning_rate": 3.0218276719486245e-06,
- "loss": 0.49057573080062866,
- "mean_token_accuracy": 0.8325331211090088,
- "num_tokens": 16224014.0,
- "step": 1823
- },
- {
- "epoch": 1.3860182370820668,
- "grad_norm": 1.054450273513794,
- "learning_rate": 3.019779227044398e-06,
- "loss": 0.3758106827735901,
- "mean_token_accuracy": 0.8689473867416382,
- "num_tokens": 16248627.0,
- "step": 1824
- },
- {
- "epoch": 1.3867781155015197,
- "grad_norm": 2.1115148067474365,
- "learning_rate": 3.0177304173391038e-06,
- "loss": 0.502967119216919,
- "mean_token_accuracy": 0.823198676109314,
- "num_tokens": 16256255.0,
- "step": 1825
- },
- {
- "epoch": 1.3875379939209727,
- "grad_norm": 2.207277297973633,
- "learning_rate": 3.015681244270672e-06,
- "loss": 0.3458971083164215,
- "mean_token_accuracy": 0.8930196762084961,
- "num_tokens": 16261823.0,
- "step": 1826
- },
- {
- "epoch": 1.3882978723404256,
- "grad_norm": 1.289669156074524,
- "learning_rate": 3.0136317092772923e-06,
- "loss": 0.4422765374183655,
- "mean_token_accuracy": 0.8358346819877625,
- "num_tokens": 16280659.0,
- "step": 1827
- },
- {
- "epoch": 1.3890577507598785,
- "grad_norm": 2.233865737915039,
- "learning_rate": 3.0115818137974066e-06,
- "loss": 0.3643006384372711,
- "mean_token_accuracy": 0.8682862520217896,
- "num_tokens": 16286356.0,
- "step": 1828
- },
- {
- "epoch": 1.3898176291793314,
- "grad_norm": 1.0950042009353638,
- "learning_rate": 3.0095315592697126e-06,
- "loss": 0.34712421894073486,
- "mean_token_accuracy": 0.8578766584396362,
- "num_tokens": 16307298.0,
- "step": 1829
- },
- {
- "epoch": 1.3905775075987843,
- "grad_norm": 1.1708037853240967,
- "learning_rate": 3.007480947133155e-06,
- "loss": 0.33152541518211365,
- "mean_token_accuracy": 0.894973874092102,
- "num_tokens": 16323232.0,
- "step": 1830
- },
- {
- "epoch": 1.391337386018237,
- "grad_norm": 1.2226970195770264,
- "learning_rate": 3.0054299788269343e-06,
- "loss": 0.3915635943412781,
- "mean_token_accuracy": 0.8575779795646667,
- "num_tokens": 16339273.0,
- "step": 1831
- },
- {
- "epoch": 1.39209726443769,
- "grad_norm": 1.2226042747497559,
- "learning_rate": 3.0033786557904982e-06,
- "loss": 0.45846253633499146,
- "mean_token_accuracy": 0.8290432691574097,
- "num_tokens": 16360145.0,
- "step": 1832
- },
- {
- "epoch": 1.3928571428571428,
- "grad_norm": 2.0117406845092773,
- "learning_rate": 3.001326979463545e-06,
- "loss": 0.3837882876396179,
- "mean_token_accuracy": 0.8941739797592163,
- "num_tokens": 16366602.0,
- "step": 1833
- },
- {
- "epoch": 1.3936170212765957,
- "grad_norm": 1.8419997692108154,
- "learning_rate": 2.9992749512860177e-06,
- "loss": 0.40777021646499634,
- "mean_token_accuracy": 0.854655385017395,
- "num_tokens": 16375611.0,
- "step": 1834
- },
- {
- "epoch": 1.3943768996960486,
- "grad_norm": 1.9405122995376587,
- "learning_rate": 2.9972225726981114e-06,
- "loss": 0.46685922145843506,
- "mean_token_accuracy": 0.8493201732635498,
- "num_tokens": 16384878.0,
- "step": 1835
- },
- {
- "epoch": 1.3951367781155015,
- "grad_norm": 1.2425674200057983,
- "learning_rate": 2.995169845140264e-06,
- "loss": 0.394692063331604,
- "mean_token_accuracy": 0.851348876953125,
- "num_tokens": 16404452.0,
- "step": 1836
- },
- {
- "epoch": 1.3958966565349544,
- "grad_norm": 1.2215365171432495,
- "learning_rate": 2.9931167700531575e-06,
- "loss": 0.31412452459335327,
- "mean_token_accuracy": 0.882760763168335,
- "num_tokens": 16419358.0,
- "step": 1837
- },
- {
- "epoch": 1.3966565349544073,
- "grad_norm": 1.912168025970459,
- "learning_rate": 2.9910633488777198e-06,
- "loss": 0.5065487623214722,
- "mean_token_accuracy": 0.8524355292320251,
- "num_tokens": 16430418.0,
- "step": 1838
- },
- {
- "epoch": 1.3974164133738602,
- "grad_norm": 2.2173948287963867,
- "learning_rate": 2.989009583055121e-06,
- "loss": 0.4290938377380371,
- "mean_token_accuracy": 0.8381836414337158,
- "num_tokens": 16438267.0,
- "step": 1839
- },
- {
- "epoch": 1.3981762917933132,
- "grad_norm": 1.8293484449386597,
- "learning_rate": 2.9869554740267726e-06,
- "loss": 0.41683733463287354,
- "mean_token_accuracy": 0.8548779487609863,
- "num_tokens": 16447382.0,
- "step": 1840
- },
- {
- "epoch": 1.398936170212766,
- "grad_norm": 1.835015892982483,
- "learning_rate": 2.9849010232343274e-06,
- "loss": 0.5080599784851074,
- "mean_token_accuracy": 0.8193596601486206,
- "num_tokens": 16458541.0,
- "step": 1841
- },
- {
- "epoch": 1.3996960486322187,
- "grad_norm": 2.031339645385742,
- "learning_rate": 2.982846232119679e-06,
- "loss": 0.5168882012367249,
- "mean_token_accuracy": 0.8525956869125366,
- "num_tokens": 16467747.0,
- "step": 1842
- },
- {
- "epoch": 1.4004559270516717,
- "grad_norm": 1.5554167032241821,
- "learning_rate": 2.9807911021249573e-06,
- "loss": 0.35098958015441895,
- "mean_token_accuracy": 0.888373851776123,
- "num_tokens": 16479319.0,
- "step": 1843
- },
- {
- "epoch": 1.4012158054711246,
- "grad_norm": 1.7183740139007568,
- "learning_rate": 2.9787356346925327e-06,
- "loss": 0.41263148188591003,
- "mean_token_accuracy": 0.8478364944458008,
- "num_tokens": 16489952.0,
- "step": 1844
- },
- {
- "epoch": 1.4019756838905775,
- "grad_norm": 1.7743209600448608,
- "learning_rate": 2.9766798312650112e-06,
- "loss": 0.4211183190345764,
- "mean_token_accuracy": 0.8641136884689331,
- "num_tokens": 16498655.0,
- "step": 1845
- },
- {
- "epoch": 1.4027355623100304,
- "grad_norm": 2.141300916671753,
- "learning_rate": 2.9746236932852355e-06,
- "loss": 0.49548980593681335,
- "mean_token_accuracy": 0.8304252028465271,
- "num_tokens": 16506348.0,
- "step": 1846
- },
- {
- "epoch": 1.4034954407294833,
- "grad_norm": 2.341571807861328,
- "learning_rate": 2.9725672221962804e-06,
- "loss": 0.40804803371429443,
- "mean_token_accuracy": 0.8545800447463989,
- "num_tokens": 16513091.0,
- "step": 1847
- },
- {
- "epoch": 1.4042553191489362,
- "grad_norm": 1.934428095817566,
- "learning_rate": 2.9705104194414587e-06,
- "loss": 0.30029812455177307,
- "mean_token_accuracy": 0.9032052755355835,
- "num_tokens": 16519455.0,
- "step": 1848
- },
- {
- "epoch": 1.405015197568389,
- "grad_norm": 1.420804500579834,
- "learning_rate": 2.9684532864643123e-06,
- "loss": 0.4384060502052307,
- "mean_token_accuracy": 0.8465110063552856,
- "num_tokens": 16533222.0,
- "step": 1849
- },
- {
- "epoch": 1.405775075987842,
- "grad_norm": 2.1180737018585205,
- "learning_rate": 2.9663958247086165e-06,
- "loss": 0.3915565609931946,
- "mean_token_accuracy": 0.8633890748023987,
- "num_tokens": 16539489.0,
- "step": 1850
- },
- {
- "epoch": 1.4065349544072947,
- "grad_norm": 1.408048152923584,
- "learning_rate": 2.964338035618378e-06,
- "loss": 0.46166157722473145,
- "mean_token_accuracy": 0.8305013179779053,
- "num_tokens": 16555785.0,
- "step": 1851
- },
- {
- "epoch": 1.4072948328267478,
- "grad_norm": 1.3418530225753784,
- "learning_rate": 2.9622799206378306e-06,
- "loss": 0.5314373970031738,
- "mean_token_accuracy": 0.81779944896698,
- "num_tokens": 16578111.0,
- "step": 1852
- },
- {
- "epoch": 1.4080547112462005,
- "grad_norm": 1.4634262323379517,
- "learning_rate": 2.9602214812114414e-06,
- "loss": 0.4859408140182495,
- "mean_token_accuracy": 0.8261818885803223,
- "num_tokens": 16591976.0,
- "step": 1853
- },
- {
- "epoch": 1.4088145896656534,
- "grad_norm": 1.4840295314788818,
- "learning_rate": 2.9581627187838997e-06,
- "loss": 0.4079628586769104,
- "mean_token_accuracy": 0.8549603223800659,
- "num_tokens": 16603631.0,
- "step": 1854
- },
- {
- "epoch": 1.4095744680851063,
- "grad_norm": 2.1474642753601074,
- "learning_rate": 2.956103634800126e-06,
- "loss": 0.32997995615005493,
- "mean_token_accuracy": 0.8836915493011475,
- "num_tokens": 16609875.0,
- "step": 1855
- },
- {
- "epoch": 1.4103343465045592,
- "grad_norm": 2.627460241317749,
- "learning_rate": 2.9540442307052643e-06,
- "loss": 0.3229186236858368,
- "mean_token_accuracy": 0.8852157592773438,
- "num_tokens": 16614113.0,
- "step": 1856
- },
- {
- "epoch": 1.4110942249240122,
- "grad_norm": 1.9569811820983887,
- "learning_rate": 2.9519845079446824e-06,
- "loss": 0.5057883858680725,
- "mean_token_accuracy": 0.8585711717605591,
- "num_tokens": 16624611.0,
- "step": 1857
- },
- {
- "epoch": 1.411854103343465,
- "grad_norm": 2.0604090690612793,
- "learning_rate": 2.949924467963975e-06,
- "loss": 0.4681510329246521,
- "mean_token_accuracy": 0.8390560150146484,
- "num_tokens": 16632938.0,
- "step": 1858
- },
- {
- "epoch": 1.412613981762918,
- "grad_norm": 2.5430450439453125,
- "learning_rate": 2.9478641122089563e-06,
- "loss": 0.3090999126434326,
- "mean_token_accuracy": 0.8943990468978882,
- "num_tokens": 16637135.0,
- "step": 1859
- },
- {
- "epoch": 1.4133738601823709,
- "grad_norm": 1.3275387287139893,
- "learning_rate": 2.945803442125663e-06,
- "loss": 0.3592180013656616,
- "mean_token_accuracy": 0.8678265810012817,
- "num_tokens": 16650322.0,
- "step": 1860
- },
- {
- "epoch": 1.4141337386018238,
- "grad_norm": 1.9070929288864136,
- "learning_rate": 2.943742459160354e-06,
- "loss": 0.5332518815994263,
- "mean_token_accuracy": 0.8475706577301025,
- "num_tokens": 16660240.0,
- "step": 1861
- },
- {
- "epoch": 1.4148936170212765,
- "grad_norm": 2.8724546432495117,
- "learning_rate": 2.9416811647595052e-06,
- "loss": 0.5052884817123413,
- "mean_token_accuracy": 0.8363175392150879,
- "num_tokens": 16665481.0,
- "step": 1862
- },
- {
- "epoch": 1.4156534954407296,
- "grad_norm": 4.203817844390869,
- "learning_rate": 2.939619560369813e-06,
- "loss": 0.546925961971283,
- "mean_token_accuracy": 0.834044337272644,
- "num_tokens": 16669615.0,
- "step": 1863
- },
- {
- "epoch": 1.4164133738601823,
- "grad_norm": 1.6466281414031982,
- "learning_rate": 2.9375576474381907e-06,
- "loss": 0.3474533259868622,
- "mean_token_accuracy": 0.8571163415908813,
- "num_tokens": 16678893.0,
- "step": 1864
- },
- {
- "epoch": 1.4171732522796352,
- "grad_norm": 1.8885842561721802,
- "learning_rate": 2.9354954274117683e-06,
- "loss": 0.3726021349430084,
- "mean_token_accuracy": 0.8629094958305359,
- "num_tokens": 16685939.0,
- "step": 1865
- },
- {
- "epoch": 1.417933130699088,
- "grad_norm": 2.830599784851074,
- "learning_rate": 2.9334329017378898e-06,
- "loss": 0.4138668477535248,
- "mean_token_accuracy": 0.8670746088027954,
- "num_tokens": 16690012.0,
- "step": 1866
- },
- {
- "epoch": 1.418693009118541,
- "grad_norm": 1.6838961839675903,
- "learning_rate": 2.9313700718641167e-06,
- "loss": 0.33954259753227234,
- "mean_token_accuracy": 0.8660278916358948,
- "num_tokens": 16700061.0,
- "step": 1867
- },
- {
- "epoch": 1.419452887537994,
- "grad_norm": 2.8767011165618896,
- "learning_rate": 2.9293069392382224e-06,
- "loss": 0.4650302827358246,
- "mean_token_accuracy": 0.8448452949523926,
- "num_tokens": 16705072.0,
- "step": 1868
- },
- {
- "epoch": 1.4202127659574468,
- "grad_norm": 1.5901305675506592,
- "learning_rate": 2.927243505308192e-06,
- "loss": 0.40838998556137085,
- "mean_token_accuracy": 0.8560664653778076,
- "num_tokens": 16714763.0,
- "step": 1869
- },
- {
- "epoch": 1.4209726443768997,
- "grad_norm": 1.3293657302856445,
- "learning_rate": 2.925179771522223e-06,
- "loss": 0.34712862968444824,
- "mean_token_accuracy": 0.8633697032928467,
- "num_tokens": 16729575.0,
- "step": 1870
- },
- {
- "epoch": 1.4217325227963526,
- "grad_norm": 1.7465964555740356,
- "learning_rate": 2.9231157393287234e-06,
- "loss": 0.48190903663635254,
- "mean_token_accuracy": 0.8255834579467773,
- "num_tokens": 16742529.0,
- "step": 1871
- },
- {
- "epoch": 1.4224924012158056,
- "grad_norm": 1.865749716758728,
- "learning_rate": 2.9210514101763116e-06,
- "loss": 0.4912028908729553,
- "mean_token_accuracy": 0.8309572339057922,
- "num_tokens": 16753989.0,
- "step": 1872
- },
- {
- "epoch": 1.4232522796352582,
- "grad_norm": 2.55780291557312,
- "learning_rate": 2.9189867855138103e-06,
- "loss": 0.4550635814666748,
- "mean_token_accuracy": 0.8584091067314148,
- "num_tokens": 16758906.0,
- "step": 1873
- },
- {
- "epoch": 1.4240121580547114,
- "grad_norm": 1.867530107498169,
- "learning_rate": 2.9169218667902562e-06,
- "loss": 0.3524911105632782,
- "mean_token_accuracy": 0.8715004920959473,
- "num_tokens": 16765969.0,
- "step": 1874
- },
- {
- "epoch": 1.424772036474164,
- "grad_norm": 1.8886862993240356,
- "learning_rate": 2.9148566554548857e-06,
- "loss": 0.37144535779953003,
- "mean_token_accuracy": 0.8640961050987244,
- "num_tokens": 16773935.0,
- "step": 1875
- },
- {
- "epoch": 1.425531914893617,
- "grad_norm": 1.266065239906311,
- "learning_rate": 2.912791152957145e-06,
- "loss": 0.3341747522354126,
- "mean_token_accuracy": 0.8929134607315063,
- "num_tokens": 16787780.0,
- "step": 1876
- },
- {
- "epoch": 1.4262917933130699,
- "grad_norm": 2.524888753890991,
- "learning_rate": 2.9107253607466833e-06,
- "loss": 0.33709171414375305,
- "mean_token_accuracy": 0.8857531547546387,
- "num_tokens": 16792753.0,
- "step": 1877
- },
- {
- "epoch": 1.4270516717325228,
- "grad_norm": 1.9269018173217773,
- "learning_rate": 2.908659280273354e-06,
- "loss": 0.32599249482154846,
- "mean_token_accuracy": 0.8777773380279541,
- "num_tokens": 16799904.0,
- "step": 1878
- },
- {
- "epoch": 1.4278115501519757,
- "grad_norm": 1.9844375848770142,
- "learning_rate": 2.9065929129872097e-06,
- "loss": 0.4086732268333435,
- "mean_token_accuracy": 0.8505409955978394,
- "num_tokens": 16807774.0,
- "step": 1879
- },
- {
- "epoch": 1.4285714285714286,
- "grad_norm": 4.0958662033081055,
- "learning_rate": 2.9045262603385073e-06,
- "loss": 0.3838827610015869,
- "mean_token_accuracy": 0.877601146697998,
- "num_tokens": 16810908.0,
- "step": 1880
- },
- {
- "epoch": 1.4293313069908815,
- "grad_norm": 1.7323768138885498,
- "learning_rate": 2.902459323777704e-06,
- "loss": 0.37459003925323486,
- "mean_token_accuracy": 0.8655836582183838,
- "num_tokens": 16819494.0,
- "step": 1881
- },
- {
- "epoch": 1.4300911854103344,
- "grad_norm": 2.608043670654297,
- "learning_rate": 2.900392104755455e-06,
- "loss": 0.5798726677894592,
- "mean_token_accuracy": 0.8382592797279358,
- "num_tokens": 16827745.0,
- "step": 1882
- },
- {
- "epoch": 1.4308510638297873,
- "grad_norm": 1.3262078762054443,
- "learning_rate": 2.8983246047226137e-06,
- "loss": 0.3724595904350281,
- "mean_token_accuracy": 0.8651963472366333,
- "num_tokens": 16844171.0,
- "step": 1883
- },
- {
- "epoch": 1.43161094224924,
- "grad_norm": 1.7250545024871826,
- "learning_rate": 2.8962568251302327e-06,
- "loss": 0.3478979468345642,
- "mean_token_accuracy": 0.8807886242866516,
- "num_tokens": 16852838.0,
- "step": 1884
- },
- {
- "epoch": 1.4323708206686931,
- "grad_norm": 2.114525318145752,
- "learning_rate": 2.8941887674295573e-06,
- "loss": 0.5156140327453613,
- "mean_token_accuracy": 0.825178861618042,
- "num_tokens": 16861087.0,
- "step": 1885
- },
- {
- "epoch": 1.4331306990881458,
- "grad_norm": 2.400829792022705,
- "learning_rate": 2.892120433072031e-06,
- "loss": 0.2807392477989197,
- "mean_token_accuracy": 0.8907361030578613,
- "num_tokens": 16866557.0,
- "step": 1886
- },
- {
- "epoch": 1.4338905775075987,
- "grad_norm": 2.490880012512207,
- "learning_rate": 2.8900518235092908e-06,
- "loss": 0.2615952491760254,
- "mean_token_accuracy": 0.9152894020080566,
- "num_tokens": 16871357.0,
- "step": 1887
- },
- {
- "epoch": 1.4346504559270516,
- "grad_norm": 1.9058431386947632,
- "learning_rate": 2.887982940193165e-06,
- "loss": 0.43623363971710205,
- "mean_token_accuracy": 0.84696364402771,
- "num_tokens": 16879016.0,
- "step": 1888
- },
- {
- "epoch": 1.4354103343465046,
- "grad_norm": 1.4520210027694702,
- "learning_rate": 2.8859137845756785e-06,
- "loss": 0.3961856961250305,
- "mean_token_accuracy": 0.8518897294998169,
- "num_tokens": 16892254.0,
- "step": 1889
- },
- {
- "epoch": 1.4361702127659575,
- "grad_norm": 2.500274896621704,
- "learning_rate": 2.8838443581090415e-06,
- "loss": 0.41457289457321167,
- "mean_token_accuracy": 0.8751448392868042,
- "num_tokens": 16897156.0,
- "step": 1890
- },
- {
- "epoch": 1.4369300911854104,
- "grad_norm": 2.9312057495117188,
- "learning_rate": 2.8817746622456585e-06,
- "loss": 0.45875269174575806,
- "mean_token_accuracy": 0.8411039113998413,
- "num_tokens": 16902291.0,
- "step": 1891
- },
- {
- "epoch": 1.4376899696048633,
- "grad_norm": 2.367419481277466,
- "learning_rate": 2.879704698438121e-06,
- "loss": 0.3643629848957062,
- "mean_token_accuracy": 0.8771071434020996,
- "num_tokens": 16908128.0,
- "step": 1892
- },
- {
- "epoch": 1.4384498480243162,
- "grad_norm": 1.9907705783843994,
- "learning_rate": 2.8776344681392106e-06,
- "loss": 0.3206835389137268,
- "mean_token_accuracy": 0.879996657371521,
- "num_tokens": 16914918.0,
- "step": 1893
- },
- {
- "epoch": 1.439209726443769,
- "grad_norm": 3.536956310272217,
- "learning_rate": 2.875563972801893e-06,
- "loss": 0.3640141785144806,
- "mean_token_accuracy": 0.8814959526062012,
- "num_tokens": 16918187.0,
- "step": 1894
- },
- {
- "epoch": 1.4399696048632218,
- "grad_norm": 1.3451156616210938,
- "learning_rate": 2.8734932138793226e-06,
- "loss": 0.3427346348762512,
- "mean_token_accuracy": 0.8835382461547852,
- "num_tokens": 16931135.0,
- "step": 1895
- },
- {
- "epoch": 1.4407294832826747,
- "grad_norm": 2.0735955238342285,
- "learning_rate": 2.871422192824837e-06,
- "loss": 0.4265315532684326,
- "mean_token_accuracy": 0.8452677726745605,
- "num_tokens": 16937995.0,
- "step": 1896
- },
- {
- "epoch": 1.4414893617021276,
- "grad_norm": 1.5124932527542114,
- "learning_rate": 2.8693509110919597e-06,
- "loss": 0.497121661901474,
- "mean_token_accuracy": 0.815092921257019,
- "num_tokens": 16952743.0,
- "step": 1897
- },
- {
- "epoch": 1.4422492401215805,
- "grad_norm": 3.716669797897339,
- "learning_rate": 2.867279370134395e-06,
- "loss": 0.5452651381492615,
- "mean_token_accuracy": 0.8150380849838257,
- "num_tokens": 16956797.0,
- "step": 1898
- },
- {
- "epoch": 1.4430091185410334,
- "grad_norm": 1.3571398258209229,
- "learning_rate": 2.8652075714060296e-06,
- "loss": 0.4249724745750427,
- "mean_token_accuracy": 0.8675867915153503,
- "num_tokens": 16974494.0,
- "step": 1899
- },
- {
- "epoch": 1.4437689969604863,
- "grad_norm": 2.310673475265503,
- "learning_rate": 2.863135516360932e-06,
- "loss": 0.39368677139282227,
- "mean_token_accuracy": 0.878392219543457,
- "num_tokens": 16980612.0,
- "step": 1900
- },
- {
- "epoch": 1.4445288753799392,
- "grad_norm": 1.9025533199310303,
- "learning_rate": 2.8610632064533517e-06,
- "loss": 0.4786127805709839,
- "mean_token_accuracy": 0.8720556497573853,
- "num_tokens": 16992262.0,
- "step": 1901
- },
- {
- "epoch": 1.4452887537993921,
- "grad_norm": 2.528564453125,
- "learning_rate": 2.8589906431377133e-06,
- "loss": 0.4223094582557678,
- "mean_token_accuracy": 0.8513246178627014,
- "num_tokens": 16997717.0,
- "step": 1902
- },
- {
- "epoch": 1.446048632218845,
- "grad_norm": 1.010425329208374,
- "learning_rate": 2.8569178278686222e-06,
- "loss": 0.3908255696296692,
- "mean_token_accuracy": 0.8620463609695435,
- "num_tokens": 17020903.0,
- "step": 1903
- },
- {
- "epoch": 1.4468085106382977,
- "grad_norm": 1.5760232210159302,
- "learning_rate": 2.8548447621008614e-06,
- "loss": 0.4134044051170349,
- "mean_token_accuracy": 0.8472093343734741,
- "num_tokens": 17035250.0,
- "step": 1904
- },
- {
- "epoch": 1.4475683890577509,
- "grad_norm": 2.0668535232543945,
- "learning_rate": 2.8527714472893866e-06,
- "loss": 0.44095730781555176,
- "mean_token_accuracy": 0.881983757019043,
- "num_tokens": 17042170.0,
- "step": 1905
- },
- {
- "epoch": 1.4483282674772036,
- "grad_norm": 1.1620599031448364,
- "learning_rate": 2.85069788488933e-06,
- "loss": 0.3607163429260254,
- "mean_token_accuracy": 0.8684282898902893,
- "num_tokens": 17061937.0,
- "step": 1906
- },
- {
- "epoch": 1.4490881458966565,
- "grad_norm": 2.1316568851470947,
- "learning_rate": 2.8486240763559984e-06,
- "loss": 0.3478124141693115,
- "mean_token_accuracy": 0.8772403001785278,
- "num_tokens": 17068628.0,
- "step": 1907
- },
- {
- "epoch": 1.4498480243161094,
- "grad_norm": 2.4756391048431396,
- "learning_rate": 2.8465500231448707e-06,
- "loss": 0.46441152691841125,
- "mean_token_accuracy": 0.8436450958251953,
- "num_tokens": 17075495.0,
- "step": 1908
- },
- {
- "epoch": 1.4506079027355623,
- "grad_norm": 2.249720573425293,
- "learning_rate": 2.844475726711595e-06,
- "loss": 0.41565513610839844,
- "mean_token_accuracy": 0.8525094985961914,
- "num_tokens": 17080940.0,
- "step": 1909
- },
- {
- "epoch": 1.4513677811550152,
- "grad_norm": 2.3081841468811035,
- "learning_rate": 2.8424011885119956e-06,
- "loss": 0.49903199076652527,
- "mean_token_accuracy": 0.8212426900863647,
- "num_tokens": 17092024.0,
- "step": 1910
- },
- {
- "epoch": 1.452127659574468,
- "grad_norm": 1.2929959297180176,
- "learning_rate": 2.8403264100020613e-06,
- "loss": 0.47038257122039795,
- "mean_token_accuracy": 0.8319816589355469,
- "num_tokens": 17108840.0,
- "step": 1911
- },
- {
- "epoch": 1.452887537993921,
- "grad_norm": 1.6476463079452515,
- "learning_rate": 2.8382513926379508e-06,
- "loss": 0.42287829518318176,
- "mean_token_accuracy": 0.8555682897567749,
- "num_tokens": 17119704.0,
- "step": 1912
- },
- {
- "epoch": 1.453647416413374,
- "grad_norm": 1.759998083114624,
- "learning_rate": 2.836176137875993e-06,
- "loss": 0.40904951095581055,
- "mean_token_accuracy": 0.8698266744613647,
- "num_tokens": 17130676.0,
- "step": 1913
- },
- {
- "epoch": 1.4544072948328268,
- "grad_norm": 1.510909914970398,
- "learning_rate": 2.8341006471726817e-06,
- "loss": 0.47834792733192444,
- "mean_token_accuracy": 0.8335825204849243,
- "num_tokens": 17146304.0,
- "step": 1914
- },
- {
- "epoch": 1.4551671732522795,
- "grad_norm": 3.538071632385254,
- "learning_rate": 2.832024921984674e-06,
- "loss": 0.34059035778045654,
- "mean_token_accuracy": 0.8769031763076782,
- "num_tokens": 17150458.0,
- "step": 1915
- },
- {
- "epoch": 1.4559270516717326,
- "grad_norm": 2.3368659019470215,
- "learning_rate": 2.8299489637687955e-06,
- "loss": 0.43068382143974304,
- "mean_token_accuracy": 0.845360517501831,
- "num_tokens": 17157368.0,
- "step": 1916
- },
- {
- "epoch": 1.4566869300911853,
- "grad_norm": 1.8720396757125854,
- "learning_rate": 2.8278727739820334e-06,
- "loss": 0.37013399600982666,
- "mean_token_accuracy": 0.854241132736206,
- "num_tokens": 17166325.0,
- "step": 1917
- },
- {
- "epoch": 1.4574468085106382,
- "grad_norm": 1.6706892251968384,
- "learning_rate": 2.825796354081537e-06,
- "loss": 0.5397020578384399,
- "mean_token_accuracy": 0.8309713006019592,
- "num_tokens": 17178920.0,
- "step": 1918
- },
- {
- "epoch": 1.4582066869300911,
- "grad_norm": 2.729210376739502,
- "learning_rate": 2.8237197055246175e-06,
- "loss": 0.25137859582901,
- "mean_token_accuracy": 0.9148792028427124,
- "num_tokens": 17183107.0,
- "step": 1919
- },
- {
- "epoch": 1.458966565349544,
- "grad_norm": 3.023500680923462,
- "learning_rate": 2.821642829768748e-06,
- "loss": 0.43312495946884155,
- "mean_token_accuracy": 0.8481811285018921,
- "num_tokens": 17187853.0,
- "step": 1920
- },
- {
- "epoch": 1.459726443768997,
- "grad_norm": 1.8108519315719604,
- "learning_rate": 2.8195657282715595e-06,
- "loss": 0.5101792216300964,
- "mean_token_accuracy": 0.8315553069114685,
- "num_tokens": 17199247.0,
- "step": 1921
- },
- {
- "epoch": 1.4604863221884499,
- "grad_norm": 2.0262672901153564,
- "learning_rate": 2.817488402490841e-06,
- "loss": 0.4449934959411621,
- "mean_token_accuracy": 0.8634527325630188,
- "num_tokens": 17206348.0,
- "step": 1922
- },
- {
- "epoch": 1.4612462006079028,
- "grad_norm": 2.6163926124572754,
- "learning_rate": 2.8154108538845405e-06,
- "loss": 0.43052345514297485,
- "mean_token_accuracy": 0.8375401496887207,
- "num_tokens": 17211702.0,
- "step": 1923
- },
- {
- "epoch": 1.4620060790273557,
- "grad_norm": 2.0854408740997314,
- "learning_rate": 2.813333083910761e-06,
- "loss": 0.5011380910873413,
- "mean_token_accuracy": 0.8359915018081665,
- "num_tokens": 17219096.0,
- "step": 1924
- },
- {
- "epoch": 1.4627659574468086,
- "grad_norm": 2.2081687450408936,
- "learning_rate": 2.8112550940277615e-06,
- "loss": 0.5239193439483643,
- "mean_token_accuracy": 0.8499593734741211,
- "num_tokens": 17229266.0,
- "step": 1925
- },
- {
- "epoch": 1.4635258358662613,
- "grad_norm": 1.798343539237976,
- "learning_rate": 2.809176885693956e-06,
- "loss": 0.4515029191970825,
- "mean_token_accuracy": 0.8400485515594482,
- "num_tokens": 17239280.0,
- "step": 1926
- },
- {
- "epoch": 1.4642857142857144,
- "grad_norm": 1.897887945175171,
- "learning_rate": 2.807098460367911e-06,
- "loss": 0.35935714840888977,
- "mean_token_accuracy": 0.8776072263717651,
- "num_tokens": 17247132.0,
- "step": 1927
- },
- {
- "epoch": 1.465045592705167,
- "grad_norm": 2.705836296081543,
- "learning_rate": 2.8050198195083445e-06,
- "loss": 0.3728443682193756,
- "mean_token_accuracy": 0.8649885654449463,
- "num_tokens": 17251865.0,
- "step": 1928
- },
- {
- "epoch": 1.46580547112462,
- "grad_norm": 1.841178059577942,
- "learning_rate": 2.802940964574127e-06,
- "loss": 0.40604841709136963,
- "mean_token_accuracy": 0.8537783622741699,
- "num_tokens": 17260163.0,
- "step": 1929
- },
- {
- "epoch": 1.466565349544073,
- "grad_norm": 2.7393605709075928,
- "learning_rate": 2.800861897024279e-06,
- "loss": 0.39346879720687866,
- "mean_token_accuracy": 0.8628787994384766,
- "num_tokens": 17264876.0,
- "step": 1930
- },
- {
- "epoch": 1.4673252279635258,
- "grad_norm": 1.84367835521698,
- "learning_rate": 2.798782618317971e-06,
- "loss": 0.37411895394325256,
- "mean_token_accuracy": 0.8605265617370605,
- "num_tokens": 17273049.0,
- "step": 1931
- },
- {
- "epoch": 1.4680851063829787,
- "grad_norm": 1.6546733379364014,
- "learning_rate": 2.796703129914519e-06,
- "loss": 0.4997844099998474,
- "mean_token_accuracy": 0.8267433643341064,
- "num_tokens": 17285074.0,
- "step": 1932
- },
- {
- "epoch": 1.4688449848024316,
- "grad_norm": 2.2749221324920654,
- "learning_rate": 2.79462343327339e-06,
- "loss": 0.35453367233276367,
- "mean_token_accuracy": 0.8746850490570068,
- "num_tokens": 17290273.0,
- "step": 1933
- },
- {
- "epoch": 1.4696048632218845,
- "grad_norm": 1.7142518758773804,
- "learning_rate": 2.7925435298541944e-06,
- "loss": 0.345878541469574,
- "mean_token_accuracy": 0.8600981831550598,
- "num_tokens": 17301045.0,
- "step": 1934
- },
- {
- "epoch": 1.4703647416413375,
- "grad_norm": 3.163342237472534,
- "learning_rate": 2.7904634211166877e-06,
- "loss": 0.4356975853443146,
- "mean_token_accuracy": 0.8460350036621094,
- "num_tokens": 17305108.0,
- "step": 1935
- },
- {
- "epoch": 1.4711246200607904,
- "grad_norm": 1.6377612352371216,
- "learning_rate": 2.7883831085207707e-06,
- "loss": 0.4459729790687561,
- "mean_token_accuracy": 0.8463394641876221,
- "num_tokens": 17315479.0,
- "step": 1936
- },
- {
- "epoch": 1.471884498480243,
- "grad_norm": 1.865268588066101,
- "learning_rate": 2.7863025935264876e-06,
- "loss": 0.394723117351532,
- "mean_token_accuracy": 0.864177942276001,
- "num_tokens": 17324795.0,
- "step": 1937
- },
- {
- "epoch": 1.4726443768996962,
- "grad_norm": 1.241937518119812,
- "learning_rate": 2.784221877594024e-06,
- "loss": 0.2752220630645752,
- "mean_token_accuracy": 0.8998259902000427,
- "num_tokens": 17338000.0,
- "step": 1938
- },
- {
- "epoch": 1.4734042553191489,
- "grad_norm": 1.8013651371002197,
- "learning_rate": 2.7821409621837042e-06,
- "loss": 0.4251005947589874,
- "mean_token_accuracy": 0.8518919348716736,
- "num_tokens": 17347351.0,
- "step": 1939
- },
- {
- "epoch": 1.4741641337386018,
- "grad_norm": 1.2902207374572754,
- "learning_rate": 2.7800598487559976e-06,
- "loss": 0.3640727400779724,
- "mean_token_accuracy": 0.8592870235443115,
- "num_tokens": 17362335.0,
- "step": 1940
- },
- {
- "epoch": 1.4749240121580547,
- "grad_norm": 2.5427513122558594,
- "learning_rate": 2.777978538771508e-06,
- "loss": 0.38166797161102295,
- "mean_token_accuracy": 0.8653234839439392,
- "num_tokens": 17367733.0,
- "step": 1941
- },
- {
- "epoch": 1.4756838905775076,
- "grad_norm": 1.7793641090393066,
- "learning_rate": 2.7758970336909795e-06,
- "loss": 0.3113783895969391,
- "mean_token_accuracy": 0.8812868595123291,
- "num_tokens": 17375267.0,
- "step": 1942
- },
- {
- "epoch": 1.4764437689969605,
- "grad_norm": 3.4031741619110107,
- "learning_rate": 2.7738153349752923e-06,
- "loss": 0.4800986647605896,
- "mean_token_accuracy": 0.8336698412895203,
- "num_tokens": 17379549.0,
- "step": 1943
- },
- {
- "epoch": 1.4772036474164134,
- "grad_norm": 1.3451651334762573,
- "learning_rate": 2.7717334440854634e-06,
- "loss": 0.3115345239639282,
- "mean_token_accuracy": 0.908623218536377,
- "num_tokens": 17394455.0,
- "step": 1944
- },
- {
- "epoch": 1.4779635258358663,
- "grad_norm": 1.980919599533081,
- "learning_rate": 2.7696513624826422e-06,
- "loss": 0.391154944896698,
- "mean_token_accuracy": 0.8650267720222473,
- "num_tokens": 17401931.0,
- "step": 1945
- },
- {
- "epoch": 1.4787234042553192,
- "grad_norm": 1.0118765830993652,
- "learning_rate": 2.7675690916281158e-06,
- "loss": 0.3157956600189209,
- "mean_token_accuracy": 0.8827471733093262,
- "num_tokens": 17424144.0,
- "step": 1946
- },
- {
- "epoch": 1.4794832826747721,
- "grad_norm": 1.579654335975647,
- "learning_rate": 2.7654866329833e-06,
- "loss": 0.4578486382961273,
- "mean_token_accuracy": 0.8361750245094299,
- "num_tokens": 17435769.0,
- "step": 1947
- },
- {
- "epoch": 1.4802431610942248,
- "grad_norm": 1.7706717252731323,
- "learning_rate": 2.763403988009746e-06,
- "loss": 0.3564416170120239,
- "mean_token_accuracy": 0.8689201474189758,
- "num_tokens": 17444088.0,
- "step": 1948
- },
- {
- "epoch": 1.4810030395136777,
- "grad_norm": 1.2264244556427002,
- "learning_rate": 2.761321158169134e-06,
- "loss": 0.30763837695121765,
- "mean_token_accuracy": 0.8960219621658325,
- "num_tokens": 17458096.0,
- "step": 1949
- },
- {
- "epoch": 1.4817629179331306,
- "grad_norm": 1.214431881904602,
- "learning_rate": 2.759238144923274e-06,
- "loss": 0.49099457263946533,
- "mean_token_accuracy": 0.8279136419296265,
- "num_tokens": 17481062.0,
- "step": 1950
- },
- {
- "epoch": 1.4825227963525835,
- "grad_norm": 1.593892216682434,
- "learning_rate": 2.7571549497341044e-06,
- "loss": 0.3745320737361908,
- "mean_token_accuracy": 0.8690779209136963,
- "num_tokens": 17490874.0,
- "step": 1951
- },
- {
- "epoch": 1.4832826747720365,
- "grad_norm": 2.409924268722534,
- "learning_rate": 2.755071574063692e-06,
- "loss": 0.4310247600078583,
- "mean_token_accuracy": 0.8521159291267395,
- "num_tokens": 17496942.0,
- "step": 1952
- },
- {
- "epoch": 1.4840425531914894,
- "grad_norm": 1.2557463645935059,
- "learning_rate": 2.7529880193742297e-06,
- "loss": 0.34304720163345337,
- "mean_token_accuracy": 0.8748183250427246,
- "num_tokens": 17514391.0,
- "step": 1953
- },
- {
- "epoch": 1.4848024316109423,
- "grad_norm": 1.17310631275177,
- "learning_rate": 2.7509042871280373e-06,
- "loss": 0.3835817277431488,
- "mean_token_accuracy": 0.8853274583816528,
- "num_tokens": 17533289.0,
- "step": 1954
- },
- {
- "epoch": 1.4855623100303952,
- "grad_norm": 1.5261479616165161,
- "learning_rate": 2.748820378787558e-06,
- "loss": 0.4799988865852356,
- "mean_token_accuracy": 0.8252149820327759,
- "num_tokens": 17544118.0,
- "step": 1955
- },
- {
- "epoch": 1.486322188449848,
- "grad_norm": 2.030930757522583,
- "learning_rate": 2.7467362958153585e-06,
- "loss": 0.35690805315971375,
- "mean_token_accuracy": 0.8959587216377258,
- "num_tokens": 17550431.0,
- "step": 1956
- },
- {
- "epoch": 1.4870820668693008,
- "grad_norm": 2.376520872116089,
- "learning_rate": 2.7446520396741293e-06,
- "loss": 0.262234091758728,
- "mean_token_accuracy": 0.9054547548294067,
- "num_tokens": 17554853.0,
- "step": 1957
- },
- {
- "epoch": 1.487841945288754,
- "grad_norm": 1.6944479942321777,
- "learning_rate": 2.742567611826681e-06,
- "loss": 0.529259979724884,
- "mean_token_accuracy": 0.8195339441299438,
- "num_tokens": 17568016.0,
- "step": 1958
- },
- {
- "epoch": 1.4886018237082066,
- "grad_norm": 2.833029270172119,
- "learning_rate": 2.7404830137359445e-06,
- "loss": 0.30229634046554565,
- "mean_token_accuracy": 0.8933001756668091,
- "num_tokens": 17572587.0,
- "step": 1959
- },
- {
- "epoch": 1.4893617021276595,
- "grad_norm": 1.7040144205093384,
- "learning_rate": 2.7383982468649715e-06,
- "loss": 0.3166356682777405,
- "mean_token_accuracy": 0.8871906399726868,
- "num_tokens": 17580966.0,
- "step": 1960
- },
- {
- "epoch": 1.4901215805471124,
- "grad_norm": 1.7539052963256836,
- "learning_rate": 2.7363133126769326e-06,
- "loss": 0.4231064021587372,
- "mean_token_accuracy": 0.8708304166793823,
- "num_tokens": 17590907.0,
- "step": 1961
- },
- {
- "epoch": 1.4908814589665653,
- "grad_norm": 1.6198650598526,
- "learning_rate": 2.7342282126351145e-06,
- "loss": 0.4198967218399048,
- "mean_token_accuracy": 0.8723280429840088,
- "num_tokens": 17604291.0,
- "step": 1962
- },
- {
- "epoch": 1.4916413373860182,
- "grad_norm": 1.8437711000442505,
- "learning_rate": 2.73214294820292e-06,
- "loss": 0.38923323154449463,
- "mean_token_accuracy": 0.8697006106376648,
- "num_tokens": 17612291.0,
- "step": 1963
- },
- {
- "epoch": 1.4924012158054711,
- "grad_norm": 1.1129369735717773,
- "learning_rate": 2.7300575208438684e-06,
- "loss": 0.3107512593269348,
- "mean_token_accuracy": 0.878618597984314,
- "num_tokens": 17630073.0,
- "step": 1964
- },
- {
- "epoch": 1.493161094224924,
- "grad_norm": 3.0210442543029785,
- "learning_rate": 2.7279719320215924e-06,
- "loss": 0.4630751609802246,
- "mean_token_accuracy": 0.8567075729370117,
- "num_tokens": 17634758.0,
- "step": 1965
- },
- {
- "epoch": 1.493920972644377,
- "grad_norm": 2.8825972080230713,
- "learning_rate": 2.725886183199839e-06,
- "loss": 0.35351765155792236,
- "mean_token_accuracy": 0.8711981773376465,
- "num_tokens": 17639613.0,
- "step": 1966
- },
- {
- "epoch": 1.4946808510638299,
- "grad_norm": 2.111238718032837,
- "learning_rate": 2.723800275842468e-06,
- "loss": 0.3529569208621979,
- "mean_token_accuracy": 0.8679244518280029,
- "num_tokens": 17645308.0,
- "step": 1967
- },
- {
- "epoch": 1.4954407294832825,
- "grad_norm": 2.080509901046753,
- "learning_rate": 2.7217142114134466e-06,
- "loss": 0.43321219086647034,
- "mean_token_accuracy": 0.8848220109939575,
- "num_tokens": 17652292.0,
- "step": 1968
- },
- {
- "epoch": 1.4962006079027357,
- "grad_norm": 2.8686363697052,
- "learning_rate": 2.7196279913768587e-06,
- "loss": 0.417035311460495,
- "mean_token_accuracy": 0.8724601864814758,
- "num_tokens": 17656908.0,
- "step": 1969
- },
- {
- "epoch": 1.4969604863221884,
- "grad_norm": 3.294193744659424,
- "learning_rate": 2.717541617196891e-06,
- "loss": 0.3551934063434601,
- "mean_token_accuracy": 0.8838565349578857,
- "num_tokens": 17660590.0,
- "step": 1970
- },
- {
- "epoch": 1.4977203647416413,
- "grad_norm": 1.766292929649353,
- "learning_rate": 2.7154550903378425e-06,
- "loss": 0.36521971225738525,
- "mean_token_accuracy": 0.8810199499130249,
- "num_tokens": 17668214.0,
- "step": 1971
- },
- {
- "epoch": 1.4984802431610942,
- "grad_norm": 1.2127676010131836,
- "learning_rate": 2.713368412264118e-06,
- "loss": 0.35184425115585327,
- "mean_token_accuracy": 0.8672580718994141,
- "num_tokens": 17684736.0,
- "step": 1972
- },
- {
- "epoch": 1.499240121580547,
- "grad_norm": 2.268256664276123,
- "learning_rate": 2.711281584440228e-06,
- "loss": 0.40115267038345337,
- "mean_token_accuracy": 0.8517841100692749,
- "num_tokens": 17691510.0,
- "step": 1973
- },
- {
- "epoch": 1.5,
- "grad_norm": 2.7196054458618164,
- "learning_rate": 2.70919460833079e-06,
- "loss": 0.3819037675857544,
- "mean_token_accuracy": 0.8765411376953125,
- "num_tokens": 17696179.0,
- "step": 1974
- },
- {
- "epoch": 1.500759878419453,
- "grad_norm": 2.969406843185425,
- "learning_rate": 2.7071074854005206e-06,
- "loss": 0.3922455608844757,
- "mean_token_accuracy": 0.8796037435531616,
- "num_tokens": 17700597.0,
- "step": 1975
- },
- {
- "epoch": 1.5015197568389058,
- "grad_norm": 2.2965853214263916,
- "learning_rate": 2.705020217114248e-06,
- "loss": 0.5433666110038757,
- "mean_token_accuracy": 0.809639036655426,
- "num_tokens": 17708895.0,
- "step": 1976
- },
- {
- "epoch": 1.5022796352583585,
- "grad_norm": 1.5584394931793213,
- "learning_rate": 2.7029328049368942e-06,
- "loss": 0.4736343324184418,
- "mean_token_accuracy": 0.8197190761566162,
- "num_tokens": 17725202.0,
- "step": 1977
- },
- {
- "epoch": 1.5030395136778116,
- "grad_norm": 1.3903142213821411,
- "learning_rate": 2.700845250333486e-06,
- "loss": 0.4471571445465088,
- "mean_token_accuracy": 0.839043140411377,
- "num_tokens": 17742835.0,
- "step": 1978
- },
- {
- "epoch": 1.5037993920972643,
- "grad_norm": 3.080716609954834,
- "learning_rate": 2.69875755476915e-06,
- "loss": 0.45760005712509155,
- "mean_token_accuracy": 0.8366328477859497,
- "num_tokens": 17747324.0,
- "step": 1979
- },
- {
- "epoch": 1.5045592705167175,
- "grad_norm": 1.0150405168533325,
- "learning_rate": 2.696669719709111e-06,
- "loss": 0.33638954162597656,
- "mean_token_accuracy": 0.8591676354408264,
- "num_tokens": 17765565.0,
- "step": 1980
- },
- {
- "epoch": 1.5053191489361701,
- "grad_norm": 2.402927875518799,
- "learning_rate": 2.694581746618691e-06,
- "loss": 0.4086601436138153,
- "mean_token_accuracy": 0.8769911527633667,
- "num_tokens": 17771275.0,
- "step": 1981
- },
- {
- "epoch": 1.506079027355623,
- "grad_norm": 2.030583381652832,
- "learning_rate": 2.6924936369633126e-06,
- "loss": 0.5115457773208618,
- "mean_token_accuracy": 0.8054746389389038,
- "num_tokens": 17779999.0,
- "step": 1982
- },
- {
- "epoch": 1.506838905775076,
- "grad_norm": 2.575199604034424,
- "learning_rate": 2.6904053922084893e-06,
- "loss": 0.363183856010437,
- "mean_token_accuracy": 0.8716042637825012,
- "num_tokens": 17785473.0,
- "step": 1983
- },
- {
- "epoch": 1.5075987841945289,
- "grad_norm": 1.8497480154037476,
- "learning_rate": 2.688317013819832e-06,
- "loss": 0.4254384934902191,
- "mean_token_accuracy": 0.8549597263336182,
- "num_tokens": 17793812.0,
- "step": 1984
- },
- {
- "epoch": 1.5083586626139818,
- "grad_norm": 1.7786511182785034,
- "learning_rate": 2.686228503263045e-06,
- "loss": 0.33400774002075195,
- "mean_token_accuracy": 0.9027615189552307,
- "num_tokens": 17801783.0,
- "step": 1985
- },
- {
- "epoch": 1.5091185410334347,
- "grad_norm": 1.8365367650985718,
- "learning_rate": 2.684139862003927e-06,
- "loss": 0.35765063762664795,
- "mean_token_accuracy": 0.8663736581802368,
- "num_tokens": 17809562.0,
- "step": 1986
- },
- {
- "epoch": 1.5098784194528876,
- "grad_norm": 1.8817477226257324,
- "learning_rate": 2.682051091508365e-06,
- "loss": 0.4627506732940674,
- "mean_token_accuracy": 0.8358862400054932,
- "num_tokens": 17819094.0,
- "step": 1987
- },
- {
- "epoch": 1.5106382978723403,
- "grad_norm": 2.221547842025757,
- "learning_rate": 2.679962193242338e-06,
- "loss": 0.577020525932312,
- "mean_token_accuracy": 0.80013108253479,
- "num_tokens": 17826666.0,
- "step": 1988
- },
- {
- "epoch": 1.5113981762917934,
- "grad_norm": 2.6618270874023438,
- "learning_rate": 2.6778731686719177e-06,
- "loss": 0.44632256031036377,
- "mean_token_accuracy": 0.8611289262771606,
- "num_tokens": 17833172.0,
- "step": 1989
- },
- {
- "epoch": 1.512158054711246,
- "grad_norm": 2.9495689868927,
- "learning_rate": 2.67578401926326e-06,
- "loss": 0.3482511043548584,
- "mean_token_accuracy": 0.8703314661979675,
- "num_tokens": 17837220.0,
- "step": 1990
- },
- {
- "epoch": 1.5129179331306992,
- "grad_norm": 2.0943644046783447,
- "learning_rate": 2.6736947464826107e-06,
- "loss": 0.2354314625263214,
- "mean_token_accuracy": 0.9137634038925171,
- "num_tokens": 17842712.0,
- "step": 1991
- },
- {
- "epoch": 1.513677811550152,
- "grad_norm": 1.1303033828735352,
- "learning_rate": 2.671605351796302e-06,
- "loss": 0.3624761700630188,
- "mean_token_accuracy": 0.8769594430923462,
- "num_tokens": 17860902.0,
- "step": 1992
- },
- {
- "epoch": 1.5144376899696048,
- "grad_norm": 2.8921146392822266,
- "learning_rate": 2.6695158366707526e-06,
- "loss": 0.2517220973968506,
- "mean_token_accuracy": 0.8974182605743408,
- "num_tokens": 17865160.0,
- "step": 1993
- },
- {
- "epoch": 1.5151975683890577,
- "grad_norm": 2.320587158203125,
- "learning_rate": 2.667426202572463e-06,
- "loss": 0.4589889943599701,
- "mean_token_accuracy": 0.8379613161087036,
- "num_tokens": 17871994.0,
- "step": 1994
- },
- {
- "epoch": 1.5159574468085106,
- "grad_norm": 1.1407674551010132,
- "learning_rate": 2.665336450968019e-06,
- "loss": 0.34412115812301636,
- "mean_token_accuracy": 0.8776306509971619,
- "num_tokens": 17889941.0,
- "step": 1995
- },
- {
- "epoch": 1.5167173252279635,
- "grad_norm": 2.069814920425415,
- "learning_rate": 2.6632465833240895e-06,
- "loss": 0.47524404525756836,
- "mean_token_accuracy": 0.830310046672821,
- "num_tokens": 17898447.0,
- "step": 1996
- },
- {
- "epoch": 1.5174772036474165,
- "grad_norm": 1.822415828704834,
- "learning_rate": 2.661156601107424e-06,
- "loss": 0.4541318416595459,
- "mean_token_accuracy": 0.8856616020202637,
- "num_tokens": 17908729.0,
- "step": 1997
- },
- {
- "epoch": 1.5182370820668694,
- "grad_norm": 2.851428985595703,
- "learning_rate": 2.659066505784852e-06,
- "loss": 0.41761666536331177,
- "mean_token_accuracy": 0.8710572719573975,
- "num_tokens": 17913860.0,
- "step": 1998
- },
- {
- "epoch": 1.518996960486322,
- "grad_norm": 1.8483710289001465,
- "learning_rate": 2.6569762988232838e-06,
- "loss": 0.45517268776893616,
- "mean_token_accuracy": 0.8411115407943726,
- "num_tokens": 17923497.0,
- "step": 1999
- },
- {
- "epoch": 1.5197568389057752,
- "grad_norm": 1.9044219255447388,
- "learning_rate": 2.654885981689706e-06,
- "loss": 0.42533189058303833,
- "mean_token_accuracy": 0.8597894906997681,
- "num_tokens": 17932670.0,
- "step": 2000
- }
- ],
- "logging_steps": 1.0,
- "max_steps": 3948,
- "num_input_tokens_seen": 0,
- "num_train_epochs": 3,
- "save_steps": 1000,
- "stateful_callbacks": {
- "TrainerControl": {
- "args": {
- "should_epoch_stop": false,
- "should_evaluate": false,
- "should_log": false,
- "should_save": true,
- "should_training_stop": false
- },
- "attributes": {}
- }
- },
- "total_flos": 1.9547571235271475e+17,
- "train_batch_size": 1,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin
deleted file mode 100644
index 2fc4f538d721f958cdceda5408f2f4e1a35f4210..0000000000000000000000000000000000000000
--- a/checkpoint-2000/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021
-size 6225
diff --git a/checkpoint-3000/chat_template.jinja b/checkpoint-3000/chat_template.jinja
deleted file mode 100644
index 70adff8a08fb31e0636f618564838d4bf3c05286..0000000000000000000000000000000000000000
--- a/checkpoint-3000/chat_template.jinja
+++ /dev/null
@@ -1,61 +0,0 @@
-{%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0].role == 'system' %}
- {{- messages[0].content + '\n\n' }}
- {%- endif %}
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
-{%- else %}
- {%- if messages[0].role == 'system' %}
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
- {%- endif %}
-{%- endif %}
-{%- for message in messages %}
- {%- if message.content is string %}
- {%- set content = message.content %}
- {%- else %}
- {%- set content = '' %}
- {%- endif %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
- {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- if message.tool_calls %}
- {%- for tool_call in message.tool_calls %}
- {%- if (loop.first and content) or (not loop.first) %}
- {{- '\n' }}
- {%- endif %}
- {%- if tool_call.function %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {%- if tool_call.arguments is string %}
- {{- tool_call.arguments }}
- {%- else %}
- {{- tool_call.arguments | tojson }}
- {%- endif %}
- {{- '}\n' }}
- {%- endfor %}
- {%- endif %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n\n' }}
- {{- content }}
- {{- '\n' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
-{%- endif %}
\ No newline at end of file
diff --git a/checkpoint-3000/config.json b/checkpoint-3000/config.json
deleted file mode 100644
index c351e5fb52f50ea6e07b40981aef81c80f9df7e4..0000000000000000000000000000000000000000
--- a/checkpoint-3000/config.json
+++ /dev/null
@@ -1,71 +0,0 @@
-{
- "architectures": [
- "Qwen3ForCausalLM"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": null,
- "dtype": "float32",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2560,
- "initializer_range": 0.02,
- "intermediate_size": 9728,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 262144,
- "max_window_layers": 36,
- "model_type": "qwen3",
- "num_attention_heads": 32,
- "num_hidden_layers": 36,
- "num_key_value_heads": 8,
- "pad_token_id": 151662,
- "rms_norm_eps": 1e-06,
- "rope_parameters": {
- "rope_theta": 5000000,
- "rope_type": "default"
- },
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "5.5.3",
- "use_cache": false,
- "use_sliding_window": false,
- "vocab_size": 151936
-}
diff --git a/checkpoint-3000/generation_config.json b/checkpoint-3000/generation_config.json
deleted file mode 100644
index 2104b83493c2833855e8fe32a7a784805ab5c2ee..0000000000000000000000000000000000000000
--- a/checkpoint-3000/generation_config.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
- "do_sample": true,
- "eos_token_id": [
- 151645,
- 151643
- ],
- "pad_token_id": 151662,
- "temperature": 0.7,
- "top_k": 20,
- "top_p": 0.8,
- "transformers_version": "5.5.3"
-}
diff --git a/checkpoint-3000/model.safetensors b/checkpoint-3000/model.safetensors
deleted file mode 100644
index 07e0e931ed749c8c0c6c086ebb969bd3c5167e3f..0000000000000000000000000000000000000000
--- a/checkpoint-3000/model.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0a87a133eb5ec5af0878395bc45e179834b11224819f981211f70acdd015060b
-size 17645743048
diff --git a/checkpoint-3000/optimizer.bin b/checkpoint-3000/optimizer.bin
deleted file mode 100644
index 18574ad6580a2815e85a104eea5910c353aaf5dc..0000000000000000000000000000000000000000
--- a/checkpoint-3000/optimizer.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0ff8e5977667fc938b297528391c931889487050b2acf34a78a42a820912cd38
-size 32180124005
diff --git a/checkpoint-3000/pytorch_model_fsdp.bin b/checkpoint-3000/pytorch_model_fsdp.bin
deleted file mode 100644
index 798e41cb07595e0af0eea0bc21a9c2bdffb4914c..0000000000000000000000000000000000000000
--- a/checkpoint-3000/pytorch_model_fsdp.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3023a52ce183c0d2cddf839ebf937f5047e153db9c651eb9f295b9a386e6b589
-size 17645897996
diff --git a/checkpoint-3000/rng_state_0.pth b/checkpoint-3000/rng_state_0.pth
deleted file mode 100644
index 5379ca97bc0c62d226d0fc37920d4937a7bb8b43..0000000000000000000000000000000000000000
--- a/checkpoint-3000/rng_state_0.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:61e957b4cd785256be4cb26eb03060ef689e1d58f1766d7f26ca36a62bec4994
-size 14917
diff --git a/checkpoint-3000/rng_state_1.pth b/checkpoint-3000/rng_state_1.pth
deleted file mode 100644
index 662ad0d5b30369c825f66c080779973608c5058e..0000000000000000000000000000000000000000
--- a/checkpoint-3000/rng_state_1.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:550c54d430b44b77b0abe44c6e3ceba90a155305315c081b7616b35e2c18d1ce
-size 14917
diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt
deleted file mode 100644
index 58a045115b7b529e69edb60002fbf90b0935a577..0000000000000000000000000000000000000000
--- a/checkpoint-3000/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b07c9eca675fb8c47d0c01728c4ef879c66a752ffdace85e7e9feac32b48ac4b
-size 1465
diff --git a/checkpoint-3000/tokenizer.json b/checkpoint-3000/tokenizer.json
deleted file mode 100644
index c7afbed2efcdf019f88ab0572ec29d3bf595dfe2..0000000000000000000000000000000000000000
--- a/checkpoint-3000/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
-size 11422650
diff --git a/checkpoint-3000/tokenizer_config.json b/checkpoint-3000/tokenizer_config.json
deleted file mode 100644
index 4e47e52c4e7f0b2bcf2103a878790216f3f6436d..0000000000000000000000000000000000000000
--- a/checkpoint-3000/tokenizer_config.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
- "add_prefix_space": false,
- "backend": "tokenizers",
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "is_local": false,
- "model_max_length": 1010000,
- "pad_token": "<|fim_pad|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
-}
diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json
deleted file mode 100644
index 666130f045326cc7a4d60f3405606f5f0040b4a4..0000000000000000000000000000000000000000
--- a/checkpoint-3000/trainer_state.json
+++ /dev/null
@@ -1,27034 +0,0 @@
-{
- "best_global_step": null,
- "best_metric": null,
- "best_model_checkpoint": null,
- "epoch": 2.2796352583586628,
- "eval_steps": 500,
- "global_step": 3000,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0007598784194528875,
- "grad_norm": 11.767926216125488,
- "learning_rate": 0.0,
- "loss": 0.7937269806861877,
- "mean_token_accuracy": 0.7822731137275696,
- "num_tokens": 10507.0,
- "step": 1
- },
- {
- "epoch": 0.001519756838905775,
- "grad_norm": 14.9199800491333,
- "learning_rate": 2.5252525252525256e-08,
- "loss": 0.7665389776229858,
- "mean_token_accuracy": 0.8342233300209045,
- "num_tokens": 14806.0,
- "step": 2
- },
- {
- "epoch": 0.0022796352583586625,
- "grad_norm": 11.991217613220215,
- "learning_rate": 5.050505050505051e-08,
- "loss": 0.9597002267837524,
- "mean_token_accuracy": 0.7054992318153381,
- "num_tokens": 27170.0,
- "step": 3
- },
- {
- "epoch": 0.00303951367781155,
- "grad_norm": 12.958333015441895,
- "learning_rate": 7.575757575757576e-08,
- "loss": 0.9971482753753662,
- "mean_token_accuracy": 0.7261134386062622,
- "num_tokens": 33729.0,
- "step": 4
- },
- {
- "epoch": 0.003799392097264438,
- "grad_norm": 13.5665283203125,
- "learning_rate": 1.0101010101010103e-07,
- "loss": 0.9504883885383606,
- "mean_token_accuracy": 0.745307445526123,
- "num_tokens": 41174.0,
- "step": 5
- },
- {
- "epoch": 0.004559270516717325,
- "grad_norm": 10.09444808959961,
- "learning_rate": 1.2626262626262626e-07,
- "loss": 0.759548008441925,
- "mean_token_accuracy": 0.7842121124267578,
- "num_tokens": 47943.0,
- "step": 6
- },
- {
- "epoch": 0.005319148936170213,
- "grad_norm": 10.741650581359863,
- "learning_rate": 1.5151515151515152e-07,
- "loss": 0.8231598138809204,
- "mean_token_accuracy": 0.7550969123840332,
- "num_tokens": 56665.0,
- "step": 7
- },
- {
- "epoch": 0.0060790273556231,
- "grad_norm": 12.250170707702637,
- "learning_rate": 1.767676767676768e-07,
- "loss": 0.8576581478118896,
- "mean_token_accuracy": 0.7568671703338623,
- "num_tokens": 67606.0,
- "step": 8
- },
- {
- "epoch": 0.006838905775075988,
- "grad_norm": 12.828629493713379,
- "learning_rate": 2.0202020202020205e-07,
- "loss": 0.9886435866355896,
- "mean_token_accuracy": 0.733400285243988,
- "num_tokens": 74272.0,
- "step": 9
- },
- {
- "epoch": 0.007598784194528876,
- "grad_norm": 15.966923713684082,
- "learning_rate": 2.2727272727272729e-07,
- "loss": 1.064985990524292,
- "mean_token_accuracy": 0.7101132869720459,
- "num_tokens": 80524.0,
- "step": 10
- },
- {
- "epoch": 0.008358662613981762,
- "grad_norm": 10.864850044250488,
- "learning_rate": 2.525252525252525e-07,
- "loss": 0.8311550617218018,
- "mean_token_accuracy": 0.7431639432907104,
- "num_tokens": 96292.0,
- "step": 11
- },
- {
- "epoch": 0.00911854103343465,
- "grad_norm": 16.438785552978516,
- "learning_rate": 2.7777777777777776e-07,
- "loss": 1.0579866170883179,
- "mean_token_accuracy": 0.7222976684570312,
- "num_tokens": 102992.0,
- "step": 12
- },
- {
- "epoch": 0.009878419452887538,
- "grad_norm": 11.179214477539062,
- "learning_rate": 3.0303030303030305e-07,
- "loss": 0.9816144704818726,
- "mean_token_accuracy": 0.7206371426582336,
- "num_tokens": 113571.0,
- "step": 13
- },
- {
- "epoch": 0.010638297872340425,
- "grad_norm": 12.780299186706543,
- "learning_rate": 3.2828282828282834e-07,
- "loss": 0.847449004650116,
- "mean_token_accuracy": 0.7826199531555176,
- "num_tokens": 119568.0,
- "step": 14
- },
- {
- "epoch": 0.011398176291793313,
- "grad_norm": 14.800421714782715,
- "learning_rate": 3.535353535353536e-07,
- "loss": 0.9275516271591187,
- "mean_token_accuracy": 0.7655045986175537,
- "num_tokens": 126258.0,
- "step": 15
- },
- {
- "epoch": 0.0121580547112462,
- "grad_norm": 11.267602920532227,
- "learning_rate": 3.787878787878788e-07,
- "loss": 0.8464037179946899,
- "mean_token_accuracy": 0.7606508731842041,
- "num_tokens": 136831.0,
- "step": 16
- },
- {
- "epoch": 0.012917933130699088,
- "grad_norm": 12.891013145446777,
- "learning_rate": 4.040404040404041e-07,
- "loss": 0.9903074502944946,
- "mean_token_accuracy": 0.7247487306594849,
- "num_tokens": 150434.0,
- "step": 17
- },
- {
- "epoch": 0.013677811550151976,
- "grad_norm": 11.13957691192627,
- "learning_rate": 4.2929292929292934e-07,
- "loss": 0.8287211656570435,
- "mean_token_accuracy": 0.7621913552284241,
- "num_tokens": 158516.0,
- "step": 18
- },
- {
- "epoch": 0.014437689969604863,
- "grad_norm": 18.39569664001465,
- "learning_rate": 4.5454545454545457e-07,
- "loss": 1.150015115737915,
- "mean_token_accuracy": 0.7349498271942139,
- "num_tokens": 162214.0,
- "step": 19
- },
- {
- "epoch": 0.015197568389057751,
- "grad_norm": 9.353750228881836,
- "learning_rate": 4.797979797979798e-07,
- "loss": 0.7228299379348755,
- "mean_token_accuracy": 0.7969573736190796,
- "num_tokens": 173035.0,
- "step": 20
- },
- {
- "epoch": 0.015957446808510637,
- "grad_norm": 8.267163276672363,
- "learning_rate": 5.05050505050505e-07,
- "loss": 0.7358136177062988,
- "mean_token_accuracy": 0.7903937101364136,
- "num_tokens": 183568.0,
- "step": 21
- },
- {
- "epoch": 0.016717325227963525,
- "grad_norm": 11.137128829956055,
- "learning_rate": 5.303030303030304e-07,
- "loss": 1.0075397491455078,
- "mean_token_accuracy": 0.702807605266571,
- "num_tokens": 192759.0,
- "step": 22
- },
- {
- "epoch": 0.017477203647416412,
- "grad_norm": 10.734103202819824,
- "learning_rate": 5.555555555555555e-07,
- "loss": 0.8925919532775879,
- "mean_token_accuracy": 0.7475671768188477,
- "num_tokens": 201280.0,
- "step": 23
- },
- {
- "epoch": 0.0182370820668693,
- "grad_norm": 11.945566177368164,
- "learning_rate": 5.808080808080809e-07,
- "loss": 0.7260514497756958,
- "mean_token_accuracy": 0.7859152555465698,
- "num_tokens": 218053.0,
- "step": 24
- },
- {
- "epoch": 0.018996960486322188,
- "grad_norm": 18.610652923583984,
- "learning_rate": 6.060606060606061e-07,
- "loss": 0.8995465636253357,
- "mean_token_accuracy": 0.7931990623474121,
- "num_tokens": 220953.0,
- "step": 25
- },
- {
- "epoch": 0.019756838905775075,
- "grad_norm": 10.51898193359375,
- "learning_rate": 6.313131313131314e-07,
- "loss": 0.9532671570777893,
- "mean_token_accuracy": 0.7257645726203918,
- "num_tokens": 231200.0,
- "step": 26
- },
- {
- "epoch": 0.020516717325227963,
- "grad_norm": 9.581812858581543,
- "learning_rate": 6.565656565656567e-07,
- "loss": 0.9038010239601135,
- "mean_token_accuracy": 0.7390379905700684,
- "num_tokens": 237711.0,
- "step": 27
- },
- {
- "epoch": 0.02127659574468085,
- "grad_norm": 12.297484397888184,
- "learning_rate": 6.818181818181818e-07,
- "loss": 1.048936367034912,
- "mean_token_accuracy": 0.7175670862197876,
- "num_tokens": 242503.0,
- "step": 28
- },
- {
- "epoch": 0.022036474164133738,
- "grad_norm": 7.437953472137451,
- "learning_rate": 7.070707070707071e-07,
- "loss": 0.8308826684951782,
- "mean_token_accuracy": 0.7415335774421692,
- "num_tokens": 250842.0,
- "step": 29
- },
- {
- "epoch": 0.022796352583586626,
- "grad_norm": 6.134475231170654,
- "learning_rate": 7.323232323232324e-07,
- "loss": 0.647913932800293,
- "mean_token_accuracy": 0.8124054670333862,
- "num_tokens": 267453.0,
- "step": 30
- },
- {
- "epoch": 0.023556231003039513,
- "grad_norm": 6.678966045379639,
- "learning_rate": 7.575757575757576e-07,
- "loss": 0.7052810192108154,
- "mean_token_accuracy": 0.7908754348754883,
- "num_tokens": 284416.0,
- "step": 31
- },
- {
- "epoch": 0.0243161094224924,
- "grad_norm": 7.42232084274292,
- "learning_rate": 7.82828282828283e-07,
- "loss": 1.022383213043213,
- "mean_token_accuracy": 0.7053230404853821,
- "num_tokens": 292073.0,
- "step": 32
- },
- {
- "epoch": 0.02507598784194529,
- "grad_norm": 6.463219165802002,
- "learning_rate": 8.080808080808082e-07,
- "loss": 0.7603012323379517,
- "mean_token_accuracy": 0.7728140354156494,
- "num_tokens": 298550.0,
- "step": 33
- },
- {
- "epoch": 0.025835866261398176,
- "grad_norm": 5.668411731719971,
- "learning_rate": 8.333333333333333e-07,
- "loss": 0.7707852721214294,
- "mean_token_accuracy": 0.7827773094177246,
- "num_tokens": 306683.0,
- "step": 34
- },
- {
- "epoch": 0.026595744680851064,
- "grad_norm": 4.984964847564697,
- "learning_rate": 8.585858585858587e-07,
- "loss": 0.6317349672317505,
- "mean_token_accuracy": 0.8106861114501953,
- "num_tokens": 318842.0,
- "step": 35
- },
- {
- "epoch": 0.02735562310030395,
- "grad_norm": 4.421732425689697,
- "learning_rate": 8.838383838383839e-07,
- "loss": 0.6228617429733276,
- "mean_token_accuracy": 0.8023355603218079,
- "num_tokens": 329850.0,
- "step": 36
- },
- {
- "epoch": 0.02811550151975684,
- "grad_norm": 5.970808029174805,
- "learning_rate": 9.090909090909091e-07,
- "loss": 0.8443238139152527,
- "mean_token_accuracy": 0.7462409734725952,
- "num_tokens": 335844.0,
- "step": 37
- },
- {
- "epoch": 0.028875379939209727,
- "grad_norm": 4.5389084815979,
- "learning_rate": 9.343434343434345e-07,
- "loss": 0.6976436376571655,
- "mean_token_accuracy": 0.790410041809082,
- "num_tokens": 348768.0,
- "step": 38
- },
- {
- "epoch": 0.029635258358662615,
- "grad_norm": 4.116631507873535,
- "learning_rate": 9.595959595959596e-07,
- "loss": 0.6698519587516785,
- "mean_token_accuracy": 0.7818127870559692,
- "num_tokens": 355460.0,
- "step": 39
- },
- {
- "epoch": 0.030395136778115502,
- "grad_norm": 3.3714773654937744,
- "learning_rate": 9.84848484848485e-07,
- "loss": 0.5723201036453247,
- "mean_token_accuracy": 0.8100086450576782,
- "num_tokens": 368507.0,
- "step": 40
- },
- {
- "epoch": 0.03115501519756839,
- "grad_norm": 4.4438347816467285,
- "learning_rate": 1.01010101010101e-06,
- "loss": 0.7508786916732788,
- "mean_token_accuracy": 0.7711942791938782,
- "num_tokens": 376467.0,
- "step": 41
- },
- {
- "epoch": 0.031914893617021274,
- "grad_norm": 5.609974384307861,
- "learning_rate": 1.0353535353535354e-06,
- "loss": 0.566256046295166,
- "mean_token_accuracy": 0.8319284319877625,
- "num_tokens": 381399.0,
- "step": 42
- },
- {
- "epoch": 0.03267477203647416,
- "grad_norm": 5.124386787414551,
- "learning_rate": 1.0606060606060608e-06,
- "loss": 0.8151067495346069,
- "mean_token_accuracy": 0.7537785768508911,
- "num_tokens": 387389.0,
- "step": 43
- },
- {
- "epoch": 0.03343465045592705,
- "grad_norm": 3.6318116188049316,
- "learning_rate": 1.085858585858586e-06,
- "loss": 0.5989949107170105,
- "mean_token_accuracy": 0.8129256963729858,
- "num_tokens": 395302.0,
- "step": 44
- },
- {
- "epoch": 0.03419452887537994,
- "grad_norm": 2.694424629211426,
- "learning_rate": 1.111111111111111e-06,
- "loss": 0.5831396579742432,
- "mean_token_accuracy": 0.8056820631027222,
- "num_tokens": 409920.0,
- "step": 45
- },
- {
- "epoch": 0.034954407294832825,
- "grad_norm": 2.2949178218841553,
- "learning_rate": 1.1363636363636364e-06,
- "loss": 0.472550630569458,
- "mean_token_accuracy": 0.8343006372451782,
- "num_tokens": 428323.0,
- "step": 46
- },
- {
- "epoch": 0.03571428571428571,
- "grad_norm": 3.3930575847625732,
- "learning_rate": 1.1616161616161617e-06,
- "loss": 0.6246505379676819,
- "mean_token_accuracy": 0.783149003982544,
- "num_tokens": 435889.0,
- "step": 47
- },
- {
- "epoch": 0.0364741641337386,
- "grad_norm": 3.692598819732666,
- "learning_rate": 1.186868686868687e-06,
- "loss": 0.46132946014404297,
- "mean_token_accuracy": 0.8583089113235474,
- "num_tokens": 441192.0,
- "step": 48
- },
- {
- "epoch": 0.03723404255319149,
- "grad_norm": 6.571533203125,
- "learning_rate": 1.2121212121212122e-06,
- "loss": 0.9351121783256531,
- "mean_token_accuracy": 0.7580878734588623,
- "num_tokens": 444277.0,
- "step": 49
- },
- {
- "epoch": 0.037993920972644375,
- "grad_norm": 5.029570579528809,
- "learning_rate": 1.2373737373737375e-06,
- "loss": 0.6921554803848267,
- "mean_token_accuracy": 0.8131166100502014,
- "num_tokens": 447646.0,
- "step": 50
- },
- {
- "epoch": 0.03875379939209726,
- "grad_norm": 2.9174208641052246,
- "learning_rate": 1.2626262626262629e-06,
- "loss": 0.591706395149231,
- "mean_token_accuracy": 0.8108617067337036,
- "num_tokens": 461397.0,
- "step": 51
- },
- {
- "epoch": 0.03951367781155015,
- "grad_norm": 4.315536022186279,
- "learning_rate": 1.287878787878788e-06,
- "loss": 0.6986310482025146,
- "mean_token_accuracy": 0.7710754871368408,
- "num_tokens": 472047.0,
- "step": 52
- },
- {
- "epoch": 0.04027355623100304,
- "grad_norm": 2.6216275691986084,
- "learning_rate": 1.3131313131313134e-06,
- "loss": 0.5553690791130066,
- "mean_token_accuracy": 0.8167896866798401,
- "num_tokens": 482795.0,
- "step": 53
- },
- {
- "epoch": 0.041033434650455926,
- "grad_norm": 3.0562477111816406,
- "learning_rate": 1.3383838383838385e-06,
- "loss": 0.6909202337265015,
- "mean_token_accuracy": 0.7859863638877869,
- "num_tokens": 494818.0,
- "step": 54
- },
- {
- "epoch": 0.04179331306990881,
- "grad_norm": 2.1420412063598633,
- "learning_rate": 1.3636363636363636e-06,
- "loss": 0.5415265560150146,
- "mean_token_accuracy": 0.818886399269104,
- "num_tokens": 513695.0,
- "step": 55
- },
- {
- "epoch": 0.0425531914893617,
- "grad_norm": 2.9610488414764404,
- "learning_rate": 1.3888888888888892e-06,
- "loss": 0.6602212190628052,
- "mean_token_accuracy": 0.7830734252929688,
- "num_tokens": 523784.0,
- "step": 56
- },
- {
- "epoch": 0.04331306990881459,
- "grad_norm": 2.511972665786743,
- "learning_rate": 1.4141414141414143e-06,
- "loss": 0.5717809796333313,
- "mean_token_accuracy": 0.8053616285324097,
- "num_tokens": 546308.0,
- "step": 57
- },
- {
- "epoch": 0.044072948328267476,
- "grad_norm": 3.52642822265625,
- "learning_rate": 1.4393939393939396e-06,
- "loss": 0.6242594718933105,
- "mean_token_accuracy": 0.8162082433700562,
- "num_tokens": 552019.0,
- "step": 58
- },
- {
- "epoch": 0.044832826747720364,
- "grad_norm": 3.02362322807312,
- "learning_rate": 1.4646464646464648e-06,
- "loss": 0.6634255647659302,
- "mean_token_accuracy": 0.7682032585144043,
- "num_tokens": 560009.0,
- "step": 59
- },
- {
- "epoch": 0.04559270516717325,
- "grad_norm": 2.3910107612609863,
- "learning_rate": 1.48989898989899e-06,
- "loss": 0.5519146919250488,
- "mean_token_accuracy": 0.8270269632339478,
- "num_tokens": 571005.0,
- "step": 60
- },
- {
- "epoch": 0.04635258358662614,
- "grad_norm": 4.28154993057251,
- "learning_rate": 1.5151515151515152e-06,
- "loss": 0.7437789440155029,
- "mean_token_accuracy": 0.7782418131828308,
- "num_tokens": 574950.0,
- "step": 61
- },
- {
- "epoch": 0.04711246200607903,
- "grad_norm": 3.4078686237335205,
- "learning_rate": 1.5404040404040404e-06,
- "loss": 0.6345915198326111,
- "mean_token_accuracy": 0.7903392314910889,
- "num_tokens": 581657.0,
- "step": 62
- },
- {
- "epoch": 0.047872340425531915,
- "grad_norm": 2.6834158897399902,
- "learning_rate": 1.565656565656566e-06,
- "loss": 0.5981127023696899,
- "mean_token_accuracy": 0.7911489605903625,
- "num_tokens": 591267.0,
- "step": 63
- },
- {
- "epoch": 0.0486322188449848,
- "grad_norm": 2.1054461002349854,
- "learning_rate": 1.590909090909091e-06,
- "loss": 0.5523523688316345,
- "mean_token_accuracy": 0.8194501399993896,
- "num_tokens": 606787.0,
- "step": 64
- },
- {
- "epoch": 0.04939209726443769,
- "grad_norm": 3.322596788406372,
- "learning_rate": 1.6161616161616164e-06,
- "loss": 0.48417025804519653,
- "mean_token_accuracy": 0.8293706178665161,
- "num_tokens": 611068.0,
- "step": 65
- },
- {
- "epoch": 0.05015197568389058,
- "grad_norm": 2.302450180053711,
- "learning_rate": 1.6414141414141415e-06,
- "loss": 0.6498389840126038,
- "mean_token_accuracy": 0.7728497385978699,
- "num_tokens": 624452.0,
- "step": 66
- },
- {
- "epoch": 0.050911854103343465,
- "grad_norm": 2.680191993713379,
- "learning_rate": 1.6666666666666667e-06,
- "loss": 0.6347037553787231,
- "mean_token_accuracy": 0.8108306527137756,
- "num_tokens": 638049.0,
- "step": 67
- },
- {
- "epoch": 0.05167173252279635,
- "grad_norm": 3.0297021865844727,
- "learning_rate": 1.6919191919191922e-06,
- "loss": 0.5344363451004028,
- "mean_token_accuracy": 0.8113535046577454,
- "num_tokens": 643892.0,
- "step": 68
- },
- {
- "epoch": 0.05243161094224924,
- "grad_norm": 2.9283676147460938,
- "learning_rate": 1.7171717171717173e-06,
- "loss": 0.6999260187149048,
- "mean_token_accuracy": 0.7782022356987,
- "num_tokens": 654418.0,
- "step": 69
- },
- {
- "epoch": 0.05319148936170213,
- "grad_norm": 3.4098572731018066,
- "learning_rate": 1.7424242424242427e-06,
- "loss": 0.6508946418762207,
- "mean_token_accuracy": 0.7942900657653809,
- "num_tokens": 659837.0,
- "step": 70
- },
- {
- "epoch": 0.053951367781155016,
- "grad_norm": 2.6756019592285156,
- "learning_rate": 1.7676767676767678e-06,
- "loss": 0.603486180305481,
- "mean_token_accuracy": 0.8015457391738892,
- "num_tokens": 668361.0,
- "step": 71
- },
- {
- "epoch": 0.0547112462006079,
- "grad_norm": 2.2630293369293213,
- "learning_rate": 1.792929292929293e-06,
- "loss": 0.6608274579048157,
- "mean_token_accuracy": 0.7753809690475464,
- "num_tokens": 679025.0,
- "step": 72
- },
- {
- "epoch": 0.05547112462006079,
- "grad_norm": 2.123962879180908,
- "learning_rate": 1.8181818181818183e-06,
- "loss": 0.4525482654571533,
- "mean_token_accuracy": 0.8425612449645996,
- "num_tokens": 688574.0,
- "step": 73
- },
- {
- "epoch": 0.05623100303951368,
- "grad_norm": 7.90519905090332,
- "learning_rate": 1.8434343434343434e-06,
- "loss": 0.6507195830345154,
- "mean_token_accuracy": 0.7714964151382446,
- "num_tokens": 694534.0,
- "step": 74
- },
- {
- "epoch": 0.056990881458966566,
- "grad_norm": 2.372203826904297,
- "learning_rate": 1.868686868686869e-06,
- "loss": 0.4458143413066864,
- "mean_token_accuracy": 0.7991449236869812,
- "num_tokens": 703114.0,
- "step": 75
- },
- {
- "epoch": 0.057750759878419454,
- "grad_norm": 2.918677568435669,
- "learning_rate": 1.8939393939393941e-06,
- "loss": 0.5614339113235474,
- "mean_token_accuracy": 0.8211464881896973,
- "num_tokens": 709038.0,
- "step": 76
- },
- {
- "epoch": 0.05851063829787234,
- "grad_norm": 1.6106709241867065,
- "learning_rate": 1.9191919191919192e-06,
- "loss": 0.5802098512649536,
- "mean_token_accuracy": 0.8055065870285034,
- "num_tokens": 730482.0,
- "step": 77
- },
- {
- "epoch": 0.05927051671732523,
- "grad_norm": 2.8069989681243896,
- "learning_rate": 1.944444444444445e-06,
- "loss": 0.5709059238433838,
- "mean_token_accuracy": 0.8024872541427612,
- "num_tokens": 751817.0,
- "step": 78
- },
- {
- "epoch": 0.06003039513677812,
- "grad_norm": 2.641667127609253,
- "learning_rate": 1.96969696969697e-06,
- "loss": 0.6480152606964111,
- "mean_token_accuracy": 0.7912271618843079,
- "num_tokens": 759236.0,
- "step": 79
- },
- {
- "epoch": 0.060790273556231005,
- "grad_norm": 2.6034350395202637,
- "learning_rate": 1.994949494949495e-06,
- "loss": 0.5535176396369934,
- "mean_token_accuracy": 0.7980542778968811,
- "num_tokens": 766496.0,
- "step": 80
- },
- {
- "epoch": 0.06155015197568389,
- "grad_norm": 1.7095069885253906,
- "learning_rate": 2.02020202020202e-06,
- "loss": 0.4545496106147766,
- "mean_token_accuracy": 0.8229660391807556,
- "num_tokens": 780124.0,
- "step": 81
- },
- {
- "epoch": 0.06231003039513678,
- "grad_norm": 3.788830518722534,
- "learning_rate": 2.0454545454545457e-06,
- "loss": 0.6679391264915466,
- "mean_token_accuracy": 0.7942397594451904,
- "num_tokens": 784555.0,
- "step": 82
- },
- {
- "epoch": 0.06306990881458967,
- "grad_norm": 2.009831666946411,
- "learning_rate": 2.070707070707071e-06,
- "loss": 0.5067101120948792,
- "mean_token_accuracy": 0.8276634216308594,
- "num_tokens": 797459.0,
- "step": 83
- },
- {
- "epoch": 0.06382978723404255,
- "grad_norm": 2.201627731323242,
- "learning_rate": 2.095959595959596e-06,
- "loss": 0.5012127161026001,
- "mean_token_accuracy": 0.8432504534721375,
- "num_tokens": 810817.0,
- "step": 84
- },
- {
- "epoch": 0.06458966565349544,
- "grad_norm": 2.492568016052246,
- "learning_rate": 2.1212121212121216e-06,
- "loss": 0.6142797470092773,
- "mean_token_accuracy": 0.8338661193847656,
- "num_tokens": 818191.0,
- "step": 85
- },
- {
- "epoch": 0.06534954407294832,
- "grad_norm": 2.8360862731933594,
- "learning_rate": 2.1464646464646467e-06,
- "loss": 0.5569300651550293,
- "mean_token_accuracy": 0.8121030330657959,
- "num_tokens": 825325.0,
- "step": 86
- },
- {
- "epoch": 0.06610942249240122,
- "grad_norm": 2.407548427581787,
- "learning_rate": 2.171717171717172e-06,
- "loss": 0.6442930102348328,
- "mean_token_accuracy": 0.792514443397522,
- "num_tokens": 834439.0,
- "step": 87
- },
- {
- "epoch": 0.0668693009118541,
- "grad_norm": 2.340728759765625,
- "learning_rate": 2.196969696969697e-06,
- "loss": 0.6494365930557251,
- "mean_token_accuracy": 0.7746615409851074,
- "num_tokens": 843078.0,
- "step": 88
- },
- {
- "epoch": 0.067629179331307,
- "grad_norm": 1.7703697681427002,
- "learning_rate": 2.222222222222222e-06,
- "loss": 0.598991870880127,
- "mean_token_accuracy": 0.7992157340049744,
- "num_tokens": 860171.0,
- "step": 89
- },
- {
- "epoch": 0.06838905775075987,
- "grad_norm": 2.5779271125793457,
- "learning_rate": 2.2474747474747476e-06,
- "loss": 0.5693082809448242,
- "mean_token_accuracy": 0.8093700408935547,
- "num_tokens": 866669.0,
- "step": 90
- },
- {
- "epoch": 0.06914893617021277,
- "grad_norm": 2.014092206954956,
- "learning_rate": 2.2727272727272728e-06,
- "loss": 0.5346695780754089,
- "mean_token_accuracy": 0.8165590763092041,
- "num_tokens": 876698.0,
- "step": 91
- },
- {
- "epoch": 0.06990881458966565,
- "grad_norm": 1.7555919885635376,
- "learning_rate": 2.2979797979797983e-06,
- "loss": 0.5321458578109741,
- "mean_token_accuracy": 0.8166656494140625,
- "num_tokens": 889488.0,
- "step": 92
- },
- {
- "epoch": 0.07066869300911854,
- "grad_norm": 1.8631824254989624,
- "learning_rate": 2.3232323232323234e-06,
- "loss": 0.5246532559394836,
- "mean_token_accuracy": 0.8088107705116272,
- "num_tokens": 901322.0,
- "step": 93
- },
- {
- "epoch": 0.07142857142857142,
- "grad_norm": 3.2332139015197754,
- "learning_rate": 2.348484848484849e-06,
- "loss": 0.5141711235046387,
- "mean_token_accuracy": 0.8382217884063721,
- "num_tokens": 905792.0,
- "step": 94
- },
- {
- "epoch": 0.07218844984802432,
- "grad_norm": 1.7806555032730103,
- "learning_rate": 2.373737373737374e-06,
- "loss": 0.5233149528503418,
- "mean_token_accuracy": 0.8101529479026794,
- "num_tokens": 917320.0,
- "step": 95
- },
- {
- "epoch": 0.0729483282674772,
- "grad_norm": 1.8169859647750854,
- "learning_rate": 2.3989898989898993e-06,
- "loss": 0.578881561756134,
- "mean_token_accuracy": 0.8044873476028442,
- "num_tokens": 931062.0,
- "step": 96
- },
- {
- "epoch": 0.0737082066869301,
- "grad_norm": 4.677402496337891,
- "learning_rate": 2.4242424242424244e-06,
- "loss": 0.7842556238174438,
- "mean_token_accuracy": 0.7579764127731323,
- "num_tokens": 934712.0,
- "step": 97
- },
- {
- "epoch": 0.07446808510638298,
- "grad_norm": 2.6987264156341553,
- "learning_rate": 2.4494949494949495e-06,
- "loss": 0.5669287443161011,
- "mean_token_accuracy": 0.8186933994293213,
- "num_tokens": 941058.0,
- "step": 98
- },
- {
- "epoch": 0.07522796352583587,
- "grad_norm": 1.6906023025512695,
- "learning_rate": 2.474747474747475e-06,
- "loss": 0.4976363778114319,
- "mean_token_accuracy": 0.8198553323745728,
- "num_tokens": 956509.0,
- "step": 99
- },
- {
- "epoch": 0.07598784194528875,
- "grad_norm": 2.7256152629852295,
- "learning_rate": 2.5e-06,
- "loss": 0.7138420343399048,
- "mean_token_accuracy": 0.7752805948257446,
- "num_tokens": 963920.0,
- "step": 100
- },
- {
- "epoch": 0.07674772036474165,
- "grad_norm": 2.174870491027832,
- "learning_rate": 2.5252525252525258e-06,
- "loss": 0.6733541488647461,
- "mean_token_accuracy": 0.7745175361633301,
- "num_tokens": 975268.0,
- "step": 101
- },
- {
- "epoch": 0.07750759878419453,
- "grad_norm": 1.5587213039398193,
- "learning_rate": 2.5505050505050505e-06,
- "loss": 0.44223445653915405,
- "mean_token_accuracy": 0.8278359174728394,
- "num_tokens": 991837.0,
- "step": 102
- },
- {
- "epoch": 0.07826747720364742,
- "grad_norm": 2.181840658187866,
- "learning_rate": 2.575757575757576e-06,
- "loss": 0.625128448009491,
- "mean_token_accuracy": 0.7941786050796509,
- "num_tokens": 1004325.0,
- "step": 103
- },
- {
- "epoch": 0.0790273556231003,
- "grad_norm": 1.4986687898635864,
- "learning_rate": 2.601010101010101e-06,
- "loss": 0.39262527227401733,
- "mean_token_accuracy": 0.8412648439407349,
- "num_tokens": 1018331.0,
- "step": 104
- },
- {
- "epoch": 0.0797872340425532,
- "grad_norm": 2.3416061401367188,
- "learning_rate": 2.6262626262626267e-06,
- "loss": 0.5495132803916931,
- "mean_token_accuracy": 0.8193322420120239,
- "num_tokens": 1026090.0,
- "step": 105
- },
- {
- "epoch": 0.08054711246200608,
- "grad_norm": 3.8168859481811523,
- "learning_rate": 2.6515151515151514e-06,
- "loss": 0.4898706376552582,
- "mean_token_accuracy": 0.8467956185340881,
- "num_tokens": 1029955.0,
- "step": 106
- },
- {
- "epoch": 0.08130699088145897,
- "grad_norm": 4.113908767700195,
- "learning_rate": 2.676767676767677e-06,
- "loss": 0.6189584732055664,
- "mean_token_accuracy": 0.8019394278526306,
- "num_tokens": 1033598.0,
- "step": 107
- },
- {
- "epoch": 0.08206686930091185,
- "grad_norm": 2.50003981590271,
- "learning_rate": 2.7020202020202025e-06,
- "loss": 0.6479471921920776,
- "mean_token_accuracy": 0.7790026664733887,
- "num_tokens": 1042533.0,
- "step": 108
- },
- {
- "epoch": 0.08282674772036475,
- "grad_norm": 1.408934473991394,
- "learning_rate": 2.7272727272727272e-06,
- "loss": 0.3909248113632202,
- "mean_token_accuracy": 0.8477586507797241,
- "num_tokens": 1061755.0,
- "step": 109
- },
- {
- "epoch": 0.08358662613981763,
- "grad_norm": 3.360633611679077,
- "learning_rate": 2.7525252525252528e-06,
- "loss": 0.6952459812164307,
- "mean_token_accuracy": 0.777535080909729,
- "num_tokens": 1067316.0,
- "step": 110
- },
- {
- "epoch": 0.08434650455927052,
- "grad_norm": 1.8631696701049805,
- "learning_rate": 2.7777777777777783e-06,
- "loss": 0.5420593023300171,
- "mean_token_accuracy": 0.8157662749290466,
- "num_tokens": 1079930.0,
- "step": 111
- },
- {
- "epoch": 0.0851063829787234,
- "grad_norm": 2.4308314323425293,
- "learning_rate": 2.803030303030303e-06,
- "loss": 0.5863882303237915,
- "mean_token_accuracy": 0.8206346035003662,
- "num_tokens": 1088069.0,
- "step": 112
- },
- {
- "epoch": 0.0858662613981763,
- "grad_norm": 2.922808885574341,
- "learning_rate": 2.8282828282828286e-06,
- "loss": 0.5217319130897522,
- "mean_token_accuracy": 0.8253234028816223,
- "num_tokens": 1093607.0,
- "step": 113
- },
- {
- "epoch": 0.08662613981762918,
- "grad_norm": 2.3596107959747314,
- "learning_rate": 2.8535353535353537e-06,
- "loss": 0.5070714950561523,
- "mean_token_accuracy": 0.8258323669433594,
- "num_tokens": 1100405.0,
- "step": 114
- },
- {
- "epoch": 0.08738601823708207,
- "grad_norm": 3.0853066444396973,
- "learning_rate": 2.8787878787878793e-06,
- "loss": 0.591964840888977,
- "mean_token_accuracy": 0.8047322630882263,
- "num_tokens": 1107535.0,
- "step": 115
- },
- {
- "epoch": 0.08814589665653495,
- "grad_norm": 1.9251092672348022,
- "learning_rate": 2.904040404040404e-06,
- "loss": 0.5226191878318787,
- "mean_token_accuracy": 0.8022720217704773,
- "num_tokens": 1118716.0,
- "step": 116
- },
- {
- "epoch": 0.08890577507598785,
- "grad_norm": 1.9692988395690918,
- "learning_rate": 2.9292929292929295e-06,
- "loss": 0.5462069511413574,
- "mean_token_accuracy": 0.8157015442848206,
- "num_tokens": 1131917.0,
- "step": 117
- },
- {
- "epoch": 0.08966565349544073,
- "grad_norm": 1.4738909006118774,
- "learning_rate": 2.954545454545455e-06,
- "loss": 0.4564219117164612,
- "mean_token_accuracy": 0.849632978439331,
- "num_tokens": 1148534.0,
- "step": 118
- },
- {
- "epoch": 0.09042553191489362,
- "grad_norm": 2.72646164894104,
- "learning_rate": 2.97979797979798e-06,
- "loss": 0.6654808521270752,
- "mean_token_accuracy": 0.7752684354782104,
- "num_tokens": 1155438.0,
- "step": 119
- },
- {
- "epoch": 0.0911854103343465,
- "grad_norm": 2.7843852043151855,
- "learning_rate": 3.0050505050505054e-06,
- "loss": 0.5354680418968201,
- "mean_token_accuracy": 0.8196378946304321,
- "num_tokens": 1161815.0,
- "step": 120
- },
- {
- "epoch": 0.0919452887537994,
- "grad_norm": 2.8052573204040527,
- "learning_rate": 3.0303030303030305e-06,
- "loss": 0.6366757154464722,
- "mean_token_accuracy": 0.7967483997344971,
- "num_tokens": 1168295.0,
- "step": 121
- },
- {
- "epoch": 0.09270516717325228,
- "grad_norm": 2.7462735176086426,
- "learning_rate": 3.055555555555556e-06,
- "loss": 0.59470534324646,
- "mean_token_accuracy": 0.8023771047592163,
- "num_tokens": 1174502.0,
- "step": 122
- },
- {
- "epoch": 0.09346504559270517,
- "grad_norm": 2.2743821144104004,
- "learning_rate": 3.0808080808080807e-06,
- "loss": 0.5720560550689697,
- "mean_token_accuracy": 0.8162771463394165,
- "num_tokens": 1183615.0,
- "step": 123
- },
- {
- "epoch": 0.09422492401215805,
- "grad_norm": 1.8669533729553223,
- "learning_rate": 3.1060606060606063e-06,
- "loss": 0.4655378758907318,
- "mean_token_accuracy": 0.8360732793807983,
- "num_tokens": 1193761.0,
- "step": 124
- },
- {
- "epoch": 0.09498480243161095,
- "grad_norm": 1.7666901350021362,
- "learning_rate": 3.131313131313132e-06,
- "loss": 0.5524153709411621,
- "mean_token_accuracy": 0.8252713680267334,
- "num_tokens": 1207870.0,
- "step": 125
- },
- {
- "epoch": 0.09574468085106383,
- "grad_norm": 2.4720070362091064,
- "learning_rate": 3.1565656565656566e-06,
- "loss": 0.5003011226654053,
- "mean_token_accuracy": 0.8491042852401733,
- "num_tokens": 1214603.0,
- "step": 126
- },
- {
- "epoch": 0.09650455927051672,
- "grad_norm": 1.6500422954559326,
- "learning_rate": 3.181818181818182e-06,
- "loss": 0.5137069225311279,
- "mean_token_accuracy": 0.8273531198501587,
- "num_tokens": 1228717.0,
- "step": 127
- },
- {
- "epoch": 0.0972644376899696,
- "grad_norm": 3.402543067932129,
- "learning_rate": 3.2070707070707072e-06,
- "loss": 0.708167552947998,
- "mean_token_accuracy": 0.7705385684967041,
- "num_tokens": 1234361.0,
- "step": 128
- },
- {
- "epoch": 0.0980243161094225,
- "grad_norm": 2.547285795211792,
- "learning_rate": 3.232323232323233e-06,
- "loss": 0.6020137071609497,
- "mean_token_accuracy": 0.7981340289115906,
- "num_tokens": 1244169.0,
- "step": 129
- },
- {
- "epoch": 0.09878419452887538,
- "grad_norm": 2.0578792095184326,
- "learning_rate": 3.257575757575758e-06,
- "loss": 0.4425000250339508,
- "mean_token_accuracy": 0.8567807674407959,
- "num_tokens": 1252709.0,
- "step": 130
- },
- {
- "epoch": 0.09954407294832827,
- "grad_norm": 1.672614336013794,
- "learning_rate": 3.282828282828283e-06,
- "loss": 0.4860966205596924,
- "mean_token_accuracy": 0.8393139243125916,
- "num_tokens": 1265766.0,
- "step": 131
- },
- {
- "epoch": 0.10030395136778116,
- "grad_norm": 3.2560198307037354,
- "learning_rate": 3.3080808080808086e-06,
- "loss": 0.624736487865448,
- "mean_token_accuracy": 0.7875322699546814,
- "num_tokens": 1270779.0,
- "step": 132
- },
- {
- "epoch": 0.10106382978723404,
- "grad_norm": 2.4468185901641846,
- "learning_rate": 3.3333333333333333e-06,
- "loss": 0.5062227249145508,
- "mean_token_accuracy": 0.8217229843139648,
- "num_tokens": 1277113.0,
- "step": 133
- },
- {
- "epoch": 0.10182370820668693,
- "grad_norm": 2.6371328830718994,
- "learning_rate": 3.358585858585859e-06,
- "loss": 0.477113276720047,
- "mean_token_accuracy": 0.8605583906173706,
- "num_tokens": 1282514.0,
- "step": 134
- },
- {
- "epoch": 0.10258358662613981,
- "grad_norm": 2.48421311378479,
- "learning_rate": 3.3838383838383844e-06,
- "loss": 0.40855684876441956,
- "mean_token_accuracy": 0.864548921585083,
- "num_tokens": 1287859.0,
- "step": 135
- },
- {
- "epoch": 0.1033434650455927,
- "grad_norm": 1.993099331855774,
- "learning_rate": 3.409090909090909e-06,
- "loss": 0.5913145542144775,
- "mean_token_accuracy": 0.8248485922813416,
- "num_tokens": 1301074.0,
- "step": 136
- },
- {
- "epoch": 0.10410334346504559,
- "grad_norm": 3.5947680473327637,
- "learning_rate": 3.4343434343434347e-06,
- "loss": 0.5028599500656128,
- "mean_token_accuracy": 0.8367215394973755,
- "num_tokens": 1305219.0,
- "step": 137
- },
- {
- "epoch": 0.10486322188449848,
- "grad_norm": 2.5778582096099854,
- "learning_rate": 3.45959595959596e-06,
- "loss": 0.5297672748565674,
- "mean_token_accuracy": 0.8232187032699585,
- "num_tokens": 1312482.0,
- "step": 138
- },
- {
- "epoch": 0.10562310030395136,
- "grad_norm": 1.8961588144302368,
- "learning_rate": 3.4848484848484854e-06,
- "loss": 0.39954107999801636,
- "mean_token_accuracy": 0.8605833053588867,
- "num_tokens": 1323404.0,
- "step": 139
- },
- {
- "epoch": 0.10638297872340426,
- "grad_norm": 1.9687960147857666,
- "learning_rate": 3.51010101010101e-06,
- "loss": 0.48791587352752686,
- "mean_token_accuracy": 0.8200347423553467,
- "num_tokens": 1333027.0,
- "step": 140
- },
- {
- "epoch": 0.10714285714285714,
- "grad_norm": 2.520242691040039,
- "learning_rate": 3.5353535353535356e-06,
- "loss": 0.6106002330780029,
- "mean_token_accuracy": 0.790692150592804,
- "num_tokens": 1340999.0,
- "step": 141
- },
- {
- "epoch": 0.10790273556231003,
- "grad_norm": 3.751617431640625,
- "learning_rate": 3.560606060606061e-06,
- "loss": 0.48141729831695557,
- "mean_token_accuracy": 0.8421382904052734,
- "num_tokens": 1344687.0,
- "step": 142
- },
- {
- "epoch": 0.10866261398176291,
- "grad_norm": 2.7101709842681885,
- "learning_rate": 3.585858585858586e-06,
- "loss": 0.5375241637229919,
- "mean_token_accuracy": 0.8061438202857971,
- "num_tokens": 1350192.0,
- "step": 143
- },
- {
- "epoch": 0.1094224924012158,
- "grad_norm": 2.583484411239624,
- "learning_rate": 3.6111111111111115e-06,
- "loss": 0.6492470502853394,
- "mean_token_accuracy": 0.7863001823425293,
- "num_tokens": 1358148.0,
- "step": 144
- },
- {
- "epoch": 0.11018237082066869,
- "grad_norm": 1.792561650276184,
- "learning_rate": 3.6363636363636366e-06,
- "loss": 0.48480600118637085,
- "mean_token_accuracy": 0.8358709812164307,
- "num_tokens": 1369519.0,
- "step": 145
- },
- {
- "epoch": 0.11094224924012158,
- "grad_norm": 2.6480472087860107,
- "learning_rate": 3.661616161616162e-06,
- "loss": 0.5268933176994324,
- "mean_token_accuracy": 0.8214013576507568,
- "num_tokens": 1375862.0,
- "step": 146
- },
- {
- "epoch": 0.11170212765957446,
- "grad_norm": 2.3174469470977783,
- "learning_rate": 3.686868686868687e-06,
- "loss": 0.42517897486686707,
- "mean_token_accuracy": 0.8523461222648621,
- "num_tokens": 1381546.0,
- "step": 147
- },
- {
- "epoch": 0.11246200607902736,
- "grad_norm": 3.0090949535369873,
- "learning_rate": 3.7121212121212124e-06,
- "loss": 0.4042336940765381,
- "mean_token_accuracy": 0.8670448064804077,
- "num_tokens": 1385896.0,
- "step": 148
- },
- {
- "epoch": 0.11322188449848024,
- "grad_norm": 2.4928104877471924,
- "learning_rate": 3.737373737373738e-06,
- "loss": 0.6498878598213196,
- "mean_token_accuracy": 0.7967068552970886,
- "num_tokens": 1394169.0,
- "step": 149
- },
- {
- "epoch": 0.11398176291793313,
- "grad_norm": 1.5984913110733032,
- "learning_rate": 3.7626262626262627e-06,
- "loss": 0.546096920967102,
- "mean_token_accuracy": 0.8035850524902344,
- "num_tokens": 1408785.0,
- "step": 150
- },
- {
- "epoch": 0.11474164133738601,
- "grad_norm": 2.3663532733917236,
- "learning_rate": 3.7878787878787882e-06,
- "loss": 0.6111721992492676,
- "mean_token_accuracy": 0.8015355467796326,
- "num_tokens": 1417510.0,
- "step": 151
- },
- {
- "epoch": 0.11550151975683891,
- "grad_norm": 2.518932819366455,
- "learning_rate": 3.8131313131313138e-06,
- "loss": 0.5274964570999146,
- "mean_token_accuracy": 0.8155480623245239,
- "num_tokens": 1424186.0,
- "step": 152
- },
- {
- "epoch": 0.11626139817629179,
- "grad_norm": 2.14353609085083,
- "learning_rate": 3.8383838383838385e-06,
- "loss": 0.5283297896385193,
- "mean_token_accuracy": 0.8275758028030396,
- "num_tokens": 1432630.0,
- "step": 153
- },
- {
- "epoch": 0.11702127659574468,
- "grad_norm": 1.8243604898452759,
- "learning_rate": 3.863636363636364e-06,
- "loss": 0.41854870319366455,
- "mean_token_accuracy": 0.8222295045852661,
- "num_tokens": 1442691.0,
- "step": 154
- },
- {
- "epoch": 0.11778115501519756,
- "grad_norm": 2.088212251663208,
- "learning_rate": 3.88888888888889e-06,
- "loss": 0.6062943339347839,
- "mean_token_accuracy": 0.8009427785873413,
- "num_tokens": 1456890.0,
- "step": 155
- },
- {
- "epoch": 0.11854103343465046,
- "grad_norm": 1.3469511270523071,
- "learning_rate": 3.914141414141415e-06,
- "loss": 0.4390433728694916,
- "mean_token_accuracy": 0.8436295986175537,
- "num_tokens": 1475349.0,
- "step": 156
- },
- {
- "epoch": 0.11930091185410334,
- "grad_norm": 3.247023105621338,
- "learning_rate": 3.93939393939394e-06,
- "loss": 0.6490433216094971,
- "mean_token_accuracy": 0.8037861585617065,
- "num_tokens": 1479952.0,
- "step": 157
- },
- {
- "epoch": 0.12006079027355623,
- "grad_norm": 2.6610445976257324,
- "learning_rate": 3.964646464646465e-06,
- "loss": 0.6221826076507568,
- "mean_token_accuracy": 0.7848749160766602,
- "num_tokens": 1487306.0,
- "step": 158
- },
- {
- "epoch": 0.12082066869300911,
- "grad_norm": 2.3060810565948486,
- "learning_rate": 3.98989898989899e-06,
- "loss": 0.5052388310432434,
- "mean_token_accuracy": 0.8281195759773254,
- "num_tokens": 1495367.0,
- "step": 159
- },
- {
- "epoch": 0.12158054711246201,
- "grad_norm": 2.504448652267456,
- "learning_rate": 4.015151515151515e-06,
- "loss": 0.5005477666854858,
- "mean_token_accuracy": 0.8408058881759644,
- "num_tokens": 1502069.0,
- "step": 160
- },
- {
- "epoch": 0.12234042553191489,
- "grad_norm": 3.993938446044922,
- "learning_rate": 4.04040404040404e-06,
- "loss": 0.5569638013839722,
- "mean_token_accuracy": 0.8095242977142334,
- "num_tokens": 1510224.0,
- "step": 161
- },
- {
- "epoch": 0.12310030395136778,
- "grad_norm": 2.2287683486938477,
- "learning_rate": 4.065656565656566e-06,
- "loss": 0.524042546749115,
- "mean_token_accuracy": 0.8102203607559204,
- "num_tokens": 1518364.0,
- "step": 162
- },
- {
- "epoch": 0.12386018237082067,
- "grad_norm": 1.9531738758087158,
- "learning_rate": 4.0909090909090915e-06,
- "loss": 0.45794573426246643,
- "mean_token_accuracy": 0.8560376167297363,
- "num_tokens": 1528097.0,
- "step": 163
- },
- {
- "epoch": 0.12462006079027356,
- "grad_norm": 1.5841206312179565,
- "learning_rate": 4.116161616161617e-06,
- "loss": 0.5420972108840942,
- "mean_token_accuracy": 0.8092726469039917,
- "num_tokens": 1544119.0,
- "step": 164
- },
- {
- "epoch": 0.12537993920972645,
- "grad_norm": 1.7536218166351318,
- "learning_rate": 4.141414141414142e-06,
- "loss": 0.554668664932251,
- "mean_token_accuracy": 0.8193825483322144,
- "num_tokens": 1559140.0,
- "step": 165
- },
- {
- "epoch": 0.12613981762917933,
- "grad_norm": 3.545454740524292,
- "learning_rate": 4.166666666666667e-06,
- "loss": 0.580947995185852,
- "mean_token_accuracy": 0.8286383152008057,
- "num_tokens": 1563625.0,
- "step": 166
- },
- {
- "epoch": 0.12689969604863222,
- "grad_norm": 1.6608915328979492,
- "learning_rate": 4.191919191919192e-06,
- "loss": 0.5523324012756348,
- "mean_token_accuracy": 0.8155215978622437,
- "num_tokens": 1574945.0,
- "step": 167
- },
- {
- "epoch": 0.1276595744680851,
- "grad_norm": 1.4832708835601807,
- "learning_rate": 4.217171717171717e-06,
- "loss": 0.5133191347122192,
- "mean_token_accuracy": 0.8367571830749512,
- "num_tokens": 1595865.0,
- "step": 168
- },
- {
- "epoch": 0.128419452887538,
- "grad_norm": 1.7807520627975464,
- "learning_rate": 4.242424242424243e-06,
- "loss": 0.5131410360336304,
- "mean_token_accuracy": 0.8129367232322693,
- "num_tokens": 1608723.0,
- "step": 169
- },
- {
- "epoch": 0.12917933130699089,
- "grad_norm": 2.707569122314453,
- "learning_rate": 4.267676767676767e-06,
- "loss": 0.6129013299942017,
- "mean_token_accuracy": 0.7926048040390015,
- "num_tokens": 1616136.0,
- "step": 170
- },
- {
- "epoch": 0.12993920972644377,
- "grad_norm": 2.5831644535064697,
- "learning_rate": 4.292929292929293e-06,
- "loss": 0.6264227628707886,
- "mean_token_accuracy": 0.8074911236763,
- "num_tokens": 1624228.0,
- "step": 171
- },
- {
- "epoch": 0.13069908814589665,
- "grad_norm": 3.1124250888824463,
- "learning_rate": 4.3181818181818185e-06,
- "loss": 0.41763827204704285,
- "mean_token_accuracy": 0.8565453290939331,
- "num_tokens": 1628098.0,
- "step": 172
- },
- {
- "epoch": 0.13145896656534956,
- "grad_norm": 2.3214211463928223,
- "learning_rate": 4.343434343434344e-06,
- "loss": 0.421974778175354,
- "mean_token_accuracy": 0.8391546010971069,
- "num_tokens": 1634950.0,
- "step": 173
- },
- {
- "epoch": 0.13221884498480244,
- "grad_norm": 2.1010327339172363,
- "learning_rate": 4.368686868686869e-06,
- "loss": 0.5307331681251526,
- "mean_token_accuracy": 0.8139588236808777,
- "num_tokens": 1644132.0,
- "step": 174
- },
- {
- "epoch": 0.13297872340425532,
- "grad_norm": 2.533612012863159,
- "learning_rate": 4.393939393939394e-06,
- "loss": 0.5626664161682129,
- "mean_token_accuracy": 0.8029808402061462,
- "num_tokens": 1651637.0,
- "step": 175
- },
- {
- "epoch": 0.1337386018237082,
- "grad_norm": 1.669508457183838,
- "learning_rate": 4.41919191919192e-06,
- "loss": 0.5351508259773254,
- "mean_token_accuracy": 0.8281655311584473,
- "num_tokens": 1666776.0,
- "step": 176
- },
- {
- "epoch": 0.1344984802431611,
- "grad_norm": 1.7579659223556519,
- "learning_rate": 4.444444444444444e-06,
- "loss": 0.5235031247138977,
- "mean_token_accuracy": 0.8143284320831299,
- "num_tokens": 1679241.0,
- "step": 177
- },
- {
- "epoch": 0.135258358662614,
- "grad_norm": 3.123563528060913,
- "learning_rate": 4.46969696969697e-06,
- "loss": 0.43051332235336304,
- "mean_token_accuracy": 0.8518186211585999,
- "num_tokens": 1683317.0,
- "step": 178
- },
- {
- "epoch": 0.13601823708206687,
- "grad_norm": 2.2411575317382812,
- "learning_rate": 4.494949494949495e-06,
- "loss": 0.5471380949020386,
- "mean_token_accuracy": 0.8267596960067749,
- "num_tokens": 1691366.0,
- "step": 179
- },
- {
- "epoch": 0.13677811550151975,
- "grad_norm": 2.621973991394043,
- "learning_rate": 4.520202020202021e-06,
- "loss": 0.5685839653015137,
- "mean_token_accuracy": 0.8260642290115356,
- "num_tokens": 1698148.0,
- "step": 180
- },
- {
- "epoch": 0.13753799392097266,
- "grad_norm": 2.1553852558135986,
- "learning_rate": 4.5454545454545455e-06,
- "loss": 0.5703883171081543,
- "mean_token_accuracy": 0.8219090700149536,
- "num_tokens": 1707225.0,
- "step": 181
- },
- {
- "epoch": 0.13829787234042554,
- "grad_norm": 5.1767897605896,
- "learning_rate": 4.5707070707070715e-06,
- "loss": 0.32704639434814453,
- "mean_token_accuracy": 0.8754568099975586,
- "num_tokens": 1712748.0,
- "step": 182
- },
- {
- "epoch": 0.13905775075987842,
- "grad_norm": 2.609168291091919,
- "learning_rate": 4.595959595959597e-06,
- "loss": 0.5939987301826477,
- "mean_token_accuracy": 0.8034975528717041,
- "num_tokens": 1719932.0,
- "step": 183
- },
- {
- "epoch": 0.1398176291793313,
- "grad_norm": 2.2059099674224854,
- "learning_rate": 4.621212121212122e-06,
- "loss": 0.5310720205307007,
- "mean_token_accuracy": 0.8177368640899658,
- "num_tokens": 1727640.0,
- "step": 184
- },
- {
- "epoch": 0.1405775075987842,
- "grad_norm": 2.6367759704589844,
- "learning_rate": 4.646464646464647e-06,
- "loss": 0.522086501121521,
- "mean_token_accuracy": 0.826233983039856,
- "num_tokens": 1733609.0,
- "step": 185
- },
- {
- "epoch": 0.1413373860182371,
- "grad_norm": 3.326732873916626,
- "learning_rate": 4.671717171717172e-06,
- "loss": 0.4127829074859619,
- "mean_token_accuracy": 0.8551101684570312,
- "num_tokens": 1737256.0,
- "step": 186
- },
- {
- "epoch": 0.14209726443768997,
- "grad_norm": 1.828412413597107,
- "learning_rate": 4.696969696969698e-06,
- "loss": 0.5444269180297852,
- "mean_token_accuracy": 0.8350818157196045,
- "num_tokens": 1750196.0,
- "step": 187
- },
- {
- "epoch": 0.14285714285714285,
- "grad_norm": 3.209203004837036,
- "learning_rate": 4.722222222222222e-06,
- "loss": 0.5087994933128357,
- "mean_token_accuracy": 0.8349015712738037,
- "num_tokens": 1754836.0,
- "step": 188
- },
- {
- "epoch": 0.14361702127659576,
- "grad_norm": 1.7339166402816772,
- "learning_rate": 4.747474747474748e-06,
- "loss": 0.5151352286338806,
- "mean_token_accuracy": 0.8321266174316406,
- "num_tokens": 1766015.0,
- "step": 189
- },
- {
- "epoch": 0.14437689969604864,
- "grad_norm": 2.699068069458008,
- "learning_rate": 4.772727272727273e-06,
- "loss": 0.4406203031539917,
- "mean_token_accuracy": 0.8425000905990601,
- "num_tokens": 1771684.0,
- "step": 190
- },
- {
- "epoch": 0.14513677811550152,
- "grad_norm": 2.8117282390594482,
- "learning_rate": 4.7979797979797985e-06,
- "loss": 0.40428489446640015,
- "mean_token_accuracy": 0.8654326796531677,
- "num_tokens": 1776301.0,
- "step": 191
- },
- {
- "epoch": 0.1458966565349544,
- "grad_norm": 2.9204647541046143,
- "learning_rate": 4.823232323232324e-06,
- "loss": 0.4191770553588867,
- "mean_token_accuracy": 0.8574687242507935,
- "num_tokens": 1781678.0,
- "step": 192
- },
- {
- "epoch": 0.1466565349544073,
- "grad_norm": 2.1648988723754883,
- "learning_rate": 4.848484848484849e-06,
- "loss": 0.5839012861251831,
- "mean_token_accuracy": 0.8053664565086365,
- "num_tokens": 1792516.0,
- "step": 193
- },
- {
- "epoch": 0.1474164133738602,
- "grad_norm": 2.3221631050109863,
- "learning_rate": 4.873737373737374e-06,
- "loss": 0.5037894248962402,
- "mean_token_accuracy": 0.8427227139472961,
- "num_tokens": 1800192.0,
- "step": 194
- },
- {
- "epoch": 0.14817629179331307,
- "grad_norm": 2.4536430835723877,
- "learning_rate": 4.898989898989899e-06,
- "loss": 0.42326074838638306,
- "mean_token_accuracy": 0.8510633111000061,
- "num_tokens": 1806159.0,
- "step": 195
- },
- {
- "epoch": 0.14893617021276595,
- "grad_norm": 2.4875805377960205,
- "learning_rate": 4.924242424242425e-06,
- "loss": 0.539531409740448,
- "mean_token_accuracy": 0.8060250282287598,
- "num_tokens": 1813392.0,
- "step": 196
- },
- {
- "epoch": 0.14969604863221886,
- "grad_norm": 2.1664798259735107,
- "learning_rate": 4.94949494949495e-06,
- "loss": 0.42502015829086304,
- "mean_token_accuracy": 0.8503251075744629,
- "num_tokens": 1821424.0,
- "step": 197
- },
- {
- "epoch": 0.15045592705167174,
- "grad_norm": 2.568808078765869,
- "learning_rate": 4.974747474747475e-06,
- "loss": 0.5025098323822021,
- "mean_token_accuracy": 0.8182311058044434,
- "num_tokens": 1827225.0,
- "step": 198
- },
- {
- "epoch": 0.15121580547112462,
- "grad_norm": 1.9116802215576172,
- "learning_rate": 5e-06,
- "loss": 0.4907258450984955,
- "mean_token_accuracy": 0.8310189843177795,
- "num_tokens": 1836297.0,
- "step": 199
- },
- {
- "epoch": 0.1519756838905775,
- "grad_norm": 3.150765895843506,
- "learning_rate": 4.999999122701883e-06,
- "loss": 0.390616774559021,
- "mean_token_accuracy": 0.8626647591590881,
- "num_tokens": 1839984.0,
- "step": 200
- },
- {
- "epoch": 0.15273556231003038,
- "grad_norm": 3.2229044437408447,
- "learning_rate": 4.999996490808146e-06,
- "loss": 0.48009657859802246,
- "mean_token_accuracy": 0.825214147567749,
- "num_tokens": 1844610.0,
- "step": 201
- },
- {
- "epoch": 0.1534954407294833,
- "grad_norm": 1.4473289251327515,
- "learning_rate": 4.9999921043206356e-06,
- "loss": 0.40135183930397034,
- "mean_token_accuracy": 0.8537827730178833,
- "num_tokens": 1859573.0,
- "step": 202
- },
- {
- "epoch": 0.15425531914893617,
- "grad_norm": 4.072319507598877,
- "learning_rate": 4.999985963242432e-06,
- "loss": 0.6158689260482788,
- "mean_token_accuracy": 0.8075432777404785,
- "num_tokens": 1863147.0,
- "step": 203
- },
- {
- "epoch": 0.15501519756838905,
- "grad_norm": 3.15741229057312,
- "learning_rate": 4.999978067577844e-06,
- "loss": 0.4603108763694763,
- "mean_token_accuracy": 0.8418779373168945,
- "num_tokens": 1867201.0,
- "step": 204
- },
- {
- "epoch": 0.15577507598784193,
- "grad_norm": 2.1925418376922607,
- "learning_rate": 4.999968417332415e-06,
- "loss": 0.5552488565444946,
- "mean_token_accuracy": 0.8216016292572021,
- "num_tokens": 1874837.0,
- "step": 205
- },
- {
- "epoch": 0.15653495440729484,
- "grad_norm": 2.2518117427825928,
- "learning_rate": 4.999957012512916e-06,
- "loss": 0.4912569522857666,
- "mean_token_accuracy": 0.8284667730331421,
- "num_tokens": 1881842.0,
- "step": 206
- },
- {
- "epoch": 0.15729483282674772,
- "grad_norm": 1.8223762512207031,
- "learning_rate": 4.999943853127351e-06,
- "loss": 0.47709137201309204,
- "mean_token_accuracy": 0.8311659097671509,
- "num_tokens": 1890805.0,
- "step": 207
- },
- {
- "epoch": 0.1580547112462006,
- "grad_norm": 2.066499948501587,
- "learning_rate": 4.999928939184958e-06,
- "loss": 0.44794657826423645,
- "mean_token_accuracy": 0.8513424396514893,
- "num_tokens": 1898264.0,
- "step": 208
- },
- {
- "epoch": 0.15881458966565348,
- "grad_norm": 3.53865909576416,
- "learning_rate": 4.999912270696202e-06,
- "loss": 0.5978270769119263,
- "mean_token_accuracy": 0.8080137968063354,
- "num_tokens": 1902435.0,
- "step": 209
- },
- {
- "epoch": 0.1595744680851064,
- "grad_norm": 2.0760679244995117,
- "learning_rate": 4.999893847672783e-06,
- "loss": 0.5930601358413696,
- "mean_token_accuracy": 0.8028650283813477,
- "num_tokens": 1912252.0,
- "step": 210
- },
- {
- "epoch": 0.16033434650455927,
- "grad_norm": 2.21551513671875,
- "learning_rate": 4.99987367012763e-06,
- "loss": 0.6336753964424133,
- "mean_token_accuracy": 0.7902286648750305,
- "num_tokens": 1922095.0,
- "step": 211
- },
- {
- "epoch": 0.16109422492401215,
- "grad_norm": 1.7654480934143066,
- "learning_rate": 4.999851738074904e-06,
- "loss": 0.6373403668403625,
- "mean_token_accuracy": 0.7802424430847168,
- "num_tokens": 1938962.0,
- "step": 212
- },
- {
- "epoch": 0.16185410334346503,
- "grad_norm": 2.852834701538086,
- "learning_rate": 4.9998280515300006e-06,
- "loss": 0.6418683528900146,
- "mean_token_accuracy": 0.7895716428756714,
- "num_tokens": 1944668.0,
- "step": 213
- },
- {
- "epoch": 0.16261398176291794,
- "grad_norm": 3.4737212657928467,
- "learning_rate": 4.999802610509541e-06,
- "loss": 0.6323273181915283,
- "mean_token_accuracy": 0.7982614636421204,
- "num_tokens": 1949142.0,
- "step": 214
- },
- {
- "epoch": 0.16337386018237082,
- "grad_norm": 3.0802664756774902,
- "learning_rate": 4.999775415031381e-06,
- "loss": 0.5929068326950073,
- "mean_token_accuracy": 0.8112219572067261,
- "num_tokens": 1954141.0,
- "step": 215
- },
- {
- "epoch": 0.1641337386018237,
- "grad_norm": 2.9808855056762695,
- "learning_rate": 4.999746465114609e-06,
- "loss": 0.5556406378746033,
- "mean_token_accuracy": 0.8117628693580627,
- "num_tokens": 1959406.0,
- "step": 216
- },
- {
- "epoch": 0.16489361702127658,
- "grad_norm": 1.7346166372299194,
- "learning_rate": 4.999715760779541e-06,
- "loss": 0.5122925043106079,
- "mean_token_accuracy": 0.8040724992752075,
- "num_tokens": 1971921.0,
- "step": 217
- },
- {
- "epoch": 0.1656534954407295,
- "grad_norm": 1.4183907508850098,
- "learning_rate": 4.999683302047729e-06,
- "loss": 0.46471893787384033,
- "mean_token_accuracy": 0.8381330966949463,
- "num_tokens": 1988863.0,
- "step": 218
- },
- {
- "epoch": 0.16641337386018237,
- "grad_norm": 1.6797802448272705,
- "learning_rate": 4.999649088941951e-06,
- "loss": 0.38348832726478577,
- "mean_token_accuracy": 0.8344278931617737,
- "num_tokens": 2000003.0,
- "step": 219
- },
- {
- "epoch": 0.16717325227963525,
- "grad_norm": 3.036963939666748,
- "learning_rate": 4.999613121486222e-06,
- "loss": 0.6062780618667603,
- "mean_token_accuracy": 0.8217900991439819,
- "num_tokens": 2004813.0,
- "step": 220
- },
- {
- "epoch": 0.16793313069908813,
- "grad_norm": 2.0343217849731445,
- "learning_rate": 4.999575399705782e-06,
- "loss": 0.5052450895309448,
- "mean_token_accuracy": 0.8368623852729797,
- "num_tokens": 2013565.0,
- "step": 221
- },
- {
- "epoch": 0.16869300911854104,
- "grad_norm": 2.1162009239196777,
- "learning_rate": 4.9995359236271094e-06,
- "loss": 0.5169756412506104,
- "mean_token_accuracy": 0.8339958190917969,
- "num_tokens": 2025763.0,
- "step": 222
- },
- {
- "epoch": 0.16945288753799392,
- "grad_norm": 2.055333375930786,
- "learning_rate": 4.9994946932779076e-06,
- "loss": 0.6327048540115356,
- "mean_token_accuracy": 0.8078711032867432,
- "num_tokens": 2037005.0,
- "step": 223
- },
- {
- "epoch": 0.1702127659574468,
- "grad_norm": 3.334620475769043,
- "learning_rate": 4.999451708687114e-06,
- "loss": 0.5688358545303345,
- "mean_token_accuracy": 0.8015589714050293,
- "num_tokens": 2041473.0,
- "step": 224
- },
- {
- "epoch": 0.17097264437689969,
- "grad_norm": 2.3734676837921143,
- "learning_rate": 4.999406969884897e-06,
- "loss": 0.5673821568489075,
- "mean_token_accuracy": 0.8054057359695435,
- "num_tokens": 2049397.0,
- "step": 225
- },
- {
- "epoch": 0.1717325227963526,
- "grad_norm": 1.807358980178833,
- "learning_rate": 4.999360476902656e-06,
- "loss": 0.4376158118247986,
- "mean_token_accuracy": 0.8456039428710938,
- "num_tokens": 2058721.0,
- "step": 226
- },
- {
- "epoch": 0.17249240121580547,
- "grad_norm": 3.231638193130493,
- "learning_rate": 4.999312229773022e-06,
- "loss": 0.5592809915542603,
- "mean_token_accuracy": 0.8170154094696045,
- "num_tokens": 2063455.0,
- "step": 227
- },
- {
- "epoch": 0.17325227963525835,
- "grad_norm": 2.2717151641845703,
- "learning_rate": 4.999262228529855e-06,
- "loss": 0.6144396066665649,
- "mean_token_accuracy": 0.7948470115661621,
- "num_tokens": 2071686.0,
- "step": 228
- },
- {
- "epoch": 0.17401215805471124,
- "grad_norm": 1.4171342849731445,
- "learning_rate": 4.99921047320825e-06,
- "loss": 0.43680912256240845,
- "mean_token_accuracy": 0.84850013256073,
- "num_tokens": 2086999.0,
- "step": 229
- },
- {
- "epoch": 0.17477203647416414,
- "grad_norm": 3.162736654281616,
- "learning_rate": 4.99915696384453e-06,
- "loss": 0.6025407910346985,
- "mean_token_accuracy": 0.8042335510253906,
- "num_tokens": 2092001.0,
- "step": 230
- },
- {
- "epoch": 0.17553191489361702,
- "grad_norm": 1.8672804832458496,
- "learning_rate": 4.99910170047625e-06,
- "loss": 0.5843087434768677,
- "mean_token_accuracy": 0.8016980886459351,
- "num_tokens": 2103372.0,
- "step": 231
- },
- {
- "epoch": 0.1762917933130699,
- "grad_norm": 2.967587471008301,
- "learning_rate": 4.999044683142196e-06,
- "loss": 0.5123642086982727,
- "mean_token_accuracy": 0.8216149806976318,
- "num_tokens": 2108008.0,
- "step": 232
- },
- {
- "epoch": 0.1770516717325228,
- "grad_norm": 1.9651981592178345,
- "learning_rate": 4.998985911882383e-06,
- "loss": 0.5868178606033325,
- "mean_token_accuracy": 0.7904198169708252,
- "num_tokens": 2119009.0,
- "step": 233
- },
- {
- "epoch": 0.1778115501519757,
- "grad_norm": 2.7785449028015137,
- "learning_rate": 4.998925386738063e-06,
- "loss": 0.5075510144233704,
- "mean_token_accuracy": 0.8280210494995117,
- "num_tokens": 2124915.0,
- "step": 234
- },
- {
- "epoch": 0.17857142857142858,
- "grad_norm": 2.957470417022705,
- "learning_rate": 4.998863107751711e-06,
- "loss": 0.5351958274841309,
- "mean_token_accuracy": 0.846825122833252,
- "num_tokens": 2129905.0,
- "step": 235
- },
- {
- "epoch": 0.17933130699088146,
- "grad_norm": 3.207671880722046,
- "learning_rate": 4.99879907496704e-06,
- "loss": 0.6209091544151306,
- "mean_token_accuracy": 0.789960503578186,
- "num_tokens": 2135027.0,
- "step": 236
- },
- {
- "epoch": 0.18009118541033434,
- "grad_norm": 2.018953800201416,
- "learning_rate": 4.998733288428987e-06,
- "loss": 0.601510763168335,
- "mean_token_accuracy": 0.8136930465698242,
- "num_tokens": 2147016.0,
- "step": 237
- },
- {
- "epoch": 0.18085106382978725,
- "grad_norm": 2.437281847000122,
- "learning_rate": 4.998665748183727e-06,
- "loss": 0.5813639163970947,
- "mean_token_accuracy": 0.8116716146469116,
- "num_tokens": 2155386.0,
- "step": 238
- },
- {
- "epoch": 0.18161094224924013,
- "grad_norm": 1.5708180665969849,
- "learning_rate": 4.998596454278661e-06,
- "loss": 0.5252395272254944,
- "mean_token_accuracy": 0.8193864822387695,
- "num_tokens": 2170295.0,
- "step": 239
- },
- {
- "epoch": 0.182370820668693,
- "grad_norm": 1.9921495914459229,
- "learning_rate": 4.998525406762422e-06,
- "loss": 0.5335029363632202,
- "mean_token_accuracy": 0.8120872974395752,
- "num_tokens": 2180012.0,
- "step": 240
- },
- {
- "epoch": 0.1831306990881459,
- "grad_norm": 2.6562681198120117,
- "learning_rate": 4.998452605684874e-06,
- "loss": 0.48021435737609863,
- "mean_token_accuracy": 0.8388714790344238,
- "num_tokens": 2185607.0,
- "step": 241
- },
- {
- "epoch": 0.1838905775075988,
- "grad_norm": 2.2535853385925293,
- "learning_rate": 4.998378051097111e-06,
- "loss": 0.5747300386428833,
- "mean_token_accuracy": 0.8004639148712158,
- "num_tokens": 2194105.0,
- "step": 242
- },
- {
- "epoch": 0.18465045592705168,
- "grad_norm": 1.6151788234710693,
- "learning_rate": 4.998301743051459e-06,
- "loss": 0.6190565824508667,
- "mean_token_accuracy": 0.7816627621650696,
- "num_tokens": 2210629.0,
- "step": 243
- },
- {
- "epoch": 0.18541033434650456,
- "grad_norm": 2.1088173389434814,
- "learning_rate": 4.9982236816014735e-06,
- "loss": 0.4715560972690582,
- "mean_token_accuracy": 0.8485721349716187,
- "num_tokens": 2218958.0,
- "step": 244
- },
- {
- "epoch": 0.18617021276595744,
- "grad_norm": 2.6168735027313232,
- "learning_rate": 4.998143866801941e-06,
- "loss": 0.6077103018760681,
- "mean_token_accuracy": 0.8057924509048462,
- "num_tokens": 2226368.0,
- "step": 245
- },
- {
- "epoch": 0.18693009118541035,
- "grad_norm": 2.5988616943359375,
- "learning_rate": 4.99806229870888e-06,
- "loss": 0.5021637678146362,
- "mean_token_accuracy": 0.8361666202545166,
- "num_tokens": 2232485.0,
- "step": 246
- },
- {
- "epoch": 0.18768996960486323,
- "grad_norm": 2.015887498855591,
- "learning_rate": 4.9979789773795365e-06,
- "loss": 0.4309737980365753,
- "mean_token_accuracy": 0.8508044481277466,
- "num_tokens": 2240819.0,
- "step": 247
- },
- {
- "epoch": 0.1884498480243161,
- "grad_norm": 2.3115265369415283,
- "learning_rate": 4.997893902872389e-06,
- "loss": 0.5776500701904297,
- "mean_token_accuracy": 0.8079549074172974,
- "num_tokens": 2249460.0,
- "step": 248
- },
- {
- "epoch": 0.189209726443769,
- "grad_norm": 1.7387021780014038,
- "learning_rate": 4.997807075247147e-06,
- "loss": 0.430944561958313,
- "mean_token_accuracy": 0.8483544588088989,
- "num_tokens": 2259124.0,
- "step": 249
- },
- {
- "epoch": 0.1899696048632219,
- "grad_norm": 1.6378381252288818,
- "learning_rate": 4.997718494564747e-06,
- "loss": 0.4123363792896271,
- "mean_token_accuracy": 0.8557409644126892,
- "num_tokens": 2269899.0,
- "step": 250
- },
- {
- "epoch": 0.19072948328267478,
- "grad_norm": 1.336282730102539,
- "learning_rate": 4.997628160887361e-06,
- "loss": 0.502329409122467,
- "mean_token_accuracy": 0.8186938166618347,
- "num_tokens": 2292821.0,
- "step": 251
- },
- {
- "epoch": 0.19148936170212766,
- "grad_norm": 3.3335583209991455,
- "learning_rate": 4.997536074278388e-06,
- "loss": 0.584446907043457,
- "mean_token_accuracy": 0.8062717318534851,
- "num_tokens": 2297175.0,
- "step": 252
- },
- {
- "epoch": 0.19224924012158054,
- "grad_norm": 2.246727228164673,
- "learning_rate": 4.9974422348024565e-06,
- "loss": 0.5683060884475708,
- "mean_token_accuracy": 0.8193703293800354,
- "num_tokens": 2305456.0,
- "step": 253
- },
- {
- "epoch": 0.19300911854103345,
- "grad_norm": 2.3520865440368652,
- "learning_rate": 4.997346642525429e-06,
- "loss": 0.4724946618080139,
- "mean_token_accuracy": 0.8426719307899475,
- "num_tokens": 2312241.0,
- "step": 254
- },
- {
- "epoch": 0.19376899696048633,
- "grad_norm": 2.7115702629089355,
- "learning_rate": 4.9972492975143936e-06,
- "loss": 0.5019032955169678,
- "mean_token_accuracy": 0.8253573179244995,
- "num_tokens": 2318094.0,
- "step": 255
- },
- {
- "epoch": 0.1945288753799392,
- "grad_norm": 1.705528974533081,
- "learning_rate": 4.997150199837671e-06,
- "loss": 0.45588475465774536,
- "mean_token_accuracy": 0.836666464805603,
- "num_tokens": 2329025.0,
- "step": 256
- },
- {
- "epoch": 0.1952887537993921,
- "grad_norm": 2.161400318145752,
- "learning_rate": 4.997049349564814e-06,
- "loss": 0.5170183777809143,
- "mean_token_accuracy": 0.8287534117698669,
- "num_tokens": 2337448.0,
- "step": 257
- },
- {
- "epoch": 0.196048632218845,
- "grad_norm": 2.629669189453125,
- "learning_rate": 4.996946746766602e-06,
- "loss": 0.44650501012802124,
- "mean_token_accuracy": 0.850114107131958,
- "num_tokens": 2343207.0,
- "step": 258
- },
- {
- "epoch": 0.19680851063829788,
- "grad_norm": 1.6735503673553467,
- "learning_rate": 4.996842391515045e-06,
- "loss": 0.5247820019721985,
- "mean_token_accuracy": 0.8285071849822998,
- "num_tokens": 2356801.0,
- "step": 259
- },
- {
- "epoch": 0.19756838905775076,
- "grad_norm": 1.2753115892410278,
- "learning_rate": 4.996736283883382e-06,
- "loss": 0.41870927810668945,
- "mean_token_accuracy": 0.8448047637939453,
- "num_tokens": 2377306.0,
- "step": 260
- },
- {
- "epoch": 0.19832826747720364,
- "grad_norm": 2.6947314739227295,
- "learning_rate": 4.9966284239460875e-06,
- "loss": 0.5059205889701843,
- "mean_token_accuracy": 0.8430814743041992,
- "num_tokens": 2383352.0,
- "step": 261
- },
- {
- "epoch": 0.19908814589665655,
- "grad_norm": 2.0509963035583496,
- "learning_rate": 4.996518811778858e-06,
- "loss": 0.4565388560295105,
- "mean_token_accuracy": 0.8453130722045898,
- "num_tokens": 2391149.0,
- "step": 262
- },
- {
- "epoch": 0.19984802431610943,
- "grad_norm": 2.1856348514556885,
- "learning_rate": 4.996407447458626e-06,
- "loss": 0.531380832195282,
- "mean_token_accuracy": 0.8387004137039185,
- "num_tokens": 2399875.0,
- "step": 263
- },
- {
- "epoch": 0.2006079027355623,
- "grad_norm": 2.7348573207855225,
- "learning_rate": 4.99629433106355e-06,
- "loss": 0.5242817401885986,
- "mean_token_accuracy": 0.8177423477172852,
- "num_tokens": 2406586.0,
- "step": 264
- },
- {
- "epoch": 0.2013677811550152,
- "grad_norm": 1.76587975025177,
- "learning_rate": 4.99617946267302e-06,
- "loss": 0.49298471212387085,
- "mean_token_accuracy": 0.8271149396896362,
- "num_tokens": 2418683.0,
- "step": 265
- },
- {
- "epoch": 0.20212765957446807,
- "grad_norm": 2.8129730224609375,
- "learning_rate": 4.996062842367655e-06,
- "loss": 0.46420302987098694,
- "mean_token_accuracy": 0.8453244566917419,
- "num_tokens": 2422929.0,
- "step": 266
- },
- {
- "epoch": 0.20288753799392098,
- "grad_norm": 2.575744152069092,
- "learning_rate": 4.9959444702293025e-06,
- "loss": 0.43208545446395874,
- "mean_token_accuracy": 0.8494843244552612,
- "num_tokens": 2429567.0,
- "step": 267
- },
- {
- "epoch": 0.20364741641337386,
- "grad_norm": 2.7586750984191895,
- "learning_rate": 4.995824346341041e-06,
- "loss": 0.4390473961830139,
- "mean_token_accuracy": 0.8348895311355591,
- "num_tokens": 2434700.0,
- "step": 268
- },
- {
- "epoch": 0.20440729483282674,
- "grad_norm": 1.972145438194275,
- "learning_rate": 4.99570247078718e-06,
- "loss": 0.6219544410705566,
- "mean_token_accuracy": 0.7939999103546143,
- "num_tokens": 2447007.0,
- "step": 269
- },
- {
- "epoch": 0.20516717325227962,
- "grad_norm": 2.2963485717773438,
- "learning_rate": 4.995578843653255e-06,
- "loss": 0.5008970499038696,
- "mean_token_accuracy": 0.8255308866500854,
- "num_tokens": 2453936.0,
- "step": 270
- },
- {
- "epoch": 0.20592705167173253,
- "grad_norm": 1.8897721767425537,
- "learning_rate": 4.995453465026033e-06,
- "loss": 0.5436089038848877,
- "mean_token_accuracy": 0.819086492061615,
- "num_tokens": 2464494.0,
- "step": 271
- },
- {
- "epoch": 0.2066869300911854,
- "grad_norm": 2.319728374481201,
- "learning_rate": 4.995326334993508e-06,
- "loss": 0.5136368870735168,
- "mean_token_accuracy": 0.820817232131958,
- "num_tokens": 2470938.0,
- "step": 272
- },
- {
- "epoch": 0.2074468085106383,
- "grad_norm": 2.230414390563965,
- "learning_rate": 4.9951974536449055e-06,
- "loss": 0.5272846817970276,
- "mean_token_accuracy": 0.8203279972076416,
- "num_tokens": 2478629.0,
- "step": 273
- },
- {
- "epoch": 0.20820668693009117,
- "grad_norm": 3.401937484741211,
- "learning_rate": 4.9950668210706795e-06,
- "loss": 0.4389592111110687,
- "mean_token_accuracy": 0.8647899031639099,
- "num_tokens": 2482193.0,
- "step": 274
- },
- {
- "epoch": 0.20896656534954408,
- "grad_norm": 2.1278507709503174,
- "learning_rate": 4.994934437362513e-06,
- "loss": 0.598863422870636,
- "mean_token_accuracy": 0.7945119738578796,
- "num_tokens": 2492465.0,
- "step": 275
- },
- {
- "epoch": 0.20972644376899696,
- "grad_norm": 1.9259960651397705,
- "learning_rate": 4.994800302613318e-06,
- "loss": 0.49520939588546753,
- "mean_token_accuracy": 0.8371536135673523,
- "num_tokens": 2500825.0,
- "step": 276
- },
- {
- "epoch": 0.21048632218844984,
- "grad_norm": 2.346418857574463,
- "learning_rate": 4.994664416917236e-06,
- "loss": 0.5412614345550537,
- "mean_token_accuracy": 0.810661792755127,
- "num_tokens": 2509513.0,
- "step": 277
- },
- {
- "epoch": 0.21124620060790272,
- "grad_norm": 1.3092039823532104,
- "learning_rate": 4.994526780369636e-06,
- "loss": 0.46305379271507263,
- "mean_token_accuracy": 0.8358527421951294,
- "num_tokens": 2531405.0,
- "step": 278
- },
- {
- "epoch": 0.21200607902735563,
- "grad_norm": 2.924611806869507,
- "learning_rate": 4.9943873930671175e-06,
- "loss": 0.6134544610977173,
- "mean_token_accuracy": 0.7947378754615784,
- "num_tokens": 2536744.0,
- "step": 279
- },
- {
- "epoch": 0.2127659574468085,
- "grad_norm": 2.8290598392486572,
- "learning_rate": 4.994246255107506e-06,
- "loss": 0.465520441532135,
- "mean_token_accuracy": 0.8440108299255371,
- "num_tokens": 2541184.0,
- "step": 280
- },
- {
- "epoch": 0.2135258358662614,
- "grad_norm": 3.8081259727478027,
- "learning_rate": 4.994103366589859e-06,
- "loss": 0.43394139409065247,
- "mean_token_accuracy": 0.8579148054122925,
- "num_tokens": 2545395.0,
- "step": 281
- },
- {
- "epoch": 0.21428571428571427,
- "grad_norm": 1.7994529008865356,
- "learning_rate": 4.993958727614462e-06,
- "loss": 0.5076484680175781,
- "mean_token_accuracy": 0.8270803093910217,
- "num_tokens": 2556541.0,
- "step": 282
- },
- {
- "epoch": 0.21504559270516718,
- "grad_norm": 2.5582659244537354,
- "learning_rate": 4.993812338282826e-06,
- "loss": 0.4453684389591217,
- "mean_token_accuracy": 0.8488293886184692,
- "num_tokens": 2562949.0,
- "step": 283
- },
- {
- "epoch": 0.21580547112462006,
- "grad_norm": 1.6448938846588135,
- "learning_rate": 4.993664198697694e-06,
- "loss": 0.461971640586853,
- "mean_token_accuracy": 0.824763298034668,
- "num_tokens": 2576407.0,
- "step": 284
- },
- {
- "epoch": 0.21656534954407294,
- "grad_norm": 2.1264469623565674,
- "learning_rate": 4.993514308963037e-06,
- "loss": 0.6241602897644043,
- "mean_token_accuracy": 0.7916014790534973,
- "num_tokens": 2585695.0,
- "step": 285
- },
- {
- "epoch": 0.21732522796352582,
- "grad_norm": 3.629991292953491,
- "learning_rate": 4.993362669184051e-06,
- "loss": 0.610355019569397,
- "mean_token_accuracy": 0.7847568988800049,
- "num_tokens": 2589778.0,
- "step": 286
- },
- {
- "epoch": 0.21808510638297873,
- "grad_norm": 1.9070756435394287,
- "learning_rate": 4.993209279467164e-06,
- "loss": 0.5513623952865601,
- "mean_token_accuracy": 0.7911607027053833,
- "num_tokens": 2600920.0,
- "step": 287
- },
- {
- "epoch": 0.2188449848024316,
- "grad_norm": 1.761062741279602,
- "learning_rate": 4.993054139920031e-06,
- "loss": 0.4579957127571106,
- "mean_token_accuracy": 0.8189530372619629,
- "num_tokens": 2611856.0,
- "step": 288
- },
- {
- "epoch": 0.2196048632218845,
- "grad_norm": 1.7264713048934937,
- "learning_rate": 4.992897250651535e-06,
- "loss": 0.5871305465698242,
- "mean_token_accuracy": 0.7918527126312256,
- "num_tokens": 2624730.0,
- "step": 289
- },
- {
- "epoch": 0.22036474164133737,
- "grad_norm": 1.7455977201461792,
- "learning_rate": 4.992738611771787e-06,
- "loss": 0.5475119948387146,
- "mean_token_accuracy": 0.8226917386054993,
- "num_tokens": 2635705.0,
- "step": 290
- },
- {
- "epoch": 0.22112462006079028,
- "grad_norm": 2.095095157623291,
- "learning_rate": 4.992578223392124e-06,
- "loss": 0.5952225923538208,
- "mean_token_accuracy": 0.8078469038009644,
- "num_tokens": 2643954.0,
- "step": 291
- },
- {
- "epoch": 0.22188449848024316,
- "grad_norm": 2.994664192199707,
- "learning_rate": 4.992416085625115e-06,
- "loss": 0.5432442426681519,
- "mean_token_accuracy": 0.8329008221626282,
- "num_tokens": 2648800.0,
- "step": 292
- },
- {
- "epoch": 0.22264437689969604,
- "grad_norm": 2.796790361404419,
- "learning_rate": 4.992252198584554e-06,
- "loss": 0.5168961882591248,
- "mean_token_accuracy": 0.8393474817276001,
- "num_tokens": 2653546.0,
- "step": 293
- },
- {
- "epoch": 0.22340425531914893,
- "grad_norm": 1.8610522747039795,
- "learning_rate": 4.992086562385462e-06,
- "loss": 0.5728024244308472,
- "mean_token_accuracy": 0.797406792640686,
- "num_tokens": 2667483.0,
- "step": 294
- },
- {
- "epoch": 0.22416413373860183,
- "grad_norm": 1.695472002029419,
- "learning_rate": 4.9919191771440905e-06,
- "loss": 0.5460028648376465,
- "mean_token_accuracy": 0.8123016357421875,
- "num_tokens": 2683574.0,
- "step": 295
- },
- {
- "epoch": 0.22492401215805471,
- "grad_norm": 2.8627376556396484,
- "learning_rate": 4.9917500429779165e-06,
- "loss": 0.5566985011100769,
- "mean_token_accuracy": 0.815531313419342,
- "num_tokens": 2688985.0,
- "step": 296
- },
- {
- "epoch": 0.2256838905775076,
- "grad_norm": 2.73323655128479,
- "learning_rate": 4.991579160005644e-06,
- "loss": 0.48197102546691895,
- "mean_token_accuracy": 0.8471829295158386,
- "num_tokens": 2694799.0,
- "step": 297
- },
- {
- "epoch": 0.22644376899696048,
- "grad_norm": 1.8436161279678345,
- "learning_rate": 4.991406528347206e-06,
- "loss": 0.4528339207172394,
- "mean_token_accuracy": 0.8603188395500183,
- "num_tokens": 2707321.0,
- "step": 298
- },
- {
- "epoch": 0.22720364741641338,
- "grad_norm": 2.6231515407562256,
- "learning_rate": 4.9912321481237616e-06,
- "loss": 0.5916541814804077,
- "mean_token_accuracy": 0.8050242066383362,
- "num_tokens": 2714233.0,
- "step": 299
- },
- {
- "epoch": 0.22796352583586627,
- "grad_norm": 3.08776593208313,
- "learning_rate": 4.991056019457697e-06,
- "loss": 0.4860580563545227,
- "mean_token_accuracy": 0.8464088439941406,
- "num_tokens": 2718443.0,
- "step": 300
- },
- {
- "epoch": 0.22872340425531915,
- "grad_norm": 2.2537803649902344,
- "learning_rate": 4.990878142472628e-06,
- "loss": 0.5158311128616333,
- "mean_token_accuracy": 0.824694812297821,
- "num_tokens": 2726158.0,
- "step": 301
- },
- {
- "epoch": 0.22948328267477203,
- "grad_norm": 2.1122705936431885,
- "learning_rate": 4.990698517293394e-06,
- "loss": 0.495265394449234,
- "mean_token_accuracy": 0.8343238830566406,
- "num_tokens": 2735022.0,
- "step": 302
- },
- {
- "epoch": 0.23024316109422494,
- "grad_norm": 3.5503528118133545,
- "learning_rate": 4.9905171440460645e-06,
- "loss": 0.46063232421875,
- "mean_token_accuracy": 0.8420047760009766,
- "num_tokens": 2738550.0,
- "step": 303
- },
- {
- "epoch": 0.23100303951367782,
- "grad_norm": 3.9858486652374268,
- "learning_rate": 4.990334022857932e-06,
- "loss": 0.5832710266113281,
- "mean_token_accuracy": 0.8144199848175049,
- "num_tokens": 2741720.0,
- "step": 304
- },
- {
- "epoch": 0.2317629179331307,
- "grad_norm": 2.407231330871582,
- "learning_rate": 4.990149153857519e-06,
- "loss": 0.4692630171775818,
- "mean_token_accuracy": 0.8429223299026489,
- "num_tokens": 2748693.0,
- "step": 305
- },
- {
- "epoch": 0.23252279635258358,
- "grad_norm": 1.6996397972106934,
- "learning_rate": 4.989962537174573e-06,
- "loss": 0.49143946170806885,
- "mean_token_accuracy": 0.8340128064155579,
- "num_tokens": 2761254.0,
- "step": 306
- },
- {
- "epoch": 0.23328267477203649,
- "grad_norm": 3.746432065963745,
- "learning_rate": 4.989774172940071e-06,
- "loss": 0.6282026767730713,
- "mean_token_accuracy": 0.775698184967041,
- "num_tokens": 2765115.0,
- "step": 307
- },
- {
- "epoch": 0.23404255319148937,
- "grad_norm": 2.212872266769409,
- "learning_rate": 4.989584061286211e-06,
- "loss": 0.5193763971328735,
- "mean_token_accuracy": 0.8168246746063232,
- "num_tokens": 2772345.0,
- "step": 308
- },
- {
- "epoch": 0.23480243161094225,
- "grad_norm": 1.752297282218933,
- "learning_rate": 4.989392202346423e-06,
- "loss": 0.4437984824180603,
- "mean_token_accuracy": 0.8451256155967712,
- "num_tokens": 2783072.0,
- "step": 309
- },
- {
- "epoch": 0.23556231003039513,
- "grad_norm": 2.386019706726074,
- "learning_rate": 4.989198596255361e-06,
- "loss": 0.4090752899646759,
- "mean_token_accuracy": 0.8480085134506226,
- "num_tokens": 2788757.0,
- "step": 310
- },
- {
- "epoch": 0.23632218844984804,
- "grad_norm": 3.9981489181518555,
- "learning_rate": 4.989003243148904e-06,
- "loss": 0.5149132013320923,
- "mean_token_accuracy": 0.8179056644439697,
- "num_tokens": 2792096.0,
- "step": 311
- },
- {
- "epoch": 0.23708206686930092,
- "grad_norm": 1.8723100423812866,
- "learning_rate": 4.988806143164159e-06,
- "loss": 0.4531487822532654,
- "mean_token_accuracy": 0.8400167226791382,
- "num_tokens": 2802210.0,
- "step": 312
- },
- {
- "epoch": 0.2378419452887538,
- "grad_norm": 2.3415136337280273,
- "learning_rate": 4.988607296439459e-06,
- "loss": 0.5974439978599548,
- "mean_token_accuracy": 0.8035976886749268,
- "num_tokens": 2810088.0,
- "step": 313
- },
- {
- "epoch": 0.23860182370820668,
- "grad_norm": 1.5317577123641968,
- "learning_rate": 4.98840670311436e-06,
- "loss": 0.49247145652770996,
- "mean_token_accuracy": 0.8292540311813354,
- "num_tokens": 2824005.0,
- "step": 314
- },
- {
- "epoch": 0.2393617021276596,
- "grad_norm": 2.170772075653076,
- "learning_rate": 4.988204363329648e-06,
- "loss": 0.6359974145889282,
- "mean_token_accuracy": 0.7785564661026001,
- "num_tokens": 2834680.0,
- "step": 315
- },
- {
- "epoch": 0.24012158054711247,
- "grad_norm": 3.2655932903289795,
- "learning_rate": 4.988000277227334e-06,
- "loss": 0.5080196857452393,
- "mean_token_accuracy": 0.8295877575874329,
- "num_tokens": 2838735.0,
- "step": 316
- },
- {
- "epoch": 0.24088145896656535,
- "grad_norm": 3.406589984893799,
- "learning_rate": 4.987794444950651e-06,
- "loss": 0.3939085006713867,
- "mean_token_accuracy": 0.8700719475746155,
- "num_tokens": 2842127.0,
- "step": 317
- },
- {
- "epoch": 0.24164133738601823,
- "grad_norm": 1.8211106061935425,
- "learning_rate": 4.987586866644061e-06,
- "loss": 0.5270540118217468,
- "mean_token_accuracy": 0.826683521270752,
- "num_tokens": 2853656.0,
- "step": 318
- },
- {
- "epoch": 0.24240121580547114,
- "grad_norm": 1.8429969549179077,
- "learning_rate": 4.9873775424532515e-06,
- "loss": 0.4705049991607666,
- "mean_token_accuracy": 0.8355701565742493,
- "num_tokens": 2863513.0,
- "step": 319
- },
- {
- "epoch": 0.24316109422492402,
- "grad_norm": 2.2425320148468018,
- "learning_rate": 4.9871664725251314e-06,
- "loss": 0.485736608505249,
- "mean_token_accuracy": 0.835182785987854,
- "num_tokens": 2871556.0,
- "step": 320
- },
- {
- "epoch": 0.2439209726443769,
- "grad_norm": 1.6202056407928467,
- "learning_rate": 4.986953657007841e-06,
- "loss": 0.4437887370586395,
- "mean_token_accuracy": 0.8282591700553894,
- "num_tokens": 2884335.0,
- "step": 321
- },
- {
- "epoch": 0.24468085106382978,
- "grad_norm": 1.1027268171310425,
- "learning_rate": 4.98673909605074e-06,
- "loss": 0.3770800828933716,
- "mean_token_accuracy": 0.8325437307357788,
- "num_tokens": 2904286.0,
- "step": 322
- },
- {
- "epoch": 0.2454407294832827,
- "grad_norm": 2.3239076137542725,
- "learning_rate": 4.986522789804417e-06,
- "loss": 0.5387254953384399,
- "mean_token_accuracy": 0.806242823600769,
- "num_tokens": 2910975.0,
- "step": 323
- },
- {
- "epoch": 0.24620060790273557,
- "grad_norm": 2.243482828140259,
- "learning_rate": 4.986304738420684e-06,
- "loss": 0.4396553039550781,
- "mean_token_accuracy": 0.8561904430389404,
- "num_tokens": 2917087.0,
- "step": 324
- },
- {
- "epoch": 0.24696048632218845,
- "grad_norm": 2.537264347076416,
- "learning_rate": 4.986084942052577e-06,
- "loss": 0.395110160112381,
- "mean_token_accuracy": 0.8636915683746338,
- "num_tokens": 2921887.0,
- "step": 325
- },
- {
- "epoch": 0.24772036474164133,
- "grad_norm": 2.319399118423462,
- "learning_rate": 4.9858634008543574e-06,
- "loss": 0.581517219543457,
- "mean_token_accuracy": 0.8157487511634827,
- "num_tokens": 2928996.0,
- "step": 326
- },
- {
- "epoch": 0.24848024316109424,
- "grad_norm": 1.9787474870681763,
- "learning_rate": 4.985640114981513e-06,
- "loss": 0.5084106922149658,
- "mean_token_accuracy": 0.835221529006958,
- "num_tokens": 2940302.0,
- "step": 327
- },
- {
- "epoch": 0.24924012158054712,
- "grad_norm": 2.4783265590667725,
- "learning_rate": 4.985415084590752e-06,
- "loss": 0.6062222719192505,
- "mean_token_accuracy": 0.7885516285896301,
- "num_tokens": 2946386.0,
- "step": 328
- },
- {
- "epoch": 0.25,
- "grad_norm": 2.4081411361694336,
- "learning_rate": 4.985188309840012e-06,
- "loss": 0.5079880356788635,
- "mean_token_accuracy": 0.8313904404640198,
- "num_tokens": 2952323.0,
- "step": 329
- },
- {
- "epoch": 0.2507598784194529,
- "grad_norm": 2.64993953704834,
- "learning_rate": 4.984959790888451e-06,
- "loss": 0.5461447834968567,
- "mean_token_accuracy": 0.8125468492507935,
- "num_tokens": 2958119.0,
- "step": 330
- },
- {
- "epoch": 0.25151975683890576,
- "grad_norm": 2.549734115600586,
- "learning_rate": 4.984729527896451e-06,
- "loss": 0.5998573303222656,
- "mean_token_accuracy": 0.8076666593551636,
- "num_tokens": 2964947.0,
- "step": 331
- },
- {
- "epoch": 0.25227963525835867,
- "grad_norm": 3.2185161113739014,
- "learning_rate": 4.984497521025622e-06,
- "loss": 0.4232945442199707,
- "mean_token_accuracy": 0.8543803095817566,
- "num_tokens": 2968598.0,
- "step": 332
- },
- {
- "epoch": 0.2530395136778115,
- "grad_norm": 2.588994264602661,
- "learning_rate": 4.984263770438793e-06,
- "loss": 0.460967481136322,
- "mean_token_accuracy": 0.8416207432746887,
- "num_tokens": 2974510.0,
- "step": 333
- },
- {
- "epoch": 0.25379939209726443,
- "grad_norm": 2.1373162269592285,
- "learning_rate": 4.984028276300021e-06,
- "loss": 0.49382102489471436,
- "mean_token_accuracy": 0.8388048410415649,
- "num_tokens": 2981632.0,
- "step": 334
- },
- {
- "epoch": 0.25455927051671734,
- "grad_norm": 2.2524826526641846,
- "learning_rate": 4.983791038774585e-06,
- "loss": 0.4947671890258789,
- "mean_token_accuracy": 0.8066365122795105,
- "num_tokens": 2988736.0,
- "step": 335
- },
- {
- "epoch": 0.2553191489361702,
- "grad_norm": 1.7244199514389038,
- "learning_rate": 4.983552058028985e-06,
- "loss": 0.48096776008605957,
- "mean_token_accuracy": 0.830735445022583,
- "num_tokens": 3003576.0,
- "step": 336
- },
- {
- "epoch": 0.2560790273556231,
- "grad_norm": 3.0628933906555176,
- "learning_rate": 4.9833113342309495e-06,
- "loss": 0.6027032136917114,
- "mean_token_accuracy": 0.8008694648742676,
- "num_tokens": 3009549.0,
- "step": 337
- },
- {
- "epoch": 0.256838905775076,
- "grad_norm": 2.438674211502075,
- "learning_rate": 4.983068867549427e-06,
- "loss": 0.517090916633606,
- "mean_token_accuracy": 0.827893853187561,
- "num_tokens": 3015236.0,
- "step": 338
- },
- {
- "epoch": 0.25759878419452886,
- "grad_norm": 2.131535053253174,
- "learning_rate": 4.982824658154589e-06,
- "loss": 0.6656812429428101,
- "mean_token_accuracy": 0.7772425413131714,
- "num_tokens": 3028142.0,
- "step": 339
- },
- {
- "epoch": 0.25835866261398177,
- "grad_norm": 2.3206584453582764,
- "learning_rate": 4.9825787062178315e-06,
- "loss": 0.5757625699043274,
- "mean_token_accuracy": 0.8073873519897461,
- "num_tokens": 3040996.0,
- "step": 340
- },
- {
- "epoch": 0.2591185410334346,
- "grad_norm": 1.3905521631240845,
- "learning_rate": 4.982331011911774e-06,
- "loss": 0.4193805456161499,
- "mean_token_accuracy": 0.8399466872215271,
- "num_tokens": 3061931.0,
- "step": 341
- },
- {
- "epoch": 0.25987841945288753,
- "grad_norm": 2.184173345565796,
- "learning_rate": 4.982081575410256e-06,
- "loss": 0.4751223921775818,
- "mean_token_accuracy": 0.8409271240234375,
- "num_tokens": 3069081.0,
- "step": 342
- },
- {
- "epoch": 0.26063829787234044,
- "grad_norm": 3.538764238357544,
- "learning_rate": 4.9818303968883445e-06,
- "loss": 0.8119601011276245,
- "mean_token_accuracy": 0.7442739009857178,
- "num_tokens": 3073628.0,
- "step": 343
- },
- {
- "epoch": 0.2613981762917933,
- "grad_norm": 1.8063762187957764,
- "learning_rate": 4.981577476522323e-06,
- "loss": 0.5615730881690979,
- "mean_token_accuracy": 0.8207751512527466,
- "num_tokens": 3086596.0,
- "step": 344
- },
- {
- "epoch": 0.2621580547112462,
- "grad_norm": 2.4346961975097656,
- "learning_rate": 4.981322814489703e-06,
- "loss": 0.5266709327697754,
- "mean_token_accuracy": 0.8211277723312378,
- "num_tokens": 3092631.0,
- "step": 345
- },
- {
- "epoch": 0.2629179331306991,
- "grad_norm": 1.91289484500885,
- "learning_rate": 4.981066410969215e-06,
- "loss": 0.5047177672386169,
- "mean_token_accuracy": 0.8356877565383911,
- "num_tokens": 3101102.0,
- "step": 346
- },
- {
- "epoch": 0.26367781155015196,
- "grad_norm": 2.1495707035064697,
- "learning_rate": 4.980808266140813e-06,
- "loss": 0.47876280546188354,
- "mean_token_accuracy": 0.8364313244819641,
- "num_tokens": 3107998.0,
- "step": 347
- },
- {
- "epoch": 0.26443768996960487,
- "grad_norm": 2.5961992740631104,
- "learning_rate": 4.9805483801856744e-06,
- "loss": 0.5512958765029907,
- "mean_token_accuracy": 0.8181467652320862,
- "num_tokens": 3113848.0,
- "step": 348
- },
- {
- "epoch": 0.2651975683890577,
- "grad_norm": 3.2828900814056396,
- "learning_rate": 4.980286753286196e-06,
- "loss": 0.4217945635318756,
- "mean_token_accuracy": 0.8617103099822998,
- "num_tokens": 3117652.0,
- "step": 349
- },
- {
- "epoch": 0.26595744680851063,
- "grad_norm": 1.425554871559143,
- "learning_rate": 4.980023385625996e-06,
- "loss": 0.4042487144470215,
- "mean_token_accuracy": 0.8492785692214966,
- "num_tokens": 3132336.0,
- "step": 350
- },
- {
- "epoch": 0.26671732522796354,
- "grad_norm": 2.933504104614258,
- "learning_rate": 4.979758277389919e-06,
- "loss": 0.5406704545021057,
- "mean_token_accuracy": 0.8035423755645752,
- "num_tokens": 3137544.0,
- "step": 351
- },
- {
- "epoch": 0.2674772036474164,
- "grad_norm": 1.9958966970443726,
- "learning_rate": 4.9794914287640264e-06,
- "loss": 0.5857555270195007,
- "mean_token_accuracy": 0.7965140342712402,
- "num_tokens": 3149705.0,
- "step": 352
- },
- {
- "epoch": 0.2682370820668693,
- "grad_norm": 2.467694044113159,
- "learning_rate": 4.979222839935602e-06,
- "loss": 0.6404043436050415,
- "mean_token_accuracy": 0.7823755741119385,
- "num_tokens": 3158353.0,
- "step": 353
- },
- {
- "epoch": 0.2689969604863222,
- "grad_norm": 2.0102720260620117,
- "learning_rate": 4.9789525110931545e-06,
- "loss": 0.5681496858596802,
- "mean_token_accuracy": 0.8108169436454773,
- "num_tokens": 3167121.0,
- "step": 354
- },
- {
- "epoch": 0.26975683890577506,
- "grad_norm": 2.6017866134643555,
- "learning_rate": 4.978680442426409e-06,
- "loss": 0.6309828162193298,
- "mean_token_accuracy": 0.7742617130279541,
- "num_tokens": 3175012.0,
- "step": 355
- },
- {
- "epoch": 0.270516717325228,
- "grad_norm": 1.8799268007278442,
- "learning_rate": 4.978406634126315e-06,
- "loss": 0.524029016494751,
- "mean_token_accuracy": 0.8317689895629883,
- "num_tokens": 3185331.0,
- "step": 356
- },
- {
- "epoch": 0.2712765957446808,
- "grad_norm": 1.508332371711731,
- "learning_rate": 4.978131086385041e-06,
- "loss": 0.46656402945518494,
- "mean_token_accuracy": 0.8339117765426636,
- "num_tokens": 3198813.0,
- "step": 357
- },
- {
- "epoch": 0.27203647416413373,
- "grad_norm": 3.595707654953003,
- "learning_rate": 4.977853799395976e-06,
- "loss": 0.5101234912872314,
- "mean_token_accuracy": 0.8251723051071167,
- "num_tokens": 3206557.0,
- "step": 358
- },
- {
- "epoch": 0.27279635258358664,
- "grad_norm": 3.5317916870117188,
- "learning_rate": 4.977574773353732e-06,
- "loss": 0.5684665441513062,
- "mean_token_accuracy": 0.8124493360519409,
- "num_tokens": 3210912.0,
- "step": 359
- },
- {
- "epoch": 0.2735562310030395,
- "grad_norm": 2.8606204986572266,
- "learning_rate": 4.97729400845414e-06,
- "loss": 0.4746384620666504,
- "mean_token_accuracy": 0.8195606470108032,
- "num_tokens": 3215365.0,
- "step": 360
- },
- {
- "epoch": 0.2743161094224924,
- "grad_norm": 1.8214033842086792,
- "learning_rate": 4.977011504894253e-06,
- "loss": 0.4842769503593445,
- "mean_token_accuracy": 0.82928866147995,
- "num_tokens": 3224037.0,
- "step": 361
- },
- {
- "epoch": 0.2750759878419453,
- "grad_norm": 1.628746509552002,
- "learning_rate": 4.97672726287234e-06,
- "loss": 0.4397493302822113,
- "mean_token_accuracy": 0.8606528043746948,
- "num_tokens": 3235589.0,
- "step": 362
- },
- {
- "epoch": 0.27583586626139817,
- "grad_norm": 3.557973861694336,
- "learning_rate": 4.976441282587894e-06,
- "loss": 0.5732032060623169,
- "mean_token_accuracy": 0.8041545748710632,
- "num_tokens": 3239958.0,
- "step": 363
- },
- {
- "epoch": 0.2765957446808511,
- "grad_norm": 1.3467901945114136,
- "learning_rate": 4.9761535642416284e-06,
- "loss": 0.4525323510169983,
- "mean_token_accuracy": 0.8281061053276062,
- "num_tokens": 3257703.0,
- "step": 364
- },
- {
- "epoch": 0.2773556231003039,
- "grad_norm": 2.2649986743927,
- "learning_rate": 4.9758641080354745e-06,
- "loss": 0.5074734687805176,
- "mean_token_accuracy": 0.8447474241256714,
- "num_tokens": 3264334.0,
- "step": 365
- },
- {
- "epoch": 0.27811550151975684,
- "grad_norm": 2.8667566776275635,
- "learning_rate": 4.975572914172581e-06,
- "loss": 0.5759559869766235,
- "mean_token_accuracy": 0.7976793050765991,
- "num_tokens": 3269314.0,
- "step": 366
- },
- {
- "epoch": 0.27887537993920974,
- "grad_norm": 2.2514986991882324,
- "learning_rate": 4.975279982857324e-06,
- "loss": 0.5786465406417847,
- "mean_token_accuracy": 0.8058781623840332,
- "num_tokens": 3277324.0,
- "step": 367
- },
- {
- "epoch": 0.2796352583586626,
- "grad_norm": 1.3826723098754883,
- "learning_rate": 4.97498531429529e-06,
- "loss": 0.40801727771759033,
- "mean_token_accuracy": 0.8601310849189758,
- "num_tokens": 3290530.0,
- "step": 368
- },
- {
- "epoch": 0.2803951367781155,
- "grad_norm": 2.084092617034912,
- "learning_rate": 4.97468890869329e-06,
- "loss": 0.47076648473739624,
- "mean_token_accuracy": 0.8310186862945557,
- "num_tokens": 3298325.0,
- "step": 369
- },
- {
- "epoch": 0.2811550151975684,
- "grad_norm": 1.3467998504638672,
- "learning_rate": 4.974390766259353e-06,
- "loss": 0.44668465852737427,
- "mean_token_accuracy": 0.8275353908538818,
- "num_tokens": 3314302.0,
- "step": 370
- },
- {
- "epoch": 0.28191489361702127,
- "grad_norm": 2.5921075344085693,
- "learning_rate": 4.974090887202726e-06,
- "loss": 0.5343953967094421,
- "mean_token_accuracy": 0.8110706806182861,
- "num_tokens": 3320963.0,
- "step": 371
- },
- {
- "epoch": 0.2826747720364742,
- "grad_norm": 2.042781352996826,
- "learning_rate": 4.973789271733877e-06,
- "loss": 0.6293343305587769,
- "mean_token_accuracy": 0.7800243496894836,
- "num_tokens": 3332742.0,
- "step": 372
- },
- {
- "epoch": 0.28343465045592703,
- "grad_norm": 4.822193145751953,
- "learning_rate": 4.973485920064491e-06,
- "loss": 0.6256728768348694,
- "mean_token_accuracy": 0.7962433099746704,
- "num_tokens": 3335872.0,
- "step": 373
- },
- {
- "epoch": 0.28419452887537994,
- "grad_norm": 1.260988473892212,
- "learning_rate": 4.973180832407471e-06,
- "loss": 0.38731223344802856,
- "mean_token_accuracy": 0.8385066986083984,
- "num_tokens": 3351884.0,
- "step": 374
- },
- {
- "epoch": 0.28495440729483285,
- "grad_norm": 2.669966697692871,
- "learning_rate": 4.97287400897694e-06,
- "loss": 0.5594710111618042,
- "mean_token_accuracy": 0.8097212314605713,
- "num_tokens": 3358197.0,
- "step": 375
- },
- {
- "epoch": 0.2857142857142857,
- "grad_norm": 3.0344486236572266,
- "learning_rate": 4.972565449988238e-06,
- "loss": 0.34449583292007446,
- "mean_token_accuracy": 0.8813316822052002,
- "num_tokens": 3362133.0,
- "step": 376
- },
- {
- "epoch": 0.2864741641337386,
- "grad_norm": 2.562251091003418,
- "learning_rate": 4.972255155657925e-06,
- "loss": 0.5331522822380066,
- "mean_token_accuracy": 0.8212941288948059,
- "num_tokens": 3370346.0,
- "step": 377
- },
- {
- "epoch": 0.2872340425531915,
- "grad_norm": 2.7083740234375,
- "learning_rate": 4.9719431262037755e-06,
- "loss": 0.5403046011924744,
- "mean_token_accuracy": 0.8108335733413696,
- "num_tokens": 3375588.0,
- "step": 378
- },
- {
- "epoch": 0.28799392097264437,
- "grad_norm": 1.396430492401123,
- "learning_rate": 4.971629361844785e-06,
- "loss": 0.4041529893875122,
- "mean_token_accuracy": 0.8588063716888428,
- "num_tokens": 3390749.0,
- "step": 379
- },
- {
- "epoch": 0.2887537993920973,
- "grad_norm": 1.9872784614562988,
- "learning_rate": 4.971313862801166e-06,
- "loss": 0.4336993098258972,
- "mean_token_accuracy": 0.8511303663253784,
- "num_tokens": 3399064.0,
- "step": 380
- },
- {
- "epoch": 0.28951367781155013,
- "grad_norm": 1.9652575254440308,
- "learning_rate": 4.9709966292943455e-06,
- "loss": 0.4578358232975006,
- "mean_token_accuracy": 0.8229440450668335,
- "num_tokens": 3407229.0,
- "step": 381
- },
- {
- "epoch": 0.29027355623100304,
- "grad_norm": 1.6626898050308228,
- "learning_rate": 4.970677661546972e-06,
- "loss": 0.5427594184875488,
- "mean_token_accuracy": 0.815427303314209,
- "num_tokens": 3422321.0,
- "step": 382
- },
- {
- "epoch": 0.29103343465045595,
- "grad_norm": 3.5265562534332275,
- "learning_rate": 4.970356959782909e-06,
- "loss": 0.6661460995674133,
- "mean_token_accuracy": 0.7856965065002441,
- "num_tokens": 3427442.0,
- "step": 383
- },
- {
- "epoch": 0.2917933130699088,
- "grad_norm": 1.667205572128296,
- "learning_rate": 4.970034524227239e-06,
- "loss": 0.36256325244903564,
- "mean_token_accuracy": 0.8711205720901489,
- "num_tokens": 3436662.0,
- "step": 384
- },
- {
- "epoch": 0.2925531914893617,
- "grad_norm": 1.3389486074447632,
- "learning_rate": 4.969710355106256e-06,
- "loss": 0.4282698631286621,
- "mean_token_accuracy": 0.838951587677002,
- "num_tokens": 3450060.0,
- "step": 385
- },
- {
- "epoch": 0.2933130699088146,
- "grad_norm": 2.5163397789001465,
- "learning_rate": 4.969384452647477e-06,
- "loss": 0.5176984071731567,
- "mean_token_accuracy": 0.8235267996788025,
- "num_tokens": 3456990.0,
- "step": 386
- },
- {
- "epoch": 0.29407294832826747,
- "grad_norm": 1.7588495016098022,
- "learning_rate": 4.969056817079633e-06,
- "loss": 0.49710947275161743,
- "mean_token_accuracy": 0.818520724773407,
- "num_tokens": 3468098.0,
- "step": 387
- },
- {
- "epoch": 0.2948328267477204,
- "grad_norm": 2.6381046772003174,
- "learning_rate": 4.968727448632669e-06,
- "loss": 0.4425308108329773,
- "mean_token_accuracy": 0.8451643586158752,
- "num_tokens": 3472899.0,
- "step": 388
- },
- {
- "epoch": 0.29559270516717323,
- "grad_norm": 1.6345038414001465,
- "learning_rate": 4.968396347537751e-06,
- "loss": 0.4177059829235077,
- "mean_token_accuracy": 0.8498886227607727,
- "num_tokens": 3484826.0,
- "step": 389
- },
- {
- "epoch": 0.29635258358662614,
- "grad_norm": 3.0466468334198,
- "learning_rate": 4.968063514027258e-06,
- "loss": 0.4274463951587677,
- "mean_token_accuracy": 0.8387278318405151,
- "num_tokens": 3488610.0,
- "step": 390
- },
- {
- "epoch": 0.29711246200607905,
- "grad_norm": 2.6509406566619873,
- "learning_rate": 4.967728948334784e-06,
- "loss": 0.5401753783226013,
- "mean_token_accuracy": 0.8252490162849426,
- "num_tokens": 3493657.0,
- "step": 391
- },
- {
- "epoch": 0.2978723404255319,
- "grad_norm": 1.6372219324111938,
- "learning_rate": 4.967392650695141e-06,
- "loss": 0.3862472176551819,
- "mean_token_accuracy": 0.8555525541305542,
- "num_tokens": 3505588.0,
- "step": 392
- },
- {
- "epoch": 0.2986322188449848,
- "grad_norm": 2.1615452766418457,
- "learning_rate": 4.967054621344356e-06,
- "loss": 0.57850581407547,
- "mean_token_accuracy": 0.8222678899765015,
- "num_tokens": 3514396.0,
- "step": 393
- },
- {
- "epoch": 0.2993920972644377,
- "grad_norm": 1.8610916137695312,
- "learning_rate": 4.96671486051967e-06,
- "loss": 0.5440595149993896,
- "mean_token_accuracy": 0.8196715116500854,
- "num_tokens": 3523604.0,
- "step": 394
- },
- {
- "epoch": 0.30015197568389057,
- "grad_norm": 2.9585862159729004,
- "learning_rate": 4.966373368459542e-06,
- "loss": 0.6921588182449341,
- "mean_token_accuracy": 0.7816659808158875,
- "num_tokens": 3529849.0,
- "step": 395
- },
- {
- "epoch": 0.3009118541033435,
- "grad_norm": 1.9374035596847534,
- "learning_rate": 4.966030145403642e-06,
- "loss": 0.5494055151939392,
- "mean_token_accuracy": 0.8126792907714844,
- "num_tokens": 3539529.0,
- "step": 396
- },
- {
- "epoch": 0.30167173252279633,
- "grad_norm": 1.730530023574829,
- "learning_rate": 4.965685191592859e-06,
- "loss": 0.4271572232246399,
- "mean_token_accuracy": 0.8383668661117554,
- "num_tokens": 3550972.0,
- "step": 397
- },
- {
- "epoch": 0.30243161094224924,
- "grad_norm": 3.9635560512542725,
- "learning_rate": 4.9653385072692935e-06,
- "loss": 0.5576210021972656,
- "mean_token_accuracy": 0.799404501914978,
- "num_tokens": 3554147.0,
- "step": 398
- },
- {
- "epoch": 0.30319148936170215,
- "grad_norm": 2.5731968879699707,
- "learning_rate": 4.964990092676263e-06,
- "loss": 0.5478942394256592,
- "mean_token_accuracy": 0.8220961093902588,
- "num_tokens": 3559972.0,
- "step": 399
- },
- {
- "epoch": 0.303951367781155,
- "grad_norm": 2.2096588611602783,
- "learning_rate": 4.964639948058297e-06,
- "loss": 0.35461270809173584,
- "mean_token_accuracy": 0.8640927076339722,
- "num_tokens": 3565770.0,
- "step": 400
- },
- {
- "epoch": 0.3047112462006079,
- "grad_norm": 1.7874189615249634,
- "learning_rate": 4.964288073661142e-06,
- "loss": 0.38849619030952454,
- "mean_token_accuracy": 0.8443037271499634,
- "num_tokens": 3574514.0,
- "step": 401
- },
- {
- "epoch": 0.30547112462006076,
- "grad_norm": 1.5583146810531616,
- "learning_rate": 4.963934469731756e-06,
- "loss": 0.48909449577331543,
- "mean_token_accuracy": 0.8429768681526184,
- "num_tokens": 3585877.0,
- "step": 402
- },
- {
- "epoch": 0.30623100303951367,
- "grad_norm": 3.026599645614624,
- "learning_rate": 4.963579136518312e-06,
- "loss": 0.5138992071151733,
- "mean_token_accuracy": 0.8283728361129761,
- "num_tokens": 3590412.0,
- "step": 403
- },
- {
- "epoch": 0.3069908814589666,
- "grad_norm": 2.777505874633789,
- "learning_rate": 4.963222074270197e-06,
- "loss": 0.6241534948348999,
- "mean_token_accuracy": 0.8130464553833008,
- "num_tokens": 3596246.0,
- "step": 404
- },
- {
- "epoch": 0.30775075987841943,
- "grad_norm": 2.4772839546203613,
- "learning_rate": 4.962863283238011e-06,
- "loss": 0.5930814146995544,
- "mean_token_accuracy": 0.8036394715309143,
- "num_tokens": 3602878.0,
- "step": 405
- },
- {
- "epoch": 0.30851063829787234,
- "grad_norm": 1.5049982070922852,
- "learning_rate": 4.962502763673566e-06,
- "loss": 0.4903082549571991,
- "mean_token_accuracy": 0.8184912204742432,
- "num_tokens": 3617018.0,
- "step": 406
- },
- {
- "epoch": 0.30927051671732525,
- "grad_norm": 2.453155040740967,
- "learning_rate": 4.96214051582989e-06,
- "loss": 0.5138067603111267,
- "mean_token_accuracy": 0.8336835503578186,
- "num_tokens": 3624188.0,
- "step": 407
- },
- {
- "epoch": 0.3100303951367781,
- "grad_norm": 2.4038336277008057,
- "learning_rate": 4.961776539961222e-06,
- "loss": 0.5752760171890259,
- "mean_token_accuracy": 0.8054730892181396,
- "num_tokens": 3634152.0,
- "step": 408
- },
- {
- "epoch": 0.310790273556231,
- "grad_norm": 2.629068374633789,
- "learning_rate": 4.961410836323014e-06,
- "loss": 0.5580606460571289,
- "mean_token_accuracy": 0.8121089935302734,
- "num_tokens": 3639528.0,
- "step": 409
- },
- {
- "epoch": 0.31155015197568386,
- "grad_norm": 1.4245928525924683,
- "learning_rate": 4.961043405171931e-06,
- "loss": 0.5399882793426514,
- "mean_token_accuracy": 0.812280535697937,
- "num_tokens": 3655744.0,
- "step": 410
- },
- {
- "epoch": 0.3123100303951368,
- "grad_norm": 1.5236459970474243,
- "learning_rate": 4.9606742467658505e-06,
- "loss": 0.5234690308570862,
- "mean_token_accuracy": 0.8188928365707397,
- "num_tokens": 3675010.0,
- "step": 411
- },
- {
- "epoch": 0.3130699088145897,
- "grad_norm": 2.27961802482605,
- "learning_rate": 4.960303361363863e-06,
- "loss": 0.5502505898475647,
- "mean_token_accuracy": 0.8161963224411011,
- "num_tokens": 3682328.0,
- "step": 412
- },
- {
- "epoch": 0.31382978723404253,
- "grad_norm": 1.554518222808838,
- "learning_rate": 4.959930749226269e-06,
- "loss": 0.420867919921875,
- "mean_token_accuracy": 0.8499157428741455,
- "num_tokens": 3694980.0,
- "step": 413
- },
- {
- "epoch": 0.31458966565349544,
- "grad_norm": 2.609218120574951,
- "learning_rate": 4.9595564106145825e-06,
- "loss": 0.4706704318523407,
- "mean_token_accuracy": 0.8412490487098694,
- "num_tokens": 3700033.0,
- "step": 414
- },
- {
- "epoch": 0.31534954407294835,
- "grad_norm": 1.5303231477737427,
- "learning_rate": 4.959180345791528e-06,
- "loss": 0.4668654799461365,
- "mean_token_accuracy": 0.8125015497207642,
- "num_tokens": 3715012.0,
- "step": 415
- },
- {
- "epoch": 0.3161094224924012,
- "grad_norm": 1.2774665355682373,
- "learning_rate": 4.958802555021042e-06,
- "loss": 0.4339369237422943,
- "mean_token_accuracy": 0.8442851901054382,
- "num_tokens": 3733928.0,
- "step": 416
- },
- {
- "epoch": 0.3168693009118541,
- "grad_norm": 2.1240181922912598,
- "learning_rate": 4.958423038568274e-06,
- "loss": 0.4029104709625244,
- "mean_token_accuracy": 0.8627674579620361,
- "num_tokens": 3740202.0,
- "step": 417
- },
- {
- "epoch": 0.31762917933130697,
- "grad_norm": 2.00538969039917,
- "learning_rate": 4.958041796699583e-06,
- "loss": 0.5229607820510864,
- "mean_token_accuracy": 0.8282366394996643,
- "num_tokens": 3749308.0,
- "step": 418
- },
- {
- "epoch": 0.3183890577507599,
- "grad_norm": 2.6555092334747314,
- "learning_rate": 4.957658829682539e-06,
- "loss": 0.5344101190567017,
- "mean_token_accuracy": 0.8183202743530273,
- "num_tokens": 3754595.0,
- "step": 419
- },
- {
- "epoch": 0.3191489361702128,
- "grad_norm": 1.7468839883804321,
- "learning_rate": 4.9572741377859225e-06,
- "loss": 0.5667245984077454,
- "mean_token_accuracy": 0.8080123662948608,
- "num_tokens": 3765761.0,
- "step": 420
- },
- {
- "epoch": 0.31990881458966564,
- "grad_norm": 2.9612457752227783,
- "learning_rate": 4.956887721279726e-06,
- "loss": 0.5389559864997864,
- "mean_token_accuracy": 0.8019476532936096,
- "num_tokens": 3770844.0,
- "step": 421
- },
- {
- "epoch": 0.32066869300911854,
- "grad_norm": 1.842403769493103,
- "learning_rate": 4.95649958043515e-06,
- "loss": 0.38279837369918823,
- "mean_token_accuracy": 0.858866810798645,
- "num_tokens": 3778094.0,
- "step": 422
- },
- {
- "epoch": 0.32142857142857145,
- "grad_norm": 2.3108131885528564,
- "learning_rate": 4.956109715524609e-06,
- "loss": 0.5453893542289734,
- "mean_token_accuracy": 0.8085013031959534,
- "num_tokens": 3785015.0,
- "step": 423
- },
- {
- "epoch": 0.3221884498480243,
- "grad_norm": 3.0326945781707764,
- "learning_rate": 4.9557181268217225e-06,
- "loss": 0.5550523400306702,
- "mean_token_accuracy": 0.8125876188278198,
- "num_tokens": 3789830.0,
- "step": 424
- },
- {
- "epoch": 0.3229483282674772,
- "grad_norm": 1.8851977586746216,
- "learning_rate": 4.955324814601324e-06,
- "loss": 0.4902324974536896,
- "mean_token_accuracy": 0.8205406665802002,
- "num_tokens": 3799862.0,
- "step": 425
- },
- {
- "epoch": 0.32370820668693007,
- "grad_norm": 2.6018171310424805,
- "learning_rate": 4.954929779139455e-06,
- "loss": 0.5920133590698242,
- "mean_token_accuracy": 0.8340690732002258,
- "num_tokens": 3806617.0,
- "step": 426
- },
- {
- "epoch": 0.324468085106383,
- "grad_norm": 2.4283878803253174,
- "learning_rate": 4.954533020713367e-06,
- "loss": 0.5305854082107544,
- "mean_token_accuracy": 0.8137468099594116,
- "num_tokens": 3813843.0,
- "step": 427
- },
- {
- "epoch": 0.3252279635258359,
- "grad_norm": 2.667978525161743,
- "learning_rate": 4.954134539601519e-06,
- "loss": 0.5333638787269592,
- "mean_token_accuracy": 0.8402629494667053,
- "num_tokens": 3819450.0,
- "step": 428
- },
- {
- "epoch": 0.32598784194528874,
- "grad_norm": 1.7302523851394653,
- "learning_rate": 4.953734336083582e-06,
- "loss": 0.422895610332489,
- "mean_token_accuracy": 0.8709704875946045,
- "num_tokens": 3831027.0,
- "step": 429
- },
- {
- "epoch": 0.32674772036474165,
- "grad_norm": 2.427192211151123,
- "learning_rate": 4.953332410440434e-06,
- "loss": 0.6334598064422607,
- "mean_token_accuracy": 0.7817479968070984,
- "num_tokens": 3841776.0,
- "step": 430
- },
- {
- "epoch": 0.32750759878419455,
- "grad_norm": 1.460949182510376,
- "learning_rate": 4.952928762954161e-06,
- "loss": 0.3654777705669403,
- "mean_token_accuracy": 0.8780122995376587,
- "num_tokens": 3852213.0,
- "step": 431
- },
- {
- "epoch": 0.3282674772036474,
- "grad_norm": 1.9855005741119385,
- "learning_rate": 4.952523393908059e-06,
- "loss": 0.5117089748382568,
- "mean_token_accuracy": 0.811911404132843,
- "num_tokens": 3861176.0,
- "step": 432
- },
- {
- "epoch": 0.3290273556231003,
- "grad_norm": 2.2653207778930664,
- "learning_rate": 4.952116303586631e-06,
- "loss": 0.42514950037002563,
- "mean_token_accuracy": 0.8448518514633179,
- "num_tokens": 3867164.0,
- "step": 433
- },
- {
- "epoch": 0.32978723404255317,
- "grad_norm": 1.9780964851379395,
- "learning_rate": 4.951707492275589e-06,
- "loss": 0.5095293521881104,
- "mean_token_accuracy": 0.8262748718261719,
- "num_tokens": 3876406.0,
- "step": 434
- },
- {
- "epoch": 0.3305471124620061,
- "grad_norm": 2.9480233192443848,
- "learning_rate": 4.951296960261853e-06,
- "loss": 0.3494448959827423,
- "mean_token_accuracy": 0.8781307935714722,
- "num_tokens": 3880298.0,
- "step": 435
- },
- {
- "epoch": 0.331306990881459,
- "grad_norm": 2.335571527481079,
- "learning_rate": 4.95088470783355e-06,
- "loss": 0.5456914901733398,
- "mean_token_accuracy": 0.816297173500061,
- "num_tokens": 3886487.0,
- "step": 436
- },
- {
- "epoch": 0.33206686930091184,
- "grad_norm": 2.3046419620513916,
- "learning_rate": 4.950470735280013e-06,
- "loss": 0.4835948944091797,
- "mean_token_accuracy": 0.8539175391197205,
- "num_tokens": 3892706.0,
- "step": 437
- },
- {
- "epoch": 0.33282674772036475,
- "grad_norm": 2.44047474861145,
- "learning_rate": 4.950055042891786e-06,
- "loss": 0.5154092907905579,
- "mean_token_accuracy": 0.8579919338226318,
- "num_tokens": 3899532.0,
- "step": 438
- },
- {
- "epoch": 0.33358662613981765,
- "grad_norm": 4.826764106750488,
- "learning_rate": 4.949637630960618e-06,
- "loss": 0.5270259976387024,
- "mean_token_accuracy": 0.8172192573547363,
- "num_tokens": 3902260.0,
- "step": 439
- },
- {
- "epoch": 0.3343465045592705,
- "grad_norm": 2.001574754714966,
- "learning_rate": 4.949218499779462e-06,
- "loss": 0.5413002967834473,
- "mean_token_accuracy": 0.8162837028503418,
- "num_tokens": 3911706.0,
- "step": 440
- },
- {
- "epoch": 0.3351063829787234,
- "grad_norm": 1.7998944520950317,
- "learning_rate": 4.948797649642484e-06,
- "loss": 0.5131614208221436,
- "mean_token_accuracy": 0.8367440700531006,
- "num_tokens": 3923490.0,
- "step": 441
- },
- {
- "epoch": 0.33586626139817627,
- "grad_norm": 3.4566173553466797,
- "learning_rate": 4.94837508084505e-06,
- "loss": 0.7258909940719604,
- "mean_token_accuracy": 0.771377444267273,
- "num_tokens": 3928099.0,
- "step": 442
- },
- {
- "epoch": 0.3366261398176292,
- "grad_norm": 2.0040442943573,
- "learning_rate": 4.9479507936837364e-06,
- "loss": 0.482135534286499,
- "mean_token_accuracy": 0.8339327573776245,
- "num_tokens": 3937328.0,
- "step": 443
- },
- {
- "epoch": 0.3373860182370821,
- "grad_norm": 2.949502944946289,
- "learning_rate": 4.947524788456325e-06,
- "loss": 0.6474795341491699,
- "mean_token_accuracy": 0.7951677441596985,
- "num_tokens": 3942529.0,
- "step": 444
- },
- {
- "epoch": 0.33814589665653494,
- "grad_norm": 1.5528364181518555,
- "learning_rate": 4.947097065461801e-06,
- "loss": 0.48791584372520447,
- "mean_token_accuracy": 0.8425545692443848,
- "num_tokens": 3955200.0,
- "step": 445
- },
- {
- "epoch": 0.33890577507598785,
- "grad_norm": 1.8813284635543823,
- "learning_rate": 4.946667625000358e-06,
- "loss": 0.45922309160232544,
- "mean_token_accuracy": 0.8206527233123779,
- "num_tokens": 3962975.0,
- "step": 446
- },
- {
- "epoch": 0.33966565349544076,
- "grad_norm": 1.7157847881317139,
- "learning_rate": 4.946236467373392e-06,
- "loss": 0.5454182028770447,
- "mean_token_accuracy": 0.8049604892730713,
- "num_tokens": 3973956.0,
- "step": 447
- },
- {
- "epoch": 0.3404255319148936,
- "grad_norm": 2.008857250213623,
- "learning_rate": 4.945803592883509e-06,
- "loss": 0.5151860117912292,
- "mean_token_accuracy": 0.8262045383453369,
- "num_tokens": 3982853.0,
- "step": 448
- },
- {
- "epoch": 0.3411854103343465,
- "grad_norm": 1.6632496118545532,
- "learning_rate": 4.9453690018345144e-06,
- "loss": 0.42710691690444946,
- "mean_token_accuracy": 0.8521314859390259,
- "num_tokens": 3993838.0,
- "step": 449
- },
- {
- "epoch": 0.34194528875379937,
- "grad_norm": 1.365234375,
- "learning_rate": 4.944932694531423e-06,
- "loss": 0.5172526836395264,
- "mean_token_accuracy": 0.8277045488357544,
- "num_tokens": 4014179.0,
- "step": 450
- },
- {
- "epoch": 0.3427051671732523,
- "grad_norm": 1.7610243558883667,
- "learning_rate": 4.94449467128045e-06,
- "loss": 0.42104798555374146,
- "mean_token_accuracy": 0.8552065491676331,
- "num_tokens": 4023663.0,
- "step": 451
- },
- {
- "epoch": 0.3434650455927052,
- "grad_norm": 2.3732354640960693,
- "learning_rate": 4.944054932389018e-06,
- "loss": 0.5471175909042358,
- "mean_token_accuracy": 0.8487317562103271,
- "num_tokens": 4030100.0,
- "step": 452
- },
- {
- "epoch": 0.34422492401215804,
- "grad_norm": 1.5973623991012573,
- "learning_rate": 4.943613478165753e-06,
- "loss": 0.419813871383667,
- "mean_token_accuracy": 0.8484025001525879,
- "num_tokens": 4041124.0,
- "step": 453
- },
- {
- "epoch": 0.34498480243161095,
- "grad_norm": 2.966381549835205,
- "learning_rate": 4.943170308920484e-06,
- "loss": 0.5370652675628662,
- "mean_token_accuracy": 0.8439491987228394,
- "num_tokens": 4045675.0,
- "step": 454
- },
- {
- "epoch": 0.34574468085106386,
- "grad_norm": 2.5097248554229736,
- "learning_rate": 4.9427254249642445e-06,
- "loss": 0.5776349306106567,
- "mean_token_accuracy": 0.8060523867607117,
- "num_tokens": 4053250.0,
- "step": 455
- },
- {
- "epoch": 0.3465045592705167,
- "grad_norm": 1.6779125928878784,
- "learning_rate": 4.942278826609272e-06,
- "loss": 0.5245476961135864,
- "mean_token_accuracy": 0.8168526887893677,
- "num_tokens": 4064106.0,
- "step": 456
- },
- {
- "epoch": 0.3472644376899696,
- "grad_norm": 1.5945546627044678,
- "learning_rate": 4.9418305141690045e-06,
- "loss": 0.4972047209739685,
- "mean_token_accuracy": 0.8257735967636108,
- "num_tokens": 4077687.0,
- "step": 457
- },
- {
- "epoch": 0.34802431610942247,
- "grad_norm": 2.864778757095337,
- "learning_rate": 4.9413804879580865e-06,
- "loss": 0.5372499823570251,
- "mean_token_accuracy": 0.8423776626586914,
- "num_tokens": 4082632.0,
- "step": 458
- },
- {
- "epoch": 0.3487841945288754,
- "grad_norm": 1.4797078371047974,
- "learning_rate": 4.940928748292363e-06,
- "loss": 0.5903409719467163,
- "mean_token_accuracy": 0.8061295747756958,
- "num_tokens": 4104218.0,
- "step": 459
- },
- {
- "epoch": 0.3495440729483283,
- "grad_norm": 2.4376983642578125,
- "learning_rate": 4.940475295488882e-06,
- "loss": 0.4534894824028015,
- "mean_token_accuracy": 0.8395825028419495,
- "num_tokens": 4110530.0,
- "step": 460
- },
- {
- "epoch": 0.35030395136778114,
- "grad_norm": 1.2955626249313354,
- "learning_rate": 4.940020129865895e-06,
- "loss": 0.47155818343162537,
- "mean_token_accuracy": 0.8253582715988159,
- "num_tokens": 4128398.0,
- "step": 461
- },
- {
- "epoch": 0.35106382978723405,
- "grad_norm": 2.066575527191162,
- "learning_rate": 4.9395632517428546e-06,
- "loss": 0.5555641651153564,
- "mean_token_accuracy": 0.814624547958374,
- "num_tokens": 4137623.0,
- "step": 462
- },
- {
- "epoch": 0.3518237082066869,
- "grad_norm": 1.6407525539398193,
- "learning_rate": 4.939104661440415e-06,
- "loss": 0.4361790418624878,
- "mean_token_accuracy": 0.8544459342956543,
- "num_tokens": 4152803.0,
- "step": 463
- },
- {
- "epoch": 0.3525835866261398,
- "grad_norm": 2.1685116291046143,
- "learning_rate": 4.938644359280433e-06,
- "loss": 0.5347012877464294,
- "mean_token_accuracy": 0.853853702545166,
- "num_tokens": 4160778.0,
- "step": 464
- },
- {
- "epoch": 0.3533434650455927,
- "grad_norm": 1.8824869394302368,
- "learning_rate": 4.938182345585967e-06,
- "loss": 0.5512481927871704,
- "mean_token_accuracy": 0.7985891699790955,
- "num_tokens": 4170380.0,
- "step": 465
- },
- {
- "epoch": 0.3541033434650456,
- "grad_norm": 2.2229504585266113,
- "learning_rate": 4.937718620681273e-06,
- "loss": 0.516828179359436,
- "mean_token_accuracy": 0.8265621066093445,
- "num_tokens": 4178179.0,
- "step": 466
- },
- {
- "epoch": 0.3548632218844985,
- "grad_norm": 1.955990195274353,
- "learning_rate": 4.9372531848918145e-06,
- "loss": 0.5586158037185669,
- "mean_token_accuracy": 0.8367916345596313,
- "num_tokens": 4188626.0,
- "step": 467
- },
- {
- "epoch": 0.3556231003039514,
- "grad_norm": 1.9687023162841797,
- "learning_rate": 4.936786038544251e-06,
- "loss": 0.5517531633377075,
- "mean_token_accuracy": 0.8134098052978516,
- "num_tokens": 4198144.0,
- "step": 468
- },
- {
- "epoch": 0.35638297872340424,
- "grad_norm": 1.405516505241394,
- "learning_rate": 4.9363171819664434e-06,
- "loss": 0.5305492877960205,
- "mean_token_accuracy": 0.8014427423477173,
- "num_tokens": 4222818.0,
- "step": 469
- },
- {
- "epoch": 0.35714285714285715,
- "grad_norm": 2.6355695724487305,
- "learning_rate": 4.9358466154874535e-06,
- "loss": 0.5303391218185425,
- "mean_token_accuracy": 0.8028861284255981,
- "num_tokens": 4228318.0,
- "step": 470
- },
- {
- "epoch": 0.35790273556231,
- "grad_norm": 1.5133824348449707,
- "learning_rate": 4.935374339437543e-06,
- "loss": 0.5329189300537109,
- "mean_token_accuracy": 0.8479441404342651,
- "num_tokens": 4244527.0,
- "step": 471
- },
- {
- "epoch": 0.3586626139817629,
- "grad_norm": 3.4356725215911865,
- "learning_rate": 4.934900354148173e-06,
- "loss": 0.5431582927703857,
- "mean_token_accuracy": 0.8328983783721924,
- "num_tokens": 4248034.0,
- "step": 472
- },
- {
- "epoch": 0.3594224924012158,
- "grad_norm": 2.5789499282836914,
- "learning_rate": 4.934424659952006e-06,
- "loss": 0.4141455292701721,
- "mean_token_accuracy": 0.8658635020256042,
- "num_tokens": 4252953.0,
- "step": 473
- },
- {
- "epoch": 0.3601823708206687,
- "grad_norm": 1.145262598991394,
- "learning_rate": 4.933947257182901e-06,
- "loss": 0.40294092893600464,
- "mean_token_accuracy": 0.8565847277641296,
- "num_tokens": 4277813.0,
- "step": 474
- },
- {
- "epoch": 0.3609422492401216,
- "grad_norm": 1.7242133617401123,
- "learning_rate": 4.933468146175918e-06,
- "loss": 0.6036738753318787,
- "mean_token_accuracy": 0.8072597980499268,
- "num_tokens": 4291088.0,
- "step": 475
- },
- {
- "epoch": 0.3617021276595745,
- "grad_norm": 2.3490941524505615,
- "learning_rate": 4.932987327267317e-06,
- "loss": 0.49456146359443665,
- "mean_token_accuracy": 0.8372673988342285,
- "num_tokens": 4297376.0,
- "step": 476
- },
- {
- "epoch": 0.36246200607902734,
- "grad_norm": 1.3605526685714722,
- "learning_rate": 4.932504800794553e-06,
- "loss": 0.43595948815345764,
- "mean_token_accuracy": 0.8415953516960144,
- "num_tokens": 4312054.0,
- "step": 477
- },
- {
- "epoch": 0.36322188449848025,
- "grad_norm": 1.4525885581970215,
- "learning_rate": 4.9320205670962815e-06,
- "loss": 0.5390371680259705,
- "mean_token_accuracy": 0.8101649284362793,
- "num_tokens": 4328701.0,
- "step": 478
- },
- {
- "epoch": 0.3639817629179331,
- "grad_norm": 1.9862419366836548,
- "learning_rate": 4.931534626512359e-06,
- "loss": 0.45436930656433105,
- "mean_token_accuracy": 0.8352861404418945,
- "num_tokens": 4338372.0,
- "step": 479
- },
- {
- "epoch": 0.364741641337386,
- "grad_norm": 1.7804961204528809,
- "learning_rate": 4.931046979383836e-06,
- "loss": 0.4677754044532776,
- "mean_token_accuracy": 0.840467095375061,
- "num_tokens": 4347897.0,
- "step": 480
- },
- {
- "epoch": 0.3655015197568389,
- "grad_norm": 2.066632032394409,
- "learning_rate": 4.930557626052961e-06,
- "loss": 0.42418140172958374,
- "mean_token_accuracy": 0.8528275489807129,
- "num_tokens": 4354061.0,
- "step": 481
- },
- {
- "epoch": 0.3662613981762918,
- "grad_norm": 1.6155282258987427,
- "learning_rate": 4.930066566863182e-06,
- "loss": 0.5424284934997559,
- "mean_token_accuracy": 0.825040876865387,
- "num_tokens": 4370400.0,
- "step": 482
- },
- {
- "epoch": 0.3670212765957447,
- "grad_norm": 2.1452953815460205,
- "learning_rate": 4.929573802159143e-06,
- "loss": 0.5105804204940796,
- "mean_token_accuracy": 0.8284053802490234,
- "num_tokens": 4377579.0,
- "step": 483
- },
- {
- "epoch": 0.3677811550151976,
- "grad_norm": 1.8940945863723755,
- "learning_rate": 4.929079332286685e-06,
- "loss": 0.43478304147720337,
- "mean_token_accuracy": 0.8505665063858032,
- "num_tokens": 4385686.0,
- "step": 484
- },
- {
- "epoch": 0.36854103343465044,
- "grad_norm": 1.6785860061645508,
- "learning_rate": 4.928583157592846e-06,
- "loss": 0.40227848291397095,
- "mean_token_accuracy": 0.8623573780059814,
- "num_tokens": 4396128.0,
- "step": 485
- },
- {
- "epoch": 0.36930091185410335,
- "grad_norm": 1.6416733264923096,
- "learning_rate": 4.928085278425862e-06,
- "loss": 0.526267409324646,
- "mean_token_accuracy": 0.8284667730331421,
- "num_tokens": 4407963.0,
- "step": 486
- },
- {
- "epoch": 0.3700607902735562,
- "grad_norm": 1.8882389068603516,
- "learning_rate": 4.927585695135162e-06,
- "loss": 0.5555213093757629,
- "mean_token_accuracy": 0.8115293979644775,
- "num_tokens": 4418057.0,
- "step": 487
- },
- {
- "epoch": 0.3708206686930091,
- "grad_norm": 2.300248384475708,
- "learning_rate": 4.9270844080713735e-06,
- "loss": 0.5812339186668396,
- "mean_token_accuracy": 0.800270676612854,
- "num_tokens": 4425358.0,
- "step": 488
- },
- {
- "epoch": 0.371580547112462,
- "grad_norm": 1.6802922487258911,
- "learning_rate": 4.926581417586319e-06,
- "loss": 0.5134941935539246,
- "mean_token_accuracy": 0.8247408866882324,
- "num_tokens": 4437702.0,
- "step": 489
- },
- {
- "epoch": 0.3723404255319149,
- "grad_norm": 1.7620291709899902,
- "learning_rate": 4.926076724033016e-06,
- "loss": 0.5233973264694214,
- "mean_token_accuracy": 0.8102161884307861,
- "num_tokens": 4448584.0,
- "step": 490
- },
- {
- "epoch": 0.3731003039513678,
- "grad_norm": 1.6911998987197876,
- "learning_rate": 4.925570327765678e-06,
- "loss": 0.5337274074554443,
- "mean_token_accuracy": 0.845306396484375,
- "num_tokens": 4462651.0,
- "step": 491
- },
- {
- "epoch": 0.3738601823708207,
- "grad_norm": 1.7991242408752441,
- "learning_rate": 4.9250622291397144e-06,
- "loss": 0.31018948554992676,
- "mean_token_accuracy": 0.8857606053352356,
- "num_tokens": 4469971.0,
- "step": 492
- },
- {
- "epoch": 0.37462006079027355,
- "grad_norm": 4.9776835441589355,
- "learning_rate": 4.924552428511727e-06,
- "loss": 0.44114983081817627,
- "mean_token_accuracy": 0.8429906368255615,
- "num_tokens": 4478275.0,
- "step": 493
- },
- {
- "epoch": 0.37537993920972645,
- "grad_norm": 1.8007272481918335,
- "learning_rate": 4.924040926239515e-06,
- "loss": 0.574328601360321,
- "mean_token_accuracy": 0.7669196128845215,
- "num_tokens": 4491551.0,
- "step": 494
- },
- {
- "epoch": 0.3761398176291793,
- "grad_norm": 2.021300792694092,
- "learning_rate": 4.92352772268207e-06,
- "loss": 0.45636120438575745,
- "mean_token_accuracy": 0.840438723564148,
- "num_tokens": 4498658.0,
- "step": 495
- },
- {
- "epoch": 0.3768996960486322,
- "grad_norm": 2.369748592376709,
- "learning_rate": 4.923012818199576e-06,
- "loss": 0.5206376910209656,
- "mean_token_accuracy": 0.8521823287010193,
- "num_tokens": 4504648.0,
- "step": 496
- },
- {
- "epoch": 0.3776595744680851,
- "grad_norm": 2.733485221862793,
- "learning_rate": 4.922496213153416e-06,
- "loss": 0.5067723989486694,
- "mean_token_accuracy": 0.8168281316757202,
- "num_tokens": 4509990.0,
- "step": 497
- },
- {
- "epoch": 0.378419452887538,
- "grad_norm": 2.3751676082611084,
- "learning_rate": 4.921977907906161e-06,
- "loss": 0.49757206439971924,
- "mean_token_accuracy": 0.8325017690658569,
- "num_tokens": 4518373.0,
- "step": 498
- },
- {
- "epoch": 0.3791793313069909,
- "grad_norm": 2.1672775745391846,
- "learning_rate": 4.921457902821578e-06,
- "loss": 0.4237566590309143,
- "mean_token_accuracy": 0.8404698371887207,
- "num_tokens": 4524338.0,
- "step": 499
- },
- {
- "epoch": 0.3799392097264438,
- "grad_norm": 1.8374360799789429,
- "learning_rate": 4.9209361982646275e-06,
- "loss": 0.4995468854904175,
- "mean_token_accuracy": 0.8299649953842163,
- "num_tokens": 4533396.0,
- "step": 500
- },
- {
- "epoch": 0.38069908814589665,
- "grad_norm": 2.083967924118042,
- "learning_rate": 4.920412794601461e-06,
- "loss": 0.489935040473938,
- "mean_token_accuracy": 0.8315291404724121,
- "num_tokens": 4540941.0,
- "step": 501
- },
- {
- "epoch": 0.38145896656534956,
- "grad_norm": 2.2075610160827637,
- "learning_rate": 4.919887692199423e-06,
- "loss": 0.5233147740364075,
- "mean_token_accuracy": 0.804171085357666,
- "num_tokens": 4548215.0,
- "step": 502
- },
- {
- "epoch": 0.3822188449848024,
- "grad_norm": 2.076775312423706,
- "learning_rate": 4.9193608914270515e-06,
- "loss": 0.5785550475120544,
- "mean_token_accuracy": 0.7993186116218567,
- "num_tokens": 4558204.0,
- "step": 503
- },
- {
- "epoch": 0.3829787234042553,
- "grad_norm": 2.238546133041382,
- "learning_rate": 4.918832392654075e-06,
- "loss": 0.5287384390830994,
- "mean_token_accuracy": 0.8214945793151855,
- "num_tokens": 4565407.0,
- "step": 504
- },
- {
- "epoch": 0.3837386018237082,
- "grad_norm": 1.6783074140548706,
- "learning_rate": 4.9183021962514145e-06,
- "loss": 0.6063359379768372,
- "mean_token_accuracy": 0.7914625406265259,
- "num_tokens": 4580991.0,
- "step": 505
- },
- {
- "epoch": 0.3844984802431611,
- "grad_norm": 1.6287449598312378,
- "learning_rate": 4.917770302591183e-06,
- "loss": 0.3598247766494751,
- "mean_token_accuracy": 0.8706809878349304,
- "num_tokens": 4590579.0,
- "step": 506
- },
- {
- "epoch": 0.385258358662614,
- "grad_norm": 1.5432041883468628,
- "learning_rate": 4.917236712046682e-06,
- "loss": 0.5267890095710754,
- "mean_token_accuracy": 0.8032117486000061,
- "num_tokens": 4608380.0,
- "step": 507
- },
- {
- "epoch": 0.3860182370820669,
- "grad_norm": 1.7664037942886353,
- "learning_rate": 4.9167014249924075e-06,
- "loss": 0.3552354574203491,
- "mean_token_accuracy": 0.8569793701171875,
- "num_tokens": 4616426.0,
- "step": 508
- },
- {
- "epoch": 0.38677811550151975,
- "grad_norm": 2.1147472858428955,
- "learning_rate": 4.916164441804044e-06,
- "loss": 0.5212404727935791,
- "mean_token_accuracy": 0.8196578025817871,
- "num_tokens": 4623908.0,
- "step": 509
- },
- {
- "epoch": 0.38753799392097266,
- "grad_norm": 2.1092333793640137,
- "learning_rate": 4.915625762858467e-06,
- "loss": 0.5197038650512695,
- "mean_token_accuracy": 0.8245604634284973,
- "num_tokens": 4630956.0,
- "step": 510
- },
- {
- "epoch": 0.3882978723404255,
- "grad_norm": 1.23331880569458,
- "learning_rate": 4.915085388533743e-06,
- "loss": 0.4759839177131653,
- "mean_token_accuracy": 0.8192248344421387,
- "num_tokens": 4651269.0,
- "step": 511
- },
- {
- "epoch": 0.3890577507598784,
- "grad_norm": 2.424199104309082,
- "learning_rate": 4.914543319209126e-06,
- "loss": 0.5576270818710327,
- "mean_token_accuracy": 0.8203302621841431,
- "num_tokens": 4657296.0,
- "step": 512
- },
- {
- "epoch": 0.3898176291793313,
- "grad_norm": 2.725156307220459,
- "learning_rate": 4.913999555265062e-06,
- "loss": 0.4337949752807617,
- "mean_token_accuracy": 0.8382406234741211,
- "num_tokens": 4661850.0,
- "step": 513
- },
- {
- "epoch": 0.3905775075987842,
- "grad_norm": 2.3120534420013428,
- "learning_rate": 4.913454097083185e-06,
- "loss": 0.4941597580909729,
- "mean_token_accuracy": 0.8302834033966064,
- "num_tokens": 4667769.0,
- "step": 514
- },
- {
- "epoch": 0.3913373860182371,
- "grad_norm": 2.3111207485198975,
- "learning_rate": 4.912906945046319e-06,
- "loss": 0.5253715515136719,
- "mean_token_accuracy": 0.84515380859375,
- "num_tokens": 4674537.0,
- "step": 515
- },
- {
- "epoch": 0.39209726443769,
- "grad_norm": 1.4117841720581055,
- "learning_rate": 4.912358099538476e-06,
- "loss": 0.4521017074584961,
- "mean_token_accuracy": 0.8208256959915161,
- "num_tokens": 4690605.0,
- "step": 516
- },
- {
- "epoch": 0.39285714285714285,
- "grad_norm": 2.3742799758911133,
- "learning_rate": 4.911807560944858e-06,
- "loss": 0.41572901606559753,
- "mean_token_accuracy": 0.8550551533699036,
- "num_tokens": 4706437.0,
- "step": 517
- },
- {
- "epoch": 0.39361702127659576,
- "grad_norm": 2.4052202701568604,
- "learning_rate": 4.911255329651852e-06,
- "loss": 0.6003736257553101,
- "mean_token_accuracy": 0.8247885704040527,
- "num_tokens": 4712746.0,
- "step": 518
- },
- {
- "epoch": 0.3943768996960486,
- "grad_norm": 1.9335490465164185,
- "learning_rate": 4.910701406047037e-06,
- "loss": 0.5457713603973389,
- "mean_token_accuracy": 0.787429690361023,
- "num_tokens": 4731937.0,
- "step": 519
- },
- {
- "epoch": 0.3951367781155015,
- "grad_norm": 2.257706880569458,
- "learning_rate": 4.910145790519177e-06,
- "loss": 0.5300652980804443,
- "mean_token_accuracy": 0.8192912936210632,
- "num_tokens": 4739422.0,
- "step": 520
- },
- {
- "epoch": 0.3958966565349544,
- "grad_norm": 1.2099462747573853,
- "learning_rate": 4.9095884834582256e-06,
- "loss": 0.45872747898101807,
- "mean_token_accuracy": 0.8362667560577393,
- "num_tokens": 4757113.0,
- "step": 521
- },
- {
- "epoch": 0.3966565349544073,
- "grad_norm": 2.7991135120391846,
- "learning_rate": 4.909029485255321e-06,
- "loss": 0.49039560556411743,
- "mean_token_accuracy": 0.8260016441345215,
- "num_tokens": 4761709.0,
- "step": 522
- },
- {
- "epoch": 0.3974164133738602,
- "grad_norm": 2.2360129356384277,
- "learning_rate": 4.90846879630279e-06,
- "loss": 0.49556830525398254,
- "mean_token_accuracy": 0.827864408493042,
- "num_tokens": 4769048.0,
- "step": 523
- },
- {
- "epoch": 0.3981762917933131,
- "grad_norm": 2.5953688621520996,
- "learning_rate": 4.907906416994146e-06,
- "loss": 0.387208491563797,
- "mean_token_accuracy": 0.8467001914978027,
- "num_tokens": 4774637.0,
- "step": 524
- },
- {
- "epoch": 0.39893617021276595,
- "grad_norm": 2.1046814918518066,
- "learning_rate": 4.907342347724088e-06,
- "loss": 0.5477259755134583,
- "mean_token_accuracy": 0.8060322999954224,
- "num_tokens": 4782774.0,
- "step": 525
- },
- {
- "epoch": 0.39969604863221886,
- "grad_norm": 2.5622646808624268,
- "learning_rate": 4.906776588888502e-06,
- "loss": 0.5684159398078918,
- "mean_token_accuracy": 0.8095303177833557,
- "num_tokens": 4788766.0,
- "step": 526
- },
- {
- "epoch": 0.4004559270516717,
- "grad_norm": 1.9027913808822632,
- "learning_rate": 4.906209140884459e-06,
- "loss": 0.535524845123291,
- "mean_token_accuracy": 0.815237820148468,
- "num_tokens": 4798492.0,
- "step": 527
- },
- {
- "epoch": 0.4012158054711246,
- "grad_norm": 2.1447622776031494,
- "learning_rate": 4.905640004110216e-06,
- "loss": 0.5628632307052612,
- "mean_token_accuracy": 0.8085395097732544,
- "num_tokens": 4805737.0,
- "step": 528
- },
- {
- "epoch": 0.40197568389057753,
- "grad_norm": 1.6754741668701172,
- "learning_rate": 4.905069178965215e-06,
- "loss": 0.5046736598014832,
- "mean_token_accuracy": 0.8247535228729248,
- "num_tokens": 4816912.0,
- "step": 529
- },
- {
- "epoch": 0.4027355623100304,
- "grad_norm": 2.271230459213257,
- "learning_rate": 4.904496665850083e-06,
- "loss": 0.6086187958717346,
- "mean_token_accuracy": 0.7935276627540588,
- "num_tokens": 4824577.0,
- "step": 530
- },
- {
- "epoch": 0.4034954407294833,
- "grad_norm": 2.107595205307007,
- "learning_rate": 4.903922465166633e-06,
- "loss": 0.5431341528892517,
- "mean_token_accuracy": 0.8129537105560303,
- "num_tokens": 4831772.0,
- "step": 531
- },
- {
- "epoch": 0.40425531914893614,
- "grad_norm": 1.3860732316970825,
- "learning_rate": 4.903346577317859e-06,
- "loss": 0.45816320180892944,
- "mean_token_accuracy": 0.8328287601470947,
- "num_tokens": 4850302.0,
- "step": 532
- },
- {
- "epoch": 0.40501519756838905,
- "grad_norm": 1.9186837673187256,
- "learning_rate": 4.902769002707942e-06,
- "loss": 0.3294633626937866,
- "mean_token_accuracy": 0.8853933811187744,
- "num_tokens": 4856624.0,
- "step": 533
- },
- {
- "epoch": 0.40577507598784196,
- "grad_norm": 1.516194462776184,
- "learning_rate": 4.902189741742247e-06,
- "loss": 0.45482105016708374,
- "mean_token_accuracy": 0.8370342254638672,
- "num_tokens": 4870395.0,
- "step": 534
- },
- {
- "epoch": 0.4065349544072948,
- "grad_norm": 2.3235628604888916,
- "learning_rate": 4.901608794827321e-06,
- "loss": 0.40688639879226685,
- "mean_token_accuracy": 0.8643521666526794,
- "num_tokens": 4875645.0,
- "step": 535
- },
- {
- "epoch": 0.4072948328267477,
- "grad_norm": 2.29286527633667,
- "learning_rate": 4.9010261623708945e-06,
- "loss": 0.45482826232910156,
- "mean_token_accuracy": 0.8429383039474487,
- "num_tokens": 4881772.0,
- "step": 536
- },
- {
- "epoch": 0.40805471124620063,
- "grad_norm": 1.5907070636749268,
- "learning_rate": 4.900441844781882e-06,
- "loss": 0.5266948342323303,
- "mean_token_accuracy": 0.8348641395568848,
- "num_tokens": 4894289.0,
- "step": 537
- },
- {
- "epoch": 0.4088145896656535,
- "grad_norm": 2.1816294193267822,
- "learning_rate": 4.89985584247038e-06,
- "loss": 0.4797617793083191,
- "mean_token_accuracy": 0.8549500703811646,
- "num_tokens": 4901106.0,
- "step": 538
- },
- {
- "epoch": 0.4095744680851064,
- "grad_norm": 1.7347146272659302,
- "learning_rate": 4.899268155847667e-06,
- "loss": 0.4754739999771118,
- "mean_token_accuracy": 0.8278418183326721,
- "num_tokens": 4912131.0,
- "step": 539
- },
- {
- "epoch": 0.41033434650455924,
- "grad_norm": 2.0694527626037598,
- "learning_rate": 4.898678785326205e-06,
- "loss": 0.5071008801460266,
- "mean_token_accuracy": 0.8157946467399597,
- "num_tokens": 4921141.0,
- "step": 540
- },
- {
- "epoch": 0.41109422492401215,
- "grad_norm": 2.570047616958618,
- "learning_rate": 4.898087731319637e-06,
- "loss": 0.43639278411865234,
- "mean_token_accuracy": 0.8682913780212402,
- "num_tokens": 4926182.0,
- "step": 541
- },
- {
- "epoch": 0.41185410334346506,
- "grad_norm": 4.064006805419922,
- "learning_rate": 4.8974949942427854e-06,
- "loss": 0.539260745048523,
- "mean_token_accuracy": 0.8225528001785278,
- "num_tokens": 4929449.0,
- "step": 542
- },
- {
- "epoch": 0.4126139817629179,
- "grad_norm": 1.7644332647323608,
- "learning_rate": 4.896900574511657e-06,
- "loss": 0.472618043422699,
- "mean_token_accuracy": 0.8332902193069458,
- "num_tokens": 4939443.0,
- "step": 543
- },
- {
- "epoch": 0.4133738601823708,
- "grad_norm": 2.879918336868286,
- "learning_rate": 4.89630447254344e-06,
- "loss": 0.6360667943954468,
- "mean_token_accuracy": 0.8215296268463135,
- "num_tokens": 4950838.0,
- "step": 544
- },
- {
- "epoch": 0.41413373860182373,
- "grad_norm": 1.4575570821762085,
- "learning_rate": 4.8957066887565005e-06,
- "loss": 0.45617997646331787,
- "mean_token_accuracy": 0.8373187184333801,
- "num_tokens": 4965222.0,
- "step": 545
- },
- {
- "epoch": 0.4148936170212766,
- "grad_norm": 2.4829535484313965,
- "learning_rate": 4.895107223570386e-06,
- "loss": 0.42285341024398804,
- "mean_token_accuracy": 0.8686380386352539,
- "num_tokens": 4970724.0,
- "step": 546
- },
- {
- "epoch": 0.4156534954407295,
- "grad_norm": 2.639474630355835,
- "learning_rate": 4.894506077405824e-06,
- "loss": 0.5906289219856262,
- "mean_token_accuracy": 0.8174435496330261,
- "num_tokens": 4976766.0,
- "step": 547
- },
- {
- "epoch": 0.41641337386018235,
- "grad_norm": 2.7960562705993652,
- "learning_rate": 4.893903250684723e-06,
- "loss": 0.4518949091434479,
- "mean_token_accuracy": 0.8387585282325745,
- "num_tokens": 4980991.0,
- "step": 548
- },
- {
- "epoch": 0.41717325227963525,
- "grad_norm": 2.184176206588745,
- "learning_rate": 4.893298743830168e-06,
- "loss": 0.5223842859268188,
- "mean_token_accuracy": 0.8170937299728394,
- "num_tokens": 4987781.0,
- "step": 549
- },
- {
- "epoch": 0.41793313069908816,
- "grad_norm": 2.2393438816070557,
- "learning_rate": 4.892692557266429e-06,
- "loss": 0.5238431692123413,
- "mean_token_accuracy": 0.8217905759811401,
- "num_tokens": 4994321.0,
- "step": 550
- },
- {
- "epoch": 0.418693009118541,
- "grad_norm": 3.579047441482544,
- "learning_rate": 4.8920846914189465e-06,
- "loss": 0.5367584228515625,
- "mean_token_accuracy": 0.8312011361122131,
- "num_tokens": 4997951.0,
- "step": 551
- },
- {
- "epoch": 0.4194528875379939,
- "grad_norm": 1.6330240964889526,
- "learning_rate": 4.891475146714348e-06,
- "loss": 0.6054705381393433,
- "mean_token_accuracy": 0.7938206791877747,
- "num_tokens": 5012726.0,
- "step": 552
- },
- {
- "epoch": 0.42021276595744683,
- "grad_norm": 1.5775716304779053,
- "learning_rate": 4.8908639235804324e-06,
- "loss": 0.4774656891822815,
- "mean_token_accuracy": 0.828762948513031,
- "num_tokens": 5026751.0,
- "step": 553
- },
- {
- "epoch": 0.4209726443768997,
- "grad_norm": 1.5719101428985596,
- "learning_rate": 4.890251022446181e-06,
- "loss": 0.549429178237915,
- "mean_token_accuracy": 0.8110791444778442,
- "num_tokens": 5041861.0,
- "step": 554
- },
- {
- "epoch": 0.4217325227963526,
- "grad_norm": 1.8585275411605835,
- "learning_rate": 4.889636443741752e-06,
- "loss": 0.4448118805885315,
- "mean_token_accuracy": 0.8462690711021423,
- "num_tokens": 5052690.0,
- "step": 555
- },
- {
- "epoch": 0.42249240121580545,
- "grad_norm": 2.189202070236206,
- "learning_rate": 4.88902018789848e-06,
- "loss": 0.4296762943267822,
- "mean_token_accuracy": 0.8488791584968567,
- "num_tokens": 5058964.0,
- "step": 556
- },
- {
- "epoch": 0.42325227963525835,
- "grad_norm": 1.9328460693359375,
- "learning_rate": 4.888402255348877e-06,
- "loss": 0.5369474291801453,
- "mean_token_accuracy": 0.8184729814529419,
- "num_tokens": 5068465.0,
- "step": 557
- },
- {
- "epoch": 0.42401215805471126,
- "grad_norm": 1.6233323812484741,
- "learning_rate": 4.887782646526631e-06,
- "loss": 0.5284391641616821,
- "mean_token_accuracy": 0.8276044726371765,
- "num_tokens": 5081052.0,
- "step": 558
- },
- {
- "epoch": 0.4247720364741641,
- "grad_norm": 2.222813844680786,
- "learning_rate": 4.887161361866608e-06,
- "loss": 0.5679137706756592,
- "mean_token_accuracy": 0.8012375831604004,
- "num_tokens": 5090001.0,
- "step": 559
- },
- {
- "epoch": 0.425531914893617,
- "grad_norm": 2.1062207221984863,
- "learning_rate": 4.8865384018048494e-06,
- "loss": 0.5554201602935791,
- "mean_token_accuracy": 0.8128066062927246,
- "num_tokens": 5097644.0,
- "step": 560
- },
- {
- "epoch": 0.42629179331306993,
- "grad_norm": 1.5380984544754028,
- "learning_rate": 4.8859137667785735e-06,
- "loss": 0.4948265850543976,
- "mean_token_accuracy": 0.8258291482925415,
- "num_tokens": 5110069.0,
- "step": 561
- },
- {
- "epoch": 0.4270516717325228,
- "grad_norm": 2.0290257930755615,
- "learning_rate": 4.8852874572261715e-06,
- "loss": 0.4969530403614044,
- "mean_token_accuracy": 0.8297134637832642,
- "num_tokens": 5117452.0,
- "step": 562
- },
- {
- "epoch": 0.4278115501519757,
- "grad_norm": 1.5651452541351318,
- "learning_rate": 4.884659473587213e-06,
- "loss": 0.5353102087974548,
- "mean_token_accuracy": 0.8161719441413879,
- "num_tokens": 5133756.0,
- "step": 563
- },
- {
- "epoch": 0.42857142857142855,
- "grad_norm": 2.2470998764038086,
- "learning_rate": 4.884029816302441e-06,
- "loss": 0.5104288458824158,
- "mean_token_accuracy": 0.8081635236740112,
- "num_tokens": 5140278.0,
- "step": 564
- },
- {
- "epoch": 0.42933130699088146,
- "grad_norm": 1.726891279220581,
- "learning_rate": 4.883398485813772e-06,
- "loss": 0.4508771002292633,
- "mean_token_accuracy": 0.8548800349235535,
- "num_tokens": 5150115.0,
- "step": 565
- },
- {
- "epoch": 0.43009118541033436,
- "grad_norm": 1.4779289960861206,
- "learning_rate": 4.8827654825642984e-06,
- "loss": 0.46861088275909424,
- "mean_token_accuracy": 0.8209476470947266,
- "num_tokens": 5163225.0,
- "step": 566
- },
- {
- "epoch": 0.4308510638297872,
- "grad_norm": 1.2361034154891968,
- "learning_rate": 4.882130806998287e-06,
- "loss": 0.4591076672077179,
- "mean_token_accuracy": 0.803041934967041,
- "num_tokens": 5180342.0,
- "step": 567
- },
- {
- "epoch": 0.4316109422492401,
- "grad_norm": 1.882467269897461,
- "learning_rate": 4.881494459561177e-06,
- "loss": 0.579258143901825,
- "mean_token_accuracy": 0.8007112741470337,
- "num_tokens": 5189595.0,
- "step": 568
- },
- {
- "epoch": 0.43237082066869303,
- "grad_norm": 1.095462441444397,
- "learning_rate": 4.880856440699582e-06,
- "loss": 0.3806574046611786,
- "mean_token_accuracy": 0.8650111556053162,
- "num_tokens": 5211642.0,
- "step": 569
- },
- {
- "epoch": 0.4331306990881459,
- "grad_norm": 1.6469846963882446,
- "learning_rate": 4.880216750861288e-06,
- "loss": 0.544589638710022,
- "mean_token_accuracy": 0.8060122728347778,
- "num_tokens": 5224137.0,
- "step": 570
- },
- {
- "epoch": 0.4338905775075988,
- "grad_norm": 1.8561251163482666,
- "learning_rate": 4.879575390495254e-06,
- "loss": 0.4094924330711365,
- "mean_token_accuracy": 0.8591406345367432,
- "num_tokens": 5231588.0,
- "step": 571
- },
- {
- "epoch": 0.43465045592705165,
- "grad_norm": 3.01326847076416,
- "learning_rate": 4.878932360051611e-06,
- "loss": 0.6139192581176758,
- "mean_token_accuracy": 0.8108739852905273,
- "num_tokens": 5236853.0,
- "step": 572
- },
- {
- "epoch": 0.43541033434650456,
- "grad_norm": 2.1753034591674805,
- "learning_rate": 4.878287659981663e-06,
- "loss": 0.49082931876182556,
- "mean_token_accuracy": 0.862828254699707,
- "num_tokens": 5243264.0,
- "step": 573
- },
- {
- "epoch": 0.43617021276595747,
- "grad_norm": 1.4437755346298218,
- "learning_rate": 4.8776412907378845e-06,
- "loss": 0.5608728528022766,
- "mean_token_accuracy": 0.8271626234054565,
- "num_tokens": 5261757.0,
- "step": 574
- },
- {
- "epoch": 0.4369300911854103,
- "grad_norm": 1.786683440208435,
- "learning_rate": 4.876993252773923e-06,
- "loss": 0.4377627968788147,
- "mean_token_accuracy": 0.844936192035675,
- "num_tokens": 5271038.0,
- "step": 575
- },
- {
- "epoch": 0.4376899696048632,
- "grad_norm": 1.3425915241241455,
- "learning_rate": 4.876343546544596e-06,
- "loss": 0.44762521982192993,
- "mean_token_accuracy": 0.8397793769836426,
- "num_tokens": 5285555.0,
- "step": 576
- },
- {
- "epoch": 0.43844984802431614,
- "grad_norm": 2.1549675464630127,
- "learning_rate": 4.8756921725058935e-06,
- "loss": 0.5332942008972168,
- "mean_token_accuracy": 0.820149302482605,
- "num_tokens": 5294595.0,
- "step": 577
- },
- {
- "epoch": 0.439209726443769,
- "grad_norm": 1.5254042148590088,
- "learning_rate": 4.875039131114975e-06,
- "loss": 0.3646543622016907,
- "mean_token_accuracy": 0.8442583084106445,
- "num_tokens": 5304955.0,
- "step": 578
- },
- {
- "epoch": 0.4399696048632219,
- "grad_norm": 1.5751557350158691,
- "learning_rate": 4.8743844228301676e-06,
- "loss": 0.4854734539985657,
- "mean_token_accuracy": 0.8317523002624512,
- "num_tokens": 5317351.0,
- "step": 579
- },
- {
- "epoch": 0.44072948328267475,
- "grad_norm": 1.6950466632843018,
- "learning_rate": 4.873728048110973e-06,
- "loss": 0.5907570719718933,
- "mean_token_accuracy": 0.7946986556053162,
- "num_tokens": 5332542.0,
- "step": 580
- },
- {
- "epoch": 0.44148936170212766,
- "grad_norm": 2.1180708408355713,
- "learning_rate": 4.873070007418059e-06,
- "loss": 0.5220296382904053,
- "mean_token_accuracy": 0.8037363290786743,
- "num_tokens": 5341722.0,
- "step": 581
- },
- {
- "epoch": 0.44224924012158057,
- "grad_norm": 1.3643816709518433,
- "learning_rate": 4.872410301213265e-06,
- "loss": 0.4865502417087555,
- "mean_token_accuracy": 0.8377852439880371,
- "num_tokens": 5359359.0,
- "step": 582
- },
- {
- "epoch": 0.4430091185410334,
- "grad_norm": 1.483280897140503,
- "learning_rate": 4.871748929959598e-06,
- "loss": 0.36856764554977417,
- "mean_token_accuracy": 0.8709549903869629,
- "num_tokens": 5369749.0,
- "step": 583
- },
- {
- "epoch": 0.44376899696048633,
- "grad_norm": 1.6891541481018066,
- "learning_rate": 4.871085894121234e-06,
- "loss": 0.5768930912017822,
- "mean_token_accuracy": 0.8030461668968201,
- "num_tokens": 5383912.0,
- "step": 584
- },
- {
- "epoch": 0.44452887537993924,
- "grad_norm": 2.1318740844726562,
- "learning_rate": 4.870421194163515e-06,
- "loss": 0.4337100386619568,
- "mean_token_accuracy": 0.8562518358230591,
- "num_tokens": 5389412.0,
- "step": 585
- },
- {
- "epoch": 0.4452887537993921,
- "grad_norm": 2.540255546569824,
- "learning_rate": 4.869754830552956e-06,
- "loss": 0.4708256125450134,
- "mean_token_accuracy": 0.8446552753448486,
- "num_tokens": 5394762.0,
- "step": 586
- },
- {
- "epoch": 0.446048632218845,
- "grad_norm": 2.048015594482422,
- "learning_rate": 4.869086803757235e-06,
- "loss": 0.5265800952911377,
- "mean_token_accuracy": 0.8181137442588806,
- "num_tokens": 5402379.0,
- "step": 587
- },
- {
- "epoch": 0.44680851063829785,
- "grad_norm": 2.9821012020111084,
- "learning_rate": 4.868417114245199e-06,
- "loss": 0.6299797296524048,
- "mean_token_accuracy": 0.8237329125404358,
- "num_tokens": 5408229.0,
- "step": 588
- },
- {
- "epoch": 0.44756838905775076,
- "grad_norm": 1.7807202339172363,
- "learning_rate": 4.867745762486862e-06,
- "loss": 0.5176759958267212,
- "mean_token_accuracy": 0.8184244632720947,
- "num_tokens": 5418383.0,
- "step": 589
- },
- {
- "epoch": 0.44832826747720367,
- "grad_norm": 1.5466399192810059,
- "learning_rate": 4.8670727489534035e-06,
- "loss": 0.5137228965759277,
- "mean_token_accuracy": 0.8365053534507751,
- "num_tokens": 5432127.0,
- "step": 590
- },
- {
- "epoch": 0.4490881458966565,
- "grad_norm": 2.9521141052246094,
- "learning_rate": 4.866398074117173e-06,
- "loss": 0.4056887924671173,
- "mean_token_accuracy": 0.8561501502990723,
- "num_tokens": 5436062.0,
- "step": 591
- },
- {
- "epoch": 0.44984802431610943,
- "grad_norm": 2.058743953704834,
- "learning_rate": 4.86572173845168e-06,
- "loss": 0.6124799251556396,
- "mean_token_accuracy": 0.8007957339286804,
- "num_tokens": 5444989.0,
- "step": 592
- },
- {
- "epoch": 0.4506079027355623,
- "grad_norm": 2.1243767738342285,
- "learning_rate": 4.865043742431605e-06,
- "loss": 0.5659694671630859,
- "mean_token_accuracy": 0.8084750175476074,
- "num_tokens": 5453865.0,
- "step": 593
- },
- {
- "epoch": 0.4513677811550152,
- "grad_norm": 1.6732314825057983,
- "learning_rate": 4.864364086532792e-06,
- "loss": 0.47879064083099365,
- "mean_token_accuracy": 0.8346436023712158,
- "num_tokens": 5466398.0,
- "step": 594
- },
- {
- "epoch": 0.4521276595744681,
- "grad_norm": 1.3793858289718628,
- "learning_rate": 4.863682771232249e-06,
- "loss": 0.45989373326301575,
- "mean_token_accuracy": 0.8254791498184204,
- "num_tokens": 5482121.0,
- "step": 595
- },
- {
- "epoch": 0.45288753799392095,
- "grad_norm": 1.9812315702438354,
- "learning_rate": 4.862999797008149e-06,
- "loss": 0.5778874754905701,
- "mean_token_accuracy": 0.8041508197784424,
- "num_tokens": 5493000.0,
- "step": 596
- },
- {
- "epoch": 0.45364741641337386,
- "grad_norm": 3.3065083026885986,
- "learning_rate": 4.862315164339829e-06,
- "loss": 0.4623975157737732,
- "mean_token_accuracy": 0.8426318168640137,
- "num_tokens": 5496723.0,
- "step": 597
- },
- {
- "epoch": 0.45440729483282677,
- "grad_norm": 3.167119026184082,
- "learning_rate": 4.861628873707792e-06,
- "loss": 0.6984533667564392,
- "mean_token_accuracy": 0.772136926651001,
- "num_tokens": 5501161.0,
- "step": 598
- },
- {
- "epoch": 0.4551671732522796,
- "grad_norm": 2.2130985260009766,
- "learning_rate": 4.860940925593703e-06,
- "loss": 0.4823192059993744,
- "mean_token_accuracy": 0.8462972640991211,
- "num_tokens": 5509544.0,
- "step": 599
- },
- {
- "epoch": 0.45592705167173253,
- "grad_norm": 3.029191732406616,
- "learning_rate": 4.86025132048039e-06,
- "loss": 0.523664116859436,
- "mean_token_accuracy": 0.8229140043258667,
- "num_tokens": 5514586.0,
- "step": 600
- },
- {
- "epoch": 0.4566869300911854,
- "grad_norm": 1.6983962059020996,
- "learning_rate": 4.859560058851844e-06,
- "loss": 0.4832698106765747,
- "mean_token_accuracy": 0.8403248190879822,
- "num_tokens": 5525773.0,
- "step": 601
- },
- {
- "epoch": 0.4574468085106383,
- "grad_norm": 3.0504038333892822,
- "learning_rate": 4.8588671411932195e-06,
- "loss": 0.5158926248550415,
- "mean_token_accuracy": 0.8098392486572266,
- "num_tokens": 5529739.0,
- "step": 602
- },
- {
- "epoch": 0.4582066869300912,
- "grad_norm": 2.584836483001709,
- "learning_rate": 4.858172567990832e-06,
- "loss": 0.5724587440490723,
- "mean_token_accuracy": 0.8128519058227539,
- "num_tokens": 5535763.0,
- "step": 603
- },
- {
- "epoch": 0.45896656534954405,
- "grad_norm": 2.0514042377471924,
- "learning_rate": 4.857476339732162e-06,
- "loss": 0.4337679445743561,
- "mean_token_accuracy": 0.8405929207801819,
- "num_tokens": 5543075.0,
- "step": 604
- },
- {
- "epoch": 0.45972644376899696,
- "grad_norm": 2.2949347496032715,
- "learning_rate": 4.856778456905846e-06,
- "loss": 0.46532145142555237,
- "mean_token_accuracy": 0.8345137238502502,
- "num_tokens": 5549035.0,
- "step": 605
- },
- {
- "epoch": 0.46048632218844987,
- "grad_norm": 2.2067551612854004,
- "learning_rate": 4.856078920001689e-06,
- "loss": 0.5855136513710022,
- "mean_token_accuracy": 0.8043795228004456,
- "num_tokens": 5555545.0,
- "step": 606
- },
- {
- "epoch": 0.4612462006079027,
- "grad_norm": 2.101945161819458,
- "learning_rate": 4.855377729510648e-06,
- "loss": 0.6071814298629761,
- "mean_token_accuracy": 0.7973253130912781,
- "num_tokens": 5563615.0,
- "step": 607
- },
- {
- "epoch": 0.46200607902735563,
- "grad_norm": 2.5958821773529053,
- "learning_rate": 4.8546748859248504e-06,
- "loss": 0.6278061866760254,
- "mean_token_accuracy": 0.7864972352981567,
- "num_tokens": 5570078.0,
- "step": 608
- },
- {
- "epoch": 0.4627659574468085,
- "grad_norm": 2.778101921081543,
- "learning_rate": 4.853970389737576e-06,
- "loss": 0.35521194338798523,
- "mean_token_accuracy": 0.8752605319023132,
- "num_tokens": 5573995.0,
- "step": 609
- },
- {
- "epoch": 0.4635258358662614,
- "grad_norm": 2.600534677505493,
- "learning_rate": 4.8532642414432675e-06,
- "loss": 0.6541563868522644,
- "mean_token_accuracy": 0.7843613028526306,
- "num_tokens": 5580333.0,
- "step": 610
- },
- {
- "epoch": 0.4642857142857143,
- "grad_norm": 1.778337836265564,
- "learning_rate": 4.852556441537528e-06,
- "loss": 0.3561405837535858,
- "mean_token_accuracy": 0.8579353094100952,
- "num_tokens": 5588430.0,
- "step": 611
- },
- {
- "epoch": 0.46504559270516715,
- "grad_norm": 1.5653862953186035,
- "learning_rate": 4.851846990517118e-06,
- "loss": 0.6067906618118286,
- "mean_token_accuracy": 0.7919317483901978,
- "num_tokens": 5601700.0,
- "step": 612
- },
- {
- "epoch": 0.46580547112462006,
- "grad_norm": 1.6097723245620728,
- "learning_rate": 4.851135888879958e-06,
- "loss": 0.446664422750473,
- "mean_token_accuracy": 0.8441969156265259,
- "num_tokens": 5612063.0,
- "step": 613
- },
- {
- "epoch": 0.46656534954407297,
- "grad_norm": 1.961207389831543,
- "learning_rate": 4.850423137125126e-06,
- "loss": 0.5508605241775513,
- "mean_token_accuracy": 0.8240450024604797,
- "num_tokens": 5620245.0,
- "step": 614
- },
- {
- "epoch": 0.4673252279635258,
- "grad_norm": 2.2189085483551025,
- "learning_rate": 4.8497087357528585e-06,
- "loss": 0.6805076599121094,
- "mean_token_accuracy": 0.771978497505188,
- "num_tokens": 5629590.0,
- "step": 615
- },
- {
- "epoch": 0.46808510638297873,
- "grad_norm": 2.5176279544830322,
- "learning_rate": 4.8489926852645505e-06,
- "loss": 0.4512156844139099,
- "mean_token_accuracy": 0.836459755897522,
- "num_tokens": 5635259.0,
- "step": 616
- },
- {
- "epoch": 0.4688449848024316,
- "grad_norm": 1.5327287912368774,
- "learning_rate": 4.848274986162754e-06,
- "loss": 0.4884302616119385,
- "mean_token_accuracy": 0.8194037079811096,
- "num_tokens": 5649993.0,
- "step": 617
- },
- {
- "epoch": 0.4696048632218845,
- "grad_norm": 2.184554100036621,
- "learning_rate": 4.847555638951177e-06,
- "loss": 0.5141451358795166,
- "mean_token_accuracy": 0.8245922327041626,
- "num_tokens": 5657375.0,
- "step": 618
- },
- {
- "epoch": 0.4703647416413374,
- "grad_norm": 1.6143407821655273,
- "learning_rate": 4.846834644134686e-06,
- "loss": 0.4276641607284546,
- "mean_token_accuracy": 0.8481845855712891,
- "num_tokens": 5667941.0,
- "step": 619
- },
- {
- "epoch": 0.47112462006079026,
- "grad_norm": 2.3747270107269287,
- "learning_rate": 4.846112002219301e-06,
- "loss": 0.5608246922492981,
- "mean_token_accuracy": 0.8073011040687561,
- "num_tokens": 5675042.0,
- "step": 620
- },
- {
- "epoch": 0.47188449848024316,
- "grad_norm": 2.390404224395752,
- "learning_rate": 4.845387713712203e-06,
- "loss": 0.46616724133491516,
- "mean_token_accuracy": 0.8468319177627563,
- "num_tokens": 5680207.0,
- "step": 621
- },
- {
- "epoch": 0.4726443768996961,
- "grad_norm": 1.7245099544525146,
- "learning_rate": 4.844661779121723e-06,
- "loss": 0.5652435421943665,
- "mean_token_accuracy": 0.8010749816894531,
- "num_tokens": 5693759.0,
- "step": 622
- },
- {
- "epoch": 0.4734042553191489,
- "grad_norm": 2.6923108100891113,
- "learning_rate": 4.843934198957351e-06,
- "loss": 0.6254661679267883,
- "mean_token_accuracy": 0.8236024975776672,
- "num_tokens": 5699916.0,
- "step": 623
- },
- {
- "epoch": 0.47416413373860183,
- "grad_norm": 2.516901969909668,
- "learning_rate": 4.84320497372973e-06,
- "loss": 0.6334252953529358,
- "mean_token_accuracy": 0.7803834676742554,
- "num_tokens": 5706554.0,
- "step": 624
- },
- {
- "epoch": 0.4749240121580547,
- "grad_norm": 2.3744447231292725,
- "learning_rate": 4.842474103950658e-06,
- "loss": 0.4221811890602112,
- "mean_token_accuracy": 0.8639545440673828,
- "num_tokens": 5711756.0,
- "step": 625
- },
- {
- "epoch": 0.4756838905775076,
- "grad_norm": 3.2373476028442383,
- "learning_rate": 4.841741590133089e-06,
- "loss": 0.6637828946113586,
- "mean_token_accuracy": 0.7968347072601318,
- "num_tokens": 5716458.0,
- "step": 626
- },
- {
- "epoch": 0.4764437689969605,
- "grad_norm": 2.153888463973999,
- "learning_rate": 4.841007432791129e-06,
- "loss": 0.4877486228942871,
- "mean_token_accuracy": 0.8345249891281128,
- "num_tokens": 5723155.0,
- "step": 627
- },
- {
- "epoch": 0.47720364741641336,
- "grad_norm": 2.120497703552246,
- "learning_rate": 4.8402716324400375e-06,
- "loss": 0.37323033809661865,
- "mean_token_accuracy": 0.8734050393104553,
- "num_tokens": 5729171.0,
- "step": 628
- },
- {
- "epoch": 0.47796352583586627,
- "grad_norm": 1.5294172763824463,
- "learning_rate": 4.839534189596228e-06,
- "loss": 0.4057067334651947,
- "mean_token_accuracy": 0.8523319959640503,
- "num_tokens": 5740112.0,
- "step": 629
- },
- {
- "epoch": 0.4787234042553192,
- "grad_norm": 2.1913886070251465,
- "learning_rate": 4.8387951047772656e-06,
- "loss": 0.4835960865020752,
- "mean_token_accuracy": 0.8438145518302917,
- "num_tokens": 5746838.0,
- "step": 630
- },
- {
- "epoch": 0.479483282674772,
- "grad_norm": 1.482897162437439,
- "learning_rate": 4.838054378501868e-06,
- "loss": 0.46967992186546326,
- "mean_token_accuracy": 0.8315759897232056,
- "num_tokens": 5760428.0,
- "step": 631
- },
- {
- "epoch": 0.48024316109422494,
- "grad_norm": 1.38850998878479,
- "learning_rate": 4.837312011289907e-06,
- "loss": 0.41845446825027466,
- "mean_token_accuracy": 0.8557186126708984,
- "num_tokens": 5773437.0,
- "step": 632
- },
- {
- "epoch": 0.4810030395136778,
- "grad_norm": 3.8337457180023193,
- "learning_rate": 4.836568003662403e-06,
- "loss": 0.5102912187576294,
- "mean_token_accuracy": 0.830644965171814,
- "num_tokens": 5776367.0,
- "step": 633
- },
- {
- "epoch": 0.4817629179331307,
- "grad_norm": 1.2084007263183594,
- "learning_rate": 4.8358223561415304e-06,
- "loss": 0.3835333585739136,
- "mean_token_accuracy": 0.8639016151428223,
- "num_tokens": 5792246.0,
- "step": 634
- },
- {
- "epoch": 0.4825227963525836,
- "grad_norm": 1.939408540725708,
- "learning_rate": 4.835075069250613e-06,
- "loss": 0.4044850468635559,
- "mean_token_accuracy": 0.8488376140594482,
- "num_tokens": 5799853.0,
- "step": 635
- },
- {
- "epoch": 0.48328267477203646,
- "grad_norm": 1.345870852470398,
- "learning_rate": 4.8343261435141245e-06,
- "loss": 0.46660199761390686,
- "mean_token_accuracy": 0.8371681571006775,
- "num_tokens": 5817478.0,
- "step": 636
- },
- {
- "epoch": 0.48404255319148937,
- "grad_norm": 1.6531339883804321,
- "learning_rate": 4.833575579457691e-06,
- "loss": 0.3886989951133728,
- "mean_token_accuracy": 0.8763507008552551,
- "num_tokens": 5825739.0,
- "step": 637
- },
- {
- "epoch": 0.4848024316109423,
- "grad_norm": 1.6443969011306763,
- "learning_rate": 4.832823377608088e-06,
- "loss": 0.4070289731025696,
- "mean_token_accuracy": 0.8586630821228027,
- "num_tokens": 5837917.0,
- "step": 638
- },
- {
- "epoch": 0.48556231003039513,
- "grad_norm": 2.005136013031006,
- "learning_rate": 4.832069538493237e-06,
- "loss": 0.40616685152053833,
- "mean_token_accuracy": 0.8571510314941406,
- "num_tokens": 5845250.0,
- "step": 639
- },
- {
- "epoch": 0.48632218844984804,
- "grad_norm": 1.5244266986846924,
- "learning_rate": 4.831314062642213e-06,
- "loss": 0.49530288577079773,
- "mean_token_accuracy": 0.8328841924667358,
- "num_tokens": 5857407.0,
- "step": 640
- },
- {
- "epoch": 0.4870820668693009,
- "grad_norm": 1.9876971244812012,
- "learning_rate": 4.830556950585239e-06,
- "loss": 0.4583776593208313,
- "mean_token_accuracy": 0.8427221179008484,
- "num_tokens": 5865391.0,
- "step": 641
- },
- {
- "epoch": 0.4878419452887538,
- "grad_norm": 3.023336172103882,
- "learning_rate": 4.829798202853683e-06,
- "loss": 0.6134771108627319,
- "mean_token_accuracy": 0.7981935739517212,
- "num_tokens": 5870729.0,
- "step": 642
- },
- {
- "epoch": 0.4886018237082067,
- "grad_norm": 1.8889515399932861,
- "learning_rate": 4.829037819980065e-06,
- "loss": 0.4420135021209717,
- "mean_token_accuracy": 0.8480775356292725,
- "num_tokens": 5878982.0,
- "step": 643
- },
- {
- "epoch": 0.48936170212765956,
- "grad_norm": 2.2408435344696045,
- "learning_rate": 4.828275802498051e-06,
- "loss": 0.525706946849823,
- "mean_token_accuracy": 0.8271557092666626,
- "num_tokens": 5885097.0,
- "step": 644
- },
- {
- "epoch": 0.49012158054711247,
- "grad_norm": 1.9734224081039429,
- "learning_rate": 4.827512150942454e-06,
- "loss": 0.44246578216552734,
- "mean_token_accuracy": 0.8456668257713318,
- "num_tokens": 5893941.0,
- "step": 645
- },
- {
- "epoch": 0.4908814589665654,
- "grad_norm": 1.9618173837661743,
- "learning_rate": 4.8267468658492335e-06,
- "loss": 0.5119768381118774,
- "mean_token_accuracy": 0.8355510830879211,
- "num_tokens": 5902829.0,
- "step": 646
- },
- {
- "epoch": 0.49164133738601823,
- "grad_norm": 1.7181587219238281,
- "learning_rate": 4.825979947755496e-06,
- "loss": 0.5666520595550537,
- "mean_token_accuracy": 0.7951971888542175,
- "num_tokens": 5915212.0,
- "step": 647
- },
- {
- "epoch": 0.49240121580547114,
- "grad_norm": 3.0121164321899414,
- "learning_rate": 4.8252113971994955e-06,
- "loss": 0.628632128238678,
- "mean_token_accuracy": 0.8041050434112549,
- "num_tokens": 5921410.0,
- "step": 648
- },
- {
- "epoch": 0.493161094224924,
- "grad_norm": 2.9980475902557373,
- "learning_rate": 4.824441214720629e-06,
- "loss": 0.4507424831390381,
- "mean_token_accuracy": 0.8636263608932495,
- "num_tokens": 5925179.0,
- "step": 649
- },
- {
- "epoch": 0.4939209726443769,
- "grad_norm": 2.0096445083618164,
- "learning_rate": 4.823669400859441e-06,
- "loss": 0.602759838104248,
- "mean_token_accuracy": 0.8104915618896484,
- "num_tokens": 5934160.0,
- "step": 650
- },
- {
- "epoch": 0.4946808510638298,
- "grad_norm": 1.1186442375183105,
- "learning_rate": 4.8228959561576195e-06,
- "loss": 0.41168469190597534,
- "mean_token_accuracy": 0.8461419939994812,
- "num_tokens": 5954163.0,
- "step": 651
- },
- {
- "epoch": 0.49544072948328266,
- "grad_norm": 1.855465054512024,
- "learning_rate": 4.822120881157998e-06,
- "loss": 0.5049735307693481,
- "mean_token_accuracy": 0.8225747346878052,
- "num_tokens": 5963840.0,
- "step": 652
- },
- {
- "epoch": 0.49620060790273557,
- "grad_norm": 3.550563335418701,
- "learning_rate": 4.821344176404554e-06,
- "loss": 0.49025264382362366,
- "mean_token_accuracy": 0.8265978693962097,
- "num_tokens": 5967358.0,
- "step": 653
- },
- {
- "epoch": 0.4969604863221885,
- "grad_norm": 3.063910484313965,
- "learning_rate": 4.820565842442408e-06,
- "loss": 0.5652767419815063,
- "mean_token_accuracy": 0.811700701713562,
- "num_tokens": 5971858.0,
- "step": 654
- },
- {
- "epoch": 0.49772036474164133,
- "grad_norm": 2.4613308906555176,
- "learning_rate": 4.819785879817827e-06,
- "loss": 0.5296125411987305,
- "mean_token_accuracy": 0.8336488008499146,
- "num_tokens": 5977442.0,
- "step": 655
- },
- {
- "epoch": 0.49848024316109424,
- "grad_norm": 2.342519760131836,
- "learning_rate": 4.819004289078217e-06,
- "loss": 0.5753380060195923,
- "mean_token_accuracy": 0.7922406792640686,
- "num_tokens": 5984531.0,
- "step": 656
- },
- {
- "epoch": 0.4992401215805471,
- "grad_norm": 2.0410680770874023,
- "learning_rate": 4.818221070772129e-06,
- "loss": 0.5433275699615479,
- "mean_token_accuracy": 0.8043830990791321,
- "num_tokens": 5992642.0,
- "step": 657
- },
- {
- "epoch": 0.5,
- "grad_norm": 1.4999698400497437,
- "learning_rate": 4.8174362254492555e-06,
- "loss": 0.5248899459838867,
- "mean_token_accuracy": 0.8107168674468994,
- "num_tokens": 6005543.0,
- "step": 658
- },
- {
- "epoch": 0.5007598784194529,
- "grad_norm": 1.9494401216506958,
- "learning_rate": 4.816649753660431e-06,
- "loss": 0.41291385889053345,
- "mean_token_accuracy": 0.8650569915771484,
- "num_tokens": 6012185.0,
- "step": 659
- },
- {
- "epoch": 0.5015197568389058,
- "grad_norm": 2.7514095306396484,
- "learning_rate": 4.815861655957632e-06,
- "loss": 0.4244142770767212,
- "mean_token_accuracy": 0.8485112190246582,
- "num_tokens": 6016809.0,
- "step": 660
- },
- {
- "epoch": 0.5022796352583586,
- "grad_norm": 1.4354928731918335,
- "learning_rate": 4.815071932893976e-06,
- "loss": 0.4332060217857361,
- "mean_token_accuracy": 0.8386815786361694,
- "num_tokens": 6034795.0,
- "step": 661
- },
- {
- "epoch": 0.5030395136778115,
- "grad_norm": 1.3113417625427246,
- "learning_rate": 4.81428058502372e-06,
- "loss": 0.5415540933609009,
- "mean_token_accuracy": 0.8115285038948059,
- "num_tokens": 6053624.0,
- "step": 662
- },
- {
- "epoch": 0.5037993920972644,
- "grad_norm": 1.820868730545044,
- "learning_rate": 4.813487612902265e-06,
- "loss": 0.5360245108604431,
- "mean_token_accuracy": 0.8313555717468262,
- "num_tokens": 6063399.0,
- "step": 663
- },
- {
- "epoch": 0.5045592705167173,
- "grad_norm": 2.347001552581787,
- "learning_rate": 4.812693017086145e-06,
- "loss": 0.4926982820034027,
- "mean_token_accuracy": 0.8137006759643555,
- "num_tokens": 6070111.0,
- "step": 664
- },
- {
- "epoch": 0.5053191489361702,
- "grad_norm": 1.8830888271331787,
- "learning_rate": 4.811896798133042e-06,
- "loss": 0.5419014692306519,
- "mean_token_accuracy": 0.8027454614639282,
- "num_tokens": 6081090.0,
- "step": 665
- },
- {
- "epoch": 0.506079027355623,
- "grad_norm": 2.3258056640625,
- "learning_rate": 4.811098956601772e-06,
- "loss": 0.4629337787628174,
- "mean_token_accuracy": 0.8416580557823181,
- "num_tokens": 6087921.0,
- "step": 666
- },
- {
- "epoch": 0.506838905775076,
- "grad_norm": 1.9578291177749634,
- "learning_rate": 4.810299493052289e-06,
- "loss": 0.40305402874946594,
- "mean_token_accuracy": 0.8529061079025269,
- "num_tokens": 6100034.0,
- "step": 667
- },
- {
- "epoch": 0.5075987841945289,
- "grad_norm": 2.800635576248169,
- "learning_rate": 4.809498408045691e-06,
- "loss": 0.5087342262268066,
- "mean_token_accuracy": 0.8214689493179321,
- "num_tokens": 6104742.0,
- "step": 668
- },
- {
- "epoch": 0.5083586626139818,
- "grad_norm": 1.5318149328231812,
- "learning_rate": 4.808695702144206e-06,
- "loss": 0.4733222723007202,
- "mean_token_accuracy": 0.837577223777771,
- "num_tokens": 6117242.0,
- "step": 669
- },
- {
- "epoch": 0.5091185410334347,
- "grad_norm": 1.2368661165237427,
- "learning_rate": 4.807891375911207e-06,
- "loss": 0.3929097056388855,
- "mean_token_accuracy": 0.8331400752067566,
- "num_tokens": 6133509.0,
- "step": 670
- },
- {
- "epoch": 0.5098784194528876,
- "grad_norm": 2.4711415767669678,
- "learning_rate": 4.8070854299112e-06,
- "loss": 0.6294851303100586,
- "mean_token_accuracy": 0.7956781983375549,
- "num_tokens": 6140294.0,
- "step": 671
- },
- {
- "epoch": 0.5106382978723404,
- "grad_norm": 2.590961217880249,
- "learning_rate": 4.806277864709828e-06,
- "loss": 0.580160915851593,
- "mean_token_accuracy": 0.809589684009552,
- "num_tokens": 6145803.0,
- "step": 672
- },
- {
- "epoch": 0.5113981762917933,
- "grad_norm": 2.4653842449188232,
- "learning_rate": 4.805468680873874e-06,
- "loss": 0.5262120366096497,
- "mean_token_accuracy": 0.822458803653717,
- "num_tokens": 6151236.0,
- "step": 673
- },
- {
- "epoch": 0.5121580547112462,
- "grad_norm": 2.860720157623291,
- "learning_rate": 4.804657878971252e-06,
- "loss": 0.4007391035556793,
- "mean_token_accuracy": 0.8637382984161377,
- "num_tokens": 6155310.0,
- "step": 674
- },
- {
- "epoch": 0.5129179331306991,
- "grad_norm": 2.520282030105591,
- "learning_rate": 4.803845459571014e-06,
- "loss": 0.45798182487487793,
- "mean_token_accuracy": 0.8270114660263062,
- "num_tokens": 6160326.0,
- "step": 675
- },
- {
- "epoch": 0.513677811550152,
- "grad_norm": 2.7290921211242676,
- "learning_rate": 4.803031423243349e-06,
- "loss": 0.5745848417282104,
- "mean_token_accuracy": 0.8401234745979309,
- "num_tokens": 6165709.0,
- "step": 676
- },
- {
- "epoch": 0.5144376899696048,
- "grad_norm": 1.6678650379180908,
- "learning_rate": 4.802215770559578e-06,
- "loss": 0.5257721543312073,
- "mean_token_accuracy": 0.8241991996765137,
- "num_tokens": 6177875.0,
- "step": 677
- },
- {
- "epoch": 0.5151975683890577,
- "grad_norm": 2.1720468997955322,
- "learning_rate": 4.801398502092156e-06,
- "loss": 0.45342206954956055,
- "mean_token_accuracy": 0.8463799953460693,
- "num_tokens": 6185415.0,
- "step": 678
- },
- {
- "epoch": 0.5159574468085106,
- "grad_norm": 2.282259702682495,
- "learning_rate": 4.800579618414677e-06,
- "loss": 0.4864169955253601,
- "mean_token_accuracy": 0.8300632238388062,
- "num_tokens": 6191832.0,
- "step": 679
- },
- {
- "epoch": 0.5167173252279635,
- "grad_norm": 2.0092248916625977,
- "learning_rate": 4.799759120101861e-06,
- "loss": 0.5781463980674744,
- "mean_token_accuracy": 0.8267031908035278,
- "num_tokens": 6199440.0,
- "step": 680
- },
- {
- "epoch": 0.5174772036474165,
- "grad_norm": 1.396580696105957,
- "learning_rate": 4.798937007729568e-06,
- "loss": 0.49689239263534546,
- "mean_token_accuracy": 0.8257499933242798,
- "num_tokens": 6213840.0,
- "step": 681
- },
- {
- "epoch": 0.5182370820668692,
- "grad_norm": 1.9060769081115723,
- "learning_rate": 4.798113281874788e-06,
- "loss": 0.48969539999961853,
- "mean_token_accuracy": 0.8171790838241577,
- "num_tokens": 6223006.0,
- "step": 682
- },
- {
- "epoch": 0.5189969604863222,
- "grad_norm": 1.6255282163619995,
- "learning_rate": 4.797287943115642e-06,
- "loss": 0.5532330870628357,
- "mean_token_accuracy": 0.8173393607139587,
- "num_tokens": 6234857.0,
- "step": 683
- },
- {
- "epoch": 0.5197568389057751,
- "grad_norm": 1.6923905611038208,
- "learning_rate": 4.796460992031386e-06,
- "loss": 0.4880887269973755,
- "mean_token_accuracy": 0.834983229637146,
- "num_tokens": 6245252.0,
- "step": 684
- },
- {
- "epoch": 0.520516717325228,
- "grad_norm": 2.13161301612854,
- "learning_rate": 4.7956324292024045e-06,
- "loss": 0.5687593817710876,
- "mean_token_accuracy": 0.7996571063995361,
- "num_tokens": 6253726.0,
- "step": 685
- },
- {
- "epoch": 0.5212765957446809,
- "grad_norm": 2.509375810623169,
- "learning_rate": 4.794802255210217e-06,
- "loss": 0.5396929979324341,
- "mean_token_accuracy": 0.8007107973098755,
- "num_tokens": 6259238.0,
- "step": 686
- },
- {
- "epoch": 0.5220364741641338,
- "grad_norm": 2.393710136413574,
- "learning_rate": 4.793970470637469e-06,
- "loss": 0.6165191531181335,
- "mean_token_accuracy": 0.7891418933868408,
- "num_tokens": 6266325.0,
- "step": 687
- },
- {
- "epoch": 0.5227963525835866,
- "grad_norm": 1.511647343635559,
- "learning_rate": 4.7931370760679415e-06,
- "loss": 0.4773876965045929,
- "mean_token_accuracy": 0.8381044864654541,
- "num_tokens": 6277447.0,
- "step": 688
- },
- {
- "epoch": 0.5235562310030395,
- "grad_norm": 2.206587314605713,
- "learning_rate": 4.792302072086542e-06,
- "loss": 0.5482058525085449,
- "mean_token_accuracy": 0.8239108920097351,
- "num_tokens": 6285163.0,
- "step": 689
- },
- {
- "epoch": 0.5243161094224924,
- "grad_norm": 3.018146514892578,
- "learning_rate": 4.7914654592793065e-06,
- "loss": 0.4880615472793579,
- "mean_token_accuracy": 0.8361308574676514,
- "num_tokens": 6289386.0,
- "step": 690
- },
- {
- "epoch": 0.5250759878419453,
- "grad_norm": 1.6469231843948364,
- "learning_rate": 4.790627238233405e-06,
- "loss": 0.4164774715900421,
- "mean_token_accuracy": 0.8496290445327759,
- "num_tokens": 6298915.0,
- "step": 691
- },
- {
- "epoch": 0.5258358662613982,
- "grad_norm": 2.352505922317505,
- "learning_rate": 4.789787409537131e-06,
- "loss": 0.5366303324699402,
- "mean_token_accuracy": 0.8350417613983154,
- "num_tokens": 6306130.0,
- "step": 692
- },
- {
- "epoch": 0.526595744680851,
- "grad_norm": 1.7463021278381348,
- "learning_rate": 4.7889459737799105e-06,
- "loss": 0.4389137923717499,
- "mean_token_accuracy": 0.8463300466537476,
- "num_tokens": 6315503.0,
- "step": 693
- },
- {
- "epoch": 0.5273556231003039,
- "grad_norm": 2.257706642150879,
- "learning_rate": 4.788102931552294e-06,
- "loss": 0.5309344530105591,
- "mean_token_accuracy": 0.8164352178573608,
- "num_tokens": 6321852.0,
- "step": 694
- },
- {
- "epoch": 0.5281155015197568,
- "grad_norm": 2.392732620239258,
- "learning_rate": 4.787258283445962e-06,
- "loss": 0.3956204056739807,
- "mean_token_accuracy": 0.8671456575393677,
- "num_tokens": 6327380.0,
- "step": 695
- },
- {
- "epoch": 0.5288753799392097,
- "grad_norm": 2.210514545440674,
- "learning_rate": 4.786412030053721e-06,
- "loss": 0.4842875003814697,
- "mean_token_accuracy": 0.8508446216583252,
- "num_tokens": 6334898.0,
- "step": 696
- },
- {
- "epoch": 0.5296352583586627,
- "grad_norm": 1.8678946495056152,
- "learning_rate": 4.785564171969503e-06,
- "loss": 0.47399595379829407,
- "mean_token_accuracy": 0.8514996767044067,
- "num_tokens": 6346374.0,
- "step": 697
- },
- {
- "epoch": 0.5303951367781155,
- "grad_norm": 2.604079484939575,
- "learning_rate": 4.784714709788368e-06,
- "loss": 0.5950228571891785,
- "mean_token_accuracy": 0.7983481884002686,
- "num_tokens": 6351648.0,
- "step": 698
- },
- {
- "epoch": 0.5311550151975684,
- "grad_norm": 1.662381649017334,
- "learning_rate": 4.783863644106502e-06,
- "loss": 0.41616758704185486,
- "mean_token_accuracy": 0.8554803133010864,
- "num_tokens": 6360506.0,
- "step": 699
- },
- {
- "epoch": 0.5319148936170213,
- "grad_norm": 1.6300342082977295,
- "learning_rate": 4.783010975521216e-06,
- "loss": 0.43029269576072693,
- "mean_token_accuracy": 0.8443028926849365,
- "num_tokens": 6370675.0,
- "step": 700
- },
- {
- "epoch": 0.5326747720364742,
- "grad_norm": 1.731873869895935,
- "learning_rate": 4.782156704630944e-06,
- "loss": 0.4383814334869385,
- "mean_token_accuracy": 0.8443183898925781,
- "num_tokens": 6381803.0,
- "step": 701
- },
- {
- "epoch": 0.5334346504559271,
- "grad_norm": 3.1788413524627686,
- "learning_rate": 4.7813008320352475e-06,
- "loss": 0.32194480299949646,
- "mean_token_accuracy": 0.8870962858200073,
- "num_tokens": 6389263.0,
- "step": 702
- },
- {
- "epoch": 0.53419452887538,
- "grad_norm": 2.099513530731201,
- "learning_rate": 4.78044335833481e-06,
- "loss": 0.36962923407554626,
- "mean_token_accuracy": 0.8661133646965027,
- "num_tokens": 6395589.0,
- "step": 703
- },
- {
- "epoch": 0.5349544072948328,
- "grad_norm": 1.4859435558319092,
- "learning_rate": 4.77958428413144e-06,
- "loss": 0.4619954824447632,
- "mean_token_accuracy": 0.8438555002212524,
- "num_tokens": 6407470.0,
- "step": 704
- },
- {
- "epoch": 0.5357142857142857,
- "grad_norm": 1.2561073303222656,
- "learning_rate": 4.7787236100280685e-06,
- "loss": 0.3770977258682251,
- "mean_token_accuracy": 0.8515733480453491,
- "num_tokens": 6422888.0,
- "step": 705
- },
- {
- "epoch": 0.5364741641337386,
- "grad_norm": 1.4455817937850952,
- "learning_rate": 4.777861336628751e-06,
- "loss": 0.46481069922447205,
- "mean_token_accuracy": 0.8502002954483032,
- "num_tokens": 6441266.0,
- "step": 706
- },
- {
- "epoch": 0.5372340425531915,
- "grad_norm": 1.1387295722961426,
- "learning_rate": 4.7769974645386616e-06,
- "loss": 0.36964765191078186,
- "mean_token_accuracy": 0.8719524145126343,
- "num_tokens": 6463686.0,
- "step": 707
- },
- {
- "epoch": 0.5379939209726444,
- "grad_norm": 1.7179663181304932,
- "learning_rate": 4.776131994364102e-06,
- "loss": 0.4231719970703125,
- "mean_token_accuracy": 0.8416585922241211,
- "num_tokens": 6472956.0,
- "step": 708
- },
- {
- "epoch": 0.5387537993920972,
- "grad_norm": 1.6328502893447876,
- "learning_rate": 4.775264926712489e-06,
- "loss": 0.5836569666862488,
- "mean_token_accuracy": 0.8039724230766296,
- "num_tokens": 6485773.0,
- "step": 709
- },
- {
- "epoch": 0.5395136778115501,
- "grad_norm": 1.8515360355377197,
- "learning_rate": 4.774396262192368e-06,
- "loss": 0.5477553009986877,
- "mean_token_accuracy": 0.8136521577835083,
- "num_tokens": 6496379.0,
- "step": 710
- },
- {
- "epoch": 0.540273556231003,
- "grad_norm": 1.741858959197998,
- "learning_rate": 4.7735260014133986e-06,
- "loss": 0.4663267731666565,
- "mean_token_accuracy": 0.8473691940307617,
- "num_tokens": 6507652.0,
- "step": 711
- },
- {
- "epoch": 0.541033434650456,
- "grad_norm": 1.7516659498214722,
- "learning_rate": 4.772654144986364e-06,
- "loss": 0.374914288520813,
- "mean_token_accuracy": 0.8600220680236816,
- "num_tokens": 6519030.0,
- "step": 712
- },
- {
- "epoch": 0.5417933130699089,
- "grad_norm": 2.662343978881836,
- "learning_rate": 4.7717806935231665e-06,
- "loss": 0.4206875264644623,
- "mean_token_accuracy": 0.8544126749038696,
- "num_tokens": 6523669.0,
- "step": 713
- },
- {
- "epoch": 0.5425531914893617,
- "grad_norm": 1.4088834524154663,
- "learning_rate": 4.770905647636828e-06,
- "loss": 0.5824331045150757,
- "mean_token_accuracy": 0.7857901453971863,
- "num_tokens": 6540560.0,
- "step": 714
- },
- {
- "epoch": 0.5433130699088146,
- "grad_norm": 2.173656940460205,
- "learning_rate": 4.77002900794149e-06,
- "loss": 0.555023729801178,
- "mean_token_accuracy": 0.8067290782928467,
- "num_tokens": 6548946.0,
- "step": 715
- },
- {
- "epoch": 0.5440729483282675,
- "grad_norm": 2.121018648147583,
- "learning_rate": 4.769150775052411e-06,
- "loss": 0.559730052947998,
- "mean_token_accuracy": 0.8166372776031494,
- "num_tokens": 6556065.0,
- "step": 716
- },
- {
- "epoch": 0.5448328267477204,
- "grad_norm": 3.335866928100586,
- "learning_rate": 4.768270949585968e-06,
- "loss": 0.6442267894744873,
- "mean_token_accuracy": 0.7858607769012451,
- "num_tokens": 6560615.0,
- "step": 717
- },
- {
- "epoch": 0.5455927051671733,
- "grad_norm": 2.3813695907592773,
- "learning_rate": 4.767389532159659e-06,
- "loss": 0.4027421474456787,
- "mean_token_accuracy": 0.8635619282722473,
- "num_tokens": 6565841.0,
- "step": 718
- },
- {
- "epoch": 0.5463525835866262,
- "grad_norm": 2.0657708644866943,
- "learning_rate": 4.766506523392095e-06,
- "loss": 0.38899827003479004,
- "mean_token_accuracy": 0.8660480380058289,
- "num_tokens": 6572362.0,
- "step": 719
- },
- {
- "epoch": 0.547112462006079,
- "grad_norm": 1.093705415725708,
- "learning_rate": 4.765621923903005e-06,
- "loss": 0.45967352390289307,
- "mean_token_accuracy": 0.8338102102279663,
- "num_tokens": 6595998.0,
- "step": 720
- },
- {
- "epoch": 0.5478723404255319,
- "grad_norm": 2.942065954208374,
- "learning_rate": 4.764735734313236e-06,
- "loss": 0.42910510301589966,
- "mean_token_accuracy": 0.8406122922897339,
- "num_tokens": 6601075.0,
- "step": 721
- },
- {
- "epoch": 0.5486322188449848,
- "grad_norm": 2.049011707305908,
- "learning_rate": 4.763847955244749e-06,
- "loss": 0.5584231615066528,
- "mean_token_accuracy": 0.8171684741973877,
- "num_tokens": 6609310.0,
- "step": 722
- },
- {
- "epoch": 0.5493920972644377,
- "grad_norm": 2.485543966293335,
- "learning_rate": 4.762958587320623e-06,
- "loss": 0.5396170020103455,
- "mean_token_accuracy": 0.8158525824546814,
- "num_tokens": 6616185.0,
- "step": 723
- },
- {
- "epoch": 0.5501519756838906,
- "grad_norm": 1.87015962600708,
- "learning_rate": 4.762067631165049e-06,
- "loss": 0.49739527702331543,
- "mean_token_accuracy": 0.8303765654563904,
- "num_tokens": 6625629.0,
- "step": 724
- },
- {
- "epoch": 0.5509118541033434,
- "grad_norm": 4.239654541015625,
- "learning_rate": 4.761175087403336e-06,
- "loss": 0.6029239296913147,
- "mean_token_accuracy": 0.8123486042022705,
- "num_tokens": 6629194.0,
- "step": 725
- },
- {
- "epoch": 0.5516717325227963,
- "grad_norm": 2.0134730339050293,
- "learning_rate": 4.760280956661904e-06,
- "loss": 0.4777873754501343,
- "mean_token_accuracy": 0.8283513784408569,
- "num_tokens": 6636929.0,
- "step": 726
- },
- {
- "epoch": 0.5524316109422492,
- "grad_norm": 1.991780400276184,
- "learning_rate": 4.75938523956829e-06,
- "loss": 0.4631248116493225,
- "mean_token_accuracy": 0.8275107741355896,
- "num_tokens": 6645135.0,
- "step": 727
- },
- {
- "epoch": 0.5531914893617021,
- "grad_norm": 1.423792839050293,
- "learning_rate": 4.75848793675114e-06,
- "loss": 0.49630722403526306,
- "mean_token_accuracy": 0.8388000130653381,
- "num_tokens": 6662690.0,
- "step": 728
- },
- {
- "epoch": 0.5539513677811551,
- "grad_norm": 2.345294952392578,
- "learning_rate": 4.757589048840219e-06,
- "loss": 0.37830638885498047,
- "mean_token_accuracy": 0.8782080411911011,
- "num_tokens": 6667285.0,
- "step": 729
- },
- {
- "epoch": 0.5547112462006079,
- "grad_norm": 2.7452144622802734,
- "learning_rate": 4.756688576466398e-06,
- "loss": 0.51595538854599,
- "mean_token_accuracy": 0.8441770672798157,
- "num_tokens": 6672324.0,
- "step": 730
- },
- {
- "epoch": 0.5554711246200608,
- "grad_norm": 1.5247859954833984,
- "learning_rate": 4.755786520261666e-06,
- "loss": 0.48365193605422974,
- "mean_token_accuracy": 0.8276445269584656,
- "num_tokens": 6685296.0,
- "step": 731
- },
- {
- "epoch": 0.5562310030395137,
- "grad_norm": 1.4018276929855347,
- "learning_rate": 4.75488288085912e-06,
- "loss": 0.3876481354236603,
- "mean_token_accuracy": 0.8612343072891235,
- "num_tokens": 6697515.0,
- "step": 732
- },
- {
- "epoch": 0.5569908814589666,
- "grad_norm": 2.9570324420928955,
- "learning_rate": 4.753977658892967e-06,
- "loss": 0.5468149185180664,
- "mean_token_accuracy": 0.8054271340370178,
- "num_tokens": 6702194.0,
- "step": 733
- },
- {
- "epoch": 0.5577507598784195,
- "grad_norm": 1.9282715320587158,
- "learning_rate": 4.753070854998529e-06,
- "loss": 0.4758574962615967,
- "mean_token_accuracy": 0.8379775285720825,
- "num_tokens": 6709938.0,
- "step": 734
- },
- {
- "epoch": 0.5585106382978723,
- "grad_norm": 1.981264591217041,
- "learning_rate": 4.752162469812234e-06,
- "loss": 0.48461222648620605,
- "mean_token_accuracy": 0.833509087562561,
- "num_tokens": 6718125.0,
- "step": 735
- },
- {
- "epoch": 0.5592705167173252,
- "grad_norm": 1.1643427610397339,
- "learning_rate": 4.751252503971624e-06,
- "loss": 0.410121887922287,
- "mean_token_accuracy": 0.8221402764320374,
- "num_tokens": 6735125.0,
- "step": 736
- },
- {
- "epoch": 0.5600303951367781,
- "grad_norm": 1.786566972732544,
- "learning_rate": 4.750340958115346e-06,
- "loss": 0.5964341163635254,
- "mean_token_accuracy": 0.8038164377212524,
- "num_tokens": 6747369.0,
- "step": 737
- },
- {
- "epoch": 0.560790273556231,
- "grad_norm": 1.7256991863250732,
- "learning_rate": 4.749427832883158e-06,
- "loss": 0.48737066984176636,
- "mean_token_accuracy": 0.830894947052002,
- "num_tokens": 6758115.0,
- "step": 738
- },
- {
- "epoch": 0.5615501519756839,
- "grad_norm": 1.997747540473938,
- "learning_rate": 4.748513128915928e-06,
- "loss": 0.5238886475563049,
- "mean_token_accuracy": 0.8066858053207397,
- "num_tokens": 6766111.0,
- "step": 739
- },
- {
- "epoch": 0.5623100303951368,
- "grad_norm": 2.127016305923462,
- "learning_rate": 4.747596846855629e-06,
- "loss": 0.5045586228370667,
- "mean_token_accuracy": 0.821424126625061,
- "num_tokens": 6772893.0,
- "step": 740
- },
- {
- "epoch": 0.5630699088145896,
- "grad_norm": 1.7664796113967896,
- "learning_rate": 4.7466789873453446e-06,
- "loss": 0.42954835295677185,
- "mean_token_accuracy": 0.8533384799957275,
- "num_tokens": 6785133.0,
- "step": 741
- },
- {
- "epoch": 0.5638297872340425,
- "grad_norm": 1.4987404346466064,
- "learning_rate": 4.7457595510292615e-06,
- "loss": 0.5378558039665222,
- "mean_token_accuracy": 0.8184819221496582,
- "num_tokens": 6799563.0,
- "step": 742
- },
- {
- "epoch": 0.5645896656534954,
- "grad_norm": 1.4444655179977417,
- "learning_rate": 4.744838538552678e-06,
- "loss": 0.42193782329559326,
- "mean_token_accuracy": 0.837514340877533,
- "num_tokens": 6812470.0,
- "step": 743
- },
- {
- "epoch": 0.5653495440729484,
- "grad_norm": 3.867751121520996,
- "learning_rate": 4.7439159505619946e-06,
- "loss": 0.4457814693450928,
- "mean_token_accuracy": 0.8630104660987854,
- "num_tokens": 6815652.0,
- "step": 744
- },
- {
- "epoch": 0.5661094224924013,
- "grad_norm": 2.1250710487365723,
- "learning_rate": 4.74299178770472e-06,
- "loss": 0.5638922452926636,
- "mean_token_accuracy": 0.7969781160354614,
- "num_tokens": 6824566.0,
- "step": 745
- },
- {
- "epoch": 0.5668693009118541,
- "grad_norm": 2.547072410583496,
- "learning_rate": 4.742066050629465e-06,
- "loss": 0.5516207814216614,
- "mean_token_accuracy": 0.8160669803619385,
- "num_tokens": 6830589.0,
- "step": 746
- },
- {
- "epoch": 0.567629179331307,
- "grad_norm": 1.2975233793258667,
- "learning_rate": 4.741138739985951e-06,
- "loss": 0.3823344111442566,
- "mean_token_accuracy": 0.8668368458747864,
- "num_tokens": 6842707.0,
- "step": 747
- },
- {
- "epoch": 0.5683890577507599,
- "grad_norm": 1.3410450220108032,
- "learning_rate": 4.740209856424998e-06,
- "loss": 0.5148671269416809,
- "mean_token_accuracy": 0.8188045024871826,
- "num_tokens": 6857624.0,
- "step": 748
- },
- {
- "epoch": 0.5691489361702128,
- "grad_norm": 1.219467282295227,
- "learning_rate": 4.7392794005985324e-06,
- "loss": 0.3998957872390747,
- "mean_token_accuracy": 0.855175256729126,
- "num_tokens": 6875064.0,
- "step": 749
- },
- {
- "epoch": 0.5699088145896657,
- "grad_norm": 1.3530343770980835,
- "learning_rate": 4.738347373159585e-06,
- "loss": 0.5359633564949036,
- "mean_token_accuracy": 0.8178457021713257,
- "num_tokens": 6890911.0,
- "step": 750
- },
- {
- "epoch": 0.5706686930091185,
- "grad_norm": 2.146988868713379,
- "learning_rate": 4.737413774762287e-06,
- "loss": 0.4460008144378662,
- "mean_token_accuracy": 0.8172903060913086,
- "num_tokens": 6896959.0,
- "step": 751
- },
- {
- "epoch": 0.5714285714285714,
- "grad_norm": 1.456023097038269,
- "learning_rate": 4.736478606061876e-06,
- "loss": 0.43616920709609985,
- "mean_token_accuracy": 0.8465108871459961,
- "num_tokens": 6908904.0,
- "step": 752
- },
- {
- "epoch": 0.5721884498480243,
- "grad_norm": 2.9696967601776123,
- "learning_rate": 4.735541867714687e-06,
- "loss": 0.43464532494544983,
- "mean_token_accuracy": 0.8608652353286743,
- "num_tokens": 6913026.0,
- "step": 753
- },
- {
- "epoch": 0.5729483282674772,
- "grad_norm": 2.2990667819976807,
- "learning_rate": 4.73460356037816e-06,
- "loss": 0.6619116067886353,
- "mean_token_accuracy": 0.7821142673492432,
- "num_tokens": 6920588.0,
- "step": 754
- },
- {
- "epoch": 0.5737082066869301,
- "grad_norm": 2.054746389389038,
- "learning_rate": 4.733663684710835e-06,
- "loss": 0.5304250717163086,
- "mean_token_accuracy": 0.8265531063079834,
- "num_tokens": 6928910.0,
- "step": 755
- },
- {
- "epoch": 0.574468085106383,
- "grad_norm": 2.0050594806671143,
- "learning_rate": 4.732722241372354e-06,
- "loss": 0.6393026113510132,
- "mean_token_accuracy": 0.796819806098938,
- "num_tokens": 6940217.0,
- "step": 756
- },
- {
- "epoch": 0.5752279635258358,
- "grad_norm": 1.4285320043563843,
- "learning_rate": 4.731779231023456e-06,
- "loss": 0.5432837009429932,
- "mean_token_accuracy": 0.8104778528213501,
- "num_tokens": 6959101.0,
- "step": 757
- },
- {
- "epoch": 0.5759878419452887,
- "grad_norm": 2.3941943645477295,
- "learning_rate": 4.730834654325984e-06,
- "loss": 0.46550673246383667,
- "mean_token_accuracy": 0.8444503545761108,
- "num_tokens": 6965036.0,
- "step": 758
- },
- {
- "epoch": 0.5767477203647416,
- "grad_norm": 2.3850574493408203,
- "learning_rate": 4.729888511942877e-06,
- "loss": 0.4916389584541321,
- "mean_token_accuracy": 0.8228527307510376,
- "num_tokens": 6971184.0,
- "step": 759
- },
- {
- "epoch": 0.5775075987841946,
- "grad_norm": 1.627480149269104,
- "learning_rate": 4.728940804538176e-06,
- "loss": 0.5863215923309326,
- "mean_token_accuracy": 0.7995302677154541,
- "num_tokens": 6982569.0,
- "step": 760
- },
- {
- "epoch": 0.5782674772036475,
- "grad_norm": 1.1723195314407349,
- "learning_rate": 4.727991532777016e-06,
- "loss": 0.36908864974975586,
- "mean_token_accuracy": 0.8355655670166016,
- "num_tokens": 6998659.0,
- "step": 761
- },
- {
- "epoch": 0.5790273556231003,
- "grad_norm": 1.5324925184249878,
- "learning_rate": 4.727040697325634e-06,
- "loss": 0.557658851146698,
- "mean_token_accuracy": 0.8141458034515381,
- "num_tokens": 7012969.0,
- "step": 762
- },
- {
- "epoch": 0.5797872340425532,
- "grad_norm": 2.4106390476226807,
- "learning_rate": 4.726088298851362e-06,
- "loss": 0.5004243850708008,
- "mean_token_accuracy": 0.8376860618591309,
- "num_tokens": 7018301.0,
- "step": 763
- },
- {
- "epoch": 0.5805471124620061,
- "grad_norm": 2.2594921588897705,
- "learning_rate": 4.725134338022631e-06,
- "loss": 0.6067016124725342,
- "mean_token_accuracy": 0.8100241422653198,
- "num_tokens": 7025201.0,
- "step": 764
- },
- {
- "epoch": 0.581306990881459,
- "grad_norm": 1.4649826288223267,
- "learning_rate": 4.724178815508967e-06,
- "loss": 0.36200693249702454,
- "mean_token_accuracy": 0.8621826171875,
- "num_tokens": 7035112.0,
- "step": 765
- },
- {
- "epoch": 0.5820668693009119,
- "grad_norm": 2.3634560108184814,
- "learning_rate": 4.723221731980993e-06,
- "loss": 0.41862213611602783,
- "mean_token_accuracy": 0.8541463613510132,
- "num_tokens": 7040339.0,
- "step": 766
- },
- {
- "epoch": 0.5828267477203647,
- "grad_norm": 2.7798104286193848,
- "learning_rate": 4.722263088110426e-06,
- "loss": 0.4647108018398285,
- "mean_token_accuracy": 0.8505672216415405,
- "num_tokens": 7044880.0,
- "step": 767
- },
- {
- "epoch": 0.5835866261398176,
- "grad_norm": 2.070528507232666,
- "learning_rate": 4.721302884570079e-06,
- "loss": 0.5147565007209778,
- "mean_token_accuracy": 0.8113877773284912,
- "num_tokens": 7052433.0,
- "step": 768
- },
- {
- "epoch": 0.5843465045592705,
- "grad_norm": 2.1953284740448,
- "learning_rate": 4.720341122033862e-06,
- "loss": 0.5075466632843018,
- "mean_token_accuracy": 0.8474211096763611,
- "num_tokens": 7058686.0,
- "step": 769
- },
- {
- "epoch": 0.5851063829787234,
- "grad_norm": 1.9287755489349365,
- "learning_rate": 4.719377801176774e-06,
- "loss": 0.5382202863693237,
- "mean_token_accuracy": 0.8148090243339539,
- "num_tokens": 7067538.0,
- "step": 770
- },
- {
- "epoch": 0.5858662613981763,
- "grad_norm": 1.5574456453323364,
- "learning_rate": 4.718412922674913e-06,
- "loss": 0.43406790494918823,
- "mean_token_accuracy": 0.8477081060409546,
- "num_tokens": 7077853.0,
- "step": 771
- },
- {
- "epoch": 0.5866261398176292,
- "grad_norm": 1.5490336418151855,
- "learning_rate": 4.717446487205466e-06,
- "loss": 0.43164271116256714,
- "mean_token_accuracy": 0.8504570126533508,
- "num_tokens": 7091728.0,
- "step": 772
- },
- {
- "epoch": 0.587386018237082,
- "grad_norm": 1.6945984363555908,
- "learning_rate": 4.716478495446717e-06,
- "loss": 0.5153743624687195,
- "mean_token_accuracy": 0.8213579058647156,
- "num_tokens": 7108680.0,
- "step": 773
- },
- {
- "epoch": 0.5881458966565349,
- "grad_norm": 2.2633883953094482,
- "learning_rate": 4.715508948078037e-06,
- "loss": 0.45254790782928467,
- "mean_token_accuracy": 0.8392219543457031,
- "num_tokens": 7115546.0,
- "step": 774
- },
- {
- "epoch": 0.5889057750759878,
- "grad_norm": 1.5731090307235718,
- "learning_rate": 4.714537845779894e-06,
- "loss": 0.38678881525993347,
- "mean_token_accuracy": 0.8800252676010132,
- "num_tokens": 7126360.0,
- "step": 775
- },
- {
- "epoch": 0.5896656534954408,
- "grad_norm": 2.4873392581939697,
- "learning_rate": 4.7135651892338445e-06,
- "loss": 0.5190927386283875,
- "mean_token_accuracy": 0.8145407438278198,
- "num_tokens": 7135705.0,
- "step": 776
- },
- {
- "epoch": 0.5904255319148937,
- "grad_norm": 1.2931004762649536,
- "learning_rate": 4.712590979122534e-06,
- "loss": 0.3686544895172119,
- "mean_token_accuracy": 0.8720537424087524,
- "num_tokens": 7150688.0,
- "step": 777
- },
- {
- "epoch": 0.5911854103343465,
- "grad_norm": 1.6353671550750732,
- "learning_rate": 4.7116152161297045e-06,
- "loss": 0.49065062403678894,
- "mean_token_accuracy": 0.8203760385513306,
- "num_tokens": 7161040.0,
- "step": 778
- },
- {
- "epoch": 0.5919452887537994,
- "grad_norm": 1.2345483303070068,
- "learning_rate": 4.710637900940181e-06,
- "loss": 0.4004976451396942,
- "mean_token_accuracy": 0.8302007913589478,
- "num_tokens": 7178074.0,
- "step": 779
- },
- {
- "epoch": 0.5927051671732523,
- "grad_norm": 2.2506837844848633,
- "learning_rate": 4.7096590342398825e-06,
- "loss": 0.45142874121665955,
- "mean_token_accuracy": 0.8481036424636841,
- "num_tokens": 7184153.0,
- "step": 780
- },
- {
- "epoch": 0.5934650455927052,
- "grad_norm": 1.420479416847229,
- "learning_rate": 4.708678616715815e-06,
- "loss": 0.4802100360393524,
- "mean_token_accuracy": 0.8586992025375366,
- "num_tokens": 7202810.0,
- "step": 781
- },
- {
- "epoch": 0.5942249240121581,
- "grad_norm": 3.457632303237915,
- "learning_rate": 4.707696649056073e-06,
- "loss": 0.5265094041824341,
- "mean_token_accuracy": 0.8260114192962646,
- "num_tokens": 7206396.0,
- "step": 782
- },
- {
- "epoch": 0.5949848024316109,
- "grad_norm": 1.1592093706130981,
- "learning_rate": 4.706713131949839e-06,
- "loss": 0.3708173632621765,
- "mean_token_accuracy": 0.8476542234420776,
- "num_tokens": 7225034.0,
- "step": 783
- },
- {
- "epoch": 0.5957446808510638,
- "grad_norm": 1.6761400699615479,
- "learning_rate": 4.705728066087384e-06,
- "loss": 0.4137252867221832,
- "mean_token_accuracy": 0.8462049961090088,
- "num_tokens": 7237101.0,
- "step": 784
- },
- {
- "epoch": 0.5965045592705167,
- "grad_norm": 2.320185422897339,
- "learning_rate": 4.704741452160064e-06,
- "loss": 0.5157154202461243,
- "mean_token_accuracy": 0.8391785621643066,
- "num_tokens": 7243826.0,
- "step": 785
- },
- {
- "epoch": 0.5972644376899696,
- "grad_norm": 2.079423427581787,
- "learning_rate": 4.703753290860323e-06,
- "loss": 0.4734993278980255,
- "mean_token_accuracy": 0.8353281021118164,
- "num_tokens": 7250175.0,
- "step": 786
- },
- {
- "epoch": 0.5980243161094225,
- "grad_norm": 1.8215159177780151,
- "learning_rate": 4.702763582881692e-06,
- "loss": 0.520193338394165,
- "mean_token_accuracy": 0.844062864780426,
- "num_tokens": 7258868.0,
- "step": 787
- },
- {
- "epoch": 0.5987841945288754,
- "grad_norm": 1.3823071718215942,
- "learning_rate": 4.701772328918784e-06,
- "loss": 0.4177844822406769,
- "mean_token_accuracy": 0.8363165259361267,
- "num_tokens": 7271744.0,
- "step": 788
- },
- {
- "epoch": 0.5995440729483282,
- "grad_norm": 2.4749298095703125,
- "learning_rate": 4.700779529667301e-06,
- "loss": 0.5115069150924683,
- "mean_token_accuracy": 0.8473520278930664,
- "num_tokens": 7277040.0,
- "step": 789
- },
- {
- "epoch": 0.6003039513677811,
- "grad_norm": 1.7072296142578125,
- "learning_rate": 4.699785185824026e-06,
- "loss": 0.5265800952911377,
- "mean_token_accuracy": 0.8161447048187256,
- "num_tokens": 7288288.0,
- "step": 790
- },
- {
- "epoch": 0.601063829787234,
- "grad_norm": 1.6479384899139404,
- "learning_rate": 4.69878929808683e-06,
- "loss": 0.4445168972015381,
- "mean_token_accuracy": 0.8381255865097046,
- "num_tokens": 7298640.0,
- "step": 791
- },
- {
- "epoch": 0.601823708206687,
- "grad_norm": 1.9095896482467651,
- "learning_rate": 4.6977918671546635e-06,
- "loss": 0.5841238498687744,
- "mean_token_accuracy": 0.7971454858779907,
- "num_tokens": 7307220.0,
- "step": 792
- },
- {
- "epoch": 0.6025835866261399,
- "grad_norm": 1.9614146947860718,
- "learning_rate": 4.696792893727562e-06,
- "loss": 0.34684082865715027,
- "mean_token_accuracy": 0.8739526271820068,
- "num_tokens": 7313875.0,
- "step": 793
- },
- {
- "epoch": 0.6033434650455927,
- "grad_norm": 2.015570640563965,
- "learning_rate": 4.695792378506645e-06,
- "loss": 0.42779117822647095,
- "mean_token_accuracy": 0.8625012636184692,
- "num_tokens": 7321439.0,
- "step": 794
- },
- {
- "epoch": 0.6041033434650456,
- "grad_norm": 2.8581228256225586,
- "learning_rate": 4.694790322194111e-06,
- "loss": 0.6519991159439087,
- "mean_token_accuracy": 0.7629562616348267,
- "num_tokens": 7326916.0,
- "step": 795
- },
- {
- "epoch": 0.6048632218844985,
- "grad_norm": 2.482715368270874,
- "learning_rate": 4.693786725493242e-06,
- "loss": 0.532963216304779,
- "mean_token_accuracy": 0.832184910774231,
- "num_tokens": 7333311.0,
- "step": 796
- },
- {
- "epoch": 0.6056231003039514,
- "grad_norm": 1.6076741218566895,
- "learning_rate": 4.692781589108402e-06,
- "loss": 0.43381205201148987,
- "mean_token_accuracy": 0.8402494192123413,
- "num_tokens": 7343731.0,
- "step": 797
- },
- {
- "epoch": 0.6063829787234043,
- "grad_norm": 2.2133216857910156,
- "learning_rate": 4.691774913745033e-06,
- "loss": 0.4380851089954376,
- "mean_token_accuracy": 0.8600908517837524,
- "num_tokens": 7350224.0,
- "step": 798
- },
- {
- "epoch": 0.6071428571428571,
- "grad_norm": 2.046280860900879,
- "learning_rate": 4.690766700109659e-06,
- "loss": 0.3821919560432434,
- "mean_token_accuracy": 0.8691814541816711,
- "num_tokens": 7356717.0,
- "step": 799
- },
- {
- "epoch": 0.60790273556231,
- "grad_norm": 1.8482693433761597,
- "learning_rate": 4.689756948909884e-06,
- "loss": 0.5217651128768921,
- "mean_token_accuracy": 0.803473711013794,
- "num_tokens": 7365806.0,
- "step": 800
- },
- {
- "epoch": 0.6086626139817629,
- "grad_norm": 2.192134141921997,
- "learning_rate": 4.688745660854388e-06,
- "loss": 0.573980987071991,
- "mean_token_accuracy": 0.8198676109313965,
- "num_tokens": 7380281.0,
- "step": 801
- },
- {
- "epoch": 0.6094224924012158,
- "grad_norm": 2.363626718521118,
- "learning_rate": 4.687732836652935e-06,
- "loss": 0.5204599499702454,
- "mean_token_accuracy": 0.8373252153396606,
- "num_tokens": 7386938.0,
- "step": 802
- },
- {
- "epoch": 0.6101823708206687,
- "grad_norm": 1.9320523738861084,
- "learning_rate": 4.686718477016361e-06,
- "loss": 0.47316622734069824,
- "mean_token_accuracy": 0.830596923828125,
- "num_tokens": 7395069.0,
- "step": 803
- },
- {
- "epoch": 0.6109422492401215,
- "grad_norm": 2.6573057174682617,
- "learning_rate": 4.6857025826565845e-06,
- "loss": 0.5495861768722534,
- "mean_token_accuracy": 0.8187421560287476,
- "num_tokens": 7400563.0,
- "step": 804
- },
- {
- "epoch": 0.6117021276595744,
- "grad_norm": 2.0893123149871826,
- "learning_rate": 4.684685154286599e-06,
- "loss": 0.5362675786018372,
- "mean_token_accuracy": 0.8394701480865479,
- "num_tokens": 7406973.0,
- "step": 805
- },
- {
- "epoch": 0.6124620060790273,
- "grad_norm": 2.455130100250244,
- "learning_rate": 4.683666192620474e-06,
- "loss": 0.5405995845794678,
- "mean_token_accuracy": 0.8079100847244263,
- "num_tokens": 7412931.0,
- "step": 806
- },
- {
- "epoch": 0.6132218844984803,
- "grad_norm": 2.311915636062622,
- "learning_rate": 4.682645698373357e-06,
- "loss": 0.5395106077194214,
- "mean_token_accuracy": 0.8156260251998901,
- "num_tokens": 7419699.0,
- "step": 807
- },
- {
- "epoch": 0.6139817629179332,
- "grad_norm": 1.686838984489441,
- "learning_rate": 4.6816236722614694e-06,
- "loss": 0.6034521460533142,
- "mean_token_accuracy": 0.7855954170227051,
- "num_tokens": 7431899.0,
- "step": 808
- },
- {
- "epoch": 0.6147416413373861,
- "grad_norm": 1.682759165763855,
- "learning_rate": 4.680600115002109e-06,
- "loss": 0.48593831062316895,
- "mean_token_accuracy": 0.8229435682296753,
- "num_tokens": 7443187.0,
- "step": 809
- },
- {
- "epoch": 0.6155015197568389,
- "grad_norm": 2.064589738845825,
- "learning_rate": 4.679575027313649e-06,
- "loss": 0.5098468661308289,
- "mean_token_accuracy": 0.8234638571739197,
- "num_tokens": 7450868.0,
- "step": 810
- },
- {
- "epoch": 0.6162613981762918,
- "grad_norm": 2.2063486576080322,
- "learning_rate": 4.6785484099155324e-06,
- "loss": 0.5138497352600098,
- "mean_token_accuracy": 0.8152111172676086,
- "num_tokens": 7457176.0,
- "step": 811
- },
- {
- "epoch": 0.6170212765957447,
- "grad_norm": 1.6258726119995117,
- "learning_rate": 4.67752026352828e-06,
- "loss": 0.4064181447029114,
- "mean_token_accuracy": 0.8720619678497314,
- "num_tokens": 7466557.0,
- "step": 812
- },
- {
- "epoch": 0.6177811550151976,
- "grad_norm": 2.3309383392333984,
- "learning_rate": 4.676490588873486e-06,
- "loss": 0.5180112719535828,
- "mean_token_accuracy": 0.8233879804611206,
- "num_tokens": 7472650.0,
- "step": 813
- },
- {
- "epoch": 0.6185410334346505,
- "grad_norm": 1.4545246362686157,
- "learning_rate": 4.675459386673815e-06,
- "loss": 0.37917959690093994,
- "mean_token_accuracy": 0.8598103523254395,
- "num_tokens": 7485171.0,
- "step": 814
- },
- {
- "epoch": 0.6193009118541033,
- "grad_norm": 2.654231071472168,
- "learning_rate": 4.674426657653003e-06,
- "loss": 0.554074227809906,
- "mean_token_accuracy": 0.8026446104049683,
- "num_tokens": 7490787.0,
- "step": 815
- },
- {
- "epoch": 0.6200607902735562,
- "grad_norm": 1.5543994903564453,
- "learning_rate": 4.67339240253586e-06,
- "loss": 0.6335440278053284,
- "mean_token_accuracy": 0.783241868019104,
- "num_tokens": 7505975.0,
- "step": 816
- },
- {
- "epoch": 0.6208206686930091,
- "grad_norm": 2.079998016357422,
- "learning_rate": 4.672356622048266e-06,
- "loss": 0.5169394016265869,
- "mean_token_accuracy": 0.8088761568069458,
- "num_tokens": 7513470.0,
- "step": 817
- },
- {
- "epoch": 0.621580547112462,
- "grad_norm": 1.5971896648406982,
- "learning_rate": 4.671319316917172e-06,
- "loss": 0.44588586688041687,
- "mean_token_accuracy": 0.8518649339675903,
- "num_tokens": 7524352.0,
- "step": 818
- },
- {
- "epoch": 0.6223404255319149,
- "grad_norm": 2.477579116821289,
- "learning_rate": 4.670280487870599e-06,
- "loss": 0.5713893175125122,
- "mean_token_accuracy": 0.8116940259933472,
- "num_tokens": 7530359.0,
- "step": 819
- },
- {
- "epoch": 0.6231003039513677,
- "grad_norm": 2.066211700439453,
- "learning_rate": 4.669240135637635e-06,
- "loss": 0.5295331478118896,
- "mean_token_accuracy": 0.819536566734314,
- "num_tokens": 7536963.0,
- "step": 820
- },
- {
- "epoch": 0.6238601823708206,
- "grad_norm": 2.1217997074127197,
- "learning_rate": 4.668198260948442e-06,
- "loss": 0.6146406531333923,
- "mean_token_accuracy": 0.7932635545730591,
- "num_tokens": 7545800.0,
- "step": 821
- },
- {
- "epoch": 0.6246200607902735,
- "grad_norm": 2.0173542499542236,
- "learning_rate": 4.667154864534245e-06,
- "loss": 0.6240535974502563,
- "mean_token_accuracy": 0.7883644104003906,
- "num_tokens": 7556165.0,
- "step": 822
- },
- {
- "epoch": 0.6253799392097265,
- "grad_norm": 2.014526128768921,
- "learning_rate": 4.666109947127343e-06,
- "loss": 0.40367332100868225,
- "mean_token_accuracy": 0.8653522729873657,
- "num_tokens": 7562665.0,
- "step": 823
- },
- {
- "epoch": 0.6261398176291794,
- "grad_norm": 2.5078861713409424,
- "learning_rate": 4.665063509461098e-06,
- "loss": 0.5903617739677429,
- "mean_token_accuracy": 0.7902897596359253,
- "num_tokens": 7568922.0,
- "step": 824
- },
- {
- "epoch": 0.6268996960486323,
- "grad_norm": 2.454622745513916,
- "learning_rate": 4.664015552269938e-06,
- "loss": 0.5238361358642578,
- "mean_token_accuracy": 0.838546872138977,
- "num_tokens": 7575965.0,
- "step": 825
- },
- {
- "epoch": 0.6276595744680851,
- "grad_norm": 2.920919418334961,
- "learning_rate": 4.662966076289363e-06,
- "loss": 0.5028782486915588,
- "mean_token_accuracy": 0.8311152458190918,
- "num_tokens": 7580193.0,
- "step": 826
- },
- {
- "epoch": 0.628419452887538,
- "grad_norm": 1.545382022857666,
- "learning_rate": 4.661915082255932e-06,
- "loss": 0.4817378520965576,
- "mean_token_accuracy": 0.8373227119445801,
- "num_tokens": 7593024.0,
- "step": 827
- },
- {
- "epoch": 0.6291793313069909,
- "grad_norm": 1.5152469873428345,
- "learning_rate": 4.6608625709072766e-06,
- "loss": 0.4693033695220947,
- "mean_token_accuracy": 0.8150848150253296,
- "num_tokens": 7606459.0,
- "step": 828
- },
- {
- "epoch": 0.6299392097264438,
- "grad_norm": 2.1310224533081055,
- "learning_rate": 4.659808542982089e-06,
- "loss": 0.4653395414352417,
- "mean_token_accuracy": 0.8286294341087341,
- "num_tokens": 7613036.0,
- "step": 829
- },
- {
- "epoch": 0.6306990881458967,
- "grad_norm": 2.1949679851531982,
- "learning_rate": 4.658752999220125e-06,
- "loss": 0.3698633909225464,
- "mean_token_accuracy": 0.871590793132782,
- "num_tokens": 7618527.0,
- "step": 830
- },
- {
- "epoch": 0.6314589665653495,
- "grad_norm": 2.2770416736602783,
- "learning_rate": 4.657695940362207e-06,
- "loss": 0.5202419757843018,
- "mean_token_accuracy": 0.817577600479126,
- "num_tokens": 7624459.0,
- "step": 831
- },
- {
- "epoch": 0.6322188449848024,
- "grad_norm": 1.402042269706726,
- "learning_rate": 4.65663736715022e-06,
- "loss": 0.51531583070755,
- "mean_token_accuracy": 0.8228116631507874,
- "num_tokens": 7639371.0,
- "step": 832
- },
- {
- "epoch": 0.6329787234042553,
- "grad_norm": 3.3554883003234863,
- "learning_rate": 4.65557728032711e-06,
- "loss": 0.6771188378334045,
- "mean_token_accuracy": 0.7880028486251831,
- "num_tokens": 7643924.0,
- "step": 833
- },
- {
- "epoch": 0.6337386018237082,
- "grad_norm": 2.081040143966675,
- "learning_rate": 4.654515680636888e-06,
- "loss": 0.5712796449661255,
- "mean_token_accuracy": 0.8177868127822876,
- "num_tokens": 7651881.0,
- "step": 834
- },
- {
- "epoch": 0.6344984802431611,
- "grad_norm": 0.9128716588020325,
- "learning_rate": 4.653452568824625e-06,
- "loss": 0.3423936069011688,
- "mean_token_accuracy": 0.8782886266708374,
- "num_tokens": 7677829.0,
- "step": 835
- },
- {
- "epoch": 0.6352583586626139,
- "grad_norm": 3.49015736579895,
- "learning_rate": 4.652387945636454e-06,
- "loss": 0.34657734632492065,
- "mean_token_accuracy": 0.8770567178726196,
- "num_tokens": 7680796.0,
- "step": 836
- },
- {
- "epoch": 0.6360182370820668,
- "grad_norm": 2.026247501373291,
- "learning_rate": 4.651321811819568e-06,
- "loss": 0.5098431706428528,
- "mean_token_accuracy": 0.8216961622238159,
- "num_tokens": 7688746.0,
- "step": 837
- },
- {
- "epoch": 0.6367781155015197,
- "grad_norm": 2.444343090057373,
- "learning_rate": 4.650254168122222e-06,
- "loss": 0.5490090250968933,
- "mean_token_accuracy": 0.8092857599258423,
- "num_tokens": 7695220.0,
- "step": 838
- },
- {
- "epoch": 0.6375379939209727,
- "grad_norm": 2.0171122550964355,
- "learning_rate": 4.649185015293728e-06,
- "loss": 0.47221142053604126,
- "mean_token_accuracy": 0.8514408469200134,
- "num_tokens": 7702759.0,
- "step": 839
- },
- {
- "epoch": 0.6382978723404256,
- "grad_norm": 1.9800984859466553,
- "learning_rate": 4.64811435408446e-06,
- "loss": 0.5238803625106812,
- "mean_token_accuracy": 0.8479194641113281,
- "num_tokens": 7714017.0,
- "step": 840
- },
- {
- "epoch": 0.6390577507598785,
- "grad_norm": 3.0674357414245605,
- "learning_rate": 4.647042185245848e-06,
- "loss": 0.4668245315551758,
- "mean_token_accuracy": 0.8381714820861816,
- "num_tokens": 7717801.0,
- "step": 841
- },
- {
- "epoch": 0.6398176291793313,
- "grad_norm": 1.5672820806503296,
- "learning_rate": 4.645968509530381e-06,
- "loss": 0.4428741931915283,
- "mean_token_accuracy": 0.8416479825973511,
- "num_tokens": 7728342.0,
- "step": 842
- },
- {
- "epoch": 0.6405775075987842,
- "grad_norm": 2.3042354583740234,
- "learning_rate": 4.644893327691608e-06,
- "loss": 0.49937760829925537,
- "mean_token_accuracy": 0.827070951461792,
- "num_tokens": 7734576.0,
- "step": 843
- },
- {
- "epoch": 0.6413373860182371,
- "grad_norm": 2.057772159576416,
- "learning_rate": 4.6438166404841316e-06,
- "loss": 0.5912986993789673,
- "mean_token_accuracy": 0.805509090423584,
- "num_tokens": 7742481.0,
- "step": 844
- },
- {
- "epoch": 0.64209726443769,
- "grad_norm": 1.9688186645507812,
- "learning_rate": 4.6427384486636115e-06,
- "loss": 0.482401967048645,
- "mean_token_accuracy": 0.8358086347579956,
- "num_tokens": 7750002.0,
- "step": 845
- },
- {
- "epoch": 0.6428571428571429,
- "grad_norm": 2.6852948665618896,
- "learning_rate": 4.6416587529867665e-06,
- "loss": 0.5479315519332886,
- "mean_token_accuracy": 0.8091106414794922,
- "num_tokens": 7755578.0,
- "step": 846
- },
- {
- "epoch": 0.6436170212765957,
- "grad_norm": 2.0547337532043457,
- "learning_rate": 4.640577554211366e-06,
- "loss": 0.5327274203300476,
- "mean_token_accuracy": 0.8280376195907593,
- "num_tokens": 7763513.0,
- "step": 847
- },
- {
- "epoch": 0.6443768996960486,
- "grad_norm": 2.0328633785247803,
- "learning_rate": 4.63949485309624e-06,
- "loss": 0.4814409613609314,
- "mean_token_accuracy": 0.8527672290802002,
- "num_tokens": 7771131.0,
- "step": 848
- },
- {
- "epoch": 0.6451367781155015,
- "grad_norm": 1.5892863273620605,
- "learning_rate": 4.638410650401267e-06,
- "loss": 0.4492785334587097,
- "mean_token_accuracy": 0.846997857093811,
- "num_tokens": 7781572.0,
- "step": 849
- },
- {
- "epoch": 0.6458966565349544,
- "grad_norm": 1.8295910358428955,
- "learning_rate": 4.637324946887384e-06,
- "loss": 0.37088239192962646,
- "mean_token_accuracy": 0.8616628646850586,
- "num_tokens": 7788604.0,
- "step": 850
- },
- {
- "epoch": 0.6466565349544073,
- "grad_norm": 3.380040168762207,
- "learning_rate": 4.636237743316578e-06,
- "loss": 0.4737280607223511,
- "mean_token_accuracy": 0.855940580368042,
- "num_tokens": 7792504.0,
- "step": 851
- },
- {
- "epoch": 0.6474164133738601,
- "grad_norm": 2.8790009021759033,
- "learning_rate": 4.635149040451891e-06,
- "loss": 0.39790448546409607,
- "mean_token_accuracy": 0.8710698485374451,
- "num_tokens": 7796333.0,
- "step": 852
- },
- {
- "epoch": 0.648176291793313,
- "grad_norm": 1.914914608001709,
- "learning_rate": 4.634058839057417e-06,
- "loss": 0.2954312562942505,
- "mean_token_accuracy": 0.8880234956741333,
- "num_tokens": 7802456.0,
- "step": 853
- },
- {
- "epoch": 0.648936170212766,
- "grad_norm": 1.3709120750427246,
- "learning_rate": 4.632967139898301e-06,
- "loss": 0.43224576115608215,
- "mean_token_accuracy": 0.8446190357208252,
- "num_tokens": 7816770.0,
- "step": 854
- },
- {
- "epoch": 0.6496960486322189,
- "grad_norm": 1.6579312086105347,
- "learning_rate": 4.63187394374074e-06,
- "loss": 0.3535553514957428,
- "mean_token_accuracy": 0.8738704919815063,
- "num_tokens": 7824963.0,
- "step": 855
- },
- {
- "epoch": 0.6504559270516718,
- "grad_norm": 2.4055678844451904,
- "learning_rate": 4.63077925135198e-06,
- "loss": 0.5078744292259216,
- "mean_token_accuracy": 0.8430874347686768,
- "num_tokens": 7830962.0,
- "step": 856
- },
- {
- "epoch": 0.6512158054711246,
- "grad_norm": 2.5171499252319336,
- "learning_rate": 4.629683063500319e-06,
- "loss": 0.5172419548034668,
- "mean_token_accuracy": 0.8087141513824463,
- "num_tokens": 7836638.0,
- "step": 857
- },
- {
- "epoch": 0.6519756838905775,
- "grad_norm": 1.7588486671447754,
- "learning_rate": 4.628585380955104e-06,
- "loss": 0.5759496092796326,
- "mean_token_accuracy": 0.8043236136436462,
- "num_tokens": 7844654.0,
- "step": 858
- },
- {
- "epoch": 0.6527355623100304,
- "grad_norm": 1.5887070894241333,
- "learning_rate": 4.62748620448673e-06,
- "loss": 0.41849038004875183,
- "mean_token_accuracy": 0.8556643724441528,
- "num_tokens": 7855642.0,
- "step": 859
- },
- {
- "epoch": 0.6534954407294833,
- "grad_norm": 3.227942705154419,
- "learning_rate": 4.626385534866642e-06,
- "loss": 0.5279449224472046,
- "mean_token_accuracy": 0.8250958323478699,
- "num_tokens": 7859890.0,
- "step": 860
- },
- {
- "epoch": 0.6542553191489362,
- "grad_norm": 2.440467119216919,
- "learning_rate": 4.625283372867333e-06,
- "loss": 0.5294933319091797,
- "mean_token_accuracy": 0.8235013484954834,
- "num_tokens": 7866766.0,
- "step": 861
- },
- {
- "epoch": 0.6550151975683891,
- "grad_norm": 2.4106903076171875,
- "learning_rate": 4.624179719262342e-06,
- "loss": 0.5662813186645508,
- "mean_token_accuracy": 0.8061668872833252,
- "num_tokens": 7872809.0,
- "step": 862
- },
- {
- "epoch": 0.6557750759878419,
- "grad_norm": 3.5151145458221436,
- "learning_rate": 4.623074574826254e-06,
- "loss": 0.5471097230911255,
- "mean_token_accuracy": 0.8220691084861755,
- "num_tokens": 7876136.0,
- "step": 863
- },
- {
- "epoch": 0.6565349544072948,
- "grad_norm": 1.5319840908050537,
- "learning_rate": 4.621967940334705e-06,
- "loss": 0.4178982377052307,
- "mean_token_accuracy": 0.8517135977745056,
- "num_tokens": 7886113.0,
- "step": 864
- },
- {
- "epoch": 0.6572948328267477,
- "grad_norm": 1.63701331615448,
- "learning_rate": 4.620859816564371e-06,
- "loss": 0.4666512608528137,
- "mean_token_accuracy": 0.8223508596420288,
- "num_tokens": 7897982.0,
- "step": 865
- },
- {
- "epoch": 0.6580547112462006,
- "grad_norm": 2.1515414714813232,
- "learning_rate": 4.619750204292978e-06,
- "loss": 0.5359305143356323,
- "mean_token_accuracy": 0.8192868232727051,
- "num_tokens": 7904947.0,
- "step": 866
- },
- {
- "epoch": 0.6588145896656535,
- "grad_norm": 2.2140955924987793,
- "learning_rate": 4.618639104299294e-06,
- "loss": 0.5275633931159973,
- "mean_token_accuracy": 0.8120715618133545,
- "num_tokens": 7913913.0,
- "step": 867
- },
- {
- "epoch": 0.6595744680851063,
- "grad_norm": 1.3956893682479858,
- "learning_rate": 4.6175265173631304e-06,
- "loss": 0.4378768503665924,
- "mean_token_accuracy": 0.8479125499725342,
- "num_tokens": 7927979.0,
- "step": 868
- },
- {
- "epoch": 0.6603343465045592,
- "grad_norm": 2.98103928565979,
- "learning_rate": 4.616412444265344e-06,
- "loss": 0.42614591121673584,
- "mean_token_accuracy": 0.8595094680786133,
- "num_tokens": 7934293.0,
- "step": 869
- },
- {
- "epoch": 0.6610942249240122,
- "grad_norm": 2.554845094680786,
- "learning_rate": 4.6152968857878365e-06,
- "loss": 0.3698030412197113,
- "mean_token_accuracy": 0.8717041015625,
- "num_tokens": 7938547.0,
- "step": 870
- },
- {
- "epoch": 0.6618541033434651,
- "grad_norm": 3.0901825428009033,
- "learning_rate": 4.6141798427135475e-06,
- "loss": 0.5037497282028198,
- "mean_token_accuracy": 0.8354041576385498,
- "num_tokens": 7942829.0,
- "step": 871
- },
- {
- "epoch": 0.662613981762918,
- "grad_norm": 2.8692073822021484,
- "learning_rate": 4.6130613158264605e-06,
- "loss": 0.5418164134025574,
- "mean_token_accuracy": 0.8298909664154053,
- "num_tokens": 7949303.0,
- "step": 872
- },
- {
- "epoch": 0.6633738601823708,
- "grad_norm": 3.960404396057129,
- "learning_rate": 4.611941305911602e-06,
- "loss": 0.6284480094909668,
- "mean_token_accuracy": 0.837495744228363,
- "num_tokens": 7952486.0,
- "step": 873
- },
- {
- "epoch": 0.6641337386018237,
- "grad_norm": 2.6690115928649902,
- "learning_rate": 4.610819813755038e-06,
- "loss": 0.5214360952377319,
- "mean_token_accuracy": 0.8213508129119873,
- "num_tokens": 7957559.0,
- "step": 874
- },
- {
- "epoch": 0.6648936170212766,
- "grad_norm": 2.3376171588897705,
- "learning_rate": 4.609696840143875e-06,
- "loss": 0.46887528896331787,
- "mean_token_accuracy": 0.8438819646835327,
- "num_tokens": 7962826.0,
- "step": 875
- },
- {
- "epoch": 0.6656534954407295,
- "grad_norm": 2.2222683429718018,
- "learning_rate": 4.6085723858662575e-06,
- "loss": 0.5607719421386719,
- "mean_token_accuracy": 0.8128405809402466,
- "num_tokens": 7970131.0,
- "step": 876
- },
- {
- "epoch": 0.6664133738601824,
- "grad_norm": 2.069091558456421,
- "learning_rate": 4.607446451711372e-06,
- "loss": 0.506301760673523,
- "mean_token_accuracy": 0.8256827592849731,
- "num_tokens": 7977524.0,
- "step": 877
- },
- {
- "epoch": 0.6671732522796353,
- "grad_norm": 1.3724967241287231,
- "learning_rate": 4.606319038469443e-06,
- "loss": 0.43285101652145386,
- "mean_token_accuracy": 0.8525032997131348,
- "num_tokens": 7989174.0,
- "step": 878
- },
- {
- "epoch": 0.6679331306990881,
- "grad_norm": 2.278205156326294,
- "learning_rate": 4.605190146931731e-06,
- "loss": 0.4845905303955078,
- "mean_token_accuracy": 0.8284652829170227,
- "num_tokens": 7998524.0,
- "step": 879
- },
- {
- "epoch": 0.668693009118541,
- "grad_norm": 1.3871766328811646,
- "learning_rate": 4.604059777890537e-06,
- "loss": 0.5736679434776306,
- "mean_token_accuracy": 0.8223285675048828,
- "num_tokens": 8015776.0,
- "step": 880
- },
- {
- "epoch": 0.6694528875379939,
- "grad_norm": 1.926164984703064,
- "learning_rate": 4.602927932139197e-06,
- "loss": 0.4133230447769165,
- "mean_token_accuracy": 0.8653768301010132,
- "num_tokens": 8022979.0,
- "step": 881
- },
- {
- "epoch": 0.6702127659574468,
- "grad_norm": 2.109272003173828,
- "learning_rate": 4.601794610472083e-06,
- "loss": 0.7005600929260254,
- "mean_token_accuracy": 0.7777010202407837,
- "num_tokens": 8032618.0,
- "step": 882
- },
- {
- "epoch": 0.6709726443768997,
- "grad_norm": 2.077977418899536,
- "learning_rate": 4.6006598136846056e-06,
- "loss": 0.5278208255767822,
- "mean_token_accuracy": 0.8230358958244324,
- "num_tokens": 8040534.0,
- "step": 883
- },
- {
- "epoch": 0.6717325227963525,
- "grad_norm": 1.678581714630127,
- "learning_rate": 4.599523542573207e-06,
- "loss": 0.4955351650714874,
- "mean_token_accuracy": 0.8270003795623779,
- "num_tokens": 8052249.0,
- "step": 884
- },
- {
- "epoch": 0.6724924012158054,
- "grad_norm": 2.0751662254333496,
- "learning_rate": 4.598385797935368e-06,
- "loss": 0.5266247987747192,
- "mean_token_accuracy": 0.8263581991195679,
- "num_tokens": 8060600.0,
- "step": 885
- },
- {
- "epoch": 0.6732522796352584,
- "grad_norm": 2.418405771255493,
- "learning_rate": 4.5972465805696e-06,
- "loss": 0.4481425881385803,
- "mean_token_accuracy": 0.846164345741272,
- "num_tokens": 8066025.0,
- "step": 886
- },
- {
- "epoch": 0.6740121580547113,
- "grad_norm": 2.3936474323272705,
- "learning_rate": 4.596105891275449e-06,
- "loss": 0.4553404450416565,
- "mean_token_accuracy": 0.8412896394729614,
- "num_tokens": 8071544.0,
- "step": 887
- },
- {
- "epoch": 0.6747720364741642,
- "grad_norm": 2.2024407386779785,
- "learning_rate": 4.594963730853497e-06,
- "loss": 0.6218541860580444,
- "mean_token_accuracy": 0.7890232801437378,
- "num_tokens": 8079061.0,
- "step": 888
- },
- {
- "epoch": 0.675531914893617,
- "grad_norm": 2.51015567779541,
- "learning_rate": 4.593820100105355e-06,
- "loss": 0.5149124264717102,
- "mean_token_accuracy": 0.8241918087005615,
- "num_tokens": 8084293.0,
- "step": 889
- },
- {
- "epoch": 0.6762917933130699,
- "grad_norm": 1.8748939037322998,
- "learning_rate": 4.5926749998336665e-06,
- "loss": 0.50836181640625,
- "mean_token_accuracy": 0.8067223429679871,
- "num_tokens": 8092511.0,
- "step": 890
- },
- {
- "epoch": 0.6770516717325228,
- "grad_norm": 1.801193118095398,
- "learning_rate": 4.5915284308421075e-06,
- "loss": 0.4372861683368683,
- "mean_token_accuracy": 0.8510604500770569,
- "num_tokens": 8101174.0,
- "step": 891
- },
- {
- "epoch": 0.6778115501519757,
- "grad_norm": 2.6476457118988037,
- "learning_rate": 4.590380393935383e-06,
- "loss": 0.38700711727142334,
- "mean_token_accuracy": 0.8659796714782715,
- "num_tokens": 8105398.0,
- "step": 892
- },
- {
- "epoch": 0.6785714285714286,
- "grad_norm": 1.1147183179855347,
- "learning_rate": 4.589230889919232e-06,
- "loss": 0.38546115159988403,
- "mean_token_accuracy": 0.8570581674575806,
- "num_tokens": 8127394.0,
- "step": 893
- },
- {
- "epoch": 0.6793313069908815,
- "grad_norm": 2.908905506134033,
- "learning_rate": 4.588079919600419e-06,
- "loss": 0.5108504295349121,
- "mean_token_accuracy": 0.8121406435966492,
- "num_tokens": 8131801.0,
- "step": 894
- },
- {
- "epoch": 0.6800911854103343,
- "grad_norm": 3.1522326469421387,
- "learning_rate": 4.586927483786739e-06,
- "loss": 0.44059112668037415,
- "mean_token_accuracy": 0.8448011875152588,
- "num_tokens": 8154416.0,
- "step": 895
- },
- {
- "epoch": 0.6808510638297872,
- "grad_norm": 1.5142440795898438,
- "learning_rate": 4.585773583287017e-06,
- "loss": 0.513217568397522,
- "mean_token_accuracy": 0.8386049270629883,
- "num_tokens": 8171156.0,
- "step": 896
- },
- {
- "epoch": 0.6816109422492401,
- "grad_norm": 2.597881317138672,
- "learning_rate": 4.584618218911104e-06,
- "loss": 0.4937712550163269,
- "mean_token_accuracy": 0.8223681449890137,
- "num_tokens": 8176124.0,
- "step": 897
- },
- {
- "epoch": 0.682370820668693,
- "grad_norm": 1.8185619115829468,
- "learning_rate": 4.583461391469879e-06,
- "loss": 0.519811749458313,
- "mean_token_accuracy": 0.8169777393341064,
- "num_tokens": 8185136.0,
- "step": 898
- },
- {
- "epoch": 0.6831306990881459,
- "grad_norm": 3.2061994075775146,
- "learning_rate": 4.582303101775249e-06,
- "loss": 0.4655115008354187,
- "mean_token_accuracy": 0.8425977230072021,
- "num_tokens": 8188864.0,
- "step": 899
- },
- {
- "epoch": 0.6838905775075987,
- "grad_norm": 1.3485229015350342,
- "learning_rate": 4.581143350640146e-06,
- "loss": 0.5014470815658569,
- "mean_token_accuracy": 0.8273109197616577,
- "num_tokens": 8203460.0,
- "step": 900
- },
- {
- "epoch": 0.6846504559270516,
- "grad_norm": 1.3264713287353516,
- "learning_rate": 4.579982138878527e-06,
- "loss": 0.5073703527450562,
- "mean_token_accuracy": 0.8259357213973999,
- "num_tokens": 8219348.0,
- "step": 901
- },
- {
- "epoch": 0.6854103343465046,
- "grad_norm": 2.4436347484588623,
- "learning_rate": 4.578819467305375e-06,
- "loss": 0.47020310163497925,
- "mean_token_accuracy": 0.8567265272140503,
- "num_tokens": 8224427.0,
- "step": 902
- },
- {
- "epoch": 0.6861702127659575,
- "grad_norm": 1.921749234199524,
- "learning_rate": 4.5776553367367e-06,
- "loss": 0.622514009475708,
- "mean_token_accuracy": 0.7863982319831848,
- "num_tokens": 8233151.0,
- "step": 903
- },
- {
- "epoch": 0.6869300911854104,
- "grad_norm": 1.8815616369247437,
- "learning_rate": 4.576489747989532e-06,
- "loss": 0.4910545349121094,
- "mean_token_accuracy": 0.8147122859954834,
- "num_tokens": 8240762.0,
- "step": 904
- },
- {
- "epoch": 0.6876899696048632,
- "grad_norm": 1.2366989850997925,
- "learning_rate": 4.575322701881926e-06,
- "loss": 0.3947566747665405,
- "mean_token_accuracy": 0.873993992805481,
- "num_tokens": 8259381.0,
- "step": 905
- },
- {
- "epoch": 0.6884498480243161,
- "grad_norm": 1.5767735242843628,
- "learning_rate": 4.57415419923296e-06,
- "loss": 0.57136070728302,
- "mean_token_accuracy": 0.8028088808059692,
- "num_tokens": 8273296.0,
- "step": 906
- },
- {
- "epoch": 0.689209726443769,
- "grad_norm": 2.378675699234009,
- "learning_rate": 4.572984240862733e-06,
- "loss": 0.5894849896430969,
- "mean_token_accuracy": 0.7977708578109741,
- "num_tokens": 8280083.0,
- "step": 907
- },
- {
- "epoch": 0.6899696048632219,
- "grad_norm": 2.0401132106781006,
- "learning_rate": 4.57181282759237e-06,
- "loss": 0.5524613261222839,
- "mean_token_accuracy": 0.8138598203659058,
- "num_tokens": 8288236.0,
- "step": 908
- },
- {
- "epoch": 0.6907294832826748,
- "grad_norm": 2.293701648712158,
- "learning_rate": 4.570639960244011e-06,
- "loss": 0.5154546499252319,
- "mean_token_accuracy": 0.8234660625457764,
- "num_tokens": 8294493.0,
- "step": 909
- },
- {
- "epoch": 0.6914893617021277,
- "grad_norm": 1.9286527633666992,
- "learning_rate": 4.56946563964082e-06,
- "loss": 0.5364264845848083,
- "mean_token_accuracy": 0.8147368431091309,
- "num_tokens": 8303441.0,
- "step": 910
- },
- {
- "epoch": 0.6922492401215805,
- "grad_norm": 1.2571251392364502,
- "learning_rate": 4.5682898666069815e-06,
- "loss": 0.43535223603248596,
- "mean_token_accuracy": 0.859239935874939,
- "num_tokens": 8321548.0,
- "step": 911
- },
- {
- "epoch": 0.6930091185410334,
- "grad_norm": 1.2224860191345215,
- "learning_rate": 4.567112641967697e-06,
- "loss": 0.40205076336860657,
- "mean_token_accuracy": 0.8724711537361145,
- "num_tokens": 8335205.0,
- "step": 912
- },
- {
- "epoch": 0.6937689969604863,
- "grad_norm": 1.2064491510391235,
- "learning_rate": 4.5659339665491894e-06,
- "loss": 0.37790587544441223,
- "mean_token_accuracy": 0.8464339971542358,
- "num_tokens": 8350926.0,
- "step": 913
- },
- {
- "epoch": 0.6945288753799392,
- "grad_norm": 2.1755270957946777,
- "learning_rate": 4.5647538411786965e-06,
- "loss": 0.42034298181533813,
- "mean_token_accuracy": 0.84148108959198,
- "num_tokens": 8356739.0,
- "step": 914
- },
- {
- "epoch": 0.6952887537993921,
- "grad_norm": 1.234864592552185,
- "learning_rate": 4.563572266684478e-06,
- "loss": 0.5062938332557678,
- "mean_token_accuracy": 0.8132052421569824,
- "num_tokens": 8373660.0,
- "step": 915
- },
- {
- "epoch": 0.6960486322188449,
- "grad_norm": 2.4250621795654297,
- "learning_rate": 4.562389243895807e-06,
- "loss": 0.4907791018486023,
- "mean_token_accuracy": 0.8337979912757874,
- "num_tokens": 8378661.0,
- "step": 916
- },
- {
- "epoch": 0.6968085106382979,
- "grad_norm": 1.5018314123153687,
- "learning_rate": 4.561204773642974e-06,
- "loss": 0.41041281819343567,
- "mean_token_accuracy": 0.8569784164428711,
- "num_tokens": 8390322.0,
- "step": 917
- },
- {
- "epoch": 0.6975683890577508,
- "grad_norm": 2.797269344329834,
- "learning_rate": 4.5600188567572874e-06,
- "loss": 0.3146931529045105,
- "mean_token_accuracy": 0.8913302421569824,
- "num_tokens": 8393567.0,
- "step": 918
- },
- {
- "epoch": 0.6983282674772037,
- "grad_norm": 1.4002827405929565,
- "learning_rate": 4.558831494071069e-06,
- "loss": 0.4275597333908081,
- "mean_token_accuracy": 0.8504893779754639,
- "num_tokens": 8407119.0,
- "step": 919
- },
- {
- "epoch": 0.6990881458966566,
- "grad_norm": 1.7045831680297852,
- "learning_rate": 4.557642686417654e-06,
- "loss": 0.49593430757522583,
- "mean_token_accuracy": 0.8185091018676758,
- "num_tokens": 8417408.0,
- "step": 920
- },
- {
- "epoch": 0.6998480243161094,
- "grad_norm": 2.8818066120147705,
- "learning_rate": 4.556452434631396e-06,
- "loss": 0.637908935546875,
- "mean_token_accuracy": 0.7883946895599365,
- "num_tokens": 8422319.0,
- "step": 921
- },
- {
- "epoch": 0.7006079027355623,
- "grad_norm": 2.3587265014648438,
- "learning_rate": 4.555260739547657e-06,
- "loss": 0.38749319314956665,
- "mean_token_accuracy": 0.8774704933166504,
- "num_tokens": 8427315.0,
- "step": 922
- },
- {
- "epoch": 0.7013677811550152,
- "grad_norm": 1.6648749113082886,
- "learning_rate": 4.554067602002815e-06,
- "loss": 0.4044865369796753,
- "mean_token_accuracy": 0.8524141311645508,
- "num_tokens": 8438662.0,
- "step": 923
- },
- {
- "epoch": 0.7021276595744681,
- "grad_norm": 3.467787742614746,
- "learning_rate": 4.55287302283426e-06,
- "loss": 0.591016411781311,
- "mean_token_accuracy": 0.81184983253479,
- "num_tokens": 8442237.0,
- "step": 924
- },
- {
- "epoch": 0.702887537993921,
- "grad_norm": 2.1458635330200195,
- "learning_rate": 4.551677002880395e-06,
- "loss": 0.5017476677894592,
- "mean_token_accuracy": 0.822914183139801,
- "num_tokens": 8449494.0,
- "step": 925
- },
- {
- "epoch": 0.7036474164133738,
- "grad_norm": 2.521714448928833,
- "learning_rate": 4.550479542980632e-06,
- "loss": 0.531912088394165,
- "mean_token_accuracy": 0.8225687742233276,
- "num_tokens": 8454983.0,
- "step": 926
- },
- {
- "epoch": 0.7044072948328267,
- "grad_norm": 3.5248100757598877,
- "learning_rate": 4.549280643975394e-06,
- "loss": 0.4631815254688263,
- "mean_token_accuracy": 0.8443771600723267,
- "num_tokens": 8458504.0,
- "step": 927
- },
- {
- "epoch": 0.7051671732522796,
- "grad_norm": 2.5105819702148438,
- "learning_rate": 4.548080306706114e-06,
- "loss": 0.30487123131752014,
- "mean_token_accuracy": 0.9018767476081848,
- "num_tokens": 8462589.0,
- "step": 928
- },
- {
- "epoch": 0.7059270516717325,
- "grad_norm": 1.3367713689804077,
- "learning_rate": 4.5468785320152365e-06,
- "loss": 0.4355026185512543,
- "mean_token_accuracy": 0.8323584794998169,
- "num_tokens": 8478450.0,
- "step": 929
- },
- {
- "epoch": 0.7066869300911854,
- "grad_norm": 2.2506282329559326,
- "learning_rate": 4.545675320746212e-06,
- "loss": 0.5082957744598389,
- "mean_token_accuracy": 0.823430597782135,
- "num_tokens": 8485991.0,
- "step": 930
- },
- {
- "epoch": 0.7074468085106383,
- "grad_norm": 1.7164632081985474,
- "learning_rate": 4.544470673743502e-06,
- "loss": 0.3960164785385132,
- "mean_token_accuracy": 0.8592486381530762,
- "num_tokens": 8495217.0,
- "step": 931
- },
- {
- "epoch": 0.7082066869300911,
- "grad_norm": 1.5864969491958618,
- "learning_rate": 4.543264591852572e-06,
- "loss": 0.49114471673965454,
- "mean_token_accuracy": 0.8330780267715454,
- "num_tokens": 8508904.0,
- "step": 932
- },
- {
- "epoch": 0.708966565349544,
- "grad_norm": 2.1707003116607666,
- "learning_rate": 4.542057075919898e-06,
- "loss": 0.49895772337913513,
- "mean_token_accuracy": 0.8327431082725525,
- "num_tokens": 8515792.0,
- "step": 933
- },
- {
- "epoch": 0.709726443768997,
- "grad_norm": 1.9002083539962769,
- "learning_rate": 4.54084812679296e-06,
- "loss": 0.4548531472682953,
- "mean_token_accuracy": 0.834532618522644,
- "num_tokens": 8524006.0,
- "step": 934
- },
- {
- "epoch": 0.7104863221884499,
- "grad_norm": 1.8505141735076904,
- "learning_rate": 4.539637745320247e-06,
- "loss": 0.35716521739959717,
- "mean_token_accuracy": 0.872222900390625,
- "num_tokens": 8533647.0,
- "step": 935
- },
- {
- "epoch": 0.7112462006079028,
- "grad_norm": 2.092620849609375,
- "learning_rate": 4.53842593235125e-06,
- "loss": 0.4673694372177124,
- "mean_token_accuracy": 0.8460999131202698,
- "num_tokens": 8540734.0,
- "step": 936
- },
- {
- "epoch": 0.7120060790273556,
- "grad_norm": 2.689514636993408,
- "learning_rate": 4.537212688736466e-06,
- "loss": 0.45461273193359375,
- "mean_token_accuracy": 0.8450704216957092,
- "num_tokens": 8544948.0,
- "step": 937
- },
- {
- "epoch": 0.7127659574468085,
- "grad_norm": 2.4507734775543213,
- "learning_rate": 4.535998015327396e-06,
- "loss": 0.4571906626224518,
- "mean_token_accuracy": 0.8429360389709473,
- "num_tokens": 8550445.0,
- "step": 938
- },
- {
- "epoch": 0.7135258358662614,
- "grad_norm": 1.8960013389587402,
- "learning_rate": 4.534781912976546e-06,
- "loss": 0.4461391568183899,
- "mean_token_accuracy": 0.8487973213195801,
- "num_tokens": 8557630.0,
- "step": 939
- },
- {
- "epoch": 0.7142857142857143,
- "grad_norm": 1.602611780166626,
- "learning_rate": 4.533564382537421e-06,
- "loss": 0.5277102589607239,
- "mean_token_accuracy": 0.8330916166305542,
- "num_tokens": 8570397.0,
- "step": 940
- },
- {
- "epoch": 0.7150455927051672,
- "grad_norm": 1.8936395645141602,
- "learning_rate": 4.532345424864533e-06,
- "loss": 0.38619571924209595,
- "mean_token_accuracy": 0.8514572381973267,
- "num_tokens": 8582673.0,
- "step": 941
- },
- {
- "epoch": 0.71580547112462,
- "grad_norm": 1.3898619413375854,
- "learning_rate": 4.531125040813392e-06,
- "loss": 0.4825032949447632,
- "mean_token_accuracy": 0.833012580871582,
- "num_tokens": 8597239.0,
- "step": 942
- },
- {
- "epoch": 0.7165653495440729,
- "grad_norm": 2.128230571746826,
- "learning_rate": 4.529903231240511e-06,
- "loss": 0.4862118065357208,
- "mean_token_accuracy": 0.8210917711257935,
- "num_tokens": 8605877.0,
- "step": 943
- },
- {
- "epoch": 0.7173252279635258,
- "grad_norm": 1.6552259922027588,
- "learning_rate": 4.528679997003403e-06,
- "loss": 0.5092059373855591,
- "mean_token_accuracy": 0.8247389793395996,
- "num_tokens": 8617060.0,
- "step": 944
- },
- {
- "epoch": 0.7180851063829787,
- "grad_norm": 2.1174771785736084,
- "learning_rate": 4.52745533896058e-06,
- "loss": 0.39110174775123596,
- "mean_token_accuracy": 0.8672944903373718,
- "num_tokens": 8623306.0,
- "step": 945
- },
- {
- "epoch": 0.7188449848024316,
- "grad_norm": 2.8648383617401123,
- "learning_rate": 4.526229257971556e-06,
- "loss": 0.49864327907562256,
- "mean_token_accuracy": 0.8305130004882812,
- "num_tokens": 8627466.0,
- "step": 946
- },
- {
- "epoch": 0.7196048632218845,
- "grad_norm": 2.155514717102051,
- "learning_rate": 4.52500175489684e-06,
- "loss": 0.5070191025733948,
- "mean_token_accuracy": 0.8311188817024231,
- "num_tokens": 8634759.0,
- "step": 947
- },
- {
- "epoch": 0.7203647416413373,
- "grad_norm": 1.8432683944702148,
- "learning_rate": 4.523772830597942e-06,
- "loss": 0.5569252371788025,
- "mean_token_accuracy": 0.8070821762084961,
- "num_tokens": 8644160.0,
- "step": 948
- },
- {
- "epoch": 0.7211246200607903,
- "grad_norm": 2.8912241458892822,
- "learning_rate": 4.522542485937369e-06,
- "loss": 0.4799427390098572,
- "mean_token_accuracy": 0.8443552851676941,
- "num_tokens": 8648377.0,
- "step": 949
- },
- {
- "epoch": 0.7218844984802432,
- "grad_norm": 3.3449625968933105,
- "learning_rate": 4.521310721778622e-06,
- "loss": 0.44043463468551636,
- "mean_token_accuracy": 0.8521315455436707,
- "num_tokens": 8651846.0,
- "step": 950
- },
- {
- "epoch": 0.7226443768996961,
- "grad_norm": 1.4127917289733887,
- "learning_rate": 4.520077538986203e-06,
- "loss": 0.4700999855995178,
- "mean_token_accuracy": 0.8377952575683594,
- "num_tokens": 8665199.0,
- "step": 951
- },
- {
- "epoch": 0.723404255319149,
- "grad_norm": 2.1607301235198975,
- "learning_rate": 4.518842938425606e-06,
- "loss": 0.4374256730079651,
- "mean_token_accuracy": 0.8448896408081055,
- "num_tokens": 8672158.0,
- "step": 952
- },
- {
- "epoch": 0.7241641337386018,
- "grad_norm": 1.3442779779434204,
- "learning_rate": 4.51760692096332e-06,
- "loss": 0.38948923349380493,
- "mean_token_accuracy": 0.8598923683166504,
- "num_tokens": 8684532.0,
- "step": 953
- },
- {
- "epoch": 0.7249240121580547,
- "grad_norm": 2.0003178119659424,
- "learning_rate": 4.516369487466832e-06,
- "loss": 0.3797217011451721,
- "mean_token_accuracy": 0.8652102947235107,
- "num_tokens": 8691460.0,
- "step": 954
- },
- {
- "epoch": 0.7256838905775076,
- "grad_norm": 1.8196535110473633,
- "learning_rate": 4.5151306388046175e-06,
- "loss": 0.5676811933517456,
- "mean_token_accuracy": 0.818500816822052,
- "num_tokens": 8701624.0,
- "step": 955
- },
- {
- "epoch": 0.7264437689969605,
- "grad_norm": 2.1962296962738037,
- "learning_rate": 4.513890375846152e-06,
- "loss": 0.45399484038352966,
- "mean_token_accuracy": 0.8463879227638245,
- "num_tokens": 8707410.0,
- "step": 956
- },
- {
- "epoch": 0.7272036474164134,
- "grad_norm": 1.8798872232437134,
- "learning_rate": 4.512648699461897e-06,
- "loss": 0.5679811239242554,
- "mean_token_accuracy": 0.8089900016784668,
- "num_tokens": 8715630.0,
- "step": 957
- },
- {
- "epoch": 0.7279635258358662,
- "grad_norm": 2.3540258407592773,
- "learning_rate": 4.511405610523309e-06,
- "loss": 0.5282865762710571,
- "mean_token_accuracy": 0.8196114301681519,
- "num_tokens": 8721934.0,
- "step": 958
- },
- {
- "epoch": 0.7287234042553191,
- "grad_norm": 2.5630908012390137,
- "learning_rate": 4.510161109902837e-06,
- "loss": 0.39442378282546997,
- "mean_token_accuracy": 0.8400980830192566,
- "num_tokens": 8726511.0,
- "step": 959
- },
- {
- "epoch": 0.729483282674772,
- "grad_norm": 1.9829226732254028,
- "learning_rate": 4.508915198473919e-06,
- "loss": 0.4611976742744446,
- "mean_token_accuracy": 0.8439624309539795,
- "num_tokens": 8733460.0,
- "step": 960
- },
- {
- "epoch": 0.7302431610942249,
- "grad_norm": 3.0291950702667236,
- "learning_rate": 4.507667877110982e-06,
- "loss": 0.5158340930938721,
- "mean_token_accuracy": 0.8300060033798218,
- "num_tokens": 8737629.0,
- "step": 961
- },
- {
- "epoch": 0.7310030395136778,
- "grad_norm": 1.9208252429962158,
- "learning_rate": 4.506419146689445e-06,
- "loss": 0.3807099163532257,
- "mean_token_accuracy": 0.871469259262085,
- "num_tokens": 8744615.0,
- "step": 962
- },
- {
- "epoch": 0.7317629179331308,
- "grad_norm": 3.051565408706665,
- "learning_rate": 4.505169008085717e-06,
- "loss": 0.38461726903915405,
- "mean_token_accuracy": 0.874465823173523,
- "num_tokens": 8748154.0,
- "step": 963
- },
- {
- "epoch": 0.7325227963525835,
- "grad_norm": 1.375466227531433,
- "learning_rate": 4.503917462177192e-06,
- "loss": 0.42490679025650024,
- "mean_token_accuracy": 0.8457326889038086,
- "num_tokens": 8760965.0,
- "step": 964
- },
- {
- "epoch": 0.7332826747720365,
- "grad_norm": 2.216681957244873,
- "learning_rate": 4.5026645098422515e-06,
- "loss": 0.43149900436401367,
- "mean_token_accuracy": 0.8527278900146484,
- "num_tokens": 8766996.0,
- "step": 965
- },
- {
- "epoch": 0.7340425531914894,
- "grad_norm": 1.9422595500946045,
- "learning_rate": 4.5014101519602684e-06,
- "loss": 0.4964504539966583,
- "mean_token_accuracy": 0.8137556314468384,
- "num_tokens": 8774411.0,
- "step": 966
- },
- {
- "epoch": 0.7348024316109423,
- "grad_norm": 2.058887004852295,
- "learning_rate": 4.500154389411598e-06,
- "loss": 0.4977570176124573,
- "mean_token_accuracy": 0.8254626989364624,
- "num_tokens": 8782220.0,
- "step": 967
- },
- {
- "epoch": 0.7355623100303952,
- "grad_norm": 2.9977786540985107,
- "learning_rate": 4.498897223077582e-06,
- "loss": 0.4061415195465088,
- "mean_token_accuracy": 0.8752427101135254,
- "num_tokens": 8786120.0,
- "step": 968
- },
- {
- "epoch": 0.736322188449848,
- "grad_norm": 2.2636303901672363,
- "learning_rate": 4.49763865384055e-06,
- "loss": 0.5062161087989807,
- "mean_token_accuracy": 0.8171653747558594,
- "num_tokens": 8792459.0,
- "step": 969
- },
- {
- "epoch": 0.7370820668693009,
- "grad_norm": 1.8850842714309692,
- "learning_rate": 4.496378682583813e-06,
- "loss": 0.5014280676841736,
- "mean_token_accuracy": 0.8547511100769043,
- "num_tokens": 8800675.0,
- "step": 970
- },
- {
- "epoch": 0.7378419452887538,
- "grad_norm": 1.191985011100769,
- "learning_rate": 4.495117310191667e-06,
- "loss": 0.4713883101940155,
- "mean_token_accuracy": 0.8213596343994141,
- "num_tokens": 8820740.0,
- "step": 971
- },
- {
- "epoch": 0.7386018237082067,
- "grad_norm": 1.823000192642212,
- "learning_rate": 4.493854537549393e-06,
- "loss": 0.46332645416259766,
- "mean_token_accuracy": 0.8359860777854919,
- "num_tokens": 8828884.0,
- "step": 972
- },
- {
- "epoch": 0.7393617021276596,
- "grad_norm": 2.590446949005127,
- "learning_rate": 4.492590365543253e-06,
- "loss": 0.49074703454971313,
- "mean_token_accuracy": 0.8433758020401001,
- "num_tokens": 8833859.0,
- "step": 973
- },
- {
- "epoch": 0.7401215805471124,
- "grad_norm": 2.2762670516967773,
- "learning_rate": 4.491324795060491e-06,
- "loss": 0.39465656876564026,
- "mean_token_accuracy": 0.8734766244888306,
- "num_tokens": 8839350.0,
- "step": 974
- },
- {
- "epoch": 0.7408814589665653,
- "grad_norm": 2.698725461959839,
- "learning_rate": 4.490057826989333e-06,
- "loss": 0.5552085041999817,
- "mean_token_accuracy": 0.8132266998291016,
- "num_tokens": 8844373.0,
- "step": 975
- },
- {
- "epoch": 0.7416413373860182,
- "grad_norm": 2.704606294631958,
- "learning_rate": 4.488789462218988e-06,
- "loss": 0.3447791635990143,
- "mean_token_accuracy": 0.8736170530319214,
- "num_tokens": 8848236.0,
- "step": 976
- },
- {
- "epoch": 0.7424012158054711,
- "grad_norm": 3.1260716915130615,
- "learning_rate": 4.487519701639641e-06,
- "loss": 0.5945233702659607,
- "mean_token_accuracy": 0.7997599840164185,
- "num_tokens": 8852935.0,
- "step": 977
- },
- {
- "epoch": 0.743161094224924,
- "grad_norm": 1.6895452737808228,
- "learning_rate": 4.486248546142459e-06,
- "loss": 0.4823892116546631,
- "mean_token_accuracy": 0.8279662132263184,
- "num_tokens": 8861743.0,
- "step": 978
- },
- {
- "epoch": 0.743920972644377,
- "grad_norm": 1.9161452054977417,
- "learning_rate": 4.4849759966195885e-06,
- "loss": 0.5266581773757935,
- "mean_token_accuracy": 0.8218623399734497,
- "num_tokens": 8870601.0,
- "step": 979
- },
- {
- "epoch": 0.7446808510638298,
- "grad_norm": 1.6894301176071167,
- "learning_rate": 4.483702053964154e-06,
- "loss": 0.4186219573020935,
- "mean_token_accuracy": 0.8471781015396118,
- "num_tokens": 8885617.0,
- "step": 980
- },
- {
- "epoch": 0.7454407294832827,
- "grad_norm": 1.6319992542266846,
- "learning_rate": 4.482426719070258e-06,
- "loss": 0.541317880153656,
- "mean_token_accuracy": 0.8216162323951721,
- "num_tokens": 8897595.0,
- "step": 981
- },
- {
- "epoch": 0.7462006079027356,
- "grad_norm": 5.102413177490234,
- "learning_rate": 4.4811499928329775e-06,
- "loss": 0.3928517699241638,
- "mean_token_accuracy": 0.858033299446106,
- "num_tokens": 8901682.0,
- "step": 982
- },
- {
- "epoch": 0.7469604863221885,
- "grad_norm": 2.213860273361206,
- "learning_rate": 4.479871876148368e-06,
- "loss": 0.4276347756385803,
- "mean_token_accuracy": 0.8529798984527588,
- "num_tokens": 8908088.0,
- "step": 983
- },
- {
- "epoch": 0.7477203647416414,
- "grad_norm": 1.2180038690567017,
- "learning_rate": 4.478592369913464e-06,
- "loss": 0.3941590189933777,
- "mean_token_accuracy": 0.8608149290084839,
- "num_tokens": 8925876.0,
- "step": 984
- },
- {
- "epoch": 0.7484802431610942,
- "grad_norm": 2.849802255630493,
- "learning_rate": 4.477311475026271e-06,
- "loss": 0.42190325260162354,
- "mean_token_accuracy": 0.860505223274231,
- "num_tokens": 8930190.0,
- "step": 985
- },
- {
- "epoch": 0.7492401215805471,
- "grad_norm": 1.704128384590149,
- "learning_rate": 4.476029192385769e-06,
- "loss": 0.4786282777786255,
- "mean_token_accuracy": 0.8302322626113892,
- "num_tokens": 8938340.0,
- "step": 986
- },
- {
- "epoch": 0.75,
- "grad_norm": 2.06322979927063,
- "learning_rate": 4.474745522891915e-06,
- "loss": 0.4648786187171936,
- "mean_token_accuracy": 0.8366481065750122,
- "num_tokens": 8944633.0,
- "step": 987
- },
- {
- "epoch": 0.7507598784194529,
- "grad_norm": 2.0745396614074707,
- "learning_rate": 4.473460467445637e-06,
- "loss": 0.5744885206222534,
- "mean_token_accuracy": 0.8357284069061279,
- "num_tokens": 8954457.0,
- "step": 988
- },
- {
- "epoch": 0.7515197568389058,
- "grad_norm": 1.9281407594680786,
- "learning_rate": 4.472174026948836e-06,
- "loss": 0.528974175453186,
- "mean_token_accuracy": 0.8083580732345581,
- "num_tokens": 8962701.0,
- "step": 989
- },
- {
- "epoch": 0.7522796352583586,
- "grad_norm": 3.012381076812744,
- "learning_rate": 4.470886202304385e-06,
- "loss": 0.48754751682281494,
- "mean_token_accuracy": 0.8368391990661621,
- "num_tokens": 8967272.0,
- "step": 990
- },
- {
- "epoch": 0.7530395136778115,
- "grad_norm": 1.691826581954956,
- "learning_rate": 4.469596994416131e-06,
- "loss": 0.484740674495697,
- "mean_token_accuracy": 0.8500643968582153,
- "num_tokens": 8976615.0,
- "step": 991
- },
- {
- "epoch": 0.7537993920972644,
- "grad_norm": 2.4961965084075928,
- "learning_rate": 4.468306404188887e-06,
- "loss": 0.50777268409729,
- "mean_token_accuracy": 0.8168395757675171,
- "num_tokens": 8983235.0,
- "step": 992
- },
- {
- "epoch": 0.7545592705167173,
- "grad_norm": 1.512007713317871,
- "learning_rate": 4.467014432528441e-06,
- "loss": 0.4583340287208557,
- "mean_token_accuracy": 0.8465162515640259,
- "num_tokens": 8993815.0,
- "step": 993
- },
- {
- "epoch": 0.7553191489361702,
- "grad_norm": 1.9362257719039917,
- "learning_rate": 4.465721080341547e-06,
- "loss": 0.6027892827987671,
- "mean_token_accuracy": 0.8052380084991455,
- "num_tokens": 9002697.0,
- "step": 994
- },
- {
- "epoch": 0.756079027355623,
- "grad_norm": 2.473632335662842,
- "learning_rate": 4.4644263485359316e-06,
- "loss": 0.5394320487976074,
- "mean_token_accuracy": 0.834665834903717,
- "num_tokens": 9007428.0,
- "step": 995
- },
- {
- "epoch": 0.756838905775076,
- "grad_norm": 2.2527434825897217,
- "learning_rate": 4.463130238020284e-06,
- "loss": 0.5485198497772217,
- "mean_token_accuracy": 0.8090173006057739,
- "num_tokens": 9013570.0,
- "step": 996
- },
- {
- "epoch": 0.7575987841945289,
- "grad_norm": 1.4130940437316895,
- "learning_rate": 4.4618327497042676e-06,
- "loss": 0.37994423508644104,
- "mean_token_accuracy": 0.8625167012214661,
- "num_tokens": 9025485.0,
- "step": 997
- },
- {
- "epoch": 0.7583586626139818,
- "grad_norm": 2.685115098953247,
- "learning_rate": 4.460533884498509e-06,
- "loss": 0.447973370552063,
- "mean_token_accuracy": 0.8564165234565735,
- "num_tokens": 9030355.0,
- "step": 998
- },
- {
- "epoch": 0.7591185410334347,
- "grad_norm": 3.2743139266967773,
- "learning_rate": 4.4592336433146e-06,
- "loss": 0.45275989174842834,
- "mean_token_accuracy": 0.8462578058242798,
- "num_tokens": 9034406.0,
- "step": 999
- },
- {
- "epoch": 0.7598784194528876,
- "grad_norm": 1.9383049011230469,
- "learning_rate": 4.457932027065102e-06,
- "loss": 0.5387729406356812,
- "mean_token_accuracy": 0.8357330560684204,
- "num_tokens": 9041502.0,
- "step": 1000
- },
- {
- "epoch": 0.7606382978723404,
- "grad_norm": 2.7348275184631348,
- "learning_rate": 4.456629036663537e-06,
- "loss": 0.4448447823524475,
- "mean_token_accuracy": 0.8453642129898071,
- "num_tokens": 9046088.0,
- "step": 1001
- },
- {
- "epoch": 0.7613981762917933,
- "grad_norm": 1.8477401733398438,
- "learning_rate": 4.455324673024396e-06,
- "loss": 0.5766505002975464,
- "mean_token_accuracy": 0.8074213862419128,
- "num_tokens": 9055678.0,
- "step": 1002
- },
- {
- "epoch": 0.7621580547112462,
- "grad_norm": 3.134481430053711,
- "learning_rate": 4.4540189370631315e-06,
- "loss": 0.5690872669219971,
- "mean_token_accuracy": 0.8414670825004578,
- "num_tokens": 9062006.0,
- "step": 1003
- },
- {
- "epoch": 0.7629179331306991,
- "grad_norm": 1.7933398485183716,
- "learning_rate": 4.452711829696158e-06,
- "loss": 0.4898291826248169,
- "mean_token_accuracy": 0.8259007930755615,
- "num_tokens": 9070754.0,
- "step": 1004
- },
- {
- "epoch": 0.763677811550152,
- "grad_norm": 1.2552275657653809,
- "learning_rate": 4.451403351840855e-06,
- "loss": 0.4280198812484741,
- "mean_token_accuracy": 0.8409112691879272,
- "num_tokens": 9085306.0,
- "step": 1005
- },
- {
- "epoch": 0.7644376899696048,
- "grad_norm": 1.6749331951141357,
- "learning_rate": 4.450093504415562e-06,
- "loss": 0.3723178505897522,
- "mean_token_accuracy": 0.8545734882354736,
- "num_tokens": 9102453.0,
- "step": 1006
- },
- {
- "epoch": 0.7651975683890577,
- "grad_norm": 2.7514500617980957,
- "learning_rate": 4.44878228833958e-06,
- "loss": 0.5463190674781799,
- "mean_token_accuracy": 0.8121639490127563,
- "num_tokens": 9108342.0,
- "step": 1007
- },
- {
- "epoch": 0.7659574468085106,
- "grad_norm": 1.3322733640670776,
- "learning_rate": 4.447469704533172e-06,
- "loss": 0.573723316192627,
- "mean_token_accuracy": 0.8065711259841919,
- "num_tokens": 9123712.0,
- "step": 1008
- },
- {
- "epoch": 0.7667173252279635,
- "grad_norm": 2.6893765926361084,
- "learning_rate": 4.446155753917559e-06,
- "loss": 0.6856257915496826,
- "mean_token_accuracy": 0.7718256711959839,
- "num_tokens": 9130728.0,
- "step": 1009
- },
- {
- "epoch": 0.7674772036474165,
- "grad_norm": 1.792765498161316,
- "learning_rate": 4.444840437414923e-06,
- "loss": 0.48203110694885254,
- "mean_token_accuracy": 0.8419194221496582,
- "num_tokens": 9137983.0,
- "step": 1010
- },
- {
- "epoch": 0.7682370820668692,
- "grad_norm": 1.4957399368286133,
- "learning_rate": 4.443523755948401e-06,
- "loss": 0.4372181296348572,
- "mean_token_accuracy": 0.8491764664649963,
- "num_tokens": 9148081.0,
- "step": 1011
- },
- {
- "epoch": 0.7689969604863222,
- "grad_norm": 1.7294867038726807,
- "learning_rate": 4.442205710442095e-06,
- "loss": 0.54277503490448,
- "mean_token_accuracy": 0.8196806907653809,
- "num_tokens": 9158407.0,
- "step": 1012
- },
- {
- "epoch": 0.7697568389057751,
- "grad_norm": 2.2091221809387207,
- "learning_rate": 4.4408863018210564e-06,
- "loss": 0.4888187646865845,
- "mean_token_accuracy": 0.8384175300598145,
- "num_tokens": 9164754.0,
- "step": 1013
- },
- {
- "epoch": 0.770516717325228,
- "grad_norm": 1.7615830898284912,
- "learning_rate": 4.439565531011299e-06,
- "loss": 0.4640008211135864,
- "mean_token_accuracy": 0.8424701690673828,
- "num_tokens": 9172715.0,
- "step": 1014
- },
- {
- "epoch": 0.7712765957446809,
- "grad_norm": 1.6796128749847412,
- "learning_rate": 4.43824339893979e-06,
- "loss": 0.5227609276771545,
- "mean_token_accuracy": 0.8135923743247986,
- "num_tokens": 9183214.0,
- "step": 1015
- },
- {
- "epoch": 0.7720364741641338,
- "grad_norm": 2.1485698223114014,
- "learning_rate": 4.436919906534452e-06,
- "loss": 0.4857056140899658,
- "mean_token_accuracy": 0.8323013782501221,
- "num_tokens": 9190360.0,
- "step": 1016
- },
- {
- "epoch": 0.7727963525835866,
- "grad_norm": 2.7842206954956055,
- "learning_rate": 4.4355950547241645e-06,
- "loss": 0.46406883001327515,
- "mean_token_accuracy": 0.859869122505188,
- "num_tokens": 9194523.0,
- "step": 1017
- },
- {
- "epoch": 0.7735562310030395,
- "grad_norm": 2.3774640560150146,
- "learning_rate": 4.434268844438758e-06,
- "loss": 0.5625549554824829,
- "mean_token_accuracy": 0.8188897371292114,
- "num_tokens": 9201155.0,
- "step": 1018
- },
- {
- "epoch": 0.7743161094224924,
- "grad_norm": 2.004427909851074,
- "learning_rate": 4.432941276609018e-06,
- "loss": 0.5164387226104736,
- "mean_token_accuracy": 0.829569935798645,
- "num_tokens": 9209269.0,
- "step": 1019
- },
- {
- "epoch": 0.7750759878419453,
- "grad_norm": 1.7218989133834839,
- "learning_rate": 4.431612352166684e-06,
- "loss": 0.481005996465683,
- "mean_token_accuracy": 0.8359906673431396,
- "num_tokens": 9220860.0,
- "step": 1020
- },
- {
- "epoch": 0.7758358662613982,
- "grad_norm": 2.197108507156372,
- "learning_rate": 4.4302820720444454e-06,
- "loss": 0.440413236618042,
- "mean_token_accuracy": 0.8412867784500122,
- "num_tokens": 9226414.0,
- "step": 1021
- },
- {
- "epoch": 0.776595744680851,
- "grad_norm": 2.6995162963867188,
- "learning_rate": 4.428950437175944e-06,
- "loss": 0.3884299397468567,
- "mean_token_accuracy": 0.8696021437644958,
- "num_tokens": 9230898.0,
- "step": 1022
- },
- {
- "epoch": 0.7773556231003039,
- "grad_norm": 2.1671667098999023,
- "learning_rate": 4.427617448495772e-06,
- "loss": 0.5747478008270264,
- "mean_token_accuracy": 0.7842930555343628,
- "num_tokens": 9238479.0,
- "step": 1023
- },
- {
- "epoch": 0.7781155015197568,
- "grad_norm": 1.6299028396606445,
- "learning_rate": 4.426283106939474e-06,
- "loss": 0.39478403329849243,
- "mean_token_accuracy": 0.8685503602027893,
- "num_tokens": 9248263.0,
- "step": 1024
- },
- {
- "epoch": 0.7788753799392097,
- "grad_norm": 2.2621798515319824,
- "learning_rate": 4.424947413443539e-06,
- "loss": 0.4582178592681885,
- "mean_token_accuracy": 0.8312377333641052,
- "num_tokens": 9254168.0,
- "step": 1025
- },
- {
- "epoch": 0.7796352583586627,
- "grad_norm": 2.121091365814209,
- "learning_rate": 4.423610368945411e-06,
- "loss": 0.5315121412277222,
- "mean_token_accuracy": 0.8121483325958252,
- "num_tokens": 9261808.0,
- "step": 1026
- },
- {
- "epoch": 0.7803951367781155,
- "grad_norm": 1.8558297157287598,
- "learning_rate": 4.422271974383479e-06,
- "loss": 0.4299176037311554,
- "mean_token_accuracy": 0.8452648520469666,
- "num_tokens": 9269264.0,
- "step": 1027
- },
- {
- "epoch": 0.7811550151975684,
- "grad_norm": 1.9089949131011963,
- "learning_rate": 4.420932230697079e-06,
- "loss": 0.43876272439956665,
- "mean_token_accuracy": 0.8434094190597534,
- "num_tokens": 9277381.0,
- "step": 1028
- },
- {
- "epoch": 0.7819148936170213,
- "grad_norm": 1.8619649410247803,
- "learning_rate": 4.419591138826495e-06,
- "loss": 0.48798668384552,
- "mean_token_accuracy": 0.8281317353248596,
- "num_tokens": 9285413.0,
- "step": 1029
- },
- {
- "epoch": 0.7826747720364742,
- "grad_norm": 1.3273087739944458,
- "learning_rate": 4.418248699712955e-06,
- "loss": 0.4611460864543915,
- "mean_token_accuracy": 0.8233213424682617,
- "num_tokens": 9300805.0,
- "step": 1030
- },
- {
- "epoch": 0.7834346504559271,
- "grad_norm": 1.0473746061325073,
- "learning_rate": 4.416904914298637e-06,
- "loss": 0.36537665128707886,
- "mean_token_accuracy": 0.8671857118606567,
- "num_tokens": 9320035.0,
- "step": 1031
- },
- {
- "epoch": 0.78419452887538,
- "grad_norm": 1.9130918979644775,
- "learning_rate": 4.415559783526661e-06,
- "loss": 0.4916655123233795,
- "mean_token_accuracy": 0.8266351222991943,
- "num_tokens": 9326795.0,
- "step": 1032
- },
- {
- "epoch": 0.7849544072948328,
- "grad_norm": 2.0001816749572754,
- "learning_rate": 4.414213308341092e-06,
- "loss": 0.5711008310317993,
- "mean_token_accuracy": 0.8093076348304749,
- "num_tokens": 9335625.0,
- "step": 1033
- },
- {
- "epoch": 0.7857142857142857,
- "grad_norm": 3.933542251586914,
- "learning_rate": 4.412865489686936e-06,
- "loss": 0.621616542339325,
- "mean_token_accuracy": 0.7938898801803589,
- "num_tokens": 9339080.0,
- "step": 1034
- },
- {
- "epoch": 0.7864741641337386,
- "grad_norm": 2.061558961868286,
- "learning_rate": 4.411516328510145e-06,
- "loss": 0.583686113357544,
- "mean_token_accuracy": 0.8216883540153503,
- "num_tokens": 9348581.0,
- "step": 1035
- },
- {
- "epoch": 0.7872340425531915,
- "grad_norm": 1.9401264190673828,
- "learning_rate": 4.410165825757613e-06,
- "loss": 0.4905240535736084,
- "mean_token_accuracy": 0.8229951858520508,
- "num_tokens": 9356032.0,
- "step": 1036
- },
- {
- "epoch": 0.7879939209726444,
- "grad_norm": 3.620547294616699,
- "learning_rate": 4.408813982377175e-06,
- "loss": 0.4269888997077942,
- "mean_token_accuracy": 0.8713940978050232,
- "num_tokens": 9359061.0,
- "step": 1037
- },
- {
- "epoch": 0.7887537993920972,
- "grad_norm": 1.2027851343154907,
- "learning_rate": 4.407460799317605e-06,
- "loss": 0.39972418546676636,
- "mean_token_accuracy": 0.8610097765922546,
- "num_tokens": 9377068.0,
- "step": 1038
- },
- {
- "epoch": 0.7895136778115501,
- "grad_norm": 2.566753387451172,
- "learning_rate": 4.40610627752862e-06,
- "loss": 0.45267152786254883,
- "mean_token_accuracy": 0.83243328332901,
- "num_tokens": 9383604.0,
- "step": 1039
- },
- {
- "epoch": 0.790273556231003,
- "grad_norm": 2.940094470977783,
- "learning_rate": 4.404750417960876e-06,
- "loss": 0.42862242460250854,
- "mean_token_accuracy": 0.8582849502563477,
- "num_tokens": 9387541.0,
- "step": 1040
- },
- {
- "epoch": 0.791033434650456,
- "grad_norm": 2.0223944187164307,
- "learning_rate": 4.403393221565966e-06,
- "loss": 0.4349963665008545,
- "mean_token_accuracy": 0.8453047871589661,
- "num_tokens": 9394382.0,
- "step": 1041
- },
- {
- "epoch": 0.7917933130699089,
- "grad_norm": 2.9399030208587646,
- "learning_rate": 4.402034689296425e-06,
- "loss": 0.32197174429893494,
- "mean_token_accuracy": 0.8953392505645752,
- "num_tokens": 9397741.0,
- "step": 1042
- },
- {
- "epoch": 0.7925531914893617,
- "grad_norm": 2.819016456604004,
- "learning_rate": 4.400674822105721e-06,
- "loss": 0.6790289878845215,
- "mean_token_accuracy": 0.8135063648223877,
- "num_tokens": 9403509.0,
- "step": 1043
- },
- {
- "epoch": 0.7933130699088146,
- "grad_norm": 1.3225977420806885,
- "learning_rate": 4.399313620948262e-06,
- "loss": 0.42203834652900696,
- "mean_token_accuracy": 0.8399381637573242,
- "num_tokens": 9418870.0,
- "step": 1044
- },
- {
- "epoch": 0.7940729483282675,
- "grad_norm": 1.7822176218032837,
- "learning_rate": 4.397951086779392e-06,
- "loss": 0.4666554927825928,
- "mean_token_accuracy": 0.8364764451980591,
- "num_tokens": 9427640.0,
- "step": 1045
- },
- {
- "epoch": 0.7948328267477204,
- "grad_norm": 3.186439037322998,
- "learning_rate": 4.396587220555389e-06,
- "loss": 0.6048363447189331,
- "mean_token_accuracy": 0.7806557416915894,
- "num_tokens": 9431927.0,
- "step": 1046
- },
- {
- "epoch": 0.7955927051671733,
- "grad_norm": 3.0804805755615234,
- "learning_rate": 4.395222023233467e-06,
- "loss": 0.445969820022583,
- "mean_token_accuracy": 0.850671112537384,
- "num_tokens": 9436136.0,
- "step": 1047
- },
- {
- "epoch": 0.7963525835866262,
- "grad_norm": 1.675968885421753,
- "learning_rate": 4.393855495771774e-06,
- "loss": 0.4311422109603882,
- "mean_token_accuracy": 0.8449079990386963,
- "num_tokens": 9445189.0,
- "step": 1048
- },
- {
- "epoch": 0.797112462006079,
- "grad_norm": 2.342410087585449,
- "learning_rate": 4.3924876391293915e-06,
- "loss": 0.5733606219291687,
- "mean_token_accuracy": 0.8156592845916748,
- "num_tokens": 9451939.0,
- "step": 1049
- },
- {
- "epoch": 0.7978723404255319,
- "grad_norm": 1.5967470407485962,
- "learning_rate": 4.391118454266335e-06,
- "loss": 0.46664729714393616,
- "mean_token_accuracy": 0.8091695308685303,
- "num_tokens": 9463968.0,
- "step": 1050
- },
- {
- "epoch": 0.7986322188449848,
- "grad_norm": 1.5777863264083862,
- "learning_rate": 4.389747942143549e-06,
- "loss": 0.46028903126716614,
- "mean_token_accuracy": 0.8347330093383789,
- "num_tokens": 9475561.0,
- "step": 1051
- },
- {
- "epoch": 0.7993920972644377,
- "grad_norm": 2.7630488872528076,
- "learning_rate": 4.388376103722914e-06,
- "loss": 0.5618188977241516,
- "mean_token_accuracy": 0.8273467421531677,
- "num_tokens": 9480661.0,
- "step": 1052
- },
- {
- "epoch": 0.8001519756838906,
- "grad_norm": 2.093397378921509,
- "learning_rate": 4.387002939967237e-06,
- "loss": 0.2998353838920593,
- "mean_token_accuracy": 0.8905231952667236,
- "num_tokens": 9485924.0,
- "step": 1053
- },
- {
- "epoch": 0.8009118541033434,
- "grad_norm": 1.4385871887207031,
- "learning_rate": 4.38562845184026e-06,
- "loss": 0.4944111704826355,
- "mean_token_accuracy": 0.8403056263923645,
- "num_tokens": 9500128.0,
- "step": 1054
- },
- {
- "epoch": 0.8016717325227963,
- "grad_norm": 1.6393156051635742,
- "learning_rate": 4.384252640306649e-06,
- "loss": 0.5727907419204712,
- "mean_token_accuracy": 0.7849414348602295,
- "num_tokens": 9511569.0,
- "step": 1055
- },
- {
- "epoch": 0.8024316109422492,
- "grad_norm": 2.3909664154052734,
- "learning_rate": 4.382875506332002e-06,
- "loss": 0.4760419726371765,
- "mean_token_accuracy": 0.8408266305923462,
- "num_tokens": 9517244.0,
- "step": 1056
- },
- {
- "epoch": 0.8031914893617021,
- "grad_norm": 1.7288594245910645,
- "learning_rate": 4.381497050882845e-06,
- "loss": 0.5375926494598389,
- "mean_token_accuracy": 0.8138614892959595,
- "num_tokens": 9528736.0,
- "step": 1057
- },
- {
- "epoch": 0.8039513677811551,
- "grad_norm": 2.093407392501831,
- "learning_rate": 4.380117274926632e-06,
- "loss": 0.46659404039382935,
- "mean_token_accuracy": 0.8450702428817749,
- "num_tokens": 9536200.0,
- "step": 1058
- },
- {
- "epoch": 0.8047112462006079,
- "grad_norm": 1.6835898160934448,
- "learning_rate": 4.3787361794317405e-06,
- "loss": 0.43157699704170227,
- "mean_token_accuracy": 0.8279973268508911,
- "num_tokens": 9546314.0,
- "step": 1059
- },
- {
- "epoch": 0.8054711246200608,
- "grad_norm": 1.983067512512207,
- "learning_rate": 4.377353765367479e-06,
- "loss": 0.5021739602088928,
- "mean_token_accuracy": 0.8274815082550049,
- "num_tokens": 9554375.0,
- "step": 1060
- },
- {
- "epoch": 0.8062310030395137,
- "grad_norm": 2.0472030639648438,
- "learning_rate": 4.375970033704078e-06,
- "loss": 0.34298190474510193,
- "mean_token_accuracy": 0.8900876045227051,
- "num_tokens": 9560230.0,
- "step": 1061
- },
- {
- "epoch": 0.8069908814589666,
- "grad_norm": 1.9613717794418335,
- "learning_rate": 4.374584985412692e-06,
- "loss": 0.3826758861541748,
- "mean_token_accuracy": 0.839923620223999,
- "num_tokens": 9566809.0,
- "step": 1062
- },
- {
- "epoch": 0.8077507598784195,
- "grad_norm": 1.991289496421814,
- "learning_rate": 4.373198621465405e-06,
- "loss": 0.5492525100708008,
- "mean_token_accuracy": 0.8153272867202759,
- "num_tokens": 9576810.0,
- "step": 1063
- },
- {
- "epoch": 0.8085106382978723,
- "grad_norm": 2.421370506286621,
- "learning_rate": 4.3718109428352155e-06,
- "loss": 0.5240297317504883,
- "mean_token_accuracy": 0.8087242245674133,
- "num_tokens": 9582906.0,
- "step": 1064
- },
- {
- "epoch": 0.8092705167173252,
- "grad_norm": 3.697765588760376,
- "learning_rate": 4.370421950496055e-06,
- "loss": 0.6096476912498474,
- "mean_token_accuracy": 0.787585973739624,
- "num_tokens": 9586920.0,
- "step": 1065
- },
- {
- "epoch": 0.8100303951367781,
- "grad_norm": 2.0767786502838135,
- "learning_rate": 4.369031645422768e-06,
- "loss": 0.41120079159736633,
- "mean_token_accuracy": 0.8513731956481934,
- "num_tokens": 9593902.0,
- "step": 1066
- },
- {
- "epoch": 0.810790273556231,
- "grad_norm": 2.5968732833862305,
- "learning_rate": 4.367640028591126e-06,
- "loss": 0.3364982008934021,
- "mean_token_accuracy": 0.8786963224411011,
- "num_tokens": 9597745.0,
- "step": 1067
- },
- {
- "epoch": 0.8115501519756839,
- "grad_norm": 2.165742874145508,
- "learning_rate": 4.366247100977818e-06,
- "loss": 0.406129390001297,
- "mean_token_accuracy": 0.868243932723999,
- "num_tokens": 9603496.0,
- "step": 1068
- },
- {
- "epoch": 0.8123100303951368,
- "grad_norm": 2.0493404865264893,
- "learning_rate": 4.364852863560456e-06,
- "loss": 0.5356296300888062,
- "mean_token_accuracy": 0.8191947340965271,
- "num_tokens": 9610898.0,
- "step": 1069
- },
- {
- "epoch": 0.8130699088145896,
- "grad_norm": 2.3224308490753174,
- "learning_rate": 4.363457317317568e-06,
- "loss": 0.41461923718452454,
- "mean_token_accuracy": 0.8537945747375488,
- "num_tokens": 9616626.0,
- "step": 1070
- },
- {
- "epoch": 0.8138297872340425,
- "grad_norm": 1.7387986183166504,
- "learning_rate": 4.362060463228603e-06,
- "loss": 0.5134786367416382,
- "mean_token_accuracy": 0.8511737585067749,
- "num_tokens": 9626223.0,
- "step": 1071
- },
- {
- "epoch": 0.8145896656534954,
- "grad_norm": 3.0270655155181885,
- "learning_rate": 4.360662302273926e-06,
- "loss": 0.3410695791244507,
- "mean_token_accuracy": 0.8746449947357178,
- "num_tokens": 9629455.0,
- "step": 1072
- },
- {
- "epoch": 0.8153495440729484,
- "grad_norm": 1.7727062702178955,
- "learning_rate": 4.35926283543482e-06,
- "loss": 0.4610968828201294,
- "mean_token_accuracy": 0.8444793224334717,
- "num_tokens": 9638070.0,
- "step": 1073
- },
- {
- "epoch": 0.8161094224924013,
- "grad_norm": 3.6333565711975098,
- "learning_rate": 4.357862063693486e-06,
- "loss": 0.3881273865699768,
- "mean_token_accuracy": 0.8757344484329224,
- "num_tokens": 9641028.0,
- "step": 1074
- },
- {
- "epoch": 0.8168693009118541,
- "grad_norm": 3.024042844772339,
- "learning_rate": 4.356459988033039e-06,
- "loss": 0.3853808641433716,
- "mean_token_accuracy": 0.8602254390716553,
- "num_tokens": 9645730.0,
- "step": 1075
- },
- {
- "epoch": 0.817629179331307,
- "grad_norm": 2.3359482288360596,
- "learning_rate": 4.355056609437509e-06,
- "loss": 0.4852045476436615,
- "mean_token_accuracy": 0.8502728343009949,
- "num_tokens": 9650975.0,
- "step": 1076
- },
- {
- "epoch": 0.8183890577507599,
- "grad_norm": 2.2390685081481934,
- "learning_rate": 4.353651928891842e-06,
- "loss": 0.5287341475486755,
- "mean_token_accuracy": 0.8247801065444946,
- "num_tokens": 9657471.0,
- "step": 1077
- },
- {
- "epoch": 0.8191489361702128,
- "grad_norm": 2.3809144496917725,
- "learning_rate": 4.352245947381897e-06,
- "loss": 0.5218510627746582,
- "mean_token_accuracy": 0.8149170875549316,
- "num_tokens": 9664108.0,
- "step": 1078
- },
- {
- "epoch": 0.8199088145896657,
- "grad_norm": 1.7072309255599976,
- "learning_rate": 4.3508386658944455e-06,
- "loss": 0.46481168270111084,
- "mean_token_accuracy": 0.834963321685791,
- "num_tokens": 9673175.0,
- "step": 1079
- },
- {
- "epoch": 0.8206686930091185,
- "grad_norm": 1.7383702993392944,
- "learning_rate": 4.349430085417171e-06,
- "loss": 0.4505952000617981,
- "mean_token_accuracy": 0.8507769107818604,
- "num_tokens": 9682800.0,
- "step": 1080
- },
- {
- "epoch": 0.8214285714285714,
- "grad_norm": 2.4308547973632812,
- "learning_rate": 4.348020206938672e-06,
- "loss": 0.4832455515861511,
- "mean_token_accuracy": 0.8538393974304199,
- "num_tokens": 9688123.0,
- "step": 1081
- },
- {
- "epoch": 0.8221884498480243,
- "grad_norm": 2.2686192989349365,
- "learning_rate": 4.3466090314484526e-06,
- "loss": 0.5112563371658325,
- "mean_token_accuracy": 0.8308460712432861,
- "num_tokens": 9694299.0,
- "step": 1082
- },
- {
- "epoch": 0.8229483282674772,
- "grad_norm": 2.806093454360962,
- "learning_rate": 4.345196559936931e-06,
- "loss": 0.4818246364593506,
- "mean_token_accuracy": 0.86617112159729,
- "num_tokens": 9698471.0,
- "step": 1083
- },
- {
- "epoch": 0.8237082066869301,
- "grad_norm": 1.7340706586837769,
- "learning_rate": 4.343782793395435e-06,
- "loss": 0.38246971368789673,
- "mean_token_accuracy": 0.8675198554992676,
- "num_tokens": 9706444.0,
- "step": 1084
- },
- {
- "epoch": 0.824468085106383,
- "grad_norm": 1.664942741394043,
- "learning_rate": 4.3423677328162e-06,
- "loss": 0.498797208070755,
- "mean_token_accuracy": 0.8447319865226746,
- "num_tokens": 9716765.0,
- "step": 1085
- },
- {
- "epoch": 0.8252279635258358,
- "grad_norm": 1.3608235120773315,
- "learning_rate": 4.340951379192369e-06,
- "loss": 0.41961491107940674,
- "mean_token_accuracy": 0.8339346647262573,
- "num_tokens": 9729564.0,
- "step": 1086
- },
- {
- "epoch": 0.8259878419452887,
- "grad_norm": 1.642503261566162,
- "learning_rate": 4.3395337335179945e-06,
- "loss": 0.5477945804595947,
- "mean_token_accuracy": 0.8117889761924744,
- "num_tokens": 9741217.0,
- "step": 1087
- },
- {
- "epoch": 0.8267477203647416,
- "grad_norm": 3.0345044136047363,
- "learning_rate": 4.338114796788035e-06,
- "loss": 0.5024623870849609,
- "mean_token_accuracy": 0.8333141207695007,
- "num_tokens": 9744941.0,
- "step": 1088
- },
- {
- "epoch": 0.8275075987841946,
- "grad_norm": 1.3096630573272705,
- "learning_rate": 4.336694569998354e-06,
- "loss": 0.44169723987579346,
- "mean_token_accuracy": 0.859926700592041,
- "num_tokens": 9757854.0,
- "step": 1089
- },
- {
- "epoch": 0.8282674772036475,
- "grad_norm": 2.203279495239258,
- "learning_rate": 4.3352730541457215e-06,
- "loss": 0.5283265113830566,
- "mean_token_accuracy": 0.8053759932518005,
- "num_tokens": 9764096.0,
- "step": 1090
- },
- {
- "epoch": 0.8290273556231003,
- "grad_norm": 1.3774312734603882,
- "learning_rate": 4.333850250227814e-06,
- "loss": 0.4584103226661682,
- "mean_token_accuracy": 0.8342611193656921,
- "num_tokens": 9777768.0,
- "step": 1091
- },
- {
- "epoch": 0.8297872340425532,
- "grad_norm": 1.822637915611267,
- "learning_rate": 4.332426159243206e-06,
- "loss": 0.5432791709899902,
- "mean_token_accuracy": 0.8136210441589355,
- "num_tokens": 9791276.0,
- "step": 1092
- },
- {
- "epoch": 0.8305471124620061,
- "grad_norm": 3.0190067291259766,
- "learning_rate": 4.331000782191384e-06,
- "loss": 0.5018150806427002,
- "mean_token_accuracy": 0.8234807252883911,
- "num_tokens": 9794902.0,
- "step": 1093
- },
- {
- "epoch": 0.831306990881459,
- "grad_norm": 2.09987735748291,
- "learning_rate": 4.329574120072728e-06,
- "loss": 0.4270891547203064,
- "mean_token_accuracy": 0.8544977903366089,
- "num_tokens": 9800903.0,
- "step": 1094
- },
- {
- "epoch": 0.8320668693009119,
- "grad_norm": 1.969549536705017,
- "learning_rate": 4.328146173888528e-06,
- "loss": 0.45801427960395813,
- "mean_token_accuracy": 0.8334714770317078,
- "num_tokens": 9808719.0,
- "step": 1095
- },
- {
- "epoch": 0.8328267477203647,
- "grad_norm": 1.4565571546554565,
- "learning_rate": 4.32671694464097e-06,
- "loss": 0.34864288568496704,
- "mean_token_accuracy": 0.8689061999320984,
- "num_tokens": 9818262.0,
- "step": 1096
- },
- {
- "epoch": 0.8335866261398176,
- "grad_norm": 1.2163832187652588,
- "learning_rate": 4.3252864333331424e-06,
- "loss": 0.37953704595565796,
- "mean_token_accuracy": 0.866554856300354,
- "num_tokens": 9833942.0,
- "step": 1097
- },
- {
- "epoch": 0.8343465045592705,
- "grad_norm": 1.6112010478973389,
- "learning_rate": 4.323854640969033e-06,
- "loss": 0.5442801713943481,
- "mean_token_accuracy": 0.8190416097640991,
- "num_tokens": 9844765.0,
- "step": 1098
- },
- {
- "epoch": 0.8351063829787234,
- "grad_norm": 1.8190315961837769,
- "learning_rate": 4.322421568553529e-06,
- "loss": 0.48271381855010986,
- "mean_token_accuracy": 0.8203652501106262,
- "num_tokens": 9852625.0,
- "step": 1099
- },
- {
- "epoch": 0.8358662613981763,
- "grad_norm": 2.7897756099700928,
- "learning_rate": 4.320987217092416e-06,
- "loss": 0.4086323380470276,
- "mean_token_accuracy": 0.8504934310913086,
- "num_tokens": 9856888.0,
- "step": 1100
- },
- {
- "epoch": 0.8366261398176292,
- "grad_norm": 1.7035977840423584,
- "learning_rate": 4.319551587592377e-06,
- "loss": 0.6325064301490784,
- "mean_token_accuracy": 0.788190484046936,
- "num_tokens": 9869419.0,
- "step": 1101
- },
- {
- "epoch": 0.837386018237082,
- "grad_norm": 2.609731912612915,
- "learning_rate": 4.318114681060989e-06,
- "loss": 0.519314706325531,
- "mean_token_accuracy": 0.8469992280006409,
- "num_tokens": 9874553.0,
- "step": 1102
- },
- {
- "epoch": 0.8381458966565349,
- "grad_norm": 1.2519766092300415,
- "learning_rate": 4.316676498506735e-06,
- "loss": 0.3566005825996399,
- "mean_token_accuracy": 0.8588439226150513,
- "num_tokens": 9886498.0,
- "step": 1103
- },
- {
- "epoch": 0.8389057750759878,
- "grad_norm": 1.430892825126648,
- "learning_rate": 4.3152370409389795e-06,
- "loss": 0.5250182747840881,
- "mean_token_accuracy": 0.8164948225021362,
- "num_tokens": 9900256.0,
- "step": 1104
- },
- {
- "epoch": 0.8396656534954408,
- "grad_norm": 3.1245436668395996,
- "learning_rate": 4.3137963093679945e-06,
- "loss": 0.3173971176147461,
- "mean_token_accuracy": 0.8835347890853882,
- "num_tokens": 9903899.0,
- "step": 1105
- },
- {
- "epoch": 0.8404255319148937,
- "grad_norm": 3.131812572479248,
- "learning_rate": 4.3123543048049395e-06,
- "loss": 0.6567763090133667,
- "mean_token_accuracy": 0.8233605027198792,
- "num_tokens": 9908798.0,
- "step": 1106
- },
- {
- "epoch": 0.8411854103343465,
- "grad_norm": 1.3551725149154663,
- "learning_rate": 4.310911028261867e-06,
- "loss": 0.3993729054927826,
- "mean_token_accuracy": 0.8529655933380127,
- "num_tokens": 9922577.0,
- "step": 1107
- },
- {
- "epoch": 0.8419452887537994,
- "grad_norm": 2.572533130645752,
- "learning_rate": 4.309466480751726e-06,
- "loss": 0.40906503796577454,
- "mean_token_accuracy": 0.8630726933479309,
- "num_tokens": 9926890.0,
- "step": 1108
- },
- {
- "epoch": 0.8427051671732523,
- "grad_norm": 1.9146469831466675,
- "learning_rate": 4.308020663288356e-06,
- "loss": 0.48423194885253906,
- "mean_token_accuracy": 0.8370280861854553,
- "num_tokens": 9934293.0,
- "step": 1109
- },
- {
- "epoch": 0.8434650455927052,
- "grad_norm": 1.6178001165390015,
- "learning_rate": 4.306573576886485e-06,
- "loss": 0.4262213408946991,
- "mean_token_accuracy": 0.839401125907898,
- "num_tokens": 9944513.0,
- "step": 1110
- },
- {
- "epoch": 0.8442249240121581,
- "grad_norm": 2.4444572925567627,
- "learning_rate": 4.305125222561736e-06,
- "loss": 0.5199950933456421,
- "mean_token_accuracy": 0.8507720232009888,
- "num_tokens": 9949512.0,
- "step": 1111
- },
- {
- "epoch": 0.8449848024316109,
- "grad_norm": 1.7983134984970093,
- "learning_rate": 4.303675601330618e-06,
- "loss": 0.36155956983566284,
- "mean_token_accuracy": 0.8568712472915649,
- "num_tokens": 9956402.0,
- "step": 1112
- },
- {
- "epoch": 0.8457446808510638,
- "grad_norm": 2.391096353530884,
- "learning_rate": 4.302224714210532e-06,
- "loss": 0.5391949415206909,
- "mean_token_accuracy": 0.8183057308197021,
- "num_tokens": 9961606.0,
- "step": 1113
- },
- {
- "epoch": 0.8465045592705167,
- "grad_norm": 1.8520214557647705,
- "learning_rate": 4.3007725622197675e-06,
- "loss": 0.5758882761001587,
- "mean_token_accuracy": 0.7924330234527588,
- "num_tokens": 9971473.0,
- "step": 1114
- },
- {
- "epoch": 0.8472644376899696,
- "grad_norm": 2.436640739440918,
- "learning_rate": 4.2993191463775e-06,
- "loss": 0.3837985396385193,
- "mean_token_accuracy": 0.8620110750198364,
- "num_tokens": 9976333.0,
- "step": 1115
- },
- {
- "epoch": 0.8480243161094225,
- "grad_norm": 1.7287120819091797,
- "learning_rate": 4.29786446770379e-06,
- "loss": 0.40066856145858765,
- "mean_token_accuracy": 0.8618333339691162,
- "num_tokens": 9985617.0,
- "step": 1116
- },
- {
- "epoch": 0.8487841945288754,
- "grad_norm": 2.0310518741607666,
- "learning_rate": 4.296408527219592e-06,
- "loss": 0.5465943217277527,
- "mean_token_accuracy": 0.812044620513916,
- "num_tokens": 9995363.0,
- "step": 1117
- },
- {
- "epoch": 0.8495440729483282,
- "grad_norm": 1.4858589172363281,
- "learning_rate": 4.294951325946737e-06,
- "loss": 0.45840176939964294,
- "mean_token_accuracy": 0.8432979583740234,
- "num_tokens": 10006400.0,
- "step": 1118
- },
- {
- "epoch": 0.8503039513677811,
- "grad_norm": 1.6153514385223389,
- "learning_rate": 4.293492864907947e-06,
- "loss": 0.5225611925125122,
- "mean_token_accuracy": 0.8180211186408997,
- "num_tokens": 10018352.0,
- "step": 1119
- },
- {
- "epoch": 0.851063829787234,
- "grad_norm": 2.1178412437438965,
- "learning_rate": 4.2920331451268246e-06,
- "loss": 0.5580621361732483,
- "mean_token_accuracy": 0.8211709260940552,
- "num_tokens": 10025614.0,
- "step": 1120
- },
- {
- "epoch": 0.851823708206687,
- "grad_norm": 2.036839246749878,
- "learning_rate": 4.2905721676278585e-06,
- "loss": 0.4658433198928833,
- "mean_token_accuracy": 0.8380423784255981,
- "num_tokens": 10032489.0,
- "step": 1121
- },
- {
- "epoch": 0.8525835866261399,
- "grad_norm": 2.0056262016296387,
- "learning_rate": 4.28910993343642e-06,
- "loss": 0.47023308277130127,
- "mean_token_accuracy": 0.8340359926223755,
- "num_tokens": 10040050.0,
- "step": 1122
- },
- {
- "epoch": 0.8533434650455927,
- "grad_norm": 2.540024518966675,
- "learning_rate": 4.2876464435787576e-06,
- "loss": 0.502303957939148,
- "mean_token_accuracy": 0.8288739919662476,
- "num_tokens": 10045042.0,
- "step": 1123
- },
- {
- "epoch": 0.8541033434650456,
- "grad_norm": 1.7894693613052368,
- "learning_rate": 4.286181699082008e-06,
- "loss": 0.4732973575592041,
- "mean_token_accuracy": 0.8340568542480469,
- "num_tokens": 10054424.0,
- "step": 1124
- },
- {
- "epoch": 0.8548632218844985,
- "grad_norm": 1.5601223707199097,
- "learning_rate": 4.284715700974186e-06,
- "loss": 0.472471684217453,
- "mean_token_accuracy": 0.8274722695350647,
- "num_tokens": 10065523.0,
- "step": 1125
- },
- {
- "epoch": 0.8556231003039514,
- "grad_norm": 1.7326055765151978,
- "learning_rate": 4.283248450284182e-06,
- "loss": 0.5924872159957886,
- "mean_token_accuracy": 0.7943467497825623,
- "num_tokens": 10076839.0,
- "step": 1126
- },
- {
- "epoch": 0.8563829787234043,
- "grad_norm": 1.5165479183197021,
- "learning_rate": 4.281779948041772e-06,
- "loss": 0.44768425822257996,
- "mean_token_accuracy": 0.8394696712493896,
- "num_tokens": 10088168.0,
- "step": 1127
- },
- {
- "epoch": 0.8571428571428571,
- "grad_norm": 1.5448920726776123,
- "learning_rate": 4.280310195277606e-06,
- "loss": 0.4458175003528595,
- "mean_token_accuracy": 0.835773229598999,
- "num_tokens": 10100306.0,
- "step": 1128
- },
- {
- "epoch": 0.85790273556231,
- "grad_norm": 1.6311609745025635,
- "learning_rate": 4.278839193023214e-06,
- "loss": 0.4158072769641876,
- "mean_token_accuracy": 0.8482539653778076,
- "num_tokens": 10110581.0,
- "step": 1129
- },
- {
- "epoch": 0.8586626139817629,
- "grad_norm": 1.6714754104614258,
- "learning_rate": 4.277366942311001e-06,
- "loss": 0.3686875104904175,
- "mean_token_accuracy": 0.8681533336639404,
- "num_tokens": 10118799.0,
- "step": 1130
- },
- {
- "epoch": 0.8594224924012158,
- "grad_norm": 2.1604413986206055,
- "learning_rate": 4.2758934441742494e-06,
- "loss": 0.37267982959747314,
- "mean_token_accuracy": 0.8520427346229553,
- "num_tokens": 10124734.0,
- "step": 1131
- },
- {
- "epoch": 0.8601823708206687,
- "grad_norm": 2.123013973236084,
- "learning_rate": 4.274418699647117e-06,
- "loss": 0.49963313341140747,
- "mean_token_accuracy": 0.8248758912086487,
- "num_tokens": 10131965.0,
- "step": 1132
- },
- {
- "epoch": 0.8609422492401215,
- "grad_norm": 1.4308786392211914,
- "learning_rate": 4.272942709764638e-06,
- "loss": 0.48666873574256897,
- "mean_token_accuracy": 0.8304717540740967,
- "num_tokens": 10145164.0,
- "step": 1133
- },
- {
- "epoch": 0.8617021276595744,
- "grad_norm": 1.7952618598937988,
- "learning_rate": 4.271465475562716e-06,
- "loss": 0.5536223649978638,
- "mean_token_accuracy": 0.8093959093093872,
- "num_tokens": 10154083.0,
- "step": 1134
- },
- {
- "epoch": 0.8624620060790273,
- "grad_norm": 2.0622456073760986,
- "learning_rate": 4.269986998078132e-06,
- "loss": 0.5173629522323608,
- "mean_token_accuracy": 0.8285619020462036,
- "num_tokens": 10161889.0,
- "step": 1135
- },
- {
- "epoch": 0.8632218844984803,
- "grad_norm": 2.0707509517669678,
- "learning_rate": 4.268507278348539e-06,
- "loss": 0.5871608257293701,
- "mean_token_accuracy": 0.7827386856079102,
- "num_tokens": 10170726.0,
- "step": 1136
- },
- {
- "epoch": 0.8639817629179332,
- "grad_norm": 2.054368257522583,
- "learning_rate": 4.2670263174124615e-06,
- "loss": 0.5788969993591309,
- "mean_token_accuracy": 0.7967237234115601,
- "num_tokens": 10178474.0,
- "step": 1137
- },
- {
- "epoch": 0.8647416413373861,
- "grad_norm": 1.901846170425415,
- "learning_rate": 4.265544116309294e-06,
- "loss": 0.5405587553977966,
- "mean_token_accuracy": 0.8151819705963135,
- "num_tokens": 10187013.0,
- "step": 1138
- },
- {
- "epoch": 0.8655015197568389,
- "grad_norm": 2.901285409927368,
- "learning_rate": 4.264060676079302e-06,
- "loss": 0.44101861119270325,
- "mean_token_accuracy": 0.8433429002761841,
- "num_tokens": 10191517.0,
- "step": 1139
- },
- {
- "epoch": 0.8662613981762918,
- "grad_norm": 2.4168388843536377,
- "learning_rate": 4.262575997763622e-06,
- "loss": 0.4686204195022583,
- "mean_token_accuracy": 0.8505309820175171,
- "num_tokens": 10196948.0,
- "step": 1140
- },
- {
- "epoch": 0.8670212765957447,
- "grad_norm": 1.9588396549224854,
- "learning_rate": 4.2610900824042575e-06,
- "loss": 0.47056013345718384,
- "mean_token_accuracy": 0.8280024528503418,
- "num_tokens": 10204292.0,
- "step": 1141
- },
- {
- "epoch": 0.8677811550151976,
- "grad_norm": 2.569150924682617,
- "learning_rate": 4.2596029310440826e-06,
- "loss": 0.573108434677124,
- "mean_token_accuracy": 0.8108246326446533,
- "num_tokens": 10209571.0,
- "step": 1142
- },
- {
- "epoch": 0.8685410334346505,
- "grad_norm": 2.038032293319702,
- "learning_rate": 4.258114544726835e-06,
- "loss": 0.40545332431793213,
- "mean_token_accuracy": 0.8611703515052795,
- "num_tokens": 10215716.0,
- "step": 1143
- },
- {
- "epoch": 0.8693009118541033,
- "grad_norm": 1.9884231090545654,
- "learning_rate": 4.256624924497124e-06,
- "loss": 0.40085992217063904,
- "mean_token_accuracy": 0.8615031242370605,
- "num_tokens": 10222775.0,
- "step": 1144
- },
- {
- "epoch": 0.8700607902735562,
- "grad_norm": 1.912842035293579,
- "learning_rate": 4.25513407140042e-06,
- "loss": 0.41022324562072754,
- "mean_token_accuracy": 0.8459607362747192,
- "num_tokens": 10229589.0,
- "step": 1145
- },
- {
- "epoch": 0.8708206686930091,
- "grad_norm": 1.9190576076507568,
- "learning_rate": 4.253641986483063e-06,
- "loss": 0.5541447401046753,
- "mean_token_accuracy": 0.8256468772888184,
- "num_tokens": 10240633.0,
- "step": 1146
- },
- {
- "epoch": 0.871580547112462,
- "grad_norm": 1.3742294311523438,
- "learning_rate": 4.2521486707922545e-06,
- "loss": 0.3680543899536133,
- "mean_token_accuracy": 0.8654477596282959,
- "num_tokens": 10251252.0,
- "step": 1147
- },
- {
- "epoch": 0.8723404255319149,
- "grad_norm": 1.4438525438308716,
- "learning_rate": 4.250654125376062e-06,
- "loss": 0.45830875635147095,
- "mean_token_accuracy": 0.8433834314346313,
- "num_tokens": 10263980.0,
- "step": 1148
- },
- {
- "epoch": 0.8731003039513677,
- "grad_norm": 2.1273653507232666,
- "learning_rate": 4.249158351283414e-06,
- "loss": 0.4129376709461212,
- "mean_token_accuracy": 0.861556351184845,
- "num_tokens": 10270426.0,
- "step": 1149
- },
- {
- "epoch": 0.8738601823708206,
- "grad_norm": 2.598440647125244,
- "learning_rate": 4.247661349564103e-06,
- "loss": 0.418030709028244,
- "mean_token_accuracy": 0.86553955078125,
- "num_tokens": 10275493.0,
- "step": 1150
- },
- {
- "epoch": 0.8746200607902735,
- "grad_norm": 1.6852490901947021,
- "learning_rate": 4.246163121268782e-06,
- "loss": 0.6403408050537109,
- "mean_token_accuracy": 0.7966094017028809,
- "num_tokens": 10287989.0,
- "step": 1151
- },
- {
- "epoch": 0.8753799392097265,
- "grad_norm": 2.5013794898986816,
- "learning_rate": 4.244663667448965e-06,
- "loss": 0.49922505021095276,
- "mean_token_accuracy": 0.8318735361099243,
- "num_tokens": 10293360.0,
- "step": 1152
- },
- {
- "epoch": 0.8761398176291794,
- "grad_norm": 1.2022709846496582,
- "learning_rate": 4.243162989157027e-06,
- "loss": 0.4414965510368347,
- "mean_token_accuracy": 0.8338693380355835,
- "num_tokens": 10310558.0,
- "step": 1153
- },
- {
- "epoch": 0.8768996960486323,
- "grad_norm": 1.9903281927108765,
- "learning_rate": 4.241661087446202e-06,
- "loss": 0.4277610778808594,
- "mean_token_accuracy": 0.8560749292373657,
- "num_tokens": 10316983.0,
- "step": 1154
- },
- {
- "epoch": 0.8776595744680851,
- "grad_norm": 2.104923725128174,
- "learning_rate": 4.240157963370583e-06,
- "loss": 0.44431713223457336,
- "mean_token_accuracy": 0.8785282969474792,
- "num_tokens": 10323294.0,
- "step": 1155
- },
- {
- "epoch": 0.878419452887538,
- "grad_norm": 2.8364813327789307,
- "learning_rate": 4.2386536179851175e-06,
- "loss": 0.49948397278785706,
- "mean_token_accuracy": 0.8305255174636841,
- "num_tokens": 10327662.0,
- "step": 1156
- },
- {
- "epoch": 0.8791793313069909,
- "grad_norm": 1.9493682384490967,
- "learning_rate": 4.2371480523456156e-06,
- "loss": 0.45867404341697693,
- "mean_token_accuracy": 0.8373264074325562,
- "num_tokens": 10335699.0,
- "step": 1157
- },
- {
- "epoch": 0.8799392097264438,
- "grad_norm": 2.268616199493408,
- "learning_rate": 4.235641267508741e-06,
- "loss": 0.4547857940196991,
- "mean_token_accuracy": 0.8252766132354736,
- "num_tokens": 10342464.0,
- "step": 1158
- },
- {
- "epoch": 0.8806990881458967,
- "grad_norm": 2.1334283351898193,
- "learning_rate": 4.234133264532012e-06,
- "loss": 0.39503124356269836,
- "mean_token_accuracy": 0.8648351430892944,
- "num_tokens": 10347514.0,
- "step": 1159
- },
- {
- "epoch": 0.8814589665653495,
- "grad_norm": 1.2775357961654663,
- "learning_rate": 4.232624044473805e-06,
- "loss": 0.39945733547210693,
- "mean_token_accuracy": 0.8369829654693604,
- "num_tokens": 10363316.0,
- "step": 1160
- },
- {
- "epoch": 0.8822188449848024,
- "grad_norm": 2.458413600921631,
- "learning_rate": 4.231113608393348e-06,
- "loss": 0.5020045638084412,
- "mean_token_accuracy": 0.8295938968658447,
- "num_tokens": 10368401.0,
- "step": 1161
- },
- {
- "epoch": 0.8829787234042553,
- "grad_norm": 1.7464948892593384,
- "learning_rate": 4.229601957350722e-06,
- "loss": 0.5335392951965332,
- "mean_token_accuracy": 0.8134858012199402,
- "num_tokens": 10378337.0,
- "step": 1162
- },
- {
- "epoch": 0.8837386018237082,
- "grad_norm": 3.1152119636535645,
- "learning_rate": 4.228089092406863e-06,
- "loss": 0.4811682105064392,
- "mean_token_accuracy": 0.8460187315940857,
- "num_tokens": 10382362.0,
- "step": 1163
- },
- {
- "epoch": 0.8844984802431611,
- "grad_norm": 2.190847158432007,
- "learning_rate": 4.226575014623557e-06,
- "loss": 0.4428049921989441,
- "mean_token_accuracy": 0.8382467031478882,
- "num_tokens": 10388211.0,
- "step": 1164
- },
- {
- "epoch": 0.8852583586626139,
- "grad_norm": 1.860153079032898,
- "learning_rate": 4.225059725063444e-06,
- "loss": 0.5265918970108032,
- "mean_token_accuracy": 0.8181334733963013,
- "num_tokens": 10398873.0,
- "step": 1165
- },
- {
- "epoch": 0.8860182370820668,
- "grad_norm": 1.3372713327407837,
- "learning_rate": 4.22354322479001e-06,
- "loss": 0.43202850222587585,
- "mean_token_accuracy": 0.8432420492172241,
- "num_tokens": 10413158.0,
- "step": 1166
- },
- {
- "epoch": 0.8867781155015197,
- "grad_norm": 1.3653379678726196,
- "learning_rate": 4.222025514867596e-06,
- "loss": 0.43780991435050964,
- "mean_token_accuracy": 0.8441485166549683,
- "num_tokens": 10428137.0,
- "step": 1167
- },
- {
- "epoch": 0.8875379939209727,
- "grad_norm": 3.0230672359466553,
- "learning_rate": 4.220506596361387e-06,
- "loss": 0.6039337515830994,
- "mean_token_accuracy": 0.8274872303009033,
- "num_tokens": 10432586.0,
- "step": 1168
- },
- {
- "epoch": 0.8882978723404256,
- "grad_norm": 2.2180392742156982,
- "learning_rate": 4.218986470337419e-06,
- "loss": 0.5453792810440063,
- "mean_token_accuracy": 0.8127184510231018,
- "num_tokens": 10439471.0,
- "step": 1169
- },
- {
- "epoch": 0.8890577507598785,
- "grad_norm": 1.8519103527069092,
- "learning_rate": 4.217465137862575e-06,
- "loss": 0.5145469903945923,
- "mean_token_accuracy": 0.8178654909133911,
- "num_tokens": 10450471.0,
- "step": 1170
- },
- {
- "epoch": 0.8898176291793313,
- "grad_norm": 2.034008026123047,
- "learning_rate": 4.215942600004586e-06,
- "loss": 0.44061461091041565,
- "mean_token_accuracy": 0.8572084307670593,
- "num_tokens": 10457382.0,
- "step": 1171
- },
- {
- "epoch": 0.8905775075987842,
- "grad_norm": 3.4304304122924805,
- "learning_rate": 4.214418857832025e-06,
- "loss": 0.44397830963134766,
- "mean_token_accuracy": 0.842149019241333,
- "num_tokens": 10460650.0,
- "step": 1172
- },
- {
- "epoch": 0.8913373860182371,
- "grad_norm": 1.9021750688552856,
- "learning_rate": 4.212893912414316e-06,
- "loss": 0.3769867420196533,
- "mean_token_accuracy": 0.8806171417236328,
- "num_tokens": 10468214.0,
- "step": 1173
- },
- {
- "epoch": 0.89209726443769,
- "grad_norm": 1.9704062938690186,
- "learning_rate": 4.211367764821722e-06,
- "loss": 0.5501819849014282,
- "mean_token_accuracy": 0.8176811337471008,
- "num_tokens": 10476739.0,
- "step": 1174
- },
- {
- "epoch": 0.8928571428571429,
- "grad_norm": 1.4350415468215942,
- "learning_rate": 4.209840416125353e-06,
- "loss": 0.41897401213645935,
- "mean_token_accuracy": 0.8498011827468872,
- "num_tokens": 10491769.0,
- "step": 1175
- },
- {
- "epoch": 0.8936170212765957,
- "grad_norm": 3.8237783908843994,
- "learning_rate": 4.208311867397162e-06,
- "loss": 0.5296977162361145,
- "mean_token_accuracy": 0.8168715834617615,
- "num_tokens": 10494958.0,
- "step": 1176
- },
- {
- "epoch": 0.8943768996960486,
- "grad_norm": 2.04784893989563,
- "learning_rate": 4.206782119709942e-06,
- "loss": 0.476105272769928,
- "mean_token_accuracy": 0.834011435508728,
- "num_tokens": 10502077.0,
- "step": 1177
- },
- {
- "epoch": 0.8951367781155015,
- "grad_norm": 1.8839610815048218,
- "learning_rate": 4.205251174137329e-06,
- "loss": 0.49628815054893494,
- "mean_token_accuracy": 0.8212119936943054,
- "num_tokens": 10510077.0,
- "step": 1178
- },
- {
- "epoch": 0.8958966565349544,
- "grad_norm": 1.2100634574890137,
- "learning_rate": 4.2037190317538e-06,
- "loss": 0.4931519329547882,
- "mean_token_accuracy": 0.8170043230056763,
- "num_tokens": 10528373.0,
- "step": 1179
- },
- {
- "epoch": 0.8966565349544073,
- "grad_norm": 1.884637713432312,
- "learning_rate": 4.202185693634671e-06,
- "loss": 0.4913347363471985,
- "mean_token_accuracy": 0.8234949707984924,
- "num_tokens": 10537108.0,
- "step": 1180
- },
- {
- "epoch": 0.8974164133738601,
- "grad_norm": 1.5062434673309326,
- "learning_rate": 4.200651160856099e-06,
- "loss": 0.4160492420196533,
- "mean_token_accuracy": 0.845937192440033,
- "num_tokens": 10547577.0,
- "step": 1181
- },
- {
- "epoch": 0.898176291793313,
- "grad_norm": 2.331169605255127,
- "learning_rate": 4.1991154344950755e-06,
- "loss": 0.6532632112503052,
- "mean_token_accuracy": 0.7743191123008728,
- "num_tokens": 10556328.0,
- "step": 1182
- },
- {
- "epoch": 0.898936170212766,
- "grad_norm": 1.3538362979888916,
- "learning_rate": 4.197578515629435e-06,
- "loss": 0.4437566101551056,
- "mean_token_accuracy": 0.8427901268005371,
- "num_tokens": 10570026.0,
- "step": 1183
- },
- {
- "epoch": 0.8996960486322189,
- "grad_norm": 2.3828957080841064,
- "learning_rate": 4.196040405337846e-06,
- "loss": 0.6185290217399597,
- "mean_token_accuracy": 0.7969824075698853,
- "num_tokens": 10576465.0,
- "step": 1184
- },
- {
- "epoch": 0.9004559270516718,
- "grad_norm": 2.4759042263031006,
- "learning_rate": 4.194501104699813e-06,
- "loss": 0.46489226818084717,
- "mean_token_accuracy": 0.8472316265106201,
- "num_tokens": 10582034.0,
- "step": 1185
- },
- {
- "epoch": 0.9012158054711246,
- "grad_norm": 1.9215164184570312,
- "learning_rate": 4.192960614795676e-06,
- "loss": 0.48001551628112793,
- "mean_token_accuracy": 0.8371596336364746,
- "num_tokens": 10590556.0,
- "step": 1186
- },
- {
- "epoch": 0.9019756838905775,
- "grad_norm": 2.2717080116271973,
- "learning_rate": 4.19141893670661e-06,
- "loss": 0.40083563327789307,
- "mean_token_accuracy": 0.8464195728302002,
- "num_tokens": 10595661.0,
- "step": 1187
- },
- {
- "epoch": 0.9027355623100304,
- "grad_norm": 2.187122344970703,
- "learning_rate": 4.189876071514624e-06,
- "loss": 0.4942901134490967,
- "mean_token_accuracy": 0.8186990022659302,
- "num_tokens": 10603366.0,
- "step": 1188
- },
- {
- "epoch": 0.9034954407294833,
- "grad_norm": 1.542414665222168,
- "learning_rate": 4.188332020302561e-06,
- "loss": 0.4731982946395874,
- "mean_token_accuracy": 0.8487229347229004,
- "num_tokens": 10616203.0,
- "step": 1189
- },
- {
- "epoch": 0.9042553191489362,
- "grad_norm": 0.9957579970359802,
- "learning_rate": 4.186786784154096e-06,
- "loss": 0.33211836218833923,
- "mean_token_accuracy": 0.870644748210907,
- "num_tokens": 10633294.0,
- "step": 1190
- },
- {
- "epoch": 0.9050151975683891,
- "grad_norm": 2.593867540359497,
- "learning_rate": 4.1852403641537344e-06,
- "loss": 0.6825464963912964,
- "mean_token_accuracy": 0.7716869115829468,
- "num_tokens": 10640615.0,
- "step": 1191
- },
- {
- "epoch": 0.9057750759878419,
- "grad_norm": 2.0424516201019287,
- "learning_rate": 4.183692761386813e-06,
- "loss": 0.5672709941864014,
- "mean_token_accuracy": 0.7973801493644714,
- "num_tokens": 10649845.0,
- "step": 1192
- },
- {
- "epoch": 0.9065349544072948,
- "grad_norm": 1.429018259048462,
- "learning_rate": 4.1821439769395e-06,
- "loss": 0.5427846908569336,
- "mean_token_accuracy": 0.8200292587280273,
- "num_tokens": 10665898.0,
- "step": 1193
- },
- {
- "epoch": 0.9072948328267477,
- "grad_norm": 1.9764264822006226,
- "learning_rate": 4.180594011898791e-06,
- "loss": 0.4784567356109619,
- "mean_token_accuracy": 0.82924485206604,
- "num_tokens": 10673595.0,
- "step": 1194
- },
- {
- "epoch": 0.9080547112462006,
- "grad_norm": 1.4004309177398682,
- "learning_rate": 4.1790428673525104e-06,
- "loss": 0.4791432023048401,
- "mean_token_accuracy": 0.8334879875183105,
- "num_tokens": 10687892.0,
- "step": 1195
- },
- {
- "epoch": 0.9088145896656535,
- "grad_norm": 2.2207727432250977,
- "learning_rate": 4.177490544389313e-06,
- "loss": 0.5089365243911743,
- "mean_token_accuracy": 0.8270776271820068,
- "num_tokens": 10694911.0,
- "step": 1196
- },
- {
- "epoch": 0.9095744680851063,
- "grad_norm": 2.2890450954437256,
- "learning_rate": 4.175937044098678e-06,
- "loss": 0.5152267813682556,
- "mean_token_accuracy": 0.8527299165725708,
- "num_tokens": 10700512.0,
- "step": 1197
- },
- {
- "epoch": 0.9103343465045592,
- "grad_norm": 1.7938050031661987,
- "learning_rate": 4.1743823675709115e-06,
- "loss": 0.3507300615310669,
- "mean_token_accuracy": 0.8694599866867065,
- "num_tokens": 10707953.0,
- "step": 1198
- },
- {
- "epoch": 0.9110942249240122,
- "grad_norm": 1.4368808269500732,
- "learning_rate": 4.172826515897146e-06,
- "loss": 0.407418429851532,
- "mean_token_accuracy": 0.8432893753051758,
- "num_tokens": 10717485.0,
- "step": 1199
- },
- {
- "epoch": 0.9118541033434651,
- "grad_norm": 1.735339879989624,
- "learning_rate": 4.171269490169337e-06,
- "loss": 0.46996885538101196,
- "mean_token_accuracy": 0.8331948518753052,
- "num_tokens": 10726160.0,
- "step": 1200
- },
- {
- "epoch": 0.912613981762918,
- "grad_norm": 1.7859221696853638,
- "learning_rate": 4.1697112914802665e-06,
- "loss": 0.5325199365615845,
- "mean_token_accuracy": 0.8179605007171631,
- "num_tokens": 10736284.0,
- "step": 1201
- },
- {
- "epoch": 0.9133738601823708,
- "grad_norm": 2.6394896507263184,
- "learning_rate": 4.168151920923536e-06,
- "loss": 0.4039744734764099,
- "mean_token_accuracy": 0.8545527458190918,
- "num_tokens": 10740673.0,
- "step": 1202
- },
- {
- "epoch": 0.9141337386018237,
- "grad_norm": 1.910988211631775,
- "learning_rate": 4.1665913795935755e-06,
- "loss": 0.5190291404724121,
- "mean_token_accuracy": 0.8203921318054199,
- "num_tokens": 10751946.0,
- "step": 1203
- },
- {
- "epoch": 0.9148936170212766,
- "grad_norm": 3.0006964206695557,
- "learning_rate": 4.16502966858563e-06,
- "loss": 0.5856777429580688,
- "mean_token_accuracy": 0.8061224222183228,
- "num_tokens": 10756795.0,
- "step": 1204
- },
- {
- "epoch": 0.9156534954407295,
- "grad_norm": 1.7396167516708374,
- "learning_rate": 4.163466788995768e-06,
- "loss": 0.54935222864151,
- "mean_token_accuracy": 0.8052443265914917,
- "num_tokens": 10767202.0,
- "step": 1205
- },
- {
- "epoch": 0.9164133738601824,
- "grad_norm": 2.143735885620117,
- "learning_rate": 4.161902741920881e-06,
- "loss": 0.5020298361778259,
- "mean_token_accuracy": 0.8249630928039551,
- "num_tokens": 10774329.0,
- "step": 1206
- },
- {
- "epoch": 0.9171732522796353,
- "grad_norm": 2.8871893882751465,
- "learning_rate": 4.160337528458676e-06,
- "loss": 0.5154489278793335,
- "mean_token_accuracy": 0.8276848793029785,
- "num_tokens": 10778929.0,
- "step": 1207
- },
- {
- "epoch": 0.9179331306990881,
- "grad_norm": 1.4642788171768188,
- "learning_rate": 4.15877114970768e-06,
- "loss": 0.5033774375915527,
- "mean_token_accuracy": 0.8296241164207458,
- "num_tokens": 10790928.0,
- "step": 1208
- },
- {
- "epoch": 0.918693009118541,
- "grad_norm": 1.8313497304916382,
- "learning_rate": 4.1572036067672386e-06,
- "loss": 0.5674909353256226,
- "mean_token_accuracy": 0.7975562214851379,
- "num_tokens": 10801372.0,
- "step": 1209
- },
- {
- "epoch": 0.9194528875379939,
- "grad_norm": 2.005958080291748,
- "learning_rate": 4.155634900737513e-06,
- "loss": 0.5557019114494324,
- "mean_token_accuracy": 0.8141391277313232,
- "num_tokens": 10809150.0,
- "step": 1210
- },
- {
- "epoch": 0.9202127659574468,
- "grad_norm": 2.333519697189331,
- "learning_rate": 4.154065032719482e-06,
- "loss": 0.6990420818328857,
- "mean_token_accuracy": 0.7565394043922424,
- "num_tokens": 10816612.0,
- "step": 1211
- },
- {
- "epoch": 0.9209726443768997,
- "grad_norm": 1.4472655057907104,
- "learning_rate": 4.152494003814939e-06,
- "loss": 0.541398286819458,
- "mean_token_accuracy": 0.8027358055114746,
- "num_tokens": 10833840.0,
- "step": 1212
- },
- {
- "epoch": 0.9217325227963525,
- "grad_norm": 1.6183619499206543,
- "learning_rate": 4.150921815126493e-06,
- "loss": 0.6096762418746948,
- "mean_token_accuracy": 0.7994354963302612,
- "num_tokens": 10846367.0,
- "step": 1213
- },
- {
- "epoch": 0.9224924012158054,
- "grad_norm": 2.614919900894165,
- "learning_rate": 4.149348467757566e-06,
- "loss": 0.41846764087677,
- "mean_token_accuracy": 0.8555068969726562,
- "num_tokens": 10850836.0,
- "step": 1214
- },
- {
- "epoch": 0.9232522796352584,
- "grad_norm": 1.4419831037521362,
- "learning_rate": 4.147773962812393e-06,
- "loss": 0.4139535427093506,
- "mean_token_accuracy": 0.845671534538269,
- "num_tokens": 10864228.0,
- "step": 1215
- },
- {
- "epoch": 0.9240121580547113,
- "grad_norm": 2.3868865966796875,
- "learning_rate": 4.146198301396025e-06,
- "loss": 0.3357275128364563,
- "mean_token_accuracy": 0.8829520344734192,
- "num_tokens": 10868920.0,
- "step": 1216
- },
- {
- "epoch": 0.9247720364741642,
- "grad_norm": 1.7685474157333374,
- "learning_rate": 4.14462148461432e-06,
- "loss": 0.45333072543144226,
- "mean_token_accuracy": 0.8505891561508179,
- "num_tokens": 10877286.0,
- "step": 1217
- },
- {
- "epoch": 0.925531914893617,
- "grad_norm": 1.7627625465393066,
- "learning_rate": 4.143043513573949e-06,
- "loss": 0.5028705596923828,
- "mean_token_accuracy": 0.825471043586731,
- "num_tokens": 10887047.0,
- "step": 1218
- },
- {
- "epoch": 0.9262917933130699,
- "grad_norm": 1.3168725967407227,
- "learning_rate": 4.141464389382392e-06,
- "loss": 0.5494637489318848,
- "mean_token_accuracy": 0.8121747970581055,
- "num_tokens": 10903599.0,
- "step": 1219
- },
- {
- "epoch": 0.9270516717325228,
- "grad_norm": 2.5180399417877197,
- "learning_rate": 4.13988411314794e-06,
- "loss": 0.6134277582168579,
- "mean_token_accuracy": 0.7983006834983826,
- "num_tokens": 10909791.0,
- "step": 1220
- },
- {
- "epoch": 0.9278115501519757,
- "grad_norm": 1.1889166831970215,
- "learning_rate": 4.13830268597969e-06,
- "loss": 0.36713096499443054,
- "mean_token_accuracy": 0.8416121006011963,
- "num_tokens": 10925794.0,
- "step": 1221
- },
- {
- "epoch": 0.9285714285714286,
- "grad_norm": 2.142422676086426,
- "learning_rate": 4.136720108987552e-06,
- "loss": 0.4427933096885681,
- "mean_token_accuracy": 0.8427745699882507,
- "num_tokens": 10931622.0,
- "step": 1222
- },
- {
- "epoch": 0.9293313069908815,
- "grad_norm": 1.908564567565918,
- "learning_rate": 4.1351363832822364e-06,
- "loss": 0.5088109374046326,
- "mean_token_accuracy": 0.8309272527694702,
- "num_tokens": 10940843.0,
- "step": 1223
- },
- {
- "epoch": 0.9300911854103343,
- "grad_norm": 1.2862322330474854,
- "learning_rate": 4.133551509975264e-06,
- "loss": 0.3963761329650879,
- "mean_token_accuracy": 0.8602159023284912,
- "num_tokens": 10954481.0,
- "step": 1224
- },
- {
- "epoch": 0.9308510638297872,
- "grad_norm": 1.5876200199127197,
- "learning_rate": 4.13196549017896e-06,
- "loss": 0.4311184287071228,
- "mean_token_accuracy": 0.8460899591445923,
- "num_tokens": 10963501.0,
- "step": 1225
- },
- {
- "epoch": 0.9316109422492401,
- "grad_norm": 2.459878444671631,
- "learning_rate": 4.130378325006453e-06,
- "loss": 0.5016295313835144,
- "mean_token_accuracy": 0.8125218152999878,
- "num_tokens": 10968850.0,
- "step": 1226
- },
- {
- "epoch": 0.932370820668693,
- "grad_norm": 2.059718370437622,
- "learning_rate": 4.128790015571679e-06,
- "loss": 0.48982277512550354,
- "mean_token_accuracy": 0.8327049016952515,
- "num_tokens": 10976642.0,
- "step": 1227
- },
- {
- "epoch": 0.9331306990881459,
- "grad_norm": 1.3719185590744019,
- "learning_rate": 4.127200562989372e-06,
- "loss": 0.38778752088546753,
- "mean_token_accuracy": 0.8623501062393188,
- "num_tokens": 10988703.0,
- "step": 1228
- },
- {
- "epoch": 0.9338905775075987,
- "grad_norm": 1.302140712738037,
- "learning_rate": 4.125609968375073e-06,
- "loss": 0.4887842535972595,
- "mean_token_accuracy": 0.8322232961654663,
- "num_tokens": 11005981.0,
- "step": 1229
- },
- {
- "epoch": 0.9346504559270516,
- "grad_norm": 1.819624423980713,
- "learning_rate": 4.12401823284512e-06,
- "loss": 0.49825209379196167,
- "mean_token_accuracy": 0.8278916478157043,
- "num_tokens": 11014145.0,
- "step": 1230
- },
- {
- "epoch": 0.9354103343465046,
- "grad_norm": 1.2762807607650757,
- "learning_rate": 4.122425357516658e-06,
- "loss": 0.433994323015213,
- "mean_token_accuracy": 0.853028416633606,
- "num_tokens": 11029232.0,
- "step": 1231
- },
- {
- "epoch": 0.9361702127659575,
- "grad_norm": 2.2171671390533447,
- "learning_rate": 4.1208313435076255e-06,
- "loss": 0.38436949253082275,
- "mean_token_accuracy": 0.8616260290145874,
- "num_tokens": 11034743.0,
- "step": 1232
- },
- {
- "epoch": 0.9369300911854104,
- "grad_norm": 1.355879545211792,
- "learning_rate": 4.119236191936764e-06,
- "loss": 0.5378084182739258,
- "mean_token_accuracy": 0.8256701231002808,
- "num_tokens": 11048149.0,
- "step": 1233
- },
- {
- "epoch": 0.9376899696048632,
- "grad_norm": 2.66812801361084,
- "learning_rate": 4.117639903923611e-06,
- "loss": 0.5236451625823975,
- "mean_token_accuracy": 0.8431973457336426,
- "num_tokens": 11052295.0,
- "step": 1234
- },
- {
- "epoch": 0.9384498480243161,
- "grad_norm": 1.5740545988082886,
- "learning_rate": 4.116042480588505e-06,
- "loss": 0.44322824478149414,
- "mean_token_accuracy": 0.8436908721923828,
- "num_tokens": 11062066.0,
- "step": 1235
- },
- {
- "epoch": 0.939209726443769,
- "grad_norm": 1.230706810951233,
- "learning_rate": 4.114443923052577e-06,
- "loss": 0.3325323462486267,
- "mean_token_accuracy": 0.8674666881561279,
- "num_tokens": 11074300.0,
- "step": 1236
- },
- {
- "epoch": 0.9399696048632219,
- "grad_norm": 1.9870070219039917,
- "learning_rate": 4.112844232437757e-06,
- "loss": 0.5711548328399658,
- "mean_token_accuracy": 0.8081738948822021,
- "num_tokens": 11082297.0,
- "step": 1237
- },
- {
- "epoch": 0.9407294832826748,
- "grad_norm": 1.3020970821380615,
- "learning_rate": 4.11124340986677e-06,
- "loss": 0.4187922477722168,
- "mean_token_accuracy": 0.8566171526908875,
- "num_tokens": 11096810.0,
- "step": 1238
- },
- {
- "epoch": 0.9414893617021277,
- "grad_norm": 2.1399197578430176,
- "learning_rate": 4.109641456463135e-06,
- "loss": 0.5293116569519043,
- "mean_token_accuracy": 0.8176157474517822,
- "num_tokens": 11102761.0,
- "step": 1239
- },
- {
- "epoch": 0.9422492401215805,
- "grad_norm": 1.3503763675689697,
- "learning_rate": 4.108038373351163e-06,
- "loss": 0.4907652735710144,
- "mean_token_accuracy": 0.8204987049102783,
- "num_tokens": 11118480.0,
- "step": 1240
- },
- {
- "epoch": 0.9430091185410334,
- "grad_norm": 1.9571399688720703,
- "learning_rate": 4.106434161655962e-06,
- "loss": 0.4709656536579132,
- "mean_token_accuracy": 0.8371885418891907,
- "num_tokens": 11126265.0,
- "step": 1241
- },
- {
- "epoch": 0.9437689969604863,
- "grad_norm": 2.1277313232421875,
- "learning_rate": 4.104828822503427e-06,
- "loss": 0.4010283350944519,
- "mean_token_accuracy": 0.8586333990097046,
- "num_tokens": 11133022.0,
- "step": 1242
- },
- {
- "epoch": 0.9445288753799392,
- "grad_norm": 1.6745036840438843,
- "learning_rate": 4.103222357020248e-06,
- "loss": 0.562545657157898,
- "mean_token_accuracy": 0.8052060604095459,
- "num_tokens": 11145255.0,
- "step": 1243
- },
- {
- "epoch": 0.9452887537993921,
- "grad_norm": 2.3616299629211426,
- "learning_rate": 4.101614766333904e-06,
- "loss": 0.5878340601921082,
- "mean_token_accuracy": 0.796745777130127,
- "num_tokens": 11152020.0,
- "step": 1244
- },
- {
- "epoch": 0.9460486322188449,
- "grad_norm": 1.6182078123092651,
- "learning_rate": 4.100006051572664e-06,
- "loss": 0.5357589721679688,
- "mean_token_accuracy": 0.8089962005615234,
- "num_tokens": 11163112.0,
- "step": 1245
- },
- {
- "epoch": 0.9468085106382979,
- "grad_norm": 1.911770224571228,
- "learning_rate": 4.098396213865587e-06,
- "loss": 0.49805426597595215,
- "mean_token_accuracy": 0.8289647102355957,
- "num_tokens": 11171768.0,
- "step": 1246
- },
- {
- "epoch": 0.9475683890577508,
- "grad_norm": 1.649155616760254,
- "learning_rate": 4.096785254342518e-06,
- "loss": 0.5756166577339172,
- "mean_token_accuracy": 0.807680606842041,
- "num_tokens": 11183527.0,
- "step": 1247
- },
- {
- "epoch": 0.9483282674772037,
- "grad_norm": 1.8922761678695679,
- "learning_rate": 4.095173174134091e-06,
- "loss": 0.44688963890075684,
- "mean_token_accuracy": 0.8375608921051025,
- "num_tokens": 11191494.0,
- "step": 1248
- },
- {
- "epoch": 0.9490881458966566,
- "grad_norm": 2.9044547080993652,
- "learning_rate": 4.093559974371725e-06,
- "loss": 0.48609739542007446,
- "mean_token_accuracy": 0.8404892086982727,
- "num_tokens": 11195837.0,
- "step": 1249
- },
- {
- "epoch": 0.9498480243161094,
- "grad_norm": 2.287506580352783,
- "learning_rate": 4.091945656187626e-06,
- "loss": 0.5260225534439087,
- "mean_token_accuracy": 0.8181945085525513,
- "num_tokens": 11202174.0,
- "step": 1250
- },
- {
- "epoch": 0.9506079027355623,
- "grad_norm": 1.7908886671066284,
- "learning_rate": 4.090330220714785e-06,
- "loss": 0.4207724928855896,
- "mean_token_accuracy": 0.8616912364959717,
- "num_tokens": 11209995.0,
- "step": 1251
- },
- {
- "epoch": 0.9513677811550152,
- "grad_norm": 2.905418634414673,
- "learning_rate": 4.0887136690869774e-06,
- "loss": 0.4209241271018982,
- "mean_token_accuracy": 0.8561323285102844,
- "num_tokens": 11213799.0,
- "step": 1252
- },
- {
- "epoch": 0.9521276595744681,
- "grad_norm": 2.814150333404541,
- "learning_rate": 4.08709600243876e-06,
- "loss": 0.36855608224868774,
- "mean_token_accuracy": 0.8764539361000061,
- "num_tokens": 11217643.0,
- "step": 1253
- },
- {
- "epoch": 0.952887537993921,
- "grad_norm": 1.9385707378387451,
- "learning_rate": 4.0854772219054735e-06,
- "loss": 0.531031608581543,
- "mean_token_accuracy": 0.80600905418396,
- "num_tokens": 11225871.0,
- "step": 1254
- },
- {
- "epoch": 0.9536474164133738,
- "grad_norm": 2.103058099746704,
- "learning_rate": 4.083857328623243e-06,
- "loss": 0.4576364755630493,
- "mean_token_accuracy": 0.8447524905204773,
- "num_tokens": 11231829.0,
- "step": 1255
- },
- {
- "epoch": 0.9544072948328267,
- "grad_norm": 1.7518818378448486,
- "learning_rate": 4.082236323728969e-06,
- "loss": 0.5386767983436584,
- "mean_token_accuracy": 0.8055596351623535,
- "num_tokens": 11240977.0,
- "step": 1256
- },
- {
- "epoch": 0.9551671732522796,
- "grad_norm": 1.8434966802597046,
- "learning_rate": 4.0806142083603365e-06,
- "loss": 0.5415925979614258,
- "mean_token_accuracy": 0.809962272644043,
- "num_tokens": 11249616.0,
- "step": 1257
- },
- {
- "epoch": 0.9559270516717325,
- "grad_norm": 1.7341015338897705,
- "learning_rate": 4.078990983655807e-06,
- "loss": 0.4621101915836334,
- "mean_token_accuracy": 0.8330386877059937,
- "num_tokens": 11258616.0,
- "step": 1258
- },
- {
- "epoch": 0.9566869300911854,
- "grad_norm": 1.8589727878570557,
- "learning_rate": 4.077366650754624e-06,
- "loss": 0.4031238555908203,
- "mean_token_accuracy": 0.842434287071228,
- "num_tokens": 11266006.0,
- "step": 1259
- },
- {
- "epoch": 0.9574468085106383,
- "grad_norm": 1.657175898551941,
- "learning_rate": 4.075741210796806e-06,
- "loss": 0.41686388850212097,
- "mean_token_accuracy": 0.8443650007247925,
- "num_tokens": 11275601.0,
- "step": 1260
- },
- {
- "epoch": 0.9582066869300911,
- "grad_norm": 2.4303717613220215,
- "learning_rate": 4.07411466492315e-06,
- "loss": 0.4554435610771179,
- "mean_token_accuracy": 0.853043794631958,
- "num_tokens": 11280650.0,
- "step": 1261
- },
- {
- "epoch": 0.958966565349544,
- "grad_norm": 2.3653745651245117,
- "learning_rate": 4.072487014275228e-06,
- "loss": 0.4304995536804199,
- "mean_token_accuracy": 0.8462260961532593,
- "num_tokens": 11285637.0,
- "step": 1262
- },
- {
- "epoch": 0.959726443768997,
- "grad_norm": 1.6689718961715698,
- "learning_rate": 4.070858259995388e-06,
- "loss": 0.5290807485580444,
- "mean_token_accuracy": 0.8176917433738708,
- "num_tokens": 11299110.0,
- "step": 1263
- },
- {
- "epoch": 0.9604863221884499,
- "grad_norm": 2.103879451751709,
- "learning_rate": 4.069228403226751e-06,
- "loss": 0.4620879888534546,
- "mean_token_accuracy": 0.835270345211029,
- "num_tokens": 11305564.0,
- "step": 1264
- },
- {
- "epoch": 0.9612462006079028,
- "grad_norm": 2.139012575149536,
- "learning_rate": 4.067597445113216e-06,
- "loss": 0.5143396258354187,
- "mean_token_accuracy": 0.8191739320755005,
- "num_tokens": 11311870.0,
- "step": 1265
- },
- {
- "epoch": 0.9620060790273556,
- "grad_norm": 1.3971210718154907,
- "learning_rate": 4.06596538679945e-06,
- "loss": 0.472080260515213,
- "mean_token_accuracy": 0.8321092128753662,
- "num_tokens": 11323970.0,
- "step": 1266
- },
- {
- "epoch": 0.9627659574468085,
- "grad_norm": 1.4965174198150635,
- "learning_rate": 4.064332229430895e-06,
- "loss": 0.359701007604599,
- "mean_token_accuracy": 0.8903120160102844,
- "num_tokens": 11333412.0,
- "step": 1267
- },
- {
- "epoch": 0.9635258358662614,
- "grad_norm": 1.1898726224899292,
- "learning_rate": 4.062697974153764e-06,
- "loss": 0.3423798084259033,
- "mean_token_accuracy": 0.8661491870880127,
- "num_tokens": 11347657.0,
- "step": 1268
- },
- {
- "epoch": 0.9642857142857143,
- "grad_norm": 1.4952168464660645,
- "learning_rate": 4.06106262211504e-06,
- "loss": 0.4214417338371277,
- "mean_token_accuracy": 0.8362159729003906,
- "num_tokens": 11357786.0,
- "step": 1269
- },
- {
- "epoch": 0.9650455927051672,
- "grad_norm": 1.7949583530426025,
- "learning_rate": 4.059426174462476e-06,
- "loss": 0.59087735414505,
- "mean_token_accuracy": 0.7965556979179382,
- "num_tokens": 11370561.0,
- "step": 1270
- },
- {
- "epoch": 0.96580547112462,
- "grad_norm": 1.8973214626312256,
- "learning_rate": 4.057788632344594e-06,
- "loss": 0.47525322437286377,
- "mean_token_accuracy": 0.8317365050315857,
- "num_tokens": 11378507.0,
- "step": 1271
- },
- {
- "epoch": 0.9665653495440729,
- "grad_norm": 1.8665250539779663,
- "learning_rate": 4.056149996910683e-06,
- "loss": 0.3537125587463379,
- "mean_token_accuracy": 0.8921569585800171,
- "num_tokens": 11385186.0,
- "step": 1272
- },
- {
- "epoch": 0.9673252279635258,
- "grad_norm": 1.5072317123413086,
- "learning_rate": 4.054510269310803e-06,
- "loss": 0.5145624876022339,
- "mean_token_accuracy": 0.8265488147735596,
- "num_tokens": 11397125.0,
- "step": 1273
- },
- {
- "epoch": 0.9680851063829787,
- "grad_norm": 1.520525574684143,
- "learning_rate": 4.052869450695776e-06,
- "loss": 0.44322293996810913,
- "mean_token_accuracy": 0.8403642177581787,
- "num_tokens": 11409919.0,
- "step": 1274
- },
- {
- "epoch": 0.9688449848024316,
- "grad_norm": 1.3764475584030151,
- "learning_rate": 4.051227542217192e-06,
- "loss": 0.5774400234222412,
- "mean_token_accuracy": 0.804118275642395,
- "num_tokens": 11425900.0,
- "step": 1275
- },
- {
- "epoch": 0.9696048632218845,
- "grad_norm": 1.3922648429870605,
- "learning_rate": 4.049584545027406e-06,
- "loss": 0.42727944254875183,
- "mean_token_accuracy": 0.8654505014419556,
- "num_tokens": 11438787.0,
- "step": 1276
- },
- {
- "epoch": 0.9703647416413373,
- "grad_norm": 1.8505840301513672,
- "learning_rate": 4.047940460279537e-06,
- "loss": 0.490803062915802,
- "mean_token_accuracy": 0.8340574502944946,
- "num_tokens": 11447997.0,
- "step": 1277
- },
- {
- "epoch": 0.9711246200607903,
- "grad_norm": 2.28271222114563,
- "learning_rate": 4.046295289127466e-06,
- "loss": 0.588828444480896,
- "mean_token_accuracy": 0.833497166633606,
- "num_tokens": 11454072.0,
- "step": 1278
- },
- {
- "epoch": 0.9718844984802432,
- "grad_norm": 2.4242560863494873,
- "learning_rate": 4.044649032725836e-06,
- "loss": 0.5128831267356873,
- "mean_token_accuracy": 0.8225122690200806,
- "num_tokens": 11460211.0,
- "step": 1279
- },
- {
- "epoch": 0.9726443768996961,
- "grad_norm": 2.1738455295562744,
- "learning_rate": 4.0430016922300566e-06,
- "loss": 0.441631942987442,
- "mean_token_accuracy": 0.841723620891571,
- "num_tokens": 11466814.0,
- "step": 1280
- },
- {
- "epoch": 0.973404255319149,
- "grad_norm": 2.541599988937378,
- "learning_rate": 4.0413532687962926e-06,
- "loss": 0.5062629580497742,
- "mean_token_accuracy": 0.8013502359390259,
- "num_tokens": 11472371.0,
- "step": 1281
- },
- {
- "epoch": 0.9741641337386018,
- "grad_norm": 2.8011014461517334,
- "learning_rate": 4.039703763581472e-06,
- "loss": 0.5061966776847839,
- "mean_token_accuracy": 0.829810380935669,
- "num_tokens": 11476672.0,
- "step": 1282
- },
- {
- "epoch": 0.9749240121580547,
- "grad_norm": 2.4505462646484375,
- "learning_rate": 4.038053177743279e-06,
- "loss": 0.43407535552978516,
- "mean_token_accuracy": 0.8428469896316528,
- "num_tokens": 11481297.0,
- "step": 1283
- },
- {
- "epoch": 0.9756838905775076,
- "grad_norm": 2.1618378162384033,
- "learning_rate": 4.036401512440161e-06,
- "loss": 0.6056663393974304,
- "mean_token_accuracy": 0.7977457642555237,
- "num_tokens": 11488657.0,
- "step": 1284
- },
- {
- "epoch": 0.9764437689969605,
- "grad_norm": 1.9192147254943848,
- "learning_rate": 4.034748768831319e-06,
- "loss": 0.524390697479248,
- "mean_token_accuracy": 0.8120636940002441,
- "num_tokens": 11496485.0,
- "step": 1285
- },
- {
- "epoch": 0.9772036474164134,
- "grad_norm": 2.766435384750366,
- "learning_rate": 4.033094948076713e-06,
- "loss": 0.5494908690452576,
- "mean_token_accuracy": 0.8141890168190002,
- "num_tokens": 11501341.0,
- "step": 1286
- },
- {
- "epoch": 0.9779635258358662,
- "grad_norm": 1.3519539833068848,
- "learning_rate": 4.031440051337056e-06,
- "loss": 0.4339691400527954,
- "mean_token_accuracy": 0.8400131464004517,
- "num_tokens": 11512843.0,
- "step": 1287
- },
- {
- "epoch": 0.9787234042553191,
- "grad_norm": 1.2492141723632812,
- "learning_rate": 4.02978407977382e-06,
- "loss": 0.4433518052101135,
- "mean_token_accuracy": 0.8432940244674683,
- "num_tokens": 11530227.0,
- "step": 1288
- },
- {
- "epoch": 0.979483282674772,
- "grad_norm": 1.6597715616226196,
- "learning_rate": 4.02812703454923e-06,
- "loss": 0.602222204208374,
- "mean_token_accuracy": 0.786965548992157,
- "num_tokens": 11543955.0,
- "step": 1289
- },
- {
- "epoch": 0.9802431610942249,
- "grad_norm": 1.6621816158294678,
- "learning_rate": 4.026468916826262e-06,
- "loss": 0.35662174224853516,
- "mean_token_accuracy": 0.8716133832931519,
- "num_tokens": 11552064.0,
- "step": 1290
- },
- {
- "epoch": 0.9810030395136778,
- "grad_norm": 4.539844989776611,
- "learning_rate": 4.024809727768648e-06,
- "loss": 0.543423593044281,
- "mean_token_accuracy": 0.8293194770812988,
- "num_tokens": 11555595.0,
- "step": 1291
- },
- {
- "epoch": 0.9817629179331308,
- "grad_norm": 1.4026556015014648,
- "learning_rate": 4.023149468540871e-06,
- "loss": 0.4301237165927887,
- "mean_token_accuracy": 0.8358224630355835,
- "num_tokens": 11572275.0,
- "step": 1292
- },
- {
- "epoch": 0.9825227963525835,
- "grad_norm": 1.611262321472168,
- "learning_rate": 4.021488140308165e-06,
- "loss": 0.5378580689430237,
- "mean_token_accuracy": 0.8173760771751404,
- "num_tokens": 11584299.0,
- "step": 1293
- },
- {
- "epoch": 0.9832826747720365,
- "grad_norm": 4.138631820678711,
- "learning_rate": 4.019825744236514e-06,
- "loss": 0.40272149443626404,
- "mean_token_accuracy": 0.8648844957351685,
- "num_tokens": 11586705.0,
- "step": 1294
- },
- {
- "epoch": 0.9840425531914894,
- "grad_norm": 3.177703619003296,
- "learning_rate": 4.018162281492651e-06,
- "loss": 0.5320103168487549,
- "mean_token_accuracy": 0.8250276446342468,
- "num_tokens": 11590689.0,
- "step": 1295
- },
- {
- "epoch": 0.9848024316109423,
- "grad_norm": 2.727597713470459,
- "learning_rate": 4.016497753244058e-06,
- "loss": 0.5662774443626404,
- "mean_token_accuracy": 0.8074625730514526,
- "num_tokens": 11596092.0,
- "step": 1296
- },
- {
- "epoch": 0.9855623100303952,
- "grad_norm": 1.485139012336731,
- "learning_rate": 4.014832160658966e-06,
- "loss": 0.5414972305297852,
- "mean_token_accuracy": 0.8082696199417114,
- "num_tokens": 11613785.0,
- "step": 1297
- },
- {
- "epoch": 0.986322188449848,
- "grad_norm": 2.4025990962982178,
- "learning_rate": 4.013165504906352e-06,
- "loss": 0.6556503772735596,
- "mean_token_accuracy": 0.7785214781761169,
- "num_tokens": 11620421.0,
- "step": 1298
- },
- {
- "epoch": 0.9870820668693009,
- "grad_norm": 1.878273606300354,
- "learning_rate": 4.011497787155938e-06,
- "loss": 0.4221133887767792,
- "mean_token_accuracy": 0.850035548210144,
- "num_tokens": 11627998.0,
- "step": 1299
- },
- {
- "epoch": 0.9878419452887538,
- "grad_norm": 2.0430715084075928,
- "learning_rate": 4.009829008578192e-06,
- "loss": 0.5205984711647034,
- "mean_token_accuracy": 0.819183349609375,
- "num_tokens": 11636279.0,
- "step": 1300
- },
- {
- "epoch": 0.9886018237082067,
- "grad_norm": 3.4769439697265625,
- "learning_rate": 4.00815917034433e-06,
- "loss": 0.5449948310852051,
- "mean_token_accuracy": 0.8240023851394653,
- "num_tokens": 11639638.0,
- "step": 1301
- },
- {
- "epoch": 0.9893617021276596,
- "grad_norm": 2.4783987998962402,
- "learning_rate": 4.006488273626307e-06,
- "loss": 0.4316832423210144,
- "mean_token_accuracy": 0.8474695086479187,
- "num_tokens": 11645463.0,
- "step": 1302
- },
- {
- "epoch": 0.9901215805471124,
- "grad_norm": 1.881475567817688,
- "learning_rate": 4.004816319596822e-06,
- "loss": 0.5157331824302673,
- "mean_token_accuracy": 0.826042652130127,
- "num_tokens": 11653955.0,
- "step": 1303
- },
- {
- "epoch": 0.9908814589665653,
- "grad_norm": 2.6569254398345947,
- "learning_rate": 4.003143309429317e-06,
- "loss": 0.46492767333984375,
- "mean_token_accuracy": 0.8320850133895874,
- "num_tokens": 11659357.0,
- "step": 1304
- },
- {
- "epoch": 0.9916413373860182,
- "grad_norm": 2.4917593002319336,
- "learning_rate": 4.0014692442979756e-06,
- "loss": 0.459585040807724,
- "mean_token_accuracy": 0.8457611799240112,
- "num_tokens": 11664207.0,
- "step": 1305
- },
- {
- "epoch": 0.9924012158054711,
- "grad_norm": 2.6885526180267334,
- "learning_rate": 3.999794125377721e-06,
- "loss": 0.4677402973175049,
- "mean_token_accuracy": 0.8307361602783203,
- "num_tokens": 11668879.0,
- "step": 1306
- },
- {
- "epoch": 0.993161094224924,
- "grad_norm": 1.9737319946289062,
- "learning_rate": 3.998117953844215e-06,
- "loss": 0.44684839248657227,
- "mean_token_accuracy": 0.8367687463760376,
- "num_tokens": 11676081.0,
- "step": 1307
- },
- {
- "epoch": 0.993920972644377,
- "grad_norm": 1.4333021640777588,
- "learning_rate": 3.996440730873861e-06,
- "loss": 0.526146650314331,
- "mean_token_accuracy": 0.816251814365387,
- "num_tokens": 11689333.0,
- "step": 1308
- },
- {
- "epoch": 0.9946808510638298,
- "grad_norm": 1.3689230680465698,
- "learning_rate": 3.9947624576437975e-06,
- "loss": 0.40214329957962036,
- "mean_token_accuracy": 0.8610327839851379,
- "num_tokens": 11701540.0,
- "step": 1309
- },
- {
- "epoch": 0.9954407294832827,
- "grad_norm": 1.2435375452041626,
- "learning_rate": 3.9930831353319025e-06,
- "loss": 0.4532913267612457,
- "mean_token_accuracy": 0.8415389060974121,
- "num_tokens": 11717920.0,
- "step": 1310
- },
- {
- "epoch": 0.9962006079027356,
- "grad_norm": 1.9968011379241943,
- "learning_rate": 3.9914027651167866e-06,
- "loss": 0.46954160928726196,
- "mean_token_accuracy": 0.8351103663444519,
- "num_tokens": 11724999.0,
- "step": 1311
- },
- {
- "epoch": 0.9969604863221885,
- "grad_norm": 1.9521311521530151,
- "learning_rate": 3.989721348177801e-06,
- "loss": 0.5068016052246094,
- "mean_token_accuracy": 0.8220845460891724,
- "num_tokens": 11732569.0,
- "step": 1312
- },
- {
- "epoch": 0.9977203647416414,
- "grad_norm": 2.7332582473754883,
- "learning_rate": 3.988038885695028e-06,
- "loss": 0.4154692590236664,
- "mean_token_accuracy": 0.8493857383728027,
- "num_tokens": 11736759.0,
- "step": 1313
- },
- {
- "epoch": 0.9984802431610942,
- "grad_norm": 1.8656952381134033,
- "learning_rate": 3.986355378849284e-06,
- "loss": 0.4151354134082794,
- "mean_token_accuracy": 0.83440101146698,
- "num_tokens": 11743827.0,
- "step": 1314
- },
- {
- "epoch": 0.9992401215805471,
- "grad_norm": 1.304006576538086,
- "learning_rate": 3.984670828822118e-06,
- "loss": 0.4926128089427948,
- "mean_token_accuracy": 0.8603005409240723,
- "num_tokens": 11757707.0,
- "step": 1315
- },
- {
- "epoch": 1.0,
- "grad_norm": 1.497079610824585,
- "learning_rate": 3.982985236795815e-06,
- "loss": 0.43342477083206177,
- "mean_token_accuracy": 0.8550825119018555,
- "num_tokens": 11769678.0,
- "step": 1316
- },
- {
- "epoch": 1.000759878419453,
- "grad_norm": 2.870274543762207,
- "learning_rate": 3.981298603953385e-06,
- "loss": 0.3723528981208801,
- "mean_token_accuracy": 0.8745899796485901,
- "num_tokens": 11773290.0,
- "step": 1317
- },
- {
- "epoch": 1.0015197568389058,
- "grad_norm": 1.3442503213882446,
- "learning_rate": 3.979610931478574e-06,
- "loss": 0.34688329696655273,
- "mean_token_accuracy": 0.8749074935913086,
- "num_tokens": 11786400.0,
- "step": 1318
- },
- {
- "epoch": 1.0022796352583587,
- "grad_norm": 1.7272238731384277,
- "learning_rate": 3.977922220555855e-06,
- "loss": 0.28274932503700256,
- "mean_token_accuracy": 0.896713137626648,
- "num_tokens": 11793059.0,
- "step": 1319
- },
- {
- "epoch": 1.0030395136778116,
- "grad_norm": 1.7362451553344727,
- "learning_rate": 3.976232472370431e-06,
- "loss": 0.5494794845581055,
- "mean_token_accuracy": 0.8341718912124634,
- "num_tokens": 11802593.0,
- "step": 1320
- },
- {
- "epoch": 1.0037993920972645,
- "grad_norm": 1.3316494226455688,
- "learning_rate": 3.97454168810823e-06,
- "loss": 0.41505366563796997,
- "mean_token_accuracy": 0.8581969738006592,
- "num_tokens": 11813925.0,
- "step": 1321
- },
- {
- "epoch": 1.0045592705167172,
- "grad_norm": 1.6152615547180176,
- "learning_rate": 3.972849868955913e-06,
- "loss": 0.44761013984680176,
- "mean_token_accuracy": 0.8413045406341553,
- "num_tokens": 11825709.0,
- "step": 1322
- },
- {
- "epoch": 1.0053191489361701,
- "grad_norm": 2.1172471046447754,
- "learning_rate": 3.97115701610086e-06,
- "loss": 0.3903353810310364,
- "mean_token_accuracy": 0.8662760257720947,
- "num_tokens": 11832070.0,
- "step": 1323
- },
- {
- "epoch": 1.006079027355623,
- "grad_norm": 1.5923868417739868,
- "learning_rate": 3.969463130731183e-06,
- "loss": 0.4491051137447357,
- "mean_token_accuracy": 0.8677828311920166,
- "num_tokens": 11843154.0,
- "step": 1324
- },
- {
- "epoch": 1.006838905775076,
- "grad_norm": 1.6848995685577393,
- "learning_rate": 3.967768214035716e-06,
- "loss": 0.45765817165374756,
- "mean_token_accuracy": 0.8401060104370117,
- "num_tokens": 11854826.0,
- "step": 1325
- },
- {
- "epoch": 1.0075987841945289,
- "grad_norm": 2.3739020824432373,
- "learning_rate": 3.966072267204014e-06,
- "loss": 0.4482722580432892,
- "mean_token_accuracy": 0.8368916511535645,
- "num_tokens": 11860559.0,
- "step": 1326
- },
- {
- "epoch": 1.0083586626139818,
- "grad_norm": 1.5403034687042236,
- "learning_rate": 3.964375291426361e-06,
- "loss": 0.35589972138404846,
- "mean_token_accuracy": 0.8728118538856506,
- "num_tokens": 11871959.0,
- "step": 1327
- },
- {
- "epoch": 1.0091185410334347,
- "grad_norm": 1.6750119924545288,
- "learning_rate": 3.962677287893758e-06,
- "loss": 0.35873427987098694,
- "mean_token_accuracy": 0.9027186632156372,
- "num_tokens": 11881818.0,
- "step": 1328
- },
- {
- "epoch": 1.0098784194528876,
- "grad_norm": 1.5489170551300049,
- "learning_rate": 3.9609782577979305e-06,
- "loss": 0.3634672462940216,
- "mean_token_accuracy": 0.8582607507705688,
- "num_tokens": 11891084.0,
- "step": 1329
- },
- {
- "epoch": 1.0106382978723405,
- "grad_norm": 2.43859601020813,
- "learning_rate": 3.959278202331323e-06,
- "loss": 0.3640799820423126,
- "mean_token_accuracy": 0.88062584400177,
- "num_tokens": 11896032.0,
- "step": 1330
- },
- {
- "epoch": 1.0113981762917934,
- "grad_norm": 3.612184524536133,
- "learning_rate": 3.9575771226870986e-06,
- "loss": 0.3733130097389221,
- "mean_token_accuracy": 0.8946067094802856,
- "num_tokens": 11899479.0,
- "step": 1331
- },
- {
- "epoch": 1.012158054711246,
- "grad_norm": 1.541355848312378,
- "learning_rate": 3.955875020059141e-06,
- "loss": 0.320593923330307,
- "mean_token_accuracy": 0.9057406783103943,
- "num_tokens": 11910179.0,
- "step": 1332
- },
- {
- "epoch": 1.012917933130699,
- "grad_norm": 2.0565030574798584,
- "learning_rate": 3.954171895642052e-06,
- "loss": 0.3341682553291321,
- "mean_token_accuracy": 0.8829344511032104,
- "num_tokens": 11916489.0,
- "step": 1333
- },
- {
- "epoch": 1.013677811550152,
- "grad_norm": 2.9732539653778076,
- "learning_rate": 3.9524677506311505e-06,
- "loss": 0.38488566875457764,
- "mean_token_accuracy": 0.8752974271774292,
- "num_tokens": 11920682.0,
- "step": 1334
- },
- {
- "epoch": 1.0144376899696048,
- "grad_norm": 2.7697458267211914,
- "learning_rate": 3.950762586222469e-06,
- "loss": 0.39864760637283325,
- "mean_token_accuracy": 0.8593167662620544,
- "num_tokens": 11925233.0,
- "step": 1335
- },
- {
- "epoch": 1.0151975683890577,
- "grad_norm": 2.2302119731903076,
- "learning_rate": 3.949056403612758e-06,
- "loss": 0.3985682725906372,
- "mean_token_accuracy": 0.8677899837493896,
- "num_tokens": 11932000.0,
- "step": 1336
- },
- {
- "epoch": 1.0159574468085106,
- "grad_norm": 2.360572576522827,
- "learning_rate": 3.947349203999485e-06,
- "loss": 0.36940714716911316,
- "mean_token_accuracy": 0.8760676383972168,
- "num_tokens": 11937569.0,
- "step": 1337
- },
- {
- "epoch": 1.0167173252279635,
- "grad_norm": 1.3383921384811401,
- "learning_rate": 3.945640988580824e-06,
- "loss": 0.40628793835639954,
- "mean_token_accuracy": 0.866442084312439,
- "num_tokens": 11955679.0,
- "step": 1338
- },
- {
- "epoch": 1.0174772036474165,
- "grad_norm": 2.1502623558044434,
- "learning_rate": 3.943931758555669e-06,
- "loss": 0.4493565559387207,
- "mean_token_accuracy": 0.8307522535324097,
- "num_tokens": 11962734.0,
- "step": 1339
- },
- {
- "epoch": 1.0182370820668694,
- "grad_norm": 2.4737331867218018,
- "learning_rate": 3.942221515123624e-06,
- "loss": 0.28508758544921875,
- "mean_token_accuracy": 0.8967142105102539,
- "num_tokens": 11967783.0,
- "step": 1340
- },
- {
- "epoch": 1.0189969604863223,
- "grad_norm": 2.4525370597839355,
- "learning_rate": 3.940510259485002e-06,
- "loss": 0.40227818489074707,
- "mean_token_accuracy": 0.8618967533111572,
- "num_tokens": 11972918.0,
- "step": 1341
- },
- {
- "epoch": 1.0197568389057752,
- "grad_norm": 1.7299731969833374,
- "learning_rate": 3.938797992840828e-06,
- "loss": 0.26339593529701233,
- "mean_token_accuracy": 0.9004406929016113,
- "num_tokens": 11981250.0,
- "step": 1342
- },
- {
- "epoch": 1.0205167173252279,
- "grad_norm": 2.8756747245788574,
- "learning_rate": 3.937084716392839e-06,
- "loss": 0.47792482376098633,
- "mean_token_accuracy": 0.8440839052200317,
- "num_tokens": 11986356.0,
- "step": 1343
- },
- {
- "epoch": 1.0212765957446808,
- "grad_norm": 2.104473114013672,
- "learning_rate": 3.935370431343475e-06,
- "loss": 0.36723971366882324,
- "mean_token_accuracy": 0.8831232786178589,
- "num_tokens": 11994495.0,
- "step": 1344
- },
- {
- "epoch": 1.0220364741641337,
- "grad_norm": 1.9173074960708618,
- "learning_rate": 3.933655138895889e-06,
- "loss": 0.409319669008255,
- "mean_token_accuracy": 0.8632645606994629,
- "num_tokens": 12002060.0,
- "step": 1345
- },
- {
- "epoch": 1.0227963525835866,
- "grad_norm": 2.958311080932617,
- "learning_rate": 3.9319388402539395e-06,
- "loss": 0.5390093922615051,
- "mean_token_accuracy": 0.8204828500747681,
- "num_tokens": 12007588.0,
- "step": 1346
- },
- {
- "epoch": 1.0235562310030395,
- "grad_norm": 1.6470831632614136,
- "learning_rate": 3.930221536622192e-06,
- "loss": 0.4524633288383484,
- "mean_token_accuracy": 0.8516575694084167,
- "num_tokens": 12018831.0,
- "step": 1347
- },
- {
- "epoch": 1.0243161094224924,
- "grad_norm": 1.3160780668258667,
- "learning_rate": 3.928503229205913e-06,
- "loss": 0.4180558919906616,
- "mean_token_accuracy": 0.8495022058486938,
- "num_tokens": 12033947.0,
- "step": 1348
- },
- {
- "epoch": 1.0250759878419453,
- "grad_norm": 1.9686089754104614,
- "learning_rate": 3.92678391921108e-06,
- "loss": 0.41927334666252136,
- "mean_token_accuracy": 0.8462997674942017,
- "num_tokens": 12042005.0,
- "step": 1349
- },
- {
- "epoch": 1.0258358662613982,
- "grad_norm": 2.351778507232666,
- "learning_rate": 3.92506360784437e-06,
- "loss": 0.2946245074272156,
- "mean_token_accuracy": 0.9170923233032227,
- "num_tokens": 12046579.0,
- "step": 1350
- },
- {
- "epoch": 1.0265957446808511,
- "grad_norm": 2.0636913776397705,
- "learning_rate": 3.923342296313162e-06,
- "loss": 0.3422774076461792,
- "mean_token_accuracy": 0.8809213638305664,
- "num_tokens": 12053214.0,
- "step": 1351
- },
- {
- "epoch": 1.027355623100304,
- "grad_norm": 1.7272592782974243,
- "learning_rate": 3.92161998582554e-06,
- "loss": 0.5864541530609131,
- "mean_token_accuracy": 0.7986117601394653,
- "num_tokens": 12068522.0,
- "step": 1352
- },
- {
- "epoch": 1.028115501519757,
- "grad_norm": 0.8980231881141663,
- "learning_rate": 3.919896677590289e-06,
- "loss": 0.2964550256729126,
- "mean_token_accuracy": 0.8911845088005066,
- "num_tokens": 12093834.0,
- "step": 1353
- },
- {
- "epoch": 1.0288753799392096,
- "grad_norm": 1.6031712293624878,
- "learning_rate": 3.918172372816892e-06,
- "loss": 0.37254488468170166,
- "mean_token_accuracy": 0.8615843057632446,
- "num_tokens": 12104393.0,
- "step": 1354
- },
- {
- "epoch": 1.0296352583586625,
- "grad_norm": 1.282134771347046,
- "learning_rate": 3.916447072715531e-06,
- "loss": 0.3522927761077881,
- "mean_token_accuracy": 0.8713657259941101,
- "num_tokens": 12118671.0,
- "step": 1355
- },
- {
- "epoch": 1.0303951367781155,
- "grad_norm": 2.1986680030822754,
- "learning_rate": 3.914720778497091e-06,
- "loss": 0.3716316223144531,
- "mean_token_accuracy": 0.8661249279975891,
- "num_tokens": 12125178.0,
- "step": 1356
- },
- {
- "epoch": 1.0311550151975684,
- "grad_norm": 1.5937882661819458,
- "learning_rate": 3.91299349137315e-06,
- "loss": 0.48067355155944824,
- "mean_token_accuracy": 0.8284252882003784,
- "num_tokens": 12136785.0,
- "step": 1357
- },
- {
- "epoch": 1.0319148936170213,
- "grad_norm": 1.6743099689483643,
- "learning_rate": 3.9112652125559845e-06,
- "loss": 0.4461551308631897,
- "mean_token_accuracy": 0.8381845355033875,
- "num_tokens": 12150066.0,
- "step": 1358
- },
- {
- "epoch": 1.0326747720364742,
- "grad_norm": 2.2346715927124023,
- "learning_rate": 3.909535943258567e-06,
- "loss": 0.3148220181465149,
- "mean_token_accuracy": 0.8797591924667358,
- "num_tokens": 12155506.0,
- "step": 1359
- },
- {
- "epoch": 1.033434650455927,
- "grad_norm": 1.9608992338180542,
- "learning_rate": 3.907805684694567e-06,
- "loss": 0.32598960399627686,
- "mean_token_accuracy": 0.8819410800933838,
- "num_tokens": 12163261.0,
- "step": 1360
- },
- {
- "epoch": 1.03419452887538,
- "grad_norm": 2.413477897644043,
- "learning_rate": 3.906074438078343e-06,
- "loss": 0.38179588317871094,
- "mean_token_accuracy": 0.8739585876464844,
- "num_tokens": 12169254.0,
- "step": 1361
- },
- {
- "epoch": 1.034954407294833,
- "grad_norm": 2.0258278846740723,
- "learning_rate": 3.904342204624955e-06,
- "loss": 0.33240315318107605,
- "mean_token_accuracy": 0.8808181285858154,
- "num_tokens": 12175379.0,
- "step": 1362
- },
- {
- "epoch": 1.0357142857142858,
- "grad_norm": 2.4111437797546387,
- "learning_rate": 3.9026089855501475e-06,
- "loss": 0.412802517414093,
- "mean_token_accuracy": 0.8504396677017212,
- "num_tokens": 12182007.0,
- "step": 1363
- },
- {
- "epoch": 1.0364741641337385,
- "grad_norm": 2.0424840450286865,
- "learning_rate": 3.900874782070362e-06,
- "loss": 0.2914797067642212,
- "mean_token_accuracy": 0.8731886148452759,
- "num_tokens": 12187743.0,
- "step": 1364
- },
- {
- "epoch": 1.0372340425531914,
- "grad_norm": 2.9248716831207275,
- "learning_rate": 3.899139595402729e-06,
- "loss": 0.34071338176727295,
- "mean_token_accuracy": 0.8736443519592285,
- "num_tokens": 12191830.0,
- "step": 1365
- },
- {
- "epoch": 1.0379939209726443,
- "grad_norm": 2.240220785140991,
- "learning_rate": 3.8974034267650695e-06,
- "loss": 0.23049014806747437,
- "mean_token_accuracy": 0.9000070691108704,
- "num_tokens": 12196460.0,
- "step": 1366
- },
- {
- "epoch": 1.0387537993920972,
- "grad_norm": 1.5038460493087769,
- "learning_rate": 3.895666277375892e-06,
- "loss": 0.32255327701568604,
- "mean_token_accuracy": 0.873004674911499,
- "num_tokens": 12206230.0,
- "step": 1367
- },
- {
- "epoch": 1.0395136778115501,
- "grad_norm": 1.2339142560958862,
- "learning_rate": 3.893928148454398e-06,
- "loss": 0.4069131314754486,
- "mean_token_accuracy": 0.8461740016937256,
- "num_tokens": 12226502.0,
- "step": 1368
- },
- {
- "epoch": 1.040273556231003,
- "grad_norm": 2.531553268432617,
- "learning_rate": 3.89218904122047e-06,
- "loss": 0.43681037425994873,
- "mean_token_accuracy": 0.8497104048728943,
- "num_tokens": 12232241.0,
- "step": 1369
- },
- {
- "epoch": 1.041033434650456,
- "grad_norm": 3.8404815196990967,
- "learning_rate": 3.890448956894682e-06,
- "loss": 0.3241814970970154,
- "mean_token_accuracy": 0.884732723236084,
- "num_tokens": 12235126.0,
- "step": 1370
- },
- {
- "epoch": 1.0417933130699089,
- "grad_norm": 2.9608030319213867,
- "learning_rate": 3.888707896698293e-06,
- "loss": 0.4641021490097046,
- "mean_token_accuracy": 0.8496800661087036,
- "num_tokens": 12240630.0,
- "step": 1371
- },
- {
- "epoch": 1.0425531914893618,
- "grad_norm": 2.1166417598724365,
- "learning_rate": 3.886965861853243e-06,
- "loss": 0.42038479447364807,
- "mean_token_accuracy": 0.8512747287750244,
- "num_tokens": 12247969.0,
- "step": 1372
- },
- {
- "epoch": 1.0433130699088147,
- "grad_norm": 2.5918161869049072,
- "learning_rate": 3.885222853582163e-06,
- "loss": 0.2871917188167572,
- "mean_token_accuracy": 0.9129709601402283,
- "num_tokens": 12252161.0,
- "step": 1373
- },
- {
- "epoch": 1.0440729483282676,
- "grad_norm": 2.4261348247528076,
- "learning_rate": 3.88347887310836e-06,
- "loss": 0.4003123342990875,
- "mean_token_accuracy": 0.8570356369018555,
- "num_tokens": 12258135.0,
- "step": 1374
- },
- {
- "epoch": 1.0448328267477203,
- "grad_norm": 1.3439548015594482,
- "learning_rate": 3.881733921655829e-06,
- "loss": 0.3278140425682068,
- "mean_token_accuracy": 0.8831373453140259,
- "num_tokens": 12272849.0,
- "step": 1375
- },
- {
- "epoch": 1.0455927051671732,
- "grad_norm": 1.527989387512207,
- "learning_rate": 3.879988000449243e-06,
- "loss": 0.33789363503456116,
- "mean_token_accuracy": 0.8825669884681702,
- "num_tokens": 12283281.0,
- "step": 1376
- },
- {
- "epoch": 1.046352583586626,
- "grad_norm": 1.6755503416061401,
- "learning_rate": 3.878241110713957e-06,
- "loss": 0.4816160798072815,
- "mean_token_accuracy": 0.8193758726119995,
- "num_tokens": 12295422.0,
- "step": 1377
- },
- {
- "epoch": 1.047112462006079,
- "grad_norm": 2.8110361099243164,
- "learning_rate": 3.876493253676004e-06,
- "loss": 0.38662949204444885,
- "mean_token_accuracy": 0.8611986637115479,
- "num_tokens": 12299806.0,
- "step": 1378
- },
- {
- "epoch": 1.047872340425532,
- "grad_norm": 1.86097252368927,
- "learning_rate": 3.8747444305621e-06,
- "loss": 0.27612629532814026,
- "mean_token_accuracy": 0.8984048366546631,
- "num_tokens": 12306599.0,
- "step": 1379
- },
- {
- "epoch": 1.0486322188449848,
- "grad_norm": 2.361828565597534,
- "learning_rate": 3.872994642599635e-06,
- "loss": 0.469953715801239,
- "mean_token_accuracy": 0.8464452028274536,
- "num_tokens": 12314249.0,
- "step": 1380
- },
- {
- "epoch": 1.0493920972644377,
- "grad_norm": 1.9524794816970825,
- "learning_rate": 3.871243891016676e-06,
- "loss": 0.5419625043869019,
- "mean_token_accuracy": 0.8468329906463623,
- "num_tokens": 12324987.0,
- "step": 1381
- },
- {
- "epoch": 1.0501519756838906,
- "grad_norm": 1.6931511163711548,
- "learning_rate": 3.869492177041971e-06,
- "loss": 0.3791416883468628,
- "mean_token_accuracy": 0.8692882061004639,
- "num_tokens": 12336864.0,
- "step": 1382
- },
- {
- "epoch": 1.0509118541033435,
- "grad_norm": 1.909692406654358,
- "learning_rate": 3.867739501904938e-06,
- "loss": 0.27974557876586914,
- "mean_token_accuracy": 0.9004636406898499,
- "num_tokens": 12343093.0,
- "step": 1383
- },
- {
- "epoch": 1.0516717325227964,
- "grad_norm": 1.415162205696106,
- "learning_rate": 3.8659858668356735e-06,
- "loss": 0.38928335905075073,
- "mean_token_accuracy": 0.8491984009742737,
- "num_tokens": 12356613.0,
- "step": 1384
- },
- {
- "epoch": 1.0524316109422491,
- "grad_norm": 1.8195741176605225,
- "learning_rate": 3.864231273064944e-06,
- "loss": 0.3798758089542389,
- "mean_token_accuracy": 0.8728072047233582,
- "num_tokens": 12364860.0,
- "step": 1385
- },
- {
- "epoch": 1.053191489361702,
- "grad_norm": 1.8481454849243164,
- "learning_rate": 3.862475721824193e-06,
- "loss": 0.269635945558548,
- "mean_token_accuracy": 0.899247407913208,
- "num_tokens": 12371841.0,
- "step": 1386
- },
- {
- "epoch": 1.053951367781155,
- "grad_norm": 1.7838784456253052,
- "learning_rate": 3.8607192143455325e-06,
- "loss": 0.36971768736839294,
- "mean_token_accuracy": 0.8833638429641724,
- "num_tokens": 12380685.0,
- "step": 1387
- },
- {
- "epoch": 1.0547112462006079,
- "grad_norm": 1.333358645439148,
- "learning_rate": 3.858961751861748e-06,
- "loss": 0.4039418399333954,
- "mean_token_accuracy": 0.8541078567504883,
- "num_tokens": 12394072.0,
- "step": 1388
- },
- {
- "epoch": 1.0554711246200608,
- "grad_norm": 2.1600265502929688,
- "learning_rate": 3.857203335606294e-06,
- "loss": 0.38211894035339355,
- "mean_token_accuracy": 0.8549972772598267,
- "num_tokens": 12400449.0,
- "step": 1389
- },
- {
- "epoch": 1.0562310030395137,
- "grad_norm": 2.914902687072754,
- "learning_rate": 3.855443966813295e-06,
- "loss": 0.2237374186515808,
- "mean_token_accuracy": 0.9253600835800171,
- "num_tokens": 12403758.0,
- "step": 1390
- },
- {
- "epoch": 1.0569908814589666,
- "grad_norm": 2.2361080646514893,
- "learning_rate": 3.853683646717543e-06,
- "loss": 0.3359566926956177,
- "mean_token_accuracy": 0.898173451423645,
- "num_tokens": 12410374.0,
- "step": 1391
- },
- {
- "epoch": 1.0577507598784195,
- "grad_norm": 2.3639304637908936,
- "learning_rate": 3.8519223765544985e-06,
- "loss": 0.3844943046569824,
- "mean_token_accuracy": 0.863599419593811,
- "num_tokens": 12416016.0,
- "step": 1392
- },
- {
- "epoch": 1.0585106382978724,
- "grad_norm": 2.202971935272217,
- "learning_rate": 3.85016015756029e-06,
- "loss": 0.3546281158924103,
- "mean_token_accuracy": 0.8907540440559387,
- "num_tokens": 12422026.0,
- "step": 1393
- },
- {
- "epoch": 1.0592705167173253,
- "grad_norm": 1.1279661655426025,
- "learning_rate": 3.848396990971709e-06,
- "loss": 0.31522464752197266,
- "mean_token_accuracy": 0.8662257194519043,
- "num_tokens": 12439964.0,
- "step": 1394
- },
- {
- "epoch": 1.0600303951367782,
- "grad_norm": 2.4731740951538086,
- "learning_rate": 3.846632878026214e-06,
- "loss": 0.456442266702652,
- "mean_token_accuracy": 0.8516958951950073,
- "num_tokens": 12446231.0,
- "step": 1395
- },
- {
- "epoch": 1.060790273556231,
- "grad_norm": 1.7631878852844238,
- "learning_rate": 3.844867819961928e-06,
- "loss": 0.487227201461792,
- "mean_token_accuracy": 0.8466947078704834,
- "num_tokens": 12459989.0,
- "step": 1396
- },
- {
- "epoch": 1.0615501519756838,
- "grad_norm": 2.4468278884887695,
- "learning_rate": 3.843101818017637e-06,
- "loss": 0.3367291986942291,
- "mean_token_accuracy": 0.8734689950942993,
- "num_tokens": 12465741.0,
- "step": 1397
- },
- {
- "epoch": 1.0623100303951367,
- "grad_norm": 1.9045145511627197,
- "learning_rate": 3.841334873432789e-06,
- "loss": 0.4652615487575531,
- "mean_token_accuracy": 0.8333107233047485,
- "num_tokens": 12474963.0,
- "step": 1398
- },
- {
- "epoch": 1.0630699088145896,
- "grad_norm": 1.6816917657852173,
- "learning_rate": 3.839566987447492e-06,
- "loss": 0.4144279956817627,
- "mean_token_accuracy": 0.8472539186477661,
- "num_tokens": 12485521.0,
- "step": 1399
- },
- {
- "epoch": 1.0638297872340425,
- "grad_norm": 1.8990092277526855,
- "learning_rate": 3.837798161302518e-06,
- "loss": 0.4040985405445099,
- "mean_token_accuracy": 0.8514704704284668,
- "num_tokens": 12493495.0,
- "step": 1400
- },
- {
- "epoch": 1.0645896656534954,
- "grad_norm": 2.27785325050354,
- "learning_rate": 3.836028396239297e-06,
- "loss": 0.43425723910331726,
- "mean_token_accuracy": 0.8795069456100464,
- "num_tokens": 12499789.0,
- "step": 1401
- },
- {
- "epoch": 1.0653495440729484,
- "grad_norm": 2.5130882263183594,
- "learning_rate": 3.8342576934999184e-06,
- "loss": 0.33892524242401123,
- "mean_token_accuracy": 0.8717449903488159,
- "num_tokens": 12504885.0,
- "step": 1402
- },
- {
- "epoch": 1.0661094224924013,
- "grad_norm": 2.650040864944458,
- "learning_rate": 3.832486054327131e-06,
- "loss": 0.4200317859649658,
- "mean_token_accuracy": 0.8616159558296204,
- "num_tokens": 12509783.0,
- "step": 1403
- },
- {
- "epoch": 1.0668693009118542,
- "grad_norm": 2.9176881313323975,
- "learning_rate": 3.830713479964335e-06,
- "loss": 0.37018489837646484,
- "mean_token_accuracy": 0.8676021695137024,
- "num_tokens": 12514441.0,
- "step": 1404
- },
- {
- "epoch": 1.067629179331307,
- "grad_norm": 1.6430318355560303,
- "learning_rate": 3.828939971655595e-06,
- "loss": 0.27539193630218506,
- "mean_token_accuracy": 0.9077831506729126,
- "num_tokens": 12523677.0,
- "step": 1405
- },
- {
- "epoch": 1.06838905775076,
- "grad_norm": 1.3683708906173706,
- "learning_rate": 3.827165530645627e-06,
- "loss": 0.4085099697113037,
- "mean_token_accuracy": 0.8579255938529968,
- "num_tokens": 12540104.0,
- "step": 1406
- },
- {
- "epoch": 1.0691489361702127,
- "grad_norm": 2.528465747833252,
- "learning_rate": 3.825390158179802e-06,
- "loss": 0.42462456226348877,
- "mean_token_accuracy": 0.852813720703125,
- "num_tokens": 12548239.0,
- "step": 1407
- },
- {
- "epoch": 1.0699088145896656,
- "grad_norm": 1.8288795948028564,
- "learning_rate": 3.823613855504144e-06,
- "loss": 0.412417471408844,
- "mean_token_accuracy": 0.8622130751609802,
- "num_tokens": 12557316.0,
- "step": 1408
- },
- {
- "epoch": 1.0706686930091185,
- "grad_norm": 2.341794490814209,
- "learning_rate": 3.82183662386533e-06,
- "loss": 0.2996668815612793,
- "mean_token_accuracy": 0.8964041471481323,
- "num_tokens": 12562377.0,
- "step": 1409
- },
- {
- "epoch": 1.0714285714285714,
- "grad_norm": 2.555877208709717,
- "learning_rate": 3.82005846451069e-06,
- "loss": 0.4184221625328064,
- "mean_token_accuracy": 0.8678828477859497,
- "num_tokens": 12568516.0,
- "step": 1410
- },
- {
- "epoch": 1.0721884498480243,
- "grad_norm": 2.081308126449585,
- "learning_rate": 3.8182793786882065e-06,
- "loss": 0.4376835823059082,
- "mean_token_accuracy": 0.8409077525138855,
- "num_tokens": 12576598.0,
- "step": 1411
- },
- {
- "epoch": 1.0729483282674772,
- "grad_norm": 2.0272316932678223,
- "learning_rate": 3.816499367646508e-06,
- "loss": 0.3630060851573944,
- "mean_token_accuracy": 0.8762413263320923,
- "num_tokens": 12584587.0,
- "step": 1412
- },
- {
- "epoch": 1.0737082066869301,
- "grad_norm": 2.6382484436035156,
- "learning_rate": 3.814718432634877e-06,
- "loss": 0.4244990348815918,
- "mean_token_accuracy": 0.8509312272071838,
- "num_tokens": 12590028.0,
- "step": 1413
- },
- {
- "epoch": 1.074468085106383,
- "grad_norm": 2.429800271987915,
- "learning_rate": 3.8129365749032398e-06,
- "loss": 0.36990004777908325,
- "mean_token_accuracy": 0.8749774098396301,
- "num_tokens": 12594984.0,
- "step": 1414
- },
- {
- "epoch": 1.075227963525836,
- "grad_norm": 3.5939090251922607,
- "learning_rate": 3.8111537957021736e-06,
- "loss": 0.4245661199092865,
- "mean_token_accuracy": 0.8481623530387878,
- "num_tokens": 12598494.0,
- "step": 1415
- },
- {
- "epoch": 1.0759878419452888,
- "grad_norm": 2.705955982208252,
- "learning_rate": 3.809370096282903e-06,
- "loss": 0.41851678490638733,
- "mean_token_accuracy": 0.8548051714897156,
- "num_tokens": 12603876.0,
- "step": 1416
- },
- {
- "epoch": 1.0767477203647418,
- "grad_norm": 1.7812079191207886,
- "learning_rate": 3.807585477897296e-06,
- "loss": 0.47113919258117676,
- "mean_token_accuracy": 0.8346904516220093,
- "num_tokens": 12613402.0,
- "step": 1417
- },
- {
- "epoch": 1.0775075987841944,
- "grad_norm": 1.4335212707519531,
- "learning_rate": 3.8057999417978654e-06,
- "loss": 0.3802063465118408,
- "mean_token_accuracy": 0.8563423156738281,
- "num_tokens": 12626865.0,
- "step": 1418
- },
- {
- "epoch": 1.0782674772036474,
- "grad_norm": 1.9171305894851685,
- "learning_rate": 3.8040134892377702e-06,
- "loss": 0.20898357033729553,
- "mean_token_accuracy": 0.9189738035202026,
- "num_tokens": 12632593.0,
- "step": 1419
- },
- {
- "epoch": 1.0790273556231003,
- "grad_norm": 1.4996821880340576,
- "learning_rate": 3.802226121470811e-06,
- "loss": 0.4203261137008667,
- "mean_token_accuracy": 0.8479211330413818,
- "num_tokens": 12646395.0,
- "step": 1420
- },
- {
- "epoch": 1.0797872340425532,
- "grad_norm": 2.2007253170013428,
- "learning_rate": 3.800437839751432e-06,
- "loss": 0.40370577573776245,
- "mean_token_accuracy": 0.8427679538726807,
- "num_tokens": 12653508.0,
- "step": 1421
- },
- {
- "epoch": 1.080547112462006,
- "grad_norm": 1.7266581058502197,
- "learning_rate": 3.7986486453347183e-06,
- "loss": 0.46750491857528687,
- "mean_token_accuracy": 0.8429205417633057,
- "num_tokens": 12666329.0,
- "step": 1422
- },
- {
- "epoch": 1.081306990881459,
- "grad_norm": 1.4716318845748901,
- "learning_rate": 3.796858539476394e-06,
- "loss": 0.3330317735671997,
- "mean_token_accuracy": 0.879012942314148,
- "num_tokens": 12676741.0,
- "step": 1423
- },
- {
- "epoch": 1.082066869300912,
- "grad_norm": 2.652127265930176,
- "learning_rate": 3.795067523432826e-06,
- "loss": 0.35365715622901917,
- "mean_token_accuracy": 0.8796792030334473,
- "num_tokens": 12681479.0,
- "step": 1424
- },
- {
- "epoch": 1.0828267477203648,
- "grad_norm": 1.2937829494476318,
- "learning_rate": 3.793275598461017e-06,
- "loss": 0.25272446870803833,
- "mean_token_accuracy": 0.9231734275817871,
- "num_tokens": 12694238.0,
- "step": 1425
- },
- {
- "epoch": 1.0835866261398177,
- "grad_norm": 1.3831220865249634,
- "learning_rate": 3.7914827658186104e-06,
- "loss": 0.4935331344604492,
- "mean_token_accuracy": 0.8417420387268066,
- "num_tokens": 12712857.0,
- "step": 1426
- },
- {
- "epoch": 1.0843465045592706,
- "grad_norm": 3.059525728225708,
- "learning_rate": 3.7896890267638832e-06,
- "loss": 0.2592190206050873,
- "mean_token_accuracy": 0.9040263295173645,
- "num_tokens": 12716766.0,
- "step": 1427
- },
- {
- "epoch": 1.0851063829787233,
- "grad_norm": 2.8399202823638916,
- "learning_rate": 3.787894382555752e-06,
- "loss": 0.32098138332366943,
- "mean_token_accuracy": 0.8838302493095398,
- "num_tokens": 12720774.0,
- "step": 1428
- },
- {
- "epoch": 1.0858662613981762,
- "grad_norm": 2.618479013442993,
- "learning_rate": 3.7860988344537664e-06,
- "loss": 0.425255686044693,
- "mean_token_accuracy": 0.8564130067825317,
- "num_tokens": 12726506.0,
- "step": 1429
- },
- {
- "epoch": 1.0866261398176291,
- "grad_norm": 1.3108669519424438,
- "learning_rate": 3.7843023837181126e-06,
- "loss": 0.40220165252685547,
- "mean_token_accuracy": 0.8588873147964478,
- "num_tokens": 12742814.0,
- "step": 1430
- },
- {
- "epoch": 1.087386018237082,
- "grad_norm": 2.2083566188812256,
- "learning_rate": 3.782505031609607e-06,
- "loss": 0.318379282951355,
- "mean_token_accuracy": 0.8887606859207153,
- "num_tokens": 12748388.0,
- "step": 1431
- },
- {
- "epoch": 1.088145896656535,
- "grad_norm": 1.922358751296997,
- "learning_rate": 3.7807067793897006e-06,
- "loss": 0.2519589364528656,
- "mean_token_accuracy": 0.8936764001846313,
- "num_tokens": 12754761.0,
- "step": 1432
- },
- {
- "epoch": 1.0889057750759878,
- "grad_norm": 1.7367439270019531,
- "learning_rate": 3.778907628320477e-06,
- "loss": 0.3970367908477783,
- "mean_token_accuracy": 0.858735203742981,
- "num_tokens": 12764016.0,
- "step": 1433
- },
- {
- "epoch": 1.0896656534954408,
- "grad_norm": 2.1931066513061523,
- "learning_rate": 3.77710757966465e-06,
- "loss": 0.5250554084777832,
- "mean_token_accuracy": 0.8356746435165405,
- "num_tokens": 12772272.0,
- "step": 1434
- },
- {
- "epoch": 1.0904255319148937,
- "grad_norm": 1.718337893486023,
- "learning_rate": 3.775306634685562e-06,
- "loss": 0.283231645822525,
- "mean_token_accuracy": 0.9009919166564941,
- "num_tokens": 12780706.0,
- "step": 1435
- },
- {
- "epoch": 1.0911854103343466,
- "grad_norm": 2.1985926628112793,
- "learning_rate": 3.773504794647187e-06,
- "loss": 0.3913170397281647,
- "mean_token_accuracy": 0.8909255266189575,
- "num_tokens": 12787052.0,
- "step": 1436
- },
- {
- "epoch": 1.0919452887537995,
- "grad_norm": 2.8687937259674072,
- "learning_rate": 3.771702060814123e-06,
- "loss": 0.3135771155357361,
- "mean_token_accuracy": 0.9016125202178955,
- "num_tokens": 12791854.0,
- "step": 1437
- },
- {
- "epoch": 1.0927051671732522,
- "grad_norm": 4.203946590423584,
- "learning_rate": 3.7698984344516e-06,
- "loss": 0.3642737865447998,
- "mean_token_accuracy": 0.8842349052429199,
- "num_tokens": 12794969.0,
- "step": 1438
- },
- {
- "epoch": 1.093465045592705,
- "grad_norm": 1.5134642124176025,
- "learning_rate": 3.7680939168254733e-06,
- "loss": 0.3732057213783264,
- "mean_token_accuracy": 0.8671083450317383,
- "num_tokens": 12808480.0,
- "step": 1439
- },
- {
- "epoch": 1.094224924012158,
- "grad_norm": 3.2103970050811768,
- "learning_rate": 3.7662885092022206e-06,
- "loss": 0.3556194603443146,
- "mean_token_accuracy": 0.8786529302597046,
- "num_tokens": 12812654.0,
- "step": 1440
- },
- {
- "epoch": 1.094984802431611,
- "grad_norm": 2.2774064540863037,
- "learning_rate": 3.7644822128489476e-06,
- "loss": 0.38409674167633057,
- "mean_token_accuracy": 0.866563081741333,
- "num_tokens": 12819854.0,
- "step": 1441
- },
- {
- "epoch": 1.0957446808510638,
- "grad_norm": 1.8250885009765625,
- "learning_rate": 3.7626750290333824e-06,
- "loss": 0.3812350034713745,
- "mean_token_accuracy": 0.8676212430000305,
- "num_tokens": 12830338.0,
- "step": 1442
- },
- {
- "epoch": 1.0965045592705167,
- "grad_norm": 1.8337891101837158,
- "learning_rate": 3.7608669590238765e-06,
- "loss": 0.3892471194267273,
- "mean_token_accuracy": 0.8616238832473755,
- "num_tokens": 12840340.0,
- "step": 1443
- },
- {
- "epoch": 1.0972644376899696,
- "grad_norm": 1.5300254821777344,
- "learning_rate": 3.7590580040894025e-06,
- "loss": 0.35288217663764954,
- "mean_token_accuracy": 0.8625509738922119,
- "num_tokens": 12853144.0,
- "step": 1444
- },
- {
- "epoch": 1.0980243161094225,
- "grad_norm": 2.152683734893799,
- "learning_rate": 3.7572481654995554e-06,
- "loss": 0.4004772901535034,
- "mean_token_accuracy": 0.858427107334137,
- "num_tokens": 12859970.0,
- "step": 1445
- },
- {
- "epoch": 1.0987841945288754,
- "grad_norm": 1.532832145690918,
- "learning_rate": 3.755437444524548e-06,
- "loss": 0.46820127964019775,
- "mean_token_accuracy": 0.8585472106933594,
- "num_tokens": 12875243.0,
- "step": 1446
- },
- {
- "epoch": 1.0995440729483283,
- "grad_norm": 1.6485342979431152,
- "learning_rate": 3.7536258424352164e-06,
- "loss": 0.46329325437545776,
- "mean_token_accuracy": 0.8376060724258423,
- "num_tokens": 12886383.0,
- "step": 1447
- },
- {
- "epoch": 1.1003039513677813,
- "grad_norm": 2.402256488800049,
- "learning_rate": 3.75181336050301e-06,
- "loss": 0.43916207551956177,
- "mean_token_accuracy": 0.8448786735534668,
- "num_tokens": 12892613.0,
- "step": 1448
- },
- {
- "epoch": 1.101063829787234,
- "grad_norm": 1.3893651962280273,
- "learning_rate": 3.7500000000000005e-06,
- "loss": 0.3919021785259247,
- "mean_token_accuracy": 0.8495820760726929,
- "num_tokens": 12905523.0,
- "step": 1449
- },
- {
- "epoch": 1.1018237082066868,
- "grad_norm": 1.5519827604293823,
- "learning_rate": 3.7481857621988734e-06,
- "loss": 0.4710700809955597,
- "mean_token_accuracy": 0.8387632369995117,
- "num_tokens": 12918236.0,
- "step": 1450
- },
- {
- "epoch": 1.1025835866261398,
- "grad_norm": 2.0141353607177734,
- "learning_rate": 3.74637064837293e-06,
- "loss": 0.30866751074790955,
- "mean_token_accuracy": 0.9059321880340576,
- "num_tokens": 12924391.0,
- "step": 1451
- },
- {
- "epoch": 1.1033434650455927,
- "grad_norm": 1.2201496362686157,
- "learning_rate": 3.7445546597960882e-06,
- "loss": 0.3938257396221161,
- "mean_token_accuracy": 0.8726630210876465,
- "num_tokens": 12943338.0,
- "step": 1452
- },
- {
- "epoch": 1.1041033434650456,
- "grad_norm": 2.29434871673584,
- "learning_rate": 3.742737797742878e-06,
- "loss": 0.4347776174545288,
- "mean_token_accuracy": 0.840569257736206,
- "num_tokens": 12950636.0,
- "step": 1453
- },
- {
- "epoch": 1.1048632218844985,
- "grad_norm": 2.3875105381011963,
- "learning_rate": 3.7409200634884425e-06,
- "loss": 0.48353564739227295,
- "mean_token_accuracy": 0.8207056522369385,
- "num_tokens": 12957635.0,
- "step": 1454
- },
- {
- "epoch": 1.1056231003039514,
- "grad_norm": 2.3539648056030273,
- "learning_rate": 3.7391014583085384e-06,
- "loss": 0.3532431721687317,
- "mean_token_accuracy": 0.8903788924217224,
- "num_tokens": 12963032.0,
- "step": 1455
- },
- {
- "epoch": 1.1063829787234043,
- "grad_norm": 1.5611135959625244,
- "learning_rate": 3.737281983479534e-06,
- "loss": 0.4734863042831421,
- "mean_token_accuracy": 0.8413879871368408,
- "num_tokens": 12977170.0,
- "step": 1456
- },
- {
- "epoch": 1.1071428571428572,
- "grad_norm": 1.474320411682129,
- "learning_rate": 3.735461640278404e-06,
- "loss": 0.41854286193847656,
- "mean_token_accuracy": 0.8499876856803894,
- "num_tokens": 12993750.0,
- "step": 1457
- },
- {
- "epoch": 1.1079027355623101,
- "grad_norm": 2.6873273849487305,
- "learning_rate": 3.733640429982738e-06,
- "loss": 0.47637903690338135,
- "mean_token_accuracy": 0.83599853515625,
- "num_tokens": 12999058.0,
- "step": 1458
- },
- {
- "epoch": 1.108662613981763,
- "grad_norm": 1.4575026035308838,
- "learning_rate": 3.731818353870729e-06,
- "loss": 0.38441652059555054,
- "mean_token_accuracy": 0.8582364320755005,
- "num_tokens": 13013864.0,
- "step": 1459
- },
- {
- "epoch": 1.1094224924012157,
- "grad_norm": 1.7722690105438232,
- "learning_rate": 3.729995413221183e-06,
- "loss": 0.4224998950958252,
- "mean_token_accuracy": 0.8511888384819031,
- "num_tokens": 13023714.0,
- "step": 1460
- },
- {
- "epoch": 1.1101823708206686,
- "grad_norm": 2.625760555267334,
- "learning_rate": 3.7281716093135068e-06,
- "loss": 0.3487582802772522,
- "mean_token_accuracy": 0.8834779262542725,
- "num_tokens": 13028608.0,
- "step": 1461
- },
- {
- "epoch": 1.1109422492401215,
- "grad_norm": 1.2554056644439697,
- "learning_rate": 3.726346943427719e-06,
- "loss": 0.33312469720840454,
- "mean_token_accuracy": 0.8704153299331665,
- "num_tokens": 13044901.0,
- "step": 1462
- },
- {
- "epoch": 1.1117021276595744,
- "grad_norm": 2.1109910011291504,
- "learning_rate": 3.7245214168444388e-06,
- "loss": 0.387290894985199,
- "mean_token_accuracy": 0.860816240310669,
- "num_tokens": 13051452.0,
- "step": 1463
- },
- {
- "epoch": 1.1124620060790273,
- "grad_norm": 3.159201145172119,
- "learning_rate": 3.722695030844891e-06,
- "loss": 0.37690871953964233,
- "mean_token_accuracy": 0.8717561960220337,
- "num_tokens": 13055131.0,
- "step": 1464
- },
- {
- "epoch": 1.1132218844984803,
- "grad_norm": 1.3810011148452759,
- "learning_rate": 3.7208677867109042e-06,
- "loss": 0.36598485708236694,
- "mean_token_accuracy": 0.8683375120162964,
- "num_tokens": 13069798.0,
- "step": 1465
- },
- {
- "epoch": 1.1139817629179332,
- "grad_norm": 2.500849485397339,
- "learning_rate": 3.7190396857249087e-06,
- "loss": 0.2781746983528137,
- "mean_token_accuracy": 0.9026005268096924,
- "num_tokens": 13075127.0,
- "step": 1466
- },
- {
- "epoch": 1.114741641337386,
- "grad_norm": 1.7445712089538574,
- "learning_rate": 3.7172107291699356e-06,
- "loss": 0.5055314302444458,
- "mean_token_accuracy": 0.8252174258232117,
- "num_tokens": 13084843.0,
- "step": 1467
- },
- {
- "epoch": 1.115501519756839,
- "grad_norm": 1.6386256217956543,
- "learning_rate": 3.7153809183296174e-06,
- "loss": 0.38478314876556396,
- "mean_token_accuracy": 0.8600847721099854,
- "num_tokens": 13096517.0,
- "step": 1468
- },
- {
- "epoch": 1.1162613981762919,
- "grad_norm": 2.3818395137786865,
- "learning_rate": 3.713550254488185e-06,
- "loss": 0.40308547019958496,
- "mean_token_accuracy": 0.8628184795379639,
- "num_tokens": 13102324.0,
- "step": 1469
- },
- {
- "epoch": 1.1170212765957448,
- "grad_norm": 1.73163640499115,
- "learning_rate": 3.7117187389304703e-06,
- "loss": 0.5035421848297119,
- "mean_token_accuracy": 0.8229597210884094,
- "num_tokens": 13113763.0,
- "step": 1470
- },
- {
- "epoch": 1.1177811550151975,
- "grad_norm": 3.147177219390869,
- "learning_rate": 3.7098863729418997e-06,
- "loss": 0.557449221611023,
- "mean_token_accuracy": 0.8266849517822266,
- "num_tokens": 13118849.0,
- "step": 1471
- },
- {
- "epoch": 1.1185410334346504,
- "grad_norm": 1.5061391592025757,
- "learning_rate": 3.7080531578085e-06,
- "loss": 0.3759554922580719,
- "mean_token_accuracy": 0.8541903495788574,
- "num_tokens": 13131337.0,
- "step": 1472
- },
- {
- "epoch": 1.1193009118541033,
- "grad_norm": 2.172346353530884,
- "learning_rate": 3.7062190948168906e-06,
- "loss": 0.41491609811782837,
- "mean_token_accuracy": 0.8531454801559448,
- "num_tokens": 13139767.0,
- "step": 1473
- },
- {
- "epoch": 1.1200607902735562,
- "grad_norm": 2.1527154445648193,
- "learning_rate": 3.7043841852542884e-06,
- "loss": 0.4309239387512207,
- "mean_token_accuracy": 0.8327745199203491,
- "num_tokens": 13147210.0,
- "step": 1474
- },
- {
- "epoch": 1.1208206686930091,
- "grad_norm": 1.8342832326889038,
- "learning_rate": 3.7025484304085035e-06,
- "loss": 0.34393298625946045,
- "mean_token_accuracy": 0.8948153257369995,
- "num_tokens": 13154831.0,
- "step": 1475
- },
- {
- "epoch": 1.121580547112462,
- "grad_norm": 2.509291172027588,
- "learning_rate": 3.7007118315679384e-06,
- "loss": 0.4479471445083618,
- "mean_token_accuracy": 0.8280234336853027,
- "num_tokens": 13161040.0,
- "step": 1476
- },
- {
- "epoch": 1.122340425531915,
- "grad_norm": 2.914710521697998,
- "learning_rate": 3.6988743900215895e-06,
- "loss": 0.3724832832813263,
- "mean_token_accuracy": 0.863893985748291,
- "num_tokens": 13164975.0,
- "step": 1477
- },
- {
- "epoch": 1.1231003039513678,
- "grad_norm": 3.274808645248413,
- "learning_rate": 3.6970361070590443e-06,
- "loss": 0.4088161885738373,
- "mean_token_accuracy": 0.8474822044372559,
- "num_tokens": 13168826.0,
- "step": 1478
- },
- {
- "epoch": 1.1238601823708207,
- "grad_norm": 2.861546277999878,
- "learning_rate": 3.695196983970481e-06,
- "loss": 0.45837992429733276,
- "mean_token_accuracy": 0.8579759001731873,
- "num_tokens": 13173794.0,
- "step": 1479
- },
- {
- "epoch": 1.1246200607902737,
- "grad_norm": 1.9491597414016724,
- "learning_rate": 3.6933570220466654e-06,
- "loss": 0.4333910346031189,
- "mean_token_accuracy": 0.8444236516952515,
- "num_tokens": 13181598.0,
- "step": 1480
- },
- {
- "epoch": 1.1253799392097266,
- "grad_norm": 1.329848051071167,
- "learning_rate": 3.6915162225789546e-06,
- "loss": 0.36404621601104736,
- "mean_token_accuracy": 0.8694117069244385,
- "num_tokens": 13196381.0,
- "step": 1481
- },
- {
- "epoch": 1.1261398176291793,
- "grad_norm": 1.8854197263717651,
- "learning_rate": 3.6896745868592924e-06,
- "loss": 0.4085756838321686,
- "mean_token_accuracy": 0.855188250541687,
- "num_tokens": 13205236.0,
- "step": 1482
- },
- {
- "epoch": 1.1268996960486322,
- "grad_norm": 3.01684832572937,
- "learning_rate": 3.6878321161802106e-06,
- "loss": 0.28105655312538147,
- "mean_token_accuracy": 0.9009426236152649,
- "num_tokens": 13209380.0,
- "step": 1483
- },
- {
- "epoch": 1.127659574468085,
- "grad_norm": 1.8051308393478394,
- "learning_rate": 3.685988811834823e-06,
- "loss": 0.3314531147480011,
- "mean_token_accuracy": 0.8805814385414124,
- "num_tokens": 13217714.0,
- "step": 1484
- },
- {
- "epoch": 1.128419452887538,
- "grad_norm": 1.61757493019104,
- "learning_rate": 3.684144675116836e-06,
- "loss": 0.4543863534927368,
- "mean_token_accuracy": 0.8400536775588989,
- "num_tokens": 13229330.0,
- "step": 1485
- },
- {
- "epoch": 1.1291793313069909,
- "grad_norm": 1.602686882019043,
- "learning_rate": 3.682299707320532e-06,
- "loss": 0.3653204143047333,
- "mean_token_accuracy": 0.8655825853347778,
- "num_tokens": 13242872.0,
- "step": 1486
- },
- {
- "epoch": 1.1299392097264438,
- "grad_norm": 2.3093113899230957,
- "learning_rate": 3.680453909740782e-06,
- "loss": 0.4383693039417267,
- "mean_token_accuracy": 0.839782178401947,
- "num_tokens": 13248976.0,
- "step": 1487
- },
- {
- "epoch": 1.1306990881458967,
- "grad_norm": 1.180559754371643,
- "learning_rate": 3.6786072836730376e-06,
- "loss": 0.5354755520820618,
- "mean_token_accuracy": 0.8151205778121948,
- "num_tokens": 13272896.0,
- "step": 1488
- },
- {
- "epoch": 1.1314589665653496,
- "grad_norm": 1.9554040431976318,
- "learning_rate": 3.6767598304133325e-06,
- "loss": 0.4485316872596741,
- "mean_token_accuracy": 0.8399936556816101,
- "num_tokens": 13280757.0,
- "step": 1489
- },
- {
- "epoch": 1.1322188449848025,
- "grad_norm": 2.236471176147461,
- "learning_rate": 3.674911551258279e-06,
- "loss": 0.45594364404678345,
- "mean_token_accuracy": 0.8552400469779968,
- "num_tokens": 13287328.0,
- "step": 1490
- },
- {
- "epoch": 1.1329787234042552,
- "grad_norm": 2.5228686332702637,
- "learning_rate": 3.673062447505072e-06,
- "loss": 0.4048641622066498,
- "mean_token_accuracy": 0.8617376685142517,
- "num_tokens": 13292716.0,
- "step": 1491
- },
- {
- "epoch": 1.1337386018237081,
- "grad_norm": 1.1274473667144775,
- "learning_rate": 3.6712125204514836e-06,
- "loss": 0.3848876357078552,
- "mean_token_accuracy": 0.8672975301742554,
- "num_tokens": 13313403.0,
- "step": 1492
- },
- {
- "epoch": 1.134498480243161,
- "grad_norm": 2.349541425704956,
- "learning_rate": 3.6693617713958633e-06,
- "loss": 0.3166058361530304,
- "mean_token_accuracy": 0.8896721601486206,
- "num_tokens": 13318720.0,
- "step": 1493
- },
- {
- "epoch": 1.135258358662614,
- "grad_norm": 2.2438278198242188,
- "learning_rate": 3.6675102016371387e-06,
- "loss": 0.5418218970298767,
- "mean_token_accuracy": 0.8256527185440063,
- "num_tokens": 13325360.0,
- "step": 1494
- },
- {
- "epoch": 1.1360182370820668,
- "grad_norm": 2.21268892288208,
- "learning_rate": 3.665657812474812e-06,
- "loss": 0.48603951930999756,
- "mean_token_accuracy": 0.8273470401763916,
- "num_tokens": 13333217.0,
- "step": 1495
- },
- {
- "epoch": 1.1367781155015197,
- "grad_norm": 2.6105997562408447,
- "learning_rate": 3.6638046052089614e-06,
- "loss": 0.31221291422843933,
- "mean_token_accuracy": 0.888375997543335,
- "num_tokens": 13338413.0,
- "step": 1496
- },
- {
- "epoch": 1.1375379939209727,
- "grad_norm": 3.655658483505249,
- "learning_rate": 3.661950581140239e-06,
- "loss": 0.3609023988246918,
- "mean_token_accuracy": 0.8838576078414917,
- "num_tokens": 13341499.0,
- "step": 1497
- },
- {
- "epoch": 1.1382978723404256,
- "grad_norm": 2.242009162902832,
- "learning_rate": 3.660095741569871e-06,
- "loss": 0.40022802352905273,
- "mean_token_accuracy": 0.8559960722923279,
- "num_tokens": 13347917.0,
- "step": 1498
- },
- {
- "epoch": 1.1390577507598785,
- "grad_norm": 1.7958979606628418,
- "learning_rate": 3.658240087799655e-06,
- "loss": 0.499157190322876,
- "mean_token_accuracy": 0.8423802256584167,
- "num_tokens": 13361570.0,
- "step": 1499
- },
- {
- "epoch": 1.1398176291793314,
- "grad_norm": 2.5406908988952637,
- "learning_rate": 3.6563836211319593e-06,
- "loss": 0.4090137481689453,
- "mean_token_accuracy": 0.8769663572311401,
- "num_tokens": 13367183.0,
- "step": 1500
- },
- {
- "epoch": 1.1405775075987843,
- "grad_norm": 1.9861716032028198,
- "learning_rate": 3.654526342869724e-06,
- "loss": 0.5125207304954529,
- "mean_token_accuracy": 0.8315266370773315,
- "num_tokens": 13376767.0,
- "step": 1501
- },
- {
- "epoch": 1.141337386018237,
- "grad_norm": 1.731188178062439,
- "learning_rate": 3.65266825431646e-06,
- "loss": 0.39452576637268066,
- "mean_token_accuracy": 0.8585706353187561,
- "num_tokens": 13388437.0,
- "step": 1502
- },
- {
- "epoch": 1.1420972644376899,
- "grad_norm": 1.5203773975372314,
- "learning_rate": 3.6508093567762425e-06,
- "loss": 0.39466819167137146,
- "mean_token_accuracy": 0.8584027886390686,
- "num_tokens": 13399727.0,
- "step": 1503
- },
- {
- "epoch": 1.1428571428571428,
- "grad_norm": 2.606462001800537,
- "learning_rate": 3.6489496515537204e-06,
- "loss": 0.4521079361438751,
- "mean_token_accuracy": 0.8413360118865967,
- "num_tokens": 13408426.0,
- "step": 1504
- },
- {
- "epoch": 1.1436170212765957,
- "grad_norm": 2.6207993030548096,
- "learning_rate": 3.647089139954104e-06,
- "loss": 0.4709353446960449,
- "mean_token_accuracy": 0.8397113084793091,
- "num_tokens": 13413506.0,
- "step": 1505
- },
- {
- "epoch": 1.1443768996960486,
- "grad_norm": 1.7214165925979614,
- "learning_rate": 3.6452278232831734e-06,
- "loss": 0.45506367087364197,
- "mean_token_accuracy": 0.8466023206710815,
- "num_tokens": 13424592.0,
- "step": 1506
- },
- {
- "epoch": 1.1451367781155015,
- "grad_norm": 1.7111759185791016,
- "learning_rate": 3.643365702847272e-06,
- "loss": 0.5016278624534607,
- "mean_token_accuracy": 0.8196234703063965,
- "num_tokens": 13434421.0,
- "step": 1507
- },
- {
- "epoch": 1.1458966565349544,
- "grad_norm": 1.7528148889541626,
- "learning_rate": 3.641502779953307e-06,
- "loss": 0.5020896196365356,
- "mean_token_accuracy": 0.826249361038208,
- "num_tokens": 13445286.0,
- "step": 1508
- },
- {
- "epoch": 1.1466565349544073,
- "grad_norm": 1.3470909595489502,
- "learning_rate": 3.639639055908751e-06,
- "loss": 0.45765724778175354,
- "mean_token_accuracy": 0.8380560278892517,
- "num_tokens": 13465030.0,
- "step": 1509
- },
- {
- "epoch": 1.1474164133738602,
- "grad_norm": 2.4846835136413574,
- "learning_rate": 3.6377745320216346e-06,
- "loss": 0.46488267183303833,
- "mean_token_accuracy": 0.8393925428390503,
- "num_tokens": 13470883.0,
- "step": 1510
- },
- {
- "epoch": 1.1481762917933132,
- "grad_norm": 1.770201563835144,
- "learning_rate": 3.635909209600555e-06,
- "loss": 0.5262179374694824,
- "mean_token_accuracy": 0.8201162815093994,
- "num_tokens": 13482558.0,
- "step": 1511
- },
- {
- "epoch": 1.148936170212766,
- "grad_norm": 1.5955098867416382,
- "learning_rate": 3.6340430899546656e-06,
- "loss": 0.430621862411499,
- "mean_token_accuracy": 0.8488553762435913,
- "num_tokens": 13493003.0,
- "step": 1512
- },
- {
- "epoch": 1.1496960486322187,
- "grad_norm": 2.846176862716675,
- "learning_rate": 3.632176174393682e-06,
- "loss": 0.23461638391017914,
- "mean_token_accuracy": 0.9218817353248596,
- "num_tokens": 13496566.0,
- "step": 1513
- },
- {
- "epoch": 1.1504559270516717,
- "grad_norm": 1.9606610536575317,
- "learning_rate": 3.630308464227877e-06,
- "loss": 0.4940161108970642,
- "mean_token_accuracy": 0.8474864959716797,
- "num_tokens": 13504843.0,
- "step": 1514
- },
- {
- "epoch": 1.1512158054711246,
- "grad_norm": 1.1588608026504517,
- "learning_rate": 3.628439960768082e-06,
- "loss": 0.32650992274284363,
- "mean_token_accuracy": 0.8797246217727661,
- "num_tokens": 13521513.0,
- "step": 1515
- },
- {
- "epoch": 1.1519756838905775,
- "grad_norm": 1.3566495180130005,
- "learning_rate": 3.6265706653256837e-06,
- "loss": 0.4359064996242523,
- "mean_token_accuracy": 0.8379859328269958,
- "num_tokens": 13540608.0,
- "step": 1516
- },
- {
- "epoch": 1.1527355623100304,
- "grad_norm": 1.4728609323501587,
- "learning_rate": 3.624700579212626e-06,
- "loss": 0.29939693212509155,
- "mean_token_accuracy": 0.8831408023834229,
- "num_tokens": 13550641.0,
- "step": 1517
- },
- {
- "epoch": 1.1534954407294833,
- "grad_norm": 2.162325382232666,
- "learning_rate": 3.6228297037414077e-06,
- "loss": 0.4097636938095093,
- "mean_token_accuracy": 0.8575425148010254,
- "num_tokens": 13556931.0,
- "step": 1518
- },
- {
- "epoch": 1.1542553191489362,
- "grad_norm": 1.754439353942871,
- "learning_rate": 3.6209580402250816e-06,
- "loss": 0.400202214717865,
- "mean_token_accuracy": 0.8569821119308472,
- "num_tokens": 13565491.0,
- "step": 1519
- },
- {
- "epoch": 1.155015197568389,
- "grad_norm": 1.5250083208084106,
- "learning_rate": 3.619085589977251e-06,
- "loss": 0.43330419063568115,
- "mean_token_accuracy": 0.8492985963821411,
- "num_tokens": 13577147.0,
- "step": 1520
- },
- {
- "epoch": 1.155775075987842,
- "grad_norm": 1.9108905792236328,
- "learning_rate": 3.617212354312076e-06,
- "loss": 0.30567464232444763,
- "mean_token_accuracy": 0.8850164413452148,
- "num_tokens": 13584366.0,
- "step": 1521
- },
- {
- "epoch": 1.156534954407295,
- "grad_norm": 2.2574243545532227,
- "learning_rate": 3.615338334544265e-06,
- "loss": 0.4391738772392273,
- "mean_token_accuracy": 0.839765727519989,
- "num_tokens": 13591816.0,
- "step": 1522
- },
- {
- "epoch": 1.1572948328267478,
- "grad_norm": 2.1235218048095703,
- "learning_rate": 3.6134635319890763e-06,
- "loss": 0.45043107867240906,
- "mean_token_accuracy": 0.8385299444198608,
- "num_tokens": 13599736.0,
- "step": 1523
- },
- {
- "epoch": 1.1580547112462005,
- "grad_norm": 2.2274110317230225,
- "learning_rate": 3.611587947962319e-06,
- "loss": 0.3623226284980774,
- "mean_token_accuracy": 0.8724044561386108,
- "num_tokens": 13605354.0,
- "step": 1524
- },
- {
- "epoch": 1.1588145896656534,
- "grad_norm": 3.414236545562744,
- "learning_rate": 3.6097115837803504e-06,
- "loss": 0.30060696601867676,
- "mean_token_accuracy": 0.8971061706542969,
- "num_tokens": 13608851.0,
- "step": 1525
- },
- {
- "epoch": 1.1595744680851063,
- "grad_norm": 2.496264696121216,
- "learning_rate": 3.6078344407600744e-06,
- "loss": 0.3567180037498474,
- "mean_token_accuracy": 0.8596180081367493,
- "num_tokens": 13614339.0,
- "step": 1526
- },
- {
- "epoch": 1.1603343465045592,
- "grad_norm": 2.0191843509674072,
- "learning_rate": 3.6059565202189433e-06,
- "loss": 0.43206095695495605,
- "mean_token_accuracy": 0.8464000821113586,
- "num_tokens": 13622395.0,
- "step": 1527
- },
- {
- "epoch": 1.1610942249240122,
- "grad_norm": 1.5475906133651733,
- "learning_rate": 3.604077823474954e-06,
- "loss": 0.4535648226737976,
- "mean_token_accuracy": 0.8391586542129517,
- "num_tokens": 13635356.0,
- "step": 1528
- },
- {
- "epoch": 1.161854103343465,
- "grad_norm": 2.1348211765289307,
- "learning_rate": 3.6021983518466468e-06,
- "loss": 0.2733963429927826,
- "mean_token_accuracy": 0.9007417559623718,
- "num_tokens": 13640641.0,
- "step": 1529
- },
- {
- "epoch": 1.162613981762918,
- "grad_norm": 2.8452792167663574,
- "learning_rate": 3.600318106653108e-06,
- "loss": 0.29591235518455505,
- "mean_token_accuracy": 0.8934413194656372,
- "num_tokens": 13644995.0,
- "step": 1530
- },
- {
- "epoch": 1.1633738601823709,
- "grad_norm": 2.342907190322876,
- "learning_rate": 3.5984370892139663e-06,
- "loss": 0.4675130248069763,
- "mean_token_accuracy": 0.8352028131484985,
- "num_tokens": 13652695.0,
- "step": 1531
- },
- {
- "epoch": 1.1641337386018238,
- "grad_norm": 2.3480238914489746,
- "learning_rate": 3.5965553008493924e-06,
- "loss": 0.3114515542984009,
- "mean_token_accuracy": 0.8845353126525879,
- "num_tokens": 13658101.0,
- "step": 1532
- },
- {
- "epoch": 1.1648936170212765,
- "grad_norm": 1.8608155250549316,
- "learning_rate": 3.594672742880097e-06,
- "loss": 0.3864145278930664,
- "mean_token_accuracy": 0.867354154586792,
- "num_tokens": 13666042.0,
- "step": 1533
- },
- {
- "epoch": 1.1656534954407296,
- "grad_norm": 1.4756088256835938,
- "learning_rate": 3.5927894166273324e-06,
- "loss": 0.3671600818634033,
- "mean_token_accuracy": 0.8695988655090332,
- "num_tokens": 13678253.0,
- "step": 1534
- },
- {
- "epoch": 1.1664133738601823,
- "grad_norm": 2.8831355571746826,
- "learning_rate": 3.5909053234128893e-06,
- "loss": 0.267184317111969,
- "mean_token_accuracy": 0.9008115530014038,
- "num_tokens": 13681790.0,
- "step": 1535
- },
- {
- "epoch": 1.1671732522796352,
- "grad_norm": 2.1984763145446777,
- "learning_rate": 3.5890204645590964e-06,
- "loss": 0.4431505799293518,
- "mean_token_accuracy": 0.8623673915863037,
- "num_tokens": 13688444.0,
- "step": 1536
- },
- {
- "epoch": 1.167933130699088,
- "grad_norm": 1.8271523714065552,
- "learning_rate": 3.5871348413888207e-06,
- "loss": 0.3861040771007538,
- "mean_token_accuracy": 0.8624277114868164,
- "num_tokens": 13696872.0,
- "step": 1537
- },
- {
- "epoch": 1.168693009118541,
- "grad_norm": 1.6313756704330444,
- "learning_rate": 3.585248455225466e-06,
- "loss": 0.3775154948234558,
- "mean_token_accuracy": 0.8624461889266968,
- "num_tokens": 13706167.0,
- "step": 1538
- },
- {
- "epoch": 1.169452887537994,
- "grad_norm": 2.4377901554107666,
- "learning_rate": 3.5833613073929684e-06,
- "loss": 0.2308957427740097,
- "mean_token_accuracy": 0.920600175857544,
- "num_tokens": 13710367.0,
- "step": 1539
- },
- {
- "epoch": 1.1702127659574468,
- "grad_norm": 2.2621750831604004,
- "learning_rate": 3.5814733992158025e-06,
- "loss": 0.33167219161987305,
- "mean_token_accuracy": 0.8963261842727661,
- "num_tokens": 13716384.0,
- "step": 1540
- },
- {
- "epoch": 1.1709726443768997,
- "grad_norm": 1.3178150653839111,
- "learning_rate": 3.579584732018975e-06,
- "loss": 0.3276631832122803,
- "mean_token_accuracy": 0.8853521347045898,
- "num_tokens": 13731031.0,
- "step": 1541
- },
- {
- "epoch": 1.1717325227963526,
- "grad_norm": 2.177750587463379,
- "learning_rate": 3.577695307128024e-06,
- "loss": 0.48177266120910645,
- "mean_token_accuracy": 0.830329418182373,
- "num_tokens": 13737925.0,
- "step": 1542
- },
- {
- "epoch": 1.1724924012158056,
- "grad_norm": 2.2268829345703125,
- "learning_rate": 3.5758051258690223e-06,
- "loss": 0.48843517899513245,
- "mean_token_accuracy": 0.8310644030570984,
- "num_tokens": 13746039.0,
- "step": 1543
- },
- {
- "epoch": 1.1732522796352582,
- "grad_norm": 1.498701572418213,
- "learning_rate": 3.5739141895685708e-06,
- "loss": 0.4542962312698364,
- "mean_token_accuracy": 0.8500330448150635,
- "num_tokens": 13765002.0,
- "step": 1544
- },
- {
- "epoch": 1.1740121580547112,
- "grad_norm": 1.786670446395874,
- "learning_rate": 3.5720224995538023e-06,
- "loss": 0.27367928624153137,
- "mean_token_accuracy": 0.8916142582893372,
- "num_tokens": 13774113.0,
- "step": 1545
- },
- {
- "epoch": 1.174772036474164,
- "grad_norm": 2.0311272144317627,
- "learning_rate": 3.5701300571523757e-06,
- "loss": 0.559987485408783,
- "mean_token_accuracy": 0.8266973495483398,
- "num_tokens": 13783912.0,
- "step": 1546
- },
- {
- "epoch": 1.175531914893617,
- "grad_norm": 1.8732186555862427,
- "learning_rate": 3.5682368636924825e-06,
- "loss": 0.5184751152992249,
- "mean_token_accuracy": 0.8450918197631836,
- "num_tokens": 13792728.0,
- "step": 1547
- },
- {
- "epoch": 1.1762917933130699,
- "grad_norm": 1.4410661458969116,
- "learning_rate": 3.566342920502837e-06,
- "loss": 0.383536696434021,
- "mean_token_accuracy": 0.8672217726707458,
- "num_tokens": 13813590.0,
- "step": 1548
- },
- {
- "epoch": 1.1770516717325228,
- "grad_norm": 3.06056547164917,
- "learning_rate": 3.564448228912682e-06,
- "loss": 0.3941686153411865,
- "mean_token_accuracy": 0.8696402311325073,
- "num_tokens": 13817704.0,
- "step": 1549
- },
- {
- "epoch": 1.1778115501519757,
- "grad_norm": 1.6150329113006592,
- "learning_rate": 3.562552790251785e-06,
- "loss": 0.41606605052948,
- "mean_token_accuracy": 0.8488572835922241,
- "num_tokens": 13831303.0,
- "step": 1550
- },
- {
- "epoch": 1.1785714285714286,
- "grad_norm": 2.1199934482574463,
- "learning_rate": 3.5606566058504377e-06,
- "loss": 0.3974752426147461,
- "mean_token_accuracy": 0.8686345219612122,
- "num_tokens": 13837613.0,
- "step": 1551
- },
- {
- "epoch": 1.1793313069908815,
- "grad_norm": 1.5683876276016235,
- "learning_rate": 3.558759677039455e-06,
- "loss": 0.35225993394851685,
- "mean_token_accuracy": 0.8710784316062927,
- "num_tokens": 13846779.0,
- "step": 1552
- },
- {
- "epoch": 1.1800911854103344,
- "grad_norm": 1.4644675254821777,
- "learning_rate": 3.5568620051501755e-06,
- "loss": 0.38400042057037354,
- "mean_token_accuracy": 0.8548328876495361,
- "num_tokens": 13860713.0,
- "step": 1553
- },
- {
- "epoch": 1.1808510638297873,
- "grad_norm": 1.461491346359253,
- "learning_rate": 3.5549635915144578e-06,
- "loss": 0.4572640061378479,
- "mean_token_accuracy": 0.8506045937538147,
- "num_tokens": 13877289.0,
- "step": 1554
- },
- {
- "epoch": 1.18161094224924,
- "grad_norm": 2.6364715099334717,
- "learning_rate": 3.553064437464682e-06,
- "loss": 0.3954341411590576,
- "mean_token_accuracy": 0.8561649322509766,
- "num_tokens": 13882064.0,
- "step": 1555
- },
- {
- "epoch": 1.182370820668693,
- "grad_norm": 2.027273654937744,
- "learning_rate": 3.551164544333745e-06,
- "loss": 0.47625732421875,
- "mean_token_accuracy": 0.8349384069442749,
- "num_tokens": 13890306.0,
- "step": 1556
- },
- {
- "epoch": 1.1831306990881458,
- "grad_norm": 2.8427743911743164,
- "learning_rate": 3.549263913455069e-06,
- "loss": 0.4273033142089844,
- "mean_token_accuracy": 0.8541387319564819,
- "num_tokens": 13894882.0,
- "step": 1557
- },
- {
- "epoch": 1.1838905775075987,
- "grad_norm": 1.6298975944519043,
- "learning_rate": 3.5473625461625884e-06,
- "loss": 0.4378639757633209,
- "mean_token_accuracy": 0.8634963631629944,
- "num_tokens": 13906152.0,
- "step": 1558
- },
- {
- "epoch": 1.1846504559270516,
- "grad_norm": 2.4098947048187256,
- "learning_rate": 3.5454604437907535e-06,
- "loss": 0.47236716747283936,
- "mean_token_accuracy": 0.8646864891052246,
- "num_tokens": 13911803.0,
- "step": 1559
- },
- {
- "epoch": 1.1854103343465046,
- "grad_norm": 1.5972497463226318,
- "learning_rate": 3.543557607674537e-06,
- "loss": 0.3001407980918884,
- "mean_token_accuracy": 0.8927055597305298,
- "num_tokens": 13921304.0,
- "step": 1560
- },
- {
- "epoch": 1.1861702127659575,
- "grad_norm": 2.1140005588531494,
- "learning_rate": 3.54165403914942e-06,
- "loss": 0.41898271441459656,
- "mean_token_accuracy": 0.8542245626449585,
- "num_tokens": 13929434.0,
- "step": 1561
- },
- {
- "epoch": 1.1869300911854104,
- "grad_norm": 1.8733803033828735,
- "learning_rate": 3.539749739551401e-06,
- "loss": 0.35469961166381836,
- "mean_token_accuracy": 0.8805290460586548,
- "num_tokens": 13937781.0,
- "step": 1562
- },
- {
- "epoch": 1.1876899696048633,
- "grad_norm": 2.2805802822113037,
- "learning_rate": 3.53784471021699e-06,
- "loss": 0.44496792554855347,
- "mean_token_accuracy": 0.8454172611236572,
- "num_tokens": 13944394.0,
- "step": 1563
- },
- {
- "epoch": 1.1884498480243162,
- "grad_norm": 0.9728449583053589,
- "learning_rate": 3.535938952483211e-06,
- "loss": 0.3156968355178833,
- "mean_token_accuracy": 0.8739837408065796,
- "num_tokens": 13966712.0,
- "step": 1564
- },
- {
- "epoch": 1.189209726443769,
- "grad_norm": 3.025338888168335,
- "learning_rate": 3.534032467687597e-06,
- "loss": 0.30036938190460205,
- "mean_token_accuracy": 0.9058252573013306,
- "num_tokens": 13970183.0,
- "step": 1565
- },
- {
- "epoch": 1.1899696048632218,
- "grad_norm": 2.0659425258636475,
- "learning_rate": 3.532125257168193e-06,
- "loss": 0.30619731545448303,
- "mean_token_accuracy": 0.9041587710380554,
- "num_tokens": 13976657.0,
- "step": 1566
- },
- {
- "epoch": 1.1907294832826747,
- "grad_norm": 3.2036776542663574,
- "learning_rate": 3.5302173222635526e-06,
- "loss": 0.4145944118499756,
- "mean_token_accuracy": 0.8502328395843506,
- "num_tokens": 13981198.0,
- "step": 1567
- },
- {
- "epoch": 1.1914893617021276,
- "grad_norm": 1.7767539024353027,
- "learning_rate": 3.5283086643127396e-06,
- "loss": 0.437128484249115,
- "mean_token_accuracy": 0.8965631723403931,
- "num_tokens": 13990259.0,
- "step": 1568
- },
- {
- "epoch": 1.1922492401215805,
- "grad_norm": 1.7777384519577026,
- "learning_rate": 3.5263992846553203e-06,
- "loss": 0.33831220865249634,
- "mean_token_accuracy": 0.8734279870986938,
- "num_tokens": 13999363.0,
- "step": 1569
- },
- {
- "epoch": 1.1930091185410334,
- "grad_norm": 1.6710708141326904,
- "learning_rate": 3.5244891846313733e-06,
- "loss": 0.4005590081214905,
- "mean_token_accuracy": 0.8820298314094543,
- "num_tokens": 14008719.0,
- "step": 1570
- },
- {
- "epoch": 1.1937689969604863,
- "grad_norm": 1.0378777980804443,
- "learning_rate": 3.5225783655814798e-06,
- "loss": 0.3174915313720703,
- "mean_token_accuracy": 0.8894162774085999,
- "num_tokens": 14025806.0,
- "step": 1571
- },
- {
- "epoch": 1.1945288753799392,
- "grad_norm": 1.2647521495819092,
- "learning_rate": 3.520666828846726e-06,
- "loss": 0.4173050820827484,
- "mean_token_accuracy": 0.8437265157699585,
- "num_tokens": 14046445.0,
- "step": 1572
- },
- {
- "epoch": 1.1952887537993921,
- "grad_norm": 2.8625528812408447,
- "learning_rate": 3.518754575768702e-06,
- "loss": 0.37182557582855225,
- "mean_token_accuracy": 0.8660947680473328,
- "num_tokens": 14051197.0,
- "step": 1573
- },
- {
- "epoch": 1.196048632218845,
- "grad_norm": 1.1213171482086182,
- "learning_rate": 3.516841607689501e-06,
- "loss": 0.332731157541275,
- "mean_token_accuracy": 0.8573278784751892,
- "num_tokens": 14070817.0,
- "step": 1574
- },
- {
- "epoch": 1.196808510638298,
- "grad_norm": 1.197508692741394,
- "learning_rate": 3.5149279259517165e-06,
- "loss": 0.34058472514152527,
- "mean_token_accuracy": 0.8603571653366089,
- "num_tokens": 14085301.0,
- "step": 1575
- },
- {
- "epoch": 1.1975683890577509,
- "grad_norm": 4.019949913024902,
- "learning_rate": 3.5130135318984454e-06,
- "loss": 0.3094622492790222,
- "mean_token_accuracy": 0.8905094861984253,
- "num_tokens": 14088107.0,
- "step": 1576
- },
- {
- "epoch": 1.1983282674772036,
- "grad_norm": 2.591181755065918,
- "learning_rate": 3.5110984268732827e-06,
- "loss": 0.3407078981399536,
- "mean_token_accuracy": 0.880385160446167,
- "num_tokens": 14092887.0,
- "step": 1577
- },
- {
- "epoch": 1.1990881458966565,
- "grad_norm": 1.3069331645965576,
- "learning_rate": 3.509182612220322e-06,
- "loss": 0.3761988878250122,
- "mean_token_accuracy": 0.862013041973114,
- "num_tokens": 14109216.0,
- "step": 1578
- },
- {
- "epoch": 1.1998480243161094,
- "grad_norm": 1.7802022695541382,
- "learning_rate": 3.507266089284157e-06,
- "loss": 0.3824652135372162,
- "mean_token_accuracy": 0.8707721829414368,
- "num_tokens": 14119645.0,
- "step": 1579
- },
- {
- "epoch": 1.2006079027355623,
- "grad_norm": 2.7937185764312744,
- "learning_rate": 3.5053488594098763e-06,
- "loss": 0.33828890323638916,
- "mean_token_accuracy": 0.8765541315078735,
- "num_tokens": 14124628.0,
- "step": 1580
- },
- {
- "epoch": 1.2013677811550152,
- "grad_norm": 1.892671823501587,
- "learning_rate": 3.5034309239430664e-06,
- "loss": 0.3476094603538513,
- "mean_token_accuracy": 0.9053795337677002,
- "num_tokens": 14131756.0,
- "step": 1581
- },
- {
- "epoch": 1.202127659574468,
- "grad_norm": 1.6857695579528809,
- "learning_rate": 3.501512284229807e-06,
- "loss": 0.5397108793258667,
- "mean_token_accuracy": 0.8173421025276184,
- "num_tokens": 14143024.0,
- "step": 1582
- },
- {
- "epoch": 1.202887537993921,
- "grad_norm": 2.501737117767334,
- "learning_rate": 3.4995929416166756e-06,
- "loss": 0.4192458391189575,
- "mean_token_accuracy": 0.8558136224746704,
- "num_tokens": 14149499.0,
- "step": 1583
- },
- {
- "epoch": 1.203647416413374,
- "grad_norm": 2.0133907794952393,
- "learning_rate": 3.4976728974507387e-06,
- "loss": 0.4791576564311981,
- "mean_token_accuracy": 0.8253597021102905,
- "num_tokens": 14158381.0,
- "step": 1584
- },
- {
- "epoch": 1.2044072948328268,
- "grad_norm": 2.984611988067627,
- "learning_rate": 3.4957521530795576e-06,
- "loss": 0.3040750026702881,
- "mean_token_accuracy": 0.8902391791343689,
- "num_tokens": 14162419.0,
- "step": 1585
- },
- {
- "epoch": 1.2051671732522795,
- "grad_norm": 1.518591284751892,
- "learning_rate": 3.493830709851185e-06,
- "loss": 0.35539618134498596,
- "mean_token_accuracy": 0.8737183809280396,
- "num_tokens": 14173048.0,
- "step": 1586
- },
- {
- "epoch": 1.2059270516717326,
- "grad_norm": 2.628758192062378,
- "learning_rate": 3.4919085691141636e-06,
- "loss": 0.33340200781822205,
- "mean_token_accuracy": 0.8705098628997803,
- "num_tokens": 14178255.0,
- "step": 1587
- },
- {
- "epoch": 1.2066869300911853,
- "grad_norm": 2.5565974712371826,
- "learning_rate": 3.4899857322175252e-06,
- "loss": 0.44939476251602173,
- "mean_token_accuracy": 0.8315504193305969,
- "num_tokens": 14183808.0,
- "step": 1588
- },
- {
- "epoch": 1.2074468085106382,
- "grad_norm": 1.7521045207977295,
- "learning_rate": 3.4880622005107916e-06,
- "loss": 0.3168621063232422,
- "mean_token_accuracy": 0.8824669122695923,
- "num_tokens": 14192186.0,
- "step": 1589
- },
- {
- "epoch": 1.2082066869300911,
- "grad_norm": 1.9816104173660278,
- "learning_rate": 3.486137975343971e-06,
- "loss": 0.3892582058906555,
- "mean_token_accuracy": 0.8524188995361328,
- "num_tokens": 14200512.0,
- "step": 1590
- },
- {
- "epoch": 1.208966565349544,
- "grad_norm": 1.459800124168396,
- "learning_rate": 3.484213058067559e-06,
- "loss": 0.45930033922195435,
- "mean_token_accuracy": 0.8408471345901489,
- "num_tokens": 14215232.0,
- "step": 1591
- },
- {
- "epoch": 1.209726443768997,
- "grad_norm": 2.015493154525757,
- "learning_rate": 3.482287450032536e-06,
- "loss": 0.5514016151428223,
- "mean_token_accuracy": 0.8456779718399048,
- "num_tokens": 14225402.0,
- "step": 1592
- },
- {
- "epoch": 1.2104863221884499,
- "grad_norm": 3.4511911869049072,
- "learning_rate": 3.4803611525903687e-06,
- "loss": 0.4772771894931793,
- "mean_token_accuracy": 0.8558698892593384,
- "num_tokens": 14229038.0,
- "step": 1593
- },
- {
- "epoch": 1.2112462006079028,
- "grad_norm": 2.2247982025146484,
- "learning_rate": 3.4784341670930067e-06,
- "loss": 0.4042825996875763,
- "mean_token_accuracy": 0.8635870218276978,
- "num_tokens": 14237057.0,
- "step": 1594
- },
- {
- "epoch": 1.2120060790273557,
- "grad_norm": 2.0534820556640625,
- "learning_rate": 3.4765064948928813e-06,
- "loss": 0.34057414531707764,
- "mean_token_accuracy": 0.8800770044326782,
- "num_tokens": 14243013.0,
- "step": 1595
- },
- {
- "epoch": 1.2127659574468086,
- "grad_norm": 2.594703197479248,
- "learning_rate": 3.474578137342909e-06,
- "loss": 0.4997410774230957,
- "mean_token_accuracy": 0.8302106261253357,
- "num_tokens": 14251210.0,
- "step": 1596
- },
- {
- "epoch": 1.2135258358662613,
- "grad_norm": 2.517833948135376,
- "learning_rate": 3.4726490957964836e-06,
- "loss": 0.3630390465259552,
- "mean_token_accuracy": 0.8679884672164917,
- "num_tokens": 14255893.0,
- "step": 1597
- },
- {
- "epoch": 1.2142857142857142,
- "grad_norm": 1.5177065134048462,
- "learning_rate": 3.4707193716074816e-06,
- "loss": 0.36218544840812683,
- "mean_token_accuracy": 0.879178524017334,
- "num_tokens": 14268143.0,
- "step": 1598
- },
- {
- "epoch": 1.215045592705167,
- "grad_norm": 2.215291738510132,
- "learning_rate": 3.4687889661302577e-06,
- "loss": 0.4166645407676697,
- "mean_token_accuracy": 0.8495793342590332,
- "num_tokens": 14276794.0,
- "step": 1599
- },
- {
- "epoch": 1.21580547112462,
- "grad_norm": 1.534294843673706,
- "learning_rate": 3.466857880719645e-06,
- "loss": 0.2635883092880249,
- "mean_token_accuracy": 0.8971712589263916,
- "num_tokens": 14287000.0,
- "step": 1600
- },
- {
- "epoch": 1.216565349544073,
- "grad_norm": 1.2338658571243286,
- "learning_rate": 3.464926116730953e-06,
- "loss": 0.339110404253006,
- "mean_token_accuracy": 0.895592987537384,
- "num_tokens": 14303217.0,
- "step": 1601
- },
- {
- "epoch": 1.2173252279635258,
- "grad_norm": 1.8717178106307983,
- "learning_rate": 3.462993675519968e-06,
- "loss": 0.41204726696014404,
- "mean_token_accuracy": 0.8560728430747986,
- "num_tokens": 14311372.0,
- "step": 1602
- },
- {
- "epoch": 1.2180851063829787,
- "grad_norm": 2.844160795211792,
- "learning_rate": 3.4610605584429526e-06,
- "loss": 0.4129520058631897,
- "mean_token_accuracy": 0.8555002212524414,
- "num_tokens": 14316244.0,
- "step": 1603
- },
- {
- "epoch": 1.2188449848024316,
- "grad_norm": 1.099926471710205,
- "learning_rate": 3.4591267668566412e-06,
- "loss": 0.35783132910728455,
- "mean_token_accuracy": 0.8693175315856934,
- "num_tokens": 14338414.0,
- "step": 1604
- },
- {
- "epoch": 1.2196048632218845,
- "grad_norm": 1.6448384523391724,
- "learning_rate": 3.457192302118244e-06,
- "loss": 0.42060258984565735,
- "mean_token_accuracy": 0.8557323217391968,
- "num_tokens": 14349143.0,
- "step": 1605
- },
- {
- "epoch": 1.2203647416413375,
- "grad_norm": 2.097529888153076,
- "learning_rate": 3.455257165585444e-06,
- "loss": 0.5227499008178711,
- "mean_token_accuracy": 0.828961968421936,
- "num_tokens": 14360032.0,
- "step": 1606
- },
- {
- "epoch": 1.2211246200607904,
- "grad_norm": 1.602988600730896,
- "learning_rate": 3.453321358616393e-06,
- "loss": 0.3537187874317169,
- "mean_token_accuracy": 0.8776708841323853,
- "num_tokens": 14370005.0,
- "step": 1607
- },
- {
- "epoch": 1.221884498480243,
- "grad_norm": 2.358971357345581,
- "learning_rate": 3.4513848825697145e-06,
- "loss": 0.3448919653892517,
- "mean_token_accuracy": 0.8887944221496582,
- "num_tokens": 14375718.0,
- "step": 1608
- },
- {
- "epoch": 1.222644376899696,
- "grad_norm": 1.72306227684021,
- "learning_rate": 3.4494477388045035e-06,
- "loss": 0.36985084414482117,
- "mean_token_accuracy": 0.859595537185669,
- "num_tokens": 14385016.0,
- "step": 1609
- },
- {
- "epoch": 1.2234042553191489,
- "grad_norm": 1.5494085550308228,
- "learning_rate": 3.4475099286803204e-06,
- "loss": 0.49003708362579346,
- "mean_token_accuracy": 0.8701964616775513,
- "num_tokens": 14399277.0,
- "step": 1610
- },
- {
- "epoch": 1.2241641337386018,
- "grad_norm": 2.6874046325683594,
- "learning_rate": 3.445571453557196e-06,
- "loss": 0.3424490690231323,
- "mean_token_accuracy": 0.8835943937301636,
- "num_tokens": 14404182.0,
- "step": 1611
- },
- {
- "epoch": 1.2249240121580547,
- "grad_norm": 2.2163190841674805,
- "learning_rate": 3.443632314795627e-06,
- "loss": 0.40944457054138184,
- "mean_token_accuracy": 0.8649888038635254,
- "num_tokens": 14410158.0,
- "step": 1612
- },
- {
- "epoch": 1.2256838905775076,
- "grad_norm": 2.7961158752441406,
- "learning_rate": 3.4416925137565756e-06,
- "loss": 0.17890746891498566,
- "mean_token_accuracy": 0.9439430832862854,
- "num_tokens": 14413285.0,
- "step": 1613
- },
- {
- "epoch": 1.2264437689969605,
- "grad_norm": 1.421451210975647,
- "learning_rate": 3.439752051801467e-06,
- "loss": 0.33948683738708496,
- "mean_token_accuracy": 0.8754585981369019,
- "num_tokens": 14424674.0,
- "step": 1614
- },
- {
- "epoch": 1.2272036474164134,
- "grad_norm": 2.105196237564087,
- "learning_rate": 3.4378109302921946e-06,
- "loss": 0.40009379386901855,
- "mean_token_accuracy": 0.8600341081619263,
- "num_tokens": 14432400.0,
- "step": 1615
- },
- {
- "epoch": 1.2279635258358663,
- "grad_norm": 2.004122734069824,
- "learning_rate": 3.4358691505911105e-06,
- "loss": 0.46013444662094116,
- "mean_token_accuracy": 0.8400925993919373,
- "num_tokens": 14440741.0,
- "step": 1616
- },
- {
- "epoch": 1.2287234042553192,
- "grad_norm": 1.8407535552978516,
- "learning_rate": 3.4339267140610317e-06,
- "loss": 0.38828906416893005,
- "mean_token_accuracy": 0.8582802414894104,
- "num_tokens": 14448698.0,
- "step": 1617
- },
- {
- "epoch": 1.2294832826747721,
- "grad_norm": 2.4285924434661865,
- "learning_rate": 3.4319836220652334e-06,
- "loss": 0.3109283447265625,
- "mean_token_accuracy": 0.8888344764709473,
- "num_tokens": 14453674.0,
- "step": 1618
- },
- {
- "epoch": 1.2302431610942248,
- "grad_norm": 1.6322550773620605,
- "learning_rate": 3.430039875967454e-06,
- "loss": 0.5222204327583313,
- "mean_token_accuracy": 0.825019121170044,
- "num_tokens": 14465736.0,
- "step": 1619
- },
- {
- "epoch": 1.2310030395136777,
- "grad_norm": 2.307573080062866,
- "learning_rate": 3.428095477131888e-06,
- "loss": 0.29477375745773315,
- "mean_token_accuracy": 0.8899064660072327,
- "num_tokens": 14471266.0,
- "step": 1620
- },
- {
- "epoch": 1.2317629179331306,
- "grad_norm": 1.8044531345367432,
- "learning_rate": 3.4261504269231904e-06,
- "loss": 0.4883342981338501,
- "mean_token_accuracy": 0.8310165405273438,
- "num_tokens": 14481679.0,
- "step": 1621
- },
- {
- "epoch": 1.2325227963525835,
- "grad_norm": 2.7585411071777344,
- "learning_rate": 3.4242047267064714e-06,
- "loss": 0.45369645953178406,
- "mean_token_accuracy": 0.8432134985923767,
- "num_tokens": 14487299.0,
- "step": 1622
- },
- {
- "epoch": 1.2332826747720365,
- "grad_norm": 2.687490701675415,
- "learning_rate": 3.4222583778472997e-06,
- "loss": 0.5627540349960327,
- "mean_token_accuracy": 0.8186438083648682,
- "num_tokens": 14494254.0,
- "step": 1623
- },
- {
- "epoch": 1.2340425531914894,
- "grad_norm": 2.622443199157715,
- "learning_rate": 3.4203113817116955e-06,
- "loss": 0.28697147965431213,
- "mean_token_accuracy": 0.8861737847328186,
- "num_tokens": 14498632.0,
- "step": 1624
- },
- {
- "epoch": 1.2348024316109423,
- "grad_norm": 2.6943359375,
- "learning_rate": 3.4183637396661372e-06,
- "loss": 0.25273287296295166,
- "mean_token_accuracy": 0.9104914665222168,
- "num_tokens": 14502797.0,
- "step": 1625
- },
- {
- "epoch": 1.2355623100303952,
- "grad_norm": 2.428189992904663,
- "learning_rate": 3.4164154530775552e-06,
- "loss": 0.4213451147079468,
- "mean_token_accuracy": 0.851524293422699,
- "num_tokens": 14508503.0,
- "step": 1626
- },
- {
- "epoch": 1.236322188449848,
- "grad_norm": 2.1722824573516846,
- "learning_rate": 3.4144665233133318e-06,
- "loss": 0.35238856077194214,
- "mean_token_accuracy": 0.8730837106704712,
- "num_tokens": 14516126.0,
- "step": 1627
- },
- {
- "epoch": 1.237082066869301,
- "grad_norm": 2.291365146636963,
- "learning_rate": 3.4125169517413005e-06,
- "loss": 0.43963465094566345,
- "mean_token_accuracy": 0.8525444865226746,
- "num_tokens": 14522507.0,
- "step": 1628
- },
- {
- "epoch": 1.237841945288754,
- "grad_norm": 1.6181648969650269,
- "learning_rate": 3.410566739729746e-06,
- "loss": 0.2799680233001709,
- "mean_token_accuracy": 0.8915654420852661,
- "num_tokens": 14531025.0,
- "step": 1629
- },
- {
- "epoch": 1.2386018237082066,
- "grad_norm": 1.4039218425750732,
- "learning_rate": 3.408615888647402e-06,
- "loss": 0.29756587743759155,
- "mean_token_accuracy": 0.8951715230941772,
- "num_tokens": 14543770.0,
- "step": 1630
- },
- {
- "epoch": 1.2393617021276595,
- "grad_norm": 2.148325204849243,
- "learning_rate": 3.4066643998634506e-06,
- "loss": 0.3983418345451355,
- "mean_token_accuracy": 0.8635951280593872,
- "num_tokens": 14550896.0,
- "step": 1631
- },
- {
- "epoch": 1.2401215805471124,
- "grad_norm": 1.5225859880447388,
- "learning_rate": 3.4047122747475227e-06,
- "loss": 0.3247569799423218,
- "mean_token_accuracy": 0.8727027177810669,
- "num_tokens": 14562181.0,
- "step": 1632
- },
- {
- "epoch": 1.2408814589665653,
- "grad_norm": 3.99835467338562,
- "learning_rate": 3.402759514669694e-06,
- "loss": 0.4317352771759033,
- "mean_token_accuracy": 0.8488142490386963,
- "num_tokens": 14565521.0,
- "step": 1633
- },
- {
- "epoch": 1.2416413373860182,
- "grad_norm": 1.7306902408599854,
- "learning_rate": 3.4008061210004872e-06,
- "loss": 0.389854371547699,
- "mean_token_accuracy": 0.8553084135055542,
- "num_tokens": 14574633.0,
- "step": 1634
- },
- {
- "epoch": 1.2424012158054711,
- "grad_norm": 2.3614673614501953,
- "learning_rate": 3.3988520951108683e-06,
- "loss": 0.3150152564048767,
- "mean_token_accuracy": 0.8865959644317627,
- "num_tokens": 14580240.0,
- "step": 1635
- },
- {
- "epoch": 1.243161094224924,
- "grad_norm": 1.5625747442245483,
- "learning_rate": 3.3968974383722497e-06,
- "loss": 0.43160033226013184,
- "mean_token_accuracy": 0.840155839920044,
- "num_tokens": 14594255.0,
- "step": 1636
- },
- {
- "epoch": 1.243920972644377,
- "grad_norm": 1.871620535850525,
- "learning_rate": 3.3949421521564825e-06,
- "loss": 0.49550193548202515,
- "mean_token_accuracy": 0.8315126299858093,
- "num_tokens": 14605416.0,
- "step": 1637
- },
- {
- "epoch": 1.2446808510638299,
- "grad_norm": 2.111304759979248,
- "learning_rate": 3.392986237835863e-06,
- "loss": 0.2794899046421051,
- "mean_token_accuracy": 0.9049773216247559,
- "num_tokens": 14611711.0,
- "step": 1638
- },
- {
- "epoch": 1.2454407294832828,
- "grad_norm": 3.7479894161224365,
- "learning_rate": 3.391029696783127e-06,
- "loss": 0.469397634267807,
- "mean_token_accuracy": 0.8352956771850586,
- "num_tokens": 14615536.0,
- "step": 1639
- },
- {
- "epoch": 1.2462006079027357,
- "grad_norm": 3.277726650238037,
- "learning_rate": 3.389072530371451e-06,
- "loss": 0.35431790351867676,
- "mean_token_accuracy": 0.8822286128997803,
- "num_tokens": 14619390.0,
- "step": 1640
- },
- {
- "epoch": 1.2469604863221884,
- "grad_norm": 1.9583072662353516,
- "learning_rate": 3.3871147399744482e-06,
- "loss": 0.3708694577217102,
- "mean_token_accuracy": 0.8720351457595825,
- "num_tokens": 14626573.0,
- "step": 1641
- },
- {
- "epoch": 1.2477203647416413,
- "grad_norm": 1.8734042644500732,
- "learning_rate": 3.385156326966173e-06,
- "loss": 0.48163774609565735,
- "mean_token_accuracy": 0.8479621410369873,
- "num_tokens": 14636382.0,
- "step": 1642
- },
- {
- "epoch": 1.2484802431610942,
- "grad_norm": 2.0085532665252686,
- "learning_rate": 3.383197292721114e-06,
- "loss": 0.4893198311328888,
- "mean_token_accuracy": 0.838238000869751,
- "num_tokens": 14645083.0,
- "step": 1643
- },
- {
- "epoch": 1.249240121580547,
- "grad_norm": 2.0874593257904053,
- "learning_rate": 3.3812376386141966e-06,
- "loss": 0.4610505700111389,
- "mean_token_accuracy": 0.8441368341445923,
- "num_tokens": 14654048.0,
- "step": 1644
- },
- {
- "epoch": 1.25,
- "grad_norm": 1.6887420415878296,
- "learning_rate": 3.379277366020782e-06,
- "loss": 0.3628596067428589,
- "mean_token_accuracy": 0.8838590383529663,
- "num_tokens": 14662317.0,
- "step": 1645
- },
- {
- "epoch": 1.250759878419453,
- "grad_norm": 2.389002561569214,
- "learning_rate": 3.3773164763166653e-06,
- "loss": 0.21903495490550995,
- "mean_token_accuracy": 0.9249413013458252,
- "num_tokens": 14666394.0,
- "step": 1646
- },
- {
- "epoch": 1.2515197568389058,
- "grad_norm": 1.7091087102890015,
- "learning_rate": 3.3753549708780736e-06,
- "loss": 0.37802332639694214,
- "mean_token_accuracy": 0.8644627332687378,
- "num_tokens": 14676214.0,
- "step": 1647
- },
- {
- "epoch": 1.2522796352583587,
- "grad_norm": 2.5717999935150146,
- "learning_rate": 3.3733928510816677e-06,
- "loss": 0.4236462116241455,
- "mean_token_accuracy": 0.8519910573959351,
- "num_tokens": 14681681.0,
- "step": 1648
- },
- {
- "epoch": 1.2530395136778116,
- "grad_norm": 1.958856463432312,
- "learning_rate": 3.3714301183045382e-06,
- "loss": 0.3923419415950775,
- "mean_token_accuracy": 0.8720202445983887,
- "num_tokens": 14690419.0,
- "step": 1649
- },
- {
- "epoch": 1.2537993920972643,
- "grad_norm": 1.5900038480758667,
- "learning_rate": 3.369466773924207e-06,
- "loss": 0.4182325601577759,
- "mean_token_accuracy": 0.8515387177467346,
- "num_tokens": 14699790.0,
- "step": 1650
- },
- {
- "epoch": 1.2545592705167175,
- "grad_norm": 1.260547161102295,
- "learning_rate": 3.3675028193186243e-06,
- "loss": 0.3915718197822571,
- "mean_token_accuracy": 0.8536830544471741,
- "num_tokens": 14717502.0,
- "step": 1651
- },
- {
- "epoch": 1.2553191489361701,
- "grad_norm": 1.8152283430099487,
- "learning_rate": 3.365538255866169e-06,
- "loss": 0.424524188041687,
- "mean_token_accuracy": 0.8434420824050903,
- "num_tokens": 14726591.0,
- "step": 1652
- },
- {
- "epoch": 1.256079027355623,
- "grad_norm": 1.3357285261154175,
- "learning_rate": 3.3635730849456484e-06,
- "loss": 0.2949739396572113,
- "mean_token_accuracy": 0.8868321180343628,
- "num_tokens": 14739911.0,
- "step": 1653
- },
- {
- "epoch": 1.256838905775076,
- "grad_norm": 1.1770358085632324,
- "learning_rate": 3.3616073079362925e-06,
- "loss": 0.29939576983451843,
- "mean_token_accuracy": 0.8923654556274414,
- "num_tokens": 14755521.0,
- "step": 1654
- },
- {
- "epoch": 1.2575987841945289,
- "grad_norm": 2.059162139892578,
- "learning_rate": 3.3596409262177633e-06,
- "loss": 0.4562555253505707,
- "mean_token_accuracy": 0.8585271239280701,
- "num_tokens": 14764173.0,
- "step": 1655
- },
- {
- "epoch": 1.2583586626139818,
- "grad_norm": 1.430752158164978,
- "learning_rate": 3.357673941170139e-06,
- "loss": 0.35301265120506287,
- "mean_token_accuracy": 0.8920517563819885,
- "num_tokens": 14775596.0,
- "step": 1656
- },
- {
- "epoch": 1.2591185410334347,
- "grad_norm": 1.6066302061080933,
- "learning_rate": 3.3557063541739283e-06,
- "loss": 0.41129636764526367,
- "mean_token_accuracy": 0.8512256145477295,
- "num_tokens": 14786289.0,
- "step": 1657
- },
- {
- "epoch": 1.2598784194528876,
- "grad_norm": 1.5471590757369995,
- "learning_rate": 3.353738166610058e-06,
- "loss": 0.3935067057609558,
- "mean_token_accuracy": 0.8514131903648376,
- "num_tokens": 14798672.0,
- "step": 1658
- },
- {
- "epoch": 1.2606382978723405,
- "grad_norm": 1.3455181121826172,
- "learning_rate": 3.35176937985988e-06,
- "loss": 0.3486790657043457,
- "mean_token_accuracy": 0.8644362688064575,
- "num_tokens": 14811603.0,
- "step": 1659
- },
- {
- "epoch": 1.2613981762917934,
- "grad_norm": 1.891432762145996,
- "learning_rate": 3.349799995305162e-06,
- "loss": 0.3325638175010681,
- "mean_token_accuracy": 0.8844645023345947,
- "num_tokens": 14819256.0,
- "step": 1660
- },
- {
- "epoch": 1.262158054711246,
- "grad_norm": 2.600614309310913,
- "learning_rate": 3.3478300143280946e-06,
- "loss": 0.30310919880867004,
- "mean_token_accuracy": 0.9103429317474365,
- "num_tokens": 14823706.0,
- "step": 1661
- },
- {
- "epoch": 1.2629179331306992,
- "grad_norm": 3.8636202812194824,
- "learning_rate": 3.3458594383112868e-06,
- "loss": 0.28377676010131836,
- "mean_token_accuracy": 0.9047091007232666,
- "num_tokens": 14826688.0,
- "step": 1662
- },
- {
- "epoch": 1.263677811550152,
- "grad_norm": 2.3100268840789795,
- "learning_rate": 3.343888268637765e-06,
- "loss": 0.4723394513130188,
- "mean_token_accuracy": 0.8306777477264404,
- "num_tokens": 14835471.0,
- "step": 1663
- },
- {
- "epoch": 1.2644376899696048,
- "grad_norm": 1.7582160234451294,
- "learning_rate": 3.341916506690971e-06,
- "loss": 0.48168784379959106,
- "mean_token_accuracy": 0.8281306028366089,
- "num_tokens": 14846513.0,
- "step": 1664
- },
- {
- "epoch": 1.2651975683890577,
- "grad_norm": 2.166055917739868,
- "learning_rate": 3.3399441538547638e-06,
- "loss": 0.4626024067401886,
- "mean_token_accuracy": 0.8377980589866638,
- "num_tokens": 14853408.0,
- "step": 1665
- },
- {
- "epoch": 1.2659574468085106,
- "grad_norm": 2.23038911819458,
- "learning_rate": 3.337971211513417e-06,
- "loss": 0.38434159755706787,
- "mean_token_accuracy": 0.8708412647247314,
- "num_tokens": 14859919.0,
- "step": 1666
- },
- {
- "epoch": 1.2667173252279635,
- "grad_norm": 2.092505693435669,
- "learning_rate": 3.3359976810516164e-06,
- "loss": 0.35072219371795654,
- "mean_token_accuracy": 0.8761640191078186,
- "num_tokens": 14865624.0,
- "step": 1667
- },
- {
- "epoch": 1.2674772036474165,
- "grad_norm": 1.8255130052566528,
- "learning_rate": 3.3340235638544633e-06,
- "loss": 0.4404270648956299,
- "mean_token_accuracy": 0.836356520652771,
- "num_tokens": 14874181.0,
- "step": 1668
- },
- {
- "epoch": 1.2682370820668694,
- "grad_norm": 1.9889036417007446,
- "learning_rate": 3.332048861307467e-06,
- "loss": 0.4199368357658386,
- "mean_token_accuracy": 0.8508217334747314,
- "num_tokens": 14882275.0,
- "step": 1669
- },
- {
- "epoch": 1.2689969604863223,
- "grad_norm": 4.050281047821045,
- "learning_rate": 3.330073574796551e-06,
- "loss": 0.4271625280380249,
- "mean_token_accuracy": 0.8471108675003052,
- "num_tokens": 14893633.0,
- "step": 1670
- },
- {
- "epoch": 1.2697568389057752,
- "grad_norm": 1.998838186264038,
- "learning_rate": 3.328097705708047e-06,
- "loss": 0.34743767976760864,
- "mean_token_accuracy": 0.8771528005599976,
- "num_tokens": 14899859.0,
- "step": 1671
- },
- {
- "epoch": 1.2705167173252279,
- "grad_norm": 1.7989062070846558,
- "learning_rate": 3.3261212554286977e-06,
- "loss": 0.5267184376716614,
- "mean_token_accuracy": 0.8323302268981934,
- "num_tokens": 14911131.0,
- "step": 1672
- },
- {
- "epoch": 1.2712765957446808,
- "grad_norm": 1.312070369720459,
- "learning_rate": 3.324144225345649e-06,
- "loss": 0.4675425887107849,
- "mean_token_accuracy": 0.8157106637954712,
- "num_tokens": 14928955.0,
- "step": 1673
- },
- {
- "epoch": 1.2720364741641337,
- "grad_norm": 2.0547919273376465,
- "learning_rate": 3.3221666168464584e-06,
- "loss": 0.33704331517219543,
- "mean_token_accuracy": 0.8621441125869751,
- "num_tokens": 14935536.0,
- "step": 1674
- },
- {
- "epoch": 1.2727963525835866,
- "grad_norm": 2.810413122177124,
- "learning_rate": 3.320188431319088e-06,
- "loss": 0.4007563292980194,
- "mean_token_accuracy": 0.8649672269821167,
- "num_tokens": 14940219.0,
- "step": 1675
- },
- {
- "epoch": 1.2735562310030395,
- "grad_norm": 1.3516674041748047,
- "learning_rate": 3.318209670151904e-06,
- "loss": 0.3457040786743164,
- "mean_token_accuracy": 0.8698287010192871,
- "num_tokens": 14952904.0,
- "step": 1676
- },
- {
- "epoch": 1.2743161094224924,
- "grad_norm": 2.440643310546875,
- "learning_rate": 3.3162303347336765e-06,
- "loss": 0.5195086002349854,
- "mean_token_accuracy": 0.8348199129104614,
- "num_tokens": 14958623.0,
- "step": 1677
- },
- {
- "epoch": 1.2750759878419453,
- "grad_norm": 1.3264343738555908,
- "learning_rate": 3.3142504264535808e-06,
- "loss": 0.2990425229072571,
- "mean_token_accuracy": 0.8961933851242065,
- "num_tokens": 14971494.0,
- "step": 1678
- },
- {
- "epoch": 1.2758358662613982,
- "grad_norm": 1.3106894493103027,
- "learning_rate": 3.3122699467011913e-06,
- "loss": 0.291853666305542,
- "mean_token_accuracy": 0.893449068069458,
- "num_tokens": 14985239.0,
- "step": 1679
- },
- {
- "epoch": 1.2765957446808511,
- "grad_norm": 2.5387396812438965,
- "learning_rate": 3.3102888968664857e-06,
- "loss": 0.4336916208267212,
- "mean_token_accuracy": 0.8447890877723694,
- "num_tokens": 14991453.0,
- "step": 1680
- },
- {
- "epoch": 1.2773556231003038,
- "grad_norm": 2.7052135467529297,
- "learning_rate": 3.308307278339842e-06,
- "loss": 0.3279378116130829,
- "mean_token_accuracy": 0.8935879468917847,
- "num_tokens": 14995428.0,
- "step": 1681
- },
- {
- "epoch": 1.278115501519757,
- "grad_norm": 1.6251261234283447,
- "learning_rate": 3.306325092512034e-06,
- "loss": 0.32066458463668823,
- "mean_token_accuracy": 0.8909799456596375,
- "num_tokens": 15004841.0,
- "step": 1682
- },
- {
- "epoch": 1.2788753799392096,
- "grad_norm": 2.3014605045318604,
- "learning_rate": 3.3043423407742374e-06,
- "loss": 0.3523373603820801,
- "mean_token_accuracy": 0.8810735940933228,
- "num_tokens": 15010742.0,
- "step": 1683
- },
- {
- "epoch": 1.2796352583586625,
- "grad_norm": 2.9563019275665283,
- "learning_rate": 3.3023590245180237e-06,
- "loss": 0.39715707302093506,
- "mean_token_accuracy": 0.8779881000518799,
- "num_tokens": 15015357.0,
- "step": 1684
- },
- {
- "epoch": 1.2803951367781155,
- "grad_norm": 1.5787957906723022,
- "learning_rate": 3.300375145135361e-06,
- "loss": 0.44630166888237,
- "mean_token_accuracy": 0.8400174975395203,
- "num_tokens": 15031360.0,
- "step": 1685
- },
- {
- "epoch": 1.2811550151975684,
- "grad_norm": 1.6753438711166382,
- "learning_rate": 3.2983907040186112e-06,
- "loss": 0.3235800862312317,
- "mean_token_accuracy": 0.8938044309616089,
- "num_tokens": 15040276.0,
- "step": 1686
- },
- {
- "epoch": 1.2819148936170213,
- "grad_norm": 1.7331148386001587,
- "learning_rate": 3.296405702560532e-06,
- "loss": 0.39061424136161804,
- "mean_token_accuracy": 0.8599754571914673,
- "num_tokens": 15049725.0,
- "step": 1687
- },
- {
- "epoch": 1.2826747720364742,
- "grad_norm": 2.2029430866241455,
- "learning_rate": 3.294420142154274e-06,
- "loss": 0.43598297238349915,
- "mean_token_accuracy": 0.8663698434829712,
- "num_tokens": 15058182.0,
- "step": 1688
- },
- {
- "epoch": 1.283434650455927,
- "grad_norm": 2.943964958190918,
- "learning_rate": 3.29243402419338e-06,
- "loss": 0.405210942029953,
- "mean_token_accuracy": 0.854996919631958,
- "num_tokens": 15062920.0,
- "step": 1689
- },
- {
- "epoch": 1.28419452887538,
- "grad_norm": 1.9343379735946655,
- "learning_rate": 3.2904473500717826e-06,
- "loss": 0.35011449456214905,
- "mean_token_accuracy": 0.8745867013931274,
- "num_tokens": 15070298.0,
- "step": 1690
- },
- {
- "epoch": 1.284954407294833,
- "grad_norm": 2.559859037399292,
- "learning_rate": 3.2884601211838087e-06,
- "loss": 0.38816407322883606,
- "mean_token_accuracy": 0.854763388633728,
- "num_tokens": 15075667.0,
- "step": 1691
- },
- {
- "epoch": 1.2857142857142856,
- "grad_norm": 1.4357839822769165,
- "learning_rate": 3.2864723389241697e-06,
- "loss": 0.4512745141983032,
- "mean_token_accuracy": 0.8398592472076416,
- "num_tokens": 15090291.0,
- "step": 1692
- },
- {
- "epoch": 1.2864741641337387,
- "grad_norm": 1.7643728256225586,
- "learning_rate": 3.284484004687969e-06,
- "loss": 0.3536742627620697,
- "mean_token_accuracy": 0.8726381063461304,
- "num_tokens": 15099325.0,
- "step": 1693
- },
- {
- "epoch": 1.2872340425531914,
- "grad_norm": 1.853173017501831,
- "learning_rate": 3.2824951198706958e-06,
- "loss": 0.36579740047454834,
- "mean_token_accuracy": 0.8988048434257507,
- "num_tokens": 15107090.0,
- "step": 1694
- },
- {
- "epoch": 1.2879939209726443,
- "grad_norm": 1.6526862382888794,
- "learning_rate": 3.280505685868226e-06,
- "loss": 0.3853636682033539,
- "mean_token_accuracy": 0.8743607997894287,
- "num_tokens": 15117818.0,
- "step": 1695
- },
- {
- "epoch": 1.2887537993920972,
- "grad_norm": 2.790398597717285,
- "learning_rate": 3.278515704076821e-06,
- "loss": 0.2707311511039734,
- "mean_token_accuracy": 0.9034668803215027,
- "num_tokens": 15121641.0,
- "step": 1696
- },
- {
- "epoch": 1.2895136778115501,
- "grad_norm": 1.69557523727417,
- "learning_rate": 3.276525175893126e-06,
- "loss": 0.3707970082759857,
- "mean_token_accuracy": 0.8617855906486511,
- "num_tokens": 15130414.0,
- "step": 1697
- },
- {
- "epoch": 1.290273556231003,
- "grad_norm": 1.1360478401184082,
- "learning_rate": 3.274534102714172e-06,
- "loss": 0.3368082344532013,
- "mean_token_accuracy": 0.8781654834747314,
- "num_tokens": 15148307.0,
- "step": 1698
- },
- {
- "epoch": 1.291033434650456,
- "grad_norm": 1.5894653797149658,
- "learning_rate": 3.272542485937369e-06,
- "loss": 0.3870658278465271,
- "mean_token_accuracy": 0.8830926418304443,
- "num_tokens": 15161841.0,
- "step": 1699
- },
- {
- "epoch": 1.2917933130699089,
- "grad_norm": 2.3735709190368652,
- "learning_rate": 3.270550326960511e-06,
- "loss": 0.3873991370201111,
- "mean_token_accuracy": 0.8729057908058167,
- "num_tokens": 15167733.0,
- "step": 1700
- },
- {
- "epoch": 1.2925531914893618,
- "grad_norm": 1.3739598989486694,
- "learning_rate": 3.268557627181772e-06,
- "loss": 0.30831626057624817,
- "mean_token_accuracy": 0.8695719242095947,
- "num_tokens": 15180861.0,
- "step": 1701
- },
- {
- "epoch": 1.2933130699088147,
- "grad_norm": 1.7526969909667969,
- "learning_rate": 3.2665643879997054e-06,
- "loss": 0.4716024398803711,
- "mean_token_accuracy": 0.8303275108337402,
- "num_tokens": 15191642.0,
- "step": 1702
- },
- {
- "epoch": 1.2940729483282674,
- "grad_norm": 2.7866084575653076,
- "learning_rate": 3.2645706108132426e-06,
- "loss": 0.33337634801864624,
- "mean_token_accuracy": 0.8790726065635681,
- "num_tokens": 15196038.0,
- "step": 1703
- },
- {
- "epoch": 1.2948328267477205,
- "grad_norm": 2.319765090942383,
- "learning_rate": 3.2625762970216944e-06,
- "loss": 0.3999716639518738,
- "mean_token_accuracy": 0.8693568706512451,
- "num_tokens": 15202075.0,
- "step": 1704
- },
- {
- "epoch": 1.2955927051671732,
- "grad_norm": 3.18292498588562,
- "learning_rate": 3.2605814480247454e-06,
- "loss": 0.4579541087150574,
- "mean_token_accuracy": 0.8516187071800232,
- "num_tokens": 15206886.0,
- "step": 1705
- },
- {
- "epoch": 1.296352583586626,
- "grad_norm": 2.1816933155059814,
- "learning_rate": 3.258586065222459e-06,
- "loss": 0.5198885202407837,
- "mean_token_accuracy": 0.8170592784881592,
- "num_tokens": 15214088.0,
- "step": 1706
- },
- {
- "epoch": 1.297112462006079,
- "grad_norm": 1.9076340198516846,
- "learning_rate": 3.2565901500152702e-06,
- "loss": 0.49752360582351685,
- "mean_token_accuracy": 0.8681992292404175,
- "num_tokens": 15226046.0,
- "step": 1707
- },
- {
- "epoch": 1.297872340425532,
- "grad_norm": 2.0223331451416016,
- "learning_rate": 3.2545937038039904e-06,
- "loss": 0.4515793025493622,
- "mean_token_accuracy": 0.8429619073867798,
- "num_tokens": 15234993.0,
- "step": 1708
- },
- {
- "epoch": 1.2986322188449848,
- "grad_norm": 2.5089669227600098,
- "learning_rate": 3.2525967279898017e-06,
- "loss": 0.43628376722335815,
- "mean_token_accuracy": 0.8493682146072388,
- "num_tokens": 15240575.0,
- "step": 1709
- },
- {
- "epoch": 1.2993920972644377,
- "grad_norm": 2.8347091674804688,
- "learning_rate": 3.2505992239742582e-06,
- "loss": 0.25112441182136536,
- "mean_token_accuracy": 0.908825159072876,
- "num_tokens": 15244085.0,
- "step": 1710
- },
- {
- "epoch": 1.3001519756838906,
- "grad_norm": 2.3157572746276855,
- "learning_rate": 3.2486011931592863e-06,
- "loss": 0.482818067073822,
- "mean_token_accuracy": 0.8305923938751221,
- "num_tokens": 15250377.0,
- "step": 1711
- },
- {
- "epoch": 1.3009118541033435,
- "grad_norm": 3.169052839279175,
- "learning_rate": 3.2466026369471804e-06,
- "loss": 0.3493242561817169,
- "mean_token_accuracy": 0.86913001537323,
- "num_tokens": 15255041.0,
- "step": 1712
- },
- {
- "epoch": 1.3016717325227964,
- "grad_norm": 1.4475083351135254,
- "learning_rate": 3.2446035567406033e-06,
- "loss": 0.4177290201187134,
- "mean_token_accuracy": 0.8497589826583862,
- "num_tokens": 15266946.0,
- "step": 1713
- },
- {
- "epoch": 1.3024316109422491,
- "grad_norm": 1.6473008394241333,
- "learning_rate": 3.2426039539425875e-06,
- "loss": 0.5272886753082275,
- "mean_token_accuracy": 0.8440133333206177,
- "num_tokens": 15279263.0,
- "step": 1714
- },
- {
- "epoch": 1.3031914893617023,
- "grad_norm": 2.3996543884277344,
- "learning_rate": 3.240603829956531e-06,
- "loss": 0.4272066652774811,
- "mean_token_accuracy": 0.8495640754699707,
- "num_tokens": 15285213.0,
- "step": 1715
- },
- {
- "epoch": 1.303951367781155,
- "grad_norm": 1.63034987449646,
- "learning_rate": 3.238603186186198e-06,
- "loss": 0.4034635126590729,
- "mean_token_accuracy": 0.8638584613800049,
- "num_tokens": 15295974.0,
- "step": 1716
- },
- {
- "epoch": 1.3047112462006079,
- "grad_norm": 2.153608798980713,
- "learning_rate": 3.2366020240357166e-06,
- "loss": 0.30712565779685974,
- "mean_token_accuracy": 0.8863866329193115,
- "num_tokens": 15302220.0,
- "step": 1717
- },
- {
- "epoch": 1.3054711246200608,
- "grad_norm": 2.9814558029174805,
- "learning_rate": 3.2346003449095803e-06,
- "loss": 0.3922840356826782,
- "mean_token_accuracy": 0.868030309677124,
- "num_tokens": 15306747.0,
- "step": 1718
- },
- {
- "epoch": 1.3062310030395137,
- "grad_norm": 3.3417985439300537,
- "learning_rate": 3.2325981502126434e-06,
- "loss": 0.30750396847724915,
- "mean_token_accuracy": 0.9065356850624084,
- "num_tokens": 15310309.0,
- "step": 1719
- },
- {
- "epoch": 1.3069908814589666,
- "grad_norm": 2.237682819366455,
- "learning_rate": 3.2305954413501252e-06,
- "loss": 0.35068294405937195,
- "mean_token_accuracy": 0.8887614011764526,
- "num_tokens": 15316463.0,
- "step": 1720
- },
- {
- "epoch": 1.3077507598784195,
- "grad_norm": 1.9526605606079102,
- "learning_rate": 3.228592219727602e-06,
- "loss": 0.42061835527420044,
- "mean_token_accuracy": 0.8456839323043823,
- "num_tokens": 15323984.0,
- "step": 1721
- },
- {
- "epoch": 1.3085106382978724,
- "grad_norm": 1.6454212665557861,
- "learning_rate": 3.226588486751012e-06,
- "loss": 0.5189976692199707,
- "mean_token_accuracy": 0.8187375068664551,
- "num_tokens": 15338807.0,
- "step": 1722
- },
- {
- "epoch": 1.3092705167173253,
- "grad_norm": 1.4521609544754028,
- "learning_rate": 3.2245842438266526e-06,
- "loss": 0.329673171043396,
- "mean_token_accuracy": 0.853867769241333,
- "num_tokens": 15350400.0,
- "step": 1723
- },
- {
- "epoch": 1.3100303951367782,
- "grad_norm": 1.8750989437103271,
- "learning_rate": 3.222579492361179e-06,
- "loss": 0.4635341167449951,
- "mean_token_accuracy": 0.8393422365188599,
- "num_tokens": 15360557.0,
- "step": 1724
- },
- {
- "epoch": 1.310790273556231,
- "grad_norm": 1.2728849649429321,
- "learning_rate": 3.220574233761603e-06,
- "loss": 0.3255572021007538,
- "mean_token_accuracy": 0.8989741802215576,
- "num_tokens": 15376548.0,
- "step": 1725
- },
- {
- "epoch": 1.3115501519756838,
- "grad_norm": 3.5155694484710693,
- "learning_rate": 3.2185684694352913e-06,
- "loss": 0.34204089641571045,
- "mean_token_accuracy": 0.8781906366348267,
- "num_tokens": 15380304.0,
- "step": 1726
- },
- {
- "epoch": 1.3123100303951367,
- "grad_norm": 2.059800148010254,
- "learning_rate": 3.216562200789968e-06,
- "loss": 0.36288338899612427,
- "mean_token_accuracy": 0.8595278263092041,
- "num_tokens": 15387653.0,
- "step": 1727
- },
- {
- "epoch": 1.3130699088145896,
- "grad_norm": 3.5388240814208984,
- "learning_rate": 3.214555429233707e-06,
- "loss": 0.5434849858283997,
- "mean_token_accuracy": 0.8074631690979004,
- "num_tokens": 15391662.0,
- "step": 1728
- },
- {
- "epoch": 1.3138297872340425,
- "grad_norm": 2.8595592975616455,
- "learning_rate": 3.2125481561749406e-06,
- "loss": 0.5113687515258789,
- "mean_token_accuracy": 0.8448649644851685,
- "num_tokens": 15397536.0,
- "step": 1729
- },
- {
- "epoch": 1.3145896656534954,
- "grad_norm": 2.50386905670166,
- "learning_rate": 3.210540383022449e-06,
- "loss": 0.5293697118759155,
- "mean_token_accuracy": 0.8096445798873901,
- "num_tokens": 15403478.0,
- "step": 1730
- },
- {
- "epoch": 1.3153495440729484,
- "grad_norm": 1.880035400390625,
- "learning_rate": 3.208532111185365e-06,
- "loss": 0.5344835519790649,
- "mean_token_accuracy": 0.8172965049743652,
- "num_tokens": 15413812.0,
- "step": 1731
- },
- {
- "epoch": 1.3161094224924013,
- "grad_norm": 1.3688768148422241,
- "learning_rate": 3.2065233420731717e-06,
- "loss": 0.2577427327632904,
- "mean_token_accuracy": 0.9142681360244751,
- "num_tokens": 15423583.0,
- "step": 1732
- },
- {
- "epoch": 1.3168693009118542,
- "grad_norm": 1.7945705652236938,
- "learning_rate": 3.2045140770956987e-06,
- "loss": 0.3983926773071289,
- "mean_token_accuracy": 0.8652000427246094,
- "num_tokens": 15432473.0,
- "step": 1733
- },
- {
- "epoch": 1.3176291793313069,
- "grad_norm": 1.8243350982666016,
- "learning_rate": 3.2025043176631283e-06,
- "loss": 0.48644185066223145,
- "mean_token_accuracy": 0.8319193124771118,
- "num_tokens": 15445463.0,
- "step": 1734
- },
- {
- "epoch": 1.31838905775076,
- "grad_norm": 2.000094175338745,
- "learning_rate": 3.2004940651859844e-06,
- "loss": 0.43567317724227905,
- "mean_token_accuracy": 0.8857482671737671,
- "num_tokens": 15452382.0,
- "step": 1735
- },
- {
- "epoch": 1.3191489361702127,
- "grad_norm": 2.379974365234375,
- "learning_rate": 3.198483321075141e-06,
- "loss": 0.5153506398200989,
- "mean_token_accuracy": 0.8295865654945374,
- "num_tokens": 15458740.0,
- "step": 1736
- },
- {
- "epoch": 1.3199088145896656,
- "grad_norm": 1.6564184427261353,
- "learning_rate": 3.196472086741815e-06,
- "loss": 0.508430540561676,
- "mean_token_accuracy": 0.8181540369987488,
- "num_tokens": 15471844.0,
- "step": 1737
- },
- {
- "epoch": 1.3206686930091185,
- "grad_norm": 2.006925344467163,
- "learning_rate": 3.194460363597569e-06,
- "loss": 0.34542378783226013,
- "mean_token_accuracy": 0.8827437162399292,
- "num_tokens": 15478414.0,
- "step": 1738
- },
- {
- "epoch": 1.3214285714285714,
- "grad_norm": 3.589045763015747,
- "learning_rate": 3.192448153054306e-06,
- "loss": 0.4385780096054077,
- "mean_token_accuracy": 0.8480287790298462,
- "num_tokens": 15482063.0,
- "step": 1739
- },
- {
- "epoch": 1.3221884498480243,
- "grad_norm": 1.9797427654266357,
- "learning_rate": 3.190435456524275e-06,
- "loss": 0.4330386519432068,
- "mean_token_accuracy": 0.8458058834075928,
- "num_tokens": 15489803.0,
- "step": 1740
- },
- {
- "epoch": 1.3229483282674772,
- "grad_norm": 1.4777411222457886,
- "learning_rate": 3.188422275420063e-06,
- "loss": 0.3997895419597626,
- "mean_token_accuracy": 0.8639512062072754,
- "num_tokens": 15501103.0,
- "step": 1741
- },
- {
- "epoch": 1.3237082066869301,
- "grad_norm": 2.882338523864746,
- "learning_rate": 3.186408611154597e-06,
- "loss": 0.2336438149213791,
- "mean_token_accuracy": 0.9176726937294006,
- "num_tokens": 15504854.0,
- "step": 1742
- },
- {
- "epoch": 1.324468085106383,
- "grad_norm": 2.353503704071045,
- "learning_rate": 3.184394465141146e-06,
- "loss": 0.4107069671154022,
- "mean_token_accuracy": 0.8677014112472534,
- "num_tokens": 15510662.0,
- "step": 1743
- },
- {
- "epoch": 1.325227963525836,
- "grad_norm": 2.6551976203918457,
- "learning_rate": 3.1823798387933134e-06,
- "loss": 0.3862302899360657,
- "mean_token_accuracy": 0.8819445371627808,
- "num_tokens": 15515681.0,
- "step": 1744
- },
- {
- "epoch": 1.3259878419452886,
- "grad_norm": 1.478572964668274,
- "learning_rate": 3.180364733525043e-06,
- "loss": 0.43972986936569214,
- "mean_token_accuracy": 0.832388162612915,
- "num_tokens": 15529542.0,
- "step": 1745
- },
- {
- "epoch": 1.3267477203647418,
- "grad_norm": 1.6003550291061401,
- "learning_rate": 3.178349150750612e-06,
- "loss": 0.3404902219772339,
- "mean_token_accuracy": 0.8764007091522217,
- "num_tokens": 15538865.0,
- "step": 1746
- },
- {
- "epoch": 1.3275075987841944,
- "grad_norm": 2.130689859390259,
- "learning_rate": 3.1763330918846347e-06,
- "loss": 0.383136510848999,
- "mean_token_accuracy": 0.8652247190475464,
- "num_tokens": 15545567.0,
- "step": 1747
- },
- {
- "epoch": 1.3282674772036474,
- "grad_norm": 2.395937442779541,
- "learning_rate": 3.1743165583420586e-06,
- "loss": 0.3870319128036499,
- "mean_token_accuracy": 0.8618065118789673,
- "num_tokens": 15551090.0,
- "step": 1748
- },
- {
- "epoch": 1.3290273556231003,
- "grad_norm": 2.0841057300567627,
- "learning_rate": 3.1722995515381644e-06,
- "loss": 0.4838739335536957,
- "mean_token_accuracy": 0.8548711538314819,
- "num_tokens": 15558913.0,
- "step": 1749
- },
- {
- "epoch": 1.3297872340425532,
- "grad_norm": 1.4237847328186035,
- "learning_rate": 3.1702820728885657e-06,
- "loss": 0.40350261330604553,
- "mean_token_accuracy": 0.858984649181366,
- "num_tokens": 15572045.0,
- "step": 1750
- },
- {
- "epoch": 1.330547112462006,
- "grad_norm": 2.2641282081604004,
- "learning_rate": 3.1682641238092064e-06,
- "loss": 0.5117636919021606,
- "mean_token_accuracy": 0.8078924417495728,
- "num_tokens": 15579753.0,
- "step": 1751
- },
- {
- "epoch": 1.331306990881459,
- "grad_norm": 1.0010309219360352,
- "learning_rate": 3.1662457057163603e-06,
- "loss": 0.3220978379249573,
- "mean_token_accuracy": 0.8786559104919434,
- "num_tokens": 15602823.0,
- "step": 1752
- },
- {
- "epoch": 1.332066869300912,
- "grad_norm": 2.441230535507202,
- "learning_rate": 3.164226820026632e-06,
- "loss": 0.37529727816581726,
- "mean_token_accuracy": 0.8886898756027222,
- "num_tokens": 15608473.0,
- "step": 1753
- },
- {
- "epoch": 1.3328267477203648,
- "grad_norm": 1.2960991859436035,
- "learning_rate": 3.162207468156952e-06,
- "loss": 0.3393767476081848,
- "mean_token_accuracy": 0.8766993284225464,
- "num_tokens": 15620893.0,
- "step": 1754
- },
- {
- "epoch": 1.3335866261398177,
- "grad_norm": 2.0806996822357178,
- "learning_rate": 3.16018765152458e-06,
- "loss": 0.38034507632255554,
- "mean_token_accuracy": 0.8854838609695435,
- "num_tokens": 15627068.0,
- "step": 1755
- },
- {
- "epoch": 1.3343465045592704,
- "grad_norm": 1.4316699504852295,
- "learning_rate": 3.1581673715471007e-06,
- "loss": 0.3665890693664551,
- "mean_token_accuracy": 0.870919406414032,
- "num_tokens": 15641070.0,
- "step": 1756
- },
- {
- "epoch": 1.3351063829787235,
- "grad_norm": 1.3466622829437256,
- "learning_rate": 3.1561466296424247e-06,
- "loss": 0.37387198209762573,
- "mean_token_accuracy": 0.8633951544761658,
- "num_tokens": 15653777.0,
- "step": 1757
- },
- {
- "epoch": 1.3358662613981762,
- "grad_norm": 1.8108628988265991,
- "learning_rate": 3.154125427228786e-06,
- "loss": 0.38428938388824463,
- "mean_token_accuracy": 0.85402512550354,
- "num_tokens": 15662494.0,
- "step": 1758
- },
- {
- "epoch": 1.3366261398176291,
- "grad_norm": 1.3221700191497803,
- "learning_rate": 3.152103765724743e-06,
- "loss": 0.42825520038604736,
- "mean_token_accuracy": 0.8435465097427368,
- "num_tokens": 15677552.0,
- "step": 1759
- },
- {
- "epoch": 1.337386018237082,
- "grad_norm": 2.6247692108154297,
- "learning_rate": 3.150081646549174e-06,
- "loss": 0.36186715960502625,
- "mean_token_accuracy": 0.8767328262329102,
- "num_tokens": 15682103.0,
- "step": 1760
- },
- {
- "epoch": 1.338145896656535,
- "grad_norm": 2.1469814777374268,
- "learning_rate": 3.1480590711212823e-06,
- "loss": 0.3734385669231415,
- "mean_token_accuracy": 0.8711104393005371,
- "num_tokens": 15689182.0,
- "step": 1761
- },
- {
- "epoch": 1.3389057750759878,
- "grad_norm": 2.1702585220336914,
- "learning_rate": 3.1460360408605866e-06,
- "loss": 0.2795315086841583,
- "mean_token_accuracy": 0.8892190456390381,
- "num_tokens": 15694272.0,
- "step": 1762
- },
- {
- "epoch": 1.3396656534954408,
- "grad_norm": 1.918797254562378,
- "learning_rate": 3.144012557186931e-06,
- "loss": 0.4363473057746887,
- "mean_token_accuracy": 0.8573931455612183,
- "num_tokens": 15703532.0,
- "step": 1763
- },
- {
- "epoch": 1.3404255319148937,
- "grad_norm": 2.5579960346221924,
- "learning_rate": 3.14198862152047e-06,
- "loss": 0.406247079372406,
- "mean_token_accuracy": 0.8617593050003052,
- "num_tokens": 15708652.0,
- "step": 1764
- },
- {
- "epoch": 1.3411854103343466,
- "grad_norm": 2.3617870807647705,
- "learning_rate": 3.1399642352816825e-06,
- "loss": 0.2839522659778595,
- "mean_token_accuracy": 0.8996064066886902,
- "num_tokens": 15713598.0,
- "step": 1765
- },
- {
- "epoch": 1.3419452887537995,
- "grad_norm": 1.248302936553955,
- "learning_rate": 3.1379393998913594e-06,
- "loss": 0.2922290861606598,
- "mean_token_accuracy": 0.8948773145675659,
- "num_tokens": 15726693.0,
- "step": 1766
- },
- {
- "epoch": 1.3427051671732522,
- "grad_norm": 2.143599510192871,
- "learning_rate": 3.135914116770609e-06,
- "loss": 0.32176223397254944,
- "mean_token_accuracy": 0.8808754682540894,
- "num_tokens": 15731901.0,
- "step": 1767
- },
- {
- "epoch": 1.3434650455927053,
- "grad_norm": 4.226369857788086,
- "learning_rate": 3.1338883873408517e-06,
- "loss": 0.4682556390762329,
- "mean_token_accuracy": 0.8566025495529175,
- "num_tokens": 15735029.0,
- "step": 1768
- },
- {
- "epoch": 1.344224924012158,
- "grad_norm": 1.8695988655090332,
- "learning_rate": 3.1318622130238237e-06,
- "loss": 0.4297192394733429,
- "mean_token_accuracy": 0.8419148921966553,
- "num_tokens": 15744310.0,
- "step": 1769
- },
- {
- "epoch": 1.344984802431611,
- "grad_norm": 2.4321305751800537,
- "learning_rate": 3.1298355952415714e-06,
- "loss": 0.36076444387435913,
- "mean_token_accuracy": 0.8826035261154175,
- "num_tokens": 15749337.0,
- "step": 1770
- },
- {
- "epoch": 1.3457446808510638,
- "grad_norm": 1.5500011444091797,
- "learning_rate": 3.127808535416454e-06,
- "loss": 0.48664039373397827,
- "mean_token_accuracy": 0.844344437122345,
- "num_tokens": 15761096.0,
- "step": 1771
- },
- {
- "epoch": 1.3465045592705167,
- "grad_norm": 2.1498289108276367,
- "learning_rate": 3.1257810349711388e-06,
- "loss": 0.4841752052307129,
- "mean_token_accuracy": 0.8324567079544067,
- "num_tokens": 15768646.0,
- "step": 1772
- },
- {
- "epoch": 1.3472644376899696,
- "grad_norm": 1.2995187044143677,
- "learning_rate": 3.1237530953286046e-06,
- "loss": 0.492019385099411,
- "mean_token_accuracy": 0.8285316228866577,
- "num_tokens": 15788401.0,
- "step": 1773
- },
- {
- "epoch": 1.3480243161094225,
- "grad_norm": 2.324819803237915,
- "learning_rate": 3.121724717912138e-06,
- "loss": 0.33166298270225525,
- "mean_token_accuracy": 0.8856451511383057,
- "num_tokens": 15794097.0,
- "step": 1774
- },
- {
- "epoch": 1.3487841945288754,
- "grad_norm": 1.9611430168151855,
- "learning_rate": 3.11969590414533e-06,
- "loss": 0.3974284827709198,
- "mean_token_accuracy": 0.8751305937767029,
- "num_tokens": 15801065.0,
- "step": 1775
- },
- {
- "epoch": 1.3495440729483283,
- "grad_norm": 1.7084417343139648,
- "learning_rate": 3.1176666554520827e-06,
- "loss": 0.38729435205459595,
- "mean_token_accuracy": 0.8680770397186279,
- "num_tokens": 15810353.0,
- "step": 1776
- },
- {
- "epoch": 1.3503039513677813,
- "grad_norm": 1.7616240978240967,
- "learning_rate": 3.1156369732566006e-06,
- "loss": 0.4271578788757324,
- "mean_token_accuracy": 0.843730092048645,
- "num_tokens": 15821889.0,
- "step": 1777
- },
- {
- "epoch": 1.351063829787234,
- "grad_norm": 2.030747413635254,
- "learning_rate": 3.113606858983391e-06,
- "loss": 0.361891508102417,
- "mean_token_accuracy": 0.8522407412528992,
- "num_tokens": 15830800.0,
- "step": 1778
- },
- {
- "epoch": 1.3518237082066868,
- "grad_norm": 1.4842649698257446,
- "learning_rate": 3.1115763140572686e-06,
- "loss": 0.466334730386734,
- "mean_token_accuracy": 0.8433995246887207,
- "num_tokens": 15849422.0,
- "step": 1779
- },
- {
- "epoch": 1.3525835866261398,
- "grad_norm": 1.6595379114151,
- "learning_rate": 3.109545339903347e-06,
- "loss": 0.4622533321380615,
- "mean_token_accuracy": 0.8526314496994019,
- "num_tokens": 15860431.0,
- "step": 1780
- },
- {
- "epoch": 1.3533434650455927,
- "grad_norm": 2.1235809326171875,
- "learning_rate": 3.107513937947041e-06,
- "loss": 0.42694270610809326,
- "mean_token_accuracy": 0.854864239692688,
- "num_tokens": 15869044.0,
- "step": 1781
- },
- {
- "epoch": 1.3541033434650456,
- "grad_norm": 1.5889263153076172,
- "learning_rate": 3.1054821096140675e-06,
- "loss": 0.41838499903678894,
- "mean_token_accuracy": 0.8671513795852661,
- "num_tokens": 15878598.0,
- "step": 1782
- },
- {
- "epoch": 1.3548632218844985,
- "grad_norm": 2.2261741161346436,
- "learning_rate": 3.1034498563304435e-06,
- "loss": 0.4045066237449646,
- "mean_token_accuracy": 0.843826949596405,
- "num_tokens": 15885167.0,
- "step": 1783
- },
- {
- "epoch": 1.3556231003039514,
- "grad_norm": 2.2569329738616943,
- "learning_rate": 3.1014171795224794e-06,
- "loss": 0.36677104234695435,
- "mean_token_accuracy": 0.8747833967208862,
- "num_tokens": 15891308.0,
- "step": 1784
- },
- {
- "epoch": 1.3563829787234043,
- "grad_norm": 2.1027088165283203,
- "learning_rate": 3.0993840806167884e-06,
- "loss": 0.437946081161499,
- "mean_token_accuracy": 0.8370785117149353,
- "num_tokens": 15898952.0,
- "step": 1785
- },
- {
- "epoch": 1.3571428571428572,
- "grad_norm": 1.8768929243087769,
- "learning_rate": 3.0973505610402767e-06,
- "loss": 0.4201734662055969,
- "mean_token_accuracy": 0.8474810123443604,
- "num_tokens": 15907340.0,
- "step": 1786
- },
- {
- "epoch": 1.35790273556231,
- "grad_norm": 1.7216229438781738,
- "learning_rate": 3.0953166222201474e-06,
- "loss": 0.4225231409072876,
- "mean_token_accuracy": 0.8437749147415161,
- "num_tokens": 15917852.0,
- "step": 1787
- },
- {
- "epoch": 1.358662613981763,
- "grad_norm": 2.6256966590881348,
- "learning_rate": 3.093282265583895e-06,
- "loss": 0.435439795255661,
- "mean_token_accuracy": 0.8452040553092957,
- "num_tokens": 15923739.0,
- "step": 1788
- },
- {
- "epoch": 1.3594224924012157,
- "grad_norm": 2.90028977394104,
- "learning_rate": 3.0912474925593124e-06,
- "loss": 0.3730456829071045,
- "mean_token_accuracy": 0.8766646385192871,
- "num_tokens": 15927943.0,
- "step": 1789
- },
- {
- "epoch": 1.3601823708206686,
- "grad_norm": 1.5966626405715942,
- "learning_rate": 3.0892123045744787e-06,
- "loss": 0.42150455713272095,
- "mean_token_accuracy": 0.854656457901001,
- "num_tokens": 15939922.0,
- "step": 1790
- },
- {
- "epoch": 1.3609422492401215,
- "grad_norm": 1.8069748878479004,
- "learning_rate": 3.0871767030577686e-06,
- "loss": 0.4954872131347656,
- "mean_token_accuracy": 0.8289790153503418,
- "num_tokens": 15950095.0,
- "step": 1791
- },
- {
- "epoch": 1.3617021276595744,
- "grad_norm": 2.0855250358581543,
- "learning_rate": 3.085140689437846e-06,
- "loss": 0.41999945044517517,
- "mean_token_accuracy": 0.8517382144927979,
- "num_tokens": 15957972.0,
- "step": 1792
- },
- {
- "epoch": 1.3624620060790273,
- "grad_norm": 2.108659267425537,
- "learning_rate": 3.0831042651436634e-06,
- "loss": 0.3668023645877838,
- "mean_token_accuracy": 0.8710855841636658,
- "num_tokens": 15965614.0,
- "step": 1793
- },
- {
- "epoch": 1.3632218844984803,
- "grad_norm": 1.3799632787704468,
- "learning_rate": 3.0810674316044602e-06,
- "loss": 0.351409375667572,
- "mean_token_accuracy": 0.870837390422821,
- "num_tokens": 15978854.0,
- "step": 1794
- },
- {
- "epoch": 1.3639817629179332,
- "grad_norm": 1.540397047996521,
- "learning_rate": 3.0790301902497664e-06,
- "loss": 0.403600811958313,
- "mean_token_accuracy": 0.8485002517700195,
- "num_tokens": 15993324.0,
- "step": 1795
- },
- {
- "epoch": 1.364741641337386,
- "grad_norm": 1.946882963180542,
- "learning_rate": 3.076992542509396e-06,
- "loss": 0.40118327736854553,
- "mean_token_accuracy": 0.8607497811317444,
- "num_tokens": 16001937.0,
- "step": 1796
- },
- {
- "epoch": 1.365501519756839,
- "grad_norm": 2.0464305877685547,
- "learning_rate": 3.0749544898134487e-06,
- "loss": 0.31742292642593384,
- "mean_token_accuracy": 0.8878391981124878,
- "num_tokens": 16009277.0,
- "step": 1797
- },
- {
- "epoch": 1.3662613981762917,
- "grad_norm": 2.091754913330078,
- "learning_rate": 3.072916033592307e-06,
- "loss": 0.31580421328544617,
- "mean_token_accuracy": 0.8875244855880737,
- "num_tokens": 16015756.0,
- "step": 1798
- },
- {
- "epoch": 1.3670212765957448,
- "grad_norm": 3.4449212551116943,
- "learning_rate": 3.0708771752766397e-06,
- "loss": 0.4692591726779938,
- "mean_token_accuracy": 0.8456202149391174,
- "num_tokens": 16019912.0,
- "step": 1799
- },
- {
- "epoch": 1.3677811550151975,
- "grad_norm": 1.600419521331787,
- "learning_rate": 3.068837916297396e-06,
- "loss": 0.40389442443847656,
- "mean_token_accuracy": 0.8378961086273193,
- "num_tokens": 16032637.0,
- "step": 1800
- },
- {
- "epoch": 1.3685410334346504,
- "grad_norm": 1.5282686948776245,
- "learning_rate": 3.0667982580858047e-06,
- "loss": 0.379841685295105,
- "mean_token_accuracy": 0.8752143383026123,
- "num_tokens": 16045205.0,
- "step": 1801
- },
- {
- "epoch": 1.3693009118541033,
- "grad_norm": 2.486079454421997,
- "learning_rate": 3.0647582020733773e-06,
- "loss": 0.41060030460357666,
- "mean_token_accuracy": 0.8575131893157959,
- "num_tokens": 16051189.0,
- "step": 1802
- },
- {
- "epoch": 1.3700607902735562,
- "grad_norm": 1.9458621740341187,
- "learning_rate": 3.062717749691904e-06,
- "loss": 0.4442213773727417,
- "mean_token_accuracy": 0.8451495170593262,
- "num_tokens": 16059700.0,
- "step": 1803
- },
- {
- "epoch": 1.3708206686930091,
- "grad_norm": 1.4333001375198364,
- "learning_rate": 3.0606769023734535e-06,
- "loss": 0.39132001996040344,
- "mean_token_accuracy": 0.8609901666641235,
- "num_tokens": 16072458.0,
- "step": 1804
- },
- {
- "epoch": 1.371580547112462,
- "grad_norm": 1.490355372428894,
- "learning_rate": 3.0586356615503693e-06,
- "loss": 0.4108564257621765,
- "mean_token_accuracy": 0.8871046304702759,
- "num_tokens": 16083142.0,
- "step": 1805
- },
- {
- "epoch": 1.372340425531915,
- "grad_norm": 1.7765129804611206,
- "learning_rate": 3.056594028655274e-06,
- "loss": 0.3850266635417938,
- "mean_token_accuracy": 0.8923365473747253,
- "num_tokens": 16092519.0,
- "step": 1806
- },
- {
- "epoch": 1.3731003039513678,
- "grad_norm": 1.955661416053772,
- "learning_rate": 3.0545520051210637e-06,
- "loss": 0.4665378928184509,
- "mean_token_accuracy": 0.837419867515564,
- "num_tokens": 16100618.0,
- "step": 1807
- },
- {
- "epoch": 1.3738601823708207,
- "grad_norm": 3.259265422821045,
- "learning_rate": 3.052509592380909e-06,
- "loss": 0.24722981452941895,
- "mean_token_accuracy": 0.9106054306030273,
- "num_tokens": 16103836.0,
- "step": 1808
- },
- {
- "epoch": 1.3746200607902734,
- "grad_norm": 1.7995736598968506,
- "learning_rate": 3.050466791868254e-06,
- "loss": 0.4982220530509949,
- "mean_token_accuracy": 0.8298169374465942,
- "num_tokens": 16114727.0,
- "step": 1809
- },
- {
- "epoch": 1.3753799392097266,
- "grad_norm": 1.9643093347549438,
- "learning_rate": 3.048423605016815e-06,
- "loss": 0.5076829195022583,
- "mean_token_accuracy": 0.8303098678588867,
- "num_tokens": 16129491.0,
- "step": 1810
- },
- {
- "epoch": 1.3761398176291793,
- "grad_norm": 3.505594491958618,
- "learning_rate": 3.0463800332605787e-06,
- "loss": 0.27466052770614624,
- "mean_token_accuracy": 0.9018045663833618,
- "num_tokens": 16132640.0,
- "step": 1811
- },
- {
- "epoch": 1.3768996960486322,
- "grad_norm": 1.798437237739563,
- "learning_rate": 3.0443360780338034e-06,
- "loss": 0.4004853069782257,
- "mean_token_accuracy": 0.8569544553756714,
- "num_tokens": 16143317.0,
- "step": 1812
- },
- {
- "epoch": 1.377659574468085,
- "grad_norm": 2.276740789413452,
- "learning_rate": 3.042291740771014e-06,
- "loss": 0.3823797106742859,
- "mean_token_accuracy": 0.8764113783836365,
- "num_tokens": 16148898.0,
- "step": 1813
- },
- {
- "epoch": 1.378419452887538,
- "grad_norm": 2.5051357746124268,
- "learning_rate": 3.0402470229070057e-06,
- "loss": 0.40365856885910034,
- "mean_token_accuracy": 0.8809891939163208,
- "num_tokens": 16153815.0,
- "step": 1814
- },
- {
- "epoch": 1.3791793313069909,
- "grad_norm": 1.2379236221313477,
- "learning_rate": 3.03820192587684e-06,
- "loss": 0.3955119848251343,
- "mean_token_accuracy": 0.8536627292633057,
- "num_tokens": 16167783.0,
- "step": 1815
- },
- {
- "epoch": 1.3799392097264438,
- "grad_norm": 2.2286343574523926,
- "learning_rate": 3.036156451115846e-06,
- "loss": 0.39647501707077026,
- "mean_token_accuracy": 0.8621993064880371,
- "num_tokens": 16174707.0,
- "step": 1816
- },
- {
- "epoch": 1.3806990881458967,
- "grad_norm": 1.884639024734497,
- "learning_rate": 3.034110600059616e-06,
- "loss": 0.31612110137939453,
- "mean_token_accuracy": 0.8942475318908691,
- "num_tokens": 16181919.0,
- "step": 1817
- },
- {
- "epoch": 1.3814589665653496,
- "grad_norm": 1.891312599182129,
- "learning_rate": 3.0320643741440052e-06,
- "loss": 0.46209126710891724,
- "mean_token_accuracy": 0.8374713659286499,
- "num_tokens": 16189276.0,
- "step": 1818
- },
- {
- "epoch": 1.3822188449848025,
- "grad_norm": 2.507478713989258,
- "learning_rate": 3.0300177748051375e-06,
- "loss": 0.37601593136787415,
- "mean_token_accuracy": 0.8633589148521423,
- "num_tokens": 16194346.0,
- "step": 1819
- },
- {
- "epoch": 1.3829787234042552,
- "grad_norm": 1.5046696662902832,
- "learning_rate": 3.0279708034793907e-06,
- "loss": 0.3284982144832611,
- "mean_token_accuracy": 0.8792630434036255,
- "num_tokens": 16205457.0,
- "step": 1820
- },
- {
- "epoch": 1.3837386018237083,
- "grad_norm": 2.4244449138641357,
- "learning_rate": 3.025923461603412e-06,
- "loss": 0.40939009189605713,
- "mean_token_accuracy": 0.8596426248550415,
- "num_tokens": 16211866.0,
- "step": 1821
- },
- {
- "epoch": 1.384498480243161,
- "grad_norm": 2.8656933307647705,
- "learning_rate": 3.0238757506141013e-06,
- "loss": 0.4397110044956207,
- "mean_token_accuracy": 0.8597331047058105,
- "num_tokens": 16216607.0,
- "step": 1822
- },
- {
- "epoch": 1.385258358662614,
- "grad_norm": 2.0718610286712646,
- "learning_rate": 3.0218276719486245e-06,
- "loss": 0.49057573080062866,
- "mean_token_accuracy": 0.8325331211090088,
- "num_tokens": 16224014.0,
- "step": 1823
- },
- {
- "epoch": 1.3860182370820668,
- "grad_norm": 1.054450273513794,
- "learning_rate": 3.019779227044398e-06,
- "loss": 0.3758106827735901,
- "mean_token_accuracy": 0.8689473867416382,
- "num_tokens": 16248627.0,
- "step": 1824
- },
- {
- "epoch": 1.3867781155015197,
- "grad_norm": 2.1115148067474365,
- "learning_rate": 3.0177304173391038e-06,
- "loss": 0.502967119216919,
- "mean_token_accuracy": 0.823198676109314,
- "num_tokens": 16256255.0,
- "step": 1825
- },
- {
- "epoch": 1.3875379939209727,
- "grad_norm": 2.207277297973633,
- "learning_rate": 3.015681244270672e-06,
- "loss": 0.3458971083164215,
- "mean_token_accuracy": 0.8930196762084961,
- "num_tokens": 16261823.0,
- "step": 1826
- },
- {
- "epoch": 1.3882978723404256,
- "grad_norm": 1.289669156074524,
- "learning_rate": 3.0136317092772923e-06,
- "loss": 0.4422765374183655,
- "mean_token_accuracy": 0.8358346819877625,
- "num_tokens": 16280659.0,
- "step": 1827
- },
- {
- "epoch": 1.3890577507598785,
- "grad_norm": 2.233865737915039,
- "learning_rate": 3.0115818137974066e-06,
- "loss": 0.3643006384372711,
- "mean_token_accuracy": 0.8682862520217896,
- "num_tokens": 16286356.0,
- "step": 1828
- },
- {
- "epoch": 1.3898176291793314,
- "grad_norm": 1.0950042009353638,
- "learning_rate": 3.0095315592697126e-06,
- "loss": 0.34712421894073486,
- "mean_token_accuracy": 0.8578766584396362,
- "num_tokens": 16307298.0,
- "step": 1829
- },
- {
- "epoch": 1.3905775075987843,
- "grad_norm": 1.1708037853240967,
- "learning_rate": 3.007480947133155e-06,
- "loss": 0.33152541518211365,
- "mean_token_accuracy": 0.894973874092102,
- "num_tokens": 16323232.0,
- "step": 1830
- },
- {
- "epoch": 1.391337386018237,
- "grad_norm": 1.2226970195770264,
- "learning_rate": 3.0054299788269343e-06,
- "loss": 0.3915635943412781,
- "mean_token_accuracy": 0.8575779795646667,
- "num_tokens": 16339273.0,
- "step": 1831
- },
- {
- "epoch": 1.39209726443769,
- "grad_norm": 1.2226042747497559,
- "learning_rate": 3.0033786557904982e-06,
- "loss": 0.45846253633499146,
- "mean_token_accuracy": 0.8290432691574097,
- "num_tokens": 16360145.0,
- "step": 1832
- },
- {
- "epoch": 1.3928571428571428,
- "grad_norm": 2.0117406845092773,
- "learning_rate": 3.001326979463545e-06,
- "loss": 0.3837882876396179,
- "mean_token_accuracy": 0.8941739797592163,
- "num_tokens": 16366602.0,
- "step": 1833
- },
- {
- "epoch": 1.3936170212765957,
- "grad_norm": 1.8419997692108154,
- "learning_rate": 2.9992749512860177e-06,
- "loss": 0.40777021646499634,
- "mean_token_accuracy": 0.854655385017395,
- "num_tokens": 16375611.0,
- "step": 1834
- },
- {
- "epoch": 1.3943768996960486,
- "grad_norm": 1.9405122995376587,
- "learning_rate": 2.9972225726981114e-06,
- "loss": 0.46685922145843506,
- "mean_token_accuracy": 0.8493201732635498,
- "num_tokens": 16384878.0,
- "step": 1835
- },
- {
- "epoch": 1.3951367781155015,
- "grad_norm": 1.2425674200057983,
- "learning_rate": 2.995169845140264e-06,
- "loss": 0.394692063331604,
- "mean_token_accuracy": 0.851348876953125,
- "num_tokens": 16404452.0,
- "step": 1836
- },
- {
- "epoch": 1.3958966565349544,
- "grad_norm": 1.2215365171432495,
- "learning_rate": 2.9931167700531575e-06,
- "loss": 0.31412452459335327,
- "mean_token_accuracy": 0.882760763168335,
- "num_tokens": 16419358.0,
- "step": 1837
- },
- {
- "epoch": 1.3966565349544073,
- "grad_norm": 1.912168025970459,
- "learning_rate": 2.9910633488777198e-06,
- "loss": 0.5065487623214722,
- "mean_token_accuracy": 0.8524355292320251,
- "num_tokens": 16430418.0,
- "step": 1838
- },
- {
- "epoch": 1.3974164133738602,
- "grad_norm": 2.2173948287963867,
- "learning_rate": 2.989009583055121e-06,
- "loss": 0.4290938377380371,
- "mean_token_accuracy": 0.8381836414337158,
- "num_tokens": 16438267.0,
- "step": 1839
- },
- {
- "epoch": 1.3981762917933132,
- "grad_norm": 1.8293484449386597,
- "learning_rate": 2.9869554740267726e-06,
- "loss": 0.41683733463287354,
- "mean_token_accuracy": 0.8548779487609863,
- "num_tokens": 16447382.0,
- "step": 1840
- },
- {
- "epoch": 1.398936170212766,
- "grad_norm": 1.835015892982483,
- "learning_rate": 2.9849010232343274e-06,
- "loss": 0.5080599784851074,
- "mean_token_accuracy": 0.8193596601486206,
- "num_tokens": 16458541.0,
- "step": 1841
- },
- {
- "epoch": 1.3996960486322187,
- "grad_norm": 2.031339645385742,
- "learning_rate": 2.982846232119679e-06,
- "loss": 0.5168882012367249,
- "mean_token_accuracy": 0.8525956869125366,
- "num_tokens": 16467747.0,
- "step": 1842
- },
- {
- "epoch": 1.4004559270516717,
- "grad_norm": 1.5554167032241821,
- "learning_rate": 2.9807911021249573e-06,
- "loss": 0.35098958015441895,
- "mean_token_accuracy": 0.888373851776123,
- "num_tokens": 16479319.0,
- "step": 1843
- },
- {
- "epoch": 1.4012158054711246,
- "grad_norm": 1.7183740139007568,
- "learning_rate": 2.9787356346925327e-06,
- "loss": 0.41263148188591003,
- "mean_token_accuracy": 0.8478364944458008,
- "num_tokens": 16489952.0,
- "step": 1844
- },
- {
- "epoch": 1.4019756838905775,
- "grad_norm": 1.7743209600448608,
- "learning_rate": 2.9766798312650112e-06,
- "loss": 0.4211183190345764,
- "mean_token_accuracy": 0.8641136884689331,
- "num_tokens": 16498655.0,
- "step": 1845
- },
- {
- "epoch": 1.4027355623100304,
- "grad_norm": 2.141300916671753,
- "learning_rate": 2.9746236932852355e-06,
- "loss": 0.49548980593681335,
- "mean_token_accuracy": 0.8304252028465271,
- "num_tokens": 16506348.0,
- "step": 1846
- },
- {
- "epoch": 1.4034954407294833,
- "grad_norm": 2.341571807861328,
- "learning_rate": 2.9725672221962804e-06,
- "loss": 0.40804803371429443,
- "mean_token_accuracy": 0.8545800447463989,
- "num_tokens": 16513091.0,
- "step": 1847
- },
- {
- "epoch": 1.4042553191489362,
- "grad_norm": 1.934428095817566,
- "learning_rate": 2.9705104194414587e-06,
- "loss": 0.30029812455177307,
- "mean_token_accuracy": 0.9032052755355835,
- "num_tokens": 16519455.0,
- "step": 1848
- },
- {
- "epoch": 1.405015197568389,
- "grad_norm": 1.420804500579834,
- "learning_rate": 2.9684532864643123e-06,
- "loss": 0.4384060502052307,
- "mean_token_accuracy": 0.8465110063552856,
- "num_tokens": 16533222.0,
- "step": 1849
- },
- {
- "epoch": 1.405775075987842,
- "grad_norm": 2.1180737018585205,
- "learning_rate": 2.9663958247086165e-06,
- "loss": 0.3915565609931946,
- "mean_token_accuracy": 0.8633890748023987,
- "num_tokens": 16539489.0,
- "step": 1850
- },
- {
- "epoch": 1.4065349544072947,
- "grad_norm": 1.408048152923584,
- "learning_rate": 2.964338035618378e-06,
- "loss": 0.46166157722473145,
- "mean_token_accuracy": 0.8305013179779053,
- "num_tokens": 16555785.0,
- "step": 1851
- },
- {
- "epoch": 1.4072948328267478,
- "grad_norm": 1.3418530225753784,
- "learning_rate": 2.9622799206378306e-06,
- "loss": 0.5314373970031738,
- "mean_token_accuracy": 0.81779944896698,
- "num_tokens": 16578111.0,
- "step": 1852
- },
- {
- "epoch": 1.4080547112462005,
- "grad_norm": 1.4634262323379517,
- "learning_rate": 2.9602214812114414e-06,
- "loss": 0.4859408140182495,
- "mean_token_accuracy": 0.8261818885803223,
- "num_tokens": 16591976.0,
- "step": 1853
- },
- {
- "epoch": 1.4088145896656534,
- "grad_norm": 1.4840295314788818,
- "learning_rate": 2.9581627187838997e-06,
- "loss": 0.4079628586769104,
- "mean_token_accuracy": 0.8549603223800659,
- "num_tokens": 16603631.0,
- "step": 1854
- },
- {
- "epoch": 1.4095744680851063,
- "grad_norm": 2.1474642753601074,
- "learning_rate": 2.956103634800126e-06,
- "loss": 0.32997995615005493,
- "mean_token_accuracy": 0.8836915493011475,
- "num_tokens": 16609875.0,
- "step": 1855
- },
- {
- "epoch": 1.4103343465045592,
- "grad_norm": 2.627460241317749,
- "learning_rate": 2.9540442307052643e-06,
- "loss": 0.3229186236858368,
- "mean_token_accuracy": 0.8852157592773438,
- "num_tokens": 16614113.0,
- "step": 1856
- },
- {
- "epoch": 1.4110942249240122,
- "grad_norm": 1.9569811820983887,
- "learning_rate": 2.9519845079446824e-06,
- "loss": 0.5057883858680725,
- "mean_token_accuracy": 0.8585711717605591,
- "num_tokens": 16624611.0,
- "step": 1857
- },
- {
- "epoch": 1.411854103343465,
- "grad_norm": 2.0604090690612793,
- "learning_rate": 2.949924467963975e-06,
- "loss": 0.4681510329246521,
- "mean_token_accuracy": 0.8390560150146484,
- "num_tokens": 16632938.0,
- "step": 1858
- },
- {
- "epoch": 1.412613981762918,
- "grad_norm": 2.5430450439453125,
- "learning_rate": 2.9478641122089563e-06,
- "loss": 0.3090999126434326,
- "mean_token_accuracy": 0.8943990468978882,
- "num_tokens": 16637135.0,
- "step": 1859
- },
- {
- "epoch": 1.4133738601823709,
- "grad_norm": 1.3275387287139893,
- "learning_rate": 2.945803442125663e-06,
- "loss": 0.3592180013656616,
- "mean_token_accuracy": 0.8678265810012817,
- "num_tokens": 16650322.0,
- "step": 1860
- },
- {
- "epoch": 1.4141337386018238,
- "grad_norm": 1.9070929288864136,
- "learning_rate": 2.943742459160354e-06,
- "loss": 0.5332518815994263,
- "mean_token_accuracy": 0.8475706577301025,
- "num_tokens": 16660240.0,
- "step": 1861
- },
- {
- "epoch": 1.4148936170212765,
- "grad_norm": 2.8724546432495117,
- "learning_rate": 2.9416811647595052e-06,
- "loss": 0.5052884817123413,
- "mean_token_accuracy": 0.8363175392150879,
- "num_tokens": 16665481.0,
- "step": 1862
- },
- {
- "epoch": 1.4156534954407296,
- "grad_norm": 4.203817844390869,
- "learning_rate": 2.939619560369813e-06,
- "loss": 0.546925961971283,
- "mean_token_accuracy": 0.834044337272644,
- "num_tokens": 16669615.0,
- "step": 1863
- },
- {
- "epoch": 1.4164133738601823,
- "grad_norm": 1.6466281414031982,
- "learning_rate": 2.9375576474381907e-06,
- "loss": 0.3474533259868622,
- "mean_token_accuracy": 0.8571163415908813,
- "num_tokens": 16678893.0,
- "step": 1864
- },
- {
- "epoch": 1.4171732522796352,
- "grad_norm": 1.8885842561721802,
- "learning_rate": 2.9354954274117683e-06,
- "loss": 0.3726021349430084,
- "mean_token_accuracy": 0.8629094958305359,
- "num_tokens": 16685939.0,
- "step": 1865
- },
- {
- "epoch": 1.417933130699088,
- "grad_norm": 2.830599784851074,
- "learning_rate": 2.9334329017378898e-06,
- "loss": 0.4138668477535248,
- "mean_token_accuracy": 0.8670746088027954,
- "num_tokens": 16690012.0,
- "step": 1866
- },
- {
- "epoch": 1.418693009118541,
- "grad_norm": 1.6838961839675903,
- "learning_rate": 2.9313700718641167e-06,
- "loss": 0.33954259753227234,
- "mean_token_accuracy": 0.8660278916358948,
- "num_tokens": 16700061.0,
- "step": 1867
- },
- {
- "epoch": 1.419452887537994,
- "grad_norm": 2.8767011165618896,
- "learning_rate": 2.9293069392382224e-06,
- "loss": 0.4650302827358246,
- "mean_token_accuracy": 0.8448452949523926,
- "num_tokens": 16705072.0,
- "step": 1868
- },
- {
- "epoch": 1.4202127659574468,
- "grad_norm": 1.5901305675506592,
- "learning_rate": 2.927243505308192e-06,
- "loss": 0.40838998556137085,
- "mean_token_accuracy": 0.8560664653778076,
- "num_tokens": 16714763.0,
- "step": 1869
- },
- {
- "epoch": 1.4209726443768997,
- "grad_norm": 1.3293657302856445,
- "learning_rate": 2.925179771522223e-06,
- "loss": 0.34712862968444824,
- "mean_token_accuracy": 0.8633697032928467,
- "num_tokens": 16729575.0,
- "step": 1870
- },
- {
- "epoch": 1.4217325227963526,
- "grad_norm": 1.7465964555740356,
- "learning_rate": 2.9231157393287234e-06,
- "loss": 0.48190903663635254,
- "mean_token_accuracy": 0.8255834579467773,
- "num_tokens": 16742529.0,
- "step": 1871
- },
- {
- "epoch": 1.4224924012158056,
- "grad_norm": 1.865749716758728,
- "learning_rate": 2.9210514101763116e-06,
- "loss": 0.4912028908729553,
- "mean_token_accuracy": 0.8309572339057922,
- "num_tokens": 16753989.0,
- "step": 1872
- },
- {
- "epoch": 1.4232522796352582,
- "grad_norm": 2.55780291557312,
- "learning_rate": 2.9189867855138103e-06,
- "loss": 0.4550635814666748,
- "mean_token_accuracy": 0.8584091067314148,
- "num_tokens": 16758906.0,
- "step": 1873
- },
- {
- "epoch": 1.4240121580547114,
- "grad_norm": 1.867530107498169,
- "learning_rate": 2.9169218667902562e-06,
- "loss": 0.3524911105632782,
- "mean_token_accuracy": 0.8715004920959473,
- "num_tokens": 16765969.0,
- "step": 1874
- },
- {
- "epoch": 1.424772036474164,
- "grad_norm": 1.8886862993240356,
- "learning_rate": 2.9148566554548857e-06,
- "loss": 0.37144535779953003,
- "mean_token_accuracy": 0.8640961050987244,
- "num_tokens": 16773935.0,
- "step": 1875
- },
- {
- "epoch": 1.425531914893617,
- "grad_norm": 1.266065239906311,
- "learning_rate": 2.912791152957145e-06,
- "loss": 0.3341747522354126,
- "mean_token_accuracy": 0.8929134607315063,
- "num_tokens": 16787780.0,
- "step": 1876
- },
- {
- "epoch": 1.4262917933130699,
- "grad_norm": 2.524888753890991,
- "learning_rate": 2.9107253607466833e-06,
- "loss": 0.33709171414375305,
- "mean_token_accuracy": 0.8857531547546387,
- "num_tokens": 16792753.0,
- "step": 1877
- },
- {
- "epoch": 1.4270516717325228,
- "grad_norm": 1.9269018173217773,
- "learning_rate": 2.908659280273354e-06,
- "loss": 0.32599249482154846,
- "mean_token_accuracy": 0.8777773380279541,
- "num_tokens": 16799904.0,
- "step": 1878
- },
- {
- "epoch": 1.4278115501519757,
- "grad_norm": 1.9844375848770142,
- "learning_rate": 2.9065929129872097e-06,
- "loss": 0.4086732268333435,
- "mean_token_accuracy": 0.8505409955978394,
- "num_tokens": 16807774.0,
- "step": 1879
- },
- {
- "epoch": 1.4285714285714286,
- "grad_norm": 4.0958662033081055,
- "learning_rate": 2.9045262603385073e-06,
- "loss": 0.3838827610015869,
- "mean_token_accuracy": 0.877601146697998,
- "num_tokens": 16810908.0,
- "step": 1880
- },
- {
- "epoch": 1.4293313069908815,
- "grad_norm": 1.7323768138885498,
- "learning_rate": 2.902459323777704e-06,
- "loss": 0.37459003925323486,
- "mean_token_accuracy": 0.8655836582183838,
- "num_tokens": 16819494.0,
- "step": 1881
- },
- {
- "epoch": 1.4300911854103344,
- "grad_norm": 2.608043670654297,
- "learning_rate": 2.900392104755455e-06,
- "loss": 0.5798726677894592,
- "mean_token_accuracy": 0.8382592797279358,
- "num_tokens": 16827745.0,
- "step": 1882
- },
- {
- "epoch": 1.4308510638297873,
- "grad_norm": 1.3262078762054443,
- "learning_rate": 2.8983246047226137e-06,
- "loss": 0.3724595904350281,
- "mean_token_accuracy": 0.8651963472366333,
- "num_tokens": 16844171.0,
- "step": 1883
- },
- {
- "epoch": 1.43161094224924,
- "grad_norm": 1.7250545024871826,
- "learning_rate": 2.8962568251302327e-06,
- "loss": 0.3478979468345642,
- "mean_token_accuracy": 0.8807886242866516,
- "num_tokens": 16852838.0,
- "step": 1884
- },
- {
- "epoch": 1.4323708206686931,
- "grad_norm": 2.114525318145752,
- "learning_rate": 2.8941887674295573e-06,
- "loss": 0.5156140327453613,
- "mean_token_accuracy": 0.825178861618042,
- "num_tokens": 16861087.0,
- "step": 1885
- },
- {
- "epoch": 1.4331306990881458,
- "grad_norm": 2.400829792022705,
- "learning_rate": 2.892120433072031e-06,
- "loss": 0.2807392477989197,
- "mean_token_accuracy": 0.8907361030578613,
- "num_tokens": 16866557.0,
- "step": 1886
- },
- {
- "epoch": 1.4338905775075987,
- "grad_norm": 2.490880012512207,
- "learning_rate": 2.8900518235092908e-06,
- "loss": 0.2615952491760254,
- "mean_token_accuracy": 0.9152894020080566,
- "num_tokens": 16871357.0,
- "step": 1887
- },
- {
- "epoch": 1.4346504559270516,
- "grad_norm": 1.9058431386947632,
- "learning_rate": 2.887982940193165e-06,
- "loss": 0.43623363971710205,
- "mean_token_accuracy": 0.84696364402771,
- "num_tokens": 16879016.0,
- "step": 1888
- },
- {
- "epoch": 1.4354103343465046,
- "grad_norm": 1.4520210027694702,
- "learning_rate": 2.8859137845756785e-06,
- "loss": 0.3961856961250305,
- "mean_token_accuracy": 0.8518897294998169,
- "num_tokens": 16892254.0,
- "step": 1889
- },
- {
- "epoch": 1.4361702127659575,
- "grad_norm": 2.500274896621704,
- "learning_rate": 2.8838443581090415e-06,
- "loss": 0.41457289457321167,
- "mean_token_accuracy": 0.8751448392868042,
- "num_tokens": 16897156.0,
- "step": 1890
- },
- {
- "epoch": 1.4369300911854104,
- "grad_norm": 2.9312057495117188,
- "learning_rate": 2.8817746622456585e-06,
- "loss": 0.45875269174575806,
- "mean_token_accuracy": 0.8411039113998413,
- "num_tokens": 16902291.0,
- "step": 1891
- },
- {
- "epoch": 1.4376899696048633,
- "grad_norm": 2.367419481277466,
- "learning_rate": 2.879704698438121e-06,
- "loss": 0.3643629848957062,
- "mean_token_accuracy": 0.8771071434020996,
- "num_tokens": 16908128.0,
- "step": 1892
- },
- {
- "epoch": 1.4384498480243162,
- "grad_norm": 1.9907705783843994,
- "learning_rate": 2.8776344681392106e-06,
- "loss": 0.3206835389137268,
- "mean_token_accuracy": 0.879996657371521,
- "num_tokens": 16914918.0,
- "step": 1893
- },
- {
- "epoch": 1.439209726443769,
- "grad_norm": 3.536956310272217,
- "learning_rate": 2.875563972801893e-06,
- "loss": 0.3640141785144806,
- "mean_token_accuracy": 0.8814959526062012,
- "num_tokens": 16918187.0,
- "step": 1894
- },
- {
- "epoch": 1.4399696048632218,
- "grad_norm": 1.3451156616210938,
- "learning_rate": 2.8734932138793226e-06,
- "loss": 0.3427346348762512,
- "mean_token_accuracy": 0.8835382461547852,
- "num_tokens": 16931135.0,
- "step": 1895
- },
- {
- "epoch": 1.4407294832826747,
- "grad_norm": 2.0735955238342285,
- "learning_rate": 2.871422192824837e-06,
- "loss": 0.4265315532684326,
- "mean_token_accuracy": 0.8452677726745605,
- "num_tokens": 16937995.0,
- "step": 1896
- },
- {
- "epoch": 1.4414893617021276,
- "grad_norm": 1.5124932527542114,
- "learning_rate": 2.8693509110919597e-06,
- "loss": 0.497121661901474,
- "mean_token_accuracy": 0.815092921257019,
- "num_tokens": 16952743.0,
- "step": 1897
- },
- {
- "epoch": 1.4422492401215805,
- "grad_norm": 3.716669797897339,
- "learning_rate": 2.867279370134395e-06,
- "loss": 0.5452651381492615,
- "mean_token_accuracy": 0.8150380849838257,
- "num_tokens": 16956797.0,
- "step": 1898
- },
- {
- "epoch": 1.4430091185410334,
- "grad_norm": 1.3571398258209229,
- "learning_rate": 2.8652075714060296e-06,
- "loss": 0.4249724745750427,
- "mean_token_accuracy": 0.8675867915153503,
- "num_tokens": 16974494.0,
- "step": 1899
- },
- {
- "epoch": 1.4437689969604863,
- "grad_norm": 2.310673475265503,
- "learning_rate": 2.863135516360932e-06,
- "loss": 0.39368677139282227,
- "mean_token_accuracy": 0.878392219543457,
- "num_tokens": 16980612.0,
- "step": 1900
- },
- {
- "epoch": 1.4445288753799392,
- "grad_norm": 1.9025533199310303,
- "learning_rate": 2.8610632064533517e-06,
- "loss": 0.4786127805709839,
- "mean_token_accuracy": 0.8720556497573853,
- "num_tokens": 16992262.0,
- "step": 1901
- },
- {
- "epoch": 1.4452887537993921,
- "grad_norm": 2.528564453125,
- "learning_rate": 2.8589906431377133e-06,
- "loss": 0.4223094582557678,
- "mean_token_accuracy": 0.8513246178627014,
- "num_tokens": 16997717.0,
- "step": 1902
- },
- {
- "epoch": 1.446048632218845,
- "grad_norm": 1.010425329208374,
- "learning_rate": 2.8569178278686222e-06,
- "loss": 0.3908255696296692,
- "mean_token_accuracy": 0.8620463609695435,
- "num_tokens": 17020903.0,
- "step": 1903
- },
- {
- "epoch": 1.4468085106382977,
- "grad_norm": 1.5760232210159302,
- "learning_rate": 2.8548447621008614e-06,
- "loss": 0.4134044051170349,
- "mean_token_accuracy": 0.8472093343734741,
- "num_tokens": 17035250.0,
- "step": 1904
- },
- {
- "epoch": 1.4475683890577509,
- "grad_norm": 2.0668535232543945,
- "learning_rate": 2.8527714472893866e-06,
- "loss": 0.44095730781555176,
- "mean_token_accuracy": 0.881983757019043,
- "num_tokens": 17042170.0,
- "step": 1905
- },
- {
- "epoch": 1.4483282674772036,
- "grad_norm": 1.1620599031448364,
- "learning_rate": 2.85069788488933e-06,
- "loss": 0.3607163429260254,
- "mean_token_accuracy": 0.8684282898902893,
- "num_tokens": 17061937.0,
- "step": 1906
- },
- {
- "epoch": 1.4490881458966565,
- "grad_norm": 2.1316568851470947,
- "learning_rate": 2.8486240763559984e-06,
- "loss": 0.3478124141693115,
- "mean_token_accuracy": 0.8772403001785278,
- "num_tokens": 17068628.0,
- "step": 1907
- },
- {
- "epoch": 1.4498480243161094,
- "grad_norm": 2.4756391048431396,
- "learning_rate": 2.8465500231448707e-06,
- "loss": 0.46441152691841125,
- "mean_token_accuracy": 0.8436450958251953,
- "num_tokens": 17075495.0,
- "step": 1908
- },
- {
- "epoch": 1.4506079027355623,
- "grad_norm": 2.249720573425293,
- "learning_rate": 2.844475726711595e-06,
- "loss": 0.41565513610839844,
- "mean_token_accuracy": 0.8525094985961914,
- "num_tokens": 17080940.0,
- "step": 1909
- },
- {
- "epoch": 1.4513677811550152,
- "grad_norm": 2.3081841468811035,
- "learning_rate": 2.8424011885119956e-06,
- "loss": 0.49903199076652527,
- "mean_token_accuracy": 0.8212426900863647,
- "num_tokens": 17092024.0,
- "step": 1910
- },
- {
- "epoch": 1.452127659574468,
- "grad_norm": 1.2929959297180176,
- "learning_rate": 2.8403264100020613e-06,
- "loss": 0.47038257122039795,
- "mean_token_accuracy": 0.8319816589355469,
- "num_tokens": 17108840.0,
- "step": 1911
- },
- {
- "epoch": 1.452887537993921,
- "grad_norm": 1.6476463079452515,
- "learning_rate": 2.8382513926379508e-06,
- "loss": 0.42287829518318176,
- "mean_token_accuracy": 0.8555682897567749,
- "num_tokens": 17119704.0,
- "step": 1912
- },
- {
- "epoch": 1.453647416413374,
- "grad_norm": 1.759998083114624,
- "learning_rate": 2.836176137875993e-06,
- "loss": 0.40904951095581055,
- "mean_token_accuracy": 0.8698266744613647,
- "num_tokens": 17130676.0,
- "step": 1913
- },
- {
- "epoch": 1.4544072948328268,
- "grad_norm": 1.510909914970398,
- "learning_rate": 2.8341006471726817e-06,
- "loss": 0.47834792733192444,
- "mean_token_accuracy": 0.8335825204849243,
- "num_tokens": 17146304.0,
- "step": 1914
- },
- {
- "epoch": 1.4551671732522795,
- "grad_norm": 3.538071632385254,
- "learning_rate": 2.832024921984674e-06,
- "loss": 0.34059035778045654,
- "mean_token_accuracy": 0.8769031763076782,
- "num_tokens": 17150458.0,
- "step": 1915
- },
- {
- "epoch": 1.4559270516717326,
- "grad_norm": 2.3368659019470215,
- "learning_rate": 2.8299489637687955e-06,
- "loss": 0.43068382143974304,
- "mean_token_accuracy": 0.845360517501831,
- "num_tokens": 17157368.0,
- "step": 1916
- },
- {
- "epoch": 1.4566869300911853,
- "grad_norm": 1.8720396757125854,
- "learning_rate": 2.8278727739820334e-06,
- "loss": 0.37013399600982666,
- "mean_token_accuracy": 0.854241132736206,
- "num_tokens": 17166325.0,
- "step": 1917
- },
- {
- "epoch": 1.4574468085106382,
- "grad_norm": 1.6706892251968384,
- "learning_rate": 2.825796354081537e-06,
- "loss": 0.5397020578384399,
- "mean_token_accuracy": 0.8309713006019592,
- "num_tokens": 17178920.0,
- "step": 1918
- },
- {
- "epoch": 1.4582066869300911,
- "grad_norm": 2.729210376739502,
- "learning_rate": 2.8237197055246175e-06,
- "loss": 0.25137859582901,
- "mean_token_accuracy": 0.9148792028427124,
- "num_tokens": 17183107.0,
- "step": 1919
- },
- {
- "epoch": 1.458966565349544,
- "grad_norm": 3.023500680923462,
- "learning_rate": 2.821642829768748e-06,
- "loss": 0.43312495946884155,
- "mean_token_accuracy": 0.8481811285018921,
- "num_tokens": 17187853.0,
- "step": 1920
- },
- {
- "epoch": 1.459726443768997,
- "grad_norm": 1.8108519315719604,
- "learning_rate": 2.8195657282715595e-06,
- "loss": 0.5101792216300964,
- "mean_token_accuracy": 0.8315553069114685,
- "num_tokens": 17199247.0,
- "step": 1921
- },
- {
- "epoch": 1.4604863221884499,
- "grad_norm": 2.0262672901153564,
- "learning_rate": 2.817488402490841e-06,
- "loss": 0.4449934959411621,
- "mean_token_accuracy": 0.8634527325630188,
- "num_tokens": 17206348.0,
- "step": 1922
- },
- {
- "epoch": 1.4612462006079028,
- "grad_norm": 2.6163926124572754,
- "learning_rate": 2.8154108538845405e-06,
- "loss": 0.43052345514297485,
- "mean_token_accuracy": 0.8375401496887207,
- "num_tokens": 17211702.0,
- "step": 1923
- },
- {
- "epoch": 1.4620060790273557,
- "grad_norm": 2.0854408740997314,
- "learning_rate": 2.813333083910761e-06,
- "loss": 0.5011380910873413,
- "mean_token_accuracy": 0.8359915018081665,
- "num_tokens": 17219096.0,
- "step": 1924
- },
- {
- "epoch": 1.4627659574468086,
- "grad_norm": 2.2081687450408936,
- "learning_rate": 2.8112550940277615e-06,
- "loss": 0.5239193439483643,
- "mean_token_accuracy": 0.8499593734741211,
- "num_tokens": 17229266.0,
- "step": 1925
- },
- {
- "epoch": 1.4635258358662613,
- "grad_norm": 1.798343539237976,
- "learning_rate": 2.809176885693956e-06,
- "loss": 0.4515029191970825,
- "mean_token_accuracy": 0.8400485515594482,
- "num_tokens": 17239280.0,
- "step": 1926
- },
- {
- "epoch": 1.4642857142857144,
- "grad_norm": 1.897887945175171,
- "learning_rate": 2.807098460367911e-06,
- "loss": 0.35935714840888977,
- "mean_token_accuracy": 0.8776072263717651,
- "num_tokens": 17247132.0,
- "step": 1927
- },
- {
- "epoch": 1.465045592705167,
- "grad_norm": 2.705836296081543,
- "learning_rate": 2.8050198195083445e-06,
- "loss": 0.3728443682193756,
- "mean_token_accuracy": 0.8649885654449463,
- "num_tokens": 17251865.0,
- "step": 1928
- },
- {
- "epoch": 1.46580547112462,
- "grad_norm": 1.841178059577942,
- "learning_rate": 2.802940964574127e-06,
- "loss": 0.40604841709136963,
- "mean_token_accuracy": 0.8537783622741699,
- "num_tokens": 17260163.0,
- "step": 1929
- },
- {
- "epoch": 1.466565349544073,
- "grad_norm": 2.7393605709075928,
- "learning_rate": 2.800861897024279e-06,
- "loss": 0.39346879720687866,
- "mean_token_accuracy": 0.8628787994384766,
- "num_tokens": 17264876.0,
- "step": 1930
- },
- {
- "epoch": 1.4673252279635258,
- "grad_norm": 1.84367835521698,
- "learning_rate": 2.798782618317971e-06,
- "loss": 0.37411895394325256,
- "mean_token_accuracy": 0.8605265617370605,
- "num_tokens": 17273049.0,
- "step": 1931
- },
- {
- "epoch": 1.4680851063829787,
- "grad_norm": 1.6546733379364014,
- "learning_rate": 2.796703129914519e-06,
- "loss": 0.4997844099998474,
- "mean_token_accuracy": 0.8267433643341064,
- "num_tokens": 17285074.0,
- "step": 1932
- },
- {
- "epoch": 1.4688449848024316,
- "grad_norm": 2.2749221324920654,
- "learning_rate": 2.79462343327339e-06,
- "loss": 0.35453367233276367,
- "mean_token_accuracy": 0.8746850490570068,
- "num_tokens": 17290273.0,
- "step": 1933
- },
- {
- "epoch": 1.4696048632218845,
- "grad_norm": 1.7142518758773804,
- "learning_rate": 2.7925435298541944e-06,
- "loss": 0.345878541469574,
- "mean_token_accuracy": 0.8600981831550598,
- "num_tokens": 17301045.0,
- "step": 1934
- },
- {
- "epoch": 1.4703647416413375,
- "grad_norm": 3.163342237472534,
- "learning_rate": 2.7904634211166877e-06,
- "loss": 0.4356975853443146,
- "mean_token_accuracy": 0.8460350036621094,
- "num_tokens": 17305108.0,
- "step": 1935
- },
- {
- "epoch": 1.4711246200607904,
- "grad_norm": 1.6377612352371216,
- "learning_rate": 2.7883831085207707e-06,
- "loss": 0.4459729790687561,
- "mean_token_accuracy": 0.8463394641876221,
- "num_tokens": 17315479.0,
- "step": 1936
- },
- {
- "epoch": 1.471884498480243,
- "grad_norm": 1.865268588066101,
- "learning_rate": 2.7863025935264876e-06,
- "loss": 0.394723117351532,
- "mean_token_accuracy": 0.864177942276001,
- "num_tokens": 17324795.0,
- "step": 1937
- },
- {
- "epoch": 1.4726443768996962,
- "grad_norm": 1.241937518119812,
- "learning_rate": 2.784221877594024e-06,
- "loss": 0.2752220630645752,
- "mean_token_accuracy": 0.8998259902000427,
- "num_tokens": 17338000.0,
- "step": 1938
- },
- {
- "epoch": 1.4734042553191489,
- "grad_norm": 1.8013651371002197,
- "learning_rate": 2.7821409621837042e-06,
- "loss": 0.4251005947589874,
- "mean_token_accuracy": 0.8518919348716736,
- "num_tokens": 17347351.0,
- "step": 1939
- },
- {
- "epoch": 1.4741641337386018,
- "grad_norm": 1.2902207374572754,
- "learning_rate": 2.7800598487559976e-06,
- "loss": 0.3640727400779724,
- "mean_token_accuracy": 0.8592870235443115,
- "num_tokens": 17362335.0,
- "step": 1940
- },
- {
- "epoch": 1.4749240121580547,
- "grad_norm": 2.5427513122558594,
- "learning_rate": 2.777978538771508e-06,
- "loss": 0.38166797161102295,
- "mean_token_accuracy": 0.8653234839439392,
- "num_tokens": 17367733.0,
- "step": 1941
- },
- {
- "epoch": 1.4756838905775076,
- "grad_norm": 1.7793641090393066,
- "learning_rate": 2.7758970336909795e-06,
- "loss": 0.3113783895969391,
- "mean_token_accuracy": 0.8812868595123291,
- "num_tokens": 17375267.0,
- "step": 1942
- },
- {
- "epoch": 1.4764437689969605,
- "grad_norm": 3.4031741619110107,
- "learning_rate": 2.7738153349752923e-06,
- "loss": 0.4800986647605896,
- "mean_token_accuracy": 0.8336698412895203,
- "num_tokens": 17379549.0,
- "step": 1943
- },
- {
- "epoch": 1.4772036474164134,
- "grad_norm": 1.3451651334762573,
- "learning_rate": 2.7717334440854634e-06,
- "loss": 0.3115345239639282,
- "mean_token_accuracy": 0.908623218536377,
- "num_tokens": 17394455.0,
- "step": 1944
- },
- {
- "epoch": 1.4779635258358663,
- "grad_norm": 1.980919599533081,
- "learning_rate": 2.7696513624826422e-06,
- "loss": 0.391154944896698,
- "mean_token_accuracy": 0.8650267720222473,
- "num_tokens": 17401931.0,
- "step": 1945
- },
- {
- "epoch": 1.4787234042553192,
- "grad_norm": 1.0118765830993652,
- "learning_rate": 2.7675690916281158e-06,
- "loss": 0.3157956600189209,
- "mean_token_accuracy": 0.8827471733093262,
- "num_tokens": 17424144.0,
- "step": 1946
- },
- {
- "epoch": 1.4794832826747721,
- "grad_norm": 1.579654335975647,
- "learning_rate": 2.7654866329833e-06,
- "loss": 0.4578486382961273,
- "mean_token_accuracy": 0.8361750245094299,
- "num_tokens": 17435769.0,
- "step": 1947
- },
- {
- "epoch": 1.4802431610942248,
- "grad_norm": 1.7706717252731323,
- "learning_rate": 2.763403988009746e-06,
- "loss": 0.3564416170120239,
- "mean_token_accuracy": 0.8689201474189758,
- "num_tokens": 17444088.0,
- "step": 1948
- },
- {
- "epoch": 1.4810030395136777,
- "grad_norm": 1.2264244556427002,
- "learning_rate": 2.761321158169134e-06,
- "loss": 0.30763837695121765,
- "mean_token_accuracy": 0.8960219621658325,
- "num_tokens": 17458096.0,
- "step": 1949
- },
- {
- "epoch": 1.4817629179331306,
- "grad_norm": 1.214431881904602,
- "learning_rate": 2.759238144923274e-06,
- "loss": 0.49099457263946533,
- "mean_token_accuracy": 0.8279136419296265,
- "num_tokens": 17481062.0,
- "step": 1950
- },
- {
- "epoch": 1.4825227963525835,
- "grad_norm": 1.593892216682434,
- "learning_rate": 2.7571549497341044e-06,
- "loss": 0.3745320737361908,
- "mean_token_accuracy": 0.8690779209136963,
- "num_tokens": 17490874.0,
- "step": 1951
- },
- {
- "epoch": 1.4832826747720365,
- "grad_norm": 2.409924268722534,
- "learning_rate": 2.755071574063692e-06,
- "loss": 0.4310247600078583,
- "mean_token_accuracy": 0.8521159291267395,
- "num_tokens": 17496942.0,
- "step": 1952
- },
- {
- "epoch": 1.4840425531914894,
- "grad_norm": 1.2557463645935059,
- "learning_rate": 2.7529880193742297e-06,
- "loss": 0.34304720163345337,
- "mean_token_accuracy": 0.8748183250427246,
- "num_tokens": 17514391.0,
- "step": 1953
- },
- {
- "epoch": 1.4848024316109423,
- "grad_norm": 1.17310631275177,
- "learning_rate": 2.7509042871280373e-06,
- "loss": 0.3835817277431488,
- "mean_token_accuracy": 0.8853274583816528,
- "num_tokens": 17533289.0,
- "step": 1954
- },
- {
- "epoch": 1.4855623100303952,
- "grad_norm": 1.5261479616165161,
- "learning_rate": 2.748820378787558e-06,
- "loss": 0.4799988865852356,
- "mean_token_accuracy": 0.8252149820327759,
- "num_tokens": 17544118.0,
- "step": 1955
- },
- {
- "epoch": 1.486322188449848,
- "grad_norm": 2.030930757522583,
- "learning_rate": 2.7467362958153585e-06,
- "loss": 0.35690805315971375,
- "mean_token_accuracy": 0.8959587216377258,
- "num_tokens": 17550431.0,
- "step": 1956
- },
- {
- "epoch": 1.4870820668693008,
- "grad_norm": 2.376520872116089,
- "learning_rate": 2.7446520396741293e-06,
- "loss": 0.262234091758728,
- "mean_token_accuracy": 0.9054547548294067,
- "num_tokens": 17554853.0,
- "step": 1957
- },
- {
- "epoch": 1.487841945288754,
- "grad_norm": 1.6944479942321777,
- "learning_rate": 2.742567611826681e-06,
- "loss": 0.529259979724884,
- "mean_token_accuracy": 0.8195339441299438,
- "num_tokens": 17568016.0,
- "step": 1958
- },
- {
- "epoch": 1.4886018237082066,
- "grad_norm": 2.833029270172119,
- "learning_rate": 2.7404830137359445e-06,
- "loss": 0.30229634046554565,
- "mean_token_accuracy": 0.8933001756668091,
- "num_tokens": 17572587.0,
- "step": 1959
- },
- {
- "epoch": 1.4893617021276595,
- "grad_norm": 1.7040144205093384,
- "learning_rate": 2.7383982468649715e-06,
- "loss": 0.3166356682777405,
- "mean_token_accuracy": 0.8871906399726868,
- "num_tokens": 17580966.0,
- "step": 1960
- },
- {
- "epoch": 1.4901215805471124,
- "grad_norm": 1.7539052963256836,
- "learning_rate": 2.7363133126769326e-06,
- "loss": 0.4231064021587372,
- "mean_token_accuracy": 0.8708304166793823,
- "num_tokens": 17590907.0,
- "step": 1961
- },
- {
- "epoch": 1.4908814589665653,
- "grad_norm": 1.6198650598526,
- "learning_rate": 2.7342282126351145e-06,
- "loss": 0.4198967218399048,
- "mean_token_accuracy": 0.8723280429840088,
- "num_tokens": 17604291.0,
- "step": 1962
- },
- {
- "epoch": 1.4916413373860182,
- "grad_norm": 1.8437711000442505,
- "learning_rate": 2.73214294820292e-06,
- "loss": 0.38923323154449463,
- "mean_token_accuracy": 0.8697006106376648,
- "num_tokens": 17612291.0,
- "step": 1963
- },
- {
- "epoch": 1.4924012158054711,
- "grad_norm": 1.1129369735717773,
- "learning_rate": 2.7300575208438684e-06,
- "loss": 0.3107512593269348,
- "mean_token_accuracy": 0.878618597984314,
- "num_tokens": 17630073.0,
- "step": 1964
- },
- {
- "epoch": 1.493161094224924,
- "grad_norm": 3.0210442543029785,
- "learning_rate": 2.7279719320215924e-06,
- "loss": 0.4630751609802246,
- "mean_token_accuracy": 0.8567075729370117,
- "num_tokens": 17634758.0,
- "step": 1965
- },
- {
- "epoch": 1.493920972644377,
- "grad_norm": 2.8825972080230713,
- "learning_rate": 2.725886183199839e-06,
- "loss": 0.35351765155792236,
- "mean_token_accuracy": 0.8711981773376465,
- "num_tokens": 17639613.0,
- "step": 1966
- },
- {
- "epoch": 1.4946808510638299,
- "grad_norm": 2.111238718032837,
- "learning_rate": 2.723800275842468e-06,
- "loss": 0.3529569208621979,
- "mean_token_accuracy": 0.8679244518280029,
- "num_tokens": 17645308.0,
- "step": 1967
- },
- {
- "epoch": 1.4954407294832825,
- "grad_norm": 2.080509901046753,
- "learning_rate": 2.7217142114134466e-06,
- "loss": 0.43321219086647034,
- "mean_token_accuracy": 0.8848220109939575,
- "num_tokens": 17652292.0,
- "step": 1968
- },
- {
- "epoch": 1.4962006079027357,
- "grad_norm": 2.8686363697052,
- "learning_rate": 2.7196279913768587e-06,
- "loss": 0.417035311460495,
- "mean_token_accuracy": 0.8724601864814758,
- "num_tokens": 17656908.0,
- "step": 1969
- },
- {
- "epoch": 1.4969604863221884,
- "grad_norm": 3.294193744659424,
- "learning_rate": 2.717541617196891e-06,
- "loss": 0.3551934063434601,
- "mean_token_accuracy": 0.8838565349578857,
- "num_tokens": 17660590.0,
- "step": 1970
- },
- {
- "epoch": 1.4977203647416413,
- "grad_norm": 1.766292929649353,
- "learning_rate": 2.7154550903378425e-06,
- "loss": 0.36521971225738525,
- "mean_token_accuracy": 0.8810199499130249,
- "num_tokens": 17668214.0,
- "step": 1971
- },
- {
- "epoch": 1.4984802431610942,
- "grad_norm": 1.2127676010131836,
- "learning_rate": 2.713368412264118e-06,
- "loss": 0.35184425115585327,
- "mean_token_accuracy": 0.8672580718994141,
- "num_tokens": 17684736.0,
- "step": 1972
- },
- {
- "epoch": 1.499240121580547,
- "grad_norm": 2.268256664276123,
- "learning_rate": 2.711281584440228e-06,
- "loss": 0.40115267038345337,
- "mean_token_accuracy": 0.8517841100692749,
- "num_tokens": 17691510.0,
- "step": 1973
- },
- {
- "epoch": 1.5,
- "grad_norm": 2.7196054458618164,
- "learning_rate": 2.70919460833079e-06,
- "loss": 0.3819037675857544,
- "mean_token_accuracy": 0.8765411376953125,
- "num_tokens": 17696179.0,
- "step": 1974
- },
- {
- "epoch": 1.500759878419453,
- "grad_norm": 2.969406843185425,
- "learning_rate": 2.7071074854005206e-06,
- "loss": 0.3922455608844757,
- "mean_token_accuracy": 0.8796037435531616,
- "num_tokens": 17700597.0,
- "step": 1975
- },
- {
- "epoch": 1.5015197568389058,
- "grad_norm": 2.2965853214263916,
- "learning_rate": 2.705020217114248e-06,
- "loss": 0.5433666110038757,
- "mean_token_accuracy": 0.809639036655426,
- "num_tokens": 17708895.0,
- "step": 1976
- },
- {
- "epoch": 1.5022796352583585,
- "grad_norm": 1.5584394931793213,
- "learning_rate": 2.7029328049368942e-06,
- "loss": 0.4736343324184418,
- "mean_token_accuracy": 0.8197190761566162,
- "num_tokens": 17725202.0,
- "step": 1977
- },
- {
- "epoch": 1.5030395136778116,
- "grad_norm": 1.3903142213821411,
- "learning_rate": 2.700845250333486e-06,
- "loss": 0.4471571445465088,
- "mean_token_accuracy": 0.839043140411377,
- "num_tokens": 17742835.0,
- "step": 1978
- },
- {
- "epoch": 1.5037993920972643,
- "grad_norm": 3.080716609954834,
- "learning_rate": 2.69875755476915e-06,
- "loss": 0.45760005712509155,
- "mean_token_accuracy": 0.8366328477859497,
- "num_tokens": 17747324.0,
- "step": 1979
- },
- {
- "epoch": 1.5045592705167175,
- "grad_norm": 1.0150405168533325,
- "learning_rate": 2.696669719709111e-06,
- "loss": 0.33638954162597656,
- "mean_token_accuracy": 0.8591676354408264,
- "num_tokens": 17765565.0,
- "step": 1980
- },
- {
- "epoch": 1.5053191489361701,
- "grad_norm": 2.402927875518799,
- "learning_rate": 2.694581746618691e-06,
- "loss": 0.4086601436138153,
- "mean_token_accuracy": 0.8769911527633667,
- "num_tokens": 17771275.0,
- "step": 1981
- },
- {
- "epoch": 1.506079027355623,
- "grad_norm": 2.030583381652832,
- "learning_rate": 2.6924936369633126e-06,
- "loss": 0.5115457773208618,
- "mean_token_accuracy": 0.8054746389389038,
- "num_tokens": 17779999.0,
- "step": 1982
- },
- {
- "epoch": 1.506838905775076,
- "grad_norm": 2.575199604034424,
- "learning_rate": 2.6904053922084893e-06,
- "loss": 0.363183856010437,
- "mean_token_accuracy": 0.8716042637825012,
- "num_tokens": 17785473.0,
- "step": 1983
- },
- {
- "epoch": 1.5075987841945289,
- "grad_norm": 1.8497480154037476,
- "learning_rate": 2.688317013819832e-06,
- "loss": 0.4254384934902191,
- "mean_token_accuracy": 0.8549597263336182,
- "num_tokens": 17793812.0,
- "step": 1984
- },
- {
- "epoch": 1.5083586626139818,
- "grad_norm": 1.7786511182785034,
- "learning_rate": 2.686228503263045e-06,
- "loss": 0.33400774002075195,
- "mean_token_accuracy": 0.9027615189552307,
- "num_tokens": 17801783.0,
- "step": 1985
- },
- {
- "epoch": 1.5091185410334347,
- "grad_norm": 1.8365367650985718,
- "learning_rate": 2.684139862003927e-06,
- "loss": 0.35765063762664795,
- "mean_token_accuracy": 0.8663736581802368,
- "num_tokens": 17809562.0,
- "step": 1986
- },
- {
- "epoch": 1.5098784194528876,
- "grad_norm": 1.8817477226257324,
- "learning_rate": 2.682051091508365e-06,
- "loss": 0.4627506732940674,
- "mean_token_accuracy": 0.8358862400054932,
- "num_tokens": 17819094.0,
- "step": 1987
- },
- {
- "epoch": 1.5106382978723403,
- "grad_norm": 2.221547842025757,
- "learning_rate": 2.679962193242338e-06,
- "loss": 0.577020525932312,
- "mean_token_accuracy": 0.80013108253479,
- "num_tokens": 17826666.0,
- "step": 1988
- },
- {
- "epoch": 1.5113981762917934,
- "grad_norm": 2.6618270874023438,
- "learning_rate": 2.6778731686719177e-06,
- "loss": 0.44632256031036377,
- "mean_token_accuracy": 0.8611289262771606,
- "num_tokens": 17833172.0,
- "step": 1989
- },
- {
- "epoch": 1.512158054711246,
- "grad_norm": 2.9495689868927,
- "learning_rate": 2.67578401926326e-06,
- "loss": 0.3482511043548584,
- "mean_token_accuracy": 0.8703314661979675,
- "num_tokens": 17837220.0,
- "step": 1990
- },
- {
- "epoch": 1.5129179331306992,
- "grad_norm": 2.0943644046783447,
- "learning_rate": 2.6736947464826107e-06,
- "loss": 0.2354314625263214,
- "mean_token_accuracy": 0.9137634038925171,
- "num_tokens": 17842712.0,
- "step": 1991
- },
- {
- "epoch": 1.513677811550152,
- "grad_norm": 1.1303033828735352,
- "learning_rate": 2.671605351796302e-06,
- "loss": 0.3624761700630188,
- "mean_token_accuracy": 0.8769594430923462,
- "num_tokens": 17860902.0,
- "step": 1992
- },
- {
- "epoch": 1.5144376899696048,
- "grad_norm": 2.8921146392822266,
- "learning_rate": 2.6695158366707526e-06,
- "loss": 0.2517220973968506,
- "mean_token_accuracy": 0.8974182605743408,
- "num_tokens": 17865160.0,
- "step": 1993
- },
- {
- "epoch": 1.5151975683890577,
- "grad_norm": 2.320587158203125,
- "learning_rate": 2.667426202572463e-06,
- "loss": 0.4589889943599701,
- "mean_token_accuracy": 0.8379613161087036,
- "num_tokens": 17871994.0,
- "step": 1994
- },
- {
- "epoch": 1.5159574468085106,
- "grad_norm": 1.1407674551010132,
- "learning_rate": 2.665336450968019e-06,
- "loss": 0.34412115812301636,
- "mean_token_accuracy": 0.8776306509971619,
- "num_tokens": 17889941.0,
- "step": 1995
- },
- {
- "epoch": 1.5167173252279635,
- "grad_norm": 2.069814920425415,
- "learning_rate": 2.6632465833240895e-06,
- "loss": 0.47524404525756836,
- "mean_token_accuracy": 0.830310046672821,
- "num_tokens": 17898447.0,
- "step": 1996
- },
- {
- "epoch": 1.5174772036474165,
- "grad_norm": 1.822415828704834,
- "learning_rate": 2.661156601107424e-06,
- "loss": 0.4541318416595459,
- "mean_token_accuracy": 0.8856616020202637,
- "num_tokens": 17908729.0,
- "step": 1997
- },
- {
- "epoch": 1.5182370820668694,
- "grad_norm": 2.851428985595703,
- "learning_rate": 2.659066505784852e-06,
- "loss": 0.41761666536331177,
- "mean_token_accuracy": 0.8710572719573975,
- "num_tokens": 17913860.0,
- "step": 1998
- },
- {
- "epoch": 1.518996960486322,
- "grad_norm": 1.8483710289001465,
- "learning_rate": 2.6569762988232838e-06,
- "loss": 0.45517268776893616,
- "mean_token_accuracy": 0.8411115407943726,
- "num_tokens": 17923497.0,
- "step": 1999
- },
- {
- "epoch": 1.5197568389057752,
- "grad_norm": 1.9044219255447388,
- "learning_rate": 2.654885981689706e-06,
- "loss": 0.42533189058303833,
- "mean_token_accuracy": 0.8597894906997681,
- "num_tokens": 17932670.0,
- "step": 2000
- },
- {
- "epoch": 1.5205167173252279,
- "grad_norm": 1.8170348405838013,
- "learning_rate": 2.652795555851184e-06,
- "loss": 0.4009692072868347,
- "mean_token_accuracy": 0.8553036451339722,
- "num_tokens": 17941616.0,
- "step": 2001
- },
- {
- "epoch": 1.521276595744681,
- "grad_norm": 1.4704090356826782,
- "learning_rate": 2.6507050227748595e-06,
- "loss": 0.3732764720916748,
- "mean_token_accuracy": 0.8788566589355469,
- "num_tokens": 17957187.0,
- "step": 2002
- },
- {
- "epoch": 1.5220364741641337,
- "grad_norm": 1.6681534051895142,
- "learning_rate": 2.648614383927949e-06,
- "loss": 0.341326504945755,
- "mean_token_accuracy": 0.874875545501709,
- "num_tokens": 17966668.0,
- "step": 2003
- },
- {
- "epoch": 1.5227963525835866,
- "grad_norm": 1.8578619956970215,
- "learning_rate": 2.646523640777741e-06,
- "loss": 0.3937399983406067,
- "mean_token_accuracy": 0.8656851053237915,
- "num_tokens": 17976194.0,
- "step": 2004
- },
- {
- "epoch": 1.5235562310030395,
- "grad_norm": 1.7520431280136108,
- "learning_rate": 2.6444327947916037e-06,
- "loss": 0.3392767906188965,
- "mean_token_accuracy": 0.8799679279327393,
- "num_tokens": 17984492.0,
- "step": 2005
- },
- {
- "epoch": 1.5243161094224924,
- "grad_norm": 3.4649906158447266,
- "learning_rate": 2.6423418474369707e-06,
- "loss": 0.3451516032218933,
- "mean_token_accuracy": 0.8753262758255005,
- "num_tokens": 17988240.0,
- "step": 2006
- },
- {
- "epoch": 1.5250759878419453,
- "grad_norm": 1.8037052154541016,
- "learning_rate": 2.64025080018135e-06,
- "loss": 0.34428173303604126,
- "mean_token_accuracy": 0.8719067573547363,
- "num_tokens": 17996644.0,
- "step": 2007
- },
- {
- "epoch": 1.5258358662613982,
- "grad_norm": 1.743722677230835,
- "learning_rate": 2.6381596544923184e-06,
- "loss": 0.4446655213832855,
- "mean_token_accuracy": 0.8612518906593323,
- "num_tokens": 18005109.0,
- "step": 2008
- },
- {
- "epoch": 1.5265957446808511,
- "grad_norm": 1.3357981443405151,
- "learning_rate": 2.636068411837523e-06,
- "loss": 0.38647788763046265,
- "mean_token_accuracy": 0.858294665813446,
- "num_tokens": 18018193.0,
- "step": 2009
- },
- {
- "epoch": 1.5273556231003038,
- "grad_norm": 1.4848440885543823,
- "learning_rate": 2.6339770736846794e-06,
- "loss": 0.3597261607646942,
- "mean_token_accuracy": 0.8760983943939209,
- "num_tokens": 18028959.0,
- "step": 2010
- },
- {
- "epoch": 1.528115501519757,
- "grad_norm": 2.356933832168579,
- "learning_rate": 2.6318856415015664e-06,
- "loss": 0.2697138488292694,
- "mean_token_accuracy": 0.9078473448753357,
- "num_tokens": 18033946.0,
- "step": 2011
- },
- {
- "epoch": 1.5288753799392096,
- "grad_norm": 1.964368224143982,
- "learning_rate": 2.629794116756035e-06,
- "loss": 0.41349685192108154,
- "mean_token_accuracy": 0.8567900657653809,
- "num_tokens": 18042724.0,
- "step": 2012
- },
- {
- "epoch": 1.5296352583586628,
- "grad_norm": 1.5630402565002441,
- "learning_rate": 2.627702500915995e-06,
- "loss": 0.49310681223869324,
- "mean_token_accuracy": 0.8229681253433228,
- "num_tokens": 18054396.0,
- "step": 2013
- },
- {
- "epoch": 1.5303951367781155,
- "grad_norm": 1.6657718420028687,
- "learning_rate": 2.625610795449424e-06,
- "loss": 0.4263935387134552,
- "mean_token_accuracy": 0.8634918332099915,
- "num_tokens": 18064347.0,
- "step": 2014
- },
- {
- "epoch": 1.5311550151975684,
- "grad_norm": 1.3684180974960327,
- "learning_rate": 2.6235190018243623e-06,
- "loss": 0.2903984487056732,
- "mean_token_accuracy": 0.8930408358573914,
- "num_tokens": 18076826.0,
- "step": 2015
- },
- {
- "epoch": 1.5319148936170213,
- "grad_norm": 1.635044813156128,
- "learning_rate": 2.6214271215089106e-06,
- "loss": 0.3066539168357849,
- "mean_token_accuracy": 0.8912158012390137,
- "num_tokens": 18085761.0,
- "step": 2016
- },
- {
- "epoch": 1.5326747720364742,
- "grad_norm": 2.431518316268921,
- "learning_rate": 2.6193351559712294e-06,
- "loss": 0.31123271584510803,
- "mean_token_accuracy": 0.8865828514099121,
- "num_tokens": 18091715.0,
- "step": 2017
- },
- {
- "epoch": 1.533434650455927,
- "grad_norm": 1.8317419290542603,
- "learning_rate": 2.6172431066795428e-06,
- "loss": 0.5042020082473755,
- "mean_token_accuracy": 0.8245081901550293,
- "num_tokens": 18102095.0,
- "step": 2018
- },
- {
- "epoch": 1.53419452887538,
- "grad_norm": 3.4221980571746826,
- "learning_rate": 2.6151509751021307e-06,
- "loss": 0.2885819971561432,
- "mean_token_accuracy": 0.8997149467468262,
- "num_tokens": 18105456.0,
- "step": 2019
- },
- {
- "epoch": 1.534954407294833,
- "grad_norm": 1.4435855150222778,
- "learning_rate": 2.6130587627073315e-06,
- "loss": 0.45573529601097107,
- "mean_token_accuracy": 0.837191104888916,
- "num_tokens": 18119039.0,
- "step": 2020
- },
- {
- "epoch": 1.5357142857142856,
- "grad_norm": 1.5748237371444702,
- "learning_rate": 2.6109664709635413e-06,
- "loss": 0.4561889171600342,
- "mean_token_accuracy": 0.8334558010101318,
- "num_tokens": 18132150.0,
- "step": 2021
- },
- {
- "epoch": 1.5364741641337387,
- "grad_norm": 2.8278751373291016,
- "learning_rate": 2.60887410133921e-06,
- "loss": 0.3495104908943176,
- "mean_token_accuracy": 0.8926796913146973,
- "num_tokens": 18136528.0,
- "step": 2022
- },
- {
- "epoch": 1.5372340425531914,
- "grad_norm": 2.5045573711395264,
- "learning_rate": 2.606781655302843e-06,
- "loss": 0.45362481474876404,
- "mean_token_accuracy": 0.8379551768302917,
- "num_tokens": 18142581.0,
- "step": 2023
- },
- {
- "epoch": 1.5379939209726445,
- "grad_norm": 2.5984106063842773,
- "learning_rate": 2.604689134322999e-06,
- "loss": 0.4210243821144104,
- "mean_token_accuracy": 0.8571645021438599,
- "num_tokens": 18148152.0,
- "step": 2024
- },
- {
- "epoch": 1.5387537993920972,
- "grad_norm": 1.7180702686309814,
- "learning_rate": 2.602596539868292e-06,
- "loss": 0.2478562295436859,
- "mean_token_accuracy": 0.9227135181427002,
- "num_tokens": 18155435.0,
- "step": 2025
- },
- {
- "epoch": 1.5395136778115501,
- "grad_norm": 2.3721933364868164,
- "learning_rate": 2.6005038734073833e-06,
- "loss": 0.3820664584636688,
- "mean_token_accuracy": 0.8788443803787231,
- "num_tokens": 18161403.0,
- "step": 2026
- },
- {
- "epoch": 1.540273556231003,
- "grad_norm": 1.4967509508132935,
- "learning_rate": 2.5984111364089875e-06,
- "loss": 0.34247124195098877,
- "mean_token_accuracy": 0.8809049129486084,
- "num_tokens": 18173724.0,
- "step": 2027
- },
- {
- "epoch": 1.541033434650456,
- "grad_norm": 2.5226845741271973,
- "learning_rate": 2.5963183303418682e-06,
- "loss": 0.2647642493247986,
- "mean_token_accuracy": 0.8988642692565918,
- "num_tokens": 18178927.0,
- "step": 2028
- },
- {
- "epoch": 1.5417933130699089,
- "grad_norm": 2.217228412628174,
- "learning_rate": 2.594225456674837e-06,
- "loss": 0.37754058837890625,
- "mean_token_accuracy": 0.8660204410552979,
- "num_tokens": 18185268.0,
- "step": 2029
- },
- {
- "epoch": 1.5425531914893615,
- "grad_norm": 2.336409091949463,
- "learning_rate": 2.592132516876753e-06,
- "loss": 0.45098528265953064,
- "mean_token_accuracy": 0.842115044593811,
- "num_tokens": 18192372.0,
- "step": 2030
- },
- {
- "epoch": 1.5433130699088147,
- "grad_norm": 3.5437142848968506,
- "learning_rate": 2.5900395124165216e-06,
- "loss": 0.5326460003852844,
- "mean_token_accuracy": 0.8125103712081909,
- "num_tokens": 18199182.0,
- "step": 2031
- },
- {
- "epoch": 1.5440729483282674,
- "grad_norm": 1.5785651206970215,
- "learning_rate": 2.5879464447630947e-06,
- "loss": 0.3714991509914398,
- "mean_token_accuracy": 0.8711390495300293,
- "num_tokens": 18209045.0,
- "step": 2032
- },
- {
- "epoch": 1.5448328267477205,
- "grad_norm": 2.3616182804107666,
- "learning_rate": 2.5858533153854676e-06,
- "loss": 0.4548399746417999,
- "mean_token_accuracy": 0.8411449193954468,
- "num_tokens": 18215487.0,
- "step": 2033
- },
- {
- "epoch": 1.5455927051671732,
- "grad_norm": 2.0750479698181152,
- "learning_rate": 2.583760125752679e-06,
- "loss": 0.3980535566806793,
- "mean_token_accuracy": 0.8603327870368958,
- "num_tokens": 18222606.0,
- "step": 2034
- },
- {
- "epoch": 1.5463525835866263,
- "grad_norm": 2.609295129776001,
- "learning_rate": 2.58166687733381e-06,
- "loss": 0.40177756547927856,
- "mean_token_accuracy": 0.8652099370956421,
- "num_tokens": 18227341.0,
- "step": 2035
- },
- {
- "epoch": 1.547112462006079,
- "grad_norm": 2.1621339321136475,
- "learning_rate": 2.5795735715979826e-06,
- "loss": 0.45104342699050903,
- "mean_token_accuracy": 0.8481369018554688,
- "num_tokens": 18235820.0,
- "step": 2036
- },
- {
- "epoch": 1.547872340425532,
- "grad_norm": 1.0381370782852173,
- "learning_rate": 2.577480210014359e-06,
- "loss": 0.32621103525161743,
- "mean_token_accuracy": 0.8867391347885132,
- "num_tokens": 18258307.0,
- "step": 2037
- },
- {
- "epoch": 1.5486322188449848,
- "grad_norm": 1.7634375095367432,
- "learning_rate": 2.575386794052142e-06,
- "loss": 0.5115169882774353,
- "mean_token_accuracy": 0.818779468536377,
- "num_tokens": 18272782.0,
- "step": 2038
- },
- {
- "epoch": 1.5493920972644377,
- "grad_norm": 1.874875545501709,
- "learning_rate": 2.5732933251805716e-06,
- "loss": 0.4381459951400757,
- "mean_token_accuracy": 0.8594684600830078,
- "num_tokens": 18282618.0,
- "step": 2039
- },
- {
- "epoch": 1.5501519756838906,
- "grad_norm": 2.1316351890563965,
- "learning_rate": 2.571199804868923e-06,
- "loss": 0.5410124063491821,
- "mean_token_accuracy": 0.8247587084770203,
- "num_tokens": 18289750.0,
- "step": 2040
- },
- {
- "epoch": 1.5509118541033433,
- "grad_norm": 1.7574573755264282,
- "learning_rate": 2.569106234586511e-06,
- "loss": 0.29967373609542847,
- "mean_token_accuracy": 0.8913218975067139,
- "num_tokens": 18298110.0,
- "step": 2041
- },
- {
- "epoch": 1.5516717325227964,
- "grad_norm": 1.929626703262329,
- "learning_rate": 2.5670126158026843e-06,
- "loss": 0.3287760019302368,
- "mean_token_accuracy": 0.8870488405227661,
- "num_tokens": 18305702.0,
- "step": 2042
- },
- {
- "epoch": 1.5524316109422491,
- "grad_norm": 3.020153284072876,
- "learning_rate": 2.5649189499868233e-06,
- "loss": 0.38523542881011963,
- "mean_token_accuracy": 0.854824960231781,
- "num_tokens": 18309830.0,
- "step": 2043
- },
- {
- "epoch": 1.5531914893617023,
- "grad_norm": 1.6378421783447266,
- "learning_rate": 2.5628252386083443e-06,
- "loss": 0.47371378540992737,
- "mean_token_accuracy": 0.8627713918685913,
- "num_tokens": 18322820.0,
- "step": 2044
- },
- {
- "epoch": 1.553951367781155,
- "grad_norm": 1.3711130619049072,
- "learning_rate": 2.560731483136694e-06,
- "loss": 0.3319293260574341,
- "mean_token_accuracy": 0.8704103231430054,
- "num_tokens": 18335074.0,
- "step": 2045
- },
- {
- "epoch": 1.5547112462006079,
- "grad_norm": 1.7589185237884521,
- "learning_rate": 2.558637685041352e-06,
- "loss": 0.4446021020412445,
- "mean_token_accuracy": 0.8446722626686096,
- "num_tokens": 18344115.0,
- "step": 2046
- },
- {
- "epoch": 1.5554711246200608,
- "grad_norm": 2.5249195098876953,
- "learning_rate": 2.5565438457918247e-06,
- "loss": 0.4625541865825653,
- "mean_token_accuracy": 0.8451195359230042,
- "num_tokens": 18349235.0,
- "step": 2047
- },
- {
- "epoch": 1.5562310030395137,
- "grad_norm": 1.0562543869018555,
- "learning_rate": 2.5544499668576508e-06,
- "loss": 0.33747735619544983,
- "mean_token_accuracy": 0.8503615856170654,
- "num_tokens": 18368253.0,
- "step": 2048
- },
- {
- "epoch": 1.5569908814589666,
- "grad_norm": 2.9451215267181396,
- "learning_rate": 2.5523560497083927e-06,
- "loss": 0.3958815932273865,
- "mean_token_accuracy": 0.8393744826316833,
- "num_tokens": 18372887.0,
- "step": 2049
- },
- {
- "epoch": 1.5577507598784195,
- "grad_norm": 1.3597660064697266,
- "learning_rate": 2.5502620958136444e-06,
- "loss": 0.46281275153160095,
- "mean_token_accuracy": 0.8269470930099487,
- "num_tokens": 18388074.0,
- "step": 2050
- },
- {
- "epoch": 1.5585106382978724,
- "grad_norm": 3.269068717956543,
- "learning_rate": 2.548168106643022e-06,
- "loss": 0.2309008538722992,
- "mean_token_accuracy": 0.9178205728530884,
- "num_tokens": 18391406.0,
- "step": 2051
- },
- {
- "epoch": 1.559270516717325,
- "grad_norm": 2.1459391117095947,
- "learning_rate": 2.546074083666169e-06,
- "loss": 0.4006733298301697,
- "mean_token_accuracy": 0.8631902933120728,
- "num_tokens": 18397497.0,
- "step": 2052
- },
- {
- "epoch": 1.5600303951367782,
- "grad_norm": 1.4614566564559937,
- "learning_rate": 2.5439800283527495e-06,
- "loss": 0.40810418128967285,
- "mean_token_accuracy": 0.8473483920097351,
- "num_tokens": 18409474.0,
- "step": 2053
- },
- {
- "epoch": 1.560790273556231,
- "grad_norm": 2.084808826446533,
- "learning_rate": 2.541885942172454e-06,
- "loss": 0.34967708587646484,
- "mean_token_accuracy": 0.8707003593444824,
- "num_tokens": 18416400.0,
- "step": 2054
- },
- {
- "epoch": 1.561550151975684,
- "grad_norm": 1.90664541721344,
- "learning_rate": 2.539791826594991e-06,
- "loss": 0.37694251537323,
- "mean_token_accuracy": 0.8704941272735596,
- "num_tokens": 18424206.0,
- "step": 2055
- },
- {
- "epoch": 1.5623100303951367,
- "grad_norm": 1.880176305770874,
- "learning_rate": 2.537697683090093e-06,
- "loss": 0.32510411739349365,
- "mean_token_accuracy": 0.8848961591720581,
- "num_tokens": 18431676.0,
- "step": 2056
- },
- {
- "epoch": 1.5630699088145896,
- "grad_norm": 2.133375406265259,
- "learning_rate": 2.5356035131275096e-06,
- "loss": 0.30538493394851685,
- "mean_token_accuracy": 0.8890067338943481,
- "num_tokens": 18438014.0,
- "step": 2057
- },
- {
- "epoch": 1.5638297872340425,
- "grad_norm": 2.3495655059814453,
- "learning_rate": 2.5335093181770105e-06,
- "loss": 0.3126775324344635,
- "mean_token_accuracy": 0.8865689039230347,
- "num_tokens": 18443604.0,
- "step": 2058
- },
- {
- "epoch": 1.5645896656534954,
- "grad_norm": 2.37949538230896,
- "learning_rate": 2.531415099708382e-06,
- "loss": 0.3257793188095093,
- "mean_token_accuracy": 0.8809669017791748,
- "num_tokens": 18448654.0,
- "step": 2059
- },
- {
- "epoch": 1.5653495440729484,
- "grad_norm": 1.8285472393035889,
- "learning_rate": 2.5293208591914265e-06,
- "loss": 0.32376936078071594,
- "mean_token_accuracy": 0.8816431760787964,
- "num_tokens": 18456619.0,
- "step": 2060
- },
- {
- "epoch": 1.5661094224924013,
- "grad_norm": 2.3238534927368164,
- "learning_rate": 2.5272265980959644e-06,
- "loss": 0.40366506576538086,
- "mean_token_accuracy": 0.8496750593185425,
- "num_tokens": 18462788.0,
- "step": 2061
- },
- {
- "epoch": 1.5668693009118542,
- "grad_norm": 1.8954942226409912,
- "learning_rate": 2.525132317891827e-06,
- "loss": 0.3405473828315735,
- "mean_token_accuracy": 0.8849360942840576,
- "num_tokens": 18470719.0,
- "step": 2062
- },
- {
- "epoch": 1.5676291793313069,
- "grad_norm": 1.6268190145492554,
- "learning_rate": 2.523038020048861e-06,
- "loss": 0.3662685751914978,
- "mean_token_accuracy": 0.8865662813186646,
- "num_tokens": 18482095.0,
- "step": 2063
- },
- {
- "epoch": 1.56838905775076,
- "grad_norm": 2.5198733806610107,
- "learning_rate": 2.5209437060369266e-06,
- "loss": 0.3968311548233032,
- "mean_token_accuracy": 0.8643308281898499,
- "num_tokens": 18488069.0,
- "step": 2064
- },
- {
- "epoch": 1.5691489361702127,
- "grad_norm": 2.9197335243225098,
- "learning_rate": 2.518849377325893e-06,
- "loss": 0.24738386273384094,
- "mean_token_accuracy": 0.91959547996521,
- "num_tokens": 18491762.0,
- "step": 2065
- },
- {
- "epoch": 1.5699088145896658,
- "grad_norm": 1.5914254188537598,
- "learning_rate": 2.51675503538564e-06,
- "loss": 0.33473581075668335,
- "mean_token_accuracy": 0.8794662952423096,
- "num_tokens": 18501316.0,
- "step": 2066
- },
- {
- "epoch": 1.5706686930091185,
- "grad_norm": 2.5130460262298584,
- "learning_rate": 2.5146606816860597e-06,
- "loss": 0.4067240357398987,
- "mean_token_accuracy": 0.8564209342002869,
- "num_tokens": 18507169.0,
- "step": 2067
- },
- {
- "epoch": 1.5714285714285714,
- "grad_norm": 2.093353509902954,
- "learning_rate": 2.5125663176970475e-06,
- "loss": 0.4312136769294739,
- "mean_token_accuracy": 0.8540225028991699,
- "num_tokens": 18514536.0,
- "step": 2068
- },
- {
- "epoch": 1.5721884498480243,
- "grad_norm": 1.284495234489441,
- "learning_rate": 2.5104719448885103e-06,
- "loss": 0.3813856542110443,
- "mean_token_accuracy": 0.8435653448104858,
- "num_tokens": 18529947.0,
- "step": 2069
- },
- {
- "epoch": 1.5729483282674772,
- "grad_norm": 2.0383973121643066,
- "learning_rate": 2.5083775647303583e-06,
- "loss": 0.4428079426288605,
- "mean_token_accuracy": 0.8841741681098938,
- "num_tokens": 18537109.0,
- "step": 2070
- },
- {
- "epoch": 1.5737082066869301,
- "grad_norm": 1.7991697788238525,
- "learning_rate": 2.5062831786925102e-06,
- "loss": 0.460052490234375,
- "mean_token_accuracy": 0.8459943532943726,
- "num_tokens": 18547108.0,
- "step": 2071
- },
- {
- "epoch": 1.574468085106383,
- "grad_norm": 2.2168822288513184,
- "learning_rate": 2.5041887882448845e-06,
- "loss": 0.2863885462284088,
- "mean_token_accuracy": 0.906816840171814,
- "num_tokens": 18552357.0,
- "step": 2072
- },
- {
- "epoch": 1.575227963525836,
- "grad_norm": 3.918499708175659,
- "learning_rate": 2.5020943948574056e-06,
- "loss": 0.3439999222755432,
- "mean_token_accuracy": 0.8742123246192932,
- "num_tokens": 18555272.0,
- "step": 2073
- },
- {
- "epoch": 1.5759878419452886,
- "grad_norm": 1.773869514465332,
- "learning_rate": 2.5e-06,
- "loss": 0.2815646827220917,
- "mean_token_accuracy": 0.8939872980117798,
- "num_tokens": 18562989.0,
- "step": 2074
- },
- {
- "epoch": 1.5767477203647418,
- "grad_norm": 1.8675572872161865,
- "learning_rate": 2.497905605142595e-06,
- "loss": 0.5005829930305481,
- "mean_token_accuracy": 0.8242729902267456,
- "num_tokens": 18575587.0,
- "step": 2075
- },
- {
- "epoch": 1.5775075987841944,
- "grad_norm": 2.3143508434295654,
- "learning_rate": 2.4958112117551163e-06,
- "loss": 0.42472895979881287,
- "mean_token_accuracy": 0.8540043830871582,
- "num_tokens": 18581666.0,
- "step": 2076
- },
- {
- "epoch": 1.5782674772036476,
- "grad_norm": 2.529740333557129,
- "learning_rate": 2.4937168213074906e-06,
- "loss": 0.24539905786514282,
- "mean_token_accuracy": 0.9041235446929932,
- "num_tokens": 18585773.0,
- "step": 2077
- },
- {
- "epoch": 1.5790273556231003,
- "grad_norm": 2.5188395977020264,
- "learning_rate": 2.491622435269642e-06,
- "loss": 0.23059265315532684,
- "mean_token_accuracy": 0.9204603433609009,
- "num_tokens": 18589915.0,
- "step": 2078
- },
- {
- "epoch": 1.5797872340425532,
- "grad_norm": 2.7752444744110107,
- "learning_rate": 2.489528055111491e-06,
- "loss": 0.452225923538208,
- "mean_token_accuracy": 0.8444918990135193,
- "num_tokens": 18595488.0,
- "step": 2079
- },
- {
- "epoch": 1.580547112462006,
- "grad_norm": 1.174774408340454,
- "learning_rate": 2.487433682302953e-06,
- "loss": 0.3399246633052826,
- "mean_token_accuracy": 0.8608446717262268,
- "num_tokens": 18613756.0,
- "step": 2080
- },
- {
- "epoch": 1.581306990881459,
- "grad_norm": 1.515575647354126,
- "learning_rate": 2.485339318313941e-06,
- "loss": 0.45886170864105225,
- "mean_token_accuracy": 0.8479131460189819,
- "num_tokens": 18629610.0,
- "step": 2081
- },
- {
- "epoch": 1.582066869300912,
- "grad_norm": 1.7039403915405273,
- "learning_rate": 2.4832449646143605e-06,
- "loss": 0.349803626537323,
- "mean_token_accuracy": 0.8721815347671509,
- "num_tokens": 18637523.0,
- "step": 2082
- },
- {
- "epoch": 1.5828267477203646,
- "grad_norm": 3.2289421558380127,
- "learning_rate": 2.4811506226741077e-06,
- "loss": 0.4967171549797058,
- "mean_token_accuracy": 0.8303675651550293,
- "num_tokens": 18641826.0,
- "step": 2083
- },
- {
- "epoch": 1.5835866261398177,
- "grad_norm": 1.71235990524292,
- "learning_rate": 2.4790562939630738e-06,
- "loss": 0.4202485680580139,
- "mean_token_accuracy": 0.8581224679946899,
- "num_tokens": 18653146.0,
- "step": 2084
- },
- {
- "epoch": 1.5843465045592704,
- "grad_norm": 1.710036277770996,
- "learning_rate": 2.4769619799511392e-06,
- "loss": 0.3942421078681946,
- "mean_token_accuracy": 0.8553562164306641,
- "num_tokens": 18663826.0,
- "step": 2085
- },
- {
- "epoch": 1.5851063829787235,
- "grad_norm": 1.464859127998352,
- "learning_rate": 2.474867682108174e-06,
- "loss": 0.4093329906463623,
- "mean_token_accuracy": 0.8598780632019043,
- "num_tokens": 18675325.0,
- "step": 2086
- },
- {
- "epoch": 1.5858662613981762,
- "grad_norm": 2.083707809448242,
- "learning_rate": 2.472773401904037e-06,
- "loss": 0.4252093434333801,
- "mean_token_accuracy": 0.8433356881141663,
- "num_tokens": 18682416.0,
- "step": 2087
- },
- {
- "epoch": 1.5866261398176293,
- "grad_norm": 1.5577973127365112,
- "learning_rate": 2.470679140808574e-06,
- "loss": 0.3680085241794586,
- "mean_token_accuracy": 0.8609116077423096,
- "num_tokens": 18694445.0,
- "step": 2088
- },
- {
- "epoch": 1.587386018237082,
- "grad_norm": 2.1617276668548584,
- "learning_rate": 2.4685849002916184e-06,
- "loss": 0.40488749742507935,
- "mean_token_accuracy": 0.8429721593856812,
- "num_tokens": 18701204.0,
- "step": 2089
- },
- {
- "epoch": 1.588145896656535,
- "grad_norm": 2.046678304672241,
- "learning_rate": 2.4664906818229903e-06,
- "loss": 0.329141229391098,
- "mean_token_accuracy": 0.8830771446228027,
- "num_tokens": 18708354.0,
- "step": 2090
- },
- {
- "epoch": 1.5889057750759878,
- "grad_norm": 2.7741200923919678,
- "learning_rate": 2.4643964868724916e-06,
- "loss": 0.42294493317604065,
- "mean_token_accuracy": 0.8612706065177917,
- "num_tokens": 18713017.0,
- "step": 2091
- },
- {
- "epoch": 1.5896656534954408,
- "grad_norm": 2.085151433944702,
- "learning_rate": 2.4623023169099074e-06,
- "loss": 0.39038220047950745,
- "mean_token_accuracy": 0.861169695854187,
- "num_tokens": 18721423.0,
- "step": 2092
- },
- {
- "epoch": 1.5904255319148937,
- "grad_norm": 2.8721165657043457,
- "learning_rate": 2.4602081734050093e-06,
- "loss": 0.27753859758377075,
- "mean_token_accuracy": 0.8959167003631592,
- "num_tokens": 18725044.0,
- "step": 2093
- },
- {
- "epoch": 1.5911854103343464,
- "grad_norm": 1.7388207912445068,
- "learning_rate": 2.4581140578275473e-06,
- "loss": 0.3570033311843872,
- "mean_token_accuracy": 0.8715590238571167,
- "num_tokens": 18733891.0,
- "step": 2094
- },
- {
- "epoch": 1.5919452887537995,
- "grad_norm": 2.3645241260528564,
- "learning_rate": 2.456019971647251e-06,
- "loss": 0.38982006907463074,
- "mean_token_accuracy": 0.8734139800071716,
- "num_tokens": 18740464.0,
- "step": 2095
- },
- {
- "epoch": 1.5927051671732522,
- "grad_norm": 3.674072027206421,
- "learning_rate": 2.4539259163338317e-06,
- "loss": 0.4068281650543213,
- "mean_token_accuracy": 0.8397839069366455,
- "num_tokens": 18744857.0,
- "step": 2096
- },
- {
- "epoch": 1.5934650455927053,
- "grad_norm": 1.8209186792373657,
- "learning_rate": 2.4518318933569786e-06,
- "loss": 0.3471015691757202,
- "mean_token_accuracy": 0.8709044456481934,
- "num_tokens": 18752414.0,
- "step": 2097
- },
- {
- "epoch": 1.594224924012158,
- "grad_norm": 1.8138704299926758,
- "learning_rate": 2.449737904186357e-06,
- "loss": 0.3438487648963928,
- "mean_token_accuracy": 0.8766711950302124,
- "num_tokens": 18760587.0,
- "step": 2098
- },
- {
- "epoch": 1.594984802431611,
- "grad_norm": 1.7893842458724976,
- "learning_rate": 2.447643950291608e-06,
- "loss": 0.43519508838653564,
- "mean_token_accuracy": 0.8682907819747925,
- "num_tokens": 18770293.0,
- "step": 2099
- },
- {
- "epoch": 1.5957446808510638,
- "grad_norm": 1.4305094480514526,
- "learning_rate": 2.4455500331423505e-06,
- "loss": 0.37106508016586304,
- "mean_token_accuracy": 0.8611354827880859,
- "num_tokens": 18782456.0,
- "step": 2100
- },
- {
- "epoch": 1.5965045592705167,
- "grad_norm": 2.0797057151794434,
- "learning_rate": 2.4434561542081765e-06,
- "loss": 0.43942689895629883,
- "mean_token_accuracy": 0.8477288484573364,
- "num_tokens": 18789547.0,
- "step": 2101
- },
- {
- "epoch": 1.5972644376899696,
- "grad_norm": 1.2983288764953613,
- "learning_rate": 2.441362314958649e-06,
- "loss": 0.46385765075683594,
- "mean_token_accuracy": 0.8340978622436523,
- "num_tokens": 18809456.0,
- "step": 2102
- },
- {
- "epoch": 1.5980243161094225,
- "grad_norm": 2.60866641998291,
- "learning_rate": 2.439268516863306e-06,
- "loss": 0.3106239140033722,
- "mean_token_accuracy": 0.8859497308731079,
- "num_tokens": 18813781.0,
- "step": 2103
- },
- {
- "epoch": 1.5987841945288754,
- "grad_norm": 3.389376163482666,
- "learning_rate": 2.4371747613916566e-06,
- "loss": 0.44926169514656067,
- "mean_token_accuracy": 0.8664819002151489,
- "num_tokens": 18817666.0,
- "step": 2104
- },
- {
- "epoch": 1.5995440729483281,
- "grad_norm": 3.3417351245880127,
- "learning_rate": 2.4350810500131776e-06,
- "loss": 0.4786076545715332,
- "mean_token_accuracy": 0.8357523679733276,
- "num_tokens": 18823717.0,
- "step": 2105
- },
- {
- "epoch": 1.6003039513677813,
- "grad_norm": 1.5215197801589966,
- "learning_rate": 2.4329873841973174e-06,
- "loss": 0.4123923182487488,
- "mean_token_accuracy": 0.853337287902832,
- "num_tokens": 18835163.0,
- "step": 2106
- },
- {
- "epoch": 1.601063829787234,
- "grad_norm": 1.8798415660858154,
- "learning_rate": 2.4308937654134893e-06,
- "loss": 0.45594000816345215,
- "mean_token_accuracy": 0.8553717732429504,
- "num_tokens": 18843923.0,
- "step": 2107
- },
- {
- "epoch": 1.601823708206687,
- "grad_norm": 2.1012487411499023,
- "learning_rate": 2.428800195131078e-06,
- "loss": 0.4340161085128784,
- "mean_token_accuracy": 0.8448120355606079,
- "num_tokens": 18851852.0,
- "step": 2108
- },
- {
- "epoch": 1.6025835866261398,
- "grad_norm": 2.827080726623535,
- "learning_rate": 2.4267066748194297e-06,
- "loss": 0.25922513008117676,
- "mean_token_accuracy": 0.9024698734283447,
- "num_tokens": 18856113.0,
- "step": 2109
- },
- {
- "epoch": 1.6033434650455927,
- "grad_norm": 1.641032338142395,
- "learning_rate": 2.4246132059478582e-06,
- "loss": 0.591558575630188,
- "mean_token_accuracy": 0.7960667610168457,
- "num_tokens": 18870618.0,
- "step": 2110
- },
- {
- "epoch": 1.6041033434650456,
- "grad_norm": 2.600771188735962,
- "learning_rate": 2.4225197899856416e-06,
- "loss": 0.382815957069397,
- "mean_token_accuracy": 0.8654585480690002,
- "num_tokens": 18875456.0,
- "step": 2111
- },
- {
- "epoch": 1.6048632218844985,
- "grad_norm": 1.5125449895858765,
- "learning_rate": 2.4204264284020182e-06,
- "loss": 0.4643454849720001,
- "mean_token_accuracy": 0.837038516998291,
- "num_tokens": 18887979.0,
- "step": 2112
- },
- {
- "epoch": 1.6056231003039514,
- "grad_norm": 1.7571941614151,
- "learning_rate": 2.4183331226661913e-06,
- "loss": 0.30713701248168945,
- "mean_token_accuracy": 0.8856921195983887,
- "num_tokens": 18896143.0,
- "step": 2113
- },
- {
- "epoch": 1.6063829787234043,
- "grad_norm": 2.124593496322632,
- "learning_rate": 2.4162398742473216e-06,
- "loss": 0.2873607575893402,
- "mean_token_accuracy": 0.8986717462539673,
- "num_tokens": 18902364.0,
- "step": 2114
- },
- {
- "epoch": 1.6071428571428572,
- "grad_norm": 2.3496272563934326,
- "learning_rate": 2.4141466846145332e-06,
- "loss": 0.33715200424194336,
- "mean_token_accuracy": 0.8816461563110352,
- "num_tokens": 18908038.0,
- "step": 2115
- },
- {
- "epoch": 1.60790273556231,
- "grad_norm": 1.2783573865890503,
- "learning_rate": 2.4120535552369057e-06,
- "loss": 0.45153388381004333,
- "mean_token_accuracy": 0.8345640897750854,
- "num_tokens": 18926687.0,
- "step": 2116
- },
- {
- "epoch": 1.608662613981763,
- "grad_norm": 2.1481080055236816,
- "learning_rate": 2.4099604875834796e-06,
- "loss": 0.43976694345474243,
- "mean_token_accuracy": 0.847899317741394,
- "num_tokens": 18932974.0,
- "step": 2117
- },
- {
- "epoch": 1.6094224924012157,
- "grad_norm": 1.8669065237045288,
- "learning_rate": 2.407867483123248e-06,
- "loss": 0.4649358093738556,
- "mean_token_accuracy": 0.8310785293579102,
- "num_tokens": 18942551.0,
- "step": 2118
- },
- {
- "epoch": 1.6101823708206688,
- "grad_norm": 2.7667746543884277,
- "learning_rate": 2.4057745433251637e-06,
- "loss": 0.4542210102081299,
- "mean_token_accuracy": 0.8450086116790771,
- "num_tokens": 18947525.0,
- "step": 2119
- },
- {
- "epoch": 1.6109422492401215,
- "grad_norm": 2.2865076065063477,
- "learning_rate": 2.4036816696581326e-06,
- "loss": 0.34291431307792664,
- "mean_token_accuracy": 0.8741394281387329,
- "num_tokens": 18952967.0,
- "step": 2120
- },
- {
- "epoch": 1.6117021276595744,
- "grad_norm": 3.055197238922119,
- "learning_rate": 2.401588863591013e-06,
- "loss": 0.4686807692050934,
- "mean_token_accuracy": 0.8440030217170715,
- "num_tokens": 18958257.0,
- "step": 2121
- },
- {
- "epoch": 1.6124620060790273,
- "grad_norm": 2.268456220626831,
- "learning_rate": 2.3994961265926166e-06,
- "loss": 0.440069317817688,
- "mean_token_accuracy": 0.8534891605377197,
- "num_tokens": 18964745.0,
- "step": 2122
- },
- {
- "epoch": 1.6132218844984803,
- "grad_norm": 2.061185359954834,
- "learning_rate": 2.3974034601317085e-06,
- "loss": 0.4383159279823303,
- "mean_token_accuracy": 0.8484808802604675,
- "num_tokens": 18972136.0,
- "step": 2123
- },
- {
- "epoch": 1.6139817629179332,
- "grad_norm": 1.5121275186538696,
- "learning_rate": 2.3953108656770018e-06,
- "loss": 0.42403632402420044,
- "mean_token_accuracy": 0.8467602133750916,
- "num_tokens": 18985353.0,
- "step": 2124
- },
- {
- "epoch": 1.614741641337386,
- "grad_norm": 1.9965397119522095,
- "learning_rate": 2.3932183446971584e-06,
- "loss": 0.3915751576423645,
- "mean_token_accuracy": 0.8622956275939941,
- "num_tokens": 18992017.0,
- "step": 2125
- },
- {
- "epoch": 1.615501519756839,
- "grad_norm": 1.6688618659973145,
- "learning_rate": 2.3911258986607907e-06,
- "loss": 0.468288391828537,
- "mean_token_accuracy": 0.8372251987457275,
- "num_tokens": 19001930.0,
- "step": 2126
- },
- {
- "epoch": 1.6162613981762917,
- "grad_norm": 1.8984699249267578,
- "learning_rate": 2.3890335290364596e-06,
- "loss": 0.3082895278930664,
- "mean_token_accuracy": 0.8815990686416626,
- "num_tokens": 19009712.0,
- "step": 2127
- },
- {
- "epoch": 1.6170212765957448,
- "grad_norm": 2.6934773921966553,
- "learning_rate": 2.386941237292669e-06,
- "loss": 0.48406022787094116,
- "mean_token_accuracy": 0.8300775289535522,
- "num_tokens": 19015212.0,
- "step": 2128
- },
- {
- "epoch": 1.6177811550151975,
- "grad_norm": 1.6615487337112427,
- "learning_rate": 2.3848490248978693e-06,
- "loss": 0.45227736234664917,
- "mean_token_accuracy": 0.8421006798744202,
- "num_tokens": 19027115.0,
- "step": 2129
- },
- {
- "epoch": 1.6185410334346506,
- "grad_norm": 1.4625248908996582,
- "learning_rate": 2.3827568933204576e-06,
- "loss": 0.4141014814376831,
- "mean_token_accuracy": 0.8479453325271606,
- "num_tokens": 19041103.0,
- "step": 2130
- },
- {
- "epoch": 1.6193009118541033,
- "grad_norm": 1.856701135635376,
- "learning_rate": 2.3806648440287715e-06,
- "loss": 0.3440483808517456,
- "mean_token_accuracy": 0.8978210687637329,
- "num_tokens": 19048124.0,
- "step": 2131
- },
- {
- "epoch": 1.6200607902735562,
- "grad_norm": 1.7056550979614258,
- "learning_rate": 2.378572878491091e-06,
- "loss": 0.4136195182800293,
- "mean_token_accuracy": 0.8579289317131042,
- "num_tokens": 19057113.0,
- "step": 2132
- },
- {
- "epoch": 1.6208206686930091,
- "grad_norm": 1.4673033952713013,
- "learning_rate": 2.376480998175638e-06,
- "loss": 0.40176504850387573,
- "mean_token_accuracy": 0.8677150011062622,
- "num_tokens": 19068258.0,
- "step": 2133
- },
- {
- "epoch": 1.621580547112462,
- "grad_norm": 2.12859845161438,
- "learning_rate": 2.3743892045505764e-06,
- "loss": 0.39754825830459595,
- "mean_token_accuracy": 0.8486959934234619,
- "num_tokens": 19075469.0,
- "step": 2134
- },
- {
- "epoch": 1.622340425531915,
- "grad_norm": 1.474247694015503,
- "learning_rate": 2.372297499084006e-06,
- "loss": 0.3546760678291321,
- "mean_token_accuracy": 0.8767229318618774,
- "num_tokens": 19086744.0,
- "step": 2135
- },
- {
- "epoch": 1.6231003039513676,
- "grad_norm": 1.9945709705352783,
- "learning_rate": 2.3702058832439667e-06,
- "loss": 0.4200798273086548,
- "mean_token_accuracy": 0.8435655832290649,
- "num_tokens": 19095903.0,
- "step": 2136
- },
- {
- "epoch": 1.6238601823708207,
- "grad_norm": 2.71991229057312,
- "learning_rate": 2.368114358498434e-06,
- "loss": 0.44925457239151,
- "mean_token_accuracy": 0.8348450660705566,
- "num_tokens": 19100864.0,
- "step": 2137
- },
- {
- "epoch": 1.6246200607902734,
- "grad_norm": 2.817664623260498,
- "learning_rate": 2.366022926315322e-06,
- "loss": 0.44386279582977295,
- "mean_token_accuracy": 0.8739628791809082,
- "num_tokens": 19105355.0,
- "step": 2138
- },
- {
- "epoch": 1.6253799392097266,
- "grad_norm": 1.3673229217529297,
- "learning_rate": 2.3639315881624776e-06,
- "loss": 0.3693230152130127,
- "mean_token_accuracy": 0.8698620796203613,
- "num_tokens": 19116748.0,
- "step": 2139
- },
- {
- "epoch": 1.6261398176291793,
- "grad_norm": 2.712531805038452,
- "learning_rate": 2.361840345507683e-06,
- "loss": 0.4442938268184662,
- "mean_token_accuracy": 0.8433241844177246,
- "num_tokens": 19121437.0,
- "step": 2140
- },
- {
- "epoch": 1.6268996960486324,
- "grad_norm": 2.2885231971740723,
- "learning_rate": 2.359749199818651e-06,
- "loss": 0.4021872878074646,
- "mean_token_accuracy": 0.8605252504348755,
- "num_tokens": 19127633.0,
- "step": 2141
- },
- {
- "epoch": 1.627659574468085,
- "grad_norm": 1.9257299900054932,
- "learning_rate": 2.3576581525630297e-06,
- "loss": 0.3577788472175598,
- "mean_token_accuracy": 0.8691596388816833,
- "num_tokens": 19134450.0,
- "step": 2142
- },
- {
- "epoch": 1.628419452887538,
- "grad_norm": 1.5035467147827148,
- "learning_rate": 2.355567205208397e-06,
- "loss": 0.3800235986709595,
- "mean_token_accuracy": 0.867794394493103,
- "num_tokens": 19146149.0,
- "step": 2143
- },
- {
- "epoch": 1.6291793313069909,
- "grad_norm": 2.110445737838745,
- "learning_rate": 2.353476359222259e-06,
- "loss": 0.34394145011901855,
- "mean_token_accuracy": 0.8777303695678711,
- "num_tokens": 19152017.0,
- "step": 2144
- },
- {
- "epoch": 1.6299392097264438,
- "grad_norm": 1.1713787317276,
- "learning_rate": 2.351385616072052e-06,
- "loss": 0.4060516357421875,
- "mean_token_accuracy": 0.8411345481872559,
- "num_tokens": 19172089.0,
- "step": 2145
- },
- {
- "epoch": 1.6306990881458967,
- "grad_norm": 1.7600529193878174,
- "learning_rate": 2.3492949772251418e-06,
- "loss": 0.5299694538116455,
- "mean_token_accuracy": 0.8218191862106323,
- "num_tokens": 19184041.0,
- "step": 2146
- },
- {
- "epoch": 1.6314589665653494,
- "grad_norm": 1.7126617431640625,
- "learning_rate": 2.3472044441488175e-06,
- "loss": 0.38628721237182617,
- "mean_token_accuracy": 0.8526935577392578,
- "num_tokens": 19193101.0,
- "step": 2147
- },
- {
- "epoch": 1.6322188449848025,
- "grad_norm": 1.210344672203064,
- "learning_rate": 2.345114018310295e-06,
- "loss": 0.2732373774051666,
- "mean_token_accuracy": 0.8903822898864746,
- "num_tokens": 19206697.0,
- "step": 2148
- },
- {
- "epoch": 1.6329787234042552,
- "grad_norm": 1.6693075895309448,
- "learning_rate": 2.3430237011767166e-06,
- "loss": 0.3472709655761719,
- "mean_token_accuracy": 0.8767187595367432,
- "num_tokens": 19217008.0,
- "step": 2149
- },
- {
- "epoch": 1.6337386018237083,
- "grad_norm": 1.5242515802383423,
- "learning_rate": 2.3409334942151485e-06,
- "loss": 0.4345507025718689,
- "mean_token_accuracy": 0.8481311202049255,
- "num_tokens": 19231573.0,
- "step": 2150
- },
- {
- "epoch": 1.634498480243161,
- "grad_norm": 2.470122814178467,
- "learning_rate": 2.3388433988925767e-06,
- "loss": 0.4453052878379822,
- "mean_token_accuracy": 0.8411355018615723,
- "num_tokens": 19237076.0,
- "step": 2151
- },
- {
- "epoch": 1.635258358662614,
- "grad_norm": 2.4177467823028564,
- "learning_rate": 2.3367534166759105e-06,
- "loss": 0.454534113407135,
- "mean_token_accuracy": 0.8635509014129639,
- "num_tokens": 19242890.0,
- "step": 2152
- },
- {
- "epoch": 1.6360182370820668,
- "grad_norm": 2.8036744594573975,
- "learning_rate": 2.3346635490319815e-06,
- "loss": 0.4396413564682007,
- "mean_token_accuracy": 0.8491836786270142,
- "num_tokens": 19247492.0,
- "step": 2153
- },
- {
- "epoch": 1.6367781155015197,
- "grad_norm": 1.9286335706710815,
- "learning_rate": 2.3325737974275382e-06,
- "loss": 0.34988659620285034,
- "mean_token_accuracy": 0.8704243898391724,
- "num_tokens": 19254966.0,
- "step": 2154
- },
- {
- "epoch": 1.6375379939209727,
- "grad_norm": 1.8929904699325562,
- "learning_rate": 2.3304841633292487e-06,
- "loss": 0.4195491671562195,
- "mean_token_accuracy": 0.857181966304779,
- "num_tokens": 19263324.0,
- "step": 2155
- },
- {
- "epoch": 1.6382978723404256,
- "grad_norm": 2.2598466873168945,
- "learning_rate": 2.328394648203698e-06,
- "loss": 0.37977826595306396,
- "mean_token_accuracy": 0.8626722097396851,
- "num_tokens": 19269363.0,
- "step": 2156
- },
- {
- "epoch": 1.6390577507598785,
- "grad_norm": 1.8118126392364502,
- "learning_rate": 2.32630525351739e-06,
- "loss": 0.3532063364982605,
- "mean_token_accuracy": 0.8677854537963867,
- "num_tokens": 19277360.0,
- "step": 2157
- },
- {
- "epoch": 1.6398176291793312,
- "grad_norm": 1.5216798782348633,
- "learning_rate": 2.324215980736741e-06,
- "loss": 0.38609349727630615,
- "mean_token_accuracy": 0.8685325980186462,
- "num_tokens": 19292159.0,
- "step": 2158
- },
- {
- "epoch": 1.6405775075987843,
- "grad_norm": 3.0511462688446045,
- "learning_rate": 2.3221268313280836e-06,
- "loss": 0.21988365054130554,
- "mean_token_accuracy": 0.9172534942626953,
- "num_tokens": 19295735.0,
- "step": 2159
- },
- {
- "epoch": 1.641337386018237,
- "grad_norm": 1.957828164100647,
- "learning_rate": 2.320037806757662e-06,
- "loss": 0.3868909478187561,
- "mean_token_accuracy": 0.8605331182479858,
- "num_tokens": 19303287.0,
- "step": 2160
- },
- {
- "epoch": 1.64209726443769,
- "grad_norm": 2.590040922164917,
- "learning_rate": 2.317948908491636e-06,
- "loss": 0.3940129578113556,
- "mean_token_accuracy": 0.8814224004745483,
- "num_tokens": 19308101.0,
- "step": 2161
- },
- {
- "epoch": 1.6428571428571428,
- "grad_norm": 2.859248161315918,
- "learning_rate": 2.315860137996074e-06,
- "loss": 0.3437344431877136,
- "mean_token_accuracy": 0.8789017200469971,
- "num_tokens": 19313026.0,
- "step": 2162
- },
- {
- "epoch": 1.6436170212765957,
- "grad_norm": 1.1788666248321533,
- "learning_rate": 2.3137714967369544e-06,
- "loss": 0.3976179361343384,
- "mean_token_accuracy": 0.8383771181106567,
- "num_tokens": 19331103.0,
- "step": 2163
- },
- {
- "epoch": 1.6443768996960486,
- "grad_norm": 1.8409802913665771,
- "learning_rate": 2.3116829861801687e-06,
- "loss": 0.41898879408836365,
- "mean_token_accuracy": 0.8575010299682617,
- "num_tokens": 19340866.0,
- "step": 2164
- },
- {
- "epoch": 1.6451367781155015,
- "grad_norm": 1.4124691486358643,
- "learning_rate": 2.3095946077915115e-06,
- "loss": 0.333813339471817,
- "mean_token_accuracy": 0.8766071796417236,
- "num_tokens": 19353673.0,
- "step": 2165
- },
- {
- "epoch": 1.6458966565349544,
- "grad_norm": 1.76325261592865,
- "learning_rate": 2.307506363036688e-06,
- "loss": 0.4158991575241089,
- "mean_token_accuracy": 0.8522704839706421,
- "num_tokens": 19363635.0,
- "step": 2166
- },
- {
- "epoch": 1.6466565349544073,
- "grad_norm": 1.758833885192871,
- "learning_rate": 2.305418253381309e-06,
- "loss": 0.298480749130249,
- "mean_token_accuracy": 0.888424277305603,
- "num_tokens": 19372291.0,
- "step": 2167
- },
- {
- "epoch": 1.6474164133738602,
- "grad_norm": 1.6387488842010498,
- "learning_rate": 2.3033302802908895e-06,
- "loss": 0.4309447109699249,
- "mean_token_accuracy": 0.8672212362289429,
- "num_tokens": 19383480.0,
- "step": 2168
- },
- {
- "epoch": 1.648176291793313,
- "grad_norm": 1.5251084566116333,
- "learning_rate": 2.301242445230851e-06,
- "loss": 0.44890880584716797,
- "mean_token_accuracy": 0.847392737865448,
- "num_tokens": 19394810.0,
- "step": 2169
- },
- {
- "epoch": 1.648936170212766,
- "grad_norm": 1.6106950044631958,
- "learning_rate": 2.299154749666515e-06,
- "loss": 0.4403916597366333,
- "mean_token_accuracy": 0.8379756212234497,
- "num_tokens": 19405551.0,
- "step": 2170
- },
- {
- "epoch": 1.6496960486322187,
- "grad_norm": 1.4238437414169312,
- "learning_rate": 2.2970671950631066e-06,
- "loss": 0.4015567898750305,
- "mean_token_accuracy": 0.851482629776001,
- "num_tokens": 19418621.0,
- "step": 2171
- },
- {
- "epoch": 1.6504559270516719,
- "grad_norm": 1.3026156425476074,
- "learning_rate": 2.2949797828857527e-06,
- "loss": 0.3680947422981262,
- "mean_token_accuracy": 0.8641397953033447,
- "num_tokens": 19432118.0,
- "step": 2172
- },
- {
- "epoch": 1.6512158054711246,
- "grad_norm": 2.1265358924865723,
- "learning_rate": 2.2928925145994798e-06,
- "loss": 0.43980664014816284,
- "mean_token_accuracy": 0.8358430862426758,
- "num_tokens": 19439069.0,
- "step": 2173
- },
- {
- "epoch": 1.6519756838905775,
- "grad_norm": 1.8399443626403809,
- "learning_rate": 2.290805391669212e-06,
- "loss": 0.29801061749458313,
- "mean_token_accuracy": 0.8773187398910522,
- "num_tokens": 19446745.0,
- "step": 2174
- },
- {
- "epoch": 1.6527355623100304,
- "grad_norm": 1.8680047988891602,
- "learning_rate": 2.2887184155597725e-06,
- "loss": 0.3235543966293335,
- "mean_token_accuracy": 0.8754611015319824,
- "num_tokens": 19455266.0,
- "step": 2175
- },
- {
- "epoch": 1.6534954407294833,
- "grad_norm": 2.3048481941223145,
- "learning_rate": 2.286631587735883e-06,
- "loss": 0.4011988043785095,
- "mean_token_accuracy": 0.8531811237335205,
- "num_tokens": 19461049.0,
- "step": 2176
- },
- {
- "epoch": 1.6542553191489362,
- "grad_norm": 2.6067066192626953,
- "learning_rate": 2.2845449096621583e-06,
- "loss": 0.4957500696182251,
- "mean_token_accuracy": 0.8255549073219299,
- "num_tokens": 19466884.0,
- "step": 2177
- },
- {
- "epoch": 1.655015197568389,
- "grad_norm": 1.5211488008499146,
- "learning_rate": 2.282458382803109e-06,
- "loss": 0.32245099544525146,
- "mean_token_accuracy": 0.8865629434585571,
- "num_tokens": 19477294.0,
- "step": 2178
- },
- {
- "epoch": 1.655775075987842,
- "grad_norm": 2.245542526245117,
- "learning_rate": 2.280372008623142e-06,
- "loss": 0.3790864944458008,
- "mean_token_accuracy": 0.8766552209854126,
- "num_tokens": 19483385.0,
- "step": 2179
- },
- {
- "epoch": 1.6565349544072947,
- "grad_norm": 2.1158151626586914,
- "learning_rate": 2.2782857885865538e-06,
- "loss": 0.4726812243461609,
- "mean_token_accuracy": 0.8384029865264893,
- "num_tokens": 19491367.0,
- "step": 2180
- },
- {
- "epoch": 1.6572948328267478,
- "grad_norm": 3.301389694213867,
- "learning_rate": 2.2761997241575335e-06,
- "loss": 0.37664809823036194,
- "mean_token_accuracy": 0.8913813829421997,
- "num_tokens": 19494876.0,
- "step": 2181
- },
- {
- "epoch": 1.6580547112462005,
- "grad_norm": 2.2964162826538086,
- "learning_rate": 2.274113816800161e-06,
- "loss": 0.4110721945762634,
- "mean_token_accuracy": 0.8551756143569946,
- "num_tokens": 19500546.0,
- "step": 2182
- },
- {
- "epoch": 1.6588145896656536,
- "grad_norm": 3.368161916732788,
- "learning_rate": 2.272028067978408e-06,
- "loss": 0.39089250564575195,
- "mean_token_accuracy": 0.8786845207214355,
- "num_tokens": 19504142.0,
- "step": 2183
- },
- {
- "epoch": 1.6595744680851063,
- "grad_norm": 1.7299834489822388,
- "learning_rate": 2.2699424791561324e-06,
- "loss": 0.5205090641975403,
- "mean_token_accuracy": 0.8394201993942261,
- "num_tokens": 19514523.0,
- "step": 2184
- },
- {
- "epoch": 1.6603343465045592,
- "grad_norm": 2.045919418334961,
- "learning_rate": 2.267857051797081e-06,
- "loss": 0.49093255400657654,
- "mean_token_accuracy": 0.8338311910629272,
- "num_tokens": 19522439.0,
- "step": 2185
- },
- {
- "epoch": 1.6610942249240122,
- "grad_norm": 1.2035714387893677,
- "learning_rate": 2.265771787364886e-06,
- "loss": 0.37247753143310547,
- "mean_token_accuracy": 0.8873692750930786,
- "num_tokens": 19536717.0,
- "step": 2186
- },
- {
- "epoch": 1.661854103343465,
- "grad_norm": 2.6186633110046387,
- "learning_rate": 2.263686687323068e-06,
- "loss": 0.3318040370941162,
- "mean_token_accuracy": 0.8720577955245972,
- "num_tokens": 19541966.0,
- "step": 2187
- },
- {
- "epoch": 1.662613981762918,
- "grad_norm": 2.6845929622650146,
- "learning_rate": 2.261601753135029e-06,
- "loss": 0.32441991567611694,
- "mean_token_accuracy": 0.8700553178787231,
- "num_tokens": 19546644.0,
- "step": 2188
- },
- {
- "epoch": 1.6633738601823707,
- "grad_norm": 2.078998327255249,
- "learning_rate": 2.259516986264057e-06,
- "loss": 0.3424156904220581,
- "mean_token_accuracy": 0.8707810044288635,
- "num_tokens": 19553472.0,
- "step": 2189
- },
- {
- "epoch": 1.6641337386018238,
- "grad_norm": 2.380747079849243,
- "learning_rate": 2.2574323881733202e-06,
- "loss": 0.4994799494743347,
- "mean_token_accuracy": 0.817003607749939,
- "num_tokens": 19560502.0,
- "step": 2190
- },
- {
- "epoch": 1.6648936170212765,
- "grad_norm": 1.2984378337860107,
- "learning_rate": 2.255347960325871e-06,
- "loss": 0.33139657974243164,
- "mean_token_accuracy": 0.8763977289199829,
- "num_tokens": 19575624.0,
- "step": 2191
- },
- {
- "epoch": 1.6656534954407296,
- "grad_norm": 1.3232799768447876,
- "learning_rate": 2.2532637041846423e-06,
- "loss": 0.32994017004966736,
- "mean_token_accuracy": 0.8790634274482727,
- "num_tokens": 19588636.0,
- "step": 2192
- },
- {
- "epoch": 1.6664133738601823,
- "grad_norm": 2.11212158203125,
- "learning_rate": 2.2511796212124424e-06,
- "loss": 0.3140082359313965,
- "mean_token_accuracy": 0.8946622014045715,
- "num_tokens": 19594917.0,
- "step": 2193
- },
- {
- "epoch": 1.6671732522796354,
- "grad_norm": 2.7206521034240723,
- "learning_rate": 2.2490957128719627e-06,
- "loss": 0.3723612427711487,
- "mean_token_accuracy": 0.8781955242156982,
- "num_tokens": 19599310.0,
- "step": 2194
- },
- {
- "epoch": 1.667933130699088,
- "grad_norm": 2.6681952476501465,
- "learning_rate": 2.247011980625771e-06,
- "loss": 0.3740317225456238,
- "mean_token_accuracy": 0.8780536651611328,
- "num_tokens": 19604172.0,
- "step": 2195
- },
- {
- "epoch": 1.668693009118541,
- "grad_norm": 1.8933384418487549,
- "learning_rate": 2.2449284259363093e-06,
- "loss": 0.3359421491622925,
- "mean_token_accuracy": 0.8785334825515747,
- "num_tokens": 19612030.0,
- "step": 2196
- },
- {
- "epoch": 1.669452887537994,
- "grad_norm": 2.4779889583587646,
- "learning_rate": 2.2428450502658964e-06,
- "loss": 0.3724144399166107,
- "mean_token_accuracy": 0.8739810585975647,
- "num_tokens": 19617800.0,
- "step": 2197
- },
- {
- "epoch": 1.6702127659574468,
- "grad_norm": 3.0661120414733887,
- "learning_rate": 2.240761855076727e-06,
- "loss": 0.3627531826496124,
- "mean_token_accuracy": 0.865296483039856,
- "num_tokens": 19621885.0,
- "step": 2198
- },
- {
- "epoch": 1.6709726443768997,
- "grad_norm": 2.431708574295044,
- "learning_rate": 2.238678841830867e-06,
- "loss": 0.31396129727363586,
- "mean_token_accuracy": 0.9026765823364258,
- "num_tokens": 19627122.0,
- "step": 2199
- },
- {
- "epoch": 1.6717325227963524,
- "grad_norm": 2.5498745441436768,
- "learning_rate": 2.2365960119902543e-06,
- "loss": 0.3193191885948181,
- "mean_token_accuracy": 0.8750600218772888,
- "num_tokens": 19631771.0,
- "step": 2200
- },
- {
- "epoch": 1.6724924012158056,
- "grad_norm": 2.0419046878814697,
- "learning_rate": 2.2345133670167e-06,
- "loss": 0.32747960090637207,
- "mean_token_accuracy": 0.8603148460388184,
- "num_tokens": 19638972.0,
- "step": 2201
- },
- {
- "epoch": 1.6732522796352582,
- "grad_norm": 2.0412306785583496,
- "learning_rate": 2.232430908371885e-06,
- "loss": 0.4701780676841736,
- "mean_token_accuracy": 0.8318476676940918,
- "num_tokens": 19647968.0,
- "step": 2202
- },
- {
- "epoch": 1.6740121580547114,
- "grad_norm": 2.054070472717285,
- "learning_rate": 2.2303486375173586e-06,
- "loss": 0.33284813165664673,
- "mean_token_accuracy": 0.8760920763015747,
- "num_tokens": 19654032.0,
- "step": 2203
- },
- {
- "epoch": 1.674772036474164,
- "grad_norm": 1.6053217649459839,
- "learning_rate": 2.228266555914538e-06,
- "loss": 0.34431374073028564,
- "mean_token_accuracy": 0.8764770030975342,
- "num_tokens": 19663785.0,
- "step": 2204
- },
- {
- "epoch": 1.675531914893617,
- "grad_norm": 1.474494457244873,
- "learning_rate": 2.2261846650247077e-06,
- "loss": 0.3541037440299988,
- "mean_token_accuracy": 0.8782497644424438,
- "num_tokens": 19675498.0,
- "step": 2205
- },
- {
- "epoch": 1.6762917933130699,
- "grad_norm": 1.9318026304244995,
- "learning_rate": 2.224102966309021e-06,
- "loss": 0.4291660189628601,
- "mean_token_accuracy": 0.8424201607704163,
- "num_tokens": 19684576.0,
- "step": 2206
- },
- {
- "epoch": 1.6770516717325228,
- "grad_norm": 2.2150020599365234,
- "learning_rate": 2.2220214612284925e-06,
- "loss": 0.46187907457351685,
- "mean_token_accuracy": 0.840459942817688,
- "num_tokens": 19690412.0,
- "step": 2207
- },
- {
- "epoch": 1.6778115501519757,
- "grad_norm": 1.667281150817871,
- "learning_rate": 2.2199401512440037e-06,
- "loss": 0.37440744042396545,
- "mean_token_accuracy": 0.8694081902503967,
- "num_tokens": 19699600.0,
- "step": 2208
- },
- {
- "epoch": 1.6785714285714286,
- "grad_norm": 2.6446619033813477,
- "learning_rate": 2.2178590378162957e-06,
- "loss": 0.3301953077316284,
- "mean_token_accuracy": 0.8992182016372681,
- "num_tokens": 19704162.0,
- "step": 2209
- },
- {
- "epoch": 1.6793313069908815,
- "grad_norm": 1.4266780614852905,
- "learning_rate": 2.215778122405977e-06,
- "loss": 0.3811204135417938,
- "mean_token_accuracy": 0.861638069152832,
- "num_tokens": 19716511.0,
- "step": 2210
- },
- {
- "epoch": 1.6800911854103342,
- "grad_norm": 1.826087474822998,
- "learning_rate": 2.2136974064735132e-06,
- "loss": 0.4790012836456299,
- "mean_token_accuracy": 0.8404909372329712,
- "num_tokens": 19726645.0,
- "step": 2211
- },
- {
- "epoch": 1.6808510638297873,
- "grad_norm": 1.8551808595657349,
- "learning_rate": 2.2116168914792293e-06,
- "loss": 0.40999075770378113,
- "mean_token_accuracy": 0.8419463634490967,
- "num_tokens": 19735601.0,
- "step": 2212
- },
- {
- "epoch": 1.68161094224924,
- "grad_norm": 2.560124158859253,
- "learning_rate": 2.209536578883313e-06,
- "loss": 0.43428558111190796,
- "mean_token_accuracy": 0.8689159750938416,
- "num_tokens": 19741138.0,
- "step": 2213
- },
- {
- "epoch": 1.6823708206686931,
- "grad_norm": 2.0154869556427,
- "learning_rate": 2.207456470145807e-06,
- "loss": 0.43633338809013367,
- "mean_token_accuracy": 0.8646916151046753,
- "num_tokens": 19751929.0,
- "step": 2214
- },
- {
- "epoch": 1.6831306990881458,
- "grad_norm": 1.3583155870437622,
- "learning_rate": 2.205376566726611e-06,
- "loss": 0.3050280511379242,
- "mean_token_accuracy": 0.8998798727989197,
- "num_tokens": 19764012.0,
- "step": 2215
- },
- {
- "epoch": 1.6838905775075987,
- "grad_norm": 1.266262173652649,
- "learning_rate": 2.2032968700854813e-06,
- "loss": 0.4039713144302368,
- "mean_token_accuracy": 0.8571382164955139,
- "num_tokens": 19780683.0,
- "step": 2216
- },
- {
- "epoch": 1.6846504559270516,
- "grad_norm": 1.864356517791748,
- "learning_rate": 2.2012173816820297e-06,
- "loss": 0.361503541469574,
- "mean_token_accuracy": 0.868161678314209,
- "num_tokens": 19788907.0,
- "step": 2217
- },
- {
- "epoch": 1.6854103343465046,
- "grad_norm": 1.320155382156372,
- "learning_rate": 2.1991381029757216e-06,
- "loss": 0.28228244185447693,
- "mean_token_accuracy": 0.8945217132568359,
- "num_tokens": 19800354.0,
- "step": 2218
- },
- {
- "epoch": 1.6861702127659575,
- "grad_norm": 1.9706367254257202,
- "learning_rate": 2.1970590354258745e-06,
- "loss": 0.2849377989768982,
- "mean_token_accuracy": 0.9065699577331543,
- "num_tokens": 19806735.0,
- "step": 2219
- },
- {
- "epoch": 1.6869300911854104,
- "grad_norm": 1.9150370359420776,
- "learning_rate": 2.1949801804916563e-06,
- "loss": 0.4125257730484009,
- "mean_token_accuracy": 0.8642163872718811,
- "num_tokens": 19814056.0,
- "step": 2220
- },
- {
- "epoch": 1.6876899696048633,
- "grad_norm": 2.062589645385742,
- "learning_rate": 2.19290153963209e-06,
- "loss": 0.451707124710083,
- "mean_token_accuracy": 0.8311163187026978,
- "num_tokens": 19821263.0,
- "step": 2221
- },
- {
- "epoch": 1.688449848024316,
- "grad_norm": 1.3959208726882935,
- "learning_rate": 2.190823114306045e-06,
- "loss": 0.3326707184314728,
- "mean_token_accuracy": 0.9037837982177734,
- "num_tokens": 19835163.0,
- "step": 2222
- },
- {
- "epoch": 1.689209726443769,
- "grad_norm": 2.09995698928833,
- "learning_rate": 2.188744905972239e-06,
- "loss": 0.4144105315208435,
- "mean_token_accuracy": 0.8512029051780701,
- "num_tokens": 19843164.0,
- "step": 2223
- },
- {
- "epoch": 1.6899696048632218,
- "grad_norm": 1.4759427309036255,
- "learning_rate": 2.186666916089239e-06,
- "loss": 0.4707002639770508,
- "mean_token_accuracy": 0.8371601104736328,
- "num_tokens": 19858551.0,
- "step": 2224
- },
- {
- "epoch": 1.690729483282675,
- "grad_norm": 2.3398702144622803,
- "learning_rate": 2.1845891461154604e-06,
- "loss": 0.34672820568084717,
- "mean_token_accuracy": 0.879936695098877,
- "num_tokens": 19864348.0,
- "step": 2225
- },
- {
- "epoch": 1.6914893617021276,
- "grad_norm": 1.6283963918685913,
- "learning_rate": 2.1825115975091594e-06,
- "loss": 0.31835079193115234,
- "mean_token_accuracy": 0.8695961833000183,
- "num_tokens": 19873560.0,
- "step": 2226
- },
- {
- "epoch": 1.6922492401215805,
- "grad_norm": 2.035759687423706,
- "learning_rate": 2.1804342717284414e-06,
- "loss": 0.43110257387161255,
- "mean_token_accuracy": 0.8593922853469849,
- "num_tokens": 19880796.0,
- "step": 2227
- },
- {
- "epoch": 1.6930091185410334,
- "grad_norm": 2.1340725421905518,
- "learning_rate": 2.1783571702312523e-06,
- "loss": 0.46967440843582153,
- "mean_token_accuracy": 0.8839266300201416,
- "num_tokens": 19887911.0,
- "step": 2228
- },
- {
- "epoch": 1.6937689969604863,
- "grad_norm": 1.710340142250061,
- "learning_rate": 2.176280294475383e-06,
- "loss": 0.4167519807815552,
- "mean_token_accuracy": 0.8526116609573364,
- "num_tokens": 19896674.0,
- "step": 2229
- },
- {
- "epoch": 1.6945288753799392,
- "grad_norm": 1.7793304920196533,
- "learning_rate": 2.174203645918464e-06,
- "loss": 0.3875434994697571,
- "mean_token_accuracy": 0.8637192249298096,
- "num_tokens": 19904825.0,
- "step": 2230
- },
- {
- "epoch": 1.6952887537993921,
- "grad_norm": 1.7908778190612793,
- "learning_rate": 2.172127226017967e-06,
- "loss": 0.42065349221229553,
- "mean_token_accuracy": 0.850834846496582,
- "num_tokens": 19914377.0,
- "step": 2231
- },
- {
- "epoch": 1.696048632218845,
- "grad_norm": 3.0943970680236816,
- "learning_rate": 2.1700510362312053e-06,
- "loss": 0.44845050573349,
- "mean_token_accuracy": 0.8460367918014526,
- "num_tokens": 19918929.0,
- "step": 2232
- },
- {
- "epoch": 1.6968085106382977,
- "grad_norm": 1.5586018562316895,
- "learning_rate": 2.1679750780153265e-06,
- "loss": 0.4723482131958008,
- "mean_token_accuracy": 0.871384859085083,
- "num_tokens": 19932738.0,
- "step": 2233
- },
- {
- "epoch": 1.6975683890577509,
- "grad_norm": 2.014230728149414,
- "learning_rate": 2.1658993528273196e-06,
- "loss": 0.43307146430015564,
- "mean_token_accuracy": 0.8677935600280762,
- "num_tokens": 19940246.0,
- "step": 2234
- },
- {
- "epoch": 1.6983282674772036,
- "grad_norm": 1.528979778289795,
- "learning_rate": 2.163823862124007e-06,
- "loss": 0.3897377550601959,
- "mean_token_accuracy": 0.8737689256668091,
- "num_tokens": 19951187.0,
- "step": 2235
- },
- {
- "epoch": 1.6990881458966567,
- "grad_norm": 1.9856207370758057,
- "learning_rate": 2.1617486073620496e-06,
- "loss": 0.4285745620727539,
- "mean_token_accuracy": 0.8744081258773804,
- "num_tokens": 19957768.0,
- "step": 2236
- },
- {
- "epoch": 1.6998480243161094,
- "grad_norm": 2.130525827407837,
- "learning_rate": 2.15967358999794e-06,
- "loss": 0.405293732881546,
- "mean_token_accuracy": 0.8588452935218811,
- "num_tokens": 19965354.0,
- "step": 2237
- },
- {
- "epoch": 1.7006079027355623,
- "grad_norm": 1.665329098701477,
- "learning_rate": 2.1575988114880057e-06,
- "loss": 0.42987754940986633,
- "mean_token_accuracy": 0.846322238445282,
- "num_tokens": 19975780.0,
- "step": 2238
- },
- {
- "epoch": 1.7013677811550152,
- "grad_norm": 1.0725677013397217,
- "learning_rate": 2.155524273288405e-06,
- "loss": 0.31892159581184387,
- "mean_token_accuracy": 0.8692483305931091,
- "num_tokens": 19995875.0,
- "step": 2239
- },
- {
- "epoch": 1.702127659574468,
- "grad_norm": 2.282604455947876,
- "learning_rate": 2.15344997685513e-06,
- "loss": 0.4460654556751251,
- "mean_token_accuracy": 0.8623759746551514,
- "num_tokens": 20001466.0,
- "step": 2240
- },
- {
- "epoch": 1.702887537993921,
- "grad_norm": 1.1385949850082397,
- "learning_rate": 2.1513759236440024e-06,
- "loss": 0.37046104669570923,
- "mean_token_accuracy": 0.8637164831161499,
- "num_tokens": 20020998.0,
- "step": 2241
- },
- {
- "epoch": 1.7036474164133737,
- "grad_norm": 1.5521315336227417,
- "learning_rate": 2.1493021151106704e-06,
- "loss": 0.4526556134223938,
- "mean_token_accuracy": 0.8675785064697266,
- "num_tokens": 20032750.0,
- "step": 2242
- },
- {
- "epoch": 1.7044072948328268,
- "grad_norm": 1.7777446508407593,
- "learning_rate": 2.147228552710614e-06,
- "loss": 0.41294580698013306,
- "mean_token_accuracy": 0.8597785234451294,
- "num_tokens": 20041901.0,
- "step": 2243
- },
- {
- "epoch": 1.7051671732522795,
- "grad_norm": 1.5157700777053833,
- "learning_rate": 2.145155237899139e-06,
- "loss": 0.4158926010131836,
- "mean_token_accuracy": 0.8512611985206604,
- "num_tokens": 20053705.0,
- "step": 2244
- },
- {
- "epoch": 1.7059270516717326,
- "grad_norm": 1.5116809606552124,
- "learning_rate": 2.143082172131378e-06,
- "loss": 0.43943172693252563,
- "mean_token_accuracy": 0.8429899215698242,
- "num_tokens": 20069468.0,
- "step": 2245
- },
- {
- "epoch": 1.7066869300911853,
- "grad_norm": 1.6095285415649414,
- "learning_rate": 2.141009356862288e-06,
- "loss": 0.41325604915618896,
- "mean_token_accuracy": 0.8832963705062866,
- "num_tokens": 20080596.0,
- "step": 2246
- },
- {
- "epoch": 1.7074468085106385,
- "grad_norm": 1.39210844039917,
- "learning_rate": 2.138936793546649e-06,
- "loss": 0.3945302963256836,
- "mean_token_accuracy": 0.8698325753211975,
- "num_tokens": 20094158.0,
- "step": 2247
- },
- {
- "epoch": 1.7082066869300911,
- "grad_norm": 2.9576594829559326,
- "learning_rate": 2.1368644836390684e-06,
- "loss": 0.16507276892662048,
- "mean_token_accuracy": 0.9410445690155029,
- "num_tokens": 20097002.0,
- "step": 2248
- },
- {
- "epoch": 1.708966565349544,
- "grad_norm": 1.7631266117095947,
- "learning_rate": 2.134792428593971e-06,
- "loss": 0.519780695438385,
- "mean_token_accuracy": 0.8276066780090332,
- "num_tokens": 20107947.0,
- "step": 2249
- },
- {
- "epoch": 1.709726443768997,
- "grad_norm": 2.144636869430542,
- "learning_rate": 2.1327206298656055e-06,
- "loss": 0.32923734188079834,
- "mean_token_accuracy": 0.8766019344329834,
- "num_tokens": 20113676.0,
- "step": 2250
- },
- {
- "epoch": 1.7104863221884499,
- "grad_norm": 1.9511034488677979,
- "learning_rate": 2.130649088908041e-06,
- "loss": 0.4043842554092407,
- "mean_token_accuracy": 0.8525843620300293,
- "num_tokens": 20120787.0,
- "step": 2251
- },
- {
- "epoch": 1.7112462006079028,
- "grad_norm": 1.5001336336135864,
- "learning_rate": 2.1285778071751638e-06,
- "loss": 0.4800187051296234,
- "mean_token_accuracy": 0.8398486375808716,
- "num_tokens": 20133534.0,
- "step": 2252
- },
- {
- "epoch": 1.7120060790273555,
- "grad_norm": 1.435195803642273,
- "learning_rate": 2.126506786120678e-06,
- "loss": 0.44489604234695435,
- "mean_token_accuracy": 0.8444881439208984,
- "num_tokens": 20151787.0,
- "step": 2253
- },
- {
- "epoch": 1.7127659574468086,
- "grad_norm": 1.3056137561798096,
- "learning_rate": 2.1244360271981073e-06,
- "loss": 0.300567090511322,
- "mean_token_accuracy": 0.8903113007545471,
- "num_tokens": 20163390.0,
- "step": 2254
- },
- {
- "epoch": 1.7135258358662613,
- "grad_norm": 1.7347925901412964,
- "learning_rate": 2.1223655318607907e-06,
- "loss": 0.30601179599761963,
- "mean_token_accuracy": 0.8845717906951904,
- "num_tokens": 20171354.0,
- "step": 2255
- },
- {
- "epoch": 1.7142857142857144,
- "grad_norm": 1.316306471824646,
- "learning_rate": 2.1202953015618794e-06,
- "loss": 0.3972984552383423,
- "mean_token_accuracy": 0.845410943031311,
- "num_tokens": 20184464.0,
- "step": 2256
- },
- {
- "epoch": 1.715045592705167,
- "grad_norm": 2.1052892208099365,
- "learning_rate": 2.1182253377543428e-06,
- "loss": 0.3357020616531372,
- "mean_token_accuracy": 0.8853542804718018,
- "num_tokens": 20190539.0,
- "step": 2257
- },
- {
- "epoch": 1.71580547112462,
- "grad_norm": 1.4192553758621216,
- "learning_rate": 2.116155641890959e-06,
- "loss": 0.3881692588329315,
- "mean_token_accuracy": 0.8442144989967346,
- "num_tokens": 20204570.0,
- "step": 2258
- },
- {
- "epoch": 1.716565349544073,
- "grad_norm": 2.134113311767578,
- "learning_rate": 2.1140862154243223e-06,
- "loss": 0.37803274393081665,
- "mean_token_accuracy": 0.8703107237815857,
- "num_tokens": 20210535.0,
- "step": 2259
- },
- {
- "epoch": 1.7173252279635258,
- "grad_norm": 2.9149155616760254,
- "learning_rate": 2.1120170598068353e-06,
- "loss": 0.34860676527023315,
- "mean_token_accuracy": 0.8734345436096191,
- "num_tokens": 20214375.0,
- "step": 2260
- },
- {
- "epoch": 1.7180851063829787,
- "grad_norm": 1.6855589151382446,
- "learning_rate": 2.109948176490711e-06,
- "loss": 0.3676984906196594,
- "mean_token_accuracy": 0.8531560301780701,
- "num_tokens": 20223791.0,
- "step": 2261
- },
- {
- "epoch": 1.7188449848024316,
- "grad_norm": 2.09671950340271,
- "learning_rate": 2.10787956692797e-06,
- "loss": 0.41744115948677063,
- "mean_token_accuracy": 0.8570001125335693,
- "num_tokens": 20231254.0,
- "step": 2262
- },
- {
- "epoch": 1.7196048632218845,
- "grad_norm": 3.148813009262085,
- "learning_rate": 2.1058112325704436e-06,
- "loss": 0.20556189119815826,
- "mean_token_accuracy": 0.926898717880249,
- "num_tokens": 20234470.0,
- "step": 2263
- },
- {
- "epoch": 1.7203647416413372,
- "grad_norm": 1.9707107543945312,
- "learning_rate": 2.103743174869769e-06,
- "loss": 0.40733110904693604,
- "mean_token_accuracy": 0.8740406036376953,
- "num_tokens": 20242286.0,
- "step": 2264
- },
- {
- "epoch": 1.7211246200607904,
- "grad_norm": 1.2756069898605347,
- "learning_rate": 2.1016753952773867e-06,
- "loss": 0.3940718173980713,
- "mean_token_accuracy": 0.860906720161438,
- "num_tokens": 20260382.0,
- "step": 2265
- },
- {
- "epoch": 1.721884498480243,
- "grad_norm": 1.5074653625488281,
- "learning_rate": 2.0996078952445453e-06,
- "loss": 0.3353617191314697,
- "mean_token_accuracy": 0.8809853792190552,
- "num_tokens": 20271665.0,
- "step": 2266
- },
- {
- "epoch": 1.7226443768996962,
- "grad_norm": 1.4331210851669312,
- "learning_rate": 2.0975406762222966e-06,
- "loss": 0.32260069251060486,
- "mean_token_accuracy": 0.901330828666687,
- "num_tokens": 20283122.0,
- "step": 2267
- },
- {
- "epoch": 1.7234042553191489,
- "grad_norm": 2.2378969192504883,
- "learning_rate": 2.095473739661494e-06,
- "loss": 0.39086243510246277,
- "mean_token_accuracy": 0.8681687116622925,
- "num_tokens": 20289243.0,
- "step": 2268
- },
- {
- "epoch": 1.7241641337386018,
- "grad_norm": 2.754582405090332,
- "learning_rate": 2.093407087012791e-06,
- "loss": 0.42927244305610657,
- "mean_token_accuracy": 0.8594136834144592,
- "num_tokens": 20294537.0,
- "step": 2269
- },
- {
- "epoch": 1.7249240121580547,
- "grad_norm": 2.2721824645996094,
- "learning_rate": 2.091340719726647e-06,
- "loss": 0.42479783296585083,
- "mean_token_accuracy": 0.8411722183227539,
- "num_tokens": 20301502.0,
- "step": 2270
- },
- {
- "epoch": 1.7256838905775076,
- "grad_norm": 2.3230299949645996,
- "learning_rate": 2.089274639253317e-06,
- "loss": 0.4218963384628296,
- "mean_token_accuracy": 0.8498032093048096,
- "num_tokens": 20307710.0,
- "step": 2271
- },
- {
- "epoch": 1.7264437689969605,
- "grad_norm": 2.3499748706817627,
- "learning_rate": 2.0872088470428553e-06,
- "loss": 0.4472277760505676,
- "mean_token_accuracy": 0.8487255573272705,
- "num_tokens": 20313945.0,
- "step": 2272
- },
- {
- "epoch": 1.7272036474164134,
- "grad_norm": 1.3709690570831299,
- "learning_rate": 2.0851433445451142e-06,
- "loss": 0.38701117038726807,
- "mean_token_accuracy": 0.8592075109481812,
- "num_tokens": 20328023.0,
- "step": 2273
- },
- {
- "epoch": 1.7279635258358663,
- "grad_norm": 1.1293425559997559,
- "learning_rate": 2.0830781332097446e-06,
- "loss": 0.34000539779663086,
- "mean_token_accuracy": 0.8779317140579224,
- "num_tokens": 20346767.0,
- "step": 2274
- },
- {
- "epoch": 1.728723404255319,
- "grad_norm": 2.9770123958587646,
- "learning_rate": 2.08101321448619e-06,
- "loss": 0.4437636733055115,
- "mean_token_accuracy": 0.8398602604866028,
- "num_tokens": 20352306.0,
- "step": 2275
- },
- {
- "epoch": 1.7294832826747721,
- "grad_norm": 3.510955572128296,
- "learning_rate": 2.0789485898236897e-06,
- "loss": 0.3359706401824951,
- "mean_token_accuracy": 0.8872498273849487,
- "num_tokens": 20355560.0,
- "step": 2276
- },
- {
- "epoch": 1.7302431610942248,
- "grad_norm": 2.0873279571533203,
- "learning_rate": 2.076884260671276e-06,
- "loss": 0.38720619678497314,
- "mean_token_accuracy": 0.865881621837616,
- "num_tokens": 20362802.0,
- "step": 2277
- },
- {
- "epoch": 1.731003039513678,
- "grad_norm": 2.4871230125427246,
- "learning_rate": 2.0748202284777775e-06,
- "loss": 0.3250775933265686,
- "mean_token_accuracy": 0.8867610692977905,
- "num_tokens": 20367080.0,
- "step": 2278
- },
- {
- "epoch": 1.7317629179331306,
- "grad_norm": 3.5603582859039307,
- "learning_rate": 2.072756494691809e-06,
- "loss": 0.35600754618644714,
- "mean_token_accuracy": 0.8781189918518066,
- "num_tokens": 20370625.0,
- "step": 2279
- },
- {
- "epoch": 1.7325227963525835,
- "grad_norm": 2.0948755741119385,
- "learning_rate": 2.070693060761779e-06,
- "loss": 0.3558604419231415,
- "mean_token_accuracy": 0.902066707611084,
- "num_tokens": 20376835.0,
- "step": 2280
- },
- {
- "epoch": 1.7332826747720365,
- "grad_norm": 2.391188859939575,
- "learning_rate": 2.0686299281358837e-06,
- "loss": 0.36596938967704773,
- "mean_token_accuracy": 0.8741272687911987,
- "num_tokens": 20382282.0,
- "step": 2281
- },
- {
- "epoch": 1.7340425531914894,
- "grad_norm": 1.6906369924545288,
- "learning_rate": 2.0665670982621107e-06,
- "loss": 0.5241266489028931,
- "mean_token_accuracy": 0.8091107606887817,
- "num_tokens": 20393736.0,
- "step": 2282
- },
- {
- "epoch": 1.7348024316109423,
- "grad_norm": 1.7578394412994385,
- "learning_rate": 2.0645045725882334e-06,
- "loss": 0.37041786313056946,
- "mean_token_accuracy": 0.8907113075256348,
- "num_tokens": 20402715.0,
- "step": 2283
- },
- {
- "epoch": 1.7355623100303952,
- "grad_norm": 2.191727638244629,
- "learning_rate": 2.0624423525618097e-06,
- "loss": 0.43301627039909363,
- "mean_token_accuracy": 0.8706433773040771,
- "num_tokens": 20409976.0,
- "step": 2284
- },
- {
- "epoch": 1.736322188449848,
- "grad_norm": 1.958005666732788,
- "learning_rate": 2.0603804396301875e-06,
- "loss": 0.29002684354782104,
- "mean_token_accuracy": 0.8914110660552979,
- "num_tokens": 20417099.0,
- "step": 2285
- },
- {
- "epoch": 1.7370820668693008,
- "grad_norm": 2.477837085723877,
- "learning_rate": 2.058318835240495e-06,
- "loss": 0.2953898310661316,
- "mean_token_accuracy": 0.8975275754928589,
- "num_tokens": 20422251.0,
- "step": 2286
- },
- {
- "epoch": 1.737841945288754,
- "grad_norm": 2.156764268875122,
- "learning_rate": 2.0562575408396475e-06,
- "loss": 0.4063698649406433,
- "mean_token_accuracy": 0.8497642278671265,
- "num_tokens": 20429338.0,
- "step": 2287
- },
- {
- "epoch": 1.7386018237082066,
- "grad_norm": 1.6748939752578735,
- "learning_rate": 2.0541965578743373e-06,
- "loss": 0.3272587060928345,
- "mean_token_accuracy": 0.8646700382232666,
- "num_tokens": 20439680.0,
- "step": 2288
- },
- {
- "epoch": 1.7393617021276597,
- "grad_norm": 1.9948776960372925,
- "learning_rate": 2.0521358877910446e-06,
- "loss": 0.36843347549438477,
- "mean_token_accuracy": 0.8613901138305664,
- "num_tokens": 20448492.0,
- "step": 2289
- },
- {
- "epoch": 1.7401215805471124,
- "grad_norm": 2.231428623199463,
- "learning_rate": 2.0500755320360263e-06,
- "loss": 0.3905152380466461,
- "mean_token_accuracy": 0.8980990052223206,
- "num_tokens": 20453945.0,
- "step": 2290
- },
- {
- "epoch": 1.7408814589665653,
- "grad_norm": 2.2187650203704834,
- "learning_rate": 2.048015492055319e-06,
- "loss": 0.45920854806900024,
- "mean_token_accuracy": 0.8282852172851562,
- "num_tokens": 20462378.0,
- "step": 2291
- },
- {
- "epoch": 1.7416413373860182,
- "grad_norm": 2.0668466091156006,
- "learning_rate": 2.045955769294737e-06,
- "loss": 0.3227751553058624,
- "mean_token_accuracy": 0.8805934190750122,
- "num_tokens": 20469822.0,
- "step": 2292
- },
- {
- "epoch": 1.7424012158054711,
- "grad_norm": 1.9162774085998535,
- "learning_rate": 2.0438963651998747e-06,
- "loss": 0.4604800343513489,
- "mean_token_accuracy": 0.8441175818443298,
- "num_tokens": 20479099.0,
- "step": 2293
- },
- {
- "epoch": 1.743161094224924,
- "grad_norm": 2.645329713821411,
- "learning_rate": 2.0418372812161015e-06,
- "loss": 0.3239654004573822,
- "mean_token_accuracy": 0.8888648748397827,
- "num_tokens": 20483926.0,
- "step": 2294
- },
- {
- "epoch": 1.743920972644377,
- "grad_norm": 1.39468514919281,
- "learning_rate": 2.03977851878856e-06,
- "loss": 0.4003690183162689,
- "mean_token_accuracy": 0.8769714832305908,
- "num_tokens": 20496501.0,
- "step": 2295
- },
- {
- "epoch": 1.7446808510638299,
- "grad_norm": 3.509174346923828,
- "learning_rate": 2.0377200793621694e-06,
- "loss": 0.2948213517665863,
- "mean_token_accuracy": 0.8972329497337341,
- "num_tokens": 20500000.0,
- "step": 2296
- },
- {
- "epoch": 1.7454407294832825,
- "grad_norm": 1.5033894777297974,
- "learning_rate": 2.0356619643816234e-06,
- "loss": 0.40694737434387207,
- "mean_token_accuracy": 0.8607243895530701,
- "num_tokens": 20513473.0,
- "step": 2297
- },
- {
- "epoch": 1.7462006079027357,
- "grad_norm": 1.4324895143508911,
- "learning_rate": 2.0336041752913843e-06,
- "loss": 0.3899157643318176,
- "mean_token_accuracy": 0.858935534954071,
- "num_tokens": 20524516.0,
- "step": 2298
- },
- {
- "epoch": 1.7469604863221884,
- "grad_norm": 2.359544277191162,
- "learning_rate": 2.031546713535688e-06,
- "loss": 0.369213342666626,
- "mean_token_accuracy": 0.8741403818130493,
- "num_tokens": 20530421.0,
- "step": 2299
- },
- {
- "epoch": 1.7477203647416415,
- "grad_norm": 2.282637357711792,
- "learning_rate": 2.029489580558542e-06,
- "loss": 0.3255441188812256,
- "mean_token_accuracy": 0.9045462608337402,
- "num_tokens": 20535954.0,
- "step": 2300
- },
- {
- "epoch": 1.7484802431610942,
- "grad_norm": 1.7367198467254639,
- "learning_rate": 2.0274327778037204e-06,
- "loss": 0.43890488147735596,
- "mean_token_accuracy": 0.8494667410850525,
- "num_tokens": 20548638.0,
- "step": 2301
- },
- {
- "epoch": 1.749240121580547,
- "grad_norm": 1.6236488819122314,
- "learning_rate": 2.0253763067147657e-06,
- "loss": 0.4440777897834778,
- "mean_token_accuracy": 0.8414230942726135,
- "num_tokens": 20559263.0,
- "step": 2302
- },
- {
- "epoch": 1.75,
- "grad_norm": 1.3755455017089844,
- "learning_rate": 2.0233201687349888e-06,
- "loss": 0.3473797142505646,
- "mean_token_accuracy": 0.8742472529411316,
- "num_tokens": 20573109.0,
- "step": 2303
- },
- {
- "epoch": 1.750759878419453,
- "grad_norm": 3.271153688430786,
- "learning_rate": 2.0212643653074677e-06,
- "loss": 0.4965784549713135,
- "mean_token_accuracy": 0.8596988916397095,
- "num_tokens": 20578525.0,
- "step": 2304
- },
- {
- "epoch": 1.7515197568389058,
- "grad_norm": 2.6341168880462646,
- "learning_rate": 2.019208897875043e-06,
- "loss": 0.37775442004203796,
- "mean_token_accuracy": 0.8721816539764404,
- "num_tokens": 20583641.0,
- "step": 2305
- },
- {
- "epoch": 1.7522796352583585,
- "grad_norm": 1.8308569192886353,
- "learning_rate": 2.0171537678803222e-06,
- "loss": 0.3243415355682373,
- "mean_token_accuracy": 0.8837124109268188,
- "num_tokens": 20591725.0,
- "step": 2306
- },
- {
- "epoch": 1.7530395136778116,
- "grad_norm": 2.4362998008728027,
- "learning_rate": 2.015098976765673e-06,
- "loss": 0.3738787770271301,
- "mean_token_accuracy": 0.8974303007125854,
- "num_tokens": 20596587.0,
- "step": 2307
- },
- {
- "epoch": 1.7537993920972643,
- "grad_norm": 3.2920920848846436,
- "learning_rate": 2.0130445259732282e-06,
- "loss": 0.33901530504226685,
- "mean_token_accuracy": 0.9019063115119934,
- "num_tokens": 20600379.0,
- "step": 2308
- },
- {
- "epoch": 1.7545592705167175,
- "grad_norm": 1.290475606918335,
- "learning_rate": 2.01099041694488e-06,
- "loss": 0.37150678038597107,
- "mean_token_accuracy": 0.8542044758796692,
- "num_tokens": 20614340.0,
- "step": 2309
- },
- {
- "epoch": 1.7553191489361701,
- "grad_norm": 2.7794933319091797,
- "learning_rate": 2.0089366511222815e-06,
- "loss": 0.3746095895767212,
- "mean_token_accuracy": 0.8653185367584229,
- "num_tokens": 20622056.0,
- "step": 2310
- },
- {
- "epoch": 1.756079027355623,
- "grad_norm": 2.2112278938293457,
- "learning_rate": 2.006883229946843e-06,
- "loss": 0.35793858766555786,
- "mean_token_accuracy": 0.875727653503418,
- "num_tokens": 20628930.0,
- "step": 2311
- },
- {
- "epoch": 1.756838905775076,
- "grad_norm": 1.5240603685379028,
- "learning_rate": 2.0048301548597365e-06,
- "loss": 0.512831449508667,
- "mean_token_accuracy": 0.8139172792434692,
- "num_tokens": 20643159.0,
- "step": 2312
- },
- {
- "epoch": 1.7575987841945289,
- "grad_norm": 1.810485601425171,
- "learning_rate": 2.0027774273018894e-06,
- "loss": 0.43870818614959717,
- "mean_token_accuracy": 0.8313089609146118,
- "num_tokens": 20651914.0,
- "step": 2313
- },
- {
- "epoch": 1.7583586626139818,
- "grad_norm": 1.748178243637085,
- "learning_rate": 2.0007250487139827e-06,
- "loss": 0.42277514934539795,
- "mean_token_accuracy": 0.8463197946548462,
- "num_tokens": 20660054.0,
- "step": 2314
- },
- {
- "epoch": 1.7591185410334347,
- "grad_norm": 1.511717677116394,
- "learning_rate": 1.998673020536456e-06,
- "loss": 0.38304439187049866,
- "mean_token_accuracy": 0.8508470058441162,
- "num_tokens": 20673371.0,
- "step": 2315
- },
- {
- "epoch": 1.7598784194528876,
- "grad_norm": 1.7790700197219849,
- "learning_rate": 1.996621344209503e-06,
- "loss": 0.3838311433792114,
- "mean_token_accuracy": 0.8676829934120178,
- "num_tokens": 20682072.0,
- "step": 2316
- },
- {
- "epoch": 1.7606382978723403,
- "grad_norm": 1.9128468036651611,
- "learning_rate": 1.994570021173067e-06,
- "loss": 0.40384364128112793,
- "mean_token_accuracy": 0.8747294545173645,
- "num_tokens": 20689000.0,
- "step": 2317
- },
- {
- "epoch": 1.7613981762917934,
- "grad_norm": 3.286569118499756,
- "learning_rate": 1.9925190528668455e-06,
- "loss": 0.38019680976867676,
- "mean_token_accuracy": 0.8678069114685059,
- "num_tokens": 20692763.0,
- "step": 2318
- },
- {
- "epoch": 1.762158054711246,
- "grad_norm": 1.6108927726745605,
- "learning_rate": 1.990468440730288e-06,
- "loss": 0.3144170045852661,
- "mean_token_accuracy": 0.8695170879364014,
- "num_tokens": 20702620.0,
- "step": 2319
- },
- {
- "epoch": 1.7629179331306992,
- "grad_norm": 3.185225009918213,
- "learning_rate": 1.9884181862025938e-06,
- "loss": 0.41619348526000977,
- "mean_token_accuracy": 0.8543670177459717,
- "num_tokens": 20706857.0,
- "step": 2320
- },
- {
- "epoch": 1.763677811550152,
- "grad_norm": 2.3699469566345215,
- "learning_rate": 1.986368290722709e-06,
- "loss": 0.5115842819213867,
- "mean_token_accuracy": 0.8141909837722778,
- "num_tokens": 20713997.0,
- "step": 2321
- },
- {
- "epoch": 1.7644376899696048,
- "grad_norm": 1.4449706077575684,
- "learning_rate": 1.9843187557293286e-06,
- "loss": 0.419655442237854,
- "mean_token_accuracy": 0.8545533418655396,
- "num_tokens": 20726548.0,
- "step": 2322
- },
- {
- "epoch": 1.7651975683890577,
- "grad_norm": 2.127614974975586,
- "learning_rate": 1.9822695826608975e-06,
- "loss": 0.43722522258758545,
- "mean_token_accuracy": 0.8542283773422241,
- "num_tokens": 20733469.0,
- "step": 2323
- },
- {
- "epoch": 1.7659574468085106,
- "grad_norm": 3.3081557750701904,
- "learning_rate": 1.9802207729556023e-06,
- "loss": 0.30904972553253174,
- "mean_token_accuracy": 0.8896352648735046,
- "num_tokens": 20737190.0,
- "step": 2324
- },
- {
- "epoch": 1.7667173252279635,
- "grad_norm": 2.603506326675415,
- "learning_rate": 1.978172328051377e-06,
- "loss": 0.30952537059783936,
- "mean_token_accuracy": 0.8868587017059326,
- "num_tokens": 20741780.0,
- "step": 2325
- },
- {
- "epoch": 1.7674772036474165,
- "grad_norm": 2.576824903488159,
- "learning_rate": 1.9761242493858987e-06,
- "loss": 0.29593953490257263,
- "mean_token_accuracy": 0.888198733329773,
- "num_tokens": 20746324.0,
- "step": 2326
- },
- {
- "epoch": 1.7682370820668694,
- "grad_norm": 1.6168320178985596,
- "learning_rate": 1.9740765383965894e-06,
- "loss": 0.5093998908996582,
- "mean_token_accuracy": 0.8301646709442139,
- "num_tokens": 20760140.0,
- "step": 2327
- },
- {
- "epoch": 1.768996960486322,
- "grad_norm": 2.1162400245666504,
- "learning_rate": 1.9720291965206097e-06,
- "loss": 0.36714404821395874,
- "mean_token_accuracy": 0.8699671626091003,
- "num_tokens": 20766961.0,
- "step": 2328
- },
- {
- "epoch": 1.7697568389057752,
- "grad_norm": 1.046911597251892,
- "learning_rate": 1.969982225194864e-06,
- "loss": 0.40783989429473877,
- "mean_token_accuracy": 0.8474892377853394,
- "num_tokens": 20786737.0,
- "step": 2329
- },
- {
- "epoch": 1.7705167173252279,
- "grad_norm": 1.7059568166732788,
- "learning_rate": 1.9679356258559943e-06,
- "loss": 0.44083845615386963,
- "mean_token_accuracy": 0.841221034526825,
- "num_tokens": 20798907.0,
- "step": 2330
- },
- {
- "epoch": 1.771276595744681,
- "grad_norm": 1.5157767534255981,
- "learning_rate": 1.9658893999403847e-06,
- "loss": 0.4671107828617096,
- "mean_token_accuracy": 0.8252813816070557,
- "num_tokens": 20814304.0,
- "step": 2331
- },
- {
- "epoch": 1.7720364741641337,
- "grad_norm": 2.1340525150299072,
- "learning_rate": 1.9638435488841543e-06,
- "loss": 0.4088709354400635,
- "mean_token_accuracy": 0.8595127463340759,
- "num_tokens": 20821827.0,
- "step": 2332
- },
- {
- "epoch": 1.7727963525835866,
- "grad_norm": 1.948072910308838,
- "learning_rate": 1.96179807412316e-06,
- "loss": 0.3692860007286072,
- "mean_token_accuracy": 0.8678920269012451,
- "num_tokens": 20828612.0,
- "step": 2333
- },
- {
- "epoch": 1.7735562310030395,
- "grad_norm": 1.5731977224349976,
- "learning_rate": 1.959752977092995e-06,
- "loss": 0.3743135929107666,
- "mean_token_accuracy": 0.8723479509353638,
- "num_tokens": 20838497.0,
- "step": 2334
- },
- {
- "epoch": 1.7743161094224924,
- "grad_norm": 1.5506012439727783,
- "learning_rate": 1.957708259228987e-06,
- "loss": 0.4403391182422638,
- "mean_token_accuracy": 0.854604959487915,
- "num_tokens": 20851603.0,
- "step": 2335
- },
- {
- "epoch": 1.7750759878419453,
- "grad_norm": 1.154336929321289,
- "learning_rate": 1.9556639219661983e-06,
- "loss": 0.5281188488006592,
- "mean_token_accuracy": 0.8101300001144409,
- "num_tokens": 20875661.0,
- "step": 2336
- },
- {
- "epoch": 1.7758358662613982,
- "grad_norm": 4.720771312713623,
- "learning_rate": 1.9536199667394217e-06,
- "loss": 0.44419822096824646,
- "mean_token_accuracy": 0.8740090131759644,
- "num_tokens": 20886971.0,
- "step": 2337
- },
- {
- "epoch": 1.7765957446808511,
- "grad_norm": 1.5492230653762817,
- "learning_rate": 1.9515763949831852e-06,
- "loss": 0.4538637697696686,
- "mean_token_accuracy": 0.8362185955047607,
- "num_tokens": 20899212.0,
- "step": 2338
- },
- {
- "epoch": 1.7773556231003038,
- "grad_norm": 1.354101538658142,
- "learning_rate": 1.9495332081317466e-06,
- "loss": 0.4341534376144409,
- "mean_token_accuracy": 0.8380170464515686,
- "num_tokens": 20913065.0,
- "step": 2339
- },
- {
- "epoch": 1.778115501519757,
- "grad_norm": 1.5805599689483643,
- "learning_rate": 1.947490407619092e-06,
- "loss": 0.40928739309310913,
- "mean_token_accuracy": 0.8524469137191772,
- "num_tokens": 20922919.0,
- "step": 2340
- },
- {
- "epoch": 1.7788753799392096,
- "grad_norm": 2.097221851348877,
- "learning_rate": 1.945447994878937e-06,
- "loss": 0.4816104769706726,
- "mean_token_accuracy": 0.888654351234436,
- "num_tokens": 20931350.0,
- "step": 2341
- },
- {
- "epoch": 1.7796352583586628,
- "grad_norm": 1.7193297147750854,
- "learning_rate": 1.9434059713447264e-06,
- "loss": 0.44925639033317566,
- "mean_token_accuracy": 0.8500319123268127,
- "num_tokens": 20940546.0,
- "step": 2342
- },
- {
- "epoch": 1.7803951367781155,
- "grad_norm": 1.5971747636795044,
- "learning_rate": 1.9413643384496315e-06,
- "loss": 0.29559412598609924,
- "mean_token_accuracy": 0.8871279954910278,
- "num_tokens": 20950604.0,
- "step": 2343
- },
- {
- "epoch": 1.7811550151975684,
- "grad_norm": 2.788029670715332,
- "learning_rate": 1.9393230976265478e-06,
- "loss": 0.31713539361953735,
- "mean_token_accuracy": 0.8866176605224609,
- "num_tokens": 20955296.0,
- "step": 2344
- },
- {
- "epoch": 1.7819148936170213,
- "grad_norm": 1.5747952461242676,
- "learning_rate": 1.937282250308096e-06,
- "loss": 0.41813358664512634,
- "mean_token_accuracy": 0.8418053984642029,
- "num_tokens": 20967664.0,
- "step": 2345
- },
- {
- "epoch": 1.7826747720364742,
- "grad_norm": 2.0813145637512207,
- "learning_rate": 1.935241797926623e-06,
- "loss": 0.39056286215782166,
- "mean_token_accuracy": 0.8601781129837036,
- "num_tokens": 20975895.0,
- "step": 2346
- },
- {
- "epoch": 1.783434650455927,
- "grad_norm": 2.143022298812866,
- "learning_rate": 1.933201741914196e-06,
- "loss": 0.40797823667526245,
- "mean_token_accuracy": 0.8846398591995239,
- "num_tokens": 20983683.0,
- "step": 2347
- },
- {
- "epoch": 1.78419452887538,
- "grad_norm": 1.8451775312423706,
- "learning_rate": 1.931162083702606e-06,
- "loss": 0.34083136916160583,
- "mean_token_accuracy": 0.8643462657928467,
- "num_tokens": 20992621.0,
- "step": 2348
- },
- {
- "epoch": 1.784954407294833,
- "grad_norm": 1.8603935241699219,
- "learning_rate": 1.9291228247233607e-06,
- "loss": 0.4860231280326843,
- "mean_token_accuracy": 0.8391251564025879,
- "num_tokens": 21002427.0,
- "step": 2349
- },
- {
- "epoch": 1.7857142857142856,
- "grad_norm": 2.751711845397949,
- "learning_rate": 1.9270839664076937e-06,
- "loss": 0.30588358640670776,
- "mean_token_accuracy": 0.8836315274238586,
- "num_tokens": 21006898.0,
- "step": 2350
- },
- {
- "epoch": 1.7864741641337387,
- "grad_norm": 1.0335345268249512,
- "learning_rate": 1.9250455101865526e-06,
- "loss": 0.3119634985923767,
- "mean_token_accuracy": 0.8912283182144165,
- "num_tokens": 21024930.0,
- "step": 2351
- },
- {
- "epoch": 1.7872340425531914,
- "grad_norm": 2.4693806171417236,
- "learning_rate": 1.9230074574906043e-06,
- "loss": 0.1976669877767563,
- "mean_token_accuracy": 0.928974986076355,
- "num_tokens": 21029027.0,
- "step": 2352
- },
- {
- "epoch": 1.7879939209726445,
- "grad_norm": 1.2892690896987915,
- "learning_rate": 1.920969809750234e-06,
- "loss": 0.46008217334747314,
- "mean_token_accuracy": 0.8299605846405029,
- "num_tokens": 21047671.0,
- "step": 2353
- },
- {
- "epoch": 1.7887537993920972,
- "grad_norm": 3.162534713745117,
- "learning_rate": 1.91893256839554e-06,
- "loss": 0.2916071116924286,
- "mean_token_accuracy": 0.8932807445526123,
- "num_tokens": 21051555.0,
- "step": 2354
- },
- {
- "epoch": 1.7895136778115501,
- "grad_norm": 1.7627713680267334,
- "learning_rate": 1.916895734856338e-06,
- "loss": 0.3223535120487213,
- "mean_token_accuracy": 0.8852578401565552,
- "num_tokens": 21060056.0,
- "step": 2355
- },
- {
- "epoch": 1.790273556231003,
- "grad_norm": 1.9448071718215942,
- "learning_rate": 1.9148593105621542e-06,
- "loss": 0.3650452196598053,
- "mean_token_accuracy": 0.8709862232208252,
- "num_tokens": 21067190.0,
- "step": 2356
- },
- {
- "epoch": 1.791033434650456,
- "grad_norm": 2.026644229888916,
- "learning_rate": 1.9128232969422318e-06,
- "loss": 0.3620566427707672,
- "mean_token_accuracy": 0.865707516670227,
- "num_tokens": 21075197.0,
- "step": 2357
- },
- {
- "epoch": 1.7917933130699089,
- "grad_norm": 2.2628564834594727,
- "learning_rate": 1.9107876954255217e-06,
- "loss": 0.353444367647171,
- "mean_token_accuracy": 0.8590385913848877,
- "num_tokens": 21080823.0,
- "step": 2358
- },
- {
- "epoch": 1.7925531914893615,
- "grad_norm": 2.5959067344665527,
- "learning_rate": 1.908752507440689e-06,
- "loss": 0.43711763620376587,
- "mean_token_accuracy": 0.8539710640907288,
- "num_tokens": 21086016.0,
- "step": 2359
- },
- {
- "epoch": 1.7933130699088147,
- "grad_norm": 1.6228864192962646,
- "learning_rate": 1.906717734416105e-06,
- "loss": 0.38630396127700806,
- "mean_token_accuracy": 0.8611987829208374,
- "num_tokens": 21096573.0,
- "step": 2360
- },
- {
- "epoch": 1.7940729483282674,
- "grad_norm": 1.8471404314041138,
- "learning_rate": 1.9046833777798534e-06,
- "loss": 0.46608641743659973,
- "mean_token_accuracy": 0.8782031536102295,
- "num_tokens": 21105817.0,
- "step": 2361
- },
- {
- "epoch": 1.7948328267477205,
- "grad_norm": 2.6532235145568848,
- "learning_rate": 1.9026494389597239e-06,
- "loss": 0.3310372829437256,
- "mean_token_accuracy": 0.8781720399856567,
- "num_tokens": 21111192.0,
- "step": 2362
- },
- {
- "epoch": 1.7955927051671732,
- "grad_norm": 2.172534942626953,
- "learning_rate": 1.9006159193832124e-06,
- "loss": 0.49921661615371704,
- "mean_token_accuracy": 0.8215196132659912,
- "num_tokens": 21117878.0,
- "step": 2363
- },
- {
- "epoch": 1.7963525835866263,
- "grad_norm": 1.6507720947265625,
- "learning_rate": 1.8985828204775206e-06,
- "loss": 0.4189162850379944,
- "mean_token_accuracy": 0.8520572185516357,
- "num_tokens": 21128287.0,
- "step": 2364
- },
- {
- "epoch": 1.797112462006079,
- "grad_norm": 1.5932034254074097,
- "learning_rate": 1.8965501436695578e-06,
- "loss": 0.45531854033470154,
- "mean_token_accuracy": 0.8391242027282715,
- "num_tokens": 21140605.0,
- "step": 2365
- },
- {
- "epoch": 1.797872340425532,
- "grad_norm": 2.4680638313293457,
- "learning_rate": 1.894517890385933e-06,
- "loss": 0.41174983978271484,
- "mean_token_accuracy": 0.8616886138916016,
- "num_tokens": 21147045.0,
- "step": 2366
- },
- {
- "epoch": 1.7986322188449848,
- "grad_norm": 1.61875319480896,
- "learning_rate": 1.8924860620529594e-06,
- "loss": 0.47573935985565186,
- "mean_token_accuracy": 0.8347671031951904,
- "num_tokens": 21157253.0,
- "step": 2367
- },
- {
- "epoch": 1.7993920972644377,
- "grad_norm": 3.4389333724975586,
- "learning_rate": 1.8904546600966539e-06,
- "loss": 0.34975939989089966,
- "mean_token_accuracy": 0.8915865421295166,
- "num_tokens": 21160486.0,
- "step": 2368
- },
- {
- "epoch": 1.8001519756838906,
- "grad_norm": 2.0069527626037598,
- "learning_rate": 1.888423685942732e-06,
- "loss": 0.379585325717926,
- "mean_token_accuracy": 0.8605983257293701,
- "num_tokens": 21168016.0,
- "step": 2369
- },
- {
- "epoch": 1.8009118541033433,
- "grad_norm": 3.0740530490875244,
- "learning_rate": 1.886393141016609e-06,
- "loss": 0.5244829058647156,
- "mean_token_accuracy": 0.8282772302627563,
- "num_tokens": 21172851.0,
- "step": 2370
- },
- {
- "epoch": 1.8016717325227964,
- "grad_norm": 1.5724968910217285,
- "learning_rate": 1.8843630267434e-06,
- "loss": 0.2020694762468338,
- "mean_token_accuracy": 0.8882503509521484,
- "num_tokens": 21179866.0,
- "step": 2371
- },
- {
- "epoch": 1.8024316109422491,
- "grad_norm": 2.1539509296417236,
- "learning_rate": 1.8823333445479175e-06,
- "loss": 0.37903186678886414,
- "mean_token_accuracy": 0.8525497317314148,
- "num_tokens": 21186941.0,
- "step": 2372
- },
- {
- "epoch": 1.8031914893617023,
- "grad_norm": 2.0247764587402344,
- "learning_rate": 1.8803040958546708e-06,
- "loss": 0.293364018201828,
- "mean_token_accuracy": 0.8954306244850159,
- "num_tokens": 21193659.0,
- "step": 2373
- },
- {
- "epoch": 1.803951367781155,
- "grad_norm": 1.7034926414489746,
- "learning_rate": 1.8782752820878636e-06,
- "loss": 0.33828210830688477,
- "mean_token_accuracy": 0.9032940864562988,
- "num_tokens": 21201399.0,
- "step": 2374
- },
- {
- "epoch": 1.8047112462006079,
- "grad_norm": 1.7864601612091064,
- "learning_rate": 1.8762469046713954e-06,
- "loss": 0.3165147006511688,
- "mean_token_accuracy": 0.8997465372085571,
- "num_tokens": 21209105.0,
- "step": 2375
- },
- {
- "epoch": 1.8054711246200608,
- "grad_norm": 2.3371729850769043,
- "learning_rate": 1.8742189650288617e-06,
- "loss": 0.4036901593208313,
- "mean_token_accuracy": 0.8549420833587646,
- "num_tokens": 21215429.0,
- "step": 2376
- },
- {
- "epoch": 1.8062310030395137,
- "grad_norm": 1.7922348976135254,
- "learning_rate": 1.872191464583547e-06,
- "loss": 0.4366671144962311,
- "mean_token_accuracy": 0.8614166975021362,
- "num_tokens": 21226823.0,
- "step": 2377
- },
- {
- "epoch": 1.8069908814589666,
- "grad_norm": 2.1667943000793457,
- "learning_rate": 1.8701644047584294e-06,
- "loss": 0.3543647825717926,
- "mean_token_accuracy": 0.9031318426132202,
- "num_tokens": 21232823.0,
- "step": 2378
- },
- {
- "epoch": 1.8077507598784195,
- "grad_norm": 1.7554421424865723,
- "learning_rate": 1.868137786976177e-06,
- "loss": 0.32704365253448486,
- "mean_token_accuracy": 0.8990532755851746,
- "num_tokens": 21242036.0,
- "step": 2379
- },
- {
- "epoch": 1.8085106382978724,
- "grad_norm": 1.6723839044570923,
- "learning_rate": 1.8661116126591492e-06,
- "loss": 0.3665752410888672,
- "mean_token_accuracy": 0.8828305006027222,
- "num_tokens": 21251290.0,
- "step": 2380
- },
- {
- "epoch": 1.809270516717325,
- "grad_norm": 1.5078409910202026,
- "learning_rate": 1.8640858832293924e-06,
- "loss": 0.368108332157135,
- "mean_token_accuracy": 0.8720884323120117,
- "num_tokens": 21263510.0,
- "step": 2381
- },
- {
- "epoch": 1.8100303951367782,
- "grad_norm": 2.245553493499756,
- "learning_rate": 1.8620606001086423e-06,
- "loss": 0.3189915716648102,
- "mean_token_accuracy": 0.9015103578567505,
- "num_tokens": 21269690.0,
- "step": 2382
- },
- {
- "epoch": 1.810790273556231,
- "grad_norm": 1.780027151107788,
- "learning_rate": 1.8600357647183188e-06,
- "loss": 0.40369710326194763,
- "mean_token_accuracy": 0.8539618253707886,
- "num_tokens": 21278523.0,
- "step": 2383
- },
- {
- "epoch": 1.811550151975684,
- "grad_norm": 2.1727912425994873,
- "learning_rate": 1.8580113784795306e-06,
- "loss": 0.29285651445388794,
- "mean_token_accuracy": 0.8954071998596191,
- "num_tokens": 21284717.0,
- "step": 2384
- },
- {
- "epoch": 1.8123100303951367,
- "grad_norm": 2.310225248336792,
- "learning_rate": 1.8559874428130708e-06,
- "loss": 0.3090948760509491,
- "mean_token_accuracy": 0.8853784203529358,
- "num_tokens": 21290484.0,
- "step": 2385
- },
- {
- "epoch": 1.8130699088145896,
- "grad_norm": 1.6556873321533203,
- "learning_rate": 1.8539639591394131e-06,
- "loss": 0.4425269663333893,
- "mean_token_accuracy": 0.8488757610321045,
- "num_tokens": 21302588.0,
- "step": 2386
- },
- {
- "epoch": 1.8138297872340425,
- "grad_norm": 1.9238256216049194,
- "learning_rate": 1.8519409288787182e-06,
- "loss": 0.4781329929828644,
- "mean_token_accuracy": 0.8392970561981201,
- "num_tokens": 21310598.0,
- "step": 2387
- },
- {
- "epoch": 1.8145896656534954,
- "grad_norm": 1.4976142644882202,
- "learning_rate": 1.8499183534508263e-06,
- "loss": 0.36829859018325806,
- "mean_token_accuracy": 0.8687542676925659,
- "num_tokens": 21322668.0,
- "step": 2388
- },
- {
- "epoch": 1.8153495440729484,
- "grad_norm": 2.0216941833496094,
- "learning_rate": 1.8478962342752584e-06,
- "loss": 0.385962575674057,
- "mean_token_accuracy": 0.8908089399337769,
- "num_tokens": 21330378.0,
- "step": 2389
- },
- {
- "epoch": 1.8161094224924013,
- "grad_norm": 1.647863507270813,
- "learning_rate": 1.8458745727712142e-06,
- "loss": 0.30903705954551697,
- "mean_token_accuracy": 0.8914397954940796,
- "num_tokens": 21339932.0,
- "step": 2390
- },
- {
- "epoch": 1.8168693009118542,
- "grad_norm": 1.5832399129867554,
- "learning_rate": 1.8438533703575757e-06,
- "loss": 0.3636384606361389,
- "mean_token_accuracy": 0.8611595630645752,
- "num_tokens": 21351557.0,
- "step": 2391
- },
- {
- "epoch": 1.8176291793313069,
- "grad_norm": 3.0069241523742676,
- "learning_rate": 1.8418326284528997e-06,
- "loss": 0.37970617413520813,
- "mean_token_accuracy": 0.8620643615722656,
- "num_tokens": 21355704.0,
- "step": 2392
- },
- {
- "epoch": 1.81838905775076,
- "grad_norm": 2.004526376724243,
- "learning_rate": 1.8398123484754204e-06,
- "loss": 0.5333225131034851,
- "mean_token_accuracy": 0.8062554597854614,
- "num_tokens": 21364640.0,
- "step": 2393
- },
- {
- "epoch": 1.8191489361702127,
- "grad_norm": 1.449981689453125,
- "learning_rate": 1.8377925318430478e-06,
- "loss": 0.3736325800418854,
- "mean_token_accuracy": 0.858788251876831,
- "num_tokens": 21377025.0,
- "step": 2394
- },
- {
- "epoch": 1.8199088145896658,
- "grad_norm": 1.1959524154663086,
- "learning_rate": 1.8357731799733686e-06,
- "loss": 0.3272058963775635,
- "mean_token_accuracy": 0.8840590715408325,
- "num_tokens": 21395378.0,
- "step": 2395
- },
- {
- "epoch": 1.8206686930091185,
- "grad_norm": 2.134742498397827,
- "learning_rate": 1.8337542942836406e-06,
- "loss": 0.3737856149673462,
- "mean_token_accuracy": 0.8674061298370361,
- "num_tokens": 21402106.0,
- "step": 2396
- },
- {
- "epoch": 1.8214285714285714,
- "grad_norm": 2.2179460525512695,
- "learning_rate": 1.8317358761907945e-06,
- "loss": 0.37301796674728394,
- "mean_token_accuracy": 0.8605623245239258,
- "num_tokens": 21408367.0,
- "step": 2397
- },
- {
- "epoch": 1.8221884498480243,
- "grad_norm": 2.1718010902404785,
- "learning_rate": 1.8297179271114345e-06,
- "loss": 0.2772231101989746,
- "mean_token_accuracy": 0.8997501730918884,
- "num_tokens": 21414274.0,
- "step": 2398
- },
- {
- "epoch": 1.8229483282674772,
- "grad_norm": 1.410933494567871,
- "learning_rate": 1.827700448461836e-06,
- "loss": 0.4834601581096649,
- "mean_token_accuracy": 0.8382522463798523,
- "num_tokens": 21429120.0,
- "step": 2399
- },
- {
- "epoch": 1.8237082066869301,
- "grad_norm": 3.4779679775238037,
- "learning_rate": 1.8256834416579423e-06,
- "loss": 0.44643428921699524,
- "mean_token_accuracy": 0.8308249711990356,
- "num_tokens": 21432437.0,
- "step": 2400
- },
- {
- "epoch": 1.824468085106383,
- "grad_norm": 1.374484658241272,
- "learning_rate": 1.8236669081153657e-06,
- "loss": 0.3947869837284088,
- "mean_token_accuracy": 0.8605848550796509,
- "num_tokens": 21445656.0,
- "step": 2401
- },
- {
- "epoch": 1.825227963525836,
- "grad_norm": 1.9599316120147705,
- "learning_rate": 1.8216508492493887e-06,
- "loss": 0.49040719866752625,
- "mean_token_accuracy": 0.839459240436554,
- "num_tokens": 21452889.0,
- "step": 2402
- },
- {
- "epoch": 1.8259878419452886,
- "grad_norm": 2.1267881393432617,
- "learning_rate": 1.8196352664749578e-06,
- "loss": 0.3233179450035095,
- "mean_token_accuracy": 0.8841243386268616,
- "num_tokens": 21458788.0,
- "step": 2403
- },
- {
- "epoch": 1.8267477203647418,
- "grad_norm": 2.6356115341186523,
- "learning_rate": 1.8176201612066874e-06,
- "loss": 0.43436336517333984,
- "mean_token_accuracy": 0.850265622138977,
- "num_tokens": 21464305.0,
- "step": 2404
- },
- {
- "epoch": 1.8275075987841944,
- "grad_norm": 2.0232386589050293,
- "learning_rate": 1.8156055348588548e-06,
- "loss": 0.37281763553619385,
- "mean_token_accuracy": 0.8616300821304321,
- "num_tokens": 21471722.0,
- "step": 2405
- },
- {
- "epoch": 1.8282674772036476,
- "grad_norm": 3.2616260051727295,
- "learning_rate": 1.8135913888454034e-06,
- "loss": 0.2882898151874542,
- "mean_token_accuracy": 0.9001147747039795,
- "num_tokens": 21475400.0,
- "step": 2406
- },
- {
- "epoch": 1.8290273556231003,
- "grad_norm": 2.1665611267089844,
- "learning_rate": 1.8115777245799383e-06,
- "loss": 0.45269185304641724,
- "mean_token_accuracy": 0.8420798778533936,
- "num_tokens": 21481827.0,
- "step": 2407
- },
- {
- "epoch": 1.8297872340425532,
- "grad_norm": 1.4406569004058838,
- "learning_rate": 1.8095645434757261e-06,
- "loss": 0.43665701150894165,
- "mean_token_accuracy": 0.8401381969451904,
- "num_tokens": 21496441.0,
- "step": 2408
- },
- {
- "epoch": 1.830547112462006,
- "grad_norm": 1.6756342649459839,
- "learning_rate": 1.8075518469456944e-06,
- "loss": 0.3521783947944641,
- "mean_token_accuracy": 0.8737466335296631,
- "num_tokens": 21505568.0,
- "step": 2409
- },
- {
- "epoch": 1.831306990881459,
- "grad_norm": 1.6623140573501587,
- "learning_rate": 1.8055396364024318e-06,
- "loss": 0.344537615776062,
- "mean_token_accuracy": 0.886972188949585,
- "num_tokens": 21513252.0,
- "step": 2410
- },
- {
- "epoch": 1.832066869300912,
- "grad_norm": 2.064835548400879,
- "learning_rate": 1.803527913258186e-06,
- "loss": 0.3252706229686737,
- "mean_token_accuracy": 0.885245680809021,
- "num_tokens": 21520242.0,
- "step": 2411
- },
- {
- "epoch": 1.8328267477203646,
- "grad_norm": 1.9969112873077393,
- "learning_rate": 1.8015166789248606e-06,
- "loss": 0.34694376587867737,
- "mean_token_accuracy": 0.8818766474723816,
- "num_tokens": 21527524.0,
- "step": 2412
- },
- {
- "epoch": 1.8335866261398177,
- "grad_norm": 2.086148977279663,
- "learning_rate": 1.7995059348140165e-06,
- "loss": 0.23109188675880432,
- "mean_token_accuracy": 0.912773609161377,
- "num_tokens": 21532829.0,
- "step": 2413
- },
- {
- "epoch": 1.8343465045592704,
- "grad_norm": 1.80828058719635,
- "learning_rate": 1.7974956823368728e-06,
- "loss": 0.5422223210334778,
- "mean_token_accuracy": 0.8058640956878662,
- "num_tokens": 21544440.0,
- "step": 2414
- },
- {
- "epoch": 1.8351063829787235,
- "grad_norm": 1.8121788501739502,
- "learning_rate": 1.7954859229043017e-06,
- "loss": 0.3674035668373108,
- "mean_token_accuracy": 0.8628277778625488,
- "num_tokens": 21553160.0,
- "step": 2415
- },
- {
- "epoch": 1.8358662613981762,
- "grad_norm": 1.9307979345321655,
- "learning_rate": 1.7934766579268292e-06,
- "loss": 0.4528796672821045,
- "mean_token_accuracy": 0.8328302502632141,
- "num_tokens": 21563485.0,
- "step": 2416
- },
- {
- "epoch": 1.8366261398176293,
- "grad_norm": 1.2312756776809692,
- "learning_rate": 1.7914678888146347e-06,
- "loss": 0.40424543619155884,
- "mean_token_accuracy": 0.8571025133132935,
- "num_tokens": 21582662.0,
- "step": 2417
- },
- {
- "epoch": 1.837386018237082,
- "grad_norm": 1.6305770874023438,
- "learning_rate": 1.7894596169775514e-06,
- "loss": 0.36575305461883545,
- "mean_token_accuracy": 0.8768579959869385,
- "num_tokens": 21592930.0,
- "step": 2418
- },
- {
- "epoch": 1.838145896656535,
- "grad_norm": 1.8107178211212158,
- "learning_rate": 1.7874518438250598e-06,
- "loss": 0.3260963261127472,
- "mean_token_accuracy": 0.896018385887146,
- "num_tokens": 21600509.0,
- "step": 2419
- },
- {
- "epoch": 1.8389057750759878,
- "grad_norm": 2.7195847034454346,
- "learning_rate": 1.785444570766293e-06,
- "loss": 0.2728347182273865,
- "mean_token_accuracy": 0.9178709983825684,
- "num_tokens": 21604489.0,
- "step": 2420
- },
- {
- "epoch": 1.8396656534954408,
- "grad_norm": 1.9783591032028198,
- "learning_rate": 1.7834377992100332e-06,
- "loss": 0.3136378526687622,
- "mean_token_accuracy": 0.8844017386436462,
- "num_tokens": 21612060.0,
- "step": 2421
- },
- {
- "epoch": 1.8404255319148937,
- "grad_norm": 2.1911418437957764,
- "learning_rate": 1.7814315305647095e-06,
- "loss": 0.39013993740081787,
- "mean_token_accuracy": 0.8688976764678955,
- "num_tokens": 21618778.0,
- "step": 2422
- },
- {
- "epoch": 1.8411854103343464,
- "grad_norm": 1.9143604040145874,
- "learning_rate": 1.779425766238398e-06,
- "loss": 0.5113036632537842,
- "mean_token_accuracy": 0.8329141139984131,
- "num_tokens": 21628976.0,
- "step": 2423
- },
- {
- "epoch": 1.8419452887537995,
- "grad_norm": 1.4184197187423706,
- "learning_rate": 1.7774205076388207e-06,
- "loss": 0.3821067810058594,
- "mean_token_accuracy": 0.8604007959365845,
- "num_tokens": 21643145.0,
- "step": 2424
- },
- {
- "epoch": 1.8427051671732522,
- "grad_norm": 2.45896577835083,
- "learning_rate": 1.7754157561733476e-06,
- "loss": 0.3004961311817169,
- "mean_token_accuracy": 0.89884352684021,
- "num_tokens": 21647441.0,
- "step": 2425
- },
- {
- "epoch": 1.8434650455927053,
- "grad_norm": 1.7999277114868164,
- "learning_rate": 1.7734115132489887e-06,
- "loss": 0.42533132433891296,
- "mean_token_accuracy": 0.8838746547698975,
- "num_tokens": 21657445.0,
- "step": 2426
- },
- {
- "epoch": 1.844224924012158,
- "grad_norm": 2.099728584289551,
- "learning_rate": 1.7714077802723994e-06,
- "loss": 0.36200380325317383,
- "mean_token_accuracy": 0.86548912525177,
- "num_tokens": 21663966.0,
- "step": 2427
- },
- {
- "epoch": 1.844984802431611,
- "grad_norm": 2.1970369815826416,
- "learning_rate": 1.7694045586498754e-06,
- "loss": 0.34944331645965576,
- "mean_token_accuracy": 0.8670865297317505,
- "num_tokens": 21670051.0,
- "step": 2428
- },
- {
- "epoch": 1.8457446808510638,
- "grad_norm": 2.2928519248962402,
- "learning_rate": 1.7674018497873568e-06,
- "loss": 0.39500880241394043,
- "mean_token_accuracy": 0.8744652271270752,
- "num_tokens": 21676054.0,
- "step": 2429
- },
- {
- "epoch": 1.8465045592705167,
- "grad_norm": 1.7598960399627686,
- "learning_rate": 1.7653996550904208e-06,
- "loss": 0.40113672614097595,
- "mean_token_accuracy": 0.8552819490432739,
- "num_tokens": 21685514.0,
- "step": 2430
- },
- {
- "epoch": 1.8472644376899696,
- "grad_norm": 2.0529749393463135,
- "learning_rate": 1.7633979759642844e-06,
- "loss": 0.47586584091186523,
- "mean_token_accuracy": 0.8412872552871704,
- "num_tokens": 21693282.0,
- "step": 2431
- },
- {
- "epoch": 1.8480243161094225,
- "grad_norm": 2.2423181533813477,
- "learning_rate": 1.7613968138138027e-06,
- "loss": 0.2757381796836853,
- "mean_token_accuracy": 0.8992017507553101,
- "num_tokens": 21698439.0,
- "step": 2432
- },
- {
- "epoch": 1.8487841945288754,
- "grad_norm": 1.3280467987060547,
- "learning_rate": 1.7593961700434692e-06,
- "loss": 0.29535043239593506,
- "mean_token_accuracy": 0.8943840861320496,
- "num_tokens": 21711823.0,
- "step": 2433
- },
- {
- "epoch": 1.8495440729483281,
- "grad_norm": 2.589221715927124,
- "learning_rate": 1.7573960460574133e-06,
- "loss": 0.46775516867637634,
- "mean_token_accuracy": 0.8654797673225403,
- "num_tokens": 21717180.0,
- "step": 2434
- },
- {
- "epoch": 1.8503039513677813,
- "grad_norm": 2.1137642860412598,
- "learning_rate": 1.7553964432593976e-06,
- "loss": 0.3808780610561371,
- "mean_token_accuracy": 0.8759565353393555,
- "num_tokens": 21723980.0,
- "step": 2435
- },
- {
- "epoch": 1.851063829787234,
- "grad_norm": 2.386967182159424,
- "learning_rate": 1.75339736305282e-06,
- "loss": 0.42688336968421936,
- "mean_token_accuracy": 0.8488960266113281,
- "num_tokens": 21730411.0,
- "step": 2436
- },
- {
- "epoch": 1.851823708206687,
- "grad_norm": 1.586552619934082,
- "learning_rate": 1.7513988068407145e-06,
- "loss": 0.33497530221939087,
- "mean_token_accuracy": 0.8809621334075928,
- "num_tokens": 21740228.0,
- "step": 2437
- },
- {
- "epoch": 1.8525835866261398,
- "grad_norm": 2.107167959213257,
- "learning_rate": 1.7494007760257428e-06,
- "loss": 0.3801528513431549,
- "mean_token_accuracy": 0.8666986227035522,
- "num_tokens": 21746718.0,
- "step": 2438
- },
- {
- "epoch": 1.8533434650455927,
- "grad_norm": 2.514514684677124,
- "learning_rate": 1.7474032720101991e-06,
- "loss": 0.285498708486557,
- "mean_token_accuracy": 0.901540219783783,
- "num_tokens": 21751009.0,
- "step": 2439
- },
- {
- "epoch": 1.8541033434650456,
- "grad_norm": 1.8152034282684326,
- "learning_rate": 1.7454062961960102e-06,
- "loss": 0.3704795241355896,
- "mean_token_accuracy": 0.8630262613296509,
- "num_tokens": 21760164.0,
- "step": 2440
- },
- {
- "epoch": 1.8548632218844985,
- "grad_norm": 2.714531183242798,
- "learning_rate": 1.7434098499847308e-06,
- "loss": 0.5070809125900269,
- "mean_token_accuracy": 0.8408594131469727,
- "num_tokens": 21765602.0,
- "step": 2441
- },
- {
- "epoch": 1.8556231003039514,
- "grad_norm": 2.173832893371582,
- "learning_rate": 1.7414139347775423e-06,
- "loss": 0.3500945568084717,
- "mean_token_accuracy": 0.8733699321746826,
- "num_tokens": 21772029.0,
- "step": 2442
- },
- {
- "epoch": 1.8563829787234043,
- "grad_norm": 1.580376148223877,
- "learning_rate": 1.7394185519752546e-06,
- "loss": 0.5137908458709717,
- "mean_token_accuracy": 0.8141944408416748,
- "num_tokens": 21784531.0,
- "step": 2443
- },
- {
- "epoch": 1.8571428571428572,
- "grad_norm": 2.079318046569824,
- "learning_rate": 1.7374237029783064e-06,
- "loss": 0.41820770502090454,
- "mean_token_accuracy": 0.8513275384902954,
- "num_tokens": 21792047.0,
- "step": 2444
- },
- {
- "epoch": 1.85790273556231,
- "grad_norm": 2.6890387535095215,
- "learning_rate": 1.7354293891867582e-06,
- "loss": 0.3810037672519684,
- "mean_token_accuracy": 0.8790096044540405,
- "num_tokens": 21796634.0,
- "step": 2445
- },
- {
- "epoch": 1.858662613981763,
- "grad_norm": 2.161081552505493,
- "learning_rate": 1.7334356120002956e-06,
- "loss": 0.48064762353897095,
- "mean_token_accuracy": 0.8329977989196777,
- "num_tokens": 21803509.0,
- "step": 2446
- },
- {
- "epoch": 1.8594224924012157,
- "grad_norm": 1.9201551675796509,
- "learning_rate": 1.7314423728182283e-06,
- "loss": 0.36369895935058594,
- "mean_token_accuracy": 0.8713955879211426,
- "num_tokens": 21810528.0,
- "step": 2447
- },
- {
- "epoch": 1.8601823708206688,
- "grad_norm": 1.8095223903656006,
- "learning_rate": 1.7294496730394897e-06,
- "loss": 0.41493499279022217,
- "mean_token_accuracy": 0.855312705039978,
- "num_tokens": 21821176.0,
- "step": 2448
- },
- {
- "epoch": 1.8609422492401215,
- "grad_norm": 2.172389507293701,
- "learning_rate": 1.7274575140626318e-06,
- "loss": 0.3467463552951813,
- "mean_token_accuracy": 0.8801594972610474,
- "num_tokens": 21827486.0,
- "step": 2449
- },
- {
- "epoch": 1.8617021276595744,
- "grad_norm": 2.8139185905456543,
- "learning_rate": 1.7254658972858293e-06,
- "loss": 0.35121995210647583,
- "mean_token_accuracy": 0.8741901516914368,
- "num_tokens": 21831915.0,
- "step": 2450
- },
- {
- "epoch": 1.8624620060790273,
- "grad_norm": 1.2572762966156006,
- "learning_rate": 1.7234748241068742e-06,
- "loss": 0.3775328993797302,
- "mean_token_accuracy": 0.8547425866127014,
- "num_tokens": 21849623.0,
- "step": 2451
- },
- {
- "epoch": 1.8632218844984803,
- "grad_norm": 1.2357900142669678,
- "learning_rate": 1.7214842959231796e-06,
- "loss": 0.28715917468070984,
- "mean_token_accuracy": 0.9034290313720703,
- "num_tokens": 21864507.0,
- "step": 2452
- },
- {
- "epoch": 1.8639817629179332,
- "grad_norm": 1.2349165678024292,
- "learning_rate": 1.719494314131775e-06,
- "loss": 0.27918580174446106,
- "mean_token_accuracy": 0.9073119759559631,
- "num_tokens": 21878519.0,
- "step": 2453
- },
- {
- "epoch": 1.864741641337386,
- "grad_norm": 1.960353136062622,
- "learning_rate": 1.7175048801293042e-06,
- "loss": 0.49304282665252686,
- "mean_token_accuracy": 0.8193954229354858,
- "num_tokens": 21886861.0,
- "step": 2454
- },
- {
- "epoch": 1.865501519756839,
- "grad_norm": 1.480118751525879,
- "learning_rate": 1.7155159953120315e-06,
- "loss": 0.39433127641677856,
- "mean_token_accuracy": 0.8674266338348389,
- "num_tokens": 21899131.0,
- "step": 2455
- },
- {
- "epoch": 1.8662613981762917,
- "grad_norm": 2.3136367797851562,
- "learning_rate": 1.7135276610758309e-06,
- "loss": 0.40943437814712524,
- "mean_token_accuracy": 0.8511340022087097,
- "num_tokens": 21905550.0,
- "step": 2456
- },
- {
- "epoch": 1.8670212765957448,
- "grad_norm": 1.3622872829437256,
- "learning_rate": 1.7115398788161923e-06,
- "loss": 0.4255254566669464,
- "mean_token_accuracy": 0.8457357883453369,
- "num_tokens": 21919943.0,
- "step": 2457
- },
- {
- "epoch": 1.8677811550151975,
- "grad_norm": 1.8197853565216064,
- "learning_rate": 1.7095526499282172e-06,
- "loss": 0.33384573459625244,
- "mean_token_accuracy": 0.8757365942001343,
- "num_tokens": 21928368.0,
- "step": 2458
- },
- {
- "epoch": 1.8685410334346506,
- "grad_norm": 1.8771090507507324,
- "learning_rate": 1.7075659758066207e-06,
- "loss": 0.38854318857192993,
- "mean_token_accuracy": 0.8565001487731934,
- "num_tokens": 21936624.0,
- "step": 2459
- },
- {
- "epoch": 1.8693009118541033,
- "grad_norm": 1.449811577796936,
- "learning_rate": 1.7055798578457267e-06,
- "loss": 0.45504286885261536,
- "mean_token_accuracy": 0.8338158130645752,
- "num_tokens": 21952192.0,
- "step": 2460
- },
- {
- "epoch": 1.8700607902735562,
- "grad_norm": 2.253678321838379,
- "learning_rate": 1.703594297439469e-06,
- "loss": 0.44300752878189087,
- "mean_token_accuracy": 0.8451106548309326,
- "num_tokens": 21959107.0,
- "step": 2461
- },
- {
- "epoch": 1.8708206686930091,
- "grad_norm": 2.5431747436523438,
- "learning_rate": 1.7016092959813892e-06,
- "loss": 0.34692925214767456,
- "mean_token_accuracy": 0.8823766708374023,
- "num_tokens": 21964543.0,
- "step": 2462
- },
- {
- "epoch": 1.871580547112462,
- "grad_norm": 2.7001953125,
- "learning_rate": 1.6996248548646393e-06,
- "loss": 0.5270686745643616,
- "mean_token_accuracy": 0.8366886377334595,
- "num_tokens": 21970157.0,
- "step": 2463
- },
- {
- "epoch": 1.872340425531915,
- "grad_norm": 2.3855581283569336,
- "learning_rate": 1.6976409754819767e-06,
- "loss": 0.40109893679618835,
- "mean_token_accuracy": 0.8477234840393066,
- "num_tokens": 21976046.0,
- "step": 2464
- },
- {
- "epoch": 1.8731003039513676,
- "grad_norm": 1.6014364957809448,
- "learning_rate": 1.6956576592257635e-06,
- "loss": 0.4344262480735779,
- "mean_token_accuracy": 0.8464433550834656,
- "num_tokens": 21986299.0,
- "step": 2465
- },
- {
- "epoch": 1.8738601823708207,
- "grad_norm": 2.221372127532959,
- "learning_rate": 1.6936749074879663e-06,
- "loss": 0.24239015579223633,
- "mean_token_accuracy": 0.9185566306114197,
- "num_tokens": 21991541.0,
- "step": 2466
- },
- {
- "epoch": 1.8746200607902734,
- "grad_norm": 1.6672178506851196,
- "learning_rate": 1.6916927216601593e-06,
- "loss": 0.35219496488571167,
- "mean_token_accuracy": 0.8668237328529358,
- "num_tokens": 22000797.0,
- "step": 2467
- },
- {
- "epoch": 1.8753799392097266,
- "grad_norm": 1.364131212234497,
- "learning_rate": 1.6897111031335145e-06,
- "loss": 0.4456409513950348,
- "mean_token_accuracy": 0.8350487947463989,
- "num_tokens": 22018297.0,
- "step": 2468
- },
- {
- "epoch": 1.8761398176291793,
- "grad_norm": 1.4535794258117676,
- "learning_rate": 1.6877300532988095e-06,
- "loss": 0.395782470703125,
- "mean_token_accuracy": 0.8482908010482788,
- "num_tokens": 22030096.0,
- "step": 2469
- },
- {
- "epoch": 1.8768996960486324,
- "grad_norm": 2.0192270278930664,
- "learning_rate": 1.6857495735464196e-06,
- "loss": 0.31406813859939575,
- "mean_token_accuracy": 0.889453649520874,
- "num_tokens": 22036082.0,
- "step": 2470
- },
- {
- "epoch": 1.877659574468085,
- "grad_norm": 2.159257173538208,
- "learning_rate": 1.6837696652663244e-06,
- "loss": 0.43942126631736755,
- "mean_token_accuracy": 0.8518660068511963,
- "num_tokens": 22043413.0,
- "step": 2471
- },
- {
- "epoch": 1.878419452887538,
- "grad_norm": 1.9774882793426514,
- "learning_rate": 1.681790329848097e-06,
- "loss": 0.42464935779571533,
- "mean_token_accuracy": 0.8545591831207275,
- "num_tokens": 22050290.0,
- "step": 2472
- },
- {
- "epoch": 1.8791793313069909,
- "grad_norm": 1.0219167470932007,
- "learning_rate": 1.6798115686809125e-06,
- "loss": 0.36917346715927124,
- "mean_token_accuracy": 0.8650286197662354,
- "num_tokens": 22070408.0,
- "step": 2473
- },
- {
- "epoch": 1.8799392097264438,
- "grad_norm": 1.2943378686904907,
- "learning_rate": 1.677833383153542e-06,
- "loss": 0.3434808850288391,
- "mean_token_accuracy": 0.878541111946106,
- "num_tokens": 22083567.0,
- "step": 2474
- },
- {
- "epoch": 1.8806990881458967,
- "grad_norm": 3.582855224609375,
- "learning_rate": 1.6758557746543518e-06,
- "loss": 0.39738911390304565,
- "mean_token_accuracy": 0.8951535224914551,
- "num_tokens": 22086886.0,
- "step": 2475
- },
- {
- "epoch": 1.8814589665653494,
- "grad_norm": 1.680220365524292,
- "learning_rate": 1.673878744571304e-06,
- "loss": 0.38146206736564636,
- "mean_token_accuracy": 0.8596681356430054,
- "num_tokens": 22095564.0,
- "step": 2476
- },
- {
- "epoch": 1.8822188449848025,
- "grad_norm": 1.448194146156311,
- "learning_rate": 1.6719022942919527e-06,
- "loss": 0.43309977650642395,
- "mean_token_accuracy": 0.8669528961181641,
- "num_tokens": 22109333.0,
- "step": 2477
- },
- {
- "epoch": 1.8829787234042552,
- "grad_norm": 1.5353537797927856,
- "learning_rate": 1.6699264252034498e-06,
- "loss": 0.4479079842567444,
- "mean_token_accuracy": 0.8379873037338257,
- "num_tokens": 22124735.0,
- "step": 2478
- },
- {
- "epoch": 1.8837386018237083,
- "grad_norm": 1.1744320392608643,
- "learning_rate": 1.6679511386925337e-06,
- "loss": 0.31951260566711426,
- "mean_token_accuracy": 0.8792685270309448,
- "num_tokens": 22140882.0,
- "step": 2479
- },
- {
- "epoch": 1.884498480243161,
- "grad_norm": 2.1996841430664062,
- "learning_rate": 1.6659764361455383e-06,
- "loss": 0.39045992493629456,
- "mean_token_accuracy": 0.8587675094604492,
- "num_tokens": 22146843.0,
- "step": 2480
- },
- {
- "epoch": 1.885258358662614,
- "grad_norm": 3.494931697845459,
- "learning_rate": 1.6640023189483836e-06,
- "loss": 0.44756871461868286,
- "mean_token_accuracy": 0.8643628358840942,
- "num_tokens": 22150504.0,
- "step": 2481
- },
- {
- "epoch": 1.8860182370820668,
- "grad_norm": 2.2455973625183105,
- "learning_rate": 1.6620287884865831e-06,
- "loss": 0.3308878540992737,
- "mean_token_accuracy": 0.8748078942298889,
- "num_tokens": 22156537.0,
- "step": 2482
- },
- {
- "epoch": 1.8867781155015197,
- "grad_norm": 2.31868314743042,
- "learning_rate": 1.6600558461452368e-06,
- "loss": 0.46583569049835205,
- "mean_token_accuracy": 0.8438903093338013,
- "num_tokens": 22163501.0,
- "step": 2483
- },
- {
- "epoch": 1.8875379939209727,
- "grad_norm": 1.5695412158966064,
- "learning_rate": 1.65808349330903e-06,
- "loss": 0.351986825466156,
- "mean_token_accuracy": 0.8707568645477295,
- "num_tokens": 22173880.0,
- "step": 2484
- },
- {
- "epoch": 1.8882978723404256,
- "grad_norm": 1.4109563827514648,
- "learning_rate": 1.656111731362236e-06,
- "loss": 0.36058586835861206,
- "mean_token_accuracy": 0.8606001138687134,
- "num_tokens": 22189000.0,
- "step": 2485
- },
- {
- "epoch": 1.8890577507598785,
- "grad_norm": 1.0398776531219482,
- "learning_rate": 1.6541405616887138e-06,
- "loss": 0.36524999141693115,
- "mean_token_accuracy": 0.8690586090087891,
- "num_tokens": 22209187.0,
- "step": 2486
- },
- {
- "epoch": 1.8898176291793312,
- "grad_norm": 2.1050004959106445,
- "learning_rate": 1.6521699856719065e-06,
- "loss": 0.2988269329071045,
- "mean_token_accuracy": 0.8887280225753784,
- "num_tokens": 22215539.0,
- "step": 2487
- },
- {
- "epoch": 1.8905775075987843,
- "grad_norm": 2.5606791973114014,
- "learning_rate": 1.650200004694839e-06,
- "loss": 0.41077330708503723,
- "mean_token_accuracy": 0.8436049818992615,
- "num_tokens": 22221133.0,
- "step": 2488
- },
- {
- "epoch": 1.891337386018237,
- "grad_norm": 1.5786094665527344,
- "learning_rate": 1.6482306201401211e-06,
- "loss": 0.4217292368412018,
- "mean_token_accuracy": 0.859939455986023,
- "num_tokens": 22231578.0,
- "step": 2489
- },
- {
- "epoch": 1.89209726443769,
- "grad_norm": 1.7131884098052979,
- "learning_rate": 1.6462618333899422e-06,
- "loss": 0.3945464789867401,
- "mean_token_accuracy": 0.8679244518280029,
- "num_tokens": 22241252.0,
- "step": 2490
- },
- {
- "epoch": 1.8928571428571428,
- "grad_norm": 2.8350300788879395,
- "learning_rate": 1.6442936458260723e-06,
- "loss": 0.3992699384689331,
- "mean_token_accuracy": 0.8717275857925415,
- "num_tokens": 22246226.0,
- "step": 2491
- },
- {
- "epoch": 1.8936170212765957,
- "grad_norm": 2.2180120944976807,
- "learning_rate": 1.6423260588298608e-06,
- "loss": 0.3381099998950958,
- "mean_token_accuracy": 0.8968075513839722,
- "num_tokens": 22252355.0,
- "step": 2492
- },
- {
- "epoch": 1.8943768996960486,
- "grad_norm": 2.6498866081237793,
- "learning_rate": 1.6403590737822378e-06,
- "loss": 0.36339250206947327,
- "mean_token_accuracy": 0.8633373379707336,
- "num_tokens": 22257407.0,
- "step": 2493
- },
- {
- "epoch": 1.8951367781155015,
- "grad_norm": 2.634241819381714,
- "learning_rate": 1.6383926920637077e-06,
- "loss": 0.2562698721885681,
- "mean_token_accuracy": 0.8999600410461426,
- "num_tokens": 22261858.0,
- "step": 2494
- },
- {
- "epoch": 1.8958966565349544,
- "grad_norm": 2.0163333415985107,
- "learning_rate": 1.6364269150543533e-06,
- "loss": 0.3413389027118683,
- "mean_token_accuracy": 0.8718398809432983,
- "num_tokens": 22268517.0,
- "step": 2495
- },
- {
- "epoch": 1.8966565349544073,
- "grad_norm": 2.8333005905151367,
- "learning_rate": 1.6344617441338311e-06,
- "loss": 0.4354540705680847,
- "mean_token_accuracy": 0.8491238355636597,
- "num_tokens": 22273648.0,
- "step": 2496
- },
- {
- "epoch": 1.8974164133738602,
- "grad_norm": 1.6280957460403442,
- "learning_rate": 1.6324971806813766e-06,
- "loss": 0.3015792965888977,
- "mean_token_accuracy": 0.8937206268310547,
- "num_tokens": 22282521.0,
- "step": 2497
- },
- {
- "epoch": 1.898176291793313,
- "grad_norm": 1.2246302366256714,
- "learning_rate": 1.6305332260757937e-06,
- "loss": 0.26619502902030945,
- "mean_token_accuracy": 0.8886681199073792,
- "num_tokens": 22295179.0,
- "step": 2498
- },
- {
- "epoch": 1.898936170212766,
- "grad_norm": 2.4014432430267334,
- "learning_rate": 1.6285698816954626e-06,
- "loss": 0.3735058903694153,
- "mean_token_accuracy": 0.8693109750747681,
- "num_tokens": 22300681.0,
- "step": 2499
- },
- {
- "epoch": 1.8996960486322187,
- "grad_norm": 1.4447300434112549,
- "learning_rate": 1.6266071489183327e-06,
- "loss": 0.40768876671791077,
- "mean_token_accuracy": 0.8556059002876282,
- "num_tokens": 22312442.0,
- "step": 2500
- },
- {
- "epoch": 1.9004559270516719,
- "grad_norm": 2.1339821815490723,
- "learning_rate": 1.6246450291219268e-06,
- "loss": 0.33442017436027527,
- "mean_token_accuracy": 0.8837105631828308,
- "num_tokens": 22318779.0,
- "step": 2501
- },
- {
- "epoch": 1.9012158054711246,
- "grad_norm": 2.8564913272857666,
- "learning_rate": 1.6226835236833356e-06,
- "loss": 0.36013197898864746,
- "mean_token_accuracy": 0.8810569047927856,
- "num_tokens": 22323390.0,
- "step": 2502
- },
- {
- "epoch": 1.9019756838905775,
- "grad_norm": 2.1201915740966797,
- "learning_rate": 1.620722633979219e-06,
- "loss": 0.4587489664554596,
- "mean_token_accuracy": 0.8517274856567383,
- "num_tokens": 22330275.0,
- "step": 2503
- },
- {
- "epoch": 1.9027355623100304,
- "grad_norm": 2.211402177810669,
- "learning_rate": 1.6187623613858038e-06,
- "loss": 0.3698349595069885,
- "mean_token_accuracy": 0.8768182992935181,
- "num_tokens": 22336041.0,
- "step": 2504
- },
- {
- "epoch": 1.9034954407294833,
- "grad_norm": 1.421604871749878,
- "learning_rate": 1.6168027072788868e-06,
- "loss": 0.38086453080177307,
- "mean_token_accuracy": 0.8622198104858398,
- "num_tokens": 22349310.0,
- "step": 2505
- },
- {
- "epoch": 1.9042553191489362,
- "grad_norm": 2.4304113388061523,
- "learning_rate": 1.6148436730338279e-06,
- "loss": 0.34694477915763855,
- "mean_token_accuracy": 0.8833136558532715,
- "num_tokens": 22355069.0,
- "step": 2506
- },
- {
- "epoch": 1.905015197568389,
- "grad_norm": 2.1076772212982178,
- "learning_rate": 1.6128852600255518e-06,
- "loss": 0.4973800778388977,
- "mean_token_accuracy": 0.851190984249115,
- "num_tokens": 22362402.0,
- "step": 2507
- },
- {
- "epoch": 1.905775075987842,
- "grad_norm": 3.0934200286865234,
- "learning_rate": 1.6109274696285496e-06,
- "loss": 0.46498024463653564,
- "mean_token_accuracy": 0.8436626195907593,
- "num_tokens": 22367390.0,
- "step": 2508
- },
- {
- "epoch": 1.9065349544072947,
- "grad_norm": 2.0114359855651855,
- "learning_rate": 1.6089703032168736e-06,
- "loss": 0.45143815875053406,
- "mean_token_accuracy": 0.852748692035675,
- "num_tokens": 22377032.0,
- "step": 2509
- },
- {
- "epoch": 1.9072948328267478,
- "grad_norm": 1.8780893087387085,
- "learning_rate": 1.6070137621641382e-06,
- "loss": 0.3977179527282715,
- "mean_token_accuracy": 0.8556262850761414,
- "num_tokens": 22386880.0,
- "step": 2510
- },
- {
- "epoch": 1.9080547112462005,
- "grad_norm": 1.6748069524765015,
- "learning_rate": 1.6050578478435184e-06,
- "loss": 0.35590440034866333,
- "mean_token_accuracy": 0.8702141046524048,
- "num_tokens": 22396616.0,
- "step": 2511
- },
- {
- "epoch": 1.9088145896656536,
- "grad_norm": 0.9799401760101318,
- "learning_rate": 1.6031025616277512e-06,
- "loss": 0.3325427770614624,
- "mean_token_accuracy": 0.8771291971206665,
- "num_tokens": 22419580.0,
- "step": 2512
- },
- {
- "epoch": 1.9095744680851063,
- "grad_norm": 1.5084866285324097,
- "learning_rate": 1.6011479048891323e-06,
- "loss": 0.44336390495300293,
- "mean_token_accuracy": 0.8786209225654602,
- "num_tokens": 22434235.0,
- "step": 2513
- },
- {
- "epoch": 1.9103343465045592,
- "grad_norm": 1.8544305562973022,
- "learning_rate": 1.5991938789995138e-06,
- "loss": 0.3055306375026703,
- "mean_token_accuracy": 0.9043174982070923,
- "num_tokens": 22442003.0,
- "step": 2514
- },
- {
- "epoch": 1.9110942249240122,
- "grad_norm": 4.29932165145874,
- "learning_rate": 1.5972404853303061e-06,
- "loss": 0.386760413646698,
- "mean_token_accuracy": 0.8914207220077515,
- "num_tokens": 22444787.0,
- "step": 2515
- },
- {
- "epoch": 1.911854103343465,
- "grad_norm": 1.7560505867004395,
- "learning_rate": 1.595287725252478e-06,
- "loss": 0.4141422510147095,
- "mean_token_accuracy": 0.862310528755188,
- "num_tokens": 22453625.0,
- "step": 2516
- },
- {
- "epoch": 1.912613981762918,
- "grad_norm": 2.685443878173828,
- "learning_rate": 1.5933356001365502e-06,
- "loss": 0.36217260360717773,
- "mean_token_accuracy": 0.868883490562439,
- "num_tokens": 22458597.0,
- "step": 2517
- },
- {
- "epoch": 1.9133738601823707,
- "grad_norm": 2.2587239742279053,
- "learning_rate": 1.591384111352599e-06,
- "loss": 0.5298880934715271,
- "mean_token_accuracy": 0.821168839931488,
- "num_tokens": 22466091.0,
- "step": 2518
- },
- {
- "epoch": 1.9141337386018238,
- "grad_norm": 2.273380756378174,
- "learning_rate": 1.5894332602702545e-06,
- "loss": 0.3194117546081543,
- "mean_token_accuracy": 0.8849239945411682,
- "num_tokens": 22471785.0,
- "step": 2519
- },
- {
- "epoch": 1.9148936170212765,
- "grad_norm": 2.314634084701538,
- "learning_rate": 1.5874830482587003e-06,
- "loss": 0.457550585269928,
- "mean_token_accuracy": 0.8367670774459839,
- "num_tokens": 22479091.0,
- "step": 2520
- },
- {
- "epoch": 1.9156534954407296,
- "grad_norm": 2.16206693649292,
- "learning_rate": 1.585533476686669e-06,
- "loss": 0.43055859208106995,
- "mean_token_accuracy": 0.8659856915473938,
- "num_tokens": 22487379.0,
- "step": 2521
- },
- {
- "epoch": 1.9164133738601823,
- "grad_norm": 2.2091798782348633,
- "learning_rate": 1.5835845469224447e-06,
- "loss": 0.45421302318573,
- "mean_token_accuracy": 0.8418087959289551,
- "num_tokens": 22493755.0,
- "step": 2522
- },
- {
- "epoch": 1.9171732522796354,
- "grad_norm": 1.6166985034942627,
- "learning_rate": 1.5816362603338632e-06,
- "loss": 0.5211667418479919,
- "mean_token_accuracy": 0.809440016746521,
- "num_tokens": 22506648.0,
- "step": 2523
- },
- {
- "epoch": 1.917933130699088,
- "grad_norm": 2.4998703002929688,
- "learning_rate": 1.5796886182883053e-06,
- "loss": 0.45915648341178894,
- "mean_token_accuracy": 0.833067774772644,
- "num_tokens": 22513216.0,
- "step": 2524
- },
- {
- "epoch": 1.918693009118541,
- "grad_norm": 1.492928147315979,
- "learning_rate": 1.577741622152702e-06,
- "loss": 0.45581498742103577,
- "mean_token_accuracy": 0.8531479835510254,
- "num_tokens": 22524908.0,
- "step": 2525
- },
- {
- "epoch": 1.919452887537994,
- "grad_norm": 2.0502207279205322,
- "learning_rate": 1.5757952732935288e-06,
- "loss": 0.4156759977340698,
- "mean_token_accuracy": 0.8677599430084229,
- "num_tokens": 22532275.0,
- "step": 2526
- },
- {
- "epoch": 1.9202127659574468,
- "grad_norm": 2.4572031497955322,
- "learning_rate": 1.5738495730768104e-06,
- "loss": 0.43373313546180725,
- "mean_token_accuracy": 0.8435516357421875,
- "num_tokens": 22538272.0,
- "step": 2527
- },
- {
- "epoch": 1.9209726443768997,
- "grad_norm": 2.071903705596924,
- "learning_rate": 1.5719045228681127e-06,
- "loss": 0.3211413621902466,
- "mean_token_accuracy": 0.87841796875,
- "num_tokens": 22545487.0,
- "step": 2528
- },
- {
- "epoch": 1.9217325227963524,
- "grad_norm": 1.6742064952850342,
- "learning_rate": 1.5699601240325474e-06,
- "loss": 0.3704240322113037,
- "mean_token_accuracy": 0.8646563291549683,
- "num_tokens": 22554840.0,
- "step": 2529
- },
- {
- "epoch": 1.9224924012158056,
- "grad_norm": 1.0941399335861206,
- "learning_rate": 1.5680163779347668e-06,
- "loss": 0.3595704436302185,
- "mean_token_accuracy": 0.8680597543716431,
- "num_tokens": 22572627.0,
- "step": 2530
- },
- {
- "epoch": 1.9232522796352582,
- "grad_norm": 2.9815237522125244,
- "learning_rate": 1.5660732859389687e-06,
- "loss": 0.2941335141658783,
- "mean_token_accuracy": 0.8847303986549377,
- "num_tokens": 22576851.0,
- "step": 2531
- },
- {
- "epoch": 1.9240121580547114,
- "grad_norm": 2.898106813430786,
- "learning_rate": 1.5641308494088903e-06,
- "loss": 0.4066317081451416,
- "mean_token_accuracy": 0.8469538688659668,
- "num_tokens": 22581431.0,
- "step": 2532
- },
- {
- "epoch": 1.924772036474164,
- "grad_norm": 1.6757515668869019,
- "learning_rate": 1.5621890697078069e-06,
- "loss": 0.33923569321632385,
- "mean_token_accuracy": 0.8790708184242249,
- "num_tokens": 22590648.0,
- "step": 2533
- },
- {
- "epoch": 1.925531914893617,
- "grad_norm": 1.747314214706421,
- "learning_rate": 1.5602479481985333e-06,
- "loss": 0.4865703582763672,
- "mean_token_accuracy": 0.8314566612243652,
- "num_tokens": 22600153.0,
- "step": 2534
- },
- {
- "epoch": 1.9262917933130699,
- "grad_norm": 2.7927849292755127,
- "learning_rate": 1.5583074862434254e-06,
- "loss": 0.335658460855484,
- "mean_token_accuracy": 0.8769067525863647,
- "num_tokens": 22604864.0,
- "step": 2535
- },
- {
- "epoch": 1.9270516717325228,
- "grad_norm": 2.2553000450134277,
- "learning_rate": 1.5563676852043738e-06,
- "loss": 0.4442562460899353,
- "mean_token_accuracy": 0.8381515145301819,
- "num_tokens": 22611102.0,
- "step": 2536
- },
- {
- "epoch": 1.9278115501519757,
- "grad_norm": 1.1937638521194458,
- "learning_rate": 1.5544285464428044e-06,
- "loss": 0.38608425855636597,
- "mean_token_accuracy": 0.8589644432067871,
- "num_tokens": 22627781.0,
- "step": 2537
- },
- {
- "epoch": 1.9285714285714286,
- "grad_norm": 3.282639980316162,
- "learning_rate": 1.55249007131968e-06,
- "loss": 0.31231993436813354,
- "mean_token_accuracy": 0.8917703032493591,
- "num_tokens": 22632341.0,
- "step": 2538
- },
- {
- "epoch": 1.9293313069908815,
- "grad_norm": 2.3212976455688477,
- "learning_rate": 1.5505522611954977e-06,
- "loss": 0.34952571988105774,
- "mean_token_accuracy": 0.8752106428146362,
- "num_tokens": 22638572.0,
- "step": 2539
- },
- {
- "epoch": 1.9300911854103342,
- "grad_norm": 1.389098882675171,
- "learning_rate": 1.548615117430286e-06,
- "loss": 0.4298851788043976,
- "mean_token_accuracy": 0.871698260307312,
- "num_tokens": 22651875.0,
- "step": 2540
- },
- {
- "epoch": 1.9308510638297873,
- "grad_norm": 1.5333977937698364,
- "learning_rate": 1.5466786413836077e-06,
- "loss": 0.45540744066238403,
- "mean_token_accuracy": 0.8409075736999512,
- "num_tokens": 22662903.0,
- "step": 2541
- },
- {
- "epoch": 1.93161094224924,
- "grad_norm": 1.7833251953125,
- "learning_rate": 1.5447428344145565e-06,
- "loss": 0.333247572183609,
- "mean_token_accuracy": 0.8796100616455078,
- "num_tokens": 22671125.0,
- "step": 2542
- },
- {
- "epoch": 1.9323708206686931,
- "grad_norm": 1.5165303945541382,
- "learning_rate": 1.5428076978817564e-06,
- "loss": 0.3085063099861145,
- "mean_token_accuracy": 0.888705849647522,
- "num_tokens": 22681482.0,
- "step": 2543
- },
- {
- "epoch": 1.9331306990881458,
- "grad_norm": 2.3556196689605713,
- "learning_rate": 1.5408732331433596e-06,
- "loss": 0.44008776545524597,
- "mean_token_accuracy": 0.8578170537948608,
- "num_tokens": 22686952.0,
- "step": 2544
- },
- {
- "epoch": 1.9338905775075987,
- "grad_norm": 2.9572882652282715,
- "learning_rate": 1.538939441557048e-06,
- "loss": 0.3779261112213135,
- "mean_token_accuracy": 0.8657241463661194,
- "num_tokens": 22691211.0,
- "step": 2545
- },
- {
- "epoch": 1.9346504559270516,
- "grad_norm": 2.373473644256592,
- "learning_rate": 1.5370063244800326e-06,
- "loss": 0.4113072454929352,
- "mean_token_accuracy": 0.872116208076477,
- "num_tokens": 22697442.0,
- "step": 2546
- },
- {
- "epoch": 1.9354103343465046,
- "grad_norm": 2.270207643508911,
- "learning_rate": 1.5350738832690479e-06,
- "loss": 0.4021070897579193,
- "mean_token_accuracy": 0.8750372529029846,
- "num_tokens": 22703693.0,
- "step": 2547
- },
- {
- "epoch": 1.9361702127659575,
- "grad_norm": 2.429445266723633,
- "learning_rate": 1.5331421192803565e-06,
- "loss": 0.40210235118865967,
- "mean_token_accuracy": 0.8593704104423523,
- "num_tokens": 22709285.0,
- "step": 2548
- },
- {
- "epoch": 1.9369300911854104,
- "grad_norm": 1.4576458930969238,
- "learning_rate": 1.5312110338697427e-06,
- "loss": 0.44822201132774353,
- "mean_token_accuracy": 0.8737322688102722,
- "num_tokens": 22723743.0,
- "step": 2549
- },
- {
- "epoch": 1.9376899696048633,
- "grad_norm": 2.1008098125457764,
- "learning_rate": 1.5292806283925192e-06,
- "loss": 0.3514235019683838,
- "mean_token_accuracy": 0.8689005374908447,
- "num_tokens": 22730135.0,
- "step": 2550
- },
- {
- "epoch": 1.938449848024316,
- "grad_norm": 1.9786806106567383,
- "learning_rate": 1.5273509042035172e-06,
- "loss": 0.4483771324157715,
- "mean_token_accuracy": 0.8353633880615234,
- "num_tokens": 22738717.0,
- "step": 2551
- },
- {
- "epoch": 1.939209726443769,
- "grad_norm": 1.0649693012237549,
- "learning_rate": 1.5254218626570927e-06,
- "loss": 0.30712205171585083,
- "mean_token_accuracy": 0.8802675008773804,
- "num_tokens": 22757346.0,
- "step": 2552
- },
- {
- "epoch": 1.9399696048632218,
- "grad_norm": 3.0401108264923096,
- "learning_rate": 1.5234935051071193e-06,
- "loss": 0.5213959217071533,
- "mean_token_accuracy": 0.8249514102935791,
- "num_tokens": 22762169.0,
- "step": 2553
- },
- {
- "epoch": 1.940729483282675,
- "grad_norm": 2.892486572265625,
- "learning_rate": 1.521565832906994e-06,
- "loss": 0.5694394111633301,
- "mean_token_accuracy": 0.8139263391494751,
- "num_tokens": 22767824.0,
- "step": 2554
- },
- {
- "epoch": 1.9414893617021276,
- "grad_norm": 1.6187207698822021,
- "learning_rate": 1.519638847409632e-06,
- "loss": 0.46748271584510803,
- "mean_token_accuracy": 0.8541051149368286,
- "num_tokens": 22778195.0,
- "step": 2555
- },
- {
- "epoch": 1.9422492401215805,
- "grad_norm": 1.3857731819152832,
- "learning_rate": 1.5177125499674639e-06,
- "loss": 0.35661786794662476,
- "mean_token_accuracy": 0.8711516857147217,
- "num_tokens": 22792353.0,
- "step": 2556
- },
- {
- "epoch": 1.9430091185410334,
- "grad_norm": 1.108441710472107,
- "learning_rate": 1.515786941932441e-06,
- "loss": 0.3537200391292572,
- "mean_token_accuracy": 0.8739079833030701,
- "num_tokens": 22813185.0,
- "step": 2557
- },
- {
- "epoch": 1.9437689969604863,
- "grad_norm": 2.0528404712677,
- "learning_rate": 1.5138620246560295e-06,
- "loss": 0.4161028265953064,
- "mean_token_accuracy": 0.8385938405990601,
- "num_tokens": 22821227.0,
- "step": 2558
- },
- {
- "epoch": 1.9445288753799392,
- "grad_norm": 1.5123628377914429,
- "learning_rate": 1.5119377994892095e-06,
- "loss": 0.4420986473560333,
- "mean_token_accuracy": 0.8664361834526062,
- "num_tokens": 22835064.0,
- "step": 2559
- },
- {
- "epoch": 1.9452887537993921,
- "grad_norm": 2.5354838371276855,
- "learning_rate": 1.5100142677824752e-06,
- "loss": 0.3837323784828186,
- "mean_token_accuracy": 0.8607655763626099,
- "num_tokens": 22840455.0,
- "step": 2560
- },
- {
- "epoch": 1.946048632218845,
- "grad_norm": 1.1354057788848877,
- "learning_rate": 1.5080914308858375e-06,
- "loss": 0.39776813983917236,
- "mean_token_accuracy": 0.8586497902870178,
- "num_tokens": 22858828.0,
- "step": 2561
- },
- {
- "epoch": 1.9468085106382977,
- "grad_norm": 1.576740026473999,
- "learning_rate": 1.5061692901488161e-06,
- "loss": 0.3167848289012909,
- "mean_token_accuracy": 0.8876185417175293,
- "num_tokens": 22868674.0,
- "step": 2562
- },
- {
- "epoch": 1.9475683890577509,
- "grad_norm": 1.4835401773452759,
- "learning_rate": 1.5042478469204437e-06,
- "loss": 0.44950318336486816,
- "mean_token_accuracy": 0.8526639342308044,
- "num_tokens": 22883019.0,
- "step": 2563
- },
- {
- "epoch": 1.9483282674772036,
- "grad_norm": 1.617073655128479,
- "learning_rate": 1.502327102549262e-06,
- "loss": 0.45711010694503784,
- "mean_token_accuracy": 0.834361732006073,
- "num_tokens": 22896834.0,
- "step": 2564
- },
- {
- "epoch": 1.9490881458966567,
- "grad_norm": 1.3348414897918701,
- "learning_rate": 1.5004070583833252e-06,
- "loss": 0.3691314458847046,
- "mean_token_accuracy": 0.8779371380805969,
- "num_tokens": 22912350.0,
- "step": 2565
- },
- {
- "epoch": 1.9498480243161094,
- "grad_norm": 1.711234450340271,
- "learning_rate": 1.4984877157701932e-06,
- "loss": 0.38726937770843506,
- "mean_token_accuracy": 0.8704015016555786,
- "num_tokens": 22922575.0,
- "step": 2566
- },
- {
- "epoch": 1.9506079027355623,
- "grad_norm": 2.4587950706481934,
- "learning_rate": 1.4965690760569346e-06,
- "loss": 0.4455464482307434,
- "mean_token_accuracy": 0.8481032252311707,
- "num_tokens": 22928717.0,
- "step": 2567
- },
- {
- "epoch": 1.9513677811550152,
- "grad_norm": 2.4189560413360596,
- "learning_rate": 1.4946511405901237e-06,
- "loss": 0.4120418429374695,
- "mean_token_accuracy": 0.8519487380981445,
- "num_tokens": 22934977.0,
- "step": 2568
- },
- {
- "epoch": 1.952127659574468,
- "grad_norm": 1.2503050565719604,
- "learning_rate": 1.4927339107158437e-06,
- "loss": 0.4434332251548767,
- "mean_token_accuracy": 0.8448144793510437,
- "num_tokens": 22950061.0,
- "step": 2569
- },
- {
- "epoch": 1.952887537993921,
- "grad_norm": 1.788493275642395,
- "learning_rate": 1.4908173877796784e-06,
- "loss": 0.49203023314476013,
- "mean_token_accuracy": 0.8601495623588562,
- "num_tokens": 22961838.0,
- "step": 2570
- },
- {
- "epoch": 1.9536474164133737,
- "grad_norm": 1.4260050058364868,
- "learning_rate": 1.4889015731267186e-06,
- "loss": 0.3286570906639099,
- "mean_token_accuracy": 0.882429838180542,
- "num_tokens": 22973192.0,
- "step": 2571
- },
- {
- "epoch": 1.9544072948328268,
- "grad_norm": 1.6754822731018066,
- "learning_rate": 1.486986468101555e-06,
- "loss": 0.34655290842056274,
- "mean_token_accuracy": 0.8807861804962158,
- "num_tokens": 22983661.0,
- "step": 2572
- },
- {
- "epoch": 1.9551671732522795,
- "grad_norm": 1.9064570665359497,
- "learning_rate": 1.4850720740482842e-06,
- "loss": 0.34020254015922546,
- "mean_token_accuracy": 0.86677086353302,
- "num_tokens": 22991231.0,
- "step": 2573
- },
- {
- "epoch": 1.9559270516717326,
- "grad_norm": 1.977444052696228,
- "learning_rate": 1.4831583923105e-06,
- "loss": 0.21505260467529297,
- "mean_token_accuracy": 0.921241819858551,
- "num_tokens": 22996828.0,
- "step": 2574
- },
- {
- "epoch": 1.9566869300911853,
- "grad_norm": 1.1019235849380493,
- "learning_rate": 1.481245424231298e-06,
- "loss": 0.3804295063018799,
- "mean_token_accuracy": 0.8582668900489807,
- "num_tokens": 23016018.0,
- "step": 2575
- },
- {
- "epoch": 1.9574468085106385,
- "grad_norm": 1.7943179607391357,
- "learning_rate": 1.4793331711532743e-06,
- "loss": 0.38565245270729065,
- "mean_token_accuracy": 0.8599048256874084,
- "num_tokens": 23024461.0,
- "step": 2576
- },
- {
- "epoch": 1.9582066869300911,
- "grad_norm": 2.273824453353882,
- "learning_rate": 1.4774216344185204e-06,
- "loss": 0.46297723054885864,
- "mean_token_accuracy": 0.8294345140457153,
- "num_tokens": 23031687.0,
- "step": 2577
- },
- {
- "epoch": 1.958966565349544,
- "grad_norm": 2.308509111404419,
- "learning_rate": 1.4755108153686275e-06,
- "loss": 0.4366525411605835,
- "mean_token_accuracy": 0.8515903949737549,
- "num_tokens": 23037072.0,
- "step": 2578
- },
- {
- "epoch": 1.959726443768997,
- "grad_norm": 2.069028377532959,
- "learning_rate": 1.4736007153446803e-06,
- "loss": 0.33900877833366394,
- "mean_token_accuracy": 0.8937177658081055,
- "num_tokens": 23043207.0,
- "step": 2579
- },
- {
- "epoch": 1.9604863221884499,
- "grad_norm": 2.905163288116455,
- "learning_rate": 1.4716913356872614e-06,
- "loss": 0.3708382844924927,
- "mean_token_accuracy": 0.8936747312545776,
- "num_tokens": 23047020.0,
- "step": 2580
- },
- {
- "epoch": 1.9612462006079028,
- "grad_norm": 2.4153175354003906,
- "learning_rate": 1.4697826777364478e-06,
- "loss": 0.473562091588974,
- "mean_token_accuracy": 0.8350275158882141,
- "num_tokens": 23053282.0,
- "step": 2581
- },
- {
- "epoch": 1.9620060790273555,
- "grad_norm": 2.21589994430542,
- "learning_rate": 1.467874742831808e-06,
- "loss": 0.3812660276889801,
- "mean_token_accuracy": 0.8623865842819214,
- "num_tokens": 23059399.0,
- "step": 2582
- },
- {
- "epoch": 1.9627659574468086,
- "grad_norm": 1.0847623348236084,
- "learning_rate": 1.4659675323124037e-06,
- "loss": 0.3846944570541382,
- "mean_token_accuracy": 0.8633466958999634,
- "num_tokens": 23081005.0,
- "step": 2583
- },
- {
- "epoch": 1.9635258358662613,
- "grad_norm": 1.8754645586013794,
- "learning_rate": 1.46406104751679e-06,
- "loss": 0.3460300862789154,
- "mean_token_accuracy": 0.8757443428039551,
- "num_tokens": 23088710.0,
- "step": 2584
- },
- {
- "epoch": 1.9642857142857144,
- "grad_norm": 2.13075852394104,
- "learning_rate": 1.462155289783011e-06,
- "loss": 0.3060935139656067,
- "mean_token_accuracy": 0.9070644378662109,
- "num_tokens": 23094862.0,
- "step": 2585
- },
- {
- "epoch": 1.965045592705167,
- "grad_norm": 2.9674458503723145,
- "learning_rate": 1.4602502604486e-06,
- "loss": 0.4464406371116638,
- "mean_token_accuracy": 0.8497441411018372,
- "num_tokens": 23099821.0,
- "step": 2586
- },
- {
- "epoch": 1.96580547112462,
- "grad_norm": 1.9171007871627808,
- "learning_rate": 1.45834596085058e-06,
- "loss": 0.3905114531517029,
- "mean_token_accuracy": 0.8564352989196777,
- "num_tokens": 23107804.0,
- "step": 2587
- },
- {
- "epoch": 1.966565349544073,
- "grad_norm": 2.0817408561706543,
- "learning_rate": 1.456442392325463e-06,
- "loss": 0.3903818130493164,
- "mean_token_accuracy": 0.8671162128448486,
- "num_tokens": 23115224.0,
- "step": 2588
- },
- {
- "epoch": 1.9673252279635258,
- "grad_norm": 2.6379549503326416,
- "learning_rate": 1.4545395562092467e-06,
- "loss": 0.22965987026691437,
- "mean_token_accuracy": 0.9160916805267334,
- "num_tokens": 23119184.0,
- "step": 2589
- },
- {
- "epoch": 1.9680851063829787,
- "grad_norm": 2.525221824645996,
- "learning_rate": 1.4526374538374133e-06,
- "loss": 0.4132574498653412,
- "mean_token_accuracy": 0.8486990332603455,
- "num_tokens": 23124679.0,
- "step": 2590
- },
- {
- "epoch": 1.9688449848024316,
- "grad_norm": 2.0362391471862793,
- "learning_rate": 1.4507360865449318e-06,
- "loss": 0.29624345898628235,
- "mean_token_accuracy": 0.888127863407135,
- "num_tokens": 23130756.0,
- "step": 2591
- },
- {
- "epoch": 1.9696048632218845,
- "grad_norm": 1.5150481462478638,
- "learning_rate": 1.4488354556662553e-06,
- "loss": 0.3852264881134033,
- "mean_token_accuracy": 0.8532775640487671,
- "num_tokens": 23141597.0,
- "step": 2592
- },
- {
- "epoch": 1.9703647416413372,
- "grad_norm": 1.5255193710327148,
- "learning_rate": 1.4469355625353199e-06,
- "loss": 0.37015780806541443,
- "mean_token_accuracy": 0.8669752478599548,
- "num_tokens": 23152487.0,
- "step": 2593
- },
- {
- "epoch": 1.9711246200607904,
- "grad_norm": 1.1780041456222534,
- "learning_rate": 1.4450364084855433e-06,
- "loss": 0.34421291947364807,
- "mean_token_accuracy": 0.8593694567680359,
- "num_tokens": 23168769.0,
- "step": 2594
- },
- {
- "epoch": 1.971884498480243,
- "grad_norm": 2.4549946784973145,
- "learning_rate": 1.4431379948498254e-06,
- "loss": 0.4000544548034668,
- "mean_token_accuracy": 0.8551953434944153,
- "num_tokens": 23175428.0,
- "step": 2595
- },
- {
- "epoch": 1.9726443768996962,
- "grad_norm": 2.374192476272583,
- "learning_rate": 1.4412403229605453e-06,
- "loss": 0.31329840421676636,
- "mean_token_accuracy": 0.8917277455329895,
- "num_tokens": 23180678.0,
- "step": 2596
- },
- {
- "epoch": 1.9734042553191489,
- "grad_norm": 1.268515706062317,
- "learning_rate": 1.4393433941495638e-06,
- "loss": 0.34808623790740967,
- "mean_token_accuracy": 0.8726245164871216,
- "num_tokens": 23194733.0,
- "step": 2597
- },
- {
- "epoch": 1.9741641337386018,
- "grad_norm": 2.0898988246917725,
- "learning_rate": 1.4374472097482156e-06,
- "loss": 0.45849233865737915,
- "mean_token_accuracy": 0.8414266109466553,
- "num_tokens": 23202211.0,
- "step": 2598
- },
- {
- "epoch": 1.9749240121580547,
- "grad_norm": 2.1497802734375,
- "learning_rate": 1.4355517710873184e-06,
- "loss": 0.4304521977901459,
- "mean_token_accuracy": 0.8502874374389648,
- "num_tokens": 23209623.0,
- "step": 2599
- },
- {
- "epoch": 1.9756838905775076,
- "grad_norm": 1.821786880493164,
- "learning_rate": 1.4336570794971643e-06,
- "loss": 0.3910462558269501,
- "mean_token_accuracy": 0.8962477445602417,
- "num_tokens": 23218904.0,
- "step": 2600
- },
- {
- "epoch": 1.9764437689969605,
- "grad_norm": 2.2523093223571777,
- "learning_rate": 1.4317631363075186e-06,
- "loss": 0.3456020951271057,
- "mean_token_accuracy": 0.8703117370605469,
- "num_tokens": 23225602.0,
- "step": 2601
- },
- {
- "epoch": 1.9772036474164134,
- "grad_norm": 1.6920030117034912,
- "learning_rate": 1.4298699428476236e-06,
- "loss": 0.4629668593406677,
- "mean_token_accuracy": 0.841956615447998,
- "num_tokens": 23236812.0,
- "step": 2602
- },
- {
- "epoch": 1.9779635258358663,
- "grad_norm": 1.8796344995498657,
- "learning_rate": 1.427977500446199e-06,
- "loss": 0.3302173316478729,
- "mean_token_accuracy": 0.8769404888153076,
- "num_tokens": 23245851.0,
- "step": 2603
- },
- {
- "epoch": 1.978723404255319,
- "grad_norm": 2.4003775119781494,
- "learning_rate": 1.4260858104314299e-06,
- "loss": 0.48402607440948486,
- "mean_token_accuracy": 0.8477497100830078,
- "num_tokens": 23252429.0,
- "step": 2604
- },
- {
- "epoch": 1.9794832826747721,
- "grad_norm": 3.576800584793091,
- "learning_rate": 1.4241948741309783e-06,
- "loss": 0.2943669259548187,
- "mean_token_accuracy": 0.8933546543121338,
- "num_tokens": 23255431.0,
- "step": 2605
- },
- {
- "epoch": 1.9802431610942248,
- "grad_norm": 2.7589938640594482,
- "learning_rate": 1.4223046928719764e-06,
- "loss": 0.5138746500015259,
- "mean_token_accuracy": 0.817468523979187,
- "num_tokens": 23261351.0,
- "step": 2606
- },
- {
- "epoch": 1.981003039513678,
- "grad_norm": 1.6950130462646484,
- "learning_rate": 1.420415267981026e-06,
- "loss": 0.2744991183280945,
- "mean_token_accuracy": 0.9005721211433411,
- "num_tokens": 23269482.0,
- "step": 2607
- },
- {
- "epoch": 1.9817629179331306,
- "grad_norm": 1.5962934494018555,
- "learning_rate": 1.418526600784198e-06,
- "loss": 0.4629114270210266,
- "mean_token_accuracy": 0.8337699174880981,
- "num_tokens": 23279796.0,
- "step": 2608
- },
- {
- "epoch": 1.9825227963525835,
- "grad_norm": 1.4962197542190552,
- "learning_rate": 1.4166386926070322e-06,
- "loss": 0.4217689633369446,
- "mean_token_accuracy": 0.8445580005645752,
- "num_tokens": 23293050.0,
- "step": 2609
- },
- {
- "epoch": 1.9832826747720365,
- "grad_norm": 1.4243721961975098,
- "learning_rate": 1.414751544774535e-06,
- "loss": 0.4888152480125427,
- "mean_token_accuracy": 0.8298524022102356,
- "num_tokens": 23308501.0,
- "step": 2610
- },
- {
- "epoch": 1.9840425531914894,
- "grad_norm": 1.5776121616363525,
- "learning_rate": 1.412865158611179e-06,
- "loss": 0.3156965970993042,
- "mean_token_accuracy": 0.8773540258407593,
- "num_tokens": 23317401.0,
- "step": 2611
- },
- {
- "epoch": 1.9848024316109423,
- "grad_norm": 1.4690552949905396,
- "learning_rate": 1.4109795354409045e-06,
- "loss": 0.35854774713516235,
- "mean_token_accuracy": 0.869156002998352,
- "num_tokens": 23328891.0,
- "step": 2612
- },
- {
- "epoch": 1.9855623100303952,
- "grad_norm": 1.5036180019378662,
- "learning_rate": 1.4090946765871105e-06,
- "loss": 0.3579009771347046,
- "mean_token_accuracy": 0.8698509931564331,
- "num_tokens": 23340473.0,
- "step": 2613
- },
- {
- "epoch": 1.986322188449848,
- "grad_norm": 2.0811538696289062,
- "learning_rate": 1.4072105833726685e-06,
- "loss": 0.2905905246734619,
- "mean_token_accuracy": 0.9131759405136108,
- "num_tokens": 23346480.0,
- "step": 2614
- },
- {
- "epoch": 1.9870820668693008,
- "grad_norm": 1.2866275310516357,
- "learning_rate": 1.4053272571199037e-06,
- "loss": 0.4091147184371948,
- "mean_token_accuracy": 0.8537255525588989,
- "num_tokens": 23361957.0,
- "step": 2615
- },
- {
- "epoch": 1.987841945288754,
- "grad_norm": 1.439497470855713,
- "learning_rate": 1.4034446991506084e-06,
- "loss": 0.4888972342014313,
- "mean_token_accuracy": 0.8451695442199707,
- "num_tokens": 23374936.0,
- "step": 2616
- },
- {
- "epoch": 1.9886018237082066,
- "grad_norm": 1.758204698562622,
- "learning_rate": 1.401562910786034e-06,
- "loss": 0.4976118803024292,
- "mean_token_accuracy": 0.8346713781356812,
- "num_tokens": 23386102.0,
- "step": 2617
- },
- {
- "epoch": 1.9893617021276597,
- "grad_norm": 1.436486840248108,
- "learning_rate": 1.3996818933468926e-06,
- "loss": 0.42407113313674927,
- "mean_token_accuracy": 0.8529444932937622,
- "num_tokens": 23398645.0,
- "step": 2618
- },
- {
- "epoch": 1.9901215805471124,
- "grad_norm": 2.1466588973999023,
- "learning_rate": 1.397801648153354e-06,
- "loss": 0.45519331097602844,
- "mean_token_accuracy": 0.8460411429405212,
- "num_tokens": 23406162.0,
- "step": 2619
- },
- {
- "epoch": 1.9908814589665653,
- "grad_norm": 2.0492005348205566,
- "learning_rate": 1.395922176525047e-06,
- "loss": 0.31093084812164307,
- "mean_token_accuracy": 0.8927264213562012,
- "num_tokens": 23412051.0,
- "step": 2620
- },
- {
- "epoch": 1.9916413373860182,
- "grad_norm": 2.2639048099517822,
- "learning_rate": 1.3940434797810567e-06,
- "loss": 0.3804079592227936,
- "mean_token_accuracy": 0.8720212578773499,
- "num_tokens": 23418252.0,
- "step": 2621
- },
- {
- "epoch": 1.9924012158054711,
- "grad_norm": 1.9541687965393066,
- "learning_rate": 1.3921655592399256e-06,
- "loss": 0.38776344060897827,
- "mean_token_accuracy": 0.858753502368927,
- "num_tokens": 23425901.0,
- "step": 2622
- },
- {
- "epoch": 1.993161094224924,
- "grad_norm": 1.5119032859802246,
- "learning_rate": 1.3902884162196509e-06,
- "loss": 0.39581215381622314,
- "mean_token_accuracy": 0.8539663553237915,
- "num_tokens": 23439390.0,
- "step": 2623
- },
- {
- "epoch": 1.993920972644377,
- "grad_norm": 2.1608591079711914,
- "learning_rate": 1.388412052037682e-06,
- "loss": 0.41801220178604126,
- "mean_token_accuracy": 0.8703387975692749,
- "num_tokens": 23445725.0,
- "step": 2624
- },
- {
- "epoch": 1.9946808510638299,
- "grad_norm": 2.463165521621704,
- "learning_rate": 1.3865364680109239e-06,
- "loss": 0.3252835273742676,
- "mean_token_accuracy": 0.9031686186790466,
- "num_tokens": 23451122.0,
- "step": 2625
- },
- {
- "epoch": 1.9954407294832825,
- "grad_norm": 1.1901201009750366,
- "learning_rate": 1.384661665455736e-06,
- "loss": 0.3358447253704071,
- "mean_token_accuracy": 0.8767676949501038,
- "num_tokens": 23467381.0,
- "step": 2626
- },
- {
- "epoch": 1.9962006079027357,
- "grad_norm": 1.3035757541656494,
- "learning_rate": 1.3827876456879247e-06,
- "loss": 0.3736562430858612,
- "mean_token_accuracy": 0.849855899810791,
- "num_tokens": 23482192.0,
- "step": 2627
- },
- {
- "epoch": 1.9969604863221884,
- "grad_norm": 1.8807034492492676,
- "learning_rate": 1.3809144100227483e-06,
- "loss": 0.45943766832351685,
- "mean_token_accuracy": 0.8456380367279053,
- "num_tokens": 23495167.0,
- "step": 2628
- },
- {
- "epoch": 1.9977203647416415,
- "grad_norm": 2.3645784854888916,
- "learning_rate": 1.3790419597749198e-06,
- "loss": 0.4271511435508728,
- "mean_token_accuracy": 0.846099853515625,
- "num_tokens": 23500790.0,
- "step": 2629
- },
- {
- "epoch": 1.9984802431610942,
- "grad_norm": 1.8451792001724243,
- "learning_rate": 1.3771702962585928e-06,
- "loss": 0.38092344999313354,
- "mean_token_accuracy": 0.8641276359558105,
- "num_tokens": 23508845.0,
- "step": 2630
- },
- {
- "epoch": 1.999240121580547,
- "grad_norm": 1.1115045547485352,
- "learning_rate": 1.3752994207873743e-06,
- "loss": 0.35954269766807556,
- "mean_token_accuracy": 0.8642125129699707,
- "num_tokens": 23527929.0,
- "step": 2631
- },
- {
- "epoch": 2.0,
- "grad_norm": 1.406253457069397,
- "learning_rate": 1.373429334674317e-06,
- "loss": 0.33467042446136475,
- "mean_token_accuracy": 0.8713197708129883,
- "num_tokens": 23539356.0,
- "step": 2632
- },
- {
- "epoch": 2.0007598784194527,
- "grad_norm": 2.8150978088378906,
- "learning_rate": 1.3715600392319186e-06,
- "loss": 0.22929656505584717,
- "mean_token_accuracy": 0.9197485446929932,
- "num_tokens": 23543746.0,
- "step": 2633
- },
- {
- "epoch": 2.001519756838906,
- "grad_norm": 2.6291964054107666,
- "learning_rate": 1.369691535772123e-06,
- "loss": 0.290000855922699,
- "mean_token_accuracy": 0.8979663848876953,
- "num_tokens": 23548633.0,
- "step": 2634
- },
- {
- "epoch": 2.0022796352583585,
- "grad_norm": 1.724357008934021,
- "learning_rate": 1.3678238256063193e-06,
- "loss": 0.3717018663883209,
- "mean_token_accuracy": 0.8743406534194946,
- "num_tokens": 23557187.0,
- "step": 2635
- },
- {
- "epoch": 2.0030395136778116,
- "grad_norm": 2.3801965713500977,
- "learning_rate": 1.3659569100453346e-06,
- "loss": 0.3452329635620117,
- "mean_token_accuracy": 0.8799462914466858,
- "num_tokens": 23563321.0,
- "step": 2636
- },
- {
- "epoch": 2.0037993920972643,
- "grad_norm": 1.8925955295562744,
- "learning_rate": 1.3640907903994455e-06,
- "loss": 0.32880955934524536,
- "mean_token_accuracy": 0.888347864151001,
- "num_tokens": 23570571.0,
- "step": 2637
- },
- {
- "epoch": 2.0045592705167175,
- "grad_norm": 1.0761849880218506,
- "learning_rate": 1.3622254679783665e-06,
- "loss": 0.395224004983902,
- "mean_token_accuracy": 0.8637001514434814,
- "num_tokens": 23589504.0,
- "step": 2638
- },
- {
- "epoch": 2.00531914893617,
- "grad_norm": 2.1172127723693848,
- "learning_rate": 1.3603609440912508e-06,
- "loss": 0.32195356488227844,
- "mean_token_accuracy": 0.8984324932098389,
- "num_tokens": 23595586.0,
- "step": 2639
- },
- {
- "epoch": 2.0060790273556233,
- "grad_norm": 2.127723217010498,
- "learning_rate": 1.3584972200466936e-06,
- "loss": 0.4710606634616852,
- "mean_token_accuracy": 0.8563182950019836,
- "num_tokens": 23602747.0,
- "step": 2640
- },
- {
- "epoch": 2.006838905775076,
- "grad_norm": 1.9752192497253418,
- "learning_rate": 1.356634297152729e-06,
- "loss": 0.24204617738723755,
- "mean_token_accuracy": 0.9082983136177063,
- "num_tokens": 23609005.0,
- "step": 2641
- },
- {
- "epoch": 2.007598784194529,
- "grad_norm": 2.5435397624969482,
- "learning_rate": 1.3547721767168273e-06,
- "loss": 0.16702288389205933,
- "mean_token_accuracy": 0.9353867769241333,
- "num_tokens": 23612852.0,
- "step": 2642
- },
- {
- "epoch": 2.0083586626139818,
- "grad_norm": 1.8113304376602173,
- "learning_rate": 1.3529108600458967e-06,
- "loss": 0.4245433509349823,
- "mean_token_accuracy": 0.8446527719497681,
- "num_tokens": 23621462.0,
- "step": 2643
- },
- {
- "epoch": 2.0091185410334345,
- "grad_norm": 1.0438088178634644,
- "learning_rate": 1.3510503484462807e-06,
- "loss": 0.3710743188858032,
- "mean_token_accuracy": 0.8731123208999634,
- "num_tokens": 23642029.0,
- "step": 2644
- },
- {
- "epoch": 2.0098784194528876,
- "grad_norm": 1.9650516510009766,
- "learning_rate": 1.349190643223758e-06,
- "loss": 0.32384324073791504,
- "mean_token_accuracy": 0.8859044313430786,
- "num_tokens": 23648970.0,
- "step": 2645
- },
- {
- "epoch": 2.0106382978723403,
- "grad_norm": 1.4213180541992188,
- "learning_rate": 1.347331745683542e-06,
- "loss": 0.42391857504844666,
- "mean_token_accuracy": 0.8568997383117676,
- "num_tokens": 23663012.0,
- "step": 2646
- },
- {
- "epoch": 2.0113981762917934,
- "grad_norm": 1.852386236190796,
- "learning_rate": 1.3454736571302761e-06,
- "loss": 0.37283188104629517,
- "mean_token_accuracy": 0.9096506834030151,
- "num_tokens": 23671632.0,
- "step": 2647
- },
- {
- "epoch": 2.012158054711246,
- "grad_norm": 1.8350872993469238,
- "learning_rate": 1.3436163788680411e-06,
- "loss": 0.21148793399333954,
- "mean_token_accuracy": 0.9306647181510925,
- "num_tokens": 23678554.0,
- "step": 2648
- },
- {
- "epoch": 2.012917933130699,
- "grad_norm": 1.8285188674926758,
- "learning_rate": 1.3417599122003464e-06,
- "loss": 0.2638583183288574,
- "mean_token_accuracy": 0.904695987701416,
- "num_tokens": 23686905.0,
- "step": 2649
- },
- {
- "epoch": 2.013677811550152,
- "grad_norm": 1.1955424547195435,
- "learning_rate": 1.3399042584301298e-06,
- "loss": 0.30598434805870056,
- "mean_token_accuracy": 0.8953701257705688,
- "num_tokens": 23702734.0,
- "step": 2650
- },
- {
- "epoch": 2.014437689969605,
- "grad_norm": 1.5378512144088745,
- "learning_rate": 1.3380494188597603e-06,
- "loss": 0.33754611015319824,
- "mean_token_accuracy": 0.9063926935195923,
- "num_tokens": 23715891.0,
- "step": 2651
- },
- {
- "epoch": 2.0151975683890577,
- "grad_norm": 1.6957111358642578,
- "learning_rate": 1.3361953947910394e-06,
- "loss": 0.26302939653396606,
- "mean_token_accuracy": 0.90192711353302,
- "num_tokens": 23724034.0,
- "step": 2652
- },
- {
- "epoch": 2.015957446808511,
- "grad_norm": 1.1756837368011475,
- "learning_rate": 1.334342187525189e-06,
- "loss": 0.3312695622444153,
- "mean_token_accuracy": 0.870500385761261,
- "num_tokens": 23741241.0,
- "step": 2653
- },
- {
- "epoch": 2.0167173252279635,
- "grad_norm": 1.027145266532898,
- "learning_rate": 1.3324897983628621e-06,
- "loss": 0.2534530758857727,
- "mean_token_accuracy": 0.894199550151825,
- "num_tokens": 23758399.0,
- "step": 2654
- },
- {
- "epoch": 2.0174772036474162,
- "grad_norm": 2.2585113048553467,
- "learning_rate": 1.330638228604137e-06,
- "loss": 0.4558389186859131,
- "mean_token_accuracy": 0.8372241258621216,
- "num_tokens": 23766871.0,
- "step": 2655
- },
- {
- "epoch": 2.0182370820668694,
- "grad_norm": 1.886893630027771,
- "learning_rate": 1.3287874795485168e-06,
- "loss": 0.29894912242889404,
- "mean_token_accuracy": 0.9086098670959473,
- "num_tokens": 23774935.0,
- "step": 2656
- },
- {
- "epoch": 2.018996960486322,
- "grad_norm": 2.082537889480591,
- "learning_rate": 1.3269375524949286e-06,
- "loss": 0.39323803782463074,
- "mean_token_accuracy": 0.8598287105560303,
- "num_tokens": 23781303.0,
- "step": 2657
- },
- {
- "epoch": 2.019756838905775,
- "grad_norm": 1.7059803009033203,
- "learning_rate": 1.3250884487417227e-06,
- "loss": 0.17909850180149078,
- "mean_token_accuracy": 0.9276094436645508,
- "num_tokens": 23789148.0,
- "step": 2658
- },
- {
- "epoch": 2.020516717325228,
- "grad_norm": 2.150275945663452,
- "learning_rate": 1.3232401695866686e-06,
- "loss": 0.3707781434059143,
- "mean_token_accuracy": 0.8587700128555298,
- "num_tokens": 23795484.0,
- "step": 2659
- },
- {
- "epoch": 2.021276595744681,
- "grad_norm": 2.0554518699645996,
- "learning_rate": 1.321392716326963e-06,
- "loss": 0.33217954635620117,
- "mean_token_accuracy": 0.874828577041626,
- "num_tokens": 23802968.0,
- "step": 2660
- },
- {
- "epoch": 2.0220364741641337,
- "grad_norm": 2.4556071758270264,
- "learning_rate": 1.3195460902592193e-06,
- "loss": 0.2790899872779846,
- "mean_token_accuracy": 0.9071618914604187,
- "num_tokens": 23807788.0,
- "step": 2661
- },
- {
- "epoch": 2.022796352583587,
- "grad_norm": 1.7501509189605713,
- "learning_rate": 1.3177002926794685e-06,
- "loss": 0.3080750107765198,
- "mean_token_accuracy": 0.8942672610282898,
- "num_tokens": 23816023.0,
- "step": 2662
- },
- {
- "epoch": 2.0235562310030395,
- "grad_norm": 1.3934804201126099,
- "learning_rate": 1.3158553248831658e-06,
- "loss": 0.286912202835083,
- "mean_token_accuracy": 0.9284837245941162,
- "num_tokens": 23827186.0,
- "step": 2663
- },
- {
- "epoch": 2.024316109422492,
- "grad_norm": 1.2530465126037598,
- "learning_rate": 1.3140111881651773e-06,
- "loss": 0.2630627155303955,
- "mean_token_accuracy": 0.9029854536056519,
- "num_tokens": 23841399.0,
- "step": 2664
- },
- {
- "epoch": 2.0250759878419453,
- "grad_norm": 1.3417384624481201,
- "learning_rate": 1.312167883819791e-06,
- "loss": 0.37794870138168335,
- "mean_token_accuracy": 0.8722256422042847,
- "num_tokens": 23856061.0,
- "step": 2665
- },
- {
- "epoch": 2.025835866261398,
- "grad_norm": 2.234257698059082,
- "learning_rate": 1.3103254131407082e-06,
- "loss": 0.2739933133125305,
- "mean_token_accuracy": 0.9055665135383606,
- "num_tokens": 23861865.0,
- "step": 2666
- },
- {
- "epoch": 2.026595744680851,
- "grad_norm": 1.4187006950378418,
- "learning_rate": 1.308483777421046e-06,
- "loss": 0.24370817840099335,
- "mean_token_accuracy": 0.9145886301994324,
- "num_tokens": 23873632.0,
- "step": 2667
- },
- {
- "epoch": 2.027355623100304,
- "grad_norm": 2.3645882606506348,
- "learning_rate": 1.3066429779533352e-06,
- "loss": 0.23659822344779968,
- "mean_token_accuracy": 0.9209753274917603,
- "num_tokens": 23878866.0,
- "step": 2668
- },
- {
- "epoch": 2.028115501519757,
- "grad_norm": 1.4782226085662842,
- "learning_rate": 1.3048030160295196e-06,
- "loss": 0.3353138267993927,
- "mean_token_accuracy": 0.8747807741165161,
- "num_tokens": 23891089.0,
- "step": 2669
- },
- {
- "epoch": 2.0288753799392096,
- "grad_norm": 2.051754951477051,
- "learning_rate": 1.3029638929409555e-06,
- "loss": 0.2905973196029663,
- "mean_token_accuracy": 0.887441873550415,
- "num_tokens": 23897653.0,
- "step": 2670
- },
- {
- "epoch": 2.0296352583586628,
- "grad_norm": 1.322279453277588,
- "learning_rate": 1.3011256099784103e-06,
- "loss": 0.3938416540622711,
- "mean_token_accuracy": 0.8911079168319702,
- "num_tokens": 23912525.0,
- "step": 2671
- },
- {
- "epoch": 2.0303951367781155,
- "grad_norm": 1.87980318069458,
- "learning_rate": 1.2992881684320627e-06,
- "loss": 0.16637520492076874,
- "mean_token_accuracy": 0.9472321271896362,
- "num_tokens": 23918752.0,
- "step": 2672
- },
- {
- "epoch": 2.0311550151975686,
- "grad_norm": 2.0867233276367188,
- "learning_rate": 1.297451569591498e-06,
- "loss": 0.37282776832580566,
- "mean_token_accuracy": 0.8688399195671082,
- "num_tokens": 23925918.0,
- "step": 2673
- },
- {
- "epoch": 2.0319148936170213,
- "grad_norm": 1.129468560218811,
- "learning_rate": 1.2956158147457116e-06,
- "loss": 0.33072173595428467,
- "mean_token_accuracy": 0.8788217306137085,
- "num_tokens": 23944702.0,
- "step": 2674
- },
- {
- "epoch": 2.032674772036474,
- "grad_norm": 3.6016290187835693,
- "learning_rate": 1.2937809051831102e-06,
- "loss": 0.28343498706817627,
- "mean_token_accuracy": 0.911794900894165,
- "num_tokens": 23948417.0,
- "step": 2675
- },
- {
- "epoch": 2.033434650455927,
- "grad_norm": 1.4904811382293701,
- "learning_rate": 1.2919468421915008e-06,
- "loss": 0.4072638750076294,
- "mean_token_accuracy": 0.8615934252738953,
- "num_tokens": 23963654.0,
- "step": 2676
- },
- {
- "epoch": 2.0341945288753798,
- "grad_norm": 2.90740704536438,
- "learning_rate": 1.2901136270580994e-06,
- "loss": 0.3685106635093689,
- "mean_token_accuracy": 0.8923419713973999,
- "num_tokens": 23968608.0,
- "step": 2677
- },
- {
- "epoch": 2.034954407294833,
- "grad_norm": 1.8772104978561401,
- "learning_rate": 1.2882812610695305e-06,
- "loss": 0.2947828471660614,
- "mean_token_accuracy": 0.9065762758255005,
- "num_tokens": 23978298.0,
- "step": 2678
- },
- {
- "epoch": 2.0357142857142856,
- "grad_norm": 1.2135536670684814,
- "learning_rate": 1.2864497455118152e-06,
- "loss": 0.36015012860298157,
- "mean_token_accuracy": 0.8481813073158264,
- "num_tokens": 23995784.0,
- "step": 2679
- },
- {
- "epoch": 2.0364741641337387,
- "grad_norm": 1.941889762878418,
- "learning_rate": 1.2846190816703836e-06,
- "loss": 0.3004198670387268,
- "mean_token_accuracy": 0.8843618631362915,
- "num_tokens": 24002651.0,
- "step": 2680
- },
- {
- "epoch": 2.0372340425531914,
- "grad_norm": 1.8905075788497925,
- "learning_rate": 1.2827892708300648e-06,
- "loss": 0.26640570163726807,
- "mean_token_accuracy": 0.9079146385192871,
- "num_tokens": 24010400.0,
- "step": 2681
- },
- {
- "epoch": 2.0379939209726445,
- "grad_norm": 1.2975934743881226,
- "learning_rate": 1.280960314275092e-06,
- "loss": 0.19093887507915497,
- "mean_token_accuracy": 0.9277223348617554,
- "num_tokens": 24021528.0,
- "step": 2682
- },
- {
- "epoch": 2.038753799392097,
- "grad_norm": 1.6483098268508911,
- "learning_rate": 1.279132213289096e-06,
- "loss": 0.29260069131851196,
- "mean_token_accuracy": 0.892486572265625,
- "num_tokens": 24030470.0,
- "step": 2683
- },
- {
- "epoch": 2.0395136778115504,
- "grad_norm": 1.6875916719436646,
- "learning_rate": 1.2773049691551103e-06,
- "loss": 0.3784627914428711,
- "mean_token_accuracy": 0.8682783842086792,
- "num_tokens": 24041608.0,
- "step": 2684
- },
- {
- "epoch": 2.040273556231003,
- "grad_norm": 2.1055848598480225,
- "learning_rate": 1.2754785831555617e-06,
- "loss": 0.14676237106323242,
- "mean_token_accuracy": 0.9532995223999023,
- "num_tokens": 24046687.0,
- "step": 2685
- },
- {
- "epoch": 2.0410334346504557,
- "grad_norm": 1.3862961530685425,
- "learning_rate": 1.273653056572282e-06,
- "loss": 0.34408485889434814,
- "mean_token_accuracy": 0.8748919367790222,
- "num_tokens": 24059147.0,
- "step": 2686
- },
- {
- "epoch": 2.041793313069909,
- "grad_norm": 2.936876058578491,
- "learning_rate": 1.2718283906864939e-06,
- "loss": 0.2471027672290802,
- "mean_token_accuracy": 0.9177526235580444,
- "num_tokens": 24062963.0,
- "step": 2687
- },
- {
- "epoch": 2.0425531914893615,
- "grad_norm": 1.3992520570755005,
- "learning_rate": 1.2700045867788184e-06,
- "loss": 0.421109139919281,
- "mean_token_accuracy": 0.8664785623550415,
- "num_tokens": 24077912.0,
- "step": 2688
- },
- {
- "epoch": 2.0433130699088147,
- "grad_norm": 3.0531985759735107,
- "learning_rate": 1.2681816461292715e-06,
- "loss": 0.292591392993927,
- "mean_token_accuracy": 0.8992351293563843,
- "num_tokens": 24082058.0,
- "step": 2689
- },
- {
- "epoch": 2.0440729483282674,
- "grad_norm": 1.4562251567840576,
- "learning_rate": 1.2663595700172631e-06,
- "loss": 0.39367130398750305,
- "mean_token_accuracy": 0.8894597887992859,
- "num_tokens": 24093954.0,
- "step": 2690
- },
- {
- "epoch": 2.0448328267477205,
- "grad_norm": 1.9354028701782227,
- "learning_rate": 1.2645383597215965e-06,
- "loss": 0.28203579783439636,
- "mean_token_accuracy": 0.9011955261230469,
- "num_tokens": 24100590.0,
- "step": 2691
- },
- {
- "epoch": 2.045592705167173,
- "grad_norm": 1.5010690689086914,
- "learning_rate": 1.2627180165204671e-06,
- "loss": 0.3463609516620636,
- "mean_token_accuracy": 0.8978298306465149,
- "num_tokens": 24111104.0,
- "step": 2692
- },
- {
- "epoch": 2.0463525835866263,
- "grad_norm": 2.585813045501709,
- "learning_rate": 1.2608985416914616e-06,
- "loss": 0.2142711877822876,
- "mean_token_accuracy": 0.9260460138320923,
- "num_tokens": 24115301.0,
- "step": 2693
- },
- {
- "epoch": 2.047112462006079,
- "grad_norm": 2.317268133163452,
- "learning_rate": 1.259079936511558e-06,
- "loss": 0.14454546570777893,
- "mean_token_accuracy": 0.9498077034950256,
- "num_tokens": 24120295.0,
- "step": 2694
- },
- {
- "epoch": 2.047872340425532,
- "grad_norm": 1.966550350189209,
- "learning_rate": 1.257262202257124e-06,
- "loss": 0.20745311677455902,
- "mean_token_accuracy": 0.9157166481018066,
- "num_tokens": 24127158.0,
- "step": 2695
- },
- {
- "epoch": 2.048632218844985,
- "grad_norm": 1.6521401405334473,
- "learning_rate": 1.2554453402039124e-06,
- "loss": 0.2547406256198883,
- "mean_token_accuracy": 0.9356101751327515,
- "num_tokens": 24135620.0,
- "step": 2696
- },
- {
- "epoch": 2.0493920972644375,
- "grad_norm": 2.341756582260132,
- "learning_rate": 1.2536293516270704e-06,
- "loss": 0.35540008544921875,
- "mean_token_accuracy": 0.874363899230957,
- "num_tokens": 24141766.0,
- "step": 2697
- },
- {
- "epoch": 2.0501519756838906,
- "grad_norm": 1.7938716411590576,
- "learning_rate": 1.251814237801128e-06,
- "loss": 0.37250861525535583,
- "mean_token_accuracy": 0.8644422292709351,
- "num_tokens": 24149997.0,
- "step": 2698
- },
- {
- "epoch": 2.0509118541033433,
- "grad_norm": 2.0868122577667236,
- "learning_rate": 1.2500000000000007e-06,
- "loss": 0.44527092576026917,
- "mean_token_accuracy": 0.8510264158248901,
- "num_tokens": 24158208.0,
- "step": 2699
- },
- {
- "epoch": 2.0516717325227964,
- "grad_norm": 2.412604808807373,
- "learning_rate": 1.24818663949699e-06,
- "loss": 0.19276219606399536,
- "mean_token_accuracy": 0.9317681789398193,
- "num_tokens": 24162905.0,
- "step": 2700
- },
- {
- "epoch": 2.052431610942249,
- "grad_norm": 1.4488455057144165,
- "learning_rate": 1.246374157564785e-06,
- "loss": 0.3493705093860626,
- "mean_token_accuracy": 0.9016396999359131,
- "num_tokens": 24175852.0,
- "step": 2701
- },
- {
- "epoch": 2.0531914893617023,
- "grad_norm": 2.1629185676574707,
- "learning_rate": 1.2445625554754526e-06,
- "loss": 0.30588388442993164,
- "mean_token_accuracy": 0.8871392011642456,
- "num_tokens": 24181507.0,
- "step": 2702
- },
- {
- "epoch": 2.053951367781155,
- "grad_norm": 2.0489449501037598,
- "learning_rate": 1.2427518345004459e-06,
- "loss": 0.4578161835670471,
- "mean_token_accuracy": 0.8498104214668274,
- "num_tokens": 24191918.0,
- "step": 2703
- },
- {
- "epoch": 2.054711246200608,
- "grad_norm": 2.063019037246704,
- "learning_rate": 1.2409419959105981e-06,
- "loss": 0.31680572032928467,
- "mean_token_accuracy": 0.8809083700180054,
- "num_tokens": 24199336.0,
- "step": 2704
- },
- {
- "epoch": 2.0554711246200608,
- "grad_norm": 2.4594223499298096,
- "learning_rate": 1.239133040976124e-06,
- "loss": 0.3048282265663147,
- "mean_token_accuracy": 0.8897095322608948,
- "num_tokens": 24205118.0,
- "step": 2705
- },
- {
- "epoch": 2.056231003039514,
- "grad_norm": 1.6359999179840088,
- "learning_rate": 1.237324970966618e-06,
- "loss": 0.4312370717525482,
- "mean_token_accuracy": 0.8526142835617065,
- "num_tokens": 24215792.0,
- "step": 2706
- },
- {
- "epoch": 2.0569908814589666,
- "grad_norm": 1.5534536838531494,
- "learning_rate": 1.2355177871510538e-06,
- "loss": 0.3647908568382263,
- "mean_token_accuracy": 0.8680631518363953,
- "num_tokens": 24235325.0,
- "step": 2707
- },
- {
- "epoch": 2.0577507598784193,
- "grad_norm": 2.4902515411376953,
- "learning_rate": 1.2337114907977798e-06,
- "loss": 0.3605276942253113,
- "mean_token_accuracy": 0.8776376843452454,
- "num_tokens": 24241502.0,
- "step": 2708
- },
- {
- "epoch": 2.0585106382978724,
- "grad_norm": 1.7282993793487549,
- "learning_rate": 1.2319060831745273e-06,
- "loss": 0.38326722383499146,
- "mean_token_accuracy": 0.8531644344329834,
- "num_tokens": 24252665.0,
- "step": 2709
- },
- {
- "epoch": 2.059270516717325,
- "grad_norm": 1.4213361740112305,
- "learning_rate": 1.2301015655484006e-06,
- "loss": 0.32221150398254395,
- "mean_token_accuracy": 0.8890664577484131,
- "num_tokens": 24266409.0,
- "step": 2710
- },
- {
- "epoch": 2.060030395136778,
- "grad_norm": 2.6412453651428223,
- "learning_rate": 1.2282979391858767e-06,
- "loss": 0.20225220918655396,
- "mean_token_accuracy": 0.9287782311439514,
- "num_tokens": 24271069.0,
- "step": 2711
- },
- {
- "epoch": 2.060790273556231,
- "grad_norm": 3.2601654529571533,
- "learning_rate": 1.2264952053528145e-06,
- "loss": 0.23259003460407257,
- "mean_token_accuracy": 0.9290606379508972,
- "num_tokens": 24274992.0,
- "step": 2712
- },
- {
- "epoch": 2.061550151975684,
- "grad_norm": 1.6633410453796387,
- "learning_rate": 1.2246933653144386e-06,
- "loss": 0.355314165353775,
- "mean_token_accuracy": 0.870380163192749,
- "num_tokens": 24284917.0,
- "step": 2713
- },
- {
- "epoch": 2.0623100303951367,
- "grad_norm": 2.9081318378448486,
- "learning_rate": 1.2228924203353507e-06,
- "loss": 0.38050833344459534,
- "mean_token_accuracy": 0.8879997730255127,
- "num_tokens": 24289694.0,
- "step": 2714
- },
- {
- "epoch": 2.06306990881459,
- "grad_norm": 3.2404227256774902,
- "learning_rate": 1.2210923716795233e-06,
- "loss": 0.2502570152282715,
- "mean_token_accuracy": 0.9150978922843933,
- "num_tokens": 24293254.0,
- "step": 2715
- },
- {
- "epoch": 2.0638297872340425,
- "grad_norm": 1.9262174367904663,
- "learning_rate": 1.2192932206103e-06,
- "loss": 0.26763200759887695,
- "mean_token_accuracy": 0.9203122854232788,
- "num_tokens": 24300881.0,
- "step": 2716
- },
- {
- "epoch": 2.0645896656534957,
- "grad_norm": 1.6790109872817993,
- "learning_rate": 1.2174949683903943e-06,
- "loss": 0.22275440394878387,
- "mean_token_accuracy": 0.9212621450424194,
- "num_tokens": 24309288.0,
- "step": 2717
- },
- {
- "epoch": 2.0653495440729484,
- "grad_norm": 1.8272414207458496,
- "learning_rate": 1.2156976162818895e-06,
- "loss": 0.3183424472808838,
- "mean_token_accuracy": 0.8813169002532959,
- "num_tokens": 24316980.0,
- "step": 2718
- },
- {
- "epoch": 2.066109422492401,
- "grad_norm": 2.7388651371002197,
- "learning_rate": 1.2139011655462338e-06,
- "loss": 0.24794816970825195,
- "mean_token_accuracy": 0.9109550714492798,
- "num_tokens": 24321867.0,
- "step": 2719
- },
- {
- "epoch": 2.066869300911854,
- "grad_norm": 1.4866925477981567,
- "learning_rate": 1.2121056174442484e-06,
- "loss": 0.24177205562591553,
- "mean_token_accuracy": 0.9102780818939209,
- "num_tokens": 24332874.0,
- "step": 2720
- },
- {
- "epoch": 2.067629179331307,
- "grad_norm": 1.6006059646606445,
- "learning_rate": 1.2103109732361178e-06,
- "loss": 0.29220807552337646,
- "mean_token_accuracy": 0.8947570323944092,
- "num_tokens": 24342790.0,
- "step": 2721
- },
- {
- "epoch": 2.06838905775076,
- "grad_norm": 2.2688677310943604,
- "learning_rate": 1.208517234181391e-06,
- "loss": 0.39247143268585205,
- "mean_token_accuracy": 0.8514304161071777,
- "num_tokens": 24349329.0,
- "step": 2722
- },
- {
- "epoch": 2.0691489361702127,
- "grad_norm": 2.404534339904785,
- "learning_rate": 1.2067244015389829e-06,
- "loss": 0.4461793303489685,
- "mean_token_accuracy": 0.8531662821769714,
- "num_tokens": 24356287.0,
- "step": 2723
- },
- {
- "epoch": 2.069908814589666,
- "grad_norm": 1.813341498374939,
- "learning_rate": 1.204932476567175e-06,
- "loss": 0.38300177454948425,
- "mean_token_accuracy": 0.8597674369812012,
- "num_tokens": 24366181.0,
- "step": 2724
- },
- {
- "epoch": 2.0706686930091185,
- "grad_norm": 3.49125337600708,
- "learning_rate": 1.2031414605236066e-06,
- "loss": 0.33281540870666504,
- "mean_token_accuracy": 0.8774969577789307,
- "num_tokens": 24370362.0,
- "step": 2725
- },
- {
- "epoch": 2.0714285714285716,
- "grad_norm": 1.7682114839553833,
- "learning_rate": 1.2013513546652827e-06,
- "loss": 0.3001813590526581,
- "mean_token_accuracy": 0.8840254545211792,
- "num_tokens": 24380469.0,
- "step": 2726
- },
- {
- "epoch": 2.0721884498480243,
- "grad_norm": 2.3688952922821045,
- "learning_rate": 1.1995621602485685e-06,
- "loss": 0.20055249333381653,
- "mean_token_accuracy": 0.9246129989624023,
- "num_tokens": 24385474.0,
- "step": 2727
- },
- {
- "epoch": 2.072948328267477,
- "grad_norm": 2.3368382453918457,
- "learning_rate": 1.1977738785291894e-06,
- "loss": 0.18379954993724823,
- "mean_token_accuracy": 0.9385529160499573,
- "num_tokens": 24390002.0,
- "step": 2728
- },
- {
- "epoch": 2.07370820668693,
- "grad_norm": 1.857473373413086,
- "learning_rate": 1.1959865107622306e-06,
- "loss": 0.4606894552707672,
- "mean_token_accuracy": 0.8437427282333374,
- "num_tokens": 24400880.0,
- "step": 2729
- },
- {
- "epoch": 2.074468085106383,
- "grad_norm": 1.2714136838912964,
- "learning_rate": 1.1942000582021355e-06,
- "loss": 0.21171459555625916,
- "mean_token_accuracy": 0.9216019511222839,
- "num_tokens": 24413113.0,
- "step": 2730
- },
- {
- "epoch": 2.075227963525836,
- "grad_norm": 2.2025210857391357,
- "learning_rate": 1.1924145221027048e-06,
- "loss": 0.44211941957473755,
- "mean_token_accuracy": 0.8538386821746826,
- "num_tokens": 24420504.0,
- "step": 2731
- },
- {
- "epoch": 2.0759878419452886,
- "grad_norm": 1.6706589460372925,
- "learning_rate": 1.190629903717097e-06,
- "loss": 0.35163265466690063,
- "mean_token_accuracy": 0.8716240525245667,
- "num_tokens": 24430203.0,
- "step": 2732
- },
- {
- "epoch": 2.0767477203647418,
- "grad_norm": 2.299182176589966,
- "learning_rate": 1.1888462042978268e-06,
- "loss": 0.30983975529670715,
- "mean_token_accuracy": 0.8859797716140747,
- "num_tokens": 24437387.0,
- "step": 2733
- },
- {
- "epoch": 2.0775075987841944,
- "grad_norm": 2.975123167037964,
- "learning_rate": 1.1870634250967606e-06,
- "loss": 0.23585952818393707,
- "mean_token_accuracy": 0.9167368412017822,
- "num_tokens": 24441176.0,
- "step": 2734
- },
- {
- "epoch": 2.0782674772036476,
- "grad_norm": 1.1052464246749878,
- "learning_rate": 1.1852815673651246e-06,
- "loss": 0.24136316776275635,
- "mean_token_accuracy": 0.8897353410720825,
- "num_tokens": 24457092.0,
- "step": 2735
- },
- {
- "epoch": 2.0790273556231003,
- "grad_norm": 1.5531870126724243,
- "learning_rate": 1.1835006323534926e-06,
- "loss": 0.302223265171051,
- "mean_token_accuracy": 0.8940514326095581,
- "num_tokens": 24467643.0,
- "step": 2736
- },
- {
- "epoch": 2.0797872340425534,
- "grad_norm": 1.706140398979187,
- "learning_rate": 1.1817206213117943e-06,
- "loss": 0.39235255122184753,
- "mean_token_accuracy": 0.8615218997001648,
- "num_tokens": 24477715.0,
- "step": 2737
- },
- {
- "epoch": 2.080547112462006,
- "grad_norm": 2.1109750270843506,
- "learning_rate": 1.1799415354893103e-06,
- "loss": 0.2526751756668091,
- "mean_token_accuracy": 0.9108465909957886,
- "num_tokens": 24484248.0,
- "step": 2738
- },
- {
- "epoch": 2.0813069908814588,
- "grad_norm": 1.9943277835845947,
- "learning_rate": 1.178163376134671e-06,
- "loss": 0.3540172874927521,
- "mean_token_accuracy": 0.9131139516830444,
- "num_tokens": 24492207.0,
- "step": 2739
- },
- {
- "epoch": 2.082066869300912,
- "grad_norm": 1.9536099433898926,
- "learning_rate": 1.1763861444958573e-06,
- "loss": 0.3902950584888458,
- "mean_token_accuracy": 0.8611530065536499,
- "num_tokens": 24501567.0,
- "step": 2740
- },
- {
- "epoch": 2.0828267477203646,
- "grad_norm": 3.146925926208496,
- "learning_rate": 1.1746098418201987e-06,
- "loss": 0.43440669775009155,
- "mean_token_accuracy": 0.8709320425987244,
- "num_tokens": 24506684.0,
- "step": 2741
- },
- {
- "epoch": 2.0835866261398177,
- "grad_norm": 2.763427495956421,
- "learning_rate": 1.172834469354373e-06,
- "loss": 0.3513452410697937,
- "mean_token_accuracy": 0.8774256110191345,
- "num_tokens": 24511509.0,
- "step": 2742
- },
- {
- "epoch": 2.0843465045592704,
- "grad_norm": 2.773829221725464,
- "learning_rate": 1.1710600283444048e-06,
- "loss": 0.24668049812316895,
- "mean_token_accuracy": 0.9146889448165894,
- "num_tokens": 24516030.0,
- "step": 2743
- },
- {
- "epoch": 2.0851063829787235,
- "grad_norm": 1.666471242904663,
- "learning_rate": 1.169286520035666e-06,
- "loss": 0.36206915974617004,
- "mean_token_accuracy": 0.8711973428726196,
- "num_tokens": 24526656.0,
- "step": 2744
- },
- {
- "epoch": 2.085866261398176,
- "grad_norm": 2.818890333175659,
- "learning_rate": 1.1675139456728702e-06,
- "loss": 0.32967281341552734,
- "mean_token_accuracy": 0.880983829498291,
- "num_tokens": 24531625.0,
- "step": 2745
- },
- {
- "epoch": 2.0866261398176293,
- "grad_norm": 1.09058678150177,
- "learning_rate": 1.1657423065000811e-06,
- "loss": 0.36224377155303955,
- "mean_token_accuracy": 0.8708326816558838,
- "num_tokens": 24557123.0,
- "step": 2746
- },
- {
- "epoch": 2.087386018237082,
- "grad_norm": 1.1434987783432007,
- "learning_rate": 1.1639716037607036e-06,
- "loss": 0.26490458846092224,
- "mean_token_accuracy": 0.9131897687911987,
- "num_tokens": 24573223.0,
- "step": 2747
- },
- {
- "epoch": 2.088145896656535,
- "grad_norm": 2.437505006790161,
- "learning_rate": 1.1622018386974829e-06,
- "loss": 0.18964408338069916,
- "mean_token_accuracy": 0.9271818399429321,
- "num_tokens": 24578306.0,
- "step": 2748
- },
- {
- "epoch": 2.088905775075988,
- "grad_norm": 1.797308325767517,
- "learning_rate": 1.160433012552508e-06,
- "loss": 0.3090781569480896,
- "mean_token_accuracy": 0.8960750102996826,
- "num_tokens": 24587562.0,
- "step": 2749
- },
- {
- "epoch": 2.0896656534954405,
- "grad_norm": 2.4050841331481934,
- "learning_rate": 1.1586651265672122e-06,
- "loss": 0.4001041054725647,
- "mean_token_accuracy": 0.8588370084762573,
- "num_tokens": 24594223.0,
- "step": 2750
- },
- {
- "epoch": 2.0904255319148937,
- "grad_norm": 1.8757156133651733,
- "learning_rate": 1.1568981819823636e-06,
- "loss": 0.37845075130462646,
- "mean_token_accuracy": 0.866146445274353,
- "num_tokens": 24602556.0,
- "step": 2751
- },
- {
- "epoch": 2.0911854103343464,
- "grad_norm": 1.8205114603042603,
- "learning_rate": 1.1551321800380722e-06,
- "loss": 0.24738016724586487,
- "mean_token_accuracy": 0.923284113407135,
- "num_tokens": 24611627.0,
- "step": 2752
- },
- {
- "epoch": 2.0919452887537995,
- "grad_norm": 2.107512950897217,
- "learning_rate": 1.153367121973786e-06,
- "loss": 0.3062688410282135,
- "mean_token_accuracy": 0.8909003734588623,
- "num_tokens": 24619569.0,
- "step": 2753
- },
- {
- "epoch": 2.092705167173252,
- "grad_norm": 1.93110191822052,
- "learning_rate": 1.1516030090282915e-06,
- "loss": 0.38658422231674194,
- "mean_token_accuracy": 0.869437038898468,
- "num_tokens": 24628869.0,
- "step": 2754
- },
- {
- "epoch": 2.0934650455927053,
- "grad_norm": 2.3618004322052,
- "learning_rate": 1.1498398424397106e-06,
- "loss": 0.19193072617053986,
- "mean_token_accuracy": 0.9329519271850586,
- "num_tokens": 24633724.0,
- "step": 2755
- },
- {
- "epoch": 2.094224924012158,
- "grad_norm": 2.274510622024536,
- "learning_rate": 1.1480776234455024e-06,
- "loss": 0.24939998984336853,
- "mean_token_accuracy": 0.9104958772659302,
- "num_tokens": 24642762.0,
- "step": 2756
- },
- {
- "epoch": 2.094984802431611,
- "grad_norm": 1.7468934059143066,
- "learning_rate": 1.1463163532824572e-06,
- "loss": 0.3876607418060303,
- "mean_token_accuracy": 0.8540539145469666,
- "num_tokens": 24652138.0,
- "step": 2757
- },
- {
- "epoch": 2.095744680851064,
- "grad_norm": 2.905381441116333,
- "learning_rate": 1.1445560331867054e-06,
- "loss": 0.33666878938674927,
- "mean_token_accuracy": 0.8805598616600037,
- "num_tokens": 24656612.0,
- "step": 2758
- },
- {
- "epoch": 2.096504559270517,
- "grad_norm": 1.5513007640838623,
- "learning_rate": 1.142796664393707e-06,
- "loss": 0.25168463587760925,
- "mean_token_accuracy": 0.925534725189209,
- "num_tokens": 24667132.0,
- "step": 2759
- },
- {
- "epoch": 2.0972644376899696,
- "grad_norm": 1.6804249286651611,
- "learning_rate": 1.141038248138253e-06,
- "loss": 0.3862859010696411,
- "mean_token_accuracy": 0.8686253428459167,
- "num_tokens": 24679274.0,
- "step": 2760
- },
- {
- "epoch": 2.0980243161094223,
- "grad_norm": 1.7432880401611328,
- "learning_rate": 1.1392807856544682e-06,
- "loss": 0.3200700879096985,
- "mean_token_accuracy": 0.9188123941421509,
- "num_tokens": 24688628.0,
- "step": 2761
- },
- {
- "epoch": 2.0987841945288754,
- "grad_norm": 1.8734468221664429,
- "learning_rate": 1.1375242781758077e-06,
- "loss": 0.34758424758911133,
- "mean_token_accuracy": 0.8724187016487122,
- "num_tokens": 24698159.0,
- "step": 2762
- },
- {
- "epoch": 2.099544072948328,
- "grad_norm": 3.7156829833984375,
- "learning_rate": 1.1357687269350564e-06,
- "loss": 0.30014732480049133,
- "mean_token_accuracy": 0.9021577835083008,
- "num_tokens": 24701797.0,
- "step": 2763
- },
- {
- "epoch": 2.1003039513677813,
- "grad_norm": 1.5196985006332397,
- "learning_rate": 1.1340141331643276e-06,
- "loss": 0.45747464895248413,
- "mean_token_accuracy": 0.839891791343689,
- "num_tokens": 24716468.0,
- "step": 2764
- },
- {
- "epoch": 2.101063829787234,
- "grad_norm": 1.978009581565857,
- "learning_rate": 1.132260498095062e-06,
- "loss": 0.3130183815956116,
- "mean_token_accuracy": 0.90610271692276,
- "num_tokens": 24723211.0,
- "step": 2765
- },
- {
- "epoch": 2.101823708206687,
- "grad_norm": 1.5883251428604126,
- "learning_rate": 1.1305078229580294e-06,
- "loss": 0.30493029952049255,
- "mean_token_accuracy": 0.8889745473861694,
- "num_tokens": 24733839.0,
- "step": 2766
- },
- {
- "epoch": 2.1025835866261398,
- "grad_norm": 1.2397783994674683,
- "learning_rate": 1.128756108983325e-06,
- "loss": 0.2606407105922699,
- "mean_token_accuracy": 0.9061247110366821,
- "num_tokens": 24747488.0,
- "step": 2767
- },
- {
- "epoch": 2.103343465045593,
- "grad_norm": 1.3046784400939941,
- "learning_rate": 1.1270053574003658e-06,
- "loss": 0.38750404119491577,
- "mean_token_accuracy": 0.8777017593383789,
- "num_tokens": 24763893.0,
- "step": 2768
- },
- {
- "epoch": 2.1041033434650456,
- "grad_norm": 1.499266266822815,
- "learning_rate": 1.1252555694379005e-06,
- "loss": 0.4804937243461609,
- "mean_token_accuracy": 0.8344086408615112,
- "num_tokens": 24779323.0,
- "step": 2769
- },
- {
- "epoch": 2.1048632218844983,
- "grad_norm": 1.211094856262207,
- "learning_rate": 1.123506746323997e-06,
- "loss": 0.3579246997833252,
- "mean_token_accuracy": 0.8705919981002808,
- "num_tokens": 24794965.0,
- "step": 2770
- },
- {
- "epoch": 2.1056231003039514,
- "grad_norm": 2.490551471710205,
- "learning_rate": 1.1217588892860446e-06,
- "loss": 0.4084790349006653,
- "mean_token_accuracy": 0.8553222417831421,
- "num_tokens": 24800614.0,
- "step": 2771
- },
- {
- "epoch": 2.106382978723404,
- "grad_norm": 1.5249632596969604,
- "learning_rate": 1.1200119995507572e-06,
- "loss": 0.36853182315826416,
- "mean_token_accuracy": 0.8847414255142212,
- "num_tokens": 24812886.0,
- "step": 2772
- },
- {
- "epoch": 2.107142857142857,
- "grad_norm": 1.8510968685150146,
- "learning_rate": 1.1182660783441719e-06,
- "loss": 0.2918103337287903,
- "mean_token_accuracy": 0.8898224830627441,
- "num_tokens": 24821545.0,
- "step": 2773
- },
- {
- "epoch": 2.10790273556231,
- "grad_norm": 1.7721803188323975,
- "learning_rate": 1.11652112689164e-06,
- "loss": 0.2920452654361725,
- "mean_token_accuracy": 0.8879085779190063,
- "num_tokens": 24831526.0,
- "step": 2774
- },
- {
- "epoch": 2.108662613981763,
- "grad_norm": 1.3987336158752441,
- "learning_rate": 1.1147771464178378e-06,
- "loss": 0.4407062828540802,
- "mean_token_accuracy": 0.8472493886947632,
- "num_tokens": 24845847.0,
- "step": 2775
- },
- {
- "epoch": 2.1094224924012157,
- "grad_norm": 1.8927375078201294,
- "learning_rate": 1.1130341381467569e-06,
- "loss": 0.36293038725852966,
- "mean_token_accuracy": 0.8881135582923889,
- "num_tokens": 24854760.0,
- "step": 2776
- },
- {
- "epoch": 2.110182370820669,
- "grad_norm": 3.0480666160583496,
- "learning_rate": 1.111292103301708e-06,
- "loss": 0.30395108461380005,
- "mean_token_accuracy": 0.9036306142807007,
- "num_tokens": 24859051.0,
- "step": 2777
- },
- {
- "epoch": 2.1109422492401215,
- "grad_norm": 1.5833618640899658,
- "learning_rate": 1.1095510431053176e-06,
- "loss": 0.26424330472946167,
- "mean_token_accuracy": 0.9020674824714661,
- "num_tokens": 24869853.0,
- "step": 2778
- },
- {
- "epoch": 2.1117021276595747,
- "grad_norm": 1.645459532737732,
- "learning_rate": 1.1078109587795311e-06,
- "loss": 0.3563994765281677,
- "mean_token_accuracy": 0.8732106685638428,
- "num_tokens": 24880184.0,
- "step": 2779
- },
- {
- "epoch": 2.1124620060790273,
- "grad_norm": 2.2964093685150146,
- "learning_rate": 1.1060718515456022e-06,
- "loss": 0.19739922881126404,
- "mean_token_accuracy": 0.9273765087127686,
- "num_tokens": 24885398.0,
- "step": 2780
- },
- {
- "epoch": 2.11322188449848,
- "grad_norm": 2.094024181365967,
- "learning_rate": 1.1043337226241075e-06,
- "loss": 0.3321923315525055,
- "mean_token_accuracy": 0.8865819573402405,
- "num_tokens": 24893908.0,
- "step": 2781
- },
- {
- "epoch": 2.113981762917933,
- "grad_norm": 1.9787025451660156,
- "learning_rate": 1.1025965732349318e-06,
- "loss": 0.37631168961524963,
- "mean_token_accuracy": 0.8808693885803223,
- "num_tokens": 24901270.0,
- "step": 2782
- },
- {
- "epoch": 2.114741641337386,
- "grad_norm": 2.376060724258423,
- "learning_rate": 1.100860404597271e-06,
- "loss": 0.2591894268989563,
- "mean_token_accuracy": 0.9174780249595642,
- "num_tokens": 24906578.0,
- "step": 2783
- },
- {
- "epoch": 2.115501519756839,
- "grad_norm": 1.0967903137207031,
- "learning_rate": 1.0991252179296389e-06,
- "loss": 0.26626938581466675,
- "mean_token_accuracy": 0.9305505752563477,
- "num_tokens": 24922329.0,
- "step": 2784
- },
- {
- "epoch": 2.1162613981762917,
- "grad_norm": 3.3701183795928955,
- "learning_rate": 1.0973910144498534e-06,
- "loss": 0.2710079848766327,
- "mean_token_accuracy": 0.9095271825790405,
- "num_tokens": 24925777.0,
- "step": 2785
- },
- {
- "epoch": 2.117021276595745,
- "grad_norm": 1.636264681816101,
- "learning_rate": 1.0956577953750461e-06,
- "loss": 0.2995981276035309,
- "mean_token_accuracy": 0.8988568782806396,
- "num_tokens": 24934230.0,
- "step": 2786
- },
- {
- "epoch": 2.1177811550151975,
- "grad_norm": 2.3107731342315674,
- "learning_rate": 1.093925561921657e-06,
- "loss": 0.3424459397792816,
- "mean_token_accuracy": 0.9100210070610046,
- "num_tokens": 24939830.0,
- "step": 2787
- },
- {
- "epoch": 2.1185410334346506,
- "grad_norm": 1.814764380455017,
- "learning_rate": 1.0921943153054343e-06,
- "loss": 0.3182154893875122,
- "mean_token_accuracy": 0.883027195930481,
- "num_tokens": 24947764.0,
- "step": 2788
- },
- {
- "epoch": 2.1193009118541033,
- "grad_norm": 1.693555235862732,
- "learning_rate": 1.0904640567414332e-06,
- "loss": 0.3685447573661804,
- "mean_token_accuracy": 0.8900846242904663,
- "num_tokens": 24957680.0,
- "step": 2789
- },
- {
- "epoch": 2.1200607902735564,
- "grad_norm": 1.0726022720336914,
- "learning_rate": 1.088734787444017e-06,
- "loss": 0.28461548686027527,
- "mean_token_accuracy": 0.9026681184768677,
- "num_tokens": 24975181.0,
- "step": 2790
- },
- {
- "epoch": 2.120820668693009,
- "grad_norm": 1.3013874292373657,
- "learning_rate": 1.0870065086268506e-06,
- "loss": 0.28222548961639404,
- "mean_token_accuracy": 0.9041857719421387,
- "num_tokens": 24993211.0,
- "step": 2791
- },
- {
- "epoch": 2.121580547112462,
- "grad_norm": 2.592106580734253,
- "learning_rate": 1.085279221502909e-06,
- "loss": 0.31733593344688416,
- "mean_token_accuracy": 0.90151047706604,
- "num_tokens": 24998151.0,
- "step": 2792
- },
- {
- "epoch": 2.122340425531915,
- "grad_norm": 2.649210214614868,
- "learning_rate": 1.0835529272844694e-06,
- "loss": 0.341595321893692,
- "mean_token_accuracy": 0.8989696502685547,
- "num_tokens": 25003399.0,
- "step": 2793
- },
- {
- "epoch": 2.1231003039513676,
- "grad_norm": 2.376619577407837,
- "learning_rate": 1.0818276271831094e-06,
- "loss": 0.2770065665245056,
- "mean_token_accuracy": 0.8967875242233276,
- "num_tokens": 25009686.0,
- "step": 2794
- },
- {
- "epoch": 2.1238601823708207,
- "grad_norm": 2.1539604663848877,
- "learning_rate": 1.080103322409711e-06,
- "loss": 0.37501147389411926,
- "mean_token_accuracy": 0.8768513202667236,
- "num_tokens": 25016339.0,
- "step": 2795
- },
- {
- "epoch": 2.1246200607902734,
- "grad_norm": 2.5727670192718506,
- "learning_rate": 1.0783800141744607e-06,
- "loss": 0.31852903962135315,
- "mean_token_accuracy": 0.8897477388381958,
- "num_tokens": 25021410.0,
- "step": 2796
- },
- {
- "epoch": 2.1253799392097266,
- "grad_norm": 2.1428916454315186,
- "learning_rate": 1.0766577036868395e-06,
- "loss": 0.2348000407218933,
- "mean_token_accuracy": 0.9012142419815063,
- "num_tokens": 25027375.0,
- "step": 2797
- },
- {
- "epoch": 2.1261398176291793,
- "grad_norm": 2.4231064319610596,
- "learning_rate": 1.074936392155631e-06,
- "loss": 0.30580806732177734,
- "mean_token_accuracy": 0.8963108658790588,
- "num_tokens": 25033211.0,
- "step": 2798
- },
- {
- "epoch": 2.1268996960486324,
- "grad_norm": 2.1027259826660156,
- "learning_rate": 1.073216080788921e-06,
- "loss": 0.2508814334869385,
- "mean_token_accuracy": 0.9095165729522705,
- "num_tokens": 25040316.0,
- "step": 2799
- },
- {
- "epoch": 2.127659574468085,
- "grad_norm": 1.6513079404830933,
- "learning_rate": 1.0714967707940876e-06,
- "loss": 0.40694183111190796,
- "mean_token_accuracy": 0.8895826935768127,
- "num_tokens": 25054978.0,
- "step": 2800
- },
- {
- "epoch": 2.128419452887538,
- "grad_norm": 2.0551133155822754,
- "learning_rate": 1.0697784633778093e-06,
- "loss": 0.3452662229537964,
- "mean_token_accuracy": 0.8710684776306152,
- "num_tokens": 25062755.0,
- "step": 2801
- },
- {
- "epoch": 2.129179331306991,
- "grad_norm": 2.1780688762664795,
- "learning_rate": 1.0680611597460607e-06,
- "loss": 0.2918209135532379,
- "mean_token_accuracy": 0.8689337968826294,
- "num_tokens": 25069453.0,
- "step": 2802
- },
- {
- "epoch": 2.1299392097264436,
- "grad_norm": 1.7905635833740234,
- "learning_rate": 1.0663448611041114e-06,
- "loss": 0.3535313308238983,
- "mean_token_accuracy": 0.8762770295143127,
- "num_tokens": 25080004.0,
- "step": 2803
- },
- {
- "epoch": 2.1306990881458967,
- "grad_norm": 1.6187241077423096,
- "learning_rate": 1.0646295686565258e-06,
- "loss": 0.3042716681957245,
- "mean_token_accuracy": 0.884156346321106,
- "num_tokens": 25089652.0,
- "step": 2804
- },
- {
- "epoch": 2.1314589665653494,
- "grad_norm": 2.667459011077881,
- "learning_rate": 1.0629152836071633e-06,
- "loss": 0.3904019892215729,
- "mean_token_accuracy": 0.8603606224060059,
- "num_tokens": 25095556.0,
- "step": 2805
- },
- {
- "epoch": 2.1322188449848025,
- "grad_norm": 1.4227970838546753,
- "learning_rate": 1.0612020071591722e-06,
- "loss": 0.3765299320220947,
- "mean_token_accuracy": 0.8655093908309937,
- "num_tokens": 25108963.0,
- "step": 2806
- },
- {
- "epoch": 2.132978723404255,
- "grad_norm": 2.262726068496704,
- "learning_rate": 1.0594897405149994e-06,
- "loss": 0.2727298140525818,
- "mean_token_accuracy": 0.9005513191223145,
- "num_tokens": 25115135.0,
- "step": 2807
- },
- {
- "epoch": 2.1337386018237083,
- "grad_norm": 2.0810186862945557,
- "learning_rate": 1.0577784848763773e-06,
- "loss": 0.4001343250274658,
- "mean_token_accuracy": 0.8537896871566772,
- "num_tokens": 25123079.0,
- "step": 2808
- },
- {
- "epoch": 2.134498480243161,
- "grad_norm": 1.6573376655578613,
- "learning_rate": 1.0560682414443315e-06,
- "loss": 0.4197486340999603,
- "mean_token_accuracy": 0.8549862504005432,
- "num_tokens": 25135398.0,
- "step": 2809
- },
- {
- "epoch": 2.135258358662614,
- "grad_norm": 2.200150489807129,
- "learning_rate": 1.0543590114191768e-06,
- "loss": 0.32026296854019165,
- "mean_token_accuracy": 0.8797904253005981,
- "num_tokens": 25141382.0,
- "step": 2810
- },
- {
- "epoch": 2.136018237082067,
- "grad_norm": 2.678558111190796,
- "learning_rate": 1.0526507960005164e-06,
- "loss": 0.30048054456710815,
- "mean_token_accuracy": 0.8849201202392578,
- "num_tokens": 25146235.0,
- "step": 2811
- },
- {
- "epoch": 2.13677811550152,
- "grad_norm": 1.5207500457763672,
- "learning_rate": 1.0509435963872422e-06,
- "loss": 0.3706427216529846,
- "mean_token_accuracy": 0.8740214109420776,
- "num_tokens": 25157108.0,
- "step": 2812
- },
- {
- "epoch": 2.1375379939209727,
- "grad_norm": 1.4632720947265625,
- "learning_rate": 1.049237413777532e-06,
- "loss": 0.27156776189804077,
- "mean_token_accuracy": 0.8950715661048889,
- "num_tokens": 25167937.0,
- "step": 2813
- },
- {
- "epoch": 2.1382978723404253,
- "grad_norm": 2.101048469543457,
- "learning_rate": 1.0475322493688506e-06,
- "loss": 0.366736501455307,
- "mean_token_accuracy": 0.8700850009918213,
- "num_tokens": 25177043.0,
- "step": 2814
- },
- {
- "epoch": 2.1390577507598785,
- "grad_norm": 2.54221248626709,
- "learning_rate": 1.0458281043579482e-06,
- "loss": 0.20383943617343903,
- "mean_token_accuracy": 0.9226665496826172,
- "num_tokens": 25182105.0,
- "step": 2815
- },
- {
- "epoch": 2.139817629179331,
- "grad_norm": 1.7742674350738525,
- "learning_rate": 1.04412497994086e-06,
- "loss": 0.26852455735206604,
- "mean_token_accuracy": 0.8987031579017639,
- "num_tokens": 25190178.0,
- "step": 2816
- },
- {
- "epoch": 2.1405775075987843,
- "grad_norm": 3.2856075763702393,
- "learning_rate": 1.0424228773129019e-06,
- "loss": 0.24643859267234802,
- "mean_token_accuracy": 0.9189155101776123,
- "num_tokens": 25194105.0,
- "step": 2817
- },
- {
- "epoch": 2.141337386018237,
- "grad_norm": 3.374311923980713,
- "learning_rate": 1.0407217976686777e-06,
- "loss": 0.2575511336326599,
- "mean_token_accuracy": 0.9143530130386353,
- "num_tokens": 25197787.0,
- "step": 2818
- },
- {
- "epoch": 2.14209726443769,
- "grad_norm": 1.4967217445373535,
- "learning_rate": 1.03902174220207e-06,
- "loss": 0.3054750859737396,
- "mean_token_accuracy": 0.8989205360412598,
- "num_tokens": 25209150.0,
- "step": 2819
- },
- {
- "epoch": 2.142857142857143,
- "grad_norm": 2.654459238052368,
- "learning_rate": 1.0373227121062423e-06,
- "loss": 0.27398061752319336,
- "mean_token_accuracy": 0.9181102514266968,
- "num_tokens": 25214015.0,
- "step": 2820
- },
- {
- "epoch": 2.143617021276596,
- "grad_norm": 1.3205828666687012,
- "learning_rate": 1.0356247085736388e-06,
- "loss": 0.4085468053817749,
- "mean_token_accuracy": 0.8745299577713013,
- "num_tokens": 25230588.0,
- "step": 2821
- },
- {
- "epoch": 2.1443768996960486,
- "grad_norm": 1.6965736150741577,
- "learning_rate": 1.0339277327959863e-06,
- "loss": 0.27269643545150757,
- "mean_token_accuracy": 0.9001271724700928,
- "num_tokens": 25239298.0,
- "step": 2822
- },
- {
- "epoch": 2.1451367781155017,
- "grad_norm": 2.789114236831665,
- "learning_rate": 1.0322317859642852e-06,
- "loss": 0.2319176197052002,
- "mean_token_accuracy": 0.9237110614776611,
- "num_tokens": 25243286.0,
- "step": 2823
- },
- {
- "epoch": 2.1458966565349544,
- "grad_norm": 1.8817718029022217,
- "learning_rate": 1.0305368692688175e-06,
- "loss": 0.2917990982532501,
- "mean_token_accuracy": 0.9211062788963318,
- "num_tokens": 25250575.0,
- "step": 2824
- },
- {
- "epoch": 2.146656534954407,
- "grad_norm": 2.1824984550476074,
- "learning_rate": 1.0288429838991405e-06,
- "loss": 0.39010798931121826,
- "mean_token_accuracy": 0.8887852430343628,
- "num_tokens": 25257947.0,
- "step": 2825
- },
- {
- "epoch": 2.1474164133738602,
- "grad_norm": 1.302579641342163,
- "learning_rate": 1.0271501310440882e-06,
- "loss": 0.3511282503604889,
- "mean_token_accuracy": 0.8728797435760498,
- "num_tokens": 25272846.0,
- "step": 2826
- },
- {
- "epoch": 2.148176291793313,
- "grad_norm": 1.691807746887207,
- "learning_rate": 1.0254583118917699e-06,
- "loss": 0.34246695041656494,
- "mean_token_accuracy": 0.8743435144424438,
- "num_tokens": 25283004.0,
- "step": 2827
- },
- {
- "epoch": 2.148936170212766,
- "grad_norm": 1.2483569383621216,
- "learning_rate": 1.0237675276295709e-06,
- "loss": 0.3346659243106842,
- "mean_token_accuracy": 0.8823951482772827,
- "num_tokens": 25297786.0,
- "step": 2828
- },
- {
- "epoch": 2.1496960486322187,
- "grad_norm": 3.7242841720581055,
- "learning_rate": 1.022077779444145e-06,
- "loss": 0.25516486167907715,
- "mean_token_accuracy": 0.9189130663871765,
- "num_tokens": 25301524.0,
- "step": 2829
- },
- {
- "epoch": 2.150455927051672,
- "grad_norm": 2.5851144790649414,
- "learning_rate": 1.020389068521426e-06,
- "loss": 0.3543069362640381,
- "mean_token_accuracy": 0.8942399621009827,
- "num_tokens": 25307277.0,
- "step": 2830
- },
- {
- "epoch": 2.1512158054711246,
- "grad_norm": 1.3453631401062012,
- "learning_rate": 1.018701396046616e-06,
- "loss": 0.2900702953338623,
- "mean_token_accuracy": 0.8847548365592957,
- "num_tokens": 25321366.0,
- "step": 2831
- },
- {
- "epoch": 2.1519756838905777,
- "grad_norm": 1.6905686855316162,
- "learning_rate": 1.0170147632041858e-06,
- "loss": 0.24844832718372345,
- "mean_token_accuracy": 0.9167388677597046,
- "num_tokens": 25328916.0,
- "step": 2832
- },
- {
- "epoch": 2.1527355623100304,
- "grad_norm": 2.6469411849975586,
- "learning_rate": 1.0153291711778825e-06,
- "loss": 0.18566903471946716,
- "mean_token_accuracy": 0.9346771836280823,
- "num_tokens": 25332871.0,
- "step": 2833
- },
- {
- "epoch": 2.1534954407294835,
- "grad_norm": 1.3880906105041504,
- "learning_rate": 1.0136446211507175e-06,
- "loss": 0.37413570284843445,
- "mean_token_accuracy": 0.8685535788536072,
- "num_tokens": 25347447.0,
- "step": 2834
- },
- {
- "epoch": 2.154255319148936,
- "grad_norm": 1.1376656293869019,
- "learning_rate": 1.0119611143049731e-06,
- "loss": 0.2844143509864807,
- "mean_token_accuracy": 0.8910006284713745,
- "num_tokens": 25365930.0,
- "step": 2835
- },
- {
- "epoch": 2.155015197568389,
- "grad_norm": 2.259666919708252,
- "learning_rate": 1.0102786518221997e-06,
- "loss": 0.3148176074028015,
- "mean_token_accuracy": 0.8851165175437927,
- "num_tokens": 25373047.0,
- "step": 2836
- },
- {
- "epoch": 2.155775075987842,
- "grad_norm": 3.304095506668091,
- "learning_rate": 1.0085972348832138e-06,
- "loss": 0.2042517364025116,
- "mean_token_accuracy": 0.9247308969497681,
- "num_tokens": 25376348.0,
- "step": 2837
- },
- {
- "epoch": 2.1565349544072947,
- "grad_norm": 1.9856120347976685,
- "learning_rate": 1.0069168646680985e-06,
- "loss": 0.3547414541244507,
- "mean_token_accuracy": 0.8941285610198975,
- "num_tokens": 25384675.0,
- "step": 2838
- },
- {
- "epoch": 2.157294832826748,
- "grad_norm": 2.8482213020324707,
- "learning_rate": 1.0052375423562038e-06,
- "loss": 0.3530133366584778,
- "mean_token_accuracy": 0.8789700269699097,
- "num_tokens": 25389631.0,
- "step": 2839
- },
- {
- "epoch": 2.1580547112462005,
- "grad_norm": 1.4270408153533936,
- "learning_rate": 1.0035592691261395e-06,
- "loss": 0.34078776836395264,
- "mean_token_accuracy": 0.8648165464401245,
- "num_tokens": 25403746.0,
- "step": 2840
- },
- {
- "epoch": 2.1588145896656536,
- "grad_norm": 0.9342723488807678,
- "learning_rate": 1.0018820461557852e-06,
- "loss": 0.2615935504436493,
- "mean_token_accuracy": 0.9082236289978027,
- "num_tokens": 25424695.0,
- "step": 2841
- },
- {
- "epoch": 2.1595744680851063,
- "grad_norm": 2.695632219314575,
- "learning_rate": 1.0002058746222807e-06,
- "loss": 0.2202145904302597,
- "mean_token_accuracy": 0.9221563339233398,
- "num_tokens": 25428783.0,
- "step": 2842
- },
- {
- "epoch": 2.1603343465045595,
- "grad_norm": 1.5679794549942017,
- "learning_rate": 9.985307557020257e-07,
- "loss": 0.24275024235248566,
- "mean_token_accuracy": 0.9363338351249695,
- "num_tokens": 25439104.0,
- "step": 2843
- },
- {
- "epoch": 2.161094224924012,
- "grad_norm": 1.5985528230667114,
- "learning_rate": 9.968566905706833e-07,
- "loss": 0.2541901171207428,
- "mean_token_accuracy": 0.9040743112564087,
- "num_tokens": 25448829.0,
- "step": 2844
- },
- {
- "epoch": 2.161854103343465,
- "grad_norm": 2.6022164821624756,
- "learning_rate": 9.951836804031795e-07,
- "loss": 0.24492180347442627,
- "mean_token_accuracy": 0.9109418392181396,
- "num_tokens": 25453902.0,
- "step": 2845
- },
- {
- "epoch": 2.162613981762918,
- "grad_norm": 1.6719969511032104,
- "learning_rate": 9.935117263736943e-07,
- "loss": 0.43255117535591125,
- "mean_token_accuracy": 0.868374228477478,
- "num_tokens": 25465538.0,
- "step": 2846
- },
- {
- "epoch": 2.1633738601823707,
- "grad_norm": 1.8284894227981567,
- "learning_rate": 9.918408296556706e-07,
- "loss": 0.32285982370376587,
- "mean_token_accuracy": 0.9016412496566772,
- "num_tokens": 25473721.0,
- "step": 2847
- },
- {
- "epoch": 2.164133738601824,
- "grad_norm": 1.4488024711608887,
- "learning_rate": 9.90170991421808e-07,
- "loss": 0.35639309883117676,
- "mean_token_accuracy": 0.8861881494522095,
- "num_tokens": 25487535.0,
- "step": 2848
- },
- {
- "epoch": 2.1648936170212765,
- "grad_norm": 2.089930534362793,
- "learning_rate": 9.88502212844063e-07,
- "loss": 0.2588546574115753,
- "mean_token_accuracy": 0.9029642939567566,
- "num_tokens": 25494567.0,
- "step": 2849
- },
- {
- "epoch": 2.1656534954407296,
- "grad_norm": 1.1274315118789673,
- "learning_rate": 9.86834495093649e-07,
- "loss": 0.37268880009651184,
- "mean_token_accuracy": 0.859347939491272,
- "num_tokens": 25518278.0,
- "step": 2850
- },
- {
- "epoch": 2.1664133738601823,
- "grad_norm": 2.3886640071868896,
- "learning_rate": 9.851678393410343e-07,
- "loss": 0.34938913583755493,
- "mean_token_accuracy": 0.8724287748336792,
- "num_tokens": 25524001.0,
- "step": 2851
- },
- {
- "epoch": 2.1671732522796354,
- "grad_norm": 2.521230459213257,
- "learning_rate": 9.83502246755942e-07,
- "loss": 0.34781408309936523,
- "mean_token_accuracy": 0.8970093131065369,
- "num_tokens": 25529982.0,
- "step": 2852
- },
- {
- "epoch": 2.167933130699088,
- "grad_norm": 2.467618942260742,
- "learning_rate": 9.818377185073493e-07,
- "loss": 0.29725387692451477,
- "mean_token_accuracy": 0.8991899490356445,
- "num_tokens": 25535356.0,
- "step": 2853
- },
- {
- "epoch": 2.1686930091185412,
- "grad_norm": 2.335873603820801,
- "learning_rate": 9.801742557634872e-07,
- "loss": 0.39603036642074585,
- "mean_token_accuracy": 0.8755916357040405,
- "num_tokens": 25542526.0,
- "step": 2854
- },
- {
- "epoch": 2.169452887537994,
- "grad_norm": 1.8388596773147583,
- "learning_rate": 9.78511859691835e-07,
- "loss": 0.3414672017097473,
- "mean_token_accuracy": 0.8951467275619507,
- "num_tokens": 25551904.0,
- "step": 2855
- },
- {
- "epoch": 2.1702127659574466,
- "grad_norm": 1.86272394657135,
- "learning_rate": 9.768505314591295e-07,
- "loss": 0.45748448371887207,
- "mean_token_accuracy": 0.8614354133605957,
- "num_tokens": 25562197.0,
- "step": 2856
- },
- {
- "epoch": 2.1709726443768997,
- "grad_norm": 1.9142264127731323,
- "learning_rate": 9.751902722313527e-07,
- "loss": 0.20877259969711304,
- "mean_token_accuracy": 0.9316688179969788,
- "num_tokens": 25569403.0,
- "step": 2857
- },
- {
- "epoch": 2.1717325227963524,
- "grad_norm": 2.1138272285461426,
- "learning_rate": 9.73531083173739e-07,
- "loss": 0.37058722972869873,
- "mean_token_accuracy": 0.8654135465621948,
- "num_tokens": 25577200.0,
- "step": 2858
- },
- {
- "epoch": 2.1724924012158056,
- "grad_norm": 1.973467469215393,
- "learning_rate": 9.718729654507713e-07,
- "loss": 0.4106993079185486,
- "mean_token_accuracy": 0.8958662152290344,
- "num_tokens": 25585694.0,
- "step": 2859
- },
- {
- "epoch": 2.1732522796352582,
- "grad_norm": 1.957513451576233,
- "learning_rate": 9.702159202261802e-07,
- "loss": 0.2067333608865738,
- "mean_token_accuracy": 0.9413473606109619,
- "num_tokens": 25591604.0,
- "step": 2860
- },
- {
- "epoch": 2.1740121580547114,
- "grad_norm": 2.7639806270599365,
- "learning_rate": 9.685599486629444e-07,
- "loss": 0.3446827232837677,
- "mean_token_accuracy": 0.8837845325469971,
- "num_tokens": 25596528.0,
- "step": 2861
- },
- {
- "epoch": 2.174772036474164,
- "grad_norm": 2.483734607696533,
- "learning_rate": 9.669050519232875e-07,
- "loss": 0.21230249106884003,
- "mean_token_accuracy": 0.9334918856620789,
- "num_tokens": 25601182.0,
- "step": 2862
- },
- {
- "epoch": 2.175531914893617,
- "grad_norm": 1.7194870710372925,
- "learning_rate": 9.65251231168681e-07,
- "loss": 0.2657586932182312,
- "mean_token_accuracy": 0.9035707712173462,
- "num_tokens": 25610561.0,
- "step": 2863
- },
- {
- "epoch": 2.17629179331307,
- "grad_norm": 2.6709611415863037,
- "learning_rate": 9.63598487559839e-07,
- "loss": 0.3673030138015747,
- "mean_token_accuracy": 0.8976202011108398,
- "num_tokens": 25615822.0,
- "step": 2864
- },
- {
- "epoch": 2.1770516717325226,
- "grad_norm": 1.6646889448165894,
- "learning_rate": 9.619468222567216e-07,
- "loss": 0.2796666622161865,
- "mean_token_accuracy": 0.8698215484619141,
- "num_tokens": 25626148.0,
- "step": 2865
- },
- {
- "epoch": 2.1778115501519757,
- "grad_norm": 1.8341799974441528,
- "learning_rate": 9.602962364185286e-07,
- "loss": 0.44835132360458374,
- "mean_token_accuracy": 0.84391850233078,
- "num_tokens": 25636305.0,
- "step": 2866
- },
- {
- "epoch": 2.1785714285714284,
- "grad_norm": 2.3579823970794678,
- "learning_rate": 9.586467312037076e-07,
- "loss": 0.2875673472881317,
- "mean_token_accuracy": 0.889403223991394,
- "num_tokens": 25642593.0,
- "step": 2867
- },
- {
- "epoch": 2.1793313069908815,
- "grad_norm": 1.1284339427947998,
- "learning_rate": 9.569983077699447e-07,
- "loss": 0.3402171730995178,
- "mean_token_accuracy": 0.8795222043991089,
- "num_tokens": 25663734.0,
- "step": 2868
- },
- {
- "epoch": 2.180091185410334,
- "grad_norm": 1.4705578088760376,
- "learning_rate": 9.553509672741646e-07,
- "loss": 0.4216107726097107,
- "mean_token_accuracy": 0.845354437828064,
- "num_tokens": 25678197.0,
- "step": 2869
- },
- {
- "epoch": 2.1808510638297873,
- "grad_norm": 2.6181085109710693,
- "learning_rate": 9.53704710872535e-07,
- "loss": 0.2777765393257141,
- "mean_token_accuracy": 0.8884872198104858,
- "num_tokens": 25683808.0,
- "step": 2870
- },
- {
- "epoch": 2.18161094224924,
- "grad_norm": 2.7285003662109375,
- "learning_rate": 9.520595397204643e-07,
- "loss": 0.33339786529541016,
- "mean_token_accuracy": 0.8892828226089478,
- "num_tokens": 25690125.0,
- "step": 2871
- },
- {
- "epoch": 2.182370820668693,
- "grad_norm": 2.200571298599243,
- "learning_rate": 9.504154549725944e-07,
- "loss": 0.46546393632888794,
- "mean_token_accuracy": 0.8389996290206909,
- "num_tokens": 25697279.0,
- "step": 2872
- },
- {
- "epoch": 2.183130699088146,
- "grad_norm": 3.491392135620117,
- "learning_rate": 9.487724577828081e-07,
- "loss": 0.17026299238204956,
- "mean_token_accuracy": 0.9410334825515747,
- "num_tokens": 25700263.0,
- "step": 2873
- },
- {
- "epoch": 2.183890577507599,
- "grad_norm": 2.7800233364105225,
- "learning_rate": 9.471305493042243e-07,
- "loss": 0.2309894859790802,
- "mean_token_accuracy": 0.9233936071395874,
- "num_tokens": 25704486.0,
- "step": 2874
- },
- {
- "epoch": 2.1846504559270516,
- "grad_norm": 2.6505582332611084,
- "learning_rate": 9.454897306891972e-07,
- "loss": 0.4378674328327179,
- "mean_token_accuracy": 0.8846660852432251,
- "num_tokens": 25710115.0,
- "step": 2875
- },
- {
- "epoch": 2.1854103343465043,
- "grad_norm": 1.5393849611282349,
- "learning_rate": 9.438500030893166e-07,
- "loss": 0.42081019282341003,
- "mean_token_accuracy": 0.8672939538955688,
- "num_tokens": 25724598.0,
- "step": 2876
- },
- {
- "epoch": 2.1861702127659575,
- "grad_norm": 1.911198377609253,
- "learning_rate": 9.422113676554073e-07,
- "loss": 0.19115394353866577,
- "mean_token_accuracy": 0.9201297163963318,
- "num_tokens": 25731040.0,
- "step": 2877
- },
- {
- "epoch": 2.18693009118541,
- "grad_norm": 1.371443748474121,
- "learning_rate": 9.405738255375243e-07,
- "loss": 0.3639947772026062,
- "mean_token_accuracy": 0.8653393983840942,
- "num_tokens": 25745335.0,
- "step": 2878
- },
- {
- "epoch": 2.1876899696048633,
- "grad_norm": 3.216238498687744,
- "learning_rate": 9.389373778849612e-07,
- "loss": 0.2623414397239685,
- "mean_token_accuracy": 0.9046015739440918,
- "num_tokens": 25749223.0,
- "step": 2879
- },
- {
- "epoch": 2.188449848024316,
- "grad_norm": 2.7558846473693848,
- "learning_rate": 9.37302025846237e-07,
- "loss": 0.31921297311782837,
- "mean_token_accuracy": 0.8903186321258545,
- "num_tokens": 25754341.0,
- "step": 2880
- },
- {
- "epoch": 2.189209726443769,
- "grad_norm": 2.06365704536438,
- "learning_rate": 9.356677705691058e-07,
- "loss": 0.357482373714447,
- "mean_token_accuracy": 0.8661626577377319,
- "num_tokens": 25761199.0,
- "step": 2881
- },
- {
- "epoch": 2.189969604863222,
- "grad_norm": 3.240328550338745,
- "learning_rate": 9.340346132005507e-07,
- "loss": 0.3157888650894165,
- "mean_token_accuracy": 0.8948285579681396,
- "num_tokens": 25765099.0,
- "step": 2882
- },
- {
- "epoch": 2.190729483282675,
- "grad_norm": 1.4671967029571533,
- "learning_rate": 9.324025548867849e-07,
- "loss": 0.32077109813690186,
- "mean_token_accuracy": 0.8813248872756958,
- "num_tokens": 25777636.0,
- "step": 2883
- },
- {
- "epoch": 2.1914893617021276,
- "grad_norm": 2.6475353240966797,
- "learning_rate": 9.307715967732492e-07,
- "loss": 0.35567623376846313,
- "mean_token_accuracy": 0.8738130331039429,
- "num_tokens": 25783737.0,
- "step": 2884
- },
- {
- "epoch": 2.1922492401215807,
- "grad_norm": 1.791491150856018,
- "learning_rate": 9.29141740004613e-07,
- "loss": 0.2556282877922058,
- "mean_token_accuracy": 0.9223519563674927,
- "num_tokens": 25792069.0,
- "step": 2885
- },
- {
- "epoch": 2.1930091185410334,
- "grad_norm": 2.3944389820098877,
- "learning_rate": 9.275129857247722e-07,
- "loss": 0.3145869970321655,
- "mean_token_accuracy": 0.8938079476356506,
- "num_tokens": 25798400.0,
- "step": 2886
- },
- {
- "epoch": 2.193768996960486,
- "grad_norm": 2.0802059173583984,
- "learning_rate": 9.258853350768499e-07,
- "loss": 0.37343069911003113,
- "mean_token_accuracy": 0.8705670833587646,
- "num_tokens": 25806567.0,
- "step": 2887
- },
- {
- "epoch": 2.1945288753799392,
- "grad_norm": 2.10831880569458,
- "learning_rate": 9.242587892031945e-07,
- "loss": 0.1989251971244812,
- "mean_token_accuracy": 0.931064248085022,
- "num_tokens": 25812715.0,
- "step": 2888
- },
- {
- "epoch": 2.195288753799392,
- "grad_norm": 2.1305530071258545,
- "learning_rate": 9.226333492453759e-07,
- "loss": 0.29377204179763794,
- "mean_token_accuracy": 0.8942701816558838,
- "num_tokens": 25819988.0,
- "step": 2889
- },
- {
- "epoch": 2.196048632218845,
- "grad_norm": 2.179025411605835,
- "learning_rate": 9.210090163441928e-07,
- "loss": 0.37565115094184875,
- "mean_token_accuracy": 0.8700202703475952,
- "num_tokens": 25827777.0,
- "step": 2890
- },
- {
- "epoch": 2.1968085106382977,
- "grad_norm": 3.177180290222168,
- "learning_rate": 9.19385791639665e-07,
- "loss": 0.16646479070186615,
- "mean_token_accuracy": 0.9426749348640442,
- "num_tokens": 25831724.0,
- "step": 2891
- },
- {
- "epoch": 2.197568389057751,
- "grad_norm": 1.103196620941162,
- "learning_rate": 9.177636762710321e-07,
- "loss": 0.29140013456344604,
- "mean_token_accuracy": 0.8789779543876648,
- "num_tokens": 25854707.0,
- "step": 2892
- },
- {
- "epoch": 2.1983282674772036,
- "grad_norm": 1.597692847251892,
- "learning_rate": 9.161426713767574e-07,
- "loss": 0.37799614667892456,
- "mean_token_accuracy": 0.8623079061508179,
- "num_tokens": 25868429.0,
- "step": 2893
- },
- {
- "epoch": 2.1990881458966567,
- "grad_norm": 2.227132558822632,
- "learning_rate": 9.145227780945265e-07,
- "loss": 0.2683261036872864,
- "mean_token_accuracy": 0.9092563390731812,
- "num_tokens": 25875367.0,
- "step": 2894
- },
- {
- "epoch": 2.1998480243161094,
- "grad_norm": 3.1229634284973145,
- "learning_rate": 9.129039975612408e-07,
- "loss": 0.21859994530677795,
- "mean_token_accuracy": 0.9187530875205994,
- "num_tokens": 25879456.0,
- "step": 2895
- },
- {
- "epoch": 2.2006079027355625,
- "grad_norm": 2.3224828243255615,
- "learning_rate": 9.112863309130235e-07,
- "loss": 0.3557605743408203,
- "mean_token_accuracy": 0.8735873103141785,
- "num_tokens": 25886477.0,
- "step": 2896
- },
- {
- "epoch": 2.201367781155015,
- "grad_norm": 1.7784863710403442,
- "learning_rate": 9.096697792852155e-07,
- "loss": 0.334577351808548,
- "mean_token_accuracy": 0.8948780298233032,
- "num_tokens": 25894977.0,
- "step": 2897
- },
- {
- "epoch": 2.202127659574468,
- "grad_norm": 2.34066104888916,
- "learning_rate": 9.080543438123746e-07,
- "loss": 0.16479721665382385,
- "mean_token_accuracy": 0.9405456781387329,
- "num_tokens": 25900015.0,
- "step": 2898
- },
- {
- "epoch": 2.202887537993921,
- "grad_norm": 1.944082498550415,
- "learning_rate": 9.064400256282757e-07,
- "loss": 0.40259572863578796,
- "mean_token_accuracy": 0.8632713556289673,
- "num_tokens": 25908749.0,
- "step": 2899
- },
- {
- "epoch": 2.2036474164133737,
- "grad_norm": 1.2758828401565552,
- "learning_rate": 9.048268258659098e-07,
- "loss": 0.3939874470233917,
- "mean_token_accuracy": 0.8652969598770142,
- "num_tokens": 25924972.0,
- "step": 2900
- },
- {
- "epoch": 2.204407294832827,
- "grad_norm": 1.4483891725540161,
- "learning_rate": 9.032147456574822e-07,
- "loss": 0.4132935404777527,
- "mean_token_accuracy": 0.868486762046814,
- "num_tokens": 25939785.0,
- "step": 2901
- },
- {
- "epoch": 2.2051671732522795,
- "grad_norm": 1.4866713285446167,
- "learning_rate": 9.01603786134413e-07,
- "loss": 0.3644951581954956,
- "mean_token_accuracy": 0.8750203847885132,
- "num_tokens": 25952648.0,
- "step": 2902
- },
- {
- "epoch": 2.2059270516717326,
- "grad_norm": 1.6555454730987549,
- "learning_rate": 8.999939484273362e-07,
- "loss": 0.48656779527664185,
- "mean_token_accuracy": 0.8372372984886169,
- "num_tokens": 25965062.0,
- "step": 2903
- },
- {
- "epoch": 2.2066869300911853,
- "grad_norm": 2.3154168128967285,
- "learning_rate": 8.983852336660959e-07,
- "loss": 0.3768891990184784,
- "mean_token_accuracy": 0.8614999055862427,
- "num_tokens": 25972152.0,
- "step": 2904
- },
- {
- "epoch": 2.2074468085106385,
- "grad_norm": 2.3618056774139404,
- "learning_rate": 8.967776429797529e-07,
- "loss": 0.24905793368816376,
- "mean_token_accuracy": 0.9170958995819092,
- "num_tokens": 25977808.0,
- "step": 2905
- },
- {
- "epoch": 2.208206686930091,
- "grad_norm": 1.929051399230957,
- "learning_rate": 8.951711774965741e-07,
- "loss": 0.38099539279937744,
- "mean_token_accuracy": 0.8812143802642822,
- "num_tokens": 25987871.0,
- "step": 2906
- },
- {
- "epoch": 2.2089665653495443,
- "grad_norm": 1.6529620885849,
- "learning_rate": 8.93565838344039e-07,
- "loss": 0.31784749031066895,
- "mean_token_accuracy": 0.8929437398910522,
- "num_tokens": 25997777.0,
- "step": 2907
- },
- {
- "epoch": 2.209726443768997,
- "grad_norm": 2.1413469314575195,
- "learning_rate": 8.919616266488373e-07,
- "loss": 0.4043882191181183,
- "mean_token_accuracy": 0.8937146663665771,
- "num_tokens": 26005213.0,
- "step": 2908
- },
- {
- "epoch": 2.2104863221884496,
- "grad_norm": 1.3838988542556763,
- "learning_rate": 8.903585435368658e-07,
- "loss": 0.2858969569206238,
- "mean_token_accuracy": 0.9084860682487488,
- "num_tokens": 26018371.0,
- "step": 2909
- },
- {
- "epoch": 2.211246200607903,
- "grad_norm": 1.2853319644927979,
- "learning_rate": 8.887565901332304e-07,
- "loss": 0.3178713619709015,
- "mean_token_accuracy": 0.872230589389801,
- "num_tokens": 26034136.0,
- "step": 2910
- },
- {
- "epoch": 2.2120060790273555,
- "grad_norm": 2.9032399654388428,
- "learning_rate": 8.871557675622442e-07,
- "loss": 0.20348960161209106,
- "mean_token_accuracy": 0.9275314807891846,
- "num_tokens": 26038299.0,
- "step": 2911
- },
- {
- "epoch": 2.2127659574468086,
- "grad_norm": 2.4349892139434814,
- "learning_rate": 8.855560769474237e-07,
- "loss": 0.24282032251358032,
- "mean_token_accuracy": 0.9103988409042358,
- "num_tokens": 26043427.0,
- "step": 2912
- },
- {
- "epoch": 2.2135258358662613,
- "grad_norm": 2.324664831161499,
- "learning_rate": 8.839575194114958e-07,
- "loss": 0.3808317184448242,
- "mean_token_accuracy": 0.8598989844322205,
- "num_tokens": 26049667.0,
- "step": 2913
- },
- {
- "epoch": 2.2142857142857144,
- "grad_norm": 2.594947576522827,
- "learning_rate": 8.823600960763901e-07,
- "loss": 0.39623332023620605,
- "mean_token_accuracy": 0.8738477230072021,
- "num_tokens": 26055428.0,
- "step": 2914
- },
- {
- "epoch": 2.215045592705167,
- "grad_norm": 1.674308180809021,
- "learning_rate": 8.807638080632375e-07,
- "loss": 0.2641369104385376,
- "mean_token_accuracy": 0.9119734764099121,
- "num_tokens": 26064355.0,
- "step": 2915
- },
- {
- "epoch": 2.2158054711246202,
- "grad_norm": 2.9884912967681885,
- "learning_rate": 8.791686564923746e-07,
- "loss": 0.19229236245155334,
- "mean_token_accuracy": 0.9388723969459534,
- "num_tokens": 26067563.0,
- "step": 2916
- },
- {
- "epoch": 2.216565349544073,
- "grad_norm": 1.8513846397399902,
- "learning_rate": 8.775746424833428e-07,
- "loss": 0.3076218366622925,
- "mean_token_accuracy": 0.9165210723876953,
- "num_tokens": 26075609.0,
- "step": 2917
- },
- {
- "epoch": 2.217325227963526,
- "grad_norm": 1.229604721069336,
- "learning_rate": 8.759817671548801e-07,
- "loss": 0.2727023959159851,
- "mean_token_accuracy": 0.8931418061256409,
- "num_tokens": 26091183.0,
- "step": 2918
- },
- {
- "epoch": 2.2180851063829787,
- "grad_norm": 2.384413957595825,
- "learning_rate": 8.743900316249273e-07,
- "loss": 0.27312609553337097,
- "mean_token_accuracy": 0.8972288370132446,
- "num_tokens": 26096677.0,
- "step": 2919
- },
- {
- "epoch": 2.2188449848024314,
- "grad_norm": 2.186370611190796,
- "learning_rate": 8.727994370106288e-07,
- "loss": 0.36045557260513306,
- "mean_token_accuracy": 0.8788503408432007,
- "num_tokens": 26104464.0,
- "step": 2920
- },
- {
- "epoch": 2.2196048632218845,
- "grad_norm": 2.769796848297119,
- "learning_rate": 8.71209984428322e-07,
- "loss": 0.3427591919898987,
- "mean_token_accuracy": 0.892108678817749,
- "num_tokens": 26109571.0,
- "step": 2921
- },
- {
- "epoch": 2.2203647416413372,
- "grad_norm": 2.9888014793395996,
- "learning_rate": 8.696216749935471e-07,
- "loss": 0.20137615501880646,
- "mean_token_accuracy": 0.9366025924682617,
- "num_tokens": 26113165.0,
- "step": 2922
- },
- {
- "epoch": 2.2211246200607904,
- "grad_norm": 1.484858751296997,
- "learning_rate": 8.680345098210408e-07,
- "loss": 0.2884698510169983,
- "mean_token_accuracy": 0.8992507457733154,
- "num_tokens": 26124385.0,
- "step": 2923
- },
- {
- "epoch": 2.221884498480243,
- "grad_norm": 1.690119981765747,
- "learning_rate": 8.664484900247363e-07,
- "loss": 0.34275567531585693,
- "mean_token_accuracy": 0.8682634234428406,
- "num_tokens": 26134944.0,
- "step": 2924
- },
- {
- "epoch": 2.222644376899696,
- "grad_norm": 1.6171982288360596,
- "learning_rate": 8.64863616717764e-07,
- "loss": 0.256338506937027,
- "mean_token_accuracy": 0.9281957745552063,
- "num_tokens": 26143586.0,
- "step": 2925
- },
- {
- "epoch": 2.223404255319149,
- "grad_norm": 2.4853835105895996,
- "learning_rate": 8.632798910124493e-07,
- "loss": 0.26290056109428406,
- "mean_token_accuracy": 0.9119559526443481,
- "num_tokens": 26148931.0,
- "step": 2926
- },
- {
- "epoch": 2.224164133738602,
- "grad_norm": 2.0014333724975586,
- "learning_rate": 8.616973140203097e-07,
- "loss": 0.33400261402130127,
- "mean_token_accuracy": 0.8796782493591309,
- "num_tokens": 26156246.0,
- "step": 2927
- },
- {
- "epoch": 2.2249240121580547,
- "grad_norm": 1.4637027978897095,
- "learning_rate": 8.601158868520617e-07,
- "loss": 0.24374958872795105,
- "mean_token_accuracy": 0.9116952419281006,
- "num_tokens": 26166431.0,
- "step": 2928
- },
- {
- "epoch": 2.225683890577508,
- "grad_norm": 2.2056987285614014,
- "learning_rate": 8.585356106176093e-07,
- "loss": 0.3419337570667267,
- "mean_token_accuracy": 0.8703858852386475,
- "num_tokens": 26173974.0,
- "step": 2929
- },
- {
- "epoch": 2.2264437689969605,
- "grad_norm": 1.3687927722930908,
- "learning_rate": 8.569564864260524e-07,
- "loss": 0.43176111578941345,
- "mean_token_accuracy": 0.8616900444030762,
- "num_tokens": 26191632.0,
- "step": 2930
- },
- {
- "epoch": 2.227203647416413,
- "grad_norm": 1.4975634813308716,
- "learning_rate": 8.553785153856809e-07,
- "loss": 0.38525745272636414,
- "mean_token_accuracy": 0.8611687421798706,
- "num_tokens": 26203300.0,
- "step": 2931
- },
- {
- "epoch": 2.2279635258358663,
- "grad_norm": 1.970109462738037,
- "learning_rate": 8.538016986039751e-07,
- "loss": 0.31731468439102173,
- "mean_token_accuracy": 0.884365975856781,
- "num_tokens": 26210037.0,
- "step": 2932
- },
- {
- "epoch": 2.228723404255319,
- "grad_norm": 2.681717872619629,
- "learning_rate": 8.522260371876068e-07,
- "loss": 0.2770140767097473,
- "mean_token_accuracy": 0.9020107984542847,
- "num_tokens": 26215460.0,
- "step": 2933
- },
- {
- "epoch": 2.229483282674772,
- "grad_norm": 2.2324795722961426,
- "learning_rate": 8.506515322424349e-07,
- "loss": 0.30599141120910645,
- "mean_token_accuracy": 0.8939633965492249,
- "num_tokens": 26221260.0,
- "step": 2934
- },
- {
- "epoch": 2.230243161094225,
- "grad_norm": 2.08915376663208,
- "learning_rate": 8.49078184873508e-07,
- "loss": 0.3609209954738617,
- "mean_token_accuracy": 0.8776482343673706,
- "num_tokens": 26228397.0,
- "step": 2935
- },
- {
- "epoch": 2.231003039513678,
- "grad_norm": 1.641366958618164,
- "learning_rate": 8.475059961850617e-07,
- "loss": 0.2969125509262085,
- "mean_token_accuracy": 0.8949217796325684,
- "num_tokens": 26238533.0,
- "step": 2936
- },
- {
- "epoch": 2.2317629179331306,
- "grad_norm": 1.082148551940918,
- "learning_rate": 8.459349672805198e-07,
- "loss": 0.23957109451293945,
- "mean_token_accuracy": 0.9255712032318115,
- "num_tokens": 26254154.0,
- "step": 2937
- },
- {
- "epoch": 2.2325227963525838,
- "grad_norm": 2.495208740234375,
- "learning_rate": 8.443650992624877e-07,
- "loss": 0.2879767417907715,
- "mean_token_accuracy": 0.8911515474319458,
- "num_tokens": 26260812.0,
- "step": 2938
- },
- {
- "epoch": 2.2332826747720365,
- "grad_norm": 3.566549062728882,
- "learning_rate": 8.427963932327621e-07,
- "loss": 0.31420570611953735,
- "mean_token_accuracy": 0.8888009190559387,
- "num_tokens": 26264592.0,
- "step": 2939
- },
- {
- "epoch": 2.2340425531914896,
- "grad_norm": 2.217177391052246,
- "learning_rate": 8.412288502923211e-07,
- "loss": 0.30547618865966797,
- "mean_token_accuracy": 0.9065294861793518,
- "num_tokens": 26270729.0,
- "step": 2940
- },
- {
- "epoch": 2.2348024316109423,
- "grad_norm": 1.404260277748108,
- "learning_rate": 8.396624715413251e-07,
- "loss": 0.32485032081604004,
- "mean_token_accuracy": 0.8799532651901245,
- "num_tokens": 26284280.0,
- "step": 2941
- },
- {
- "epoch": 2.235562310030395,
- "grad_norm": 1.5519827604293823,
- "learning_rate": 8.380972580791191e-07,
- "loss": 0.3330575227737427,
- "mean_token_accuracy": 0.8865892887115479,
- "num_tokens": 26293635.0,
- "step": 2942
- },
- {
- "epoch": 2.236322188449848,
- "grad_norm": 2.604766845703125,
- "learning_rate": 8.365332110042323e-07,
- "loss": 0.18986842036247253,
- "mean_token_accuracy": 0.9276989102363586,
- "num_tokens": 26298553.0,
- "step": 2943
- },
- {
- "epoch": 2.237082066869301,
- "grad_norm": 2.1750004291534424,
- "learning_rate": 8.349703314143712e-07,
- "loss": 0.3661153018474579,
- "mean_token_accuracy": 0.8879489302635193,
- "num_tokens": 26305697.0,
- "step": 2944
- },
- {
- "epoch": 2.237841945288754,
- "grad_norm": 2.247069835662842,
- "learning_rate": 8.334086204064254e-07,
- "loss": 0.3127560615539551,
- "mean_token_accuracy": 0.8846344351768494,
- "num_tokens": 26312347.0,
- "step": 2945
- },
- {
- "epoch": 2.2386018237082066,
- "grad_norm": 1.905275821685791,
- "learning_rate": 8.318480790764638e-07,
- "loss": 0.44245776534080505,
- "mean_token_accuracy": 0.87440425157547,
- "num_tokens": 26322787.0,
- "step": 2946
- },
- {
- "epoch": 2.2393617021276597,
- "grad_norm": 1.8596254587173462,
- "learning_rate": 8.302887085197342e-07,
- "loss": 0.30068373680114746,
- "mean_token_accuracy": 0.8847110271453857,
- "num_tokens": 26330437.0,
- "step": 2947
- },
- {
- "epoch": 2.2401215805471124,
- "grad_norm": 2.0028860569000244,
- "learning_rate": 8.28730509830663e-07,
- "loss": 0.4276006817817688,
- "mean_token_accuracy": 0.8406014442443848,
- "num_tokens": 26340100.0,
- "step": 2948
- },
- {
- "epoch": 2.2408814589665655,
- "grad_norm": 2.494434356689453,
- "learning_rate": 8.271734841028553e-07,
- "loss": 0.3874223232269287,
- "mean_token_accuracy": 0.8782174587249756,
- "num_tokens": 26345750.0,
- "step": 2949
- },
- {
- "epoch": 2.2416413373860182,
- "grad_norm": 1.955613613128662,
- "learning_rate": 8.256176324290885e-07,
- "loss": 0.28770074248313904,
- "mean_token_accuracy": 0.9004360437393188,
- "num_tokens": 26353342.0,
- "step": 2950
- },
- {
- "epoch": 2.2424012158054714,
- "grad_norm": 1.7579785585403442,
- "learning_rate": 8.240629559013222e-07,
- "loss": 0.2277943640947342,
- "mean_token_accuracy": 0.9145861864089966,
- "num_tokens": 26361348.0,
- "step": 2951
- },
- {
- "epoch": 2.243161094224924,
- "grad_norm": 1.5848479270935059,
- "learning_rate": 8.22509455610688e-07,
- "loss": 0.32944542169570923,
- "mean_token_accuracy": 0.8662827014923096,
- "num_tokens": 26372006.0,
- "step": 2952
- },
- {
- "epoch": 2.2439209726443767,
- "grad_norm": 2.6263222694396973,
- "learning_rate": 8.209571326474897e-07,
- "loss": 0.34646326303482056,
- "mean_token_accuracy": 0.8817736506462097,
- "num_tokens": 26377664.0,
- "step": 2953
- },
- {
- "epoch": 2.24468085106383,
- "grad_norm": 2.407590627670288,
- "learning_rate": 8.194059881012107e-07,
- "loss": 0.41302192211151123,
- "mean_token_accuracy": 0.8898757696151733,
- "num_tokens": 26384225.0,
- "step": 2954
- },
- {
- "epoch": 2.2454407294832825,
- "grad_norm": 2.5156402587890625,
- "learning_rate": 8.178560230605012e-07,
- "loss": 0.3468608558177948,
- "mean_token_accuracy": 0.8879599571228027,
- "num_tokens": 26389374.0,
- "step": 2955
- },
- {
- "epoch": 2.2462006079027357,
- "grad_norm": 1.5076090097427368,
- "learning_rate": 8.163072386131876e-07,
- "loss": 0.3750625550746918,
- "mean_token_accuracy": 0.8712738752365112,
- "num_tokens": 26402674.0,
- "step": 2956
- },
- {
- "epoch": 2.2469604863221884,
- "grad_norm": 1.5181068181991577,
- "learning_rate": 8.147596358462662e-07,
- "loss": 0.19113478064537048,
- "mean_token_accuracy": 0.9323463439941406,
- "num_tokens": 26411626.0,
- "step": 2957
- },
- {
- "epoch": 2.2477203647416415,
- "grad_norm": 1.0806915760040283,
- "learning_rate": 8.132132158459044e-07,
- "loss": 0.3411233425140381,
- "mean_token_accuracy": 0.8736830949783325,
- "num_tokens": 26435891.0,
- "step": 2958
- },
- {
- "epoch": 2.248480243161094,
- "grad_norm": 1.5527247190475464,
- "learning_rate": 8.116679796974389e-07,
- "loss": 0.425741970539093,
- "mean_token_accuracy": 0.8448845148086548,
- "num_tokens": 26448134.0,
- "step": 2959
- },
- {
- "epoch": 2.2492401215805473,
- "grad_norm": 1.2390631437301636,
- "learning_rate": 8.10123928485377e-07,
- "loss": 0.38084933161735535,
- "mean_token_accuracy": 0.8656617999076843,
- "num_tokens": 26467213.0,
- "step": 2960
- },
- {
- "epoch": 2.25,
- "grad_norm": 3.0672852993011475,
- "learning_rate": 8.08581063293391e-07,
- "loss": 0.29300111532211304,
- "mean_token_accuracy": 0.8933638334274292,
- "num_tokens": 26471599.0,
- "step": 2961
- },
- {
- "epoch": 2.250759878419453,
- "grad_norm": 1.2359145879745483,
- "learning_rate": 8.070393852043251e-07,
- "loss": 0.41337621212005615,
- "mean_token_accuracy": 0.854198694229126,
- "num_tokens": 26488461.0,
- "step": 2962
- },
- {
- "epoch": 2.251519756838906,
- "grad_norm": 1.8551225662231445,
- "learning_rate": 8.054988953001889e-07,
- "loss": 0.3036419153213501,
- "mean_token_accuracy": 0.8883144855499268,
- "num_tokens": 26496398.0,
- "step": 2963
- },
- {
- "epoch": 2.2522796352583585,
- "grad_norm": 1.3691812753677368,
- "learning_rate": 8.039595946621551e-07,
- "loss": 0.3286219835281372,
- "mean_token_accuracy": 0.892130434513092,
- "num_tokens": 26510493.0,
- "step": 2964
- },
- {
- "epoch": 2.2530395136778116,
- "grad_norm": 1.7371556758880615,
- "learning_rate": 8.024214843705647e-07,
- "loss": 0.4105026125907898,
- "mean_token_accuracy": 0.8889180421829224,
- "num_tokens": 26519148.0,
- "step": 2965
- },
- {
- "epoch": 2.2537993920972643,
- "grad_norm": 2.211665630340576,
- "learning_rate": 8.00884565504925e-07,
- "loss": 0.3912196159362793,
- "mean_token_accuracy": 0.8632891774177551,
- "num_tokens": 26526314.0,
- "step": 2966
- },
- {
- "epoch": 2.2545592705167175,
- "grad_norm": 2.476206064224243,
- "learning_rate": 7.993488391439025e-07,
- "loss": 0.20462508499622345,
- "mean_token_accuracy": 0.9276266098022461,
- "num_tokens": 26531781.0,
- "step": 2967
- },
- {
- "epoch": 2.25531914893617,
- "grad_norm": 1.4944102764129639,
- "learning_rate": 7.978143063653296e-07,
- "loss": 0.2694895267486572,
- "mean_token_accuracy": 0.9033881425857544,
- "num_tokens": 26543780.0,
- "step": 2968
- },
- {
- "epoch": 2.2560790273556233,
- "grad_norm": 1.7570104598999023,
- "learning_rate": 7.962809682462008e-07,
- "loss": 0.3060353100299835,
- "mean_token_accuracy": 0.8908290863037109,
- "num_tokens": 26551978.0,
- "step": 2969
- },
- {
- "epoch": 2.256838905775076,
- "grad_norm": 2.215514898300171,
- "learning_rate": 7.947488258626718e-07,
- "loss": 0.2930528521537781,
- "mean_token_accuracy": 0.8989757299423218,
- "num_tokens": 26558267.0,
- "step": 2970
- },
- {
- "epoch": 2.2575987841945286,
- "grad_norm": 2.3069000244140625,
- "learning_rate": 7.93217880290059e-07,
- "loss": 0.18501774966716766,
- "mean_token_accuracy": 0.931271493434906,
- "num_tokens": 26563286.0,
- "step": 2971
- },
- {
- "epoch": 2.2583586626139818,
- "grad_norm": 1.6555116176605225,
- "learning_rate": 7.916881326028387e-07,
- "loss": 0.3178265392780304,
- "mean_token_accuracy": 0.9016884565353394,
- "num_tokens": 26572087.0,
- "step": 2972
- },
- {
- "epoch": 2.2591185410334345,
- "grad_norm": 2.222161054611206,
- "learning_rate": 7.901595838746471e-07,
- "loss": 0.3013504445552826,
- "mean_token_accuracy": 0.8942798376083374,
- "num_tokens": 26578159.0,
- "step": 2973
- },
- {
- "epoch": 2.2598784194528876,
- "grad_norm": 1.979411005973816,
- "learning_rate": 7.886322351782782e-07,
- "loss": 0.42746615409851074,
- "mean_token_accuracy": 0.85303795337677,
- "num_tokens": 26586252.0,
- "step": 2974
- },
- {
- "epoch": 2.2606382978723403,
- "grad_norm": 1.4925786256790161,
- "learning_rate": 7.871060875856854e-07,
- "loss": 0.33495625853538513,
- "mean_token_accuracy": 0.8911026120185852,
- "num_tokens": 26599921.0,
- "step": 2975
- },
- {
- "epoch": 2.2613981762917934,
- "grad_norm": 1.9037046432495117,
- "learning_rate": 7.855811421679746e-07,
- "loss": 0.31471866369247437,
- "mean_token_accuracy": 0.9007552862167358,
- "num_tokens": 26607954.0,
- "step": 2976
- },
- {
- "epoch": 2.262158054711246,
- "grad_norm": 2.2751407623291016,
- "learning_rate": 7.840573999954154e-07,
- "loss": 0.26972368359565735,
- "mean_token_accuracy": 0.8992317914962769,
- "num_tokens": 26614036.0,
- "step": 2977
- },
- {
- "epoch": 2.262917933130699,
- "grad_norm": 2.680572271347046,
- "learning_rate": 7.825348621374257e-07,
- "loss": 0.4264066219329834,
- "mean_token_accuracy": 0.8547691106796265,
- "num_tokens": 26619545.0,
- "step": 2978
- },
- {
- "epoch": 2.263677811550152,
- "grad_norm": 2.3535876274108887,
- "learning_rate": 7.810135296625817e-07,
- "loss": 0.37871062755584717,
- "mean_token_accuracy": 0.8621708750724792,
- "num_tokens": 26626248.0,
- "step": 2979
- },
- {
- "epoch": 2.264437689969605,
- "grad_norm": 1.2249537706375122,
- "learning_rate": 7.794934036386139e-07,
- "loss": 0.3877285122871399,
- "mean_token_accuracy": 0.8593572378158569,
- "num_tokens": 26648023.0,
- "step": 2980
- },
- {
- "epoch": 2.2651975683890577,
- "grad_norm": 2.43371844291687,
- "learning_rate": 7.779744851324048e-07,
- "loss": 0.37463510036468506,
- "mean_token_accuracy": 0.8646193742752075,
- "num_tokens": 26654016.0,
- "step": 2981
- },
- {
- "epoch": 2.2659574468085104,
- "grad_norm": 1.7429327964782715,
- "learning_rate": 7.7645677520999e-07,
- "loss": 0.4033060669898987,
- "mean_token_accuracy": 0.8644014596939087,
- "num_tokens": 26664447.0,
- "step": 2982
- },
- {
- "epoch": 2.2667173252279635,
- "grad_norm": 2.4090006351470947,
- "learning_rate": 7.749402749365573e-07,
- "loss": 0.2981206774711609,
- "mean_token_accuracy": 0.8886175751686096,
- "num_tokens": 26670355.0,
- "step": 2983
- },
- {
- "epoch": 2.2674772036474162,
- "grad_norm": 1.3855396509170532,
- "learning_rate": 7.734249853764428e-07,
- "loss": 0.35967472195625305,
- "mean_token_accuracy": 0.8652631044387817,
- "num_tokens": 26685385.0,
- "step": 2984
- },
- {
- "epoch": 2.2682370820668694,
- "grad_norm": 1.328214168548584,
- "learning_rate": 7.719109075931375e-07,
- "loss": 0.3571951389312744,
- "mean_token_accuracy": 0.8894522190093994,
- "num_tokens": 26703265.0,
- "step": 2985
- },
- {
- "epoch": 2.268996960486322,
- "grad_norm": 2.5001046657562256,
- "learning_rate": 7.703980426492791e-07,
- "loss": 0.3512844741344452,
- "mean_token_accuracy": 0.887405514717102,
- "num_tokens": 26709095.0,
- "step": 2986
- },
- {
- "epoch": 2.269756838905775,
- "grad_norm": 1.8704569339752197,
- "learning_rate": 7.688863916066524e-07,
- "loss": 0.2746743857860565,
- "mean_token_accuracy": 0.903412401676178,
- "num_tokens": 26716815.0,
- "step": 2987
- },
- {
- "epoch": 2.270516717325228,
- "grad_norm": 2.1134285926818848,
- "learning_rate": 7.673759555261947e-07,
- "loss": 0.38385504484176636,
- "mean_token_accuracy": 0.8759124279022217,
- "num_tokens": 26724046.0,
- "step": 2988
- },
- {
- "epoch": 2.271276595744681,
- "grad_norm": 1.2651840448379517,
- "learning_rate": 7.65866735467988e-07,
- "loss": 0.3499506413936615,
- "mean_token_accuracy": 0.8704953193664551,
- "num_tokens": 26743024.0,
- "step": 2989
- },
- {
- "epoch": 2.2720364741641337,
- "grad_norm": 1.7289817333221436,
- "learning_rate": 7.643587324912597e-07,
- "loss": 0.3768725097179413,
- "mean_token_accuracy": 0.8623670339584351,
- "num_tokens": 26754336.0,
- "step": 2990
- },
- {
- "epoch": 2.272796352583587,
- "grad_norm": 1.6121667623519897,
- "learning_rate": 7.628519476543839e-07,
- "loss": 0.42746737599372864,
- "mean_token_accuracy": 0.8425478935241699,
- "num_tokens": 26766813.0,
- "step": 2991
- },
- {
- "epoch": 2.2735562310030395,
- "grad_norm": 2.705442428588867,
- "learning_rate": 7.613463820148831e-07,
- "loss": 0.27137982845306396,
- "mean_token_accuracy": 0.9014253616333008,
- "num_tokens": 26772565.0,
- "step": 2992
- },
- {
- "epoch": 2.274316109422492,
- "grad_norm": 1.3811960220336914,
- "learning_rate": 7.598420366294185e-07,
- "loss": 0.2957465350627899,
- "mean_token_accuracy": 0.8935354351997375,
- "num_tokens": 26787325.0,
- "step": 2993
- },
- {
- "epoch": 2.2750759878419453,
- "grad_norm": 2.469336986541748,
- "learning_rate": 7.583389125537982e-07,
- "loss": 0.2811780273914337,
- "mean_token_accuracy": 0.8956634998321533,
- "num_tokens": 26793457.0,
- "step": 2994
- },
- {
- "epoch": 2.275835866261398,
- "grad_norm": 2.945681571960449,
- "learning_rate": 7.568370108429732e-07,
- "loss": 0.3186708092689514,
- "mean_token_accuracy": 0.8817545175552368,
- "num_tokens": 26797867.0,
- "step": 2995
- },
- {
- "epoch": 2.276595744680851,
- "grad_norm": 1.7748228311538696,
- "learning_rate": 7.553363325510355e-07,
- "loss": 0.3279818892478943,
- "mean_token_accuracy": 0.884396493434906,
- "num_tokens": 26806656.0,
- "step": 2996
- },
- {
- "epoch": 2.277355623100304,
- "grad_norm": 1.312500238418579,
- "learning_rate": 7.538368787312186e-07,
- "loss": 0.3754822611808777,
- "mean_token_accuracy": 0.8653179407119751,
- "num_tokens": 26823126.0,
- "step": 2997
- },
- {
- "epoch": 2.278115501519757,
- "grad_norm": 3.1305344104766846,
- "learning_rate": 7.523386504358984e-07,
- "loss": 0.3293214440345764,
- "mean_token_accuracy": 0.8908799886703491,
- "num_tokens": 26828250.0,
- "step": 2998
- },
- {
- "epoch": 2.2788753799392096,
- "grad_norm": 2.6449344158172607,
- "learning_rate": 7.508416487165862e-07,
- "loss": 0.23732036352157593,
- "mean_token_accuracy": 0.9029837846755981,
- "num_tokens": 26833123.0,
- "step": 2999
- },
- {
- "epoch": 2.2796352583586628,
- "grad_norm": 2.04388427734375,
- "learning_rate": 7.49345874623939e-07,
- "loss": 0.31240373849868774,
- "mean_token_accuracy": 0.8860392570495605,
- "num_tokens": 26840878.0,
- "step": 3000
- }
- ],
- "logging_steps": 1.0,
- "max_steps": 3948,
- "num_input_tokens_seen": 0,
- "num_train_epochs": 3,
- "save_steps": 1000,
- "stateful_callbacks": {
- "TrainerControl": {
- "args": {
- "should_epoch_stop": false,
- "should_evaluate": false,
- "should_log": false,
- "should_save": true,
- "should_training_stop": false
- },
- "attributes": {}
- }
- },
- "total_flos": 2.925799536381133e+17,
- "train_batch_size": 1,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin
deleted file mode 100644
index 2fc4f538d721f958cdceda5408f2f4e1a35f4210..0000000000000000000000000000000000000000
--- a/checkpoint-3000/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021
-size 6225
diff --git a/checkpoint-3948/chat_template.jinja b/checkpoint-3948/chat_template.jinja
deleted file mode 100644
index 70adff8a08fb31e0636f618564838d4bf3c05286..0000000000000000000000000000000000000000
--- a/checkpoint-3948/chat_template.jinja
+++ /dev/null
@@ -1,61 +0,0 @@
-{%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0].role == 'system' %}
- {{- messages[0].content + '\n\n' }}
- {%- endif %}
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
-{%- else %}
- {%- if messages[0].role == 'system' %}
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
- {%- endif %}
-{%- endif %}
-{%- for message in messages %}
- {%- if message.content is string %}
- {%- set content = message.content %}
- {%- else %}
- {%- set content = '' %}
- {%- endif %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
- {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- if message.tool_calls %}
- {%- for tool_call in message.tool_calls %}
- {%- if (loop.first and content) or (not loop.first) %}
- {{- '\n' }}
- {%- endif %}
- {%- if tool_call.function %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {%- if tool_call.arguments is string %}
- {{- tool_call.arguments }}
- {%- else %}
- {{- tool_call.arguments | tojson }}
- {%- endif %}
- {{- '}\n' }}
- {%- endfor %}
- {%- endif %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n\n' }}
- {{- content }}
- {{- '\n' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
-{%- endif %}
\ No newline at end of file
diff --git a/checkpoint-3948/config.json b/checkpoint-3948/config.json
deleted file mode 100644
index c351e5fb52f50ea6e07b40981aef81c80f9df7e4..0000000000000000000000000000000000000000
--- a/checkpoint-3948/config.json
+++ /dev/null
@@ -1,71 +0,0 @@
-{
- "architectures": [
- "Qwen3ForCausalLM"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": null,
- "dtype": "float32",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2560,
- "initializer_range": 0.02,
- "intermediate_size": 9728,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 262144,
- "max_window_layers": 36,
- "model_type": "qwen3",
- "num_attention_heads": 32,
- "num_hidden_layers": 36,
- "num_key_value_heads": 8,
- "pad_token_id": 151662,
- "rms_norm_eps": 1e-06,
- "rope_parameters": {
- "rope_theta": 5000000,
- "rope_type": "default"
- },
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "5.5.3",
- "use_cache": false,
- "use_sliding_window": false,
- "vocab_size": 151936
-}
diff --git a/checkpoint-3948/generation_config.json b/checkpoint-3948/generation_config.json
deleted file mode 100644
index 2104b83493c2833855e8fe32a7a784805ab5c2ee..0000000000000000000000000000000000000000
--- a/checkpoint-3948/generation_config.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
- "do_sample": true,
- "eos_token_id": [
- 151645,
- 151643
- ],
- "pad_token_id": 151662,
- "temperature": 0.7,
- "top_k": 20,
- "top_p": 0.8,
- "transformers_version": "5.5.3"
-}
diff --git a/checkpoint-3948/model.safetensors b/checkpoint-3948/model.safetensors
deleted file mode 100644
index f787ad62bc7ccc577c324b6d71689c0739123f0c..0000000000000000000000000000000000000000
--- a/checkpoint-3948/model.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e7db19800bbcf792dcb25dea9b5ae39f4e934a0d56f64ed6f74d7d89e87ae928
-size 17645743048
diff --git a/checkpoint-3948/optimizer.bin b/checkpoint-3948/optimizer.bin
deleted file mode 100644
index 90ea9835df74c549d6f6b88c64f00fdc211af5fa..0000000000000000000000000000000000000000
--- a/checkpoint-3948/optimizer.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:656d334c407ae1443fcaeda271d597e51249875fdde8e1a12a024812f6de73ab
-size 32180124005
diff --git a/checkpoint-3948/pytorch_model_fsdp.bin b/checkpoint-3948/pytorch_model_fsdp.bin
deleted file mode 100644
index a96db7a5fcab43218d82108cacd5f6fc2583929f..0000000000000000000000000000000000000000
--- a/checkpoint-3948/pytorch_model_fsdp.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:51d19fbc90bb938bf3c747a8b9c2b23f00398029d4ab146ca0ca0a0ea7d8885c
-size 17645897996
diff --git a/checkpoint-3948/rng_state_0.pth b/checkpoint-3948/rng_state_0.pth
deleted file mode 100644
index 5379ca97bc0c62d226d0fc37920d4937a7bb8b43..0000000000000000000000000000000000000000
--- a/checkpoint-3948/rng_state_0.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:61e957b4cd785256be4cb26eb03060ef689e1d58f1766d7f26ca36a62bec4994
-size 14917
diff --git a/checkpoint-3948/rng_state_1.pth b/checkpoint-3948/rng_state_1.pth
deleted file mode 100644
index 662ad0d5b30369c825f66c080779973608c5058e..0000000000000000000000000000000000000000
--- a/checkpoint-3948/rng_state_1.pth
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:550c54d430b44b77b0abe44c6e3ceba90a155305315c081b7616b35e2c18d1ce
-size 14917
diff --git a/checkpoint-3948/scheduler.pt b/checkpoint-3948/scheduler.pt
deleted file mode 100644
index 51ed35f90326eb016d2a1c3993d7061549624ca8..0000000000000000000000000000000000000000
--- a/checkpoint-3948/scheduler.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:deaab1725fa5d6abb332a09b31b7c4d93808c0289cb39a32cd5102547b98e285
-size 1465
diff --git a/checkpoint-3948/tokenizer.json b/checkpoint-3948/tokenizer.json
deleted file mode 100644
index c7afbed2efcdf019f88ab0572ec29d3bf595dfe2..0000000000000000000000000000000000000000
--- a/checkpoint-3948/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
-size 11422650
diff --git a/checkpoint-3948/tokenizer_config.json b/checkpoint-3948/tokenizer_config.json
deleted file mode 100644
index 4e47e52c4e7f0b2bcf2103a878790216f3f6436d..0000000000000000000000000000000000000000
--- a/checkpoint-3948/tokenizer_config.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
- "add_prefix_space": false,
- "backend": "tokenizers",
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "is_local": false,
- "model_max_length": 1010000,
- "pad_token": "<|fim_pad|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
-}
diff --git a/checkpoint-3948/trainer_state.json b/checkpoint-3948/trainer_state.json
deleted file mode 100644
index 5d447faf10413b9ec27585679ff7a32bdbe441fe..0000000000000000000000000000000000000000
--- a/checkpoint-3948/trainer_state.json
+++ /dev/null
@@ -1,35566 +0,0 @@
-{
- "best_global_step": null,
- "best_metric": null,
- "best_model_checkpoint": null,
- "epoch": 3.0,
- "eval_steps": 500,
- "global_step": 3948,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.0007598784194528875,
- "grad_norm": 11.767926216125488,
- "learning_rate": 0.0,
- "loss": 0.7937269806861877,
- "mean_token_accuracy": 0.7822731137275696,
- "num_tokens": 10507.0,
- "step": 1
- },
- {
- "epoch": 0.001519756838905775,
- "grad_norm": 14.9199800491333,
- "learning_rate": 2.5252525252525256e-08,
- "loss": 0.7665389776229858,
- "mean_token_accuracy": 0.8342233300209045,
- "num_tokens": 14806.0,
- "step": 2
- },
- {
- "epoch": 0.0022796352583586625,
- "grad_norm": 11.991217613220215,
- "learning_rate": 5.050505050505051e-08,
- "loss": 0.9597002267837524,
- "mean_token_accuracy": 0.7054992318153381,
- "num_tokens": 27170.0,
- "step": 3
- },
- {
- "epoch": 0.00303951367781155,
- "grad_norm": 12.958333015441895,
- "learning_rate": 7.575757575757576e-08,
- "loss": 0.9971482753753662,
- "mean_token_accuracy": 0.7261134386062622,
- "num_tokens": 33729.0,
- "step": 4
- },
- {
- "epoch": 0.003799392097264438,
- "grad_norm": 13.5665283203125,
- "learning_rate": 1.0101010101010103e-07,
- "loss": 0.9504883885383606,
- "mean_token_accuracy": 0.745307445526123,
- "num_tokens": 41174.0,
- "step": 5
- },
- {
- "epoch": 0.004559270516717325,
- "grad_norm": 10.09444808959961,
- "learning_rate": 1.2626262626262626e-07,
- "loss": 0.759548008441925,
- "mean_token_accuracy": 0.7842121124267578,
- "num_tokens": 47943.0,
- "step": 6
- },
- {
- "epoch": 0.005319148936170213,
- "grad_norm": 10.741650581359863,
- "learning_rate": 1.5151515151515152e-07,
- "loss": 0.8231598138809204,
- "mean_token_accuracy": 0.7550969123840332,
- "num_tokens": 56665.0,
- "step": 7
- },
- {
- "epoch": 0.0060790273556231,
- "grad_norm": 12.250170707702637,
- "learning_rate": 1.767676767676768e-07,
- "loss": 0.8576581478118896,
- "mean_token_accuracy": 0.7568671703338623,
- "num_tokens": 67606.0,
- "step": 8
- },
- {
- "epoch": 0.006838905775075988,
- "grad_norm": 12.828629493713379,
- "learning_rate": 2.0202020202020205e-07,
- "loss": 0.9886435866355896,
- "mean_token_accuracy": 0.733400285243988,
- "num_tokens": 74272.0,
- "step": 9
- },
- {
- "epoch": 0.007598784194528876,
- "grad_norm": 15.966923713684082,
- "learning_rate": 2.2727272727272729e-07,
- "loss": 1.064985990524292,
- "mean_token_accuracy": 0.7101132869720459,
- "num_tokens": 80524.0,
- "step": 10
- },
- {
- "epoch": 0.008358662613981762,
- "grad_norm": 10.864850044250488,
- "learning_rate": 2.525252525252525e-07,
- "loss": 0.8311550617218018,
- "mean_token_accuracy": 0.7431639432907104,
- "num_tokens": 96292.0,
- "step": 11
- },
- {
- "epoch": 0.00911854103343465,
- "grad_norm": 16.438785552978516,
- "learning_rate": 2.7777777777777776e-07,
- "loss": 1.0579866170883179,
- "mean_token_accuracy": 0.7222976684570312,
- "num_tokens": 102992.0,
- "step": 12
- },
- {
- "epoch": 0.009878419452887538,
- "grad_norm": 11.179214477539062,
- "learning_rate": 3.0303030303030305e-07,
- "loss": 0.9816144704818726,
- "mean_token_accuracy": 0.7206371426582336,
- "num_tokens": 113571.0,
- "step": 13
- },
- {
- "epoch": 0.010638297872340425,
- "grad_norm": 12.780299186706543,
- "learning_rate": 3.2828282828282834e-07,
- "loss": 0.847449004650116,
- "mean_token_accuracy": 0.7826199531555176,
- "num_tokens": 119568.0,
- "step": 14
- },
- {
- "epoch": 0.011398176291793313,
- "grad_norm": 14.800421714782715,
- "learning_rate": 3.535353535353536e-07,
- "loss": 0.9275516271591187,
- "mean_token_accuracy": 0.7655045986175537,
- "num_tokens": 126258.0,
- "step": 15
- },
- {
- "epoch": 0.0121580547112462,
- "grad_norm": 11.267602920532227,
- "learning_rate": 3.787878787878788e-07,
- "loss": 0.8464037179946899,
- "mean_token_accuracy": 0.7606508731842041,
- "num_tokens": 136831.0,
- "step": 16
- },
- {
- "epoch": 0.012917933130699088,
- "grad_norm": 12.891013145446777,
- "learning_rate": 4.040404040404041e-07,
- "loss": 0.9903074502944946,
- "mean_token_accuracy": 0.7247487306594849,
- "num_tokens": 150434.0,
- "step": 17
- },
- {
- "epoch": 0.013677811550151976,
- "grad_norm": 11.13957691192627,
- "learning_rate": 4.2929292929292934e-07,
- "loss": 0.8287211656570435,
- "mean_token_accuracy": 0.7621913552284241,
- "num_tokens": 158516.0,
- "step": 18
- },
- {
- "epoch": 0.014437689969604863,
- "grad_norm": 18.39569664001465,
- "learning_rate": 4.5454545454545457e-07,
- "loss": 1.150015115737915,
- "mean_token_accuracy": 0.7349498271942139,
- "num_tokens": 162214.0,
- "step": 19
- },
- {
- "epoch": 0.015197568389057751,
- "grad_norm": 9.353750228881836,
- "learning_rate": 4.797979797979798e-07,
- "loss": 0.7228299379348755,
- "mean_token_accuracy": 0.7969573736190796,
- "num_tokens": 173035.0,
- "step": 20
- },
- {
- "epoch": 0.015957446808510637,
- "grad_norm": 8.267163276672363,
- "learning_rate": 5.05050505050505e-07,
- "loss": 0.7358136177062988,
- "mean_token_accuracy": 0.7903937101364136,
- "num_tokens": 183568.0,
- "step": 21
- },
- {
- "epoch": 0.016717325227963525,
- "grad_norm": 11.137128829956055,
- "learning_rate": 5.303030303030304e-07,
- "loss": 1.0075397491455078,
- "mean_token_accuracy": 0.702807605266571,
- "num_tokens": 192759.0,
- "step": 22
- },
- {
- "epoch": 0.017477203647416412,
- "grad_norm": 10.734103202819824,
- "learning_rate": 5.555555555555555e-07,
- "loss": 0.8925919532775879,
- "mean_token_accuracy": 0.7475671768188477,
- "num_tokens": 201280.0,
- "step": 23
- },
- {
- "epoch": 0.0182370820668693,
- "grad_norm": 11.945566177368164,
- "learning_rate": 5.808080808080809e-07,
- "loss": 0.7260514497756958,
- "mean_token_accuracy": 0.7859152555465698,
- "num_tokens": 218053.0,
- "step": 24
- },
- {
- "epoch": 0.018996960486322188,
- "grad_norm": 18.610652923583984,
- "learning_rate": 6.060606060606061e-07,
- "loss": 0.8995465636253357,
- "mean_token_accuracy": 0.7931990623474121,
- "num_tokens": 220953.0,
- "step": 25
- },
- {
- "epoch": 0.019756838905775075,
- "grad_norm": 10.51898193359375,
- "learning_rate": 6.313131313131314e-07,
- "loss": 0.9532671570777893,
- "mean_token_accuracy": 0.7257645726203918,
- "num_tokens": 231200.0,
- "step": 26
- },
- {
- "epoch": 0.020516717325227963,
- "grad_norm": 9.581812858581543,
- "learning_rate": 6.565656565656567e-07,
- "loss": 0.9038010239601135,
- "mean_token_accuracy": 0.7390379905700684,
- "num_tokens": 237711.0,
- "step": 27
- },
- {
- "epoch": 0.02127659574468085,
- "grad_norm": 12.297484397888184,
- "learning_rate": 6.818181818181818e-07,
- "loss": 1.048936367034912,
- "mean_token_accuracy": 0.7175670862197876,
- "num_tokens": 242503.0,
- "step": 28
- },
- {
- "epoch": 0.022036474164133738,
- "grad_norm": 7.437953472137451,
- "learning_rate": 7.070707070707071e-07,
- "loss": 0.8308826684951782,
- "mean_token_accuracy": 0.7415335774421692,
- "num_tokens": 250842.0,
- "step": 29
- },
- {
- "epoch": 0.022796352583586626,
- "grad_norm": 6.134475231170654,
- "learning_rate": 7.323232323232324e-07,
- "loss": 0.647913932800293,
- "mean_token_accuracy": 0.8124054670333862,
- "num_tokens": 267453.0,
- "step": 30
- },
- {
- "epoch": 0.023556231003039513,
- "grad_norm": 6.678966045379639,
- "learning_rate": 7.575757575757576e-07,
- "loss": 0.7052810192108154,
- "mean_token_accuracy": 0.7908754348754883,
- "num_tokens": 284416.0,
- "step": 31
- },
- {
- "epoch": 0.0243161094224924,
- "grad_norm": 7.42232084274292,
- "learning_rate": 7.82828282828283e-07,
- "loss": 1.022383213043213,
- "mean_token_accuracy": 0.7053230404853821,
- "num_tokens": 292073.0,
- "step": 32
- },
- {
- "epoch": 0.02507598784194529,
- "grad_norm": 6.463219165802002,
- "learning_rate": 8.080808080808082e-07,
- "loss": 0.7603012323379517,
- "mean_token_accuracy": 0.7728140354156494,
- "num_tokens": 298550.0,
- "step": 33
- },
- {
- "epoch": 0.025835866261398176,
- "grad_norm": 5.668411731719971,
- "learning_rate": 8.333333333333333e-07,
- "loss": 0.7707852721214294,
- "mean_token_accuracy": 0.7827773094177246,
- "num_tokens": 306683.0,
- "step": 34
- },
- {
- "epoch": 0.026595744680851064,
- "grad_norm": 4.984964847564697,
- "learning_rate": 8.585858585858587e-07,
- "loss": 0.6317349672317505,
- "mean_token_accuracy": 0.8106861114501953,
- "num_tokens": 318842.0,
- "step": 35
- },
- {
- "epoch": 0.02735562310030395,
- "grad_norm": 4.421732425689697,
- "learning_rate": 8.838383838383839e-07,
- "loss": 0.6228617429733276,
- "mean_token_accuracy": 0.8023355603218079,
- "num_tokens": 329850.0,
- "step": 36
- },
- {
- "epoch": 0.02811550151975684,
- "grad_norm": 5.970808029174805,
- "learning_rate": 9.090909090909091e-07,
- "loss": 0.8443238139152527,
- "mean_token_accuracy": 0.7462409734725952,
- "num_tokens": 335844.0,
- "step": 37
- },
- {
- "epoch": 0.028875379939209727,
- "grad_norm": 4.5389084815979,
- "learning_rate": 9.343434343434345e-07,
- "loss": 0.6976436376571655,
- "mean_token_accuracy": 0.790410041809082,
- "num_tokens": 348768.0,
- "step": 38
- },
- {
- "epoch": 0.029635258358662615,
- "grad_norm": 4.116631507873535,
- "learning_rate": 9.595959595959596e-07,
- "loss": 0.6698519587516785,
- "mean_token_accuracy": 0.7818127870559692,
- "num_tokens": 355460.0,
- "step": 39
- },
- {
- "epoch": 0.030395136778115502,
- "grad_norm": 3.3714773654937744,
- "learning_rate": 9.84848484848485e-07,
- "loss": 0.5723201036453247,
- "mean_token_accuracy": 0.8100086450576782,
- "num_tokens": 368507.0,
- "step": 40
- },
- {
- "epoch": 0.03115501519756839,
- "grad_norm": 4.4438347816467285,
- "learning_rate": 1.01010101010101e-06,
- "loss": 0.7508786916732788,
- "mean_token_accuracy": 0.7711942791938782,
- "num_tokens": 376467.0,
- "step": 41
- },
- {
- "epoch": 0.031914893617021274,
- "grad_norm": 5.609974384307861,
- "learning_rate": 1.0353535353535354e-06,
- "loss": 0.566256046295166,
- "mean_token_accuracy": 0.8319284319877625,
- "num_tokens": 381399.0,
- "step": 42
- },
- {
- "epoch": 0.03267477203647416,
- "grad_norm": 5.124386787414551,
- "learning_rate": 1.0606060606060608e-06,
- "loss": 0.8151067495346069,
- "mean_token_accuracy": 0.7537785768508911,
- "num_tokens": 387389.0,
- "step": 43
- },
- {
- "epoch": 0.03343465045592705,
- "grad_norm": 3.6318116188049316,
- "learning_rate": 1.085858585858586e-06,
- "loss": 0.5989949107170105,
- "mean_token_accuracy": 0.8129256963729858,
- "num_tokens": 395302.0,
- "step": 44
- },
- {
- "epoch": 0.03419452887537994,
- "grad_norm": 2.694424629211426,
- "learning_rate": 1.111111111111111e-06,
- "loss": 0.5831396579742432,
- "mean_token_accuracy": 0.8056820631027222,
- "num_tokens": 409920.0,
- "step": 45
- },
- {
- "epoch": 0.034954407294832825,
- "grad_norm": 2.2949178218841553,
- "learning_rate": 1.1363636363636364e-06,
- "loss": 0.472550630569458,
- "mean_token_accuracy": 0.8343006372451782,
- "num_tokens": 428323.0,
- "step": 46
- },
- {
- "epoch": 0.03571428571428571,
- "grad_norm": 3.3930575847625732,
- "learning_rate": 1.1616161616161617e-06,
- "loss": 0.6246505379676819,
- "mean_token_accuracy": 0.783149003982544,
- "num_tokens": 435889.0,
- "step": 47
- },
- {
- "epoch": 0.0364741641337386,
- "grad_norm": 3.692598819732666,
- "learning_rate": 1.186868686868687e-06,
- "loss": 0.46132946014404297,
- "mean_token_accuracy": 0.8583089113235474,
- "num_tokens": 441192.0,
- "step": 48
- },
- {
- "epoch": 0.03723404255319149,
- "grad_norm": 6.571533203125,
- "learning_rate": 1.2121212121212122e-06,
- "loss": 0.9351121783256531,
- "mean_token_accuracy": 0.7580878734588623,
- "num_tokens": 444277.0,
- "step": 49
- },
- {
- "epoch": 0.037993920972644375,
- "grad_norm": 5.029570579528809,
- "learning_rate": 1.2373737373737375e-06,
- "loss": 0.6921554803848267,
- "mean_token_accuracy": 0.8131166100502014,
- "num_tokens": 447646.0,
- "step": 50
- },
- {
- "epoch": 0.03875379939209726,
- "grad_norm": 2.9174208641052246,
- "learning_rate": 1.2626262626262629e-06,
- "loss": 0.591706395149231,
- "mean_token_accuracy": 0.8108617067337036,
- "num_tokens": 461397.0,
- "step": 51
- },
- {
- "epoch": 0.03951367781155015,
- "grad_norm": 4.315536022186279,
- "learning_rate": 1.287878787878788e-06,
- "loss": 0.6986310482025146,
- "mean_token_accuracy": 0.7710754871368408,
- "num_tokens": 472047.0,
- "step": 52
- },
- {
- "epoch": 0.04027355623100304,
- "grad_norm": 2.6216275691986084,
- "learning_rate": 1.3131313131313134e-06,
- "loss": 0.5553690791130066,
- "mean_token_accuracy": 0.8167896866798401,
- "num_tokens": 482795.0,
- "step": 53
- },
- {
- "epoch": 0.041033434650455926,
- "grad_norm": 3.0562477111816406,
- "learning_rate": 1.3383838383838385e-06,
- "loss": 0.6909202337265015,
- "mean_token_accuracy": 0.7859863638877869,
- "num_tokens": 494818.0,
- "step": 54
- },
- {
- "epoch": 0.04179331306990881,
- "grad_norm": 2.1420412063598633,
- "learning_rate": 1.3636363636363636e-06,
- "loss": 0.5415265560150146,
- "mean_token_accuracy": 0.818886399269104,
- "num_tokens": 513695.0,
- "step": 55
- },
- {
- "epoch": 0.0425531914893617,
- "grad_norm": 2.9610488414764404,
- "learning_rate": 1.3888888888888892e-06,
- "loss": 0.6602212190628052,
- "mean_token_accuracy": 0.7830734252929688,
- "num_tokens": 523784.0,
- "step": 56
- },
- {
- "epoch": 0.04331306990881459,
- "grad_norm": 2.511972665786743,
- "learning_rate": 1.4141414141414143e-06,
- "loss": 0.5717809796333313,
- "mean_token_accuracy": 0.8053616285324097,
- "num_tokens": 546308.0,
- "step": 57
- },
- {
- "epoch": 0.044072948328267476,
- "grad_norm": 3.52642822265625,
- "learning_rate": 1.4393939393939396e-06,
- "loss": 0.6242594718933105,
- "mean_token_accuracy": 0.8162082433700562,
- "num_tokens": 552019.0,
- "step": 58
- },
- {
- "epoch": 0.044832826747720364,
- "grad_norm": 3.02362322807312,
- "learning_rate": 1.4646464646464648e-06,
- "loss": 0.6634255647659302,
- "mean_token_accuracy": 0.7682032585144043,
- "num_tokens": 560009.0,
- "step": 59
- },
- {
- "epoch": 0.04559270516717325,
- "grad_norm": 2.3910107612609863,
- "learning_rate": 1.48989898989899e-06,
- "loss": 0.5519146919250488,
- "mean_token_accuracy": 0.8270269632339478,
- "num_tokens": 571005.0,
- "step": 60
- },
- {
- "epoch": 0.04635258358662614,
- "grad_norm": 4.28154993057251,
- "learning_rate": 1.5151515151515152e-06,
- "loss": 0.7437789440155029,
- "mean_token_accuracy": 0.7782418131828308,
- "num_tokens": 574950.0,
- "step": 61
- },
- {
- "epoch": 0.04711246200607903,
- "grad_norm": 3.4078686237335205,
- "learning_rate": 1.5404040404040404e-06,
- "loss": 0.6345915198326111,
- "mean_token_accuracy": 0.7903392314910889,
- "num_tokens": 581657.0,
- "step": 62
- },
- {
- "epoch": 0.047872340425531915,
- "grad_norm": 2.6834158897399902,
- "learning_rate": 1.565656565656566e-06,
- "loss": 0.5981127023696899,
- "mean_token_accuracy": 0.7911489605903625,
- "num_tokens": 591267.0,
- "step": 63
- },
- {
- "epoch": 0.0486322188449848,
- "grad_norm": 2.1054461002349854,
- "learning_rate": 1.590909090909091e-06,
- "loss": 0.5523523688316345,
- "mean_token_accuracy": 0.8194501399993896,
- "num_tokens": 606787.0,
- "step": 64
- },
- {
- "epoch": 0.04939209726443769,
- "grad_norm": 3.322596788406372,
- "learning_rate": 1.6161616161616164e-06,
- "loss": 0.48417025804519653,
- "mean_token_accuracy": 0.8293706178665161,
- "num_tokens": 611068.0,
- "step": 65
- },
- {
- "epoch": 0.05015197568389058,
- "grad_norm": 2.302450180053711,
- "learning_rate": 1.6414141414141415e-06,
- "loss": 0.6498389840126038,
- "mean_token_accuracy": 0.7728497385978699,
- "num_tokens": 624452.0,
- "step": 66
- },
- {
- "epoch": 0.050911854103343465,
- "grad_norm": 2.680191993713379,
- "learning_rate": 1.6666666666666667e-06,
- "loss": 0.6347037553787231,
- "mean_token_accuracy": 0.8108306527137756,
- "num_tokens": 638049.0,
- "step": 67
- },
- {
- "epoch": 0.05167173252279635,
- "grad_norm": 3.0297021865844727,
- "learning_rate": 1.6919191919191922e-06,
- "loss": 0.5344363451004028,
- "mean_token_accuracy": 0.8113535046577454,
- "num_tokens": 643892.0,
- "step": 68
- },
- {
- "epoch": 0.05243161094224924,
- "grad_norm": 2.9283676147460938,
- "learning_rate": 1.7171717171717173e-06,
- "loss": 0.6999260187149048,
- "mean_token_accuracy": 0.7782022356987,
- "num_tokens": 654418.0,
- "step": 69
- },
- {
- "epoch": 0.05319148936170213,
- "grad_norm": 3.4098572731018066,
- "learning_rate": 1.7424242424242427e-06,
- "loss": 0.6508946418762207,
- "mean_token_accuracy": 0.7942900657653809,
- "num_tokens": 659837.0,
- "step": 70
- },
- {
- "epoch": 0.053951367781155016,
- "grad_norm": 2.6756019592285156,
- "learning_rate": 1.7676767676767678e-06,
- "loss": 0.603486180305481,
- "mean_token_accuracy": 0.8015457391738892,
- "num_tokens": 668361.0,
- "step": 71
- },
- {
- "epoch": 0.0547112462006079,
- "grad_norm": 2.2630293369293213,
- "learning_rate": 1.792929292929293e-06,
- "loss": 0.6608274579048157,
- "mean_token_accuracy": 0.7753809690475464,
- "num_tokens": 679025.0,
- "step": 72
- },
- {
- "epoch": 0.05547112462006079,
- "grad_norm": 2.123962879180908,
- "learning_rate": 1.8181818181818183e-06,
- "loss": 0.4525482654571533,
- "mean_token_accuracy": 0.8425612449645996,
- "num_tokens": 688574.0,
- "step": 73
- },
- {
- "epoch": 0.05623100303951368,
- "grad_norm": 7.90519905090332,
- "learning_rate": 1.8434343434343434e-06,
- "loss": 0.6507195830345154,
- "mean_token_accuracy": 0.7714964151382446,
- "num_tokens": 694534.0,
- "step": 74
- },
- {
- "epoch": 0.056990881458966566,
- "grad_norm": 2.372203826904297,
- "learning_rate": 1.868686868686869e-06,
- "loss": 0.4458143413066864,
- "mean_token_accuracy": 0.7991449236869812,
- "num_tokens": 703114.0,
- "step": 75
- },
- {
- "epoch": 0.057750759878419454,
- "grad_norm": 2.918677568435669,
- "learning_rate": 1.8939393939393941e-06,
- "loss": 0.5614339113235474,
- "mean_token_accuracy": 0.8211464881896973,
- "num_tokens": 709038.0,
- "step": 76
- },
- {
- "epoch": 0.05851063829787234,
- "grad_norm": 1.6106709241867065,
- "learning_rate": 1.9191919191919192e-06,
- "loss": 0.5802098512649536,
- "mean_token_accuracy": 0.8055065870285034,
- "num_tokens": 730482.0,
- "step": 77
- },
- {
- "epoch": 0.05927051671732523,
- "grad_norm": 2.8069989681243896,
- "learning_rate": 1.944444444444445e-06,
- "loss": 0.5709059238433838,
- "mean_token_accuracy": 0.8024872541427612,
- "num_tokens": 751817.0,
- "step": 78
- },
- {
- "epoch": 0.06003039513677812,
- "grad_norm": 2.641667127609253,
- "learning_rate": 1.96969696969697e-06,
- "loss": 0.6480152606964111,
- "mean_token_accuracy": 0.7912271618843079,
- "num_tokens": 759236.0,
- "step": 79
- },
- {
- "epoch": 0.060790273556231005,
- "grad_norm": 2.6034350395202637,
- "learning_rate": 1.994949494949495e-06,
- "loss": 0.5535176396369934,
- "mean_token_accuracy": 0.7980542778968811,
- "num_tokens": 766496.0,
- "step": 80
- },
- {
- "epoch": 0.06155015197568389,
- "grad_norm": 1.7095069885253906,
- "learning_rate": 2.02020202020202e-06,
- "loss": 0.4545496106147766,
- "mean_token_accuracy": 0.8229660391807556,
- "num_tokens": 780124.0,
- "step": 81
- },
- {
- "epoch": 0.06231003039513678,
- "grad_norm": 3.788830518722534,
- "learning_rate": 2.0454545454545457e-06,
- "loss": 0.6679391264915466,
- "mean_token_accuracy": 0.7942397594451904,
- "num_tokens": 784555.0,
- "step": 82
- },
- {
- "epoch": 0.06306990881458967,
- "grad_norm": 2.009831666946411,
- "learning_rate": 2.070707070707071e-06,
- "loss": 0.5067101120948792,
- "mean_token_accuracy": 0.8276634216308594,
- "num_tokens": 797459.0,
- "step": 83
- },
- {
- "epoch": 0.06382978723404255,
- "grad_norm": 2.201627731323242,
- "learning_rate": 2.095959595959596e-06,
- "loss": 0.5012127161026001,
- "mean_token_accuracy": 0.8432504534721375,
- "num_tokens": 810817.0,
- "step": 84
- },
- {
- "epoch": 0.06458966565349544,
- "grad_norm": 2.492568016052246,
- "learning_rate": 2.1212121212121216e-06,
- "loss": 0.6142797470092773,
- "mean_token_accuracy": 0.8338661193847656,
- "num_tokens": 818191.0,
- "step": 85
- },
- {
- "epoch": 0.06534954407294832,
- "grad_norm": 2.8360862731933594,
- "learning_rate": 2.1464646464646467e-06,
- "loss": 0.5569300651550293,
- "mean_token_accuracy": 0.8121030330657959,
- "num_tokens": 825325.0,
- "step": 86
- },
- {
- "epoch": 0.06610942249240122,
- "grad_norm": 2.407548427581787,
- "learning_rate": 2.171717171717172e-06,
- "loss": 0.6442930102348328,
- "mean_token_accuracy": 0.792514443397522,
- "num_tokens": 834439.0,
- "step": 87
- },
- {
- "epoch": 0.0668693009118541,
- "grad_norm": 2.340728759765625,
- "learning_rate": 2.196969696969697e-06,
- "loss": 0.6494365930557251,
- "mean_token_accuracy": 0.7746615409851074,
- "num_tokens": 843078.0,
- "step": 88
- },
- {
- "epoch": 0.067629179331307,
- "grad_norm": 1.7703697681427002,
- "learning_rate": 2.222222222222222e-06,
- "loss": 0.598991870880127,
- "mean_token_accuracy": 0.7992157340049744,
- "num_tokens": 860171.0,
- "step": 89
- },
- {
- "epoch": 0.06838905775075987,
- "grad_norm": 2.5779271125793457,
- "learning_rate": 2.2474747474747476e-06,
- "loss": 0.5693082809448242,
- "mean_token_accuracy": 0.8093700408935547,
- "num_tokens": 866669.0,
- "step": 90
- },
- {
- "epoch": 0.06914893617021277,
- "grad_norm": 2.014092206954956,
- "learning_rate": 2.2727272727272728e-06,
- "loss": 0.5346695780754089,
- "mean_token_accuracy": 0.8165590763092041,
- "num_tokens": 876698.0,
- "step": 91
- },
- {
- "epoch": 0.06990881458966565,
- "grad_norm": 1.7555919885635376,
- "learning_rate": 2.2979797979797983e-06,
- "loss": 0.5321458578109741,
- "mean_token_accuracy": 0.8166656494140625,
- "num_tokens": 889488.0,
- "step": 92
- },
- {
- "epoch": 0.07066869300911854,
- "grad_norm": 1.8631824254989624,
- "learning_rate": 2.3232323232323234e-06,
- "loss": 0.5246532559394836,
- "mean_token_accuracy": 0.8088107705116272,
- "num_tokens": 901322.0,
- "step": 93
- },
- {
- "epoch": 0.07142857142857142,
- "grad_norm": 3.2332139015197754,
- "learning_rate": 2.348484848484849e-06,
- "loss": 0.5141711235046387,
- "mean_token_accuracy": 0.8382217884063721,
- "num_tokens": 905792.0,
- "step": 94
- },
- {
- "epoch": 0.07218844984802432,
- "grad_norm": 1.7806555032730103,
- "learning_rate": 2.373737373737374e-06,
- "loss": 0.5233149528503418,
- "mean_token_accuracy": 0.8101529479026794,
- "num_tokens": 917320.0,
- "step": 95
- },
- {
- "epoch": 0.0729483282674772,
- "grad_norm": 1.8169859647750854,
- "learning_rate": 2.3989898989898993e-06,
- "loss": 0.578881561756134,
- "mean_token_accuracy": 0.8044873476028442,
- "num_tokens": 931062.0,
- "step": 96
- },
- {
- "epoch": 0.0737082066869301,
- "grad_norm": 4.677402496337891,
- "learning_rate": 2.4242424242424244e-06,
- "loss": 0.7842556238174438,
- "mean_token_accuracy": 0.7579764127731323,
- "num_tokens": 934712.0,
- "step": 97
- },
- {
- "epoch": 0.07446808510638298,
- "grad_norm": 2.6987264156341553,
- "learning_rate": 2.4494949494949495e-06,
- "loss": 0.5669287443161011,
- "mean_token_accuracy": 0.8186933994293213,
- "num_tokens": 941058.0,
- "step": 98
- },
- {
- "epoch": 0.07522796352583587,
- "grad_norm": 1.6906023025512695,
- "learning_rate": 2.474747474747475e-06,
- "loss": 0.4976363778114319,
- "mean_token_accuracy": 0.8198553323745728,
- "num_tokens": 956509.0,
- "step": 99
- },
- {
- "epoch": 0.07598784194528875,
- "grad_norm": 2.7256152629852295,
- "learning_rate": 2.5e-06,
- "loss": 0.7138420343399048,
- "mean_token_accuracy": 0.7752805948257446,
- "num_tokens": 963920.0,
- "step": 100
- },
- {
- "epoch": 0.07674772036474165,
- "grad_norm": 2.174870491027832,
- "learning_rate": 2.5252525252525258e-06,
- "loss": 0.6733541488647461,
- "mean_token_accuracy": 0.7745175361633301,
- "num_tokens": 975268.0,
- "step": 101
- },
- {
- "epoch": 0.07750759878419453,
- "grad_norm": 1.5587213039398193,
- "learning_rate": 2.5505050505050505e-06,
- "loss": 0.44223445653915405,
- "mean_token_accuracy": 0.8278359174728394,
- "num_tokens": 991837.0,
- "step": 102
- },
- {
- "epoch": 0.07826747720364742,
- "grad_norm": 2.181840658187866,
- "learning_rate": 2.575757575757576e-06,
- "loss": 0.625128448009491,
- "mean_token_accuracy": 0.7941786050796509,
- "num_tokens": 1004325.0,
- "step": 103
- },
- {
- "epoch": 0.0790273556231003,
- "grad_norm": 1.4986687898635864,
- "learning_rate": 2.601010101010101e-06,
- "loss": 0.39262527227401733,
- "mean_token_accuracy": 0.8412648439407349,
- "num_tokens": 1018331.0,
- "step": 104
- },
- {
- "epoch": 0.0797872340425532,
- "grad_norm": 2.3416061401367188,
- "learning_rate": 2.6262626262626267e-06,
- "loss": 0.5495132803916931,
- "mean_token_accuracy": 0.8193322420120239,
- "num_tokens": 1026090.0,
- "step": 105
- },
- {
- "epoch": 0.08054711246200608,
- "grad_norm": 3.8168859481811523,
- "learning_rate": 2.6515151515151514e-06,
- "loss": 0.4898706376552582,
- "mean_token_accuracy": 0.8467956185340881,
- "num_tokens": 1029955.0,
- "step": 106
- },
- {
- "epoch": 0.08130699088145897,
- "grad_norm": 4.113908767700195,
- "learning_rate": 2.676767676767677e-06,
- "loss": 0.6189584732055664,
- "mean_token_accuracy": 0.8019394278526306,
- "num_tokens": 1033598.0,
- "step": 107
- },
- {
- "epoch": 0.08206686930091185,
- "grad_norm": 2.50003981590271,
- "learning_rate": 2.7020202020202025e-06,
- "loss": 0.6479471921920776,
- "mean_token_accuracy": 0.7790026664733887,
- "num_tokens": 1042533.0,
- "step": 108
- },
- {
- "epoch": 0.08282674772036475,
- "grad_norm": 1.408934473991394,
- "learning_rate": 2.7272727272727272e-06,
- "loss": 0.3909248113632202,
- "mean_token_accuracy": 0.8477586507797241,
- "num_tokens": 1061755.0,
- "step": 109
- },
- {
- "epoch": 0.08358662613981763,
- "grad_norm": 3.360633611679077,
- "learning_rate": 2.7525252525252528e-06,
- "loss": 0.6952459812164307,
- "mean_token_accuracy": 0.777535080909729,
- "num_tokens": 1067316.0,
- "step": 110
- },
- {
- "epoch": 0.08434650455927052,
- "grad_norm": 1.8631696701049805,
- "learning_rate": 2.7777777777777783e-06,
- "loss": 0.5420593023300171,
- "mean_token_accuracy": 0.8157662749290466,
- "num_tokens": 1079930.0,
- "step": 111
- },
- {
- "epoch": 0.0851063829787234,
- "grad_norm": 2.4308314323425293,
- "learning_rate": 2.803030303030303e-06,
- "loss": 0.5863882303237915,
- "mean_token_accuracy": 0.8206346035003662,
- "num_tokens": 1088069.0,
- "step": 112
- },
- {
- "epoch": 0.0858662613981763,
- "grad_norm": 2.922808885574341,
- "learning_rate": 2.8282828282828286e-06,
- "loss": 0.5217319130897522,
- "mean_token_accuracy": 0.8253234028816223,
- "num_tokens": 1093607.0,
- "step": 113
- },
- {
- "epoch": 0.08662613981762918,
- "grad_norm": 2.3596107959747314,
- "learning_rate": 2.8535353535353537e-06,
- "loss": 0.5070714950561523,
- "mean_token_accuracy": 0.8258323669433594,
- "num_tokens": 1100405.0,
- "step": 114
- },
- {
- "epoch": 0.08738601823708207,
- "grad_norm": 3.0853066444396973,
- "learning_rate": 2.8787878787878793e-06,
- "loss": 0.591964840888977,
- "mean_token_accuracy": 0.8047322630882263,
- "num_tokens": 1107535.0,
- "step": 115
- },
- {
- "epoch": 0.08814589665653495,
- "grad_norm": 1.9251092672348022,
- "learning_rate": 2.904040404040404e-06,
- "loss": 0.5226191878318787,
- "mean_token_accuracy": 0.8022720217704773,
- "num_tokens": 1118716.0,
- "step": 116
- },
- {
- "epoch": 0.08890577507598785,
- "grad_norm": 1.9692988395690918,
- "learning_rate": 2.9292929292929295e-06,
- "loss": 0.5462069511413574,
- "mean_token_accuracy": 0.8157015442848206,
- "num_tokens": 1131917.0,
- "step": 117
- },
- {
- "epoch": 0.08966565349544073,
- "grad_norm": 1.4738909006118774,
- "learning_rate": 2.954545454545455e-06,
- "loss": 0.4564219117164612,
- "mean_token_accuracy": 0.849632978439331,
- "num_tokens": 1148534.0,
- "step": 118
- },
- {
- "epoch": 0.09042553191489362,
- "grad_norm": 2.72646164894104,
- "learning_rate": 2.97979797979798e-06,
- "loss": 0.6654808521270752,
- "mean_token_accuracy": 0.7752684354782104,
- "num_tokens": 1155438.0,
- "step": 119
- },
- {
- "epoch": 0.0911854103343465,
- "grad_norm": 2.7843852043151855,
- "learning_rate": 3.0050505050505054e-06,
- "loss": 0.5354680418968201,
- "mean_token_accuracy": 0.8196378946304321,
- "num_tokens": 1161815.0,
- "step": 120
- },
- {
- "epoch": 0.0919452887537994,
- "grad_norm": 2.8052573204040527,
- "learning_rate": 3.0303030303030305e-06,
- "loss": 0.6366757154464722,
- "mean_token_accuracy": 0.7967483997344971,
- "num_tokens": 1168295.0,
- "step": 121
- },
- {
- "epoch": 0.09270516717325228,
- "grad_norm": 2.7462735176086426,
- "learning_rate": 3.055555555555556e-06,
- "loss": 0.59470534324646,
- "mean_token_accuracy": 0.8023771047592163,
- "num_tokens": 1174502.0,
- "step": 122
- },
- {
- "epoch": 0.09346504559270517,
- "grad_norm": 2.2743821144104004,
- "learning_rate": 3.0808080808080807e-06,
- "loss": 0.5720560550689697,
- "mean_token_accuracy": 0.8162771463394165,
- "num_tokens": 1183615.0,
- "step": 123
- },
- {
- "epoch": 0.09422492401215805,
- "grad_norm": 1.8669533729553223,
- "learning_rate": 3.1060606060606063e-06,
- "loss": 0.4655378758907318,
- "mean_token_accuracy": 0.8360732793807983,
- "num_tokens": 1193761.0,
- "step": 124
- },
- {
- "epoch": 0.09498480243161095,
- "grad_norm": 1.7666901350021362,
- "learning_rate": 3.131313131313132e-06,
- "loss": 0.5524153709411621,
- "mean_token_accuracy": 0.8252713680267334,
- "num_tokens": 1207870.0,
- "step": 125
- },
- {
- "epoch": 0.09574468085106383,
- "grad_norm": 2.4720070362091064,
- "learning_rate": 3.1565656565656566e-06,
- "loss": 0.5003011226654053,
- "mean_token_accuracy": 0.8491042852401733,
- "num_tokens": 1214603.0,
- "step": 126
- },
- {
- "epoch": 0.09650455927051672,
- "grad_norm": 1.6500422954559326,
- "learning_rate": 3.181818181818182e-06,
- "loss": 0.5137069225311279,
- "mean_token_accuracy": 0.8273531198501587,
- "num_tokens": 1228717.0,
- "step": 127
- },
- {
- "epoch": 0.0972644376899696,
- "grad_norm": 3.402543067932129,
- "learning_rate": 3.2070707070707072e-06,
- "loss": 0.708167552947998,
- "mean_token_accuracy": 0.7705385684967041,
- "num_tokens": 1234361.0,
- "step": 128
- },
- {
- "epoch": 0.0980243161094225,
- "grad_norm": 2.547285795211792,
- "learning_rate": 3.232323232323233e-06,
- "loss": 0.6020137071609497,
- "mean_token_accuracy": 0.7981340289115906,
- "num_tokens": 1244169.0,
- "step": 129
- },
- {
- "epoch": 0.09878419452887538,
- "grad_norm": 2.0578792095184326,
- "learning_rate": 3.257575757575758e-06,
- "loss": 0.4425000250339508,
- "mean_token_accuracy": 0.8567807674407959,
- "num_tokens": 1252709.0,
- "step": 130
- },
- {
- "epoch": 0.09954407294832827,
- "grad_norm": 1.672614336013794,
- "learning_rate": 3.282828282828283e-06,
- "loss": 0.4860966205596924,
- "mean_token_accuracy": 0.8393139243125916,
- "num_tokens": 1265766.0,
- "step": 131
- },
- {
- "epoch": 0.10030395136778116,
- "grad_norm": 3.2560198307037354,
- "learning_rate": 3.3080808080808086e-06,
- "loss": 0.624736487865448,
- "mean_token_accuracy": 0.7875322699546814,
- "num_tokens": 1270779.0,
- "step": 132
- },
- {
- "epoch": 0.10106382978723404,
- "grad_norm": 2.4468185901641846,
- "learning_rate": 3.3333333333333333e-06,
- "loss": 0.5062227249145508,
- "mean_token_accuracy": 0.8217229843139648,
- "num_tokens": 1277113.0,
- "step": 133
- },
- {
- "epoch": 0.10182370820668693,
- "grad_norm": 2.6371328830718994,
- "learning_rate": 3.358585858585859e-06,
- "loss": 0.477113276720047,
- "mean_token_accuracy": 0.8605583906173706,
- "num_tokens": 1282514.0,
- "step": 134
- },
- {
- "epoch": 0.10258358662613981,
- "grad_norm": 2.48421311378479,
- "learning_rate": 3.3838383838383844e-06,
- "loss": 0.40855684876441956,
- "mean_token_accuracy": 0.864548921585083,
- "num_tokens": 1287859.0,
- "step": 135
- },
- {
- "epoch": 0.1033434650455927,
- "grad_norm": 1.993099331855774,
- "learning_rate": 3.409090909090909e-06,
- "loss": 0.5913145542144775,
- "mean_token_accuracy": 0.8248485922813416,
- "num_tokens": 1301074.0,
- "step": 136
- },
- {
- "epoch": 0.10410334346504559,
- "grad_norm": 3.5947680473327637,
- "learning_rate": 3.4343434343434347e-06,
- "loss": 0.5028599500656128,
- "mean_token_accuracy": 0.8367215394973755,
- "num_tokens": 1305219.0,
- "step": 137
- },
- {
- "epoch": 0.10486322188449848,
- "grad_norm": 2.5778582096099854,
- "learning_rate": 3.45959595959596e-06,
- "loss": 0.5297672748565674,
- "mean_token_accuracy": 0.8232187032699585,
- "num_tokens": 1312482.0,
- "step": 138
- },
- {
- "epoch": 0.10562310030395136,
- "grad_norm": 1.8961588144302368,
- "learning_rate": 3.4848484848484854e-06,
- "loss": 0.39954107999801636,
- "mean_token_accuracy": 0.8605833053588867,
- "num_tokens": 1323404.0,
- "step": 139
- },
- {
- "epoch": 0.10638297872340426,
- "grad_norm": 1.9687960147857666,
- "learning_rate": 3.51010101010101e-06,
- "loss": 0.48791587352752686,
- "mean_token_accuracy": 0.8200347423553467,
- "num_tokens": 1333027.0,
- "step": 140
- },
- {
- "epoch": 0.10714285714285714,
- "grad_norm": 2.520242691040039,
- "learning_rate": 3.5353535353535356e-06,
- "loss": 0.6106002330780029,
- "mean_token_accuracy": 0.790692150592804,
- "num_tokens": 1340999.0,
- "step": 141
- },
- {
- "epoch": 0.10790273556231003,
- "grad_norm": 3.751617431640625,
- "learning_rate": 3.560606060606061e-06,
- "loss": 0.48141729831695557,
- "mean_token_accuracy": 0.8421382904052734,
- "num_tokens": 1344687.0,
- "step": 142
- },
- {
- "epoch": 0.10866261398176291,
- "grad_norm": 2.7101709842681885,
- "learning_rate": 3.585858585858586e-06,
- "loss": 0.5375241637229919,
- "mean_token_accuracy": 0.8061438202857971,
- "num_tokens": 1350192.0,
- "step": 143
- },
- {
- "epoch": 0.1094224924012158,
- "grad_norm": 2.583484411239624,
- "learning_rate": 3.6111111111111115e-06,
- "loss": 0.6492470502853394,
- "mean_token_accuracy": 0.7863001823425293,
- "num_tokens": 1358148.0,
- "step": 144
- },
- {
- "epoch": 0.11018237082066869,
- "grad_norm": 1.792561650276184,
- "learning_rate": 3.6363636363636366e-06,
- "loss": 0.48480600118637085,
- "mean_token_accuracy": 0.8358709812164307,
- "num_tokens": 1369519.0,
- "step": 145
- },
- {
- "epoch": 0.11094224924012158,
- "grad_norm": 2.6480472087860107,
- "learning_rate": 3.661616161616162e-06,
- "loss": 0.5268933176994324,
- "mean_token_accuracy": 0.8214013576507568,
- "num_tokens": 1375862.0,
- "step": 146
- },
- {
- "epoch": 0.11170212765957446,
- "grad_norm": 2.3174469470977783,
- "learning_rate": 3.686868686868687e-06,
- "loss": 0.42517897486686707,
- "mean_token_accuracy": 0.8523461222648621,
- "num_tokens": 1381546.0,
- "step": 147
- },
- {
- "epoch": 0.11246200607902736,
- "grad_norm": 3.0090949535369873,
- "learning_rate": 3.7121212121212124e-06,
- "loss": 0.4042336940765381,
- "mean_token_accuracy": 0.8670448064804077,
- "num_tokens": 1385896.0,
- "step": 148
- },
- {
- "epoch": 0.11322188449848024,
- "grad_norm": 2.4928104877471924,
- "learning_rate": 3.737373737373738e-06,
- "loss": 0.6498878598213196,
- "mean_token_accuracy": 0.7967068552970886,
- "num_tokens": 1394169.0,
- "step": 149
- },
- {
- "epoch": 0.11398176291793313,
- "grad_norm": 1.5984913110733032,
- "learning_rate": 3.7626262626262627e-06,
- "loss": 0.546096920967102,
- "mean_token_accuracy": 0.8035850524902344,
- "num_tokens": 1408785.0,
- "step": 150
- },
- {
- "epoch": 0.11474164133738601,
- "grad_norm": 2.3663532733917236,
- "learning_rate": 3.7878787878787882e-06,
- "loss": 0.6111721992492676,
- "mean_token_accuracy": 0.8015355467796326,
- "num_tokens": 1417510.0,
- "step": 151
- },
- {
- "epoch": 0.11550151975683891,
- "grad_norm": 2.518932819366455,
- "learning_rate": 3.8131313131313138e-06,
- "loss": 0.5274964570999146,
- "mean_token_accuracy": 0.8155480623245239,
- "num_tokens": 1424186.0,
- "step": 152
- },
- {
- "epoch": 0.11626139817629179,
- "grad_norm": 2.14353609085083,
- "learning_rate": 3.8383838383838385e-06,
- "loss": 0.5283297896385193,
- "mean_token_accuracy": 0.8275758028030396,
- "num_tokens": 1432630.0,
- "step": 153
- },
- {
- "epoch": 0.11702127659574468,
- "grad_norm": 1.8243604898452759,
- "learning_rate": 3.863636363636364e-06,
- "loss": 0.41854870319366455,
- "mean_token_accuracy": 0.8222295045852661,
- "num_tokens": 1442691.0,
- "step": 154
- },
- {
- "epoch": 0.11778115501519756,
- "grad_norm": 2.088212251663208,
- "learning_rate": 3.88888888888889e-06,
- "loss": 0.6062943339347839,
- "mean_token_accuracy": 0.8009427785873413,
- "num_tokens": 1456890.0,
- "step": 155
- },
- {
- "epoch": 0.11854103343465046,
- "grad_norm": 1.3469511270523071,
- "learning_rate": 3.914141414141415e-06,
- "loss": 0.4390433728694916,
- "mean_token_accuracy": 0.8436295986175537,
- "num_tokens": 1475349.0,
- "step": 156
- },
- {
- "epoch": 0.11930091185410334,
- "grad_norm": 3.247023105621338,
- "learning_rate": 3.93939393939394e-06,
- "loss": 0.6490433216094971,
- "mean_token_accuracy": 0.8037861585617065,
- "num_tokens": 1479952.0,
- "step": 157
- },
- {
- "epoch": 0.12006079027355623,
- "grad_norm": 2.6610445976257324,
- "learning_rate": 3.964646464646465e-06,
- "loss": 0.6221826076507568,
- "mean_token_accuracy": 0.7848749160766602,
- "num_tokens": 1487306.0,
- "step": 158
- },
- {
- "epoch": 0.12082066869300911,
- "grad_norm": 2.3060810565948486,
- "learning_rate": 3.98989898989899e-06,
- "loss": 0.5052388310432434,
- "mean_token_accuracy": 0.8281195759773254,
- "num_tokens": 1495367.0,
- "step": 159
- },
- {
- "epoch": 0.12158054711246201,
- "grad_norm": 2.504448652267456,
- "learning_rate": 4.015151515151515e-06,
- "loss": 0.5005477666854858,
- "mean_token_accuracy": 0.8408058881759644,
- "num_tokens": 1502069.0,
- "step": 160
- },
- {
- "epoch": 0.12234042553191489,
- "grad_norm": 3.993938446044922,
- "learning_rate": 4.04040404040404e-06,
- "loss": 0.5569638013839722,
- "mean_token_accuracy": 0.8095242977142334,
- "num_tokens": 1510224.0,
- "step": 161
- },
- {
- "epoch": 0.12310030395136778,
- "grad_norm": 2.2287683486938477,
- "learning_rate": 4.065656565656566e-06,
- "loss": 0.524042546749115,
- "mean_token_accuracy": 0.8102203607559204,
- "num_tokens": 1518364.0,
- "step": 162
- },
- {
- "epoch": 0.12386018237082067,
- "grad_norm": 1.9531738758087158,
- "learning_rate": 4.0909090909090915e-06,
- "loss": 0.45794573426246643,
- "mean_token_accuracy": 0.8560376167297363,
- "num_tokens": 1528097.0,
- "step": 163
- },
- {
- "epoch": 0.12462006079027356,
- "grad_norm": 1.5841206312179565,
- "learning_rate": 4.116161616161617e-06,
- "loss": 0.5420972108840942,
- "mean_token_accuracy": 0.8092726469039917,
- "num_tokens": 1544119.0,
- "step": 164
- },
- {
- "epoch": 0.12537993920972645,
- "grad_norm": 1.7536218166351318,
- "learning_rate": 4.141414141414142e-06,
- "loss": 0.554668664932251,
- "mean_token_accuracy": 0.8193825483322144,
- "num_tokens": 1559140.0,
- "step": 165
- },
- {
- "epoch": 0.12613981762917933,
- "grad_norm": 3.545454740524292,
- "learning_rate": 4.166666666666667e-06,
- "loss": 0.580947995185852,
- "mean_token_accuracy": 0.8286383152008057,
- "num_tokens": 1563625.0,
- "step": 166
- },
- {
- "epoch": 0.12689969604863222,
- "grad_norm": 1.6608915328979492,
- "learning_rate": 4.191919191919192e-06,
- "loss": 0.5523324012756348,
- "mean_token_accuracy": 0.8155215978622437,
- "num_tokens": 1574945.0,
- "step": 167
- },
- {
- "epoch": 0.1276595744680851,
- "grad_norm": 1.4832708835601807,
- "learning_rate": 4.217171717171717e-06,
- "loss": 0.5133191347122192,
- "mean_token_accuracy": 0.8367571830749512,
- "num_tokens": 1595865.0,
- "step": 168
- },
- {
- "epoch": 0.128419452887538,
- "grad_norm": 1.7807520627975464,
- "learning_rate": 4.242424242424243e-06,
- "loss": 0.5131410360336304,
- "mean_token_accuracy": 0.8129367232322693,
- "num_tokens": 1608723.0,
- "step": 169
- },
- {
- "epoch": 0.12917933130699089,
- "grad_norm": 2.707569122314453,
- "learning_rate": 4.267676767676767e-06,
- "loss": 0.6129013299942017,
- "mean_token_accuracy": 0.7926048040390015,
- "num_tokens": 1616136.0,
- "step": 170
- },
- {
- "epoch": 0.12993920972644377,
- "grad_norm": 2.5831644535064697,
- "learning_rate": 4.292929292929293e-06,
- "loss": 0.6264227628707886,
- "mean_token_accuracy": 0.8074911236763,
- "num_tokens": 1624228.0,
- "step": 171
- },
- {
- "epoch": 0.13069908814589665,
- "grad_norm": 3.1124250888824463,
- "learning_rate": 4.3181818181818185e-06,
- "loss": 0.41763827204704285,
- "mean_token_accuracy": 0.8565453290939331,
- "num_tokens": 1628098.0,
- "step": 172
- },
- {
- "epoch": 0.13145896656534956,
- "grad_norm": 2.3214211463928223,
- "learning_rate": 4.343434343434344e-06,
- "loss": 0.421974778175354,
- "mean_token_accuracy": 0.8391546010971069,
- "num_tokens": 1634950.0,
- "step": 173
- },
- {
- "epoch": 0.13221884498480244,
- "grad_norm": 2.1010327339172363,
- "learning_rate": 4.368686868686869e-06,
- "loss": 0.5307331681251526,
- "mean_token_accuracy": 0.8139588236808777,
- "num_tokens": 1644132.0,
- "step": 174
- },
- {
- "epoch": 0.13297872340425532,
- "grad_norm": 2.533612012863159,
- "learning_rate": 4.393939393939394e-06,
- "loss": 0.5626664161682129,
- "mean_token_accuracy": 0.8029808402061462,
- "num_tokens": 1651637.0,
- "step": 175
- },
- {
- "epoch": 0.1337386018237082,
- "grad_norm": 1.669508457183838,
- "learning_rate": 4.41919191919192e-06,
- "loss": 0.5351508259773254,
- "mean_token_accuracy": 0.8281655311584473,
- "num_tokens": 1666776.0,
- "step": 176
- },
- {
- "epoch": 0.1344984802431611,
- "grad_norm": 1.7579659223556519,
- "learning_rate": 4.444444444444444e-06,
- "loss": 0.5235031247138977,
- "mean_token_accuracy": 0.8143284320831299,
- "num_tokens": 1679241.0,
- "step": 177
- },
- {
- "epoch": 0.135258358662614,
- "grad_norm": 3.123563528060913,
- "learning_rate": 4.46969696969697e-06,
- "loss": 0.43051332235336304,
- "mean_token_accuracy": 0.8518186211585999,
- "num_tokens": 1683317.0,
- "step": 178
- },
- {
- "epoch": 0.13601823708206687,
- "grad_norm": 2.2411575317382812,
- "learning_rate": 4.494949494949495e-06,
- "loss": 0.5471380949020386,
- "mean_token_accuracy": 0.8267596960067749,
- "num_tokens": 1691366.0,
- "step": 179
- },
- {
- "epoch": 0.13677811550151975,
- "grad_norm": 2.621973991394043,
- "learning_rate": 4.520202020202021e-06,
- "loss": 0.5685839653015137,
- "mean_token_accuracy": 0.8260642290115356,
- "num_tokens": 1698148.0,
- "step": 180
- },
- {
- "epoch": 0.13753799392097266,
- "grad_norm": 2.1553852558135986,
- "learning_rate": 4.5454545454545455e-06,
- "loss": 0.5703883171081543,
- "mean_token_accuracy": 0.8219090700149536,
- "num_tokens": 1707225.0,
- "step": 181
- },
- {
- "epoch": 0.13829787234042554,
- "grad_norm": 5.1767897605896,
- "learning_rate": 4.5707070707070715e-06,
- "loss": 0.32704639434814453,
- "mean_token_accuracy": 0.8754568099975586,
- "num_tokens": 1712748.0,
- "step": 182
- },
- {
- "epoch": 0.13905775075987842,
- "grad_norm": 2.609168291091919,
- "learning_rate": 4.595959595959597e-06,
- "loss": 0.5939987301826477,
- "mean_token_accuracy": 0.8034975528717041,
- "num_tokens": 1719932.0,
- "step": 183
- },
- {
- "epoch": 0.1398176291793313,
- "grad_norm": 2.2059099674224854,
- "learning_rate": 4.621212121212122e-06,
- "loss": 0.5310720205307007,
- "mean_token_accuracy": 0.8177368640899658,
- "num_tokens": 1727640.0,
- "step": 184
- },
- {
- "epoch": 0.1405775075987842,
- "grad_norm": 2.6367759704589844,
- "learning_rate": 4.646464646464647e-06,
- "loss": 0.522086501121521,
- "mean_token_accuracy": 0.826233983039856,
- "num_tokens": 1733609.0,
- "step": 185
- },
- {
- "epoch": 0.1413373860182371,
- "grad_norm": 3.326732873916626,
- "learning_rate": 4.671717171717172e-06,
- "loss": 0.4127829074859619,
- "mean_token_accuracy": 0.8551101684570312,
- "num_tokens": 1737256.0,
- "step": 186
- },
- {
- "epoch": 0.14209726443768997,
- "grad_norm": 1.828412413597107,
- "learning_rate": 4.696969696969698e-06,
- "loss": 0.5444269180297852,
- "mean_token_accuracy": 0.8350818157196045,
- "num_tokens": 1750196.0,
- "step": 187
- },
- {
- "epoch": 0.14285714285714285,
- "grad_norm": 3.209203004837036,
- "learning_rate": 4.722222222222222e-06,
- "loss": 0.5087994933128357,
- "mean_token_accuracy": 0.8349015712738037,
- "num_tokens": 1754836.0,
- "step": 188
- },
- {
- "epoch": 0.14361702127659576,
- "grad_norm": 1.7339166402816772,
- "learning_rate": 4.747474747474748e-06,
- "loss": 0.5151352286338806,
- "mean_token_accuracy": 0.8321266174316406,
- "num_tokens": 1766015.0,
- "step": 189
- },
- {
- "epoch": 0.14437689969604864,
- "grad_norm": 2.699068069458008,
- "learning_rate": 4.772727272727273e-06,
- "loss": 0.4406203031539917,
- "mean_token_accuracy": 0.8425000905990601,
- "num_tokens": 1771684.0,
- "step": 190
- },
- {
- "epoch": 0.14513677811550152,
- "grad_norm": 2.8117282390594482,
- "learning_rate": 4.7979797979797985e-06,
- "loss": 0.40428489446640015,
- "mean_token_accuracy": 0.8654326796531677,
- "num_tokens": 1776301.0,
- "step": 191
- },
- {
- "epoch": 0.1458966565349544,
- "grad_norm": 2.9204647541046143,
- "learning_rate": 4.823232323232324e-06,
- "loss": 0.4191770553588867,
- "mean_token_accuracy": 0.8574687242507935,
- "num_tokens": 1781678.0,
- "step": 192
- },
- {
- "epoch": 0.1466565349544073,
- "grad_norm": 2.1648988723754883,
- "learning_rate": 4.848484848484849e-06,
- "loss": 0.5839012861251831,
- "mean_token_accuracy": 0.8053664565086365,
- "num_tokens": 1792516.0,
- "step": 193
- },
- {
- "epoch": 0.1474164133738602,
- "grad_norm": 2.3221631050109863,
- "learning_rate": 4.873737373737374e-06,
- "loss": 0.5037894248962402,
- "mean_token_accuracy": 0.8427227139472961,
- "num_tokens": 1800192.0,
- "step": 194
- },
- {
- "epoch": 0.14817629179331307,
- "grad_norm": 2.4536430835723877,
- "learning_rate": 4.898989898989899e-06,
- "loss": 0.42326074838638306,
- "mean_token_accuracy": 0.8510633111000061,
- "num_tokens": 1806159.0,
- "step": 195
- },
- {
- "epoch": 0.14893617021276595,
- "grad_norm": 2.4875805377960205,
- "learning_rate": 4.924242424242425e-06,
- "loss": 0.539531409740448,
- "mean_token_accuracy": 0.8060250282287598,
- "num_tokens": 1813392.0,
- "step": 196
- },
- {
- "epoch": 0.14969604863221886,
- "grad_norm": 2.1664798259735107,
- "learning_rate": 4.94949494949495e-06,
- "loss": 0.42502015829086304,
- "mean_token_accuracy": 0.8503251075744629,
- "num_tokens": 1821424.0,
- "step": 197
- },
- {
- "epoch": 0.15045592705167174,
- "grad_norm": 2.568808078765869,
- "learning_rate": 4.974747474747475e-06,
- "loss": 0.5025098323822021,
- "mean_token_accuracy": 0.8182311058044434,
- "num_tokens": 1827225.0,
- "step": 198
- },
- {
- "epoch": 0.15121580547112462,
- "grad_norm": 1.9116802215576172,
- "learning_rate": 5e-06,
- "loss": 0.4907258450984955,
- "mean_token_accuracy": 0.8310189843177795,
- "num_tokens": 1836297.0,
- "step": 199
- },
- {
- "epoch": 0.1519756838905775,
- "grad_norm": 3.150765895843506,
- "learning_rate": 4.999999122701883e-06,
- "loss": 0.390616774559021,
- "mean_token_accuracy": 0.8626647591590881,
- "num_tokens": 1839984.0,
- "step": 200
- },
- {
- "epoch": 0.15273556231003038,
- "grad_norm": 3.2229044437408447,
- "learning_rate": 4.999996490808146e-06,
- "loss": 0.48009657859802246,
- "mean_token_accuracy": 0.825214147567749,
- "num_tokens": 1844610.0,
- "step": 201
- },
- {
- "epoch": 0.1534954407294833,
- "grad_norm": 1.4473289251327515,
- "learning_rate": 4.9999921043206356e-06,
- "loss": 0.40135183930397034,
- "mean_token_accuracy": 0.8537827730178833,
- "num_tokens": 1859573.0,
- "step": 202
- },
- {
- "epoch": 0.15425531914893617,
- "grad_norm": 4.072319507598877,
- "learning_rate": 4.999985963242432e-06,
- "loss": 0.6158689260482788,
- "mean_token_accuracy": 0.8075432777404785,
- "num_tokens": 1863147.0,
- "step": 203
- },
- {
- "epoch": 0.15501519756838905,
- "grad_norm": 3.15741229057312,
- "learning_rate": 4.999978067577844e-06,
- "loss": 0.4603108763694763,
- "mean_token_accuracy": 0.8418779373168945,
- "num_tokens": 1867201.0,
- "step": 204
- },
- {
- "epoch": 0.15577507598784193,
- "grad_norm": 2.1925418376922607,
- "learning_rate": 4.999968417332415e-06,
- "loss": 0.5552488565444946,
- "mean_token_accuracy": 0.8216016292572021,
- "num_tokens": 1874837.0,
- "step": 205
- },
- {
- "epoch": 0.15653495440729484,
- "grad_norm": 2.2518117427825928,
- "learning_rate": 4.999957012512916e-06,
- "loss": 0.4912569522857666,
- "mean_token_accuracy": 0.8284667730331421,
- "num_tokens": 1881842.0,
- "step": 206
- },
- {
- "epoch": 0.15729483282674772,
- "grad_norm": 1.8223762512207031,
- "learning_rate": 4.999943853127351e-06,
- "loss": 0.47709137201309204,
- "mean_token_accuracy": 0.8311659097671509,
- "num_tokens": 1890805.0,
- "step": 207
- },
- {
- "epoch": 0.1580547112462006,
- "grad_norm": 2.066499948501587,
- "learning_rate": 4.999928939184958e-06,
- "loss": 0.44794657826423645,
- "mean_token_accuracy": 0.8513424396514893,
- "num_tokens": 1898264.0,
- "step": 208
- },
- {
- "epoch": 0.15881458966565348,
- "grad_norm": 3.53865909576416,
- "learning_rate": 4.999912270696202e-06,
- "loss": 0.5978270769119263,
- "mean_token_accuracy": 0.8080137968063354,
- "num_tokens": 1902435.0,
- "step": 209
- },
- {
- "epoch": 0.1595744680851064,
- "grad_norm": 2.0760679244995117,
- "learning_rate": 4.999893847672783e-06,
- "loss": 0.5930601358413696,
- "mean_token_accuracy": 0.8028650283813477,
- "num_tokens": 1912252.0,
- "step": 210
- },
- {
- "epoch": 0.16033434650455927,
- "grad_norm": 2.21551513671875,
- "learning_rate": 4.99987367012763e-06,
- "loss": 0.6336753964424133,
- "mean_token_accuracy": 0.7902286648750305,
- "num_tokens": 1922095.0,
- "step": 211
- },
- {
- "epoch": 0.16109422492401215,
- "grad_norm": 1.7654480934143066,
- "learning_rate": 4.999851738074904e-06,
- "loss": 0.6373403668403625,
- "mean_token_accuracy": 0.7802424430847168,
- "num_tokens": 1938962.0,
- "step": 212
- },
- {
- "epoch": 0.16185410334346503,
- "grad_norm": 2.852834701538086,
- "learning_rate": 4.9998280515300006e-06,
- "loss": 0.6418683528900146,
- "mean_token_accuracy": 0.7895716428756714,
- "num_tokens": 1944668.0,
- "step": 213
- },
- {
- "epoch": 0.16261398176291794,
- "grad_norm": 3.4737212657928467,
- "learning_rate": 4.999802610509541e-06,
- "loss": 0.6323273181915283,
- "mean_token_accuracy": 0.7982614636421204,
- "num_tokens": 1949142.0,
- "step": 214
- },
- {
- "epoch": 0.16337386018237082,
- "grad_norm": 3.0802664756774902,
- "learning_rate": 4.999775415031381e-06,
- "loss": 0.5929068326950073,
- "mean_token_accuracy": 0.8112219572067261,
- "num_tokens": 1954141.0,
- "step": 215
- },
- {
- "epoch": 0.1641337386018237,
- "grad_norm": 2.9808855056762695,
- "learning_rate": 4.999746465114609e-06,
- "loss": 0.5556406378746033,
- "mean_token_accuracy": 0.8117628693580627,
- "num_tokens": 1959406.0,
- "step": 216
- },
- {
- "epoch": 0.16489361702127658,
- "grad_norm": 1.7346166372299194,
- "learning_rate": 4.999715760779541e-06,
- "loss": 0.5122925043106079,
- "mean_token_accuracy": 0.8040724992752075,
- "num_tokens": 1971921.0,
- "step": 217
- },
- {
- "epoch": 0.1656534954407295,
- "grad_norm": 1.4183907508850098,
- "learning_rate": 4.999683302047729e-06,
- "loss": 0.46471893787384033,
- "mean_token_accuracy": 0.8381330966949463,
- "num_tokens": 1988863.0,
- "step": 218
- },
- {
- "epoch": 0.16641337386018237,
- "grad_norm": 1.6797802448272705,
- "learning_rate": 4.999649088941951e-06,
- "loss": 0.38348832726478577,
- "mean_token_accuracy": 0.8344278931617737,
- "num_tokens": 2000003.0,
- "step": 219
- },
- {
- "epoch": 0.16717325227963525,
- "grad_norm": 3.036963939666748,
- "learning_rate": 4.999613121486222e-06,
- "loss": 0.6062780618667603,
- "mean_token_accuracy": 0.8217900991439819,
- "num_tokens": 2004813.0,
- "step": 220
- },
- {
- "epoch": 0.16793313069908813,
- "grad_norm": 2.0343217849731445,
- "learning_rate": 4.999575399705782e-06,
- "loss": 0.5052450895309448,
- "mean_token_accuracy": 0.8368623852729797,
- "num_tokens": 2013565.0,
- "step": 221
- },
- {
- "epoch": 0.16869300911854104,
- "grad_norm": 2.1162009239196777,
- "learning_rate": 4.9995359236271094e-06,
- "loss": 0.5169756412506104,
- "mean_token_accuracy": 0.8339958190917969,
- "num_tokens": 2025763.0,
- "step": 222
- },
- {
- "epoch": 0.16945288753799392,
- "grad_norm": 2.055333375930786,
- "learning_rate": 4.9994946932779076e-06,
- "loss": 0.6327048540115356,
- "mean_token_accuracy": 0.8078711032867432,
- "num_tokens": 2037005.0,
- "step": 223
- },
- {
- "epoch": 0.1702127659574468,
- "grad_norm": 3.334620475769043,
- "learning_rate": 4.999451708687114e-06,
- "loss": 0.5688358545303345,
- "mean_token_accuracy": 0.8015589714050293,
- "num_tokens": 2041473.0,
- "step": 224
- },
- {
- "epoch": 0.17097264437689969,
- "grad_norm": 2.3734676837921143,
- "learning_rate": 4.999406969884897e-06,
- "loss": 0.5673821568489075,
- "mean_token_accuracy": 0.8054057359695435,
- "num_tokens": 2049397.0,
- "step": 225
- },
- {
- "epoch": 0.1717325227963526,
- "grad_norm": 1.807358980178833,
- "learning_rate": 4.999360476902656e-06,
- "loss": 0.4376158118247986,
- "mean_token_accuracy": 0.8456039428710938,
- "num_tokens": 2058721.0,
- "step": 226
- },
- {
- "epoch": 0.17249240121580547,
- "grad_norm": 3.231638193130493,
- "learning_rate": 4.999312229773022e-06,
- "loss": 0.5592809915542603,
- "mean_token_accuracy": 0.8170154094696045,
- "num_tokens": 2063455.0,
- "step": 227
- },
- {
- "epoch": 0.17325227963525835,
- "grad_norm": 2.2717151641845703,
- "learning_rate": 4.999262228529855e-06,
- "loss": 0.6144396066665649,
- "mean_token_accuracy": 0.7948470115661621,
- "num_tokens": 2071686.0,
- "step": 228
- },
- {
- "epoch": 0.17401215805471124,
- "grad_norm": 1.4171342849731445,
- "learning_rate": 4.99921047320825e-06,
- "loss": 0.43680912256240845,
- "mean_token_accuracy": 0.84850013256073,
- "num_tokens": 2086999.0,
- "step": 229
- },
- {
- "epoch": 0.17477203647416414,
- "grad_norm": 3.162736654281616,
- "learning_rate": 4.99915696384453e-06,
- "loss": 0.6025407910346985,
- "mean_token_accuracy": 0.8042335510253906,
- "num_tokens": 2092001.0,
- "step": 230
- },
- {
- "epoch": 0.17553191489361702,
- "grad_norm": 1.8672804832458496,
- "learning_rate": 4.99910170047625e-06,
- "loss": 0.5843087434768677,
- "mean_token_accuracy": 0.8016980886459351,
- "num_tokens": 2103372.0,
- "step": 231
- },
- {
- "epoch": 0.1762917933130699,
- "grad_norm": 2.967587471008301,
- "learning_rate": 4.999044683142196e-06,
- "loss": 0.5123642086982727,
- "mean_token_accuracy": 0.8216149806976318,
- "num_tokens": 2108008.0,
- "step": 232
- },
- {
- "epoch": 0.1770516717325228,
- "grad_norm": 1.9651981592178345,
- "learning_rate": 4.998985911882383e-06,
- "loss": 0.5868178606033325,
- "mean_token_accuracy": 0.7904198169708252,
- "num_tokens": 2119009.0,
- "step": 233
- },
- {
- "epoch": 0.1778115501519757,
- "grad_norm": 2.7785449028015137,
- "learning_rate": 4.998925386738063e-06,
- "loss": 0.5075510144233704,
- "mean_token_accuracy": 0.8280210494995117,
- "num_tokens": 2124915.0,
- "step": 234
- },
- {
- "epoch": 0.17857142857142858,
- "grad_norm": 2.957470417022705,
- "learning_rate": 4.998863107751711e-06,
- "loss": 0.5351958274841309,
- "mean_token_accuracy": 0.846825122833252,
- "num_tokens": 2129905.0,
- "step": 235
- },
- {
- "epoch": 0.17933130699088146,
- "grad_norm": 3.207671880722046,
- "learning_rate": 4.99879907496704e-06,
- "loss": 0.6209091544151306,
- "mean_token_accuracy": 0.789960503578186,
- "num_tokens": 2135027.0,
- "step": 236
- },
- {
- "epoch": 0.18009118541033434,
- "grad_norm": 2.018953800201416,
- "learning_rate": 4.998733288428987e-06,
- "loss": 0.601510763168335,
- "mean_token_accuracy": 0.8136930465698242,
- "num_tokens": 2147016.0,
- "step": 237
- },
- {
- "epoch": 0.18085106382978725,
- "grad_norm": 2.437281847000122,
- "learning_rate": 4.998665748183727e-06,
- "loss": 0.5813639163970947,
- "mean_token_accuracy": 0.8116716146469116,
- "num_tokens": 2155386.0,
- "step": 238
- },
- {
- "epoch": 0.18161094224924013,
- "grad_norm": 1.5708180665969849,
- "learning_rate": 4.998596454278661e-06,
- "loss": 0.5252395272254944,
- "mean_token_accuracy": 0.8193864822387695,
- "num_tokens": 2170295.0,
- "step": 239
- },
- {
- "epoch": 0.182370820668693,
- "grad_norm": 1.9921495914459229,
- "learning_rate": 4.998525406762422e-06,
- "loss": 0.5335029363632202,
- "mean_token_accuracy": 0.8120872974395752,
- "num_tokens": 2180012.0,
- "step": 240
- },
- {
- "epoch": 0.1831306990881459,
- "grad_norm": 2.6562681198120117,
- "learning_rate": 4.998452605684874e-06,
- "loss": 0.48021435737609863,
- "mean_token_accuracy": 0.8388714790344238,
- "num_tokens": 2185607.0,
- "step": 241
- },
- {
- "epoch": 0.1838905775075988,
- "grad_norm": 2.2535853385925293,
- "learning_rate": 4.998378051097111e-06,
- "loss": 0.5747300386428833,
- "mean_token_accuracy": 0.8004639148712158,
- "num_tokens": 2194105.0,
- "step": 242
- },
- {
- "epoch": 0.18465045592705168,
- "grad_norm": 1.6151788234710693,
- "learning_rate": 4.998301743051459e-06,
- "loss": 0.6190565824508667,
- "mean_token_accuracy": 0.7816627621650696,
- "num_tokens": 2210629.0,
- "step": 243
- },
- {
- "epoch": 0.18541033434650456,
- "grad_norm": 2.1088173389434814,
- "learning_rate": 4.9982236816014735e-06,
- "loss": 0.4715560972690582,
- "mean_token_accuracy": 0.8485721349716187,
- "num_tokens": 2218958.0,
- "step": 244
- },
- {
- "epoch": 0.18617021276595744,
- "grad_norm": 2.6168735027313232,
- "learning_rate": 4.998143866801941e-06,
- "loss": 0.6077103018760681,
- "mean_token_accuracy": 0.8057924509048462,
- "num_tokens": 2226368.0,
- "step": 245
- },
- {
- "epoch": 0.18693009118541035,
- "grad_norm": 2.5988616943359375,
- "learning_rate": 4.99806229870888e-06,
- "loss": 0.5021637678146362,
- "mean_token_accuracy": 0.8361666202545166,
- "num_tokens": 2232485.0,
- "step": 246
- },
- {
- "epoch": 0.18768996960486323,
- "grad_norm": 2.015887498855591,
- "learning_rate": 4.9979789773795365e-06,
- "loss": 0.4309737980365753,
- "mean_token_accuracy": 0.8508044481277466,
- "num_tokens": 2240819.0,
- "step": 247
- },
- {
- "epoch": 0.1884498480243161,
- "grad_norm": 2.3115265369415283,
- "learning_rate": 4.997893902872389e-06,
- "loss": 0.5776500701904297,
- "mean_token_accuracy": 0.8079549074172974,
- "num_tokens": 2249460.0,
- "step": 248
- },
- {
- "epoch": 0.189209726443769,
- "grad_norm": 1.7387021780014038,
- "learning_rate": 4.997807075247147e-06,
- "loss": 0.430944561958313,
- "mean_token_accuracy": 0.8483544588088989,
- "num_tokens": 2259124.0,
- "step": 249
- },
- {
- "epoch": 0.1899696048632219,
- "grad_norm": 1.6378381252288818,
- "learning_rate": 4.997718494564747e-06,
- "loss": 0.4123363792896271,
- "mean_token_accuracy": 0.8557409644126892,
- "num_tokens": 2269899.0,
- "step": 250
- },
- {
- "epoch": 0.19072948328267478,
- "grad_norm": 1.336282730102539,
- "learning_rate": 4.997628160887361e-06,
- "loss": 0.502329409122467,
- "mean_token_accuracy": 0.8186938166618347,
- "num_tokens": 2292821.0,
- "step": 251
- },
- {
- "epoch": 0.19148936170212766,
- "grad_norm": 3.3335583209991455,
- "learning_rate": 4.997536074278388e-06,
- "loss": 0.584446907043457,
- "mean_token_accuracy": 0.8062717318534851,
- "num_tokens": 2297175.0,
- "step": 252
- },
- {
- "epoch": 0.19224924012158054,
- "grad_norm": 2.246727228164673,
- "learning_rate": 4.9974422348024565e-06,
- "loss": 0.5683060884475708,
- "mean_token_accuracy": 0.8193703293800354,
- "num_tokens": 2305456.0,
- "step": 253
- },
- {
- "epoch": 0.19300911854103345,
- "grad_norm": 2.3520865440368652,
- "learning_rate": 4.997346642525429e-06,
- "loss": 0.4724946618080139,
- "mean_token_accuracy": 0.8426719307899475,
- "num_tokens": 2312241.0,
- "step": 254
- },
- {
- "epoch": 0.19376899696048633,
- "grad_norm": 2.7115702629089355,
- "learning_rate": 4.9972492975143936e-06,
- "loss": 0.5019032955169678,
- "mean_token_accuracy": 0.8253573179244995,
- "num_tokens": 2318094.0,
- "step": 255
- },
- {
- "epoch": 0.1945288753799392,
- "grad_norm": 1.705528974533081,
- "learning_rate": 4.997150199837671e-06,
- "loss": 0.45588475465774536,
- "mean_token_accuracy": 0.836666464805603,
- "num_tokens": 2329025.0,
- "step": 256
- },
- {
- "epoch": 0.1952887537993921,
- "grad_norm": 2.161400318145752,
- "learning_rate": 4.997049349564814e-06,
- "loss": 0.5170183777809143,
- "mean_token_accuracy": 0.8287534117698669,
- "num_tokens": 2337448.0,
- "step": 257
- },
- {
- "epoch": 0.196048632218845,
- "grad_norm": 2.629669189453125,
- "learning_rate": 4.996946746766602e-06,
- "loss": 0.44650501012802124,
- "mean_token_accuracy": 0.850114107131958,
- "num_tokens": 2343207.0,
- "step": 258
- },
- {
- "epoch": 0.19680851063829788,
- "grad_norm": 1.6735503673553467,
- "learning_rate": 4.996842391515045e-06,
- "loss": 0.5247820019721985,
- "mean_token_accuracy": 0.8285071849822998,
- "num_tokens": 2356801.0,
- "step": 259
- },
- {
- "epoch": 0.19756838905775076,
- "grad_norm": 1.2753115892410278,
- "learning_rate": 4.996736283883382e-06,
- "loss": 0.41870927810668945,
- "mean_token_accuracy": 0.8448047637939453,
- "num_tokens": 2377306.0,
- "step": 260
- },
- {
- "epoch": 0.19832826747720364,
- "grad_norm": 2.6947314739227295,
- "learning_rate": 4.9966284239460875e-06,
- "loss": 0.5059205889701843,
- "mean_token_accuracy": 0.8430814743041992,
- "num_tokens": 2383352.0,
- "step": 261
- },
- {
- "epoch": 0.19908814589665655,
- "grad_norm": 2.0509963035583496,
- "learning_rate": 4.996518811778858e-06,
- "loss": 0.4565388560295105,
- "mean_token_accuracy": 0.8453130722045898,
- "num_tokens": 2391149.0,
- "step": 262
- },
- {
- "epoch": 0.19984802431610943,
- "grad_norm": 2.1856348514556885,
- "learning_rate": 4.996407447458626e-06,
- "loss": 0.531380832195282,
- "mean_token_accuracy": 0.8387004137039185,
- "num_tokens": 2399875.0,
- "step": 263
- },
- {
- "epoch": 0.2006079027355623,
- "grad_norm": 2.7348573207855225,
- "learning_rate": 4.99629433106355e-06,
- "loss": 0.5242817401885986,
- "mean_token_accuracy": 0.8177423477172852,
- "num_tokens": 2406586.0,
- "step": 264
- },
- {
- "epoch": 0.2013677811550152,
- "grad_norm": 1.76587975025177,
- "learning_rate": 4.99617946267302e-06,
- "loss": 0.49298471212387085,
- "mean_token_accuracy": 0.8271149396896362,
- "num_tokens": 2418683.0,
- "step": 265
- },
- {
- "epoch": 0.20212765957446807,
- "grad_norm": 2.8129730224609375,
- "learning_rate": 4.996062842367655e-06,
- "loss": 0.46420302987098694,
- "mean_token_accuracy": 0.8453244566917419,
- "num_tokens": 2422929.0,
- "step": 266
- },
- {
- "epoch": 0.20288753799392098,
- "grad_norm": 2.575744152069092,
- "learning_rate": 4.9959444702293025e-06,
- "loss": 0.43208545446395874,
- "mean_token_accuracy": 0.8494843244552612,
- "num_tokens": 2429567.0,
- "step": 267
- },
- {
- "epoch": 0.20364741641337386,
- "grad_norm": 2.7586750984191895,
- "learning_rate": 4.995824346341041e-06,
- "loss": 0.4390473961830139,
- "mean_token_accuracy": 0.8348895311355591,
- "num_tokens": 2434700.0,
- "step": 268
- },
- {
- "epoch": 0.20440729483282674,
- "grad_norm": 1.972145438194275,
- "learning_rate": 4.99570247078718e-06,
- "loss": 0.6219544410705566,
- "mean_token_accuracy": 0.7939999103546143,
- "num_tokens": 2447007.0,
- "step": 269
- },
- {
- "epoch": 0.20516717325227962,
- "grad_norm": 2.2963485717773438,
- "learning_rate": 4.995578843653255e-06,
- "loss": 0.5008970499038696,
- "mean_token_accuracy": 0.8255308866500854,
- "num_tokens": 2453936.0,
- "step": 270
- },
- {
- "epoch": 0.20592705167173253,
- "grad_norm": 1.8897721767425537,
- "learning_rate": 4.995453465026033e-06,
- "loss": 0.5436089038848877,
- "mean_token_accuracy": 0.819086492061615,
- "num_tokens": 2464494.0,
- "step": 271
- },
- {
- "epoch": 0.2066869300911854,
- "grad_norm": 2.319728374481201,
- "learning_rate": 4.995326334993508e-06,
- "loss": 0.5136368870735168,
- "mean_token_accuracy": 0.820817232131958,
- "num_tokens": 2470938.0,
- "step": 272
- },
- {
- "epoch": 0.2074468085106383,
- "grad_norm": 2.230414390563965,
- "learning_rate": 4.9951974536449055e-06,
- "loss": 0.5272846817970276,
- "mean_token_accuracy": 0.8203279972076416,
- "num_tokens": 2478629.0,
- "step": 273
- },
- {
- "epoch": 0.20820668693009117,
- "grad_norm": 3.401937484741211,
- "learning_rate": 4.9950668210706795e-06,
- "loss": 0.4389592111110687,
- "mean_token_accuracy": 0.8647899031639099,
- "num_tokens": 2482193.0,
- "step": 274
- },
- {
- "epoch": 0.20896656534954408,
- "grad_norm": 2.1278507709503174,
- "learning_rate": 4.994934437362513e-06,
- "loss": 0.598863422870636,
- "mean_token_accuracy": 0.7945119738578796,
- "num_tokens": 2492465.0,
- "step": 275
- },
- {
- "epoch": 0.20972644376899696,
- "grad_norm": 1.9259960651397705,
- "learning_rate": 4.994800302613318e-06,
- "loss": 0.49520939588546753,
- "mean_token_accuracy": 0.8371536135673523,
- "num_tokens": 2500825.0,
- "step": 276
- },
- {
- "epoch": 0.21048632218844984,
- "grad_norm": 2.346418857574463,
- "learning_rate": 4.994664416917236e-06,
- "loss": 0.5412614345550537,
- "mean_token_accuracy": 0.810661792755127,
- "num_tokens": 2509513.0,
- "step": 277
- },
- {
- "epoch": 0.21124620060790272,
- "grad_norm": 1.3092039823532104,
- "learning_rate": 4.994526780369636e-06,
- "loss": 0.46305379271507263,
- "mean_token_accuracy": 0.8358527421951294,
- "num_tokens": 2531405.0,
- "step": 278
- },
- {
- "epoch": 0.21200607902735563,
- "grad_norm": 2.924611806869507,
- "learning_rate": 4.9943873930671175e-06,
- "loss": 0.6134544610977173,
- "mean_token_accuracy": 0.7947378754615784,
- "num_tokens": 2536744.0,
- "step": 279
- },
- {
- "epoch": 0.2127659574468085,
- "grad_norm": 2.8290598392486572,
- "learning_rate": 4.994246255107506e-06,
- "loss": 0.465520441532135,
- "mean_token_accuracy": 0.8440108299255371,
- "num_tokens": 2541184.0,
- "step": 280
- },
- {
- "epoch": 0.2135258358662614,
- "grad_norm": 3.8081259727478027,
- "learning_rate": 4.994103366589859e-06,
- "loss": 0.43394139409065247,
- "mean_token_accuracy": 0.8579148054122925,
- "num_tokens": 2545395.0,
- "step": 281
- },
- {
- "epoch": 0.21428571428571427,
- "grad_norm": 1.7994529008865356,
- "learning_rate": 4.993958727614462e-06,
- "loss": 0.5076484680175781,
- "mean_token_accuracy": 0.8270803093910217,
- "num_tokens": 2556541.0,
- "step": 282
- },
- {
- "epoch": 0.21504559270516718,
- "grad_norm": 2.5582659244537354,
- "learning_rate": 4.993812338282826e-06,
- "loss": 0.4453684389591217,
- "mean_token_accuracy": 0.8488293886184692,
- "num_tokens": 2562949.0,
- "step": 283
- },
- {
- "epoch": 0.21580547112462006,
- "grad_norm": 1.6448938846588135,
- "learning_rate": 4.993664198697694e-06,
- "loss": 0.461971640586853,
- "mean_token_accuracy": 0.824763298034668,
- "num_tokens": 2576407.0,
- "step": 284
- },
- {
- "epoch": 0.21656534954407294,
- "grad_norm": 2.1264469623565674,
- "learning_rate": 4.993514308963037e-06,
- "loss": 0.6241602897644043,
- "mean_token_accuracy": 0.7916014790534973,
- "num_tokens": 2585695.0,
- "step": 285
- },
- {
- "epoch": 0.21732522796352582,
- "grad_norm": 3.629991292953491,
- "learning_rate": 4.993362669184051e-06,
- "loss": 0.610355019569397,
- "mean_token_accuracy": 0.7847568988800049,
- "num_tokens": 2589778.0,
- "step": 286
- },
- {
- "epoch": 0.21808510638297873,
- "grad_norm": 1.9070756435394287,
- "learning_rate": 4.993209279467164e-06,
- "loss": 0.5513623952865601,
- "mean_token_accuracy": 0.7911607027053833,
- "num_tokens": 2600920.0,
- "step": 287
- },
- {
- "epoch": 0.2188449848024316,
- "grad_norm": 1.761062741279602,
- "learning_rate": 4.993054139920031e-06,
- "loss": 0.4579957127571106,
- "mean_token_accuracy": 0.8189530372619629,
- "num_tokens": 2611856.0,
- "step": 288
- },
- {
- "epoch": 0.2196048632218845,
- "grad_norm": 1.7264713048934937,
- "learning_rate": 4.992897250651535e-06,
- "loss": 0.5871305465698242,
- "mean_token_accuracy": 0.7918527126312256,
- "num_tokens": 2624730.0,
- "step": 289
- },
- {
- "epoch": 0.22036474164133737,
- "grad_norm": 1.7455977201461792,
- "learning_rate": 4.992738611771787e-06,
- "loss": 0.5475119948387146,
- "mean_token_accuracy": 0.8226917386054993,
- "num_tokens": 2635705.0,
- "step": 290
- },
- {
- "epoch": 0.22112462006079028,
- "grad_norm": 2.095095157623291,
- "learning_rate": 4.992578223392124e-06,
- "loss": 0.5952225923538208,
- "mean_token_accuracy": 0.8078469038009644,
- "num_tokens": 2643954.0,
- "step": 291
- },
- {
- "epoch": 0.22188449848024316,
- "grad_norm": 2.994664192199707,
- "learning_rate": 4.992416085625115e-06,
- "loss": 0.5432442426681519,
- "mean_token_accuracy": 0.8329008221626282,
- "num_tokens": 2648800.0,
- "step": 292
- },
- {
- "epoch": 0.22264437689969604,
- "grad_norm": 2.796790361404419,
- "learning_rate": 4.992252198584554e-06,
- "loss": 0.5168961882591248,
- "mean_token_accuracy": 0.8393474817276001,
- "num_tokens": 2653546.0,
- "step": 293
- },
- {
- "epoch": 0.22340425531914893,
- "grad_norm": 1.8610522747039795,
- "learning_rate": 4.992086562385462e-06,
- "loss": 0.5728024244308472,
- "mean_token_accuracy": 0.797406792640686,
- "num_tokens": 2667483.0,
- "step": 294
- },
- {
- "epoch": 0.22416413373860183,
- "grad_norm": 1.695472002029419,
- "learning_rate": 4.9919191771440905e-06,
- "loss": 0.5460028648376465,
- "mean_token_accuracy": 0.8123016357421875,
- "num_tokens": 2683574.0,
- "step": 295
- },
- {
- "epoch": 0.22492401215805471,
- "grad_norm": 2.8627376556396484,
- "learning_rate": 4.9917500429779165e-06,
- "loss": 0.5566985011100769,
- "mean_token_accuracy": 0.815531313419342,
- "num_tokens": 2688985.0,
- "step": 296
- },
- {
- "epoch": 0.2256838905775076,
- "grad_norm": 2.73323655128479,
- "learning_rate": 4.991579160005644e-06,
- "loss": 0.48197102546691895,
- "mean_token_accuracy": 0.8471829295158386,
- "num_tokens": 2694799.0,
- "step": 297
- },
- {
- "epoch": 0.22644376899696048,
- "grad_norm": 1.8436161279678345,
- "learning_rate": 4.991406528347206e-06,
- "loss": 0.4528339207172394,
- "mean_token_accuracy": 0.8603188395500183,
- "num_tokens": 2707321.0,
- "step": 298
- },
- {
- "epoch": 0.22720364741641338,
- "grad_norm": 2.6231515407562256,
- "learning_rate": 4.9912321481237616e-06,
- "loss": 0.5916541814804077,
- "mean_token_accuracy": 0.8050242066383362,
- "num_tokens": 2714233.0,
- "step": 299
- },
- {
- "epoch": 0.22796352583586627,
- "grad_norm": 3.08776593208313,
- "learning_rate": 4.991056019457697e-06,
- "loss": 0.4860580563545227,
- "mean_token_accuracy": 0.8464088439941406,
- "num_tokens": 2718443.0,
- "step": 300
- },
- {
- "epoch": 0.22872340425531915,
- "grad_norm": 2.2537803649902344,
- "learning_rate": 4.990878142472628e-06,
- "loss": 0.5158311128616333,
- "mean_token_accuracy": 0.824694812297821,
- "num_tokens": 2726158.0,
- "step": 301
- },
- {
- "epoch": 0.22948328267477203,
- "grad_norm": 2.1122705936431885,
- "learning_rate": 4.990698517293394e-06,
- "loss": 0.495265394449234,
- "mean_token_accuracy": 0.8343238830566406,
- "num_tokens": 2735022.0,
- "step": 302
- },
- {
- "epoch": 0.23024316109422494,
- "grad_norm": 3.5503528118133545,
- "learning_rate": 4.9905171440460645e-06,
- "loss": 0.46063232421875,
- "mean_token_accuracy": 0.8420047760009766,
- "num_tokens": 2738550.0,
- "step": 303
- },
- {
- "epoch": 0.23100303951367782,
- "grad_norm": 3.9858486652374268,
- "learning_rate": 4.990334022857932e-06,
- "loss": 0.5832710266113281,
- "mean_token_accuracy": 0.8144199848175049,
- "num_tokens": 2741720.0,
- "step": 304
- },
- {
- "epoch": 0.2317629179331307,
- "grad_norm": 2.407231330871582,
- "learning_rate": 4.990149153857519e-06,
- "loss": 0.4692630171775818,
- "mean_token_accuracy": 0.8429223299026489,
- "num_tokens": 2748693.0,
- "step": 305
- },
- {
- "epoch": 0.23252279635258358,
- "grad_norm": 1.6996397972106934,
- "learning_rate": 4.989962537174573e-06,
- "loss": 0.49143946170806885,
- "mean_token_accuracy": 0.8340128064155579,
- "num_tokens": 2761254.0,
- "step": 306
- },
- {
- "epoch": 0.23328267477203649,
- "grad_norm": 3.746432065963745,
- "learning_rate": 4.989774172940071e-06,
- "loss": 0.6282026767730713,
- "mean_token_accuracy": 0.775698184967041,
- "num_tokens": 2765115.0,
- "step": 307
- },
- {
- "epoch": 0.23404255319148937,
- "grad_norm": 2.212872266769409,
- "learning_rate": 4.989584061286211e-06,
- "loss": 0.5193763971328735,
- "mean_token_accuracy": 0.8168246746063232,
- "num_tokens": 2772345.0,
- "step": 308
- },
- {
- "epoch": 0.23480243161094225,
- "grad_norm": 1.752297282218933,
- "learning_rate": 4.989392202346423e-06,
- "loss": 0.4437984824180603,
- "mean_token_accuracy": 0.8451256155967712,
- "num_tokens": 2783072.0,
- "step": 309
- },
- {
- "epoch": 0.23556231003039513,
- "grad_norm": 2.386019706726074,
- "learning_rate": 4.989198596255361e-06,
- "loss": 0.4090752899646759,
- "mean_token_accuracy": 0.8480085134506226,
- "num_tokens": 2788757.0,
- "step": 310
- },
- {
- "epoch": 0.23632218844984804,
- "grad_norm": 3.9981489181518555,
- "learning_rate": 4.989003243148904e-06,
- "loss": 0.5149132013320923,
- "mean_token_accuracy": 0.8179056644439697,
- "num_tokens": 2792096.0,
- "step": 311
- },
- {
- "epoch": 0.23708206686930092,
- "grad_norm": 1.8723100423812866,
- "learning_rate": 4.988806143164159e-06,
- "loss": 0.4531487822532654,
- "mean_token_accuracy": 0.8400167226791382,
- "num_tokens": 2802210.0,
- "step": 312
- },
- {
- "epoch": 0.2378419452887538,
- "grad_norm": 2.3415136337280273,
- "learning_rate": 4.988607296439459e-06,
- "loss": 0.5974439978599548,
- "mean_token_accuracy": 0.8035976886749268,
- "num_tokens": 2810088.0,
- "step": 313
- },
- {
- "epoch": 0.23860182370820668,
- "grad_norm": 1.5317577123641968,
- "learning_rate": 4.98840670311436e-06,
- "loss": 0.49247145652770996,
- "mean_token_accuracy": 0.8292540311813354,
- "num_tokens": 2824005.0,
- "step": 314
- },
- {
- "epoch": 0.2393617021276596,
- "grad_norm": 2.170772075653076,
- "learning_rate": 4.988204363329648e-06,
- "loss": 0.6359974145889282,
- "mean_token_accuracy": 0.7785564661026001,
- "num_tokens": 2834680.0,
- "step": 315
- },
- {
- "epoch": 0.24012158054711247,
- "grad_norm": 3.2655932903289795,
- "learning_rate": 4.988000277227334e-06,
- "loss": 0.5080196857452393,
- "mean_token_accuracy": 0.8295877575874329,
- "num_tokens": 2838735.0,
- "step": 316
- },
- {
- "epoch": 0.24088145896656535,
- "grad_norm": 3.406589984893799,
- "learning_rate": 4.987794444950651e-06,
- "loss": 0.3939085006713867,
- "mean_token_accuracy": 0.8700719475746155,
- "num_tokens": 2842127.0,
- "step": 317
- },
- {
- "epoch": 0.24164133738601823,
- "grad_norm": 1.8211106061935425,
- "learning_rate": 4.987586866644061e-06,
- "loss": 0.5270540118217468,
- "mean_token_accuracy": 0.826683521270752,
- "num_tokens": 2853656.0,
- "step": 318
- },
- {
- "epoch": 0.24240121580547114,
- "grad_norm": 1.8429969549179077,
- "learning_rate": 4.9873775424532515e-06,
- "loss": 0.4705049991607666,
- "mean_token_accuracy": 0.8355701565742493,
- "num_tokens": 2863513.0,
- "step": 319
- },
- {
- "epoch": 0.24316109422492402,
- "grad_norm": 2.2425320148468018,
- "learning_rate": 4.9871664725251314e-06,
- "loss": 0.485736608505249,
- "mean_token_accuracy": 0.835182785987854,
- "num_tokens": 2871556.0,
- "step": 320
- },
- {
- "epoch": 0.2439209726443769,
- "grad_norm": 1.6202056407928467,
- "learning_rate": 4.986953657007841e-06,
- "loss": 0.4437887370586395,
- "mean_token_accuracy": 0.8282591700553894,
- "num_tokens": 2884335.0,
- "step": 321
- },
- {
- "epoch": 0.24468085106382978,
- "grad_norm": 1.1027268171310425,
- "learning_rate": 4.98673909605074e-06,
- "loss": 0.3770800828933716,
- "mean_token_accuracy": 0.8325437307357788,
- "num_tokens": 2904286.0,
- "step": 322
- },
- {
- "epoch": 0.2454407294832827,
- "grad_norm": 2.3239076137542725,
- "learning_rate": 4.986522789804417e-06,
- "loss": 0.5387254953384399,
- "mean_token_accuracy": 0.806242823600769,
- "num_tokens": 2910975.0,
- "step": 323
- },
- {
- "epoch": 0.24620060790273557,
- "grad_norm": 2.243482828140259,
- "learning_rate": 4.986304738420684e-06,
- "loss": 0.4396553039550781,
- "mean_token_accuracy": 0.8561904430389404,
- "num_tokens": 2917087.0,
- "step": 324
- },
- {
- "epoch": 0.24696048632218845,
- "grad_norm": 2.537264347076416,
- "learning_rate": 4.986084942052577e-06,
- "loss": 0.395110160112381,
- "mean_token_accuracy": 0.8636915683746338,
- "num_tokens": 2921887.0,
- "step": 325
- },
- {
- "epoch": 0.24772036474164133,
- "grad_norm": 2.319399118423462,
- "learning_rate": 4.9858634008543574e-06,
- "loss": 0.581517219543457,
- "mean_token_accuracy": 0.8157487511634827,
- "num_tokens": 2928996.0,
- "step": 326
- },
- {
- "epoch": 0.24848024316109424,
- "grad_norm": 1.9787474870681763,
- "learning_rate": 4.985640114981513e-06,
- "loss": 0.5084106922149658,
- "mean_token_accuracy": 0.835221529006958,
- "num_tokens": 2940302.0,
- "step": 327
- },
- {
- "epoch": 0.24924012158054712,
- "grad_norm": 2.4783265590667725,
- "learning_rate": 4.985415084590752e-06,
- "loss": 0.6062222719192505,
- "mean_token_accuracy": 0.7885516285896301,
- "num_tokens": 2946386.0,
- "step": 328
- },
- {
- "epoch": 0.25,
- "grad_norm": 2.4081411361694336,
- "learning_rate": 4.985188309840012e-06,
- "loss": 0.5079880356788635,
- "mean_token_accuracy": 0.8313904404640198,
- "num_tokens": 2952323.0,
- "step": 329
- },
- {
- "epoch": 0.2507598784194529,
- "grad_norm": 2.64993953704834,
- "learning_rate": 4.984959790888451e-06,
- "loss": 0.5461447834968567,
- "mean_token_accuracy": 0.8125468492507935,
- "num_tokens": 2958119.0,
- "step": 330
- },
- {
- "epoch": 0.25151975683890576,
- "grad_norm": 2.549734115600586,
- "learning_rate": 4.984729527896451e-06,
- "loss": 0.5998573303222656,
- "mean_token_accuracy": 0.8076666593551636,
- "num_tokens": 2964947.0,
- "step": 331
- },
- {
- "epoch": 0.25227963525835867,
- "grad_norm": 3.2185161113739014,
- "learning_rate": 4.984497521025622e-06,
- "loss": 0.4232945442199707,
- "mean_token_accuracy": 0.8543803095817566,
- "num_tokens": 2968598.0,
- "step": 332
- },
- {
- "epoch": 0.2530395136778115,
- "grad_norm": 2.588994264602661,
- "learning_rate": 4.984263770438793e-06,
- "loss": 0.460967481136322,
- "mean_token_accuracy": 0.8416207432746887,
- "num_tokens": 2974510.0,
- "step": 333
- },
- {
- "epoch": 0.25379939209726443,
- "grad_norm": 2.1373162269592285,
- "learning_rate": 4.984028276300021e-06,
- "loss": 0.49382102489471436,
- "mean_token_accuracy": 0.8388048410415649,
- "num_tokens": 2981632.0,
- "step": 334
- },
- {
- "epoch": 0.25455927051671734,
- "grad_norm": 2.2524826526641846,
- "learning_rate": 4.983791038774585e-06,
- "loss": 0.4947671890258789,
- "mean_token_accuracy": 0.8066365122795105,
- "num_tokens": 2988736.0,
- "step": 335
- },
- {
- "epoch": 0.2553191489361702,
- "grad_norm": 1.7244199514389038,
- "learning_rate": 4.983552058028985e-06,
- "loss": 0.48096776008605957,
- "mean_token_accuracy": 0.830735445022583,
- "num_tokens": 3003576.0,
- "step": 336
- },
- {
- "epoch": 0.2560790273556231,
- "grad_norm": 3.0628933906555176,
- "learning_rate": 4.9833113342309495e-06,
- "loss": 0.6027032136917114,
- "mean_token_accuracy": 0.8008694648742676,
- "num_tokens": 3009549.0,
- "step": 337
- },
- {
- "epoch": 0.256838905775076,
- "grad_norm": 2.438674211502075,
- "learning_rate": 4.983068867549427e-06,
- "loss": 0.517090916633606,
- "mean_token_accuracy": 0.827893853187561,
- "num_tokens": 3015236.0,
- "step": 338
- },
- {
- "epoch": 0.25759878419452886,
- "grad_norm": 2.131535053253174,
- "learning_rate": 4.982824658154589e-06,
- "loss": 0.6656812429428101,
- "mean_token_accuracy": 0.7772425413131714,
- "num_tokens": 3028142.0,
- "step": 339
- },
- {
- "epoch": 0.25835866261398177,
- "grad_norm": 2.3206584453582764,
- "learning_rate": 4.9825787062178315e-06,
- "loss": 0.5757625699043274,
- "mean_token_accuracy": 0.8073873519897461,
- "num_tokens": 3040996.0,
- "step": 340
- },
- {
- "epoch": 0.2591185410334346,
- "grad_norm": 1.3905521631240845,
- "learning_rate": 4.982331011911774e-06,
- "loss": 0.4193805456161499,
- "mean_token_accuracy": 0.8399466872215271,
- "num_tokens": 3061931.0,
- "step": 341
- },
- {
- "epoch": 0.25987841945288753,
- "grad_norm": 2.184173345565796,
- "learning_rate": 4.982081575410256e-06,
- "loss": 0.4751223921775818,
- "mean_token_accuracy": 0.8409271240234375,
- "num_tokens": 3069081.0,
- "step": 342
- },
- {
- "epoch": 0.26063829787234044,
- "grad_norm": 3.538764238357544,
- "learning_rate": 4.9818303968883445e-06,
- "loss": 0.8119601011276245,
- "mean_token_accuracy": 0.7442739009857178,
- "num_tokens": 3073628.0,
- "step": 343
- },
- {
- "epoch": 0.2613981762917933,
- "grad_norm": 1.8063762187957764,
- "learning_rate": 4.981577476522323e-06,
- "loss": 0.5615730881690979,
- "mean_token_accuracy": 0.8207751512527466,
- "num_tokens": 3086596.0,
- "step": 344
- },
- {
- "epoch": 0.2621580547112462,
- "grad_norm": 2.4346961975097656,
- "learning_rate": 4.981322814489703e-06,
- "loss": 0.5266709327697754,
- "mean_token_accuracy": 0.8211277723312378,
- "num_tokens": 3092631.0,
- "step": 345
- },
- {
- "epoch": 0.2629179331306991,
- "grad_norm": 1.91289484500885,
- "learning_rate": 4.981066410969215e-06,
- "loss": 0.5047177672386169,
- "mean_token_accuracy": 0.8356877565383911,
- "num_tokens": 3101102.0,
- "step": 346
- },
- {
- "epoch": 0.26367781155015196,
- "grad_norm": 2.1495707035064697,
- "learning_rate": 4.980808266140813e-06,
- "loss": 0.47876280546188354,
- "mean_token_accuracy": 0.8364313244819641,
- "num_tokens": 3107998.0,
- "step": 347
- },
- {
- "epoch": 0.26443768996960487,
- "grad_norm": 2.5961992740631104,
- "learning_rate": 4.9805483801856744e-06,
- "loss": 0.5512958765029907,
- "mean_token_accuracy": 0.8181467652320862,
- "num_tokens": 3113848.0,
- "step": 348
- },
- {
- "epoch": 0.2651975683890577,
- "grad_norm": 3.2828900814056396,
- "learning_rate": 4.980286753286196e-06,
- "loss": 0.4217945635318756,
- "mean_token_accuracy": 0.8617103099822998,
- "num_tokens": 3117652.0,
- "step": 349
- },
- {
- "epoch": 0.26595744680851063,
- "grad_norm": 1.425554871559143,
- "learning_rate": 4.980023385625996e-06,
- "loss": 0.4042487144470215,
- "mean_token_accuracy": 0.8492785692214966,
- "num_tokens": 3132336.0,
- "step": 350
- },
- {
- "epoch": 0.26671732522796354,
- "grad_norm": 2.933504104614258,
- "learning_rate": 4.979758277389919e-06,
- "loss": 0.5406704545021057,
- "mean_token_accuracy": 0.8035423755645752,
- "num_tokens": 3137544.0,
- "step": 351
- },
- {
- "epoch": 0.2674772036474164,
- "grad_norm": 1.9958966970443726,
- "learning_rate": 4.9794914287640264e-06,
- "loss": 0.5857555270195007,
- "mean_token_accuracy": 0.7965140342712402,
- "num_tokens": 3149705.0,
- "step": 352
- },
- {
- "epoch": 0.2682370820668693,
- "grad_norm": 2.467694044113159,
- "learning_rate": 4.979222839935602e-06,
- "loss": 0.6404043436050415,
- "mean_token_accuracy": 0.7823755741119385,
- "num_tokens": 3158353.0,
- "step": 353
- },
- {
- "epoch": 0.2689969604863222,
- "grad_norm": 2.0102720260620117,
- "learning_rate": 4.9789525110931545e-06,
- "loss": 0.5681496858596802,
- "mean_token_accuracy": 0.8108169436454773,
- "num_tokens": 3167121.0,
- "step": 354
- },
- {
- "epoch": 0.26975683890577506,
- "grad_norm": 2.6017866134643555,
- "learning_rate": 4.978680442426409e-06,
- "loss": 0.6309828162193298,
- "mean_token_accuracy": 0.7742617130279541,
- "num_tokens": 3175012.0,
- "step": 355
- },
- {
- "epoch": 0.270516717325228,
- "grad_norm": 1.8799268007278442,
- "learning_rate": 4.978406634126315e-06,
- "loss": 0.524029016494751,
- "mean_token_accuracy": 0.8317689895629883,
- "num_tokens": 3185331.0,
- "step": 356
- },
- {
- "epoch": 0.2712765957446808,
- "grad_norm": 1.508332371711731,
- "learning_rate": 4.978131086385041e-06,
- "loss": 0.46656402945518494,
- "mean_token_accuracy": 0.8339117765426636,
- "num_tokens": 3198813.0,
- "step": 357
- },
- {
- "epoch": 0.27203647416413373,
- "grad_norm": 3.595707654953003,
- "learning_rate": 4.977853799395976e-06,
- "loss": 0.5101234912872314,
- "mean_token_accuracy": 0.8251723051071167,
- "num_tokens": 3206557.0,
- "step": 358
- },
- {
- "epoch": 0.27279635258358664,
- "grad_norm": 3.5317916870117188,
- "learning_rate": 4.977574773353732e-06,
- "loss": 0.5684665441513062,
- "mean_token_accuracy": 0.8124493360519409,
- "num_tokens": 3210912.0,
- "step": 359
- },
- {
- "epoch": 0.2735562310030395,
- "grad_norm": 2.8606204986572266,
- "learning_rate": 4.97729400845414e-06,
- "loss": 0.4746384620666504,
- "mean_token_accuracy": 0.8195606470108032,
- "num_tokens": 3215365.0,
- "step": 360
- },
- {
- "epoch": 0.2743161094224924,
- "grad_norm": 1.8214033842086792,
- "learning_rate": 4.977011504894253e-06,
- "loss": 0.4842769503593445,
- "mean_token_accuracy": 0.82928866147995,
- "num_tokens": 3224037.0,
- "step": 361
- },
- {
- "epoch": 0.2750759878419453,
- "grad_norm": 1.628746509552002,
- "learning_rate": 4.97672726287234e-06,
- "loss": 0.4397493302822113,
- "mean_token_accuracy": 0.8606528043746948,
- "num_tokens": 3235589.0,
- "step": 362
- },
- {
- "epoch": 0.27583586626139817,
- "grad_norm": 3.557973861694336,
- "learning_rate": 4.976441282587894e-06,
- "loss": 0.5732032060623169,
- "mean_token_accuracy": 0.8041545748710632,
- "num_tokens": 3239958.0,
- "step": 363
- },
- {
- "epoch": 0.2765957446808511,
- "grad_norm": 1.3467901945114136,
- "learning_rate": 4.9761535642416284e-06,
- "loss": 0.4525323510169983,
- "mean_token_accuracy": 0.8281061053276062,
- "num_tokens": 3257703.0,
- "step": 364
- },
- {
- "epoch": 0.2773556231003039,
- "grad_norm": 2.2649986743927,
- "learning_rate": 4.9758641080354745e-06,
- "loss": 0.5074734687805176,
- "mean_token_accuracy": 0.8447474241256714,
- "num_tokens": 3264334.0,
- "step": 365
- },
- {
- "epoch": 0.27811550151975684,
- "grad_norm": 2.8667566776275635,
- "learning_rate": 4.975572914172581e-06,
- "loss": 0.5759559869766235,
- "mean_token_accuracy": 0.7976793050765991,
- "num_tokens": 3269314.0,
- "step": 366
- },
- {
- "epoch": 0.27887537993920974,
- "grad_norm": 2.2514986991882324,
- "learning_rate": 4.975279982857324e-06,
- "loss": 0.5786465406417847,
- "mean_token_accuracy": 0.8058781623840332,
- "num_tokens": 3277324.0,
- "step": 367
- },
- {
- "epoch": 0.2796352583586626,
- "grad_norm": 1.3826723098754883,
- "learning_rate": 4.97498531429529e-06,
- "loss": 0.40801727771759033,
- "mean_token_accuracy": 0.8601310849189758,
- "num_tokens": 3290530.0,
- "step": 368
- },
- {
- "epoch": 0.2803951367781155,
- "grad_norm": 2.084092617034912,
- "learning_rate": 4.97468890869329e-06,
- "loss": 0.47076648473739624,
- "mean_token_accuracy": 0.8310186862945557,
- "num_tokens": 3298325.0,
- "step": 369
- },
- {
- "epoch": 0.2811550151975684,
- "grad_norm": 1.3467998504638672,
- "learning_rate": 4.974390766259353e-06,
- "loss": 0.44668465852737427,
- "mean_token_accuracy": 0.8275353908538818,
- "num_tokens": 3314302.0,
- "step": 370
- },
- {
- "epoch": 0.28191489361702127,
- "grad_norm": 2.5921075344085693,
- "learning_rate": 4.974090887202726e-06,
- "loss": 0.5343953967094421,
- "mean_token_accuracy": 0.8110706806182861,
- "num_tokens": 3320963.0,
- "step": 371
- },
- {
- "epoch": 0.2826747720364742,
- "grad_norm": 2.042781352996826,
- "learning_rate": 4.973789271733877e-06,
- "loss": 0.6293343305587769,
- "mean_token_accuracy": 0.7800243496894836,
- "num_tokens": 3332742.0,
- "step": 372
- },
- {
- "epoch": 0.28343465045592703,
- "grad_norm": 4.822193145751953,
- "learning_rate": 4.973485920064491e-06,
- "loss": 0.6256728768348694,
- "mean_token_accuracy": 0.7962433099746704,
- "num_tokens": 3335872.0,
- "step": 373
- },
- {
- "epoch": 0.28419452887537994,
- "grad_norm": 1.260988473892212,
- "learning_rate": 4.973180832407471e-06,
- "loss": 0.38731223344802856,
- "mean_token_accuracy": 0.8385066986083984,
- "num_tokens": 3351884.0,
- "step": 374
- },
- {
- "epoch": 0.28495440729483285,
- "grad_norm": 2.669966697692871,
- "learning_rate": 4.97287400897694e-06,
- "loss": 0.5594710111618042,
- "mean_token_accuracy": 0.8097212314605713,
- "num_tokens": 3358197.0,
- "step": 375
- },
- {
- "epoch": 0.2857142857142857,
- "grad_norm": 3.0344486236572266,
- "learning_rate": 4.972565449988238e-06,
- "loss": 0.34449583292007446,
- "mean_token_accuracy": 0.8813316822052002,
- "num_tokens": 3362133.0,
- "step": 376
- },
- {
- "epoch": 0.2864741641337386,
- "grad_norm": 2.562251091003418,
- "learning_rate": 4.972255155657925e-06,
- "loss": 0.5331522822380066,
- "mean_token_accuracy": 0.8212941288948059,
- "num_tokens": 3370346.0,
- "step": 377
- },
- {
- "epoch": 0.2872340425531915,
- "grad_norm": 2.7083740234375,
- "learning_rate": 4.9719431262037755e-06,
- "loss": 0.5403046011924744,
- "mean_token_accuracy": 0.8108335733413696,
- "num_tokens": 3375588.0,
- "step": 378
- },
- {
- "epoch": 0.28799392097264437,
- "grad_norm": 1.396430492401123,
- "learning_rate": 4.971629361844785e-06,
- "loss": 0.4041529893875122,
- "mean_token_accuracy": 0.8588063716888428,
- "num_tokens": 3390749.0,
- "step": 379
- },
- {
- "epoch": 0.2887537993920973,
- "grad_norm": 1.9872784614562988,
- "learning_rate": 4.971313862801166e-06,
- "loss": 0.4336993098258972,
- "mean_token_accuracy": 0.8511303663253784,
- "num_tokens": 3399064.0,
- "step": 380
- },
- {
- "epoch": 0.28951367781155013,
- "grad_norm": 1.9652575254440308,
- "learning_rate": 4.9709966292943455e-06,
- "loss": 0.4578358232975006,
- "mean_token_accuracy": 0.8229440450668335,
- "num_tokens": 3407229.0,
- "step": 381
- },
- {
- "epoch": 0.29027355623100304,
- "grad_norm": 1.6626898050308228,
- "learning_rate": 4.970677661546972e-06,
- "loss": 0.5427594184875488,
- "mean_token_accuracy": 0.815427303314209,
- "num_tokens": 3422321.0,
- "step": 382
- },
- {
- "epoch": 0.29103343465045595,
- "grad_norm": 3.5265562534332275,
- "learning_rate": 4.970356959782909e-06,
- "loss": 0.6661460995674133,
- "mean_token_accuracy": 0.7856965065002441,
- "num_tokens": 3427442.0,
- "step": 383
- },
- {
- "epoch": 0.2917933130699088,
- "grad_norm": 1.667205572128296,
- "learning_rate": 4.970034524227239e-06,
- "loss": 0.36256325244903564,
- "mean_token_accuracy": 0.8711205720901489,
- "num_tokens": 3436662.0,
- "step": 384
- },
- {
- "epoch": 0.2925531914893617,
- "grad_norm": 1.3389486074447632,
- "learning_rate": 4.969710355106256e-06,
- "loss": 0.4282698631286621,
- "mean_token_accuracy": 0.838951587677002,
- "num_tokens": 3450060.0,
- "step": 385
- },
- {
- "epoch": 0.2933130699088146,
- "grad_norm": 2.5163397789001465,
- "learning_rate": 4.969384452647477e-06,
- "loss": 0.5176984071731567,
- "mean_token_accuracy": 0.8235267996788025,
- "num_tokens": 3456990.0,
- "step": 386
- },
- {
- "epoch": 0.29407294832826747,
- "grad_norm": 1.7588495016098022,
- "learning_rate": 4.969056817079633e-06,
- "loss": 0.49710947275161743,
- "mean_token_accuracy": 0.818520724773407,
- "num_tokens": 3468098.0,
- "step": 387
- },
- {
- "epoch": 0.2948328267477204,
- "grad_norm": 2.6381046772003174,
- "learning_rate": 4.968727448632669e-06,
- "loss": 0.4425308108329773,
- "mean_token_accuracy": 0.8451643586158752,
- "num_tokens": 3472899.0,
- "step": 388
- },
- {
- "epoch": 0.29559270516717323,
- "grad_norm": 1.6345038414001465,
- "learning_rate": 4.968396347537751e-06,
- "loss": 0.4177059829235077,
- "mean_token_accuracy": 0.8498886227607727,
- "num_tokens": 3484826.0,
- "step": 389
- },
- {
- "epoch": 0.29635258358662614,
- "grad_norm": 3.0466468334198,
- "learning_rate": 4.968063514027258e-06,
- "loss": 0.4274463951587677,
- "mean_token_accuracy": 0.8387278318405151,
- "num_tokens": 3488610.0,
- "step": 390
- },
- {
- "epoch": 0.29711246200607905,
- "grad_norm": 2.6509406566619873,
- "learning_rate": 4.967728948334784e-06,
- "loss": 0.5401753783226013,
- "mean_token_accuracy": 0.8252490162849426,
- "num_tokens": 3493657.0,
- "step": 391
- },
- {
- "epoch": 0.2978723404255319,
- "grad_norm": 1.6372219324111938,
- "learning_rate": 4.967392650695141e-06,
- "loss": 0.3862472176551819,
- "mean_token_accuracy": 0.8555525541305542,
- "num_tokens": 3505588.0,
- "step": 392
- },
- {
- "epoch": 0.2986322188449848,
- "grad_norm": 2.1615452766418457,
- "learning_rate": 4.967054621344356e-06,
- "loss": 0.57850581407547,
- "mean_token_accuracy": 0.8222678899765015,
- "num_tokens": 3514396.0,
- "step": 393
- },
- {
- "epoch": 0.2993920972644377,
- "grad_norm": 1.8610916137695312,
- "learning_rate": 4.96671486051967e-06,
- "loss": 0.5440595149993896,
- "mean_token_accuracy": 0.8196715116500854,
- "num_tokens": 3523604.0,
- "step": 394
- },
- {
- "epoch": 0.30015197568389057,
- "grad_norm": 2.9585862159729004,
- "learning_rate": 4.966373368459542e-06,
- "loss": 0.6921588182449341,
- "mean_token_accuracy": 0.7816659808158875,
- "num_tokens": 3529849.0,
- "step": 395
- },
- {
- "epoch": 0.3009118541033435,
- "grad_norm": 1.9374035596847534,
- "learning_rate": 4.966030145403642e-06,
- "loss": 0.5494055151939392,
- "mean_token_accuracy": 0.8126792907714844,
- "num_tokens": 3539529.0,
- "step": 396
- },
- {
- "epoch": 0.30167173252279633,
- "grad_norm": 1.730530023574829,
- "learning_rate": 4.965685191592859e-06,
- "loss": 0.4271572232246399,
- "mean_token_accuracy": 0.8383668661117554,
- "num_tokens": 3550972.0,
- "step": 397
- },
- {
- "epoch": 0.30243161094224924,
- "grad_norm": 3.9635560512542725,
- "learning_rate": 4.9653385072692935e-06,
- "loss": 0.5576210021972656,
- "mean_token_accuracy": 0.799404501914978,
- "num_tokens": 3554147.0,
- "step": 398
- },
- {
- "epoch": 0.30319148936170215,
- "grad_norm": 2.5731968879699707,
- "learning_rate": 4.964990092676263e-06,
- "loss": 0.5478942394256592,
- "mean_token_accuracy": 0.8220961093902588,
- "num_tokens": 3559972.0,
- "step": 399
- },
- {
- "epoch": 0.303951367781155,
- "grad_norm": 2.2096588611602783,
- "learning_rate": 4.964639948058297e-06,
- "loss": 0.35461270809173584,
- "mean_token_accuracy": 0.8640927076339722,
- "num_tokens": 3565770.0,
- "step": 400
- },
- {
- "epoch": 0.3047112462006079,
- "grad_norm": 1.7874189615249634,
- "learning_rate": 4.964288073661142e-06,
- "loss": 0.38849619030952454,
- "mean_token_accuracy": 0.8443037271499634,
- "num_tokens": 3574514.0,
- "step": 401
- },
- {
- "epoch": 0.30547112462006076,
- "grad_norm": 1.5583146810531616,
- "learning_rate": 4.963934469731756e-06,
- "loss": 0.48909449577331543,
- "mean_token_accuracy": 0.8429768681526184,
- "num_tokens": 3585877.0,
- "step": 402
- },
- {
- "epoch": 0.30623100303951367,
- "grad_norm": 3.026599645614624,
- "learning_rate": 4.963579136518312e-06,
- "loss": 0.5138992071151733,
- "mean_token_accuracy": 0.8283728361129761,
- "num_tokens": 3590412.0,
- "step": 403
- },
- {
- "epoch": 0.3069908814589666,
- "grad_norm": 2.777505874633789,
- "learning_rate": 4.963222074270197e-06,
- "loss": 0.6241534948348999,
- "mean_token_accuracy": 0.8130464553833008,
- "num_tokens": 3596246.0,
- "step": 404
- },
- {
- "epoch": 0.30775075987841943,
- "grad_norm": 2.4772839546203613,
- "learning_rate": 4.962863283238011e-06,
- "loss": 0.5930814146995544,
- "mean_token_accuracy": 0.8036394715309143,
- "num_tokens": 3602878.0,
- "step": 405
- },
- {
- "epoch": 0.30851063829787234,
- "grad_norm": 1.5049982070922852,
- "learning_rate": 4.962502763673566e-06,
- "loss": 0.4903082549571991,
- "mean_token_accuracy": 0.8184912204742432,
- "num_tokens": 3617018.0,
- "step": 406
- },
- {
- "epoch": 0.30927051671732525,
- "grad_norm": 2.453155040740967,
- "learning_rate": 4.96214051582989e-06,
- "loss": 0.5138067603111267,
- "mean_token_accuracy": 0.8336835503578186,
- "num_tokens": 3624188.0,
- "step": 407
- },
- {
- "epoch": 0.3100303951367781,
- "grad_norm": 2.4038336277008057,
- "learning_rate": 4.961776539961222e-06,
- "loss": 0.5752760171890259,
- "mean_token_accuracy": 0.8054730892181396,
- "num_tokens": 3634152.0,
- "step": 408
- },
- {
- "epoch": 0.310790273556231,
- "grad_norm": 2.629068374633789,
- "learning_rate": 4.961410836323014e-06,
- "loss": 0.5580606460571289,
- "mean_token_accuracy": 0.8121089935302734,
- "num_tokens": 3639528.0,
- "step": 409
- },
- {
- "epoch": 0.31155015197568386,
- "grad_norm": 1.4245928525924683,
- "learning_rate": 4.961043405171931e-06,
- "loss": 0.5399882793426514,
- "mean_token_accuracy": 0.812280535697937,
- "num_tokens": 3655744.0,
- "step": 410
- },
- {
- "epoch": 0.3123100303951368,
- "grad_norm": 1.5236459970474243,
- "learning_rate": 4.9606742467658505e-06,
- "loss": 0.5234690308570862,
- "mean_token_accuracy": 0.8188928365707397,
- "num_tokens": 3675010.0,
- "step": 411
- },
- {
- "epoch": 0.3130699088145897,
- "grad_norm": 2.27961802482605,
- "learning_rate": 4.960303361363863e-06,
- "loss": 0.5502505898475647,
- "mean_token_accuracy": 0.8161963224411011,
- "num_tokens": 3682328.0,
- "step": 412
- },
- {
- "epoch": 0.31382978723404253,
- "grad_norm": 1.554518222808838,
- "learning_rate": 4.959930749226269e-06,
- "loss": 0.420867919921875,
- "mean_token_accuracy": 0.8499157428741455,
- "num_tokens": 3694980.0,
- "step": 413
- },
- {
- "epoch": 0.31458966565349544,
- "grad_norm": 2.609218120574951,
- "learning_rate": 4.9595564106145825e-06,
- "loss": 0.4706704318523407,
- "mean_token_accuracy": 0.8412490487098694,
- "num_tokens": 3700033.0,
- "step": 414
- },
- {
- "epoch": 0.31534954407294835,
- "grad_norm": 1.5303231477737427,
- "learning_rate": 4.959180345791528e-06,
- "loss": 0.4668654799461365,
- "mean_token_accuracy": 0.8125015497207642,
- "num_tokens": 3715012.0,
- "step": 415
- },
- {
- "epoch": 0.3161094224924012,
- "grad_norm": 1.2774665355682373,
- "learning_rate": 4.958802555021042e-06,
- "loss": 0.4339369237422943,
- "mean_token_accuracy": 0.8442851901054382,
- "num_tokens": 3733928.0,
- "step": 416
- },
- {
- "epoch": 0.3168693009118541,
- "grad_norm": 2.1240181922912598,
- "learning_rate": 4.958423038568274e-06,
- "loss": 0.4029104709625244,
- "mean_token_accuracy": 0.8627674579620361,
- "num_tokens": 3740202.0,
- "step": 417
- },
- {
- "epoch": 0.31762917933130697,
- "grad_norm": 2.00538969039917,
- "learning_rate": 4.958041796699583e-06,
- "loss": 0.5229607820510864,
- "mean_token_accuracy": 0.8282366394996643,
- "num_tokens": 3749308.0,
- "step": 418
- },
- {
- "epoch": 0.3183890577507599,
- "grad_norm": 2.6555092334747314,
- "learning_rate": 4.957658829682539e-06,
- "loss": 0.5344101190567017,
- "mean_token_accuracy": 0.8183202743530273,
- "num_tokens": 3754595.0,
- "step": 419
- },
- {
- "epoch": 0.3191489361702128,
- "grad_norm": 1.7468839883804321,
- "learning_rate": 4.9572741377859225e-06,
- "loss": 0.5667245984077454,
- "mean_token_accuracy": 0.8080123662948608,
- "num_tokens": 3765761.0,
- "step": 420
- },
- {
- "epoch": 0.31990881458966564,
- "grad_norm": 2.9612457752227783,
- "learning_rate": 4.956887721279726e-06,
- "loss": 0.5389559864997864,
- "mean_token_accuracy": 0.8019476532936096,
- "num_tokens": 3770844.0,
- "step": 421
- },
- {
- "epoch": 0.32066869300911854,
- "grad_norm": 1.842403769493103,
- "learning_rate": 4.95649958043515e-06,
- "loss": 0.38279837369918823,
- "mean_token_accuracy": 0.858866810798645,
- "num_tokens": 3778094.0,
- "step": 422
- },
- {
- "epoch": 0.32142857142857145,
- "grad_norm": 2.3108131885528564,
- "learning_rate": 4.956109715524609e-06,
- "loss": 0.5453893542289734,
- "mean_token_accuracy": 0.8085013031959534,
- "num_tokens": 3785015.0,
- "step": 423
- },
- {
- "epoch": 0.3221884498480243,
- "grad_norm": 3.0326945781707764,
- "learning_rate": 4.9557181268217225e-06,
- "loss": 0.5550523400306702,
- "mean_token_accuracy": 0.8125876188278198,
- "num_tokens": 3789830.0,
- "step": 424
- },
- {
- "epoch": 0.3229483282674772,
- "grad_norm": 1.8851977586746216,
- "learning_rate": 4.955324814601324e-06,
- "loss": 0.4902324974536896,
- "mean_token_accuracy": 0.8205406665802002,
- "num_tokens": 3799862.0,
- "step": 425
- },
- {
- "epoch": 0.32370820668693007,
- "grad_norm": 2.6018171310424805,
- "learning_rate": 4.954929779139455e-06,
- "loss": 0.5920133590698242,
- "mean_token_accuracy": 0.8340690732002258,
- "num_tokens": 3806617.0,
- "step": 426
- },
- {
- "epoch": 0.324468085106383,
- "grad_norm": 2.4283878803253174,
- "learning_rate": 4.954533020713367e-06,
- "loss": 0.5305854082107544,
- "mean_token_accuracy": 0.8137468099594116,
- "num_tokens": 3813843.0,
- "step": 427
- },
- {
- "epoch": 0.3252279635258359,
- "grad_norm": 2.667978525161743,
- "learning_rate": 4.954134539601519e-06,
- "loss": 0.5333638787269592,
- "mean_token_accuracy": 0.8402629494667053,
- "num_tokens": 3819450.0,
- "step": 428
- },
- {
- "epoch": 0.32598784194528874,
- "grad_norm": 1.7302523851394653,
- "learning_rate": 4.953734336083582e-06,
- "loss": 0.422895610332489,
- "mean_token_accuracy": 0.8709704875946045,
- "num_tokens": 3831027.0,
- "step": 429
- },
- {
- "epoch": 0.32674772036474165,
- "grad_norm": 2.427192211151123,
- "learning_rate": 4.953332410440434e-06,
- "loss": 0.6334598064422607,
- "mean_token_accuracy": 0.7817479968070984,
- "num_tokens": 3841776.0,
- "step": 430
- },
- {
- "epoch": 0.32750759878419455,
- "grad_norm": 1.460949182510376,
- "learning_rate": 4.952928762954161e-06,
- "loss": 0.3654777705669403,
- "mean_token_accuracy": 0.8780122995376587,
- "num_tokens": 3852213.0,
- "step": 431
- },
- {
- "epoch": 0.3282674772036474,
- "grad_norm": 1.9855005741119385,
- "learning_rate": 4.952523393908059e-06,
- "loss": 0.5117089748382568,
- "mean_token_accuracy": 0.811911404132843,
- "num_tokens": 3861176.0,
- "step": 432
- },
- {
- "epoch": 0.3290273556231003,
- "grad_norm": 2.2653207778930664,
- "learning_rate": 4.952116303586631e-06,
- "loss": 0.42514950037002563,
- "mean_token_accuracy": 0.8448518514633179,
- "num_tokens": 3867164.0,
- "step": 433
- },
- {
- "epoch": 0.32978723404255317,
- "grad_norm": 1.9780964851379395,
- "learning_rate": 4.951707492275589e-06,
- "loss": 0.5095293521881104,
- "mean_token_accuracy": 0.8262748718261719,
- "num_tokens": 3876406.0,
- "step": 434
- },
- {
- "epoch": 0.3305471124620061,
- "grad_norm": 2.9480233192443848,
- "learning_rate": 4.951296960261853e-06,
- "loss": 0.3494448959827423,
- "mean_token_accuracy": 0.8781307935714722,
- "num_tokens": 3880298.0,
- "step": 435
- },
- {
- "epoch": 0.331306990881459,
- "grad_norm": 2.335571527481079,
- "learning_rate": 4.95088470783355e-06,
- "loss": 0.5456914901733398,
- "mean_token_accuracy": 0.816297173500061,
- "num_tokens": 3886487.0,
- "step": 436
- },
- {
- "epoch": 0.33206686930091184,
- "grad_norm": 2.3046419620513916,
- "learning_rate": 4.950470735280013e-06,
- "loss": 0.4835948944091797,
- "mean_token_accuracy": 0.8539175391197205,
- "num_tokens": 3892706.0,
- "step": 437
- },
- {
- "epoch": 0.33282674772036475,
- "grad_norm": 2.44047474861145,
- "learning_rate": 4.950055042891786e-06,
- "loss": 0.5154092907905579,
- "mean_token_accuracy": 0.8579919338226318,
- "num_tokens": 3899532.0,
- "step": 438
- },
- {
- "epoch": 0.33358662613981765,
- "grad_norm": 4.826764106750488,
- "learning_rate": 4.949637630960618e-06,
- "loss": 0.5270259976387024,
- "mean_token_accuracy": 0.8172192573547363,
- "num_tokens": 3902260.0,
- "step": 439
- },
- {
- "epoch": 0.3343465045592705,
- "grad_norm": 2.001574754714966,
- "learning_rate": 4.949218499779462e-06,
- "loss": 0.5413002967834473,
- "mean_token_accuracy": 0.8162837028503418,
- "num_tokens": 3911706.0,
- "step": 440
- },
- {
- "epoch": 0.3351063829787234,
- "grad_norm": 1.7998944520950317,
- "learning_rate": 4.948797649642484e-06,
- "loss": 0.5131614208221436,
- "mean_token_accuracy": 0.8367440700531006,
- "num_tokens": 3923490.0,
- "step": 441
- },
- {
- "epoch": 0.33586626139817627,
- "grad_norm": 3.4566173553466797,
- "learning_rate": 4.94837508084505e-06,
- "loss": 0.7258909940719604,
- "mean_token_accuracy": 0.771377444267273,
- "num_tokens": 3928099.0,
- "step": 442
- },
- {
- "epoch": 0.3366261398176292,
- "grad_norm": 2.0040442943573,
- "learning_rate": 4.9479507936837364e-06,
- "loss": 0.482135534286499,
- "mean_token_accuracy": 0.8339327573776245,
- "num_tokens": 3937328.0,
- "step": 443
- },
- {
- "epoch": 0.3373860182370821,
- "grad_norm": 2.949502944946289,
- "learning_rate": 4.947524788456325e-06,
- "loss": 0.6474795341491699,
- "mean_token_accuracy": 0.7951677441596985,
- "num_tokens": 3942529.0,
- "step": 444
- },
- {
- "epoch": 0.33814589665653494,
- "grad_norm": 1.5528364181518555,
- "learning_rate": 4.947097065461801e-06,
- "loss": 0.48791584372520447,
- "mean_token_accuracy": 0.8425545692443848,
- "num_tokens": 3955200.0,
- "step": 445
- },
- {
- "epoch": 0.33890577507598785,
- "grad_norm": 1.8813284635543823,
- "learning_rate": 4.946667625000358e-06,
- "loss": 0.45922309160232544,
- "mean_token_accuracy": 0.8206527233123779,
- "num_tokens": 3962975.0,
- "step": 446
- },
- {
- "epoch": 0.33966565349544076,
- "grad_norm": 1.7157847881317139,
- "learning_rate": 4.946236467373392e-06,
- "loss": 0.5454182028770447,
- "mean_token_accuracy": 0.8049604892730713,
- "num_tokens": 3973956.0,
- "step": 447
- },
- {
- "epoch": 0.3404255319148936,
- "grad_norm": 2.008857250213623,
- "learning_rate": 4.945803592883509e-06,
- "loss": 0.5151860117912292,
- "mean_token_accuracy": 0.8262045383453369,
- "num_tokens": 3982853.0,
- "step": 448
- },
- {
- "epoch": 0.3411854103343465,
- "grad_norm": 1.6632496118545532,
- "learning_rate": 4.9453690018345144e-06,
- "loss": 0.42710691690444946,
- "mean_token_accuracy": 0.8521314859390259,
- "num_tokens": 3993838.0,
- "step": 449
- },
- {
- "epoch": 0.34194528875379937,
- "grad_norm": 1.365234375,
- "learning_rate": 4.944932694531423e-06,
- "loss": 0.5172526836395264,
- "mean_token_accuracy": 0.8277045488357544,
- "num_tokens": 4014179.0,
- "step": 450
- },
- {
- "epoch": 0.3427051671732523,
- "grad_norm": 1.7610243558883667,
- "learning_rate": 4.94449467128045e-06,
- "loss": 0.42104798555374146,
- "mean_token_accuracy": 0.8552065491676331,
- "num_tokens": 4023663.0,
- "step": 451
- },
- {
- "epoch": 0.3434650455927052,
- "grad_norm": 2.3732354640960693,
- "learning_rate": 4.944054932389018e-06,
- "loss": 0.5471175909042358,
- "mean_token_accuracy": 0.8487317562103271,
- "num_tokens": 4030100.0,
- "step": 452
- },
- {
- "epoch": 0.34422492401215804,
- "grad_norm": 1.5973623991012573,
- "learning_rate": 4.943613478165753e-06,
- "loss": 0.419813871383667,
- "mean_token_accuracy": 0.8484025001525879,
- "num_tokens": 4041124.0,
- "step": 453
- },
- {
- "epoch": 0.34498480243161095,
- "grad_norm": 2.966381549835205,
- "learning_rate": 4.943170308920484e-06,
- "loss": 0.5370652675628662,
- "mean_token_accuracy": 0.8439491987228394,
- "num_tokens": 4045675.0,
- "step": 454
- },
- {
- "epoch": 0.34574468085106386,
- "grad_norm": 2.5097248554229736,
- "learning_rate": 4.9427254249642445e-06,
- "loss": 0.5776349306106567,
- "mean_token_accuracy": 0.8060523867607117,
- "num_tokens": 4053250.0,
- "step": 455
- },
- {
- "epoch": 0.3465045592705167,
- "grad_norm": 1.6779125928878784,
- "learning_rate": 4.942278826609272e-06,
- "loss": 0.5245476961135864,
- "mean_token_accuracy": 0.8168526887893677,
- "num_tokens": 4064106.0,
- "step": 456
- },
- {
- "epoch": 0.3472644376899696,
- "grad_norm": 1.5945546627044678,
- "learning_rate": 4.9418305141690045e-06,
- "loss": 0.4972047209739685,
- "mean_token_accuracy": 0.8257735967636108,
- "num_tokens": 4077687.0,
- "step": 457
- },
- {
- "epoch": 0.34802431610942247,
- "grad_norm": 2.864778757095337,
- "learning_rate": 4.9413804879580865e-06,
- "loss": 0.5372499823570251,
- "mean_token_accuracy": 0.8423776626586914,
- "num_tokens": 4082632.0,
- "step": 458
- },
- {
- "epoch": 0.3487841945288754,
- "grad_norm": 1.4797078371047974,
- "learning_rate": 4.940928748292363e-06,
- "loss": 0.5903409719467163,
- "mean_token_accuracy": 0.8061295747756958,
- "num_tokens": 4104218.0,
- "step": 459
- },
- {
- "epoch": 0.3495440729483283,
- "grad_norm": 2.4376983642578125,
- "learning_rate": 4.940475295488882e-06,
- "loss": 0.4534894824028015,
- "mean_token_accuracy": 0.8395825028419495,
- "num_tokens": 4110530.0,
- "step": 460
- },
- {
- "epoch": 0.35030395136778114,
- "grad_norm": 1.2955626249313354,
- "learning_rate": 4.940020129865895e-06,
- "loss": 0.47155818343162537,
- "mean_token_accuracy": 0.8253582715988159,
- "num_tokens": 4128398.0,
- "step": 461
- },
- {
- "epoch": 0.35106382978723405,
- "grad_norm": 2.066575527191162,
- "learning_rate": 4.9395632517428546e-06,
- "loss": 0.5555641651153564,
- "mean_token_accuracy": 0.814624547958374,
- "num_tokens": 4137623.0,
- "step": 462
- },
- {
- "epoch": 0.3518237082066869,
- "grad_norm": 1.6407525539398193,
- "learning_rate": 4.939104661440415e-06,
- "loss": 0.4361790418624878,
- "mean_token_accuracy": 0.8544459342956543,
- "num_tokens": 4152803.0,
- "step": 463
- },
- {
- "epoch": 0.3525835866261398,
- "grad_norm": 2.1685116291046143,
- "learning_rate": 4.938644359280433e-06,
- "loss": 0.5347012877464294,
- "mean_token_accuracy": 0.853853702545166,
- "num_tokens": 4160778.0,
- "step": 464
- },
- {
- "epoch": 0.3533434650455927,
- "grad_norm": 1.8824869394302368,
- "learning_rate": 4.938182345585967e-06,
- "loss": 0.5512481927871704,
- "mean_token_accuracy": 0.7985891699790955,
- "num_tokens": 4170380.0,
- "step": 465
- },
- {
- "epoch": 0.3541033434650456,
- "grad_norm": 2.2229504585266113,
- "learning_rate": 4.937718620681273e-06,
- "loss": 0.516828179359436,
- "mean_token_accuracy": 0.8265621066093445,
- "num_tokens": 4178179.0,
- "step": 466
- },
- {
- "epoch": 0.3548632218844985,
- "grad_norm": 1.955990195274353,
- "learning_rate": 4.9372531848918145e-06,
- "loss": 0.5586158037185669,
- "mean_token_accuracy": 0.8367916345596313,
- "num_tokens": 4188626.0,
- "step": 467
- },
- {
- "epoch": 0.3556231003039514,
- "grad_norm": 1.9687023162841797,
- "learning_rate": 4.936786038544251e-06,
- "loss": 0.5517531633377075,
- "mean_token_accuracy": 0.8134098052978516,
- "num_tokens": 4198144.0,
- "step": 468
- },
- {
- "epoch": 0.35638297872340424,
- "grad_norm": 1.405516505241394,
- "learning_rate": 4.9363171819664434e-06,
- "loss": 0.5305492877960205,
- "mean_token_accuracy": 0.8014427423477173,
- "num_tokens": 4222818.0,
- "step": 469
- },
- {
- "epoch": 0.35714285714285715,
- "grad_norm": 2.6355695724487305,
- "learning_rate": 4.9358466154874535e-06,
- "loss": 0.5303391218185425,
- "mean_token_accuracy": 0.8028861284255981,
- "num_tokens": 4228318.0,
- "step": 470
- },
- {
- "epoch": 0.35790273556231,
- "grad_norm": 1.5133824348449707,
- "learning_rate": 4.935374339437543e-06,
- "loss": 0.5329189300537109,
- "mean_token_accuracy": 0.8479441404342651,
- "num_tokens": 4244527.0,
- "step": 471
- },
- {
- "epoch": 0.3586626139817629,
- "grad_norm": 3.4356725215911865,
- "learning_rate": 4.934900354148173e-06,
- "loss": 0.5431582927703857,
- "mean_token_accuracy": 0.8328983783721924,
- "num_tokens": 4248034.0,
- "step": 472
- },
- {
- "epoch": 0.3594224924012158,
- "grad_norm": 2.5789499282836914,
- "learning_rate": 4.934424659952006e-06,
- "loss": 0.4141455292701721,
- "mean_token_accuracy": 0.8658635020256042,
- "num_tokens": 4252953.0,
- "step": 473
- },
- {
- "epoch": 0.3601823708206687,
- "grad_norm": 1.145262598991394,
- "learning_rate": 4.933947257182901e-06,
- "loss": 0.40294092893600464,
- "mean_token_accuracy": 0.8565847277641296,
- "num_tokens": 4277813.0,
- "step": 474
- },
- {
- "epoch": 0.3609422492401216,
- "grad_norm": 1.7242133617401123,
- "learning_rate": 4.933468146175918e-06,
- "loss": 0.6036738753318787,
- "mean_token_accuracy": 0.8072597980499268,
- "num_tokens": 4291088.0,
- "step": 475
- },
- {
- "epoch": 0.3617021276595745,
- "grad_norm": 2.3490941524505615,
- "learning_rate": 4.932987327267317e-06,
- "loss": 0.49456146359443665,
- "mean_token_accuracy": 0.8372673988342285,
- "num_tokens": 4297376.0,
- "step": 476
- },
- {
- "epoch": 0.36246200607902734,
- "grad_norm": 1.3605526685714722,
- "learning_rate": 4.932504800794553e-06,
- "loss": 0.43595948815345764,
- "mean_token_accuracy": 0.8415953516960144,
- "num_tokens": 4312054.0,
- "step": 477
- },
- {
- "epoch": 0.36322188449848025,
- "grad_norm": 1.4525885581970215,
- "learning_rate": 4.9320205670962815e-06,
- "loss": 0.5390371680259705,
- "mean_token_accuracy": 0.8101649284362793,
- "num_tokens": 4328701.0,
- "step": 478
- },
- {
- "epoch": 0.3639817629179331,
- "grad_norm": 1.9862419366836548,
- "learning_rate": 4.931534626512359e-06,
- "loss": 0.45436930656433105,
- "mean_token_accuracy": 0.8352861404418945,
- "num_tokens": 4338372.0,
- "step": 479
- },
- {
- "epoch": 0.364741641337386,
- "grad_norm": 1.7804961204528809,
- "learning_rate": 4.931046979383836e-06,
- "loss": 0.4677754044532776,
- "mean_token_accuracy": 0.840467095375061,
- "num_tokens": 4347897.0,
- "step": 480
- },
- {
- "epoch": 0.3655015197568389,
- "grad_norm": 2.066632032394409,
- "learning_rate": 4.930557626052961e-06,
- "loss": 0.42418140172958374,
- "mean_token_accuracy": 0.8528275489807129,
- "num_tokens": 4354061.0,
- "step": 481
- },
- {
- "epoch": 0.3662613981762918,
- "grad_norm": 1.6155282258987427,
- "learning_rate": 4.930066566863182e-06,
- "loss": 0.5424284934997559,
- "mean_token_accuracy": 0.825040876865387,
- "num_tokens": 4370400.0,
- "step": 482
- },
- {
- "epoch": 0.3670212765957447,
- "grad_norm": 2.1452953815460205,
- "learning_rate": 4.929573802159143e-06,
- "loss": 0.5105804204940796,
- "mean_token_accuracy": 0.8284053802490234,
- "num_tokens": 4377579.0,
- "step": 483
- },
- {
- "epoch": 0.3677811550151976,
- "grad_norm": 1.8940945863723755,
- "learning_rate": 4.929079332286685e-06,
- "loss": 0.43478304147720337,
- "mean_token_accuracy": 0.8505665063858032,
- "num_tokens": 4385686.0,
- "step": 484
- },
- {
- "epoch": 0.36854103343465044,
- "grad_norm": 1.6785860061645508,
- "learning_rate": 4.928583157592846e-06,
- "loss": 0.40227848291397095,
- "mean_token_accuracy": 0.8623573780059814,
- "num_tokens": 4396128.0,
- "step": 485
- },
- {
- "epoch": 0.36930091185410335,
- "grad_norm": 1.6416733264923096,
- "learning_rate": 4.928085278425862e-06,
- "loss": 0.526267409324646,
- "mean_token_accuracy": 0.8284667730331421,
- "num_tokens": 4407963.0,
- "step": 486
- },
- {
- "epoch": 0.3700607902735562,
- "grad_norm": 1.8882389068603516,
- "learning_rate": 4.927585695135162e-06,
- "loss": 0.5555213093757629,
- "mean_token_accuracy": 0.8115293979644775,
- "num_tokens": 4418057.0,
- "step": 487
- },
- {
- "epoch": 0.3708206686930091,
- "grad_norm": 2.300248384475708,
- "learning_rate": 4.9270844080713735e-06,
- "loss": 0.5812339186668396,
- "mean_token_accuracy": 0.800270676612854,
- "num_tokens": 4425358.0,
- "step": 488
- },
- {
- "epoch": 0.371580547112462,
- "grad_norm": 1.6802922487258911,
- "learning_rate": 4.926581417586319e-06,
- "loss": 0.5134941935539246,
- "mean_token_accuracy": 0.8247408866882324,
- "num_tokens": 4437702.0,
- "step": 489
- },
- {
- "epoch": 0.3723404255319149,
- "grad_norm": 1.7620291709899902,
- "learning_rate": 4.926076724033016e-06,
- "loss": 0.5233973264694214,
- "mean_token_accuracy": 0.8102161884307861,
- "num_tokens": 4448584.0,
- "step": 490
- },
- {
- "epoch": 0.3731003039513678,
- "grad_norm": 1.6911998987197876,
- "learning_rate": 4.925570327765678e-06,
- "loss": 0.5337274074554443,
- "mean_token_accuracy": 0.845306396484375,
- "num_tokens": 4462651.0,
- "step": 491
- },
- {
- "epoch": 0.3738601823708207,
- "grad_norm": 1.7991242408752441,
- "learning_rate": 4.9250622291397144e-06,
- "loss": 0.31018948554992676,
- "mean_token_accuracy": 0.8857606053352356,
- "num_tokens": 4469971.0,
- "step": 492
- },
- {
- "epoch": 0.37462006079027355,
- "grad_norm": 4.9776835441589355,
- "learning_rate": 4.924552428511727e-06,
- "loss": 0.44114983081817627,
- "mean_token_accuracy": 0.8429906368255615,
- "num_tokens": 4478275.0,
- "step": 493
- },
- {
- "epoch": 0.37537993920972645,
- "grad_norm": 1.8007272481918335,
- "learning_rate": 4.924040926239515e-06,
- "loss": 0.574328601360321,
- "mean_token_accuracy": 0.7669196128845215,
- "num_tokens": 4491551.0,
- "step": 494
- },
- {
- "epoch": 0.3761398176291793,
- "grad_norm": 2.021300792694092,
- "learning_rate": 4.92352772268207e-06,
- "loss": 0.45636120438575745,
- "mean_token_accuracy": 0.840438723564148,
- "num_tokens": 4498658.0,
- "step": 495
- },
- {
- "epoch": 0.3768996960486322,
- "grad_norm": 2.369748592376709,
- "learning_rate": 4.923012818199576e-06,
- "loss": 0.5206376910209656,
- "mean_token_accuracy": 0.8521823287010193,
- "num_tokens": 4504648.0,
- "step": 496
- },
- {
- "epoch": 0.3776595744680851,
- "grad_norm": 2.733485221862793,
- "learning_rate": 4.922496213153416e-06,
- "loss": 0.5067723989486694,
- "mean_token_accuracy": 0.8168281316757202,
- "num_tokens": 4509990.0,
- "step": 497
- },
- {
- "epoch": 0.378419452887538,
- "grad_norm": 2.3751676082611084,
- "learning_rate": 4.921977907906161e-06,
- "loss": 0.49757206439971924,
- "mean_token_accuracy": 0.8325017690658569,
- "num_tokens": 4518373.0,
- "step": 498
- },
- {
- "epoch": 0.3791793313069909,
- "grad_norm": 2.1672775745391846,
- "learning_rate": 4.921457902821578e-06,
- "loss": 0.4237566590309143,
- "mean_token_accuracy": 0.8404698371887207,
- "num_tokens": 4524338.0,
- "step": 499
- },
- {
- "epoch": 0.3799392097264438,
- "grad_norm": 1.8374360799789429,
- "learning_rate": 4.9209361982646275e-06,
- "loss": 0.4995468854904175,
- "mean_token_accuracy": 0.8299649953842163,
- "num_tokens": 4533396.0,
- "step": 500
- },
- {
- "epoch": 0.38069908814589665,
- "grad_norm": 2.083967924118042,
- "learning_rate": 4.920412794601461e-06,
- "loss": 0.489935040473938,
- "mean_token_accuracy": 0.8315291404724121,
- "num_tokens": 4540941.0,
- "step": 501
- },
- {
- "epoch": 0.38145896656534956,
- "grad_norm": 2.2075610160827637,
- "learning_rate": 4.919887692199423e-06,
- "loss": 0.5233147740364075,
- "mean_token_accuracy": 0.804171085357666,
- "num_tokens": 4548215.0,
- "step": 502
- },
- {
- "epoch": 0.3822188449848024,
- "grad_norm": 2.076775312423706,
- "learning_rate": 4.9193608914270515e-06,
- "loss": 0.5785550475120544,
- "mean_token_accuracy": 0.7993186116218567,
- "num_tokens": 4558204.0,
- "step": 503
- },
- {
- "epoch": 0.3829787234042553,
- "grad_norm": 2.238546133041382,
- "learning_rate": 4.918832392654075e-06,
- "loss": 0.5287384390830994,
- "mean_token_accuracy": 0.8214945793151855,
- "num_tokens": 4565407.0,
- "step": 504
- },
- {
- "epoch": 0.3837386018237082,
- "grad_norm": 1.6783074140548706,
- "learning_rate": 4.9183021962514145e-06,
- "loss": 0.6063359379768372,
- "mean_token_accuracy": 0.7914625406265259,
- "num_tokens": 4580991.0,
- "step": 505
- },
- {
- "epoch": 0.3844984802431611,
- "grad_norm": 1.6287449598312378,
- "learning_rate": 4.917770302591183e-06,
- "loss": 0.3598247766494751,
- "mean_token_accuracy": 0.8706809878349304,
- "num_tokens": 4590579.0,
- "step": 506
- },
- {
- "epoch": 0.385258358662614,
- "grad_norm": 1.5432041883468628,
- "learning_rate": 4.917236712046682e-06,
- "loss": 0.5267890095710754,
- "mean_token_accuracy": 0.8032117486000061,
- "num_tokens": 4608380.0,
- "step": 507
- },
- {
- "epoch": 0.3860182370820669,
- "grad_norm": 1.7664037942886353,
- "learning_rate": 4.9167014249924075e-06,
- "loss": 0.3552354574203491,
- "mean_token_accuracy": 0.8569793701171875,
- "num_tokens": 4616426.0,
- "step": 508
- },
- {
- "epoch": 0.38677811550151975,
- "grad_norm": 2.1147472858428955,
- "learning_rate": 4.916164441804044e-06,
- "loss": 0.5212404727935791,
- "mean_token_accuracy": 0.8196578025817871,
- "num_tokens": 4623908.0,
- "step": 509
- },
- {
- "epoch": 0.38753799392097266,
- "grad_norm": 2.1092333793640137,
- "learning_rate": 4.915625762858467e-06,
- "loss": 0.5197038650512695,
- "mean_token_accuracy": 0.8245604634284973,
- "num_tokens": 4630956.0,
- "step": 510
- },
- {
- "epoch": 0.3882978723404255,
- "grad_norm": 1.23331880569458,
- "learning_rate": 4.915085388533743e-06,
- "loss": 0.4759839177131653,
- "mean_token_accuracy": 0.8192248344421387,
- "num_tokens": 4651269.0,
- "step": 511
- },
- {
- "epoch": 0.3890577507598784,
- "grad_norm": 2.424199104309082,
- "learning_rate": 4.914543319209126e-06,
- "loss": 0.5576270818710327,
- "mean_token_accuracy": 0.8203302621841431,
- "num_tokens": 4657296.0,
- "step": 512
- },
- {
- "epoch": 0.3898176291793313,
- "grad_norm": 2.725156307220459,
- "learning_rate": 4.913999555265062e-06,
- "loss": 0.4337949752807617,
- "mean_token_accuracy": 0.8382406234741211,
- "num_tokens": 4661850.0,
- "step": 513
- },
- {
- "epoch": 0.3905775075987842,
- "grad_norm": 2.3120534420013428,
- "learning_rate": 4.913454097083185e-06,
- "loss": 0.4941597580909729,
- "mean_token_accuracy": 0.8302834033966064,
- "num_tokens": 4667769.0,
- "step": 514
- },
- {
- "epoch": 0.3913373860182371,
- "grad_norm": 2.3111207485198975,
- "learning_rate": 4.912906945046319e-06,
- "loss": 0.5253715515136719,
- "mean_token_accuracy": 0.84515380859375,
- "num_tokens": 4674537.0,
- "step": 515
- },
- {
- "epoch": 0.39209726443769,
- "grad_norm": 1.4117841720581055,
- "learning_rate": 4.912358099538476e-06,
- "loss": 0.4521017074584961,
- "mean_token_accuracy": 0.8208256959915161,
- "num_tokens": 4690605.0,
- "step": 516
- },
- {
- "epoch": 0.39285714285714285,
- "grad_norm": 2.3742799758911133,
- "learning_rate": 4.911807560944858e-06,
- "loss": 0.41572901606559753,
- "mean_token_accuracy": 0.8550551533699036,
- "num_tokens": 4706437.0,
- "step": 517
- },
- {
- "epoch": 0.39361702127659576,
- "grad_norm": 2.4052202701568604,
- "learning_rate": 4.911255329651852e-06,
- "loss": 0.6003736257553101,
- "mean_token_accuracy": 0.8247885704040527,
- "num_tokens": 4712746.0,
- "step": 518
- },
- {
- "epoch": 0.3943768996960486,
- "grad_norm": 1.9335490465164185,
- "learning_rate": 4.910701406047037e-06,
- "loss": 0.5457713603973389,
- "mean_token_accuracy": 0.787429690361023,
- "num_tokens": 4731937.0,
- "step": 519
- },
- {
- "epoch": 0.3951367781155015,
- "grad_norm": 2.257706880569458,
- "learning_rate": 4.910145790519177e-06,
- "loss": 0.5300652980804443,
- "mean_token_accuracy": 0.8192912936210632,
- "num_tokens": 4739422.0,
- "step": 520
- },
- {
- "epoch": 0.3958966565349544,
- "grad_norm": 1.2099462747573853,
- "learning_rate": 4.9095884834582256e-06,
- "loss": 0.45872747898101807,
- "mean_token_accuracy": 0.8362667560577393,
- "num_tokens": 4757113.0,
- "step": 521
- },
- {
- "epoch": 0.3966565349544073,
- "grad_norm": 2.7991135120391846,
- "learning_rate": 4.909029485255321e-06,
- "loss": 0.49039560556411743,
- "mean_token_accuracy": 0.8260016441345215,
- "num_tokens": 4761709.0,
- "step": 522
- },
- {
- "epoch": 0.3974164133738602,
- "grad_norm": 2.2360129356384277,
- "learning_rate": 4.90846879630279e-06,
- "loss": 0.49556830525398254,
- "mean_token_accuracy": 0.827864408493042,
- "num_tokens": 4769048.0,
- "step": 523
- },
- {
- "epoch": 0.3981762917933131,
- "grad_norm": 2.5953688621520996,
- "learning_rate": 4.907906416994146e-06,
- "loss": 0.387208491563797,
- "mean_token_accuracy": 0.8467001914978027,
- "num_tokens": 4774637.0,
- "step": 524
- },
- {
- "epoch": 0.39893617021276595,
- "grad_norm": 2.1046814918518066,
- "learning_rate": 4.907342347724088e-06,
- "loss": 0.5477259755134583,
- "mean_token_accuracy": 0.8060322999954224,
- "num_tokens": 4782774.0,
- "step": 525
- },
- {
- "epoch": 0.39969604863221886,
- "grad_norm": 2.5622646808624268,
- "learning_rate": 4.906776588888502e-06,
- "loss": 0.5684159398078918,
- "mean_token_accuracy": 0.8095303177833557,
- "num_tokens": 4788766.0,
- "step": 526
- },
- {
- "epoch": 0.4004559270516717,
- "grad_norm": 1.9027913808822632,
- "learning_rate": 4.906209140884459e-06,
- "loss": 0.535524845123291,
- "mean_token_accuracy": 0.815237820148468,
- "num_tokens": 4798492.0,
- "step": 527
- },
- {
- "epoch": 0.4012158054711246,
- "grad_norm": 2.1447622776031494,
- "learning_rate": 4.905640004110216e-06,
- "loss": 0.5628632307052612,
- "mean_token_accuracy": 0.8085395097732544,
- "num_tokens": 4805737.0,
- "step": 528
- },
- {
- "epoch": 0.40197568389057753,
- "grad_norm": 1.6754741668701172,
- "learning_rate": 4.905069178965215e-06,
- "loss": 0.5046736598014832,
- "mean_token_accuracy": 0.8247535228729248,
- "num_tokens": 4816912.0,
- "step": 529
- },
- {
- "epoch": 0.4027355623100304,
- "grad_norm": 2.271230459213257,
- "learning_rate": 4.904496665850083e-06,
- "loss": 0.6086187958717346,
- "mean_token_accuracy": 0.7935276627540588,
- "num_tokens": 4824577.0,
- "step": 530
- },
- {
- "epoch": 0.4034954407294833,
- "grad_norm": 2.107595205307007,
- "learning_rate": 4.903922465166633e-06,
- "loss": 0.5431341528892517,
- "mean_token_accuracy": 0.8129537105560303,
- "num_tokens": 4831772.0,
- "step": 531
- },
- {
- "epoch": 0.40425531914893614,
- "grad_norm": 1.3860732316970825,
- "learning_rate": 4.903346577317859e-06,
- "loss": 0.45816320180892944,
- "mean_token_accuracy": 0.8328287601470947,
- "num_tokens": 4850302.0,
- "step": 532
- },
- {
- "epoch": 0.40501519756838905,
- "grad_norm": 1.9186837673187256,
- "learning_rate": 4.902769002707942e-06,
- "loss": 0.3294633626937866,
- "mean_token_accuracy": 0.8853933811187744,
- "num_tokens": 4856624.0,
- "step": 533
- },
- {
- "epoch": 0.40577507598784196,
- "grad_norm": 1.516194462776184,
- "learning_rate": 4.902189741742247e-06,
- "loss": 0.45482105016708374,
- "mean_token_accuracy": 0.8370342254638672,
- "num_tokens": 4870395.0,
- "step": 534
- },
- {
- "epoch": 0.4065349544072948,
- "grad_norm": 2.3235628604888916,
- "learning_rate": 4.901608794827321e-06,
- "loss": 0.40688639879226685,
- "mean_token_accuracy": 0.8643521666526794,
- "num_tokens": 4875645.0,
- "step": 535
- },
- {
- "epoch": 0.4072948328267477,
- "grad_norm": 2.29286527633667,
- "learning_rate": 4.9010261623708945e-06,
- "loss": 0.45482826232910156,
- "mean_token_accuracy": 0.8429383039474487,
- "num_tokens": 4881772.0,
- "step": 536
- },
- {
- "epoch": 0.40805471124620063,
- "grad_norm": 1.5907070636749268,
- "learning_rate": 4.900441844781882e-06,
- "loss": 0.5266948342323303,
- "mean_token_accuracy": 0.8348641395568848,
- "num_tokens": 4894289.0,
- "step": 537
- },
- {
- "epoch": 0.4088145896656535,
- "grad_norm": 2.1816294193267822,
- "learning_rate": 4.89985584247038e-06,
- "loss": 0.4797617793083191,
- "mean_token_accuracy": 0.8549500703811646,
- "num_tokens": 4901106.0,
- "step": 538
- },
- {
- "epoch": 0.4095744680851064,
- "grad_norm": 1.7347146272659302,
- "learning_rate": 4.899268155847667e-06,
- "loss": 0.4754739999771118,
- "mean_token_accuracy": 0.8278418183326721,
- "num_tokens": 4912131.0,
- "step": 539
- },
- {
- "epoch": 0.41033434650455924,
- "grad_norm": 2.0694527626037598,
- "learning_rate": 4.898678785326205e-06,
- "loss": 0.5071008801460266,
- "mean_token_accuracy": 0.8157946467399597,
- "num_tokens": 4921141.0,
- "step": 540
- },
- {
- "epoch": 0.41109422492401215,
- "grad_norm": 2.570047616958618,
- "learning_rate": 4.898087731319637e-06,
- "loss": 0.43639278411865234,
- "mean_token_accuracy": 0.8682913780212402,
- "num_tokens": 4926182.0,
- "step": 541
- },
- {
- "epoch": 0.41185410334346506,
- "grad_norm": 4.064006805419922,
- "learning_rate": 4.8974949942427854e-06,
- "loss": 0.539260745048523,
- "mean_token_accuracy": 0.8225528001785278,
- "num_tokens": 4929449.0,
- "step": 542
- },
- {
- "epoch": 0.4126139817629179,
- "grad_norm": 1.7644332647323608,
- "learning_rate": 4.896900574511657e-06,
- "loss": 0.472618043422699,
- "mean_token_accuracy": 0.8332902193069458,
- "num_tokens": 4939443.0,
- "step": 543
- },
- {
- "epoch": 0.4133738601823708,
- "grad_norm": 2.879918336868286,
- "learning_rate": 4.89630447254344e-06,
- "loss": 0.6360667943954468,
- "mean_token_accuracy": 0.8215296268463135,
- "num_tokens": 4950838.0,
- "step": 544
- },
- {
- "epoch": 0.41413373860182373,
- "grad_norm": 1.4575570821762085,
- "learning_rate": 4.8957066887565005e-06,
- "loss": 0.45617997646331787,
- "mean_token_accuracy": 0.8373187184333801,
- "num_tokens": 4965222.0,
- "step": 545
- },
- {
- "epoch": 0.4148936170212766,
- "grad_norm": 2.4829535484313965,
- "learning_rate": 4.895107223570386e-06,
- "loss": 0.42285341024398804,
- "mean_token_accuracy": 0.8686380386352539,
- "num_tokens": 4970724.0,
- "step": 546
- },
- {
- "epoch": 0.4156534954407295,
- "grad_norm": 2.639474630355835,
- "learning_rate": 4.894506077405824e-06,
- "loss": 0.5906289219856262,
- "mean_token_accuracy": 0.8174435496330261,
- "num_tokens": 4976766.0,
- "step": 547
- },
- {
- "epoch": 0.41641337386018235,
- "grad_norm": 2.7960562705993652,
- "learning_rate": 4.893903250684723e-06,
- "loss": 0.4518949091434479,
- "mean_token_accuracy": 0.8387585282325745,
- "num_tokens": 4980991.0,
- "step": 548
- },
- {
- "epoch": 0.41717325227963525,
- "grad_norm": 2.184176206588745,
- "learning_rate": 4.893298743830168e-06,
- "loss": 0.5223842859268188,
- "mean_token_accuracy": 0.8170937299728394,
- "num_tokens": 4987781.0,
- "step": 549
- },
- {
- "epoch": 0.41793313069908816,
- "grad_norm": 2.2393438816070557,
- "learning_rate": 4.892692557266429e-06,
- "loss": 0.5238431692123413,
- "mean_token_accuracy": 0.8217905759811401,
- "num_tokens": 4994321.0,
- "step": 550
- },
- {
- "epoch": 0.418693009118541,
- "grad_norm": 3.579047441482544,
- "learning_rate": 4.8920846914189465e-06,
- "loss": 0.5367584228515625,
- "mean_token_accuracy": 0.8312011361122131,
- "num_tokens": 4997951.0,
- "step": 551
- },
- {
- "epoch": 0.4194528875379939,
- "grad_norm": 1.6330240964889526,
- "learning_rate": 4.891475146714348e-06,
- "loss": 0.6054705381393433,
- "mean_token_accuracy": 0.7938206791877747,
- "num_tokens": 5012726.0,
- "step": 552
- },
- {
- "epoch": 0.42021276595744683,
- "grad_norm": 1.5775716304779053,
- "learning_rate": 4.8908639235804324e-06,
- "loss": 0.4774656891822815,
- "mean_token_accuracy": 0.828762948513031,
- "num_tokens": 5026751.0,
- "step": 553
- },
- {
- "epoch": 0.4209726443768997,
- "grad_norm": 1.5719101428985596,
- "learning_rate": 4.890251022446181e-06,
- "loss": 0.549429178237915,
- "mean_token_accuracy": 0.8110791444778442,
- "num_tokens": 5041861.0,
- "step": 554
- },
- {
- "epoch": 0.4217325227963526,
- "grad_norm": 1.8585275411605835,
- "learning_rate": 4.889636443741752e-06,
- "loss": 0.4448118805885315,
- "mean_token_accuracy": 0.8462690711021423,
- "num_tokens": 5052690.0,
- "step": 555
- },
- {
- "epoch": 0.42249240121580545,
- "grad_norm": 2.189202070236206,
- "learning_rate": 4.88902018789848e-06,
- "loss": 0.4296762943267822,
- "mean_token_accuracy": 0.8488791584968567,
- "num_tokens": 5058964.0,
- "step": 556
- },
- {
- "epoch": 0.42325227963525835,
- "grad_norm": 1.9328460693359375,
- "learning_rate": 4.888402255348877e-06,
- "loss": 0.5369474291801453,
- "mean_token_accuracy": 0.8184729814529419,
- "num_tokens": 5068465.0,
- "step": 557
- },
- {
- "epoch": 0.42401215805471126,
- "grad_norm": 1.6233323812484741,
- "learning_rate": 4.887782646526631e-06,
- "loss": 0.5284391641616821,
- "mean_token_accuracy": 0.8276044726371765,
- "num_tokens": 5081052.0,
- "step": 558
- },
- {
- "epoch": 0.4247720364741641,
- "grad_norm": 2.222813844680786,
- "learning_rate": 4.887161361866608e-06,
- "loss": 0.5679137706756592,
- "mean_token_accuracy": 0.8012375831604004,
- "num_tokens": 5090001.0,
- "step": 559
- },
- {
- "epoch": 0.425531914893617,
- "grad_norm": 2.1062207221984863,
- "learning_rate": 4.8865384018048494e-06,
- "loss": 0.5554201602935791,
- "mean_token_accuracy": 0.8128066062927246,
- "num_tokens": 5097644.0,
- "step": 560
- },
- {
- "epoch": 0.42629179331306993,
- "grad_norm": 1.5380984544754028,
- "learning_rate": 4.8859137667785735e-06,
- "loss": 0.4948265850543976,
- "mean_token_accuracy": 0.8258291482925415,
- "num_tokens": 5110069.0,
- "step": 561
- },
- {
- "epoch": 0.4270516717325228,
- "grad_norm": 2.0290257930755615,
- "learning_rate": 4.8852874572261715e-06,
- "loss": 0.4969530403614044,
- "mean_token_accuracy": 0.8297134637832642,
- "num_tokens": 5117452.0,
- "step": 562
- },
- {
- "epoch": 0.4278115501519757,
- "grad_norm": 1.5651452541351318,
- "learning_rate": 4.884659473587213e-06,
- "loss": 0.5353102087974548,
- "mean_token_accuracy": 0.8161719441413879,
- "num_tokens": 5133756.0,
- "step": 563
- },
- {
- "epoch": 0.42857142857142855,
- "grad_norm": 2.2470998764038086,
- "learning_rate": 4.884029816302441e-06,
- "loss": 0.5104288458824158,
- "mean_token_accuracy": 0.8081635236740112,
- "num_tokens": 5140278.0,
- "step": 564
- },
- {
- "epoch": 0.42933130699088146,
- "grad_norm": 1.726891279220581,
- "learning_rate": 4.883398485813772e-06,
- "loss": 0.4508771002292633,
- "mean_token_accuracy": 0.8548800349235535,
- "num_tokens": 5150115.0,
- "step": 565
- },
- {
- "epoch": 0.43009118541033436,
- "grad_norm": 1.4779289960861206,
- "learning_rate": 4.8827654825642984e-06,
- "loss": 0.46861088275909424,
- "mean_token_accuracy": 0.8209476470947266,
- "num_tokens": 5163225.0,
- "step": 566
- },
- {
- "epoch": 0.4308510638297872,
- "grad_norm": 1.2361034154891968,
- "learning_rate": 4.882130806998287e-06,
- "loss": 0.4591076672077179,
- "mean_token_accuracy": 0.803041934967041,
- "num_tokens": 5180342.0,
- "step": 567
- },
- {
- "epoch": 0.4316109422492401,
- "grad_norm": 1.882467269897461,
- "learning_rate": 4.881494459561177e-06,
- "loss": 0.579258143901825,
- "mean_token_accuracy": 0.8007112741470337,
- "num_tokens": 5189595.0,
- "step": 568
- },
- {
- "epoch": 0.43237082066869303,
- "grad_norm": 1.095462441444397,
- "learning_rate": 4.880856440699582e-06,
- "loss": 0.3806574046611786,
- "mean_token_accuracy": 0.8650111556053162,
- "num_tokens": 5211642.0,
- "step": 569
- },
- {
- "epoch": 0.4331306990881459,
- "grad_norm": 1.6469846963882446,
- "learning_rate": 4.880216750861288e-06,
- "loss": 0.544589638710022,
- "mean_token_accuracy": 0.8060122728347778,
- "num_tokens": 5224137.0,
- "step": 570
- },
- {
- "epoch": 0.4338905775075988,
- "grad_norm": 1.8561251163482666,
- "learning_rate": 4.879575390495254e-06,
- "loss": 0.4094924330711365,
- "mean_token_accuracy": 0.8591406345367432,
- "num_tokens": 5231588.0,
- "step": 571
- },
- {
- "epoch": 0.43465045592705165,
- "grad_norm": 3.01326847076416,
- "learning_rate": 4.878932360051611e-06,
- "loss": 0.6139192581176758,
- "mean_token_accuracy": 0.8108739852905273,
- "num_tokens": 5236853.0,
- "step": 572
- },
- {
- "epoch": 0.43541033434650456,
- "grad_norm": 2.1753034591674805,
- "learning_rate": 4.878287659981663e-06,
- "loss": 0.49082931876182556,
- "mean_token_accuracy": 0.862828254699707,
- "num_tokens": 5243264.0,
- "step": 573
- },
- {
- "epoch": 0.43617021276595747,
- "grad_norm": 1.4437755346298218,
- "learning_rate": 4.8776412907378845e-06,
- "loss": 0.5608728528022766,
- "mean_token_accuracy": 0.8271626234054565,
- "num_tokens": 5261757.0,
- "step": 574
- },
- {
- "epoch": 0.4369300911854103,
- "grad_norm": 1.786683440208435,
- "learning_rate": 4.876993252773923e-06,
- "loss": 0.4377627968788147,
- "mean_token_accuracy": 0.844936192035675,
- "num_tokens": 5271038.0,
- "step": 575
- },
- {
- "epoch": 0.4376899696048632,
- "grad_norm": 1.3425915241241455,
- "learning_rate": 4.876343546544596e-06,
- "loss": 0.44762521982192993,
- "mean_token_accuracy": 0.8397793769836426,
- "num_tokens": 5285555.0,
- "step": 576
- },
- {
- "epoch": 0.43844984802431614,
- "grad_norm": 2.1549675464630127,
- "learning_rate": 4.8756921725058935e-06,
- "loss": 0.5332942008972168,
- "mean_token_accuracy": 0.820149302482605,
- "num_tokens": 5294595.0,
- "step": 577
- },
- {
- "epoch": 0.439209726443769,
- "grad_norm": 1.5254042148590088,
- "learning_rate": 4.875039131114975e-06,
- "loss": 0.3646543622016907,
- "mean_token_accuracy": 0.8442583084106445,
- "num_tokens": 5304955.0,
- "step": 578
- },
- {
- "epoch": 0.4399696048632219,
- "grad_norm": 1.5751557350158691,
- "learning_rate": 4.8743844228301676e-06,
- "loss": 0.4854734539985657,
- "mean_token_accuracy": 0.8317523002624512,
- "num_tokens": 5317351.0,
- "step": 579
- },
- {
- "epoch": 0.44072948328267475,
- "grad_norm": 1.6950466632843018,
- "learning_rate": 4.873728048110973e-06,
- "loss": 0.5907570719718933,
- "mean_token_accuracy": 0.7946986556053162,
- "num_tokens": 5332542.0,
- "step": 580
- },
- {
- "epoch": 0.44148936170212766,
- "grad_norm": 2.1180708408355713,
- "learning_rate": 4.873070007418059e-06,
- "loss": 0.5220296382904053,
- "mean_token_accuracy": 0.8037363290786743,
- "num_tokens": 5341722.0,
- "step": 581
- },
- {
- "epoch": 0.44224924012158057,
- "grad_norm": 1.3643816709518433,
- "learning_rate": 4.872410301213265e-06,
- "loss": 0.4865502417087555,
- "mean_token_accuracy": 0.8377852439880371,
- "num_tokens": 5359359.0,
- "step": 582
- },
- {
- "epoch": 0.4430091185410334,
- "grad_norm": 1.483280897140503,
- "learning_rate": 4.871748929959598e-06,
- "loss": 0.36856764554977417,
- "mean_token_accuracy": 0.8709549903869629,
- "num_tokens": 5369749.0,
- "step": 583
- },
- {
- "epoch": 0.44376899696048633,
- "grad_norm": 1.6891541481018066,
- "learning_rate": 4.871085894121234e-06,
- "loss": 0.5768930912017822,
- "mean_token_accuracy": 0.8030461668968201,
- "num_tokens": 5383912.0,
- "step": 584
- },
- {
- "epoch": 0.44452887537993924,
- "grad_norm": 2.1318740844726562,
- "learning_rate": 4.870421194163515e-06,
- "loss": 0.4337100386619568,
- "mean_token_accuracy": 0.8562518358230591,
- "num_tokens": 5389412.0,
- "step": 585
- },
- {
- "epoch": 0.4452887537993921,
- "grad_norm": 2.540255546569824,
- "learning_rate": 4.869754830552956e-06,
- "loss": 0.4708256125450134,
- "mean_token_accuracy": 0.8446552753448486,
- "num_tokens": 5394762.0,
- "step": 586
- },
- {
- "epoch": 0.446048632218845,
- "grad_norm": 2.048015594482422,
- "learning_rate": 4.869086803757235e-06,
- "loss": 0.5265800952911377,
- "mean_token_accuracy": 0.8181137442588806,
- "num_tokens": 5402379.0,
- "step": 587
- },
- {
- "epoch": 0.44680851063829785,
- "grad_norm": 2.9821012020111084,
- "learning_rate": 4.868417114245199e-06,
- "loss": 0.6299797296524048,
- "mean_token_accuracy": 0.8237329125404358,
- "num_tokens": 5408229.0,
- "step": 588
- },
- {
- "epoch": 0.44756838905775076,
- "grad_norm": 1.7807202339172363,
- "learning_rate": 4.867745762486862e-06,
- "loss": 0.5176759958267212,
- "mean_token_accuracy": 0.8184244632720947,
- "num_tokens": 5418383.0,
- "step": 589
- },
- {
- "epoch": 0.44832826747720367,
- "grad_norm": 1.5466399192810059,
- "learning_rate": 4.8670727489534035e-06,
- "loss": 0.5137228965759277,
- "mean_token_accuracy": 0.8365053534507751,
- "num_tokens": 5432127.0,
- "step": 590
- },
- {
- "epoch": 0.4490881458966565,
- "grad_norm": 2.9521141052246094,
- "learning_rate": 4.866398074117173e-06,
- "loss": 0.4056887924671173,
- "mean_token_accuracy": 0.8561501502990723,
- "num_tokens": 5436062.0,
- "step": 591
- },
- {
- "epoch": 0.44984802431610943,
- "grad_norm": 2.058743953704834,
- "learning_rate": 4.86572173845168e-06,
- "loss": 0.6124799251556396,
- "mean_token_accuracy": 0.8007957339286804,
- "num_tokens": 5444989.0,
- "step": 592
- },
- {
- "epoch": 0.4506079027355623,
- "grad_norm": 2.1243767738342285,
- "learning_rate": 4.865043742431605e-06,
- "loss": 0.5659694671630859,
- "mean_token_accuracy": 0.8084750175476074,
- "num_tokens": 5453865.0,
- "step": 593
- },
- {
- "epoch": 0.4513677811550152,
- "grad_norm": 1.6732314825057983,
- "learning_rate": 4.864364086532792e-06,
- "loss": 0.47879064083099365,
- "mean_token_accuracy": 0.8346436023712158,
- "num_tokens": 5466398.0,
- "step": 594
- },
- {
- "epoch": 0.4521276595744681,
- "grad_norm": 1.3793858289718628,
- "learning_rate": 4.863682771232249e-06,
- "loss": 0.45989373326301575,
- "mean_token_accuracy": 0.8254791498184204,
- "num_tokens": 5482121.0,
- "step": 595
- },
- {
- "epoch": 0.45288753799392095,
- "grad_norm": 1.9812315702438354,
- "learning_rate": 4.862999797008149e-06,
- "loss": 0.5778874754905701,
- "mean_token_accuracy": 0.8041508197784424,
- "num_tokens": 5493000.0,
- "step": 596
- },
- {
- "epoch": 0.45364741641337386,
- "grad_norm": 3.3065083026885986,
- "learning_rate": 4.862315164339829e-06,
- "loss": 0.4623975157737732,
- "mean_token_accuracy": 0.8426318168640137,
- "num_tokens": 5496723.0,
- "step": 597
- },
- {
- "epoch": 0.45440729483282677,
- "grad_norm": 3.167119026184082,
- "learning_rate": 4.861628873707792e-06,
- "loss": 0.6984533667564392,
- "mean_token_accuracy": 0.772136926651001,
- "num_tokens": 5501161.0,
- "step": 598
- },
- {
- "epoch": 0.4551671732522796,
- "grad_norm": 2.2130985260009766,
- "learning_rate": 4.860940925593703e-06,
- "loss": 0.4823192059993744,
- "mean_token_accuracy": 0.8462972640991211,
- "num_tokens": 5509544.0,
- "step": 599
- },
- {
- "epoch": 0.45592705167173253,
- "grad_norm": 3.029191732406616,
- "learning_rate": 4.86025132048039e-06,
- "loss": 0.523664116859436,
- "mean_token_accuracy": 0.8229140043258667,
- "num_tokens": 5514586.0,
- "step": 600
- },
- {
- "epoch": 0.4566869300911854,
- "grad_norm": 1.6983962059020996,
- "learning_rate": 4.859560058851844e-06,
- "loss": 0.4832698106765747,
- "mean_token_accuracy": 0.8403248190879822,
- "num_tokens": 5525773.0,
- "step": 601
- },
- {
- "epoch": 0.4574468085106383,
- "grad_norm": 3.0504038333892822,
- "learning_rate": 4.8588671411932195e-06,
- "loss": 0.5158926248550415,
- "mean_token_accuracy": 0.8098392486572266,
- "num_tokens": 5529739.0,
- "step": 602
- },
- {
- "epoch": 0.4582066869300912,
- "grad_norm": 2.584836483001709,
- "learning_rate": 4.858172567990832e-06,
- "loss": 0.5724587440490723,
- "mean_token_accuracy": 0.8128519058227539,
- "num_tokens": 5535763.0,
- "step": 603
- },
- {
- "epoch": 0.45896656534954405,
- "grad_norm": 2.0514042377471924,
- "learning_rate": 4.857476339732162e-06,
- "loss": 0.4337679445743561,
- "mean_token_accuracy": 0.8405929207801819,
- "num_tokens": 5543075.0,
- "step": 604
- },
- {
- "epoch": 0.45972644376899696,
- "grad_norm": 2.2949347496032715,
- "learning_rate": 4.856778456905846e-06,
- "loss": 0.46532145142555237,
- "mean_token_accuracy": 0.8345137238502502,
- "num_tokens": 5549035.0,
- "step": 605
- },
- {
- "epoch": 0.46048632218844987,
- "grad_norm": 2.2067551612854004,
- "learning_rate": 4.856078920001689e-06,
- "loss": 0.5855136513710022,
- "mean_token_accuracy": 0.8043795228004456,
- "num_tokens": 5555545.0,
- "step": 606
- },
- {
- "epoch": 0.4612462006079027,
- "grad_norm": 2.101945161819458,
- "learning_rate": 4.855377729510648e-06,
- "loss": 0.6071814298629761,
- "mean_token_accuracy": 0.7973253130912781,
- "num_tokens": 5563615.0,
- "step": 607
- },
- {
- "epoch": 0.46200607902735563,
- "grad_norm": 2.5958821773529053,
- "learning_rate": 4.8546748859248504e-06,
- "loss": 0.6278061866760254,
- "mean_token_accuracy": 0.7864972352981567,
- "num_tokens": 5570078.0,
- "step": 608
- },
- {
- "epoch": 0.4627659574468085,
- "grad_norm": 2.778101921081543,
- "learning_rate": 4.853970389737576e-06,
- "loss": 0.35521194338798523,
- "mean_token_accuracy": 0.8752605319023132,
- "num_tokens": 5573995.0,
- "step": 609
- },
- {
- "epoch": 0.4635258358662614,
- "grad_norm": 2.600534677505493,
- "learning_rate": 4.8532642414432675e-06,
- "loss": 0.6541563868522644,
- "mean_token_accuracy": 0.7843613028526306,
- "num_tokens": 5580333.0,
- "step": 610
- },
- {
- "epoch": 0.4642857142857143,
- "grad_norm": 1.778337836265564,
- "learning_rate": 4.852556441537528e-06,
- "loss": 0.3561405837535858,
- "mean_token_accuracy": 0.8579353094100952,
- "num_tokens": 5588430.0,
- "step": 611
- },
- {
- "epoch": 0.46504559270516715,
- "grad_norm": 1.5653862953186035,
- "learning_rate": 4.851846990517118e-06,
- "loss": 0.6067906618118286,
- "mean_token_accuracy": 0.7919317483901978,
- "num_tokens": 5601700.0,
- "step": 612
- },
- {
- "epoch": 0.46580547112462006,
- "grad_norm": 1.6097723245620728,
- "learning_rate": 4.851135888879958e-06,
- "loss": 0.446664422750473,
- "mean_token_accuracy": 0.8441969156265259,
- "num_tokens": 5612063.0,
- "step": 613
- },
- {
- "epoch": 0.46656534954407297,
- "grad_norm": 1.961207389831543,
- "learning_rate": 4.850423137125126e-06,
- "loss": 0.5508605241775513,
- "mean_token_accuracy": 0.8240450024604797,
- "num_tokens": 5620245.0,
- "step": 614
- },
- {
- "epoch": 0.4673252279635258,
- "grad_norm": 2.2189085483551025,
- "learning_rate": 4.8497087357528585e-06,
- "loss": 0.6805076599121094,
- "mean_token_accuracy": 0.771978497505188,
- "num_tokens": 5629590.0,
- "step": 615
- },
- {
- "epoch": 0.46808510638297873,
- "grad_norm": 2.5176279544830322,
- "learning_rate": 4.8489926852645505e-06,
- "loss": 0.4512156844139099,
- "mean_token_accuracy": 0.836459755897522,
- "num_tokens": 5635259.0,
- "step": 616
- },
- {
- "epoch": 0.4688449848024316,
- "grad_norm": 1.5327287912368774,
- "learning_rate": 4.848274986162754e-06,
- "loss": 0.4884302616119385,
- "mean_token_accuracy": 0.8194037079811096,
- "num_tokens": 5649993.0,
- "step": 617
- },
- {
- "epoch": 0.4696048632218845,
- "grad_norm": 2.184554100036621,
- "learning_rate": 4.847555638951177e-06,
- "loss": 0.5141451358795166,
- "mean_token_accuracy": 0.8245922327041626,
- "num_tokens": 5657375.0,
- "step": 618
- },
- {
- "epoch": 0.4703647416413374,
- "grad_norm": 1.6143407821655273,
- "learning_rate": 4.846834644134686e-06,
- "loss": 0.4276641607284546,
- "mean_token_accuracy": 0.8481845855712891,
- "num_tokens": 5667941.0,
- "step": 619
- },
- {
- "epoch": 0.47112462006079026,
- "grad_norm": 2.3747270107269287,
- "learning_rate": 4.846112002219301e-06,
- "loss": 0.5608246922492981,
- "mean_token_accuracy": 0.8073011040687561,
- "num_tokens": 5675042.0,
- "step": 620
- },
- {
- "epoch": 0.47188449848024316,
- "grad_norm": 2.390404224395752,
- "learning_rate": 4.845387713712203e-06,
- "loss": 0.46616724133491516,
- "mean_token_accuracy": 0.8468319177627563,
- "num_tokens": 5680207.0,
- "step": 621
- },
- {
- "epoch": 0.4726443768996961,
- "grad_norm": 1.7245099544525146,
- "learning_rate": 4.844661779121723e-06,
- "loss": 0.5652435421943665,
- "mean_token_accuracy": 0.8010749816894531,
- "num_tokens": 5693759.0,
- "step": 622
- },
- {
- "epoch": 0.4734042553191489,
- "grad_norm": 2.6923108100891113,
- "learning_rate": 4.843934198957351e-06,
- "loss": 0.6254661679267883,
- "mean_token_accuracy": 0.8236024975776672,
- "num_tokens": 5699916.0,
- "step": 623
- },
- {
- "epoch": 0.47416413373860183,
- "grad_norm": 2.516901969909668,
- "learning_rate": 4.84320497372973e-06,
- "loss": 0.6334252953529358,
- "mean_token_accuracy": 0.7803834676742554,
- "num_tokens": 5706554.0,
- "step": 624
- },
- {
- "epoch": 0.4749240121580547,
- "grad_norm": 2.3744447231292725,
- "learning_rate": 4.842474103950658e-06,
- "loss": 0.4221811890602112,
- "mean_token_accuracy": 0.8639545440673828,
- "num_tokens": 5711756.0,
- "step": 625
- },
- {
- "epoch": 0.4756838905775076,
- "grad_norm": 3.2373476028442383,
- "learning_rate": 4.841741590133089e-06,
- "loss": 0.6637828946113586,
- "mean_token_accuracy": 0.7968347072601318,
- "num_tokens": 5716458.0,
- "step": 626
- },
- {
- "epoch": 0.4764437689969605,
- "grad_norm": 2.153888463973999,
- "learning_rate": 4.841007432791129e-06,
- "loss": 0.4877486228942871,
- "mean_token_accuracy": 0.8345249891281128,
- "num_tokens": 5723155.0,
- "step": 627
- },
- {
- "epoch": 0.47720364741641336,
- "grad_norm": 2.120497703552246,
- "learning_rate": 4.8402716324400375e-06,
- "loss": 0.37323033809661865,
- "mean_token_accuracy": 0.8734050393104553,
- "num_tokens": 5729171.0,
- "step": 628
- },
- {
- "epoch": 0.47796352583586627,
- "grad_norm": 1.5294172763824463,
- "learning_rate": 4.839534189596228e-06,
- "loss": 0.4057067334651947,
- "mean_token_accuracy": 0.8523319959640503,
- "num_tokens": 5740112.0,
- "step": 629
- },
- {
- "epoch": 0.4787234042553192,
- "grad_norm": 2.1913886070251465,
- "learning_rate": 4.8387951047772656e-06,
- "loss": 0.4835960865020752,
- "mean_token_accuracy": 0.8438145518302917,
- "num_tokens": 5746838.0,
- "step": 630
- },
- {
- "epoch": 0.479483282674772,
- "grad_norm": 1.482897162437439,
- "learning_rate": 4.838054378501868e-06,
- "loss": 0.46967992186546326,
- "mean_token_accuracy": 0.8315759897232056,
- "num_tokens": 5760428.0,
- "step": 631
- },
- {
- "epoch": 0.48024316109422494,
- "grad_norm": 1.38850998878479,
- "learning_rate": 4.837312011289907e-06,
- "loss": 0.41845446825027466,
- "mean_token_accuracy": 0.8557186126708984,
- "num_tokens": 5773437.0,
- "step": 632
- },
- {
- "epoch": 0.4810030395136778,
- "grad_norm": 3.8337457180023193,
- "learning_rate": 4.836568003662403e-06,
- "loss": 0.5102912187576294,
- "mean_token_accuracy": 0.830644965171814,
- "num_tokens": 5776367.0,
- "step": 633
- },
- {
- "epoch": 0.4817629179331307,
- "grad_norm": 1.2084007263183594,
- "learning_rate": 4.8358223561415304e-06,
- "loss": 0.3835333585739136,
- "mean_token_accuracy": 0.8639016151428223,
- "num_tokens": 5792246.0,
- "step": 634
- },
- {
- "epoch": 0.4825227963525836,
- "grad_norm": 1.939408540725708,
- "learning_rate": 4.835075069250613e-06,
- "loss": 0.4044850468635559,
- "mean_token_accuracy": 0.8488376140594482,
- "num_tokens": 5799853.0,
- "step": 635
- },
- {
- "epoch": 0.48328267477203646,
- "grad_norm": 1.345870852470398,
- "learning_rate": 4.8343261435141245e-06,
- "loss": 0.46660199761390686,
- "mean_token_accuracy": 0.8371681571006775,
- "num_tokens": 5817478.0,
- "step": 636
- },
- {
- "epoch": 0.48404255319148937,
- "grad_norm": 1.6531339883804321,
- "learning_rate": 4.833575579457691e-06,
- "loss": 0.3886989951133728,
- "mean_token_accuracy": 0.8763507008552551,
- "num_tokens": 5825739.0,
- "step": 637
- },
- {
- "epoch": 0.4848024316109423,
- "grad_norm": 1.6443969011306763,
- "learning_rate": 4.832823377608088e-06,
- "loss": 0.4070289731025696,
- "mean_token_accuracy": 0.8586630821228027,
- "num_tokens": 5837917.0,
- "step": 638
- },
- {
- "epoch": 0.48556231003039513,
- "grad_norm": 2.005136013031006,
- "learning_rate": 4.832069538493237e-06,
- "loss": 0.40616685152053833,
- "mean_token_accuracy": 0.8571510314941406,
- "num_tokens": 5845250.0,
- "step": 639
- },
- {
- "epoch": 0.48632218844984804,
- "grad_norm": 1.5244266986846924,
- "learning_rate": 4.831314062642213e-06,
- "loss": 0.49530288577079773,
- "mean_token_accuracy": 0.8328841924667358,
- "num_tokens": 5857407.0,
- "step": 640
- },
- {
- "epoch": 0.4870820668693009,
- "grad_norm": 1.9876971244812012,
- "learning_rate": 4.830556950585239e-06,
- "loss": 0.4583776593208313,
- "mean_token_accuracy": 0.8427221179008484,
- "num_tokens": 5865391.0,
- "step": 641
- },
- {
- "epoch": 0.4878419452887538,
- "grad_norm": 3.023336172103882,
- "learning_rate": 4.829798202853683e-06,
- "loss": 0.6134771108627319,
- "mean_token_accuracy": 0.7981935739517212,
- "num_tokens": 5870729.0,
- "step": 642
- },
- {
- "epoch": 0.4886018237082067,
- "grad_norm": 1.8889515399932861,
- "learning_rate": 4.829037819980065e-06,
- "loss": 0.4420135021209717,
- "mean_token_accuracy": 0.8480775356292725,
- "num_tokens": 5878982.0,
- "step": 643
- },
- {
- "epoch": 0.48936170212765956,
- "grad_norm": 2.2408435344696045,
- "learning_rate": 4.828275802498051e-06,
- "loss": 0.525706946849823,
- "mean_token_accuracy": 0.8271557092666626,
- "num_tokens": 5885097.0,
- "step": 644
- },
- {
- "epoch": 0.49012158054711247,
- "grad_norm": 1.9734224081039429,
- "learning_rate": 4.827512150942454e-06,
- "loss": 0.44246578216552734,
- "mean_token_accuracy": 0.8456668257713318,
- "num_tokens": 5893941.0,
- "step": 645
- },
- {
- "epoch": 0.4908814589665654,
- "grad_norm": 1.9618173837661743,
- "learning_rate": 4.8267468658492335e-06,
- "loss": 0.5119768381118774,
- "mean_token_accuracy": 0.8355510830879211,
- "num_tokens": 5902829.0,
- "step": 646
- },
- {
- "epoch": 0.49164133738601823,
- "grad_norm": 1.7181587219238281,
- "learning_rate": 4.825979947755496e-06,
- "loss": 0.5666520595550537,
- "mean_token_accuracy": 0.7951971888542175,
- "num_tokens": 5915212.0,
- "step": 647
- },
- {
- "epoch": 0.49240121580547114,
- "grad_norm": 3.0121164321899414,
- "learning_rate": 4.8252113971994955e-06,
- "loss": 0.628632128238678,
- "mean_token_accuracy": 0.8041050434112549,
- "num_tokens": 5921410.0,
- "step": 648
- },
- {
- "epoch": 0.493161094224924,
- "grad_norm": 2.9980475902557373,
- "learning_rate": 4.824441214720629e-06,
- "loss": 0.4507424831390381,
- "mean_token_accuracy": 0.8636263608932495,
- "num_tokens": 5925179.0,
- "step": 649
- },
- {
- "epoch": 0.4939209726443769,
- "grad_norm": 2.0096445083618164,
- "learning_rate": 4.823669400859441e-06,
- "loss": 0.602759838104248,
- "mean_token_accuracy": 0.8104915618896484,
- "num_tokens": 5934160.0,
- "step": 650
- },
- {
- "epoch": 0.4946808510638298,
- "grad_norm": 1.1186442375183105,
- "learning_rate": 4.8228959561576195e-06,
- "loss": 0.41168469190597534,
- "mean_token_accuracy": 0.8461419939994812,
- "num_tokens": 5954163.0,
- "step": 651
- },
- {
- "epoch": 0.49544072948328266,
- "grad_norm": 1.855465054512024,
- "learning_rate": 4.822120881157998e-06,
- "loss": 0.5049735307693481,
- "mean_token_accuracy": 0.8225747346878052,
- "num_tokens": 5963840.0,
- "step": 652
- },
- {
- "epoch": 0.49620060790273557,
- "grad_norm": 3.550563335418701,
- "learning_rate": 4.821344176404554e-06,
- "loss": 0.49025264382362366,
- "mean_token_accuracy": 0.8265978693962097,
- "num_tokens": 5967358.0,
- "step": 653
- },
- {
- "epoch": 0.4969604863221885,
- "grad_norm": 3.063910484313965,
- "learning_rate": 4.820565842442408e-06,
- "loss": 0.5652767419815063,
- "mean_token_accuracy": 0.811700701713562,
- "num_tokens": 5971858.0,
- "step": 654
- },
- {
- "epoch": 0.49772036474164133,
- "grad_norm": 2.4613308906555176,
- "learning_rate": 4.819785879817827e-06,
- "loss": 0.5296125411987305,
- "mean_token_accuracy": 0.8336488008499146,
- "num_tokens": 5977442.0,
- "step": 655
- },
- {
- "epoch": 0.49848024316109424,
- "grad_norm": 2.342519760131836,
- "learning_rate": 4.819004289078217e-06,
- "loss": 0.5753380060195923,
- "mean_token_accuracy": 0.7922406792640686,
- "num_tokens": 5984531.0,
- "step": 656
- },
- {
- "epoch": 0.4992401215805471,
- "grad_norm": 2.0410680770874023,
- "learning_rate": 4.818221070772129e-06,
- "loss": 0.5433275699615479,
- "mean_token_accuracy": 0.8043830990791321,
- "num_tokens": 5992642.0,
- "step": 657
- },
- {
- "epoch": 0.5,
- "grad_norm": 1.4999698400497437,
- "learning_rate": 4.8174362254492555e-06,
- "loss": 0.5248899459838867,
- "mean_token_accuracy": 0.8107168674468994,
- "num_tokens": 6005543.0,
- "step": 658
- },
- {
- "epoch": 0.5007598784194529,
- "grad_norm": 1.9494401216506958,
- "learning_rate": 4.816649753660431e-06,
- "loss": 0.41291385889053345,
- "mean_token_accuracy": 0.8650569915771484,
- "num_tokens": 6012185.0,
- "step": 659
- },
- {
- "epoch": 0.5015197568389058,
- "grad_norm": 2.7514095306396484,
- "learning_rate": 4.815861655957632e-06,
- "loss": 0.4244142770767212,
- "mean_token_accuracy": 0.8485112190246582,
- "num_tokens": 6016809.0,
- "step": 660
- },
- {
- "epoch": 0.5022796352583586,
- "grad_norm": 1.4354928731918335,
- "learning_rate": 4.815071932893976e-06,
- "loss": 0.4332060217857361,
- "mean_token_accuracy": 0.8386815786361694,
- "num_tokens": 6034795.0,
- "step": 661
- },
- {
- "epoch": 0.5030395136778115,
- "grad_norm": 1.3113417625427246,
- "learning_rate": 4.81428058502372e-06,
- "loss": 0.5415540933609009,
- "mean_token_accuracy": 0.8115285038948059,
- "num_tokens": 6053624.0,
- "step": 662
- },
- {
- "epoch": 0.5037993920972644,
- "grad_norm": 1.820868730545044,
- "learning_rate": 4.813487612902265e-06,
- "loss": 0.5360245108604431,
- "mean_token_accuracy": 0.8313555717468262,
- "num_tokens": 6063399.0,
- "step": 663
- },
- {
- "epoch": 0.5045592705167173,
- "grad_norm": 2.347001552581787,
- "learning_rate": 4.812693017086145e-06,
- "loss": 0.4926982820034027,
- "mean_token_accuracy": 0.8137006759643555,
- "num_tokens": 6070111.0,
- "step": 664
- },
- {
- "epoch": 0.5053191489361702,
- "grad_norm": 1.8830888271331787,
- "learning_rate": 4.811896798133042e-06,
- "loss": 0.5419014692306519,
- "mean_token_accuracy": 0.8027454614639282,
- "num_tokens": 6081090.0,
- "step": 665
- },
- {
- "epoch": 0.506079027355623,
- "grad_norm": 2.3258056640625,
- "learning_rate": 4.811098956601772e-06,
- "loss": 0.4629337787628174,
- "mean_token_accuracy": 0.8416580557823181,
- "num_tokens": 6087921.0,
- "step": 666
- },
- {
- "epoch": 0.506838905775076,
- "grad_norm": 1.9578291177749634,
- "learning_rate": 4.810299493052289e-06,
- "loss": 0.40305402874946594,
- "mean_token_accuracy": 0.8529061079025269,
- "num_tokens": 6100034.0,
- "step": 667
- },
- {
- "epoch": 0.5075987841945289,
- "grad_norm": 2.800635576248169,
- "learning_rate": 4.809498408045691e-06,
- "loss": 0.5087342262268066,
- "mean_token_accuracy": 0.8214689493179321,
- "num_tokens": 6104742.0,
- "step": 668
- },
- {
- "epoch": 0.5083586626139818,
- "grad_norm": 1.5318149328231812,
- "learning_rate": 4.808695702144206e-06,
- "loss": 0.4733222723007202,
- "mean_token_accuracy": 0.837577223777771,
- "num_tokens": 6117242.0,
- "step": 669
- },
- {
- "epoch": 0.5091185410334347,
- "grad_norm": 1.2368661165237427,
- "learning_rate": 4.807891375911207e-06,
- "loss": 0.3929097056388855,
- "mean_token_accuracy": 0.8331400752067566,
- "num_tokens": 6133509.0,
- "step": 670
- },
- {
- "epoch": 0.5098784194528876,
- "grad_norm": 2.4711415767669678,
- "learning_rate": 4.8070854299112e-06,
- "loss": 0.6294851303100586,
- "mean_token_accuracy": 0.7956781983375549,
- "num_tokens": 6140294.0,
- "step": 671
- },
- {
- "epoch": 0.5106382978723404,
- "grad_norm": 2.590961217880249,
- "learning_rate": 4.806277864709828e-06,
- "loss": 0.580160915851593,
- "mean_token_accuracy": 0.809589684009552,
- "num_tokens": 6145803.0,
- "step": 672
- },
- {
- "epoch": 0.5113981762917933,
- "grad_norm": 2.4653842449188232,
- "learning_rate": 4.805468680873874e-06,
- "loss": 0.5262120366096497,
- "mean_token_accuracy": 0.822458803653717,
- "num_tokens": 6151236.0,
- "step": 673
- },
- {
- "epoch": 0.5121580547112462,
- "grad_norm": 2.860720157623291,
- "learning_rate": 4.804657878971252e-06,
- "loss": 0.4007391035556793,
- "mean_token_accuracy": 0.8637382984161377,
- "num_tokens": 6155310.0,
- "step": 674
- },
- {
- "epoch": 0.5129179331306991,
- "grad_norm": 2.520282030105591,
- "learning_rate": 4.803845459571014e-06,
- "loss": 0.45798182487487793,
- "mean_token_accuracy": 0.8270114660263062,
- "num_tokens": 6160326.0,
- "step": 675
- },
- {
- "epoch": 0.513677811550152,
- "grad_norm": 2.7290921211242676,
- "learning_rate": 4.803031423243349e-06,
- "loss": 0.5745848417282104,
- "mean_token_accuracy": 0.8401234745979309,
- "num_tokens": 6165709.0,
- "step": 676
- },
- {
- "epoch": 0.5144376899696048,
- "grad_norm": 1.6678650379180908,
- "learning_rate": 4.802215770559578e-06,
- "loss": 0.5257721543312073,
- "mean_token_accuracy": 0.8241991996765137,
- "num_tokens": 6177875.0,
- "step": 677
- },
- {
- "epoch": 0.5151975683890577,
- "grad_norm": 2.1720468997955322,
- "learning_rate": 4.801398502092156e-06,
- "loss": 0.45342206954956055,
- "mean_token_accuracy": 0.8463799953460693,
- "num_tokens": 6185415.0,
- "step": 678
- },
- {
- "epoch": 0.5159574468085106,
- "grad_norm": 2.282259702682495,
- "learning_rate": 4.800579618414677e-06,
- "loss": 0.4864169955253601,
- "mean_token_accuracy": 0.8300632238388062,
- "num_tokens": 6191832.0,
- "step": 679
- },
- {
- "epoch": 0.5167173252279635,
- "grad_norm": 2.0092248916625977,
- "learning_rate": 4.799759120101861e-06,
- "loss": 0.5781463980674744,
- "mean_token_accuracy": 0.8267031908035278,
- "num_tokens": 6199440.0,
- "step": 680
- },
- {
- "epoch": 0.5174772036474165,
- "grad_norm": 1.396580696105957,
- "learning_rate": 4.798937007729568e-06,
- "loss": 0.49689239263534546,
- "mean_token_accuracy": 0.8257499933242798,
- "num_tokens": 6213840.0,
- "step": 681
- },
- {
- "epoch": 0.5182370820668692,
- "grad_norm": 1.9060769081115723,
- "learning_rate": 4.798113281874788e-06,
- "loss": 0.48969539999961853,
- "mean_token_accuracy": 0.8171790838241577,
- "num_tokens": 6223006.0,
- "step": 682
- },
- {
- "epoch": 0.5189969604863222,
- "grad_norm": 1.6255282163619995,
- "learning_rate": 4.797287943115642e-06,
- "loss": 0.5532330870628357,
- "mean_token_accuracy": 0.8173393607139587,
- "num_tokens": 6234857.0,
- "step": 683
- },
- {
- "epoch": 0.5197568389057751,
- "grad_norm": 1.6923905611038208,
- "learning_rate": 4.796460992031386e-06,
- "loss": 0.4880887269973755,
- "mean_token_accuracy": 0.834983229637146,
- "num_tokens": 6245252.0,
- "step": 684
- },
- {
- "epoch": 0.520516717325228,
- "grad_norm": 2.13161301612854,
- "learning_rate": 4.7956324292024045e-06,
- "loss": 0.5687593817710876,
- "mean_token_accuracy": 0.7996571063995361,
- "num_tokens": 6253726.0,
- "step": 685
- },
- {
- "epoch": 0.5212765957446809,
- "grad_norm": 2.509375810623169,
- "learning_rate": 4.794802255210217e-06,
- "loss": 0.5396929979324341,
- "mean_token_accuracy": 0.8007107973098755,
- "num_tokens": 6259238.0,
- "step": 686
- },
- {
- "epoch": 0.5220364741641338,
- "grad_norm": 2.393710136413574,
- "learning_rate": 4.793970470637469e-06,
- "loss": 0.6165191531181335,
- "mean_token_accuracy": 0.7891418933868408,
- "num_tokens": 6266325.0,
- "step": 687
- },
- {
- "epoch": 0.5227963525835866,
- "grad_norm": 1.511647343635559,
- "learning_rate": 4.7931370760679415e-06,
- "loss": 0.4773876965045929,
- "mean_token_accuracy": 0.8381044864654541,
- "num_tokens": 6277447.0,
- "step": 688
- },
- {
- "epoch": 0.5235562310030395,
- "grad_norm": 2.206587314605713,
- "learning_rate": 4.792302072086542e-06,
- "loss": 0.5482058525085449,
- "mean_token_accuracy": 0.8239108920097351,
- "num_tokens": 6285163.0,
- "step": 689
- },
- {
- "epoch": 0.5243161094224924,
- "grad_norm": 3.018146514892578,
- "learning_rate": 4.7914654592793065e-06,
- "loss": 0.4880615472793579,
- "mean_token_accuracy": 0.8361308574676514,
- "num_tokens": 6289386.0,
- "step": 690
- },
- {
- "epoch": 0.5250759878419453,
- "grad_norm": 1.6469231843948364,
- "learning_rate": 4.790627238233405e-06,
- "loss": 0.4164774715900421,
- "mean_token_accuracy": 0.8496290445327759,
- "num_tokens": 6298915.0,
- "step": 691
- },
- {
- "epoch": 0.5258358662613982,
- "grad_norm": 2.352505922317505,
- "learning_rate": 4.789787409537131e-06,
- "loss": 0.5366303324699402,
- "mean_token_accuracy": 0.8350417613983154,
- "num_tokens": 6306130.0,
- "step": 692
- },
- {
- "epoch": 0.526595744680851,
- "grad_norm": 1.7463021278381348,
- "learning_rate": 4.7889459737799105e-06,
- "loss": 0.4389137923717499,
- "mean_token_accuracy": 0.8463300466537476,
- "num_tokens": 6315503.0,
- "step": 693
- },
- {
- "epoch": 0.5273556231003039,
- "grad_norm": 2.257706642150879,
- "learning_rate": 4.788102931552294e-06,
- "loss": 0.5309344530105591,
- "mean_token_accuracy": 0.8164352178573608,
- "num_tokens": 6321852.0,
- "step": 694
- },
- {
- "epoch": 0.5281155015197568,
- "grad_norm": 2.392732620239258,
- "learning_rate": 4.787258283445962e-06,
- "loss": 0.3956204056739807,
- "mean_token_accuracy": 0.8671456575393677,
- "num_tokens": 6327380.0,
- "step": 695
- },
- {
- "epoch": 0.5288753799392097,
- "grad_norm": 2.210514545440674,
- "learning_rate": 4.786412030053721e-06,
- "loss": 0.4842875003814697,
- "mean_token_accuracy": 0.8508446216583252,
- "num_tokens": 6334898.0,
- "step": 696
- },
- {
- "epoch": 0.5296352583586627,
- "grad_norm": 1.8678946495056152,
- "learning_rate": 4.785564171969503e-06,
- "loss": 0.47399595379829407,
- "mean_token_accuracy": 0.8514996767044067,
- "num_tokens": 6346374.0,
- "step": 697
- },
- {
- "epoch": 0.5303951367781155,
- "grad_norm": 2.604079484939575,
- "learning_rate": 4.784714709788368e-06,
- "loss": 0.5950228571891785,
- "mean_token_accuracy": 0.7983481884002686,
- "num_tokens": 6351648.0,
- "step": 698
- },
- {
- "epoch": 0.5311550151975684,
- "grad_norm": 1.662381649017334,
- "learning_rate": 4.783863644106502e-06,
- "loss": 0.41616758704185486,
- "mean_token_accuracy": 0.8554803133010864,
- "num_tokens": 6360506.0,
- "step": 699
- },
- {
- "epoch": 0.5319148936170213,
- "grad_norm": 1.6300342082977295,
- "learning_rate": 4.783010975521216e-06,
- "loss": 0.43029269576072693,
- "mean_token_accuracy": 0.8443028926849365,
- "num_tokens": 6370675.0,
- "step": 700
- },
- {
- "epoch": 0.5326747720364742,
- "grad_norm": 1.731873869895935,
- "learning_rate": 4.782156704630944e-06,
- "loss": 0.4383814334869385,
- "mean_token_accuracy": 0.8443183898925781,
- "num_tokens": 6381803.0,
- "step": 701
- },
- {
- "epoch": 0.5334346504559271,
- "grad_norm": 3.1788413524627686,
- "learning_rate": 4.7813008320352475e-06,
- "loss": 0.32194480299949646,
- "mean_token_accuracy": 0.8870962858200073,
- "num_tokens": 6389263.0,
- "step": 702
- },
- {
- "epoch": 0.53419452887538,
- "grad_norm": 2.099513530731201,
- "learning_rate": 4.78044335833481e-06,
- "loss": 0.36962923407554626,
- "mean_token_accuracy": 0.8661133646965027,
- "num_tokens": 6395589.0,
- "step": 703
- },
- {
- "epoch": 0.5349544072948328,
- "grad_norm": 1.4859435558319092,
- "learning_rate": 4.77958428413144e-06,
- "loss": 0.4619954824447632,
- "mean_token_accuracy": 0.8438555002212524,
- "num_tokens": 6407470.0,
- "step": 704
- },
- {
- "epoch": 0.5357142857142857,
- "grad_norm": 1.2561073303222656,
- "learning_rate": 4.7787236100280685e-06,
- "loss": 0.3770977258682251,
- "mean_token_accuracy": 0.8515733480453491,
- "num_tokens": 6422888.0,
- "step": 705
- },
- {
- "epoch": 0.5364741641337386,
- "grad_norm": 1.4455817937850952,
- "learning_rate": 4.777861336628751e-06,
- "loss": 0.46481069922447205,
- "mean_token_accuracy": 0.8502002954483032,
- "num_tokens": 6441266.0,
- "step": 706
- },
- {
- "epoch": 0.5372340425531915,
- "grad_norm": 1.1387295722961426,
- "learning_rate": 4.7769974645386616e-06,
- "loss": 0.36964765191078186,
- "mean_token_accuracy": 0.8719524145126343,
- "num_tokens": 6463686.0,
- "step": 707
- },
- {
- "epoch": 0.5379939209726444,
- "grad_norm": 1.7179663181304932,
- "learning_rate": 4.776131994364102e-06,
- "loss": 0.4231719970703125,
- "mean_token_accuracy": 0.8416585922241211,
- "num_tokens": 6472956.0,
- "step": 708
- },
- {
- "epoch": 0.5387537993920972,
- "grad_norm": 1.6328502893447876,
- "learning_rate": 4.775264926712489e-06,
- "loss": 0.5836569666862488,
- "mean_token_accuracy": 0.8039724230766296,
- "num_tokens": 6485773.0,
- "step": 709
- },
- {
- "epoch": 0.5395136778115501,
- "grad_norm": 1.8515360355377197,
- "learning_rate": 4.774396262192368e-06,
- "loss": 0.5477553009986877,
- "mean_token_accuracy": 0.8136521577835083,
- "num_tokens": 6496379.0,
- "step": 710
- },
- {
- "epoch": 0.540273556231003,
- "grad_norm": 1.741858959197998,
- "learning_rate": 4.7735260014133986e-06,
- "loss": 0.4663267731666565,
- "mean_token_accuracy": 0.8473691940307617,
- "num_tokens": 6507652.0,
- "step": 711
- },
- {
- "epoch": 0.541033434650456,
- "grad_norm": 1.7516659498214722,
- "learning_rate": 4.772654144986364e-06,
- "loss": 0.374914288520813,
- "mean_token_accuracy": 0.8600220680236816,
- "num_tokens": 6519030.0,
- "step": 712
- },
- {
- "epoch": 0.5417933130699089,
- "grad_norm": 2.662343978881836,
- "learning_rate": 4.7717806935231665e-06,
- "loss": 0.4206875264644623,
- "mean_token_accuracy": 0.8544126749038696,
- "num_tokens": 6523669.0,
- "step": 713
- },
- {
- "epoch": 0.5425531914893617,
- "grad_norm": 1.4088834524154663,
- "learning_rate": 4.770905647636828e-06,
- "loss": 0.5824331045150757,
- "mean_token_accuracy": 0.7857901453971863,
- "num_tokens": 6540560.0,
- "step": 714
- },
- {
- "epoch": 0.5433130699088146,
- "grad_norm": 2.173656940460205,
- "learning_rate": 4.77002900794149e-06,
- "loss": 0.555023729801178,
- "mean_token_accuracy": 0.8067290782928467,
- "num_tokens": 6548946.0,
- "step": 715
- },
- {
- "epoch": 0.5440729483282675,
- "grad_norm": 2.121018648147583,
- "learning_rate": 4.769150775052411e-06,
- "loss": 0.559730052947998,
- "mean_token_accuracy": 0.8166372776031494,
- "num_tokens": 6556065.0,
- "step": 716
- },
- {
- "epoch": 0.5448328267477204,
- "grad_norm": 3.335866928100586,
- "learning_rate": 4.768270949585968e-06,
- "loss": 0.6442267894744873,
- "mean_token_accuracy": 0.7858607769012451,
- "num_tokens": 6560615.0,
- "step": 717
- },
- {
- "epoch": 0.5455927051671733,
- "grad_norm": 2.3813695907592773,
- "learning_rate": 4.767389532159659e-06,
- "loss": 0.4027421474456787,
- "mean_token_accuracy": 0.8635619282722473,
- "num_tokens": 6565841.0,
- "step": 718
- },
- {
- "epoch": 0.5463525835866262,
- "grad_norm": 2.0657708644866943,
- "learning_rate": 4.766506523392095e-06,
- "loss": 0.38899827003479004,
- "mean_token_accuracy": 0.8660480380058289,
- "num_tokens": 6572362.0,
- "step": 719
- },
- {
- "epoch": 0.547112462006079,
- "grad_norm": 1.093705415725708,
- "learning_rate": 4.765621923903005e-06,
- "loss": 0.45967352390289307,
- "mean_token_accuracy": 0.8338102102279663,
- "num_tokens": 6595998.0,
- "step": 720
- },
- {
- "epoch": 0.5478723404255319,
- "grad_norm": 2.942065954208374,
- "learning_rate": 4.764735734313236e-06,
- "loss": 0.42910510301589966,
- "mean_token_accuracy": 0.8406122922897339,
- "num_tokens": 6601075.0,
- "step": 721
- },
- {
- "epoch": 0.5486322188449848,
- "grad_norm": 2.049011707305908,
- "learning_rate": 4.763847955244749e-06,
- "loss": 0.5584231615066528,
- "mean_token_accuracy": 0.8171684741973877,
- "num_tokens": 6609310.0,
- "step": 722
- },
- {
- "epoch": 0.5493920972644377,
- "grad_norm": 2.485543966293335,
- "learning_rate": 4.762958587320623e-06,
- "loss": 0.5396170020103455,
- "mean_token_accuracy": 0.8158525824546814,
- "num_tokens": 6616185.0,
- "step": 723
- },
- {
- "epoch": 0.5501519756838906,
- "grad_norm": 1.87015962600708,
- "learning_rate": 4.762067631165049e-06,
- "loss": 0.49739527702331543,
- "mean_token_accuracy": 0.8303765654563904,
- "num_tokens": 6625629.0,
- "step": 724
- },
- {
- "epoch": 0.5509118541033434,
- "grad_norm": 4.239654541015625,
- "learning_rate": 4.761175087403336e-06,
- "loss": 0.6029239296913147,
- "mean_token_accuracy": 0.8123486042022705,
- "num_tokens": 6629194.0,
- "step": 725
- },
- {
- "epoch": 0.5516717325227963,
- "grad_norm": 2.0134730339050293,
- "learning_rate": 4.760280956661904e-06,
- "loss": 0.4777873754501343,
- "mean_token_accuracy": 0.8283513784408569,
- "num_tokens": 6636929.0,
- "step": 726
- },
- {
- "epoch": 0.5524316109422492,
- "grad_norm": 1.991780400276184,
- "learning_rate": 4.75938523956829e-06,
- "loss": 0.4631248116493225,
- "mean_token_accuracy": 0.8275107741355896,
- "num_tokens": 6645135.0,
- "step": 727
- },
- {
- "epoch": 0.5531914893617021,
- "grad_norm": 1.423792839050293,
- "learning_rate": 4.75848793675114e-06,
- "loss": 0.49630722403526306,
- "mean_token_accuracy": 0.8388000130653381,
- "num_tokens": 6662690.0,
- "step": 728
- },
- {
- "epoch": 0.5539513677811551,
- "grad_norm": 2.345294952392578,
- "learning_rate": 4.757589048840219e-06,
- "loss": 0.37830638885498047,
- "mean_token_accuracy": 0.8782080411911011,
- "num_tokens": 6667285.0,
- "step": 729
- },
- {
- "epoch": 0.5547112462006079,
- "grad_norm": 2.7452144622802734,
- "learning_rate": 4.756688576466398e-06,
- "loss": 0.51595538854599,
- "mean_token_accuracy": 0.8441770672798157,
- "num_tokens": 6672324.0,
- "step": 730
- },
- {
- "epoch": 0.5554711246200608,
- "grad_norm": 1.5247859954833984,
- "learning_rate": 4.755786520261666e-06,
- "loss": 0.48365193605422974,
- "mean_token_accuracy": 0.8276445269584656,
- "num_tokens": 6685296.0,
- "step": 731
- },
- {
- "epoch": 0.5562310030395137,
- "grad_norm": 1.4018276929855347,
- "learning_rate": 4.75488288085912e-06,
- "loss": 0.3876481354236603,
- "mean_token_accuracy": 0.8612343072891235,
- "num_tokens": 6697515.0,
- "step": 732
- },
- {
- "epoch": 0.5569908814589666,
- "grad_norm": 2.9570324420928955,
- "learning_rate": 4.753977658892967e-06,
- "loss": 0.5468149185180664,
- "mean_token_accuracy": 0.8054271340370178,
- "num_tokens": 6702194.0,
- "step": 733
- },
- {
- "epoch": 0.5577507598784195,
- "grad_norm": 1.9282715320587158,
- "learning_rate": 4.753070854998529e-06,
- "loss": 0.4758574962615967,
- "mean_token_accuracy": 0.8379775285720825,
- "num_tokens": 6709938.0,
- "step": 734
- },
- {
- "epoch": 0.5585106382978723,
- "grad_norm": 1.981264591217041,
- "learning_rate": 4.752162469812234e-06,
- "loss": 0.48461222648620605,
- "mean_token_accuracy": 0.833509087562561,
- "num_tokens": 6718125.0,
- "step": 735
- },
- {
- "epoch": 0.5592705167173252,
- "grad_norm": 1.1643427610397339,
- "learning_rate": 4.751252503971624e-06,
- "loss": 0.410121887922287,
- "mean_token_accuracy": 0.8221402764320374,
- "num_tokens": 6735125.0,
- "step": 736
- },
- {
- "epoch": 0.5600303951367781,
- "grad_norm": 1.786566972732544,
- "learning_rate": 4.750340958115346e-06,
- "loss": 0.5964341163635254,
- "mean_token_accuracy": 0.8038164377212524,
- "num_tokens": 6747369.0,
- "step": 737
- },
- {
- "epoch": 0.560790273556231,
- "grad_norm": 1.7256991863250732,
- "learning_rate": 4.749427832883158e-06,
- "loss": 0.48737066984176636,
- "mean_token_accuracy": 0.830894947052002,
- "num_tokens": 6758115.0,
- "step": 738
- },
- {
- "epoch": 0.5615501519756839,
- "grad_norm": 1.997747540473938,
- "learning_rate": 4.748513128915928e-06,
- "loss": 0.5238886475563049,
- "mean_token_accuracy": 0.8066858053207397,
- "num_tokens": 6766111.0,
- "step": 739
- },
- {
- "epoch": 0.5623100303951368,
- "grad_norm": 2.127016305923462,
- "learning_rate": 4.747596846855629e-06,
- "loss": 0.5045586228370667,
- "mean_token_accuracy": 0.821424126625061,
- "num_tokens": 6772893.0,
- "step": 740
- },
- {
- "epoch": 0.5630699088145896,
- "grad_norm": 1.7664796113967896,
- "learning_rate": 4.7466789873453446e-06,
- "loss": 0.42954835295677185,
- "mean_token_accuracy": 0.8533384799957275,
- "num_tokens": 6785133.0,
- "step": 741
- },
- {
- "epoch": 0.5638297872340425,
- "grad_norm": 1.4987404346466064,
- "learning_rate": 4.7457595510292615e-06,
- "loss": 0.5378558039665222,
- "mean_token_accuracy": 0.8184819221496582,
- "num_tokens": 6799563.0,
- "step": 742
- },
- {
- "epoch": 0.5645896656534954,
- "grad_norm": 1.4444655179977417,
- "learning_rate": 4.744838538552678e-06,
- "loss": 0.42193782329559326,
- "mean_token_accuracy": 0.837514340877533,
- "num_tokens": 6812470.0,
- "step": 743
- },
- {
- "epoch": 0.5653495440729484,
- "grad_norm": 3.867751121520996,
- "learning_rate": 4.7439159505619946e-06,
- "loss": 0.4457814693450928,
- "mean_token_accuracy": 0.8630104660987854,
- "num_tokens": 6815652.0,
- "step": 744
- },
- {
- "epoch": 0.5661094224924013,
- "grad_norm": 2.1250710487365723,
- "learning_rate": 4.74299178770472e-06,
- "loss": 0.5638922452926636,
- "mean_token_accuracy": 0.7969781160354614,
- "num_tokens": 6824566.0,
- "step": 745
- },
- {
- "epoch": 0.5668693009118541,
- "grad_norm": 2.547072410583496,
- "learning_rate": 4.742066050629465e-06,
- "loss": 0.5516207814216614,
- "mean_token_accuracy": 0.8160669803619385,
- "num_tokens": 6830589.0,
- "step": 746
- },
- {
- "epoch": 0.567629179331307,
- "grad_norm": 1.2975233793258667,
- "learning_rate": 4.741138739985951e-06,
- "loss": 0.3823344111442566,
- "mean_token_accuracy": 0.8668368458747864,
- "num_tokens": 6842707.0,
- "step": 747
- },
- {
- "epoch": 0.5683890577507599,
- "grad_norm": 1.3410450220108032,
- "learning_rate": 4.740209856424998e-06,
- "loss": 0.5148671269416809,
- "mean_token_accuracy": 0.8188045024871826,
- "num_tokens": 6857624.0,
- "step": 748
- },
- {
- "epoch": 0.5691489361702128,
- "grad_norm": 1.219467282295227,
- "learning_rate": 4.7392794005985324e-06,
- "loss": 0.3998957872390747,
- "mean_token_accuracy": 0.855175256729126,
- "num_tokens": 6875064.0,
- "step": 749
- },
- {
- "epoch": 0.5699088145896657,
- "grad_norm": 1.3530343770980835,
- "learning_rate": 4.738347373159585e-06,
- "loss": 0.5359633564949036,
- "mean_token_accuracy": 0.8178457021713257,
- "num_tokens": 6890911.0,
- "step": 750
- },
- {
- "epoch": 0.5706686930091185,
- "grad_norm": 2.146988868713379,
- "learning_rate": 4.737413774762287e-06,
- "loss": 0.4460008144378662,
- "mean_token_accuracy": 0.8172903060913086,
- "num_tokens": 6896959.0,
- "step": 751
- },
- {
- "epoch": 0.5714285714285714,
- "grad_norm": 1.456023097038269,
- "learning_rate": 4.736478606061876e-06,
- "loss": 0.43616920709609985,
- "mean_token_accuracy": 0.8465108871459961,
- "num_tokens": 6908904.0,
- "step": 752
- },
- {
- "epoch": 0.5721884498480243,
- "grad_norm": 2.9696967601776123,
- "learning_rate": 4.735541867714687e-06,
- "loss": 0.43464532494544983,
- "mean_token_accuracy": 0.8608652353286743,
- "num_tokens": 6913026.0,
- "step": 753
- },
- {
- "epoch": 0.5729483282674772,
- "grad_norm": 2.2990667819976807,
- "learning_rate": 4.73460356037816e-06,
- "loss": 0.6619116067886353,
- "mean_token_accuracy": 0.7821142673492432,
- "num_tokens": 6920588.0,
- "step": 754
- },
- {
- "epoch": 0.5737082066869301,
- "grad_norm": 2.054746389389038,
- "learning_rate": 4.733663684710835e-06,
- "loss": 0.5304250717163086,
- "mean_token_accuracy": 0.8265531063079834,
- "num_tokens": 6928910.0,
- "step": 755
- },
- {
- "epoch": 0.574468085106383,
- "grad_norm": 2.0050594806671143,
- "learning_rate": 4.732722241372354e-06,
- "loss": 0.6393026113510132,
- "mean_token_accuracy": 0.796819806098938,
- "num_tokens": 6940217.0,
- "step": 756
- },
- {
- "epoch": 0.5752279635258358,
- "grad_norm": 1.4285320043563843,
- "learning_rate": 4.731779231023456e-06,
- "loss": 0.5432837009429932,
- "mean_token_accuracy": 0.8104778528213501,
- "num_tokens": 6959101.0,
- "step": 757
- },
- {
- "epoch": 0.5759878419452887,
- "grad_norm": 2.3941943645477295,
- "learning_rate": 4.730834654325984e-06,
- "loss": 0.46550673246383667,
- "mean_token_accuracy": 0.8444503545761108,
- "num_tokens": 6965036.0,
- "step": 758
- },
- {
- "epoch": 0.5767477203647416,
- "grad_norm": 2.3850574493408203,
- "learning_rate": 4.729888511942877e-06,
- "loss": 0.4916389584541321,
- "mean_token_accuracy": 0.8228527307510376,
- "num_tokens": 6971184.0,
- "step": 759
- },
- {
- "epoch": 0.5775075987841946,
- "grad_norm": 1.627480149269104,
- "learning_rate": 4.728940804538176e-06,
- "loss": 0.5863215923309326,
- "mean_token_accuracy": 0.7995302677154541,
- "num_tokens": 6982569.0,
- "step": 760
- },
- {
- "epoch": 0.5782674772036475,
- "grad_norm": 1.1723195314407349,
- "learning_rate": 4.727991532777016e-06,
- "loss": 0.36908864974975586,
- "mean_token_accuracy": 0.8355655670166016,
- "num_tokens": 6998659.0,
- "step": 761
- },
- {
- "epoch": 0.5790273556231003,
- "grad_norm": 1.5324925184249878,
- "learning_rate": 4.727040697325634e-06,
- "loss": 0.557658851146698,
- "mean_token_accuracy": 0.8141458034515381,
- "num_tokens": 7012969.0,
- "step": 762
- },
- {
- "epoch": 0.5797872340425532,
- "grad_norm": 2.4106390476226807,
- "learning_rate": 4.726088298851362e-06,
- "loss": 0.5004243850708008,
- "mean_token_accuracy": 0.8376860618591309,
- "num_tokens": 7018301.0,
- "step": 763
- },
- {
- "epoch": 0.5805471124620061,
- "grad_norm": 2.2594921588897705,
- "learning_rate": 4.725134338022631e-06,
- "loss": 0.6067016124725342,
- "mean_token_accuracy": 0.8100241422653198,
- "num_tokens": 7025201.0,
- "step": 764
- },
- {
- "epoch": 0.581306990881459,
- "grad_norm": 1.4649826288223267,
- "learning_rate": 4.724178815508967e-06,
- "loss": 0.36200693249702454,
- "mean_token_accuracy": 0.8621826171875,
- "num_tokens": 7035112.0,
- "step": 765
- },
- {
- "epoch": 0.5820668693009119,
- "grad_norm": 2.3634560108184814,
- "learning_rate": 4.723221731980993e-06,
- "loss": 0.41862213611602783,
- "mean_token_accuracy": 0.8541463613510132,
- "num_tokens": 7040339.0,
- "step": 766
- },
- {
- "epoch": 0.5828267477203647,
- "grad_norm": 2.7798104286193848,
- "learning_rate": 4.722263088110426e-06,
- "loss": 0.4647108018398285,
- "mean_token_accuracy": 0.8505672216415405,
- "num_tokens": 7044880.0,
- "step": 767
- },
- {
- "epoch": 0.5835866261398176,
- "grad_norm": 2.070528507232666,
- "learning_rate": 4.721302884570079e-06,
- "loss": 0.5147565007209778,
- "mean_token_accuracy": 0.8113877773284912,
- "num_tokens": 7052433.0,
- "step": 768
- },
- {
- "epoch": 0.5843465045592705,
- "grad_norm": 2.1953284740448,
- "learning_rate": 4.720341122033862e-06,
- "loss": 0.5075466632843018,
- "mean_token_accuracy": 0.8474211096763611,
- "num_tokens": 7058686.0,
- "step": 769
- },
- {
- "epoch": 0.5851063829787234,
- "grad_norm": 1.9287755489349365,
- "learning_rate": 4.719377801176774e-06,
- "loss": 0.5382202863693237,
- "mean_token_accuracy": 0.8148090243339539,
- "num_tokens": 7067538.0,
- "step": 770
- },
- {
- "epoch": 0.5858662613981763,
- "grad_norm": 1.5574456453323364,
- "learning_rate": 4.718412922674913e-06,
- "loss": 0.43406790494918823,
- "mean_token_accuracy": 0.8477081060409546,
- "num_tokens": 7077853.0,
- "step": 771
- },
- {
- "epoch": 0.5866261398176292,
- "grad_norm": 1.5490336418151855,
- "learning_rate": 4.717446487205466e-06,
- "loss": 0.43164271116256714,
- "mean_token_accuracy": 0.8504570126533508,
- "num_tokens": 7091728.0,
- "step": 772
- },
- {
- "epoch": 0.587386018237082,
- "grad_norm": 1.6945984363555908,
- "learning_rate": 4.716478495446717e-06,
- "loss": 0.5153743624687195,
- "mean_token_accuracy": 0.8213579058647156,
- "num_tokens": 7108680.0,
- "step": 773
- },
- {
- "epoch": 0.5881458966565349,
- "grad_norm": 2.2633883953094482,
- "learning_rate": 4.715508948078037e-06,
- "loss": 0.45254790782928467,
- "mean_token_accuracy": 0.8392219543457031,
- "num_tokens": 7115546.0,
- "step": 774
- },
- {
- "epoch": 0.5889057750759878,
- "grad_norm": 1.5731090307235718,
- "learning_rate": 4.714537845779894e-06,
- "loss": 0.38678881525993347,
- "mean_token_accuracy": 0.8800252676010132,
- "num_tokens": 7126360.0,
- "step": 775
- },
- {
- "epoch": 0.5896656534954408,
- "grad_norm": 2.4873392581939697,
- "learning_rate": 4.7135651892338445e-06,
- "loss": 0.5190927386283875,
- "mean_token_accuracy": 0.8145407438278198,
- "num_tokens": 7135705.0,
- "step": 776
- },
- {
- "epoch": 0.5904255319148937,
- "grad_norm": 1.2931004762649536,
- "learning_rate": 4.712590979122534e-06,
- "loss": 0.3686544895172119,
- "mean_token_accuracy": 0.8720537424087524,
- "num_tokens": 7150688.0,
- "step": 777
- },
- {
- "epoch": 0.5911854103343465,
- "grad_norm": 1.6353671550750732,
- "learning_rate": 4.7116152161297045e-06,
- "loss": 0.49065062403678894,
- "mean_token_accuracy": 0.8203760385513306,
- "num_tokens": 7161040.0,
- "step": 778
- },
- {
- "epoch": 0.5919452887537994,
- "grad_norm": 1.2345483303070068,
- "learning_rate": 4.710637900940181e-06,
- "loss": 0.4004976451396942,
- "mean_token_accuracy": 0.8302007913589478,
- "num_tokens": 7178074.0,
- "step": 779
- },
- {
- "epoch": 0.5927051671732523,
- "grad_norm": 2.2506837844848633,
- "learning_rate": 4.7096590342398825e-06,
- "loss": 0.45142874121665955,
- "mean_token_accuracy": 0.8481036424636841,
- "num_tokens": 7184153.0,
- "step": 780
- },
- {
- "epoch": 0.5934650455927052,
- "grad_norm": 1.420479416847229,
- "learning_rate": 4.708678616715815e-06,
- "loss": 0.4802100360393524,
- "mean_token_accuracy": 0.8586992025375366,
- "num_tokens": 7202810.0,
- "step": 781
- },
- {
- "epoch": 0.5942249240121581,
- "grad_norm": 3.457632303237915,
- "learning_rate": 4.707696649056073e-06,
- "loss": 0.5265094041824341,
- "mean_token_accuracy": 0.8260114192962646,
- "num_tokens": 7206396.0,
- "step": 782
- },
- {
- "epoch": 0.5949848024316109,
- "grad_norm": 1.1592093706130981,
- "learning_rate": 4.706713131949839e-06,
- "loss": 0.3708173632621765,
- "mean_token_accuracy": 0.8476542234420776,
- "num_tokens": 7225034.0,
- "step": 783
- },
- {
- "epoch": 0.5957446808510638,
- "grad_norm": 1.6761400699615479,
- "learning_rate": 4.705728066087384e-06,
- "loss": 0.4137252867221832,
- "mean_token_accuracy": 0.8462049961090088,
- "num_tokens": 7237101.0,
- "step": 784
- },
- {
- "epoch": 0.5965045592705167,
- "grad_norm": 2.320185422897339,
- "learning_rate": 4.704741452160064e-06,
- "loss": 0.5157154202461243,
- "mean_token_accuracy": 0.8391785621643066,
- "num_tokens": 7243826.0,
- "step": 785
- },
- {
- "epoch": 0.5972644376899696,
- "grad_norm": 2.079423427581787,
- "learning_rate": 4.703753290860323e-06,
- "loss": 0.4734993278980255,
- "mean_token_accuracy": 0.8353281021118164,
- "num_tokens": 7250175.0,
- "step": 786
- },
- {
- "epoch": 0.5980243161094225,
- "grad_norm": 1.8215159177780151,
- "learning_rate": 4.702763582881692e-06,
- "loss": 0.520193338394165,
- "mean_token_accuracy": 0.844062864780426,
- "num_tokens": 7258868.0,
- "step": 787
- },
- {
- "epoch": 0.5987841945288754,
- "grad_norm": 1.3823071718215942,
- "learning_rate": 4.701772328918784e-06,
- "loss": 0.4177844822406769,
- "mean_token_accuracy": 0.8363165259361267,
- "num_tokens": 7271744.0,
- "step": 788
- },
- {
- "epoch": 0.5995440729483282,
- "grad_norm": 2.4749298095703125,
- "learning_rate": 4.700779529667301e-06,
- "loss": 0.5115069150924683,
- "mean_token_accuracy": 0.8473520278930664,
- "num_tokens": 7277040.0,
- "step": 789
- },
- {
- "epoch": 0.6003039513677811,
- "grad_norm": 1.7072296142578125,
- "learning_rate": 4.699785185824026e-06,
- "loss": 0.5265800952911377,
- "mean_token_accuracy": 0.8161447048187256,
- "num_tokens": 7288288.0,
- "step": 790
- },
- {
- "epoch": 0.601063829787234,
- "grad_norm": 1.6479384899139404,
- "learning_rate": 4.69878929808683e-06,
- "loss": 0.4445168972015381,
- "mean_token_accuracy": 0.8381255865097046,
- "num_tokens": 7298640.0,
- "step": 791
- },
- {
- "epoch": 0.601823708206687,
- "grad_norm": 1.9095896482467651,
- "learning_rate": 4.6977918671546635e-06,
- "loss": 0.5841238498687744,
- "mean_token_accuracy": 0.7971454858779907,
- "num_tokens": 7307220.0,
- "step": 792
- },
- {
- "epoch": 0.6025835866261399,
- "grad_norm": 1.9614146947860718,
- "learning_rate": 4.696792893727562e-06,
- "loss": 0.34684082865715027,
- "mean_token_accuracy": 0.8739526271820068,
- "num_tokens": 7313875.0,
- "step": 793
- },
- {
- "epoch": 0.6033434650455927,
- "grad_norm": 2.015570640563965,
- "learning_rate": 4.695792378506645e-06,
- "loss": 0.42779117822647095,
- "mean_token_accuracy": 0.8625012636184692,
- "num_tokens": 7321439.0,
- "step": 794
- },
- {
- "epoch": 0.6041033434650456,
- "grad_norm": 2.8581228256225586,
- "learning_rate": 4.694790322194111e-06,
- "loss": 0.6519991159439087,
- "mean_token_accuracy": 0.7629562616348267,
- "num_tokens": 7326916.0,
- "step": 795
- },
- {
- "epoch": 0.6048632218844985,
- "grad_norm": 2.482715368270874,
- "learning_rate": 4.693786725493242e-06,
- "loss": 0.532963216304779,
- "mean_token_accuracy": 0.832184910774231,
- "num_tokens": 7333311.0,
- "step": 796
- },
- {
- "epoch": 0.6056231003039514,
- "grad_norm": 1.6076741218566895,
- "learning_rate": 4.692781589108402e-06,
- "loss": 0.43381205201148987,
- "mean_token_accuracy": 0.8402494192123413,
- "num_tokens": 7343731.0,
- "step": 797
- },
- {
- "epoch": 0.6063829787234043,
- "grad_norm": 2.2133216857910156,
- "learning_rate": 4.691774913745033e-06,
- "loss": 0.4380851089954376,
- "mean_token_accuracy": 0.8600908517837524,
- "num_tokens": 7350224.0,
- "step": 798
- },
- {
- "epoch": 0.6071428571428571,
- "grad_norm": 2.046280860900879,
- "learning_rate": 4.690766700109659e-06,
- "loss": 0.3821919560432434,
- "mean_token_accuracy": 0.8691814541816711,
- "num_tokens": 7356717.0,
- "step": 799
- },
- {
- "epoch": 0.60790273556231,
- "grad_norm": 1.8482693433761597,
- "learning_rate": 4.689756948909884e-06,
- "loss": 0.5217651128768921,
- "mean_token_accuracy": 0.803473711013794,
- "num_tokens": 7365806.0,
- "step": 800
- },
- {
- "epoch": 0.6086626139817629,
- "grad_norm": 2.192134141921997,
- "learning_rate": 4.688745660854388e-06,
- "loss": 0.573980987071991,
- "mean_token_accuracy": 0.8198676109313965,
- "num_tokens": 7380281.0,
- "step": 801
- },
- {
- "epoch": 0.6094224924012158,
- "grad_norm": 2.363626718521118,
- "learning_rate": 4.687732836652935e-06,
- "loss": 0.5204599499702454,
- "mean_token_accuracy": 0.8373252153396606,
- "num_tokens": 7386938.0,
- "step": 802
- },
- {
- "epoch": 0.6101823708206687,
- "grad_norm": 1.9320523738861084,
- "learning_rate": 4.686718477016361e-06,
- "loss": 0.47316622734069824,
- "mean_token_accuracy": 0.830596923828125,
- "num_tokens": 7395069.0,
- "step": 803
- },
- {
- "epoch": 0.6109422492401215,
- "grad_norm": 2.6573057174682617,
- "learning_rate": 4.6857025826565845e-06,
- "loss": 0.5495861768722534,
- "mean_token_accuracy": 0.8187421560287476,
- "num_tokens": 7400563.0,
- "step": 804
- },
- {
- "epoch": 0.6117021276595744,
- "grad_norm": 2.0893123149871826,
- "learning_rate": 4.684685154286599e-06,
- "loss": 0.5362675786018372,
- "mean_token_accuracy": 0.8394701480865479,
- "num_tokens": 7406973.0,
- "step": 805
- },
- {
- "epoch": 0.6124620060790273,
- "grad_norm": 2.455130100250244,
- "learning_rate": 4.683666192620474e-06,
- "loss": 0.5405995845794678,
- "mean_token_accuracy": 0.8079100847244263,
- "num_tokens": 7412931.0,
- "step": 806
- },
- {
- "epoch": 0.6132218844984803,
- "grad_norm": 2.311915636062622,
- "learning_rate": 4.682645698373357e-06,
- "loss": 0.5395106077194214,
- "mean_token_accuracy": 0.8156260251998901,
- "num_tokens": 7419699.0,
- "step": 807
- },
- {
- "epoch": 0.6139817629179332,
- "grad_norm": 1.686838984489441,
- "learning_rate": 4.6816236722614694e-06,
- "loss": 0.6034521460533142,
- "mean_token_accuracy": 0.7855954170227051,
- "num_tokens": 7431899.0,
- "step": 808
- },
- {
- "epoch": 0.6147416413373861,
- "grad_norm": 1.682759165763855,
- "learning_rate": 4.680600115002109e-06,
- "loss": 0.48593831062316895,
- "mean_token_accuracy": 0.8229435682296753,
- "num_tokens": 7443187.0,
- "step": 809
- },
- {
- "epoch": 0.6155015197568389,
- "grad_norm": 2.064589738845825,
- "learning_rate": 4.679575027313649e-06,
- "loss": 0.5098468661308289,
- "mean_token_accuracy": 0.8234638571739197,
- "num_tokens": 7450868.0,
- "step": 810
- },
- {
- "epoch": 0.6162613981762918,
- "grad_norm": 2.2063486576080322,
- "learning_rate": 4.6785484099155324e-06,
- "loss": 0.5138497352600098,
- "mean_token_accuracy": 0.8152111172676086,
- "num_tokens": 7457176.0,
- "step": 811
- },
- {
- "epoch": 0.6170212765957447,
- "grad_norm": 1.6258726119995117,
- "learning_rate": 4.67752026352828e-06,
- "loss": 0.4064181447029114,
- "mean_token_accuracy": 0.8720619678497314,
- "num_tokens": 7466557.0,
- "step": 812
- },
- {
- "epoch": 0.6177811550151976,
- "grad_norm": 2.3309383392333984,
- "learning_rate": 4.676490588873486e-06,
- "loss": 0.5180112719535828,
- "mean_token_accuracy": 0.8233879804611206,
- "num_tokens": 7472650.0,
- "step": 813
- },
- {
- "epoch": 0.6185410334346505,
- "grad_norm": 1.4545246362686157,
- "learning_rate": 4.675459386673815e-06,
- "loss": 0.37917959690093994,
- "mean_token_accuracy": 0.8598103523254395,
- "num_tokens": 7485171.0,
- "step": 814
- },
- {
- "epoch": 0.6193009118541033,
- "grad_norm": 2.654231071472168,
- "learning_rate": 4.674426657653003e-06,
- "loss": 0.554074227809906,
- "mean_token_accuracy": 0.8026446104049683,
- "num_tokens": 7490787.0,
- "step": 815
- },
- {
- "epoch": 0.6200607902735562,
- "grad_norm": 1.5543994903564453,
- "learning_rate": 4.67339240253586e-06,
- "loss": 0.6335440278053284,
- "mean_token_accuracy": 0.783241868019104,
- "num_tokens": 7505975.0,
- "step": 816
- },
- {
- "epoch": 0.6208206686930091,
- "grad_norm": 2.079998016357422,
- "learning_rate": 4.672356622048266e-06,
- "loss": 0.5169394016265869,
- "mean_token_accuracy": 0.8088761568069458,
- "num_tokens": 7513470.0,
- "step": 817
- },
- {
- "epoch": 0.621580547112462,
- "grad_norm": 1.5971896648406982,
- "learning_rate": 4.671319316917172e-06,
- "loss": 0.44588586688041687,
- "mean_token_accuracy": 0.8518649339675903,
- "num_tokens": 7524352.0,
- "step": 818
- },
- {
- "epoch": 0.6223404255319149,
- "grad_norm": 2.477579116821289,
- "learning_rate": 4.670280487870599e-06,
- "loss": 0.5713893175125122,
- "mean_token_accuracy": 0.8116940259933472,
- "num_tokens": 7530359.0,
- "step": 819
- },
- {
- "epoch": 0.6231003039513677,
- "grad_norm": 2.066211700439453,
- "learning_rate": 4.669240135637635e-06,
- "loss": 0.5295331478118896,
- "mean_token_accuracy": 0.819536566734314,
- "num_tokens": 7536963.0,
- "step": 820
- },
- {
- "epoch": 0.6238601823708206,
- "grad_norm": 2.1217997074127197,
- "learning_rate": 4.668198260948442e-06,
- "loss": 0.6146406531333923,
- "mean_token_accuracy": 0.7932635545730591,
- "num_tokens": 7545800.0,
- "step": 821
- },
- {
- "epoch": 0.6246200607902735,
- "grad_norm": 2.0173542499542236,
- "learning_rate": 4.667154864534245e-06,
- "loss": 0.6240535974502563,
- "mean_token_accuracy": 0.7883644104003906,
- "num_tokens": 7556165.0,
- "step": 822
- },
- {
- "epoch": 0.6253799392097265,
- "grad_norm": 2.014526128768921,
- "learning_rate": 4.666109947127343e-06,
- "loss": 0.40367332100868225,
- "mean_token_accuracy": 0.8653522729873657,
- "num_tokens": 7562665.0,
- "step": 823
- },
- {
- "epoch": 0.6261398176291794,
- "grad_norm": 2.5078861713409424,
- "learning_rate": 4.665063509461098e-06,
- "loss": 0.5903617739677429,
- "mean_token_accuracy": 0.7902897596359253,
- "num_tokens": 7568922.0,
- "step": 824
- },
- {
- "epoch": 0.6268996960486323,
- "grad_norm": 2.454622745513916,
- "learning_rate": 4.664015552269938e-06,
- "loss": 0.5238361358642578,
- "mean_token_accuracy": 0.838546872138977,
- "num_tokens": 7575965.0,
- "step": 825
- },
- {
- "epoch": 0.6276595744680851,
- "grad_norm": 2.920919418334961,
- "learning_rate": 4.662966076289363e-06,
- "loss": 0.5028782486915588,
- "mean_token_accuracy": 0.8311152458190918,
- "num_tokens": 7580193.0,
- "step": 826
- },
- {
- "epoch": 0.628419452887538,
- "grad_norm": 1.545382022857666,
- "learning_rate": 4.661915082255932e-06,
- "loss": 0.4817378520965576,
- "mean_token_accuracy": 0.8373227119445801,
- "num_tokens": 7593024.0,
- "step": 827
- },
- {
- "epoch": 0.6291793313069909,
- "grad_norm": 1.5152469873428345,
- "learning_rate": 4.6608625709072766e-06,
- "loss": 0.4693033695220947,
- "mean_token_accuracy": 0.8150848150253296,
- "num_tokens": 7606459.0,
- "step": 828
- },
- {
- "epoch": 0.6299392097264438,
- "grad_norm": 2.1310224533081055,
- "learning_rate": 4.659808542982089e-06,
- "loss": 0.4653395414352417,
- "mean_token_accuracy": 0.8286294341087341,
- "num_tokens": 7613036.0,
- "step": 829
- },
- {
- "epoch": 0.6306990881458967,
- "grad_norm": 2.1949679851531982,
- "learning_rate": 4.658752999220125e-06,
- "loss": 0.3698633909225464,
- "mean_token_accuracy": 0.871590793132782,
- "num_tokens": 7618527.0,
- "step": 830
- },
- {
- "epoch": 0.6314589665653495,
- "grad_norm": 2.2770416736602783,
- "learning_rate": 4.657695940362207e-06,
- "loss": 0.5202419757843018,
- "mean_token_accuracy": 0.817577600479126,
- "num_tokens": 7624459.0,
- "step": 831
- },
- {
- "epoch": 0.6322188449848024,
- "grad_norm": 1.402042269706726,
- "learning_rate": 4.65663736715022e-06,
- "loss": 0.51531583070755,
- "mean_token_accuracy": 0.8228116631507874,
- "num_tokens": 7639371.0,
- "step": 832
- },
- {
- "epoch": 0.6329787234042553,
- "grad_norm": 3.3554883003234863,
- "learning_rate": 4.65557728032711e-06,
- "loss": 0.6771188378334045,
- "mean_token_accuracy": 0.7880028486251831,
- "num_tokens": 7643924.0,
- "step": 833
- },
- {
- "epoch": 0.6337386018237082,
- "grad_norm": 2.081040143966675,
- "learning_rate": 4.654515680636888e-06,
- "loss": 0.5712796449661255,
- "mean_token_accuracy": 0.8177868127822876,
- "num_tokens": 7651881.0,
- "step": 834
- },
- {
- "epoch": 0.6344984802431611,
- "grad_norm": 0.9128716588020325,
- "learning_rate": 4.653452568824625e-06,
- "loss": 0.3423936069011688,
- "mean_token_accuracy": 0.8782886266708374,
- "num_tokens": 7677829.0,
- "step": 835
- },
- {
- "epoch": 0.6352583586626139,
- "grad_norm": 3.49015736579895,
- "learning_rate": 4.652387945636454e-06,
- "loss": 0.34657734632492065,
- "mean_token_accuracy": 0.8770567178726196,
- "num_tokens": 7680796.0,
- "step": 836
- },
- {
- "epoch": 0.6360182370820668,
- "grad_norm": 2.026247501373291,
- "learning_rate": 4.651321811819568e-06,
- "loss": 0.5098431706428528,
- "mean_token_accuracy": 0.8216961622238159,
- "num_tokens": 7688746.0,
- "step": 837
- },
- {
- "epoch": 0.6367781155015197,
- "grad_norm": 2.444343090057373,
- "learning_rate": 4.650254168122222e-06,
- "loss": 0.5490090250968933,
- "mean_token_accuracy": 0.8092857599258423,
- "num_tokens": 7695220.0,
- "step": 838
- },
- {
- "epoch": 0.6375379939209727,
- "grad_norm": 2.0171122550964355,
- "learning_rate": 4.649185015293728e-06,
- "loss": 0.47221142053604126,
- "mean_token_accuracy": 0.8514408469200134,
- "num_tokens": 7702759.0,
- "step": 839
- },
- {
- "epoch": 0.6382978723404256,
- "grad_norm": 1.9800984859466553,
- "learning_rate": 4.64811435408446e-06,
- "loss": 0.5238803625106812,
- "mean_token_accuracy": 0.8479194641113281,
- "num_tokens": 7714017.0,
- "step": 840
- },
- {
- "epoch": 0.6390577507598785,
- "grad_norm": 3.0674357414245605,
- "learning_rate": 4.647042185245848e-06,
- "loss": 0.4668245315551758,
- "mean_token_accuracy": 0.8381714820861816,
- "num_tokens": 7717801.0,
- "step": 841
- },
- {
- "epoch": 0.6398176291793313,
- "grad_norm": 1.5672820806503296,
- "learning_rate": 4.645968509530381e-06,
- "loss": 0.4428741931915283,
- "mean_token_accuracy": 0.8416479825973511,
- "num_tokens": 7728342.0,
- "step": 842
- },
- {
- "epoch": 0.6405775075987842,
- "grad_norm": 2.3042354583740234,
- "learning_rate": 4.644893327691608e-06,
- "loss": 0.49937760829925537,
- "mean_token_accuracy": 0.827070951461792,
- "num_tokens": 7734576.0,
- "step": 843
- },
- {
- "epoch": 0.6413373860182371,
- "grad_norm": 2.057772159576416,
- "learning_rate": 4.6438166404841316e-06,
- "loss": 0.5912986993789673,
- "mean_token_accuracy": 0.805509090423584,
- "num_tokens": 7742481.0,
- "step": 844
- },
- {
- "epoch": 0.64209726443769,
- "grad_norm": 1.9688186645507812,
- "learning_rate": 4.6427384486636115e-06,
- "loss": 0.482401967048645,
- "mean_token_accuracy": 0.8358086347579956,
- "num_tokens": 7750002.0,
- "step": 845
- },
- {
- "epoch": 0.6428571428571429,
- "grad_norm": 2.6852948665618896,
- "learning_rate": 4.6416587529867665e-06,
- "loss": 0.5479315519332886,
- "mean_token_accuracy": 0.8091106414794922,
- "num_tokens": 7755578.0,
- "step": 846
- },
- {
- "epoch": 0.6436170212765957,
- "grad_norm": 2.0547337532043457,
- "learning_rate": 4.640577554211366e-06,
- "loss": 0.5327274203300476,
- "mean_token_accuracy": 0.8280376195907593,
- "num_tokens": 7763513.0,
- "step": 847
- },
- {
- "epoch": 0.6443768996960486,
- "grad_norm": 2.0328633785247803,
- "learning_rate": 4.63949485309624e-06,
- "loss": 0.4814409613609314,
- "mean_token_accuracy": 0.8527672290802002,
- "num_tokens": 7771131.0,
- "step": 848
- },
- {
- "epoch": 0.6451367781155015,
- "grad_norm": 1.5892863273620605,
- "learning_rate": 4.638410650401267e-06,
- "loss": 0.4492785334587097,
- "mean_token_accuracy": 0.846997857093811,
- "num_tokens": 7781572.0,
- "step": 849
- },
- {
- "epoch": 0.6458966565349544,
- "grad_norm": 1.8295910358428955,
- "learning_rate": 4.637324946887384e-06,
- "loss": 0.37088239192962646,
- "mean_token_accuracy": 0.8616628646850586,
- "num_tokens": 7788604.0,
- "step": 850
- },
- {
- "epoch": 0.6466565349544073,
- "grad_norm": 3.380040168762207,
- "learning_rate": 4.636237743316578e-06,
- "loss": 0.4737280607223511,
- "mean_token_accuracy": 0.855940580368042,
- "num_tokens": 7792504.0,
- "step": 851
- },
- {
- "epoch": 0.6474164133738601,
- "grad_norm": 2.8790009021759033,
- "learning_rate": 4.635149040451891e-06,
- "loss": 0.39790448546409607,
- "mean_token_accuracy": 0.8710698485374451,
- "num_tokens": 7796333.0,
- "step": 852
- },
- {
- "epoch": 0.648176291793313,
- "grad_norm": 1.914914608001709,
- "learning_rate": 4.634058839057417e-06,
- "loss": 0.2954312562942505,
- "mean_token_accuracy": 0.8880234956741333,
- "num_tokens": 7802456.0,
- "step": 853
- },
- {
- "epoch": 0.648936170212766,
- "grad_norm": 1.3709120750427246,
- "learning_rate": 4.632967139898301e-06,
- "loss": 0.43224576115608215,
- "mean_token_accuracy": 0.8446190357208252,
- "num_tokens": 7816770.0,
- "step": 854
- },
- {
- "epoch": 0.6496960486322189,
- "grad_norm": 1.6579312086105347,
- "learning_rate": 4.63187394374074e-06,
- "loss": 0.3535553514957428,
- "mean_token_accuracy": 0.8738704919815063,
- "num_tokens": 7824963.0,
- "step": 855
- },
- {
- "epoch": 0.6504559270516718,
- "grad_norm": 2.4055678844451904,
- "learning_rate": 4.63077925135198e-06,
- "loss": 0.5078744292259216,
- "mean_token_accuracy": 0.8430874347686768,
- "num_tokens": 7830962.0,
- "step": 856
- },
- {
- "epoch": 0.6512158054711246,
- "grad_norm": 2.5171499252319336,
- "learning_rate": 4.629683063500319e-06,
- "loss": 0.5172419548034668,
- "mean_token_accuracy": 0.8087141513824463,
- "num_tokens": 7836638.0,
- "step": 857
- },
- {
- "epoch": 0.6519756838905775,
- "grad_norm": 1.7588486671447754,
- "learning_rate": 4.628585380955104e-06,
- "loss": 0.5759496092796326,
- "mean_token_accuracy": 0.8043236136436462,
- "num_tokens": 7844654.0,
- "step": 858
- },
- {
- "epoch": 0.6527355623100304,
- "grad_norm": 1.5887070894241333,
- "learning_rate": 4.62748620448673e-06,
- "loss": 0.41849038004875183,
- "mean_token_accuracy": 0.8556643724441528,
- "num_tokens": 7855642.0,
- "step": 859
- },
- {
- "epoch": 0.6534954407294833,
- "grad_norm": 3.227942705154419,
- "learning_rate": 4.626385534866642e-06,
- "loss": 0.5279449224472046,
- "mean_token_accuracy": 0.8250958323478699,
- "num_tokens": 7859890.0,
- "step": 860
- },
- {
- "epoch": 0.6542553191489362,
- "grad_norm": 2.440467119216919,
- "learning_rate": 4.625283372867333e-06,
- "loss": 0.5294933319091797,
- "mean_token_accuracy": 0.8235013484954834,
- "num_tokens": 7866766.0,
- "step": 861
- },
- {
- "epoch": 0.6550151975683891,
- "grad_norm": 2.4106903076171875,
- "learning_rate": 4.624179719262342e-06,
- "loss": 0.5662813186645508,
- "mean_token_accuracy": 0.8061668872833252,
- "num_tokens": 7872809.0,
- "step": 862
- },
- {
- "epoch": 0.6557750759878419,
- "grad_norm": 3.5151145458221436,
- "learning_rate": 4.623074574826254e-06,
- "loss": 0.5471097230911255,
- "mean_token_accuracy": 0.8220691084861755,
- "num_tokens": 7876136.0,
- "step": 863
- },
- {
- "epoch": 0.6565349544072948,
- "grad_norm": 1.5319840908050537,
- "learning_rate": 4.621967940334705e-06,
- "loss": 0.4178982377052307,
- "mean_token_accuracy": 0.8517135977745056,
- "num_tokens": 7886113.0,
- "step": 864
- },
- {
- "epoch": 0.6572948328267477,
- "grad_norm": 1.63701331615448,
- "learning_rate": 4.620859816564371e-06,
- "loss": 0.4666512608528137,
- "mean_token_accuracy": 0.8223508596420288,
- "num_tokens": 7897982.0,
- "step": 865
- },
- {
- "epoch": 0.6580547112462006,
- "grad_norm": 2.1515414714813232,
- "learning_rate": 4.619750204292978e-06,
- "loss": 0.5359305143356323,
- "mean_token_accuracy": 0.8192868232727051,
- "num_tokens": 7904947.0,
- "step": 866
- },
- {
- "epoch": 0.6588145896656535,
- "grad_norm": 2.2140955924987793,
- "learning_rate": 4.618639104299294e-06,
- "loss": 0.5275633931159973,
- "mean_token_accuracy": 0.8120715618133545,
- "num_tokens": 7913913.0,
- "step": 867
- },
- {
- "epoch": 0.6595744680851063,
- "grad_norm": 1.3956893682479858,
- "learning_rate": 4.6175265173631304e-06,
- "loss": 0.4378768503665924,
- "mean_token_accuracy": 0.8479125499725342,
- "num_tokens": 7927979.0,
- "step": 868
- },
- {
- "epoch": 0.6603343465045592,
- "grad_norm": 2.98103928565979,
- "learning_rate": 4.616412444265344e-06,
- "loss": 0.42614591121673584,
- "mean_token_accuracy": 0.8595094680786133,
- "num_tokens": 7934293.0,
- "step": 869
- },
- {
- "epoch": 0.6610942249240122,
- "grad_norm": 2.554845094680786,
- "learning_rate": 4.6152968857878365e-06,
- "loss": 0.3698030412197113,
- "mean_token_accuracy": 0.8717041015625,
- "num_tokens": 7938547.0,
- "step": 870
- },
- {
- "epoch": 0.6618541033434651,
- "grad_norm": 3.0901825428009033,
- "learning_rate": 4.6141798427135475e-06,
- "loss": 0.5037497282028198,
- "mean_token_accuracy": 0.8354041576385498,
- "num_tokens": 7942829.0,
- "step": 871
- },
- {
- "epoch": 0.662613981762918,
- "grad_norm": 2.8692073822021484,
- "learning_rate": 4.6130613158264605e-06,
- "loss": 0.5418164134025574,
- "mean_token_accuracy": 0.8298909664154053,
- "num_tokens": 7949303.0,
- "step": 872
- },
- {
- "epoch": 0.6633738601823708,
- "grad_norm": 3.960404396057129,
- "learning_rate": 4.611941305911602e-06,
- "loss": 0.6284480094909668,
- "mean_token_accuracy": 0.837495744228363,
- "num_tokens": 7952486.0,
- "step": 873
- },
- {
- "epoch": 0.6641337386018237,
- "grad_norm": 2.6690115928649902,
- "learning_rate": 4.610819813755038e-06,
- "loss": 0.5214360952377319,
- "mean_token_accuracy": 0.8213508129119873,
- "num_tokens": 7957559.0,
- "step": 874
- },
- {
- "epoch": 0.6648936170212766,
- "grad_norm": 2.3376171588897705,
- "learning_rate": 4.609696840143875e-06,
- "loss": 0.46887528896331787,
- "mean_token_accuracy": 0.8438819646835327,
- "num_tokens": 7962826.0,
- "step": 875
- },
- {
- "epoch": 0.6656534954407295,
- "grad_norm": 2.2222683429718018,
- "learning_rate": 4.6085723858662575e-06,
- "loss": 0.5607719421386719,
- "mean_token_accuracy": 0.8128405809402466,
- "num_tokens": 7970131.0,
- "step": 876
- },
- {
- "epoch": 0.6664133738601824,
- "grad_norm": 2.069091558456421,
- "learning_rate": 4.607446451711372e-06,
- "loss": 0.506301760673523,
- "mean_token_accuracy": 0.8256827592849731,
- "num_tokens": 7977524.0,
- "step": 877
- },
- {
- "epoch": 0.6671732522796353,
- "grad_norm": 1.3724967241287231,
- "learning_rate": 4.606319038469443e-06,
- "loss": 0.43285101652145386,
- "mean_token_accuracy": 0.8525032997131348,
- "num_tokens": 7989174.0,
- "step": 878
- },
- {
- "epoch": 0.6679331306990881,
- "grad_norm": 2.278205156326294,
- "learning_rate": 4.605190146931731e-06,
- "loss": 0.4845905303955078,
- "mean_token_accuracy": 0.8284652829170227,
- "num_tokens": 7998524.0,
- "step": 879
- },
- {
- "epoch": 0.668693009118541,
- "grad_norm": 1.3871766328811646,
- "learning_rate": 4.604059777890537e-06,
- "loss": 0.5736679434776306,
- "mean_token_accuracy": 0.8223285675048828,
- "num_tokens": 8015776.0,
- "step": 880
- },
- {
- "epoch": 0.6694528875379939,
- "grad_norm": 1.926164984703064,
- "learning_rate": 4.602927932139197e-06,
- "loss": 0.4133230447769165,
- "mean_token_accuracy": 0.8653768301010132,
- "num_tokens": 8022979.0,
- "step": 881
- },
- {
- "epoch": 0.6702127659574468,
- "grad_norm": 2.109272003173828,
- "learning_rate": 4.601794610472083e-06,
- "loss": 0.7005600929260254,
- "mean_token_accuracy": 0.7777010202407837,
- "num_tokens": 8032618.0,
- "step": 882
- },
- {
- "epoch": 0.6709726443768997,
- "grad_norm": 2.077977418899536,
- "learning_rate": 4.6006598136846056e-06,
- "loss": 0.5278208255767822,
- "mean_token_accuracy": 0.8230358958244324,
- "num_tokens": 8040534.0,
- "step": 883
- },
- {
- "epoch": 0.6717325227963525,
- "grad_norm": 1.678581714630127,
- "learning_rate": 4.599523542573207e-06,
- "loss": 0.4955351650714874,
- "mean_token_accuracy": 0.8270003795623779,
- "num_tokens": 8052249.0,
- "step": 884
- },
- {
- "epoch": 0.6724924012158054,
- "grad_norm": 2.0751662254333496,
- "learning_rate": 4.598385797935368e-06,
- "loss": 0.5266247987747192,
- "mean_token_accuracy": 0.8263581991195679,
- "num_tokens": 8060600.0,
- "step": 885
- },
- {
- "epoch": 0.6732522796352584,
- "grad_norm": 2.418405771255493,
- "learning_rate": 4.5972465805696e-06,
- "loss": 0.4481425881385803,
- "mean_token_accuracy": 0.846164345741272,
- "num_tokens": 8066025.0,
- "step": 886
- },
- {
- "epoch": 0.6740121580547113,
- "grad_norm": 2.3936474323272705,
- "learning_rate": 4.596105891275449e-06,
- "loss": 0.4553404450416565,
- "mean_token_accuracy": 0.8412896394729614,
- "num_tokens": 8071544.0,
- "step": 887
- },
- {
- "epoch": 0.6747720364741642,
- "grad_norm": 2.2024407386779785,
- "learning_rate": 4.594963730853497e-06,
- "loss": 0.6218541860580444,
- "mean_token_accuracy": 0.7890232801437378,
- "num_tokens": 8079061.0,
- "step": 888
- },
- {
- "epoch": 0.675531914893617,
- "grad_norm": 2.51015567779541,
- "learning_rate": 4.593820100105355e-06,
- "loss": 0.5149124264717102,
- "mean_token_accuracy": 0.8241918087005615,
- "num_tokens": 8084293.0,
- "step": 889
- },
- {
- "epoch": 0.6762917933130699,
- "grad_norm": 1.8748939037322998,
- "learning_rate": 4.5926749998336665e-06,
- "loss": 0.50836181640625,
- "mean_token_accuracy": 0.8067223429679871,
- "num_tokens": 8092511.0,
- "step": 890
- },
- {
- "epoch": 0.6770516717325228,
- "grad_norm": 1.801193118095398,
- "learning_rate": 4.5915284308421075e-06,
- "loss": 0.4372861683368683,
- "mean_token_accuracy": 0.8510604500770569,
- "num_tokens": 8101174.0,
- "step": 891
- },
- {
- "epoch": 0.6778115501519757,
- "grad_norm": 2.6476457118988037,
- "learning_rate": 4.590380393935383e-06,
- "loss": 0.38700711727142334,
- "mean_token_accuracy": 0.8659796714782715,
- "num_tokens": 8105398.0,
- "step": 892
- },
- {
- "epoch": 0.6785714285714286,
- "grad_norm": 1.1147183179855347,
- "learning_rate": 4.589230889919232e-06,
- "loss": 0.38546115159988403,
- "mean_token_accuracy": 0.8570581674575806,
- "num_tokens": 8127394.0,
- "step": 893
- },
- {
- "epoch": 0.6793313069908815,
- "grad_norm": 2.908905506134033,
- "learning_rate": 4.588079919600419e-06,
- "loss": 0.5108504295349121,
- "mean_token_accuracy": 0.8121406435966492,
- "num_tokens": 8131801.0,
- "step": 894
- },
- {
- "epoch": 0.6800911854103343,
- "grad_norm": 3.1522326469421387,
- "learning_rate": 4.586927483786739e-06,
- "loss": 0.44059112668037415,
- "mean_token_accuracy": 0.8448011875152588,
- "num_tokens": 8154416.0,
- "step": 895
- },
- {
- "epoch": 0.6808510638297872,
- "grad_norm": 1.5142440795898438,
- "learning_rate": 4.585773583287017e-06,
- "loss": 0.513217568397522,
- "mean_token_accuracy": 0.8386049270629883,
- "num_tokens": 8171156.0,
- "step": 896
- },
- {
- "epoch": 0.6816109422492401,
- "grad_norm": 2.597881317138672,
- "learning_rate": 4.584618218911104e-06,
- "loss": 0.4937712550163269,
- "mean_token_accuracy": 0.8223681449890137,
- "num_tokens": 8176124.0,
- "step": 897
- },
- {
- "epoch": 0.682370820668693,
- "grad_norm": 1.8185619115829468,
- "learning_rate": 4.583461391469879e-06,
- "loss": 0.519811749458313,
- "mean_token_accuracy": 0.8169777393341064,
- "num_tokens": 8185136.0,
- "step": 898
- },
- {
- "epoch": 0.6831306990881459,
- "grad_norm": 3.2061994075775146,
- "learning_rate": 4.582303101775249e-06,
- "loss": 0.4655115008354187,
- "mean_token_accuracy": 0.8425977230072021,
- "num_tokens": 8188864.0,
- "step": 899
- },
- {
- "epoch": 0.6838905775075987,
- "grad_norm": 1.3485229015350342,
- "learning_rate": 4.581143350640146e-06,
- "loss": 0.5014470815658569,
- "mean_token_accuracy": 0.8273109197616577,
- "num_tokens": 8203460.0,
- "step": 900
- },
- {
- "epoch": 0.6846504559270516,
- "grad_norm": 1.3264713287353516,
- "learning_rate": 4.579982138878527e-06,
- "loss": 0.5073703527450562,
- "mean_token_accuracy": 0.8259357213973999,
- "num_tokens": 8219348.0,
- "step": 901
- },
- {
- "epoch": 0.6854103343465046,
- "grad_norm": 2.4436347484588623,
- "learning_rate": 4.578819467305375e-06,
- "loss": 0.47020310163497925,
- "mean_token_accuracy": 0.8567265272140503,
- "num_tokens": 8224427.0,
- "step": 902
- },
- {
- "epoch": 0.6861702127659575,
- "grad_norm": 1.921749234199524,
- "learning_rate": 4.5776553367367e-06,
- "loss": 0.622514009475708,
- "mean_token_accuracy": 0.7863982319831848,
- "num_tokens": 8233151.0,
- "step": 903
- },
- {
- "epoch": 0.6869300911854104,
- "grad_norm": 1.8815616369247437,
- "learning_rate": 4.576489747989532e-06,
- "loss": 0.4910545349121094,
- "mean_token_accuracy": 0.8147122859954834,
- "num_tokens": 8240762.0,
- "step": 904
- },
- {
- "epoch": 0.6876899696048632,
- "grad_norm": 1.2366989850997925,
- "learning_rate": 4.575322701881926e-06,
- "loss": 0.3947566747665405,
- "mean_token_accuracy": 0.873993992805481,
- "num_tokens": 8259381.0,
- "step": 905
- },
- {
- "epoch": 0.6884498480243161,
- "grad_norm": 1.5767735242843628,
- "learning_rate": 4.57415419923296e-06,
- "loss": 0.57136070728302,
- "mean_token_accuracy": 0.8028088808059692,
- "num_tokens": 8273296.0,
- "step": 906
- },
- {
- "epoch": 0.689209726443769,
- "grad_norm": 2.378675699234009,
- "learning_rate": 4.572984240862733e-06,
- "loss": 0.5894849896430969,
- "mean_token_accuracy": 0.7977708578109741,
- "num_tokens": 8280083.0,
- "step": 907
- },
- {
- "epoch": 0.6899696048632219,
- "grad_norm": 2.0401132106781006,
- "learning_rate": 4.57181282759237e-06,
- "loss": 0.5524613261222839,
- "mean_token_accuracy": 0.8138598203659058,
- "num_tokens": 8288236.0,
- "step": 908
- },
- {
- "epoch": 0.6907294832826748,
- "grad_norm": 2.293701648712158,
- "learning_rate": 4.570639960244011e-06,
- "loss": 0.5154546499252319,
- "mean_token_accuracy": 0.8234660625457764,
- "num_tokens": 8294493.0,
- "step": 909
- },
- {
- "epoch": 0.6914893617021277,
- "grad_norm": 1.9286527633666992,
- "learning_rate": 4.56946563964082e-06,
- "loss": 0.5364264845848083,
- "mean_token_accuracy": 0.8147368431091309,
- "num_tokens": 8303441.0,
- "step": 910
- },
- {
- "epoch": 0.6922492401215805,
- "grad_norm": 1.2571251392364502,
- "learning_rate": 4.5682898666069815e-06,
- "loss": 0.43535223603248596,
- "mean_token_accuracy": 0.859239935874939,
- "num_tokens": 8321548.0,
- "step": 911
- },
- {
- "epoch": 0.6930091185410334,
- "grad_norm": 1.2224860191345215,
- "learning_rate": 4.567112641967697e-06,
- "loss": 0.40205076336860657,
- "mean_token_accuracy": 0.8724711537361145,
- "num_tokens": 8335205.0,
- "step": 912
- },
- {
- "epoch": 0.6937689969604863,
- "grad_norm": 1.2064491510391235,
- "learning_rate": 4.5659339665491894e-06,
- "loss": 0.37790587544441223,
- "mean_token_accuracy": 0.8464339971542358,
- "num_tokens": 8350926.0,
- "step": 913
- },
- {
- "epoch": 0.6945288753799392,
- "grad_norm": 2.1755270957946777,
- "learning_rate": 4.5647538411786965e-06,
- "loss": 0.42034298181533813,
- "mean_token_accuracy": 0.84148108959198,
- "num_tokens": 8356739.0,
- "step": 914
- },
- {
- "epoch": 0.6952887537993921,
- "grad_norm": 1.234864592552185,
- "learning_rate": 4.563572266684478e-06,
- "loss": 0.5062938332557678,
- "mean_token_accuracy": 0.8132052421569824,
- "num_tokens": 8373660.0,
- "step": 915
- },
- {
- "epoch": 0.6960486322188449,
- "grad_norm": 2.4250621795654297,
- "learning_rate": 4.562389243895807e-06,
- "loss": 0.4907791018486023,
- "mean_token_accuracy": 0.8337979912757874,
- "num_tokens": 8378661.0,
- "step": 916
- },
- {
- "epoch": 0.6968085106382979,
- "grad_norm": 1.5018314123153687,
- "learning_rate": 4.561204773642974e-06,
- "loss": 0.41041281819343567,
- "mean_token_accuracy": 0.8569784164428711,
- "num_tokens": 8390322.0,
- "step": 917
- },
- {
- "epoch": 0.6975683890577508,
- "grad_norm": 2.797269344329834,
- "learning_rate": 4.5600188567572874e-06,
- "loss": 0.3146931529045105,
- "mean_token_accuracy": 0.8913302421569824,
- "num_tokens": 8393567.0,
- "step": 918
- },
- {
- "epoch": 0.6983282674772037,
- "grad_norm": 1.4002827405929565,
- "learning_rate": 4.558831494071069e-06,
- "loss": 0.4275597333908081,
- "mean_token_accuracy": 0.8504893779754639,
- "num_tokens": 8407119.0,
- "step": 919
- },
- {
- "epoch": 0.6990881458966566,
- "grad_norm": 1.7045831680297852,
- "learning_rate": 4.557642686417654e-06,
- "loss": 0.49593430757522583,
- "mean_token_accuracy": 0.8185091018676758,
- "num_tokens": 8417408.0,
- "step": 920
- },
- {
- "epoch": 0.6998480243161094,
- "grad_norm": 2.8818066120147705,
- "learning_rate": 4.556452434631396e-06,
- "loss": 0.637908935546875,
- "mean_token_accuracy": 0.7883946895599365,
- "num_tokens": 8422319.0,
- "step": 921
- },
- {
- "epoch": 0.7006079027355623,
- "grad_norm": 2.3587265014648438,
- "learning_rate": 4.555260739547657e-06,
- "loss": 0.38749319314956665,
- "mean_token_accuracy": 0.8774704933166504,
- "num_tokens": 8427315.0,
- "step": 922
- },
- {
- "epoch": 0.7013677811550152,
- "grad_norm": 1.6648749113082886,
- "learning_rate": 4.554067602002815e-06,
- "loss": 0.4044865369796753,
- "mean_token_accuracy": 0.8524141311645508,
- "num_tokens": 8438662.0,
- "step": 923
- },
- {
- "epoch": 0.7021276595744681,
- "grad_norm": 3.467787742614746,
- "learning_rate": 4.55287302283426e-06,
- "loss": 0.591016411781311,
- "mean_token_accuracy": 0.81184983253479,
- "num_tokens": 8442237.0,
- "step": 924
- },
- {
- "epoch": 0.702887537993921,
- "grad_norm": 2.1458635330200195,
- "learning_rate": 4.551677002880395e-06,
- "loss": 0.5017476677894592,
- "mean_token_accuracy": 0.822914183139801,
- "num_tokens": 8449494.0,
- "step": 925
- },
- {
- "epoch": 0.7036474164133738,
- "grad_norm": 2.521714448928833,
- "learning_rate": 4.550479542980632e-06,
- "loss": 0.531912088394165,
- "mean_token_accuracy": 0.8225687742233276,
- "num_tokens": 8454983.0,
- "step": 926
- },
- {
- "epoch": 0.7044072948328267,
- "grad_norm": 3.5248100757598877,
- "learning_rate": 4.549280643975394e-06,
- "loss": 0.4631815254688263,
- "mean_token_accuracy": 0.8443771600723267,
- "num_tokens": 8458504.0,
- "step": 927
- },
- {
- "epoch": 0.7051671732522796,
- "grad_norm": 2.5105819702148438,
- "learning_rate": 4.548080306706114e-06,
- "loss": 0.30487123131752014,
- "mean_token_accuracy": 0.9018767476081848,
- "num_tokens": 8462589.0,
- "step": 928
- },
- {
- "epoch": 0.7059270516717325,
- "grad_norm": 1.3367713689804077,
- "learning_rate": 4.5468785320152365e-06,
- "loss": 0.4355026185512543,
- "mean_token_accuracy": 0.8323584794998169,
- "num_tokens": 8478450.0,
- "step": 929
- },
- {
- "epoch": 0.7066869300911854,
- "grad_norm": 2.2506282329559326,
- "learning_rate": 4.545675320746212e-06,
- "loss": 0.5082957744598389,
- "mean_token_accuracy": 0.823430597782135,
- "num_tokens": 8485991.0,
- "step": 930
- },
- {
- "epoch": 0.7074468085106383,
- "grad_norm": 1.7164632081985474,
- "learning_rate": 4.544470673743502e-06,
- "loss": 0.3960164785385132,
- "mean_token_accuracy": 0.8592486381530762,
- "num_tokens": 8495217.0,
- "step": 931
- },
- {
- "epoch": 0.7082066869300911,
- "grad_norm": 1.5864969491958618,
- "learning_rate": 4.543264591852572e-06,
- "loss": 0.49114471673965454,
- "mean_token_accuracy": 0.8330780267715454,
- "num_tokens": 8508904.0,
- "step": 932
- },
- {
- "epoch": 0.708966565349544,
- "grad_norm": 2.1707003116607666,
- "learning_rate": 4.542057075919898e-06,
- "loss": 0.49895772337913513,
- "mean_token_accuracy": 0.8327431082725525,
- "num_tokens": 8515792.0,
- "step": 933
- },
- {
- "epoch": 0.709726443768997,
- "grad_norm": 1.9002083539962769,
- "learning_rate": 4.54084812679296e-06,
- "loss": 0.4548531472682953,
- "mean_token_accuracy": 0.834532618522644,
- "num_tokens": 8524006.0,
- "step": 934
- },
- {
- "epoch": 0.7104863221884499,
- "grad_norm": 1.8505141735076904,
- "learning_rate": 4.539637745320247e-06,
- "loss": 0.35716521739959717,
- "mean_token_accuracy": 0.872222900390625,
- "num_tokens": 8533647.0,
- "step": 935
- },
- {
- "epoch": 0.7112462006079028,
- "grad_norm": 2.092620849609375,
- "learning_rate": 4.53842593235125e-06,
- "loss": 0.4673694372177124,
- "mean_token_accuracy": 0.8460999131202698,
- "num_tokens": 8540734.0,
- "step": 936
- },
- {
- "epoch": 0.7120060790273556,
- "grad_norm": 2.689514636993408,
- "learning_rate": 4.537212688736466e-06,
- "loss": 0.45461273193359375,
- "mean_token_accuracy": 0.8450704216957092,
- "num_tokens": 8544948.0,
- "step": 937
- },
- {
- "epoch": 0.7127659574468085,
- "grad_norm": 2.4507734775543213,
- "learning_rate": 4.535998015327396e-06,
- "loss": 0.4571906626224518,
- "mean_token_accuracy": 0.8429360389709473,
- "num_tokens": 8550445.0,
- "step": 938
- },
- {
- "epoch": 0.7135258358662614,
- "grad_norm": 1.8960013389587402,
- "learning_rate": 4.534781912976546e-06,
- "loss": 0.4461391568183899,
- "mean_token_accuracy": 0.8487973213195801,
- "num_tokens": 8557630.0,
- "step": 939
- },
- {
- "epoch": 0.7142857142857143,
- "grad_norm": 1.602611780166626,
- "learning_rate": 4.533564382537421e-06,
- "loss": 0.5277102589607239,
- "mean_token_accuracy": 0.8330916166305542,
- "num_tokens": 8570397.0,
- "step": 940
- },
- {
- "epoch": 0.7150455927051672,
- "grad_norm": 1.8936395645141602,
- "learning_rate": 4.532345424864533e-06,
- "loss": 0.38619571924209595,
- "mean_token_accuracy": 0.8514572381973267,
- "num_tokens": 8582673.0,
- "step": 941
- },
- {
- "epoch": 0.71580547112462,
- "grad_norm": 1.3898619413375854,
- "learning_rate": 4.531125040813392e-06,
- "loss": 0.4825032949447632,
- "mean_token_accuracy": 0.833012580871582,
- "num_tokens": 8597239.0,
- "step": 942
- },
- {
- "epoch": 0.7165653495440729,
- "grad_norm": 2.128230571746826,
- "learning_rate": 4.529903231240511e-06,
- "loss": 0.4862118065357208,
- "mean_token_accuracy": 0.8210917711257935,
- "num_tokens": 8605877.0,
- "step": 943
- },
- {
- "epoch": 0.7173252279635258,
- "grad_norm": 1.6552259922027588,
- "learning_rate": 4.528679997003403e-06,
- "loss": 0.5092059373855591,
- "mean_token_accuracy": 0.8247389793395996,
- "num_tokens": 8617060.0,
- "step": 944
- },
- {
- "epoch": 0.7180851063829787,
- "grad_norm": 2.1174771785736084,
- "learning_rate": 4.52745533896058e-06,
- "loss": 0.39110174775123596,
- "mean_token_accuracy": 0.8672944903373718,
- "num_tokens": 8623306.0,
- "step": 945
- },
- {
- "epoch": 0.7188449848024316,
- "grad_norm": 2.8648383617401123,
- "learning_rate": 4.526229257971556e-06,
- "loss": 0.49864327907562256,
- "mean_token_accuracy": 0.8305130004882812,
- "num_tokens": 8627466.0,
- "step": 946
- },
- {
- "epoch": 0.7196048632218845,
- "grad_norm": 2.155514717102051,
- "learning_rate": 4.52500175489684e-06,
- "loss": 0.5070191025733948,
- "mean_token_accuracy": 0.8311188817024231,
- "num_tokens": 8634759.0,
- "step": 947
- },
- {
- "epoch": 0.7203647416413373,
- "grad_norm": 1.8432683944702148,
- "learning_rate": 4.523772830597942e-06,
- "loss": 0.5569252371788025,
- "mean_token_accuracy": 0.8070821762084961,
- "num_tokens": 8644160.0,
- "step": 948
- },
- {
- "epoch": 0.7211246200607903,
- "grad_norm": 2.8912241458892822,
- "learning_rate": 4.522542485937369e-06,
- "loss": 0.4799427390098572,
- "mean_token_accuracy": 0.8443552851676941,
- "num_tokens": 8648377.0,
- "step": 949
- },
- {
- "epoch": 0.7218844984802432,
- "grad_norm": 3.3449625968933105,
- "learning_rate": 4.521310721778622e-06,
- "loss": 0.44043463468551636,
- "mean_token_accuracy": 0.8521315455436707,
- "num_tokens": 8651846.0,
- "step": 950
- },
- {
- "epoch": 0.7226443768996961,
- "grad_norm": 1.4127917289733887,
- "learning_rate": 4.520077538986203e-06,
- "loss": 0.4700999855995178,
- "mean_token_accuracy": 0.8377952575683594,
- "num_tokens": 8665199.0,
- "step": 951
- },
- {
- "epoch": 0.723404255319149,
- "grad_norm": 2.1607301235198975,
- "learning_rate": 4.518842938425606e-06,
- "loss": 0.4374256730079651,
- "mean_token_accuracy": 0.8448896408081055,
- "num_tokens": 8672158.0,
- "step": 952
- },
- {
- "epoch": 0.7241641337386018,
- "grad_norm": 1.3442779779434204,
- "learning_rate": 4.51760692096332e-06,
- "loss": 0.38948923349380493,
- "mean_token_accuracy": 0.8598923683166504,
- "num_tokens": 8684532.0,
- "step": 953
- },
- {
- "epoch": 0.7249240121580547,
- "grad_norm": 2.0003178119659424,
- "learning_rate": 4.516369487466832e-06,
- "loss": 0.3797217011451721,
- "mean_token_accuracy": 0.8652102947235107,
- "num_tokens": 8691460.0,
- "step": 954
- },
- {
- "epoch": 0.7256838905775076,
- "grad_norm": 1.8196535110473633,
- "learning_rate": 4.5151306388046175e-06,
- "loss": 0.5676811933517456,
- "mean_token_accuracy": 0.818500816822052,
- "num_tokens": 8701624.0,
- "step": 955
- },
- {
- "epoch": 0.7264437689969605,
- "grad_norm": 2.1962296962738037,
- "learning_rate": 4.513890375846152e-06,
- "loss": 0.45399484038352966,
- "mean_token_accuracy": 0.8463879227638245,
- "num_tokens": 8707410.0,
- "step": 956
- },
- {
- "epoch": 0.7272036474164134,
- "grad_norm": 1.8798872232437134,
- "learning_rate": 4.512648699461897e-06,
- "loss": 0.5679811239242554,
- "mean_token_accuracy": 0.8089900016784668,
- "num_tokens": 8715630.0,
- "step": 957
- },
- {
- "epoch": 0.7279635258358662,
- "grad_norm": 2.3540258407592773,
- "learning_rate": 4.511405610523309e-06,
- "loss": 0.5282865762710571,
- "mean_token_accuracy": 0.8196114301681519,
- "num_tokens": 8721934.0,
- "step": 958
- },
- {
- "epoch": 0.7287234042553191,
- "grad_norm": 2.5630908012390137,
- "learning_rate": 4.510161109902837e-06,
- "loss": 0.39442378282546997,
- "mean_token_accuracy": 0.8400980830192566,
- "num_tokens": 8726511.0,
- "step": 959
- },
- {
- "epoch": 0.729483282674772,
- "grad_norm": 1.9829226732254028,
- "learning_rate": 4.508915198473919e-06,
- "loss": 0.4611976742744446,
- "mean_token_accuracy": 0.8439624309539795,
- "num_tokens": 8733460.0,
- "step": 960
- },
- {
- "epoch": 0.7302431610942249,
- "grad_norm": 3.0291950702667236,
- "learning_rate": 4.507667877110982e-06,
- "loss": 0.5158340930938721,
- "mean_token_accuracy": 0.8300060033798218,
- "num_tokens": 8737629.0,
- "step": 961
- },
- {
- "epoch": 0.7310030395136778,
- "grad_norm": 1.9208252429962158,
- "learning_rate": 4.506419146689445e-06,
- "loss": 0.3807099163532257,
- "mean_token_accuracy": 0.871469259262085,
- "num_tokens": 8744615.0,
- "step": 962
- },
- {
- "epoch": 0.7317629179331308,
- "grad_norm": 3.051565408706665,
- "learning_rate": 4.505169008085717e-06,
- "loss": 0.38461726903915405,
- "mean_token_accuracy": 0.874465823173523,
- "num_tokens": 8748154.0,
- "step": 963
- },
- {
- "epoch": 0.7325227963525835,
- "grad_norm": 1.375466227531433,
- "learning_rate": 4.503917462177192e-06,
- "loss": 0.42490679025650024,
- "mean_token_accuracy": 0.8457326889038086,
- "num_tokens": 8760965.0,
- "step": 964
- },
- {
- "epoch": 0.7332826747720365,
- "grad_norm": 2.216681957244873,
- "learning_rate": 4.5026645098422515e-06,
- "loss": 0.43149900436401367,
- "mean_token_accuracy": 0.8527278900146484,
- "num_tokens": 8766996.0,
- "step": 965
- },
- {
- "epoch": 0.7340425531914894,
- "grad_norm": 1.9422595500946045,
- "learning_rate": 4.5014101519602684e-06,
- "loss": 0.4964504539966583,
- "mean_token_accuracy": 0.8137556314468384,
- "num_tokens": 8774411.0,
- "step": 966
- },
- {
- "epoch": 0.7348024316109423,
- "grad_norm": 2.058887004852295,
- "learning_rate": 4.500154389411598e-06,
- "loss": 0.4977570176124573,
- "mean_token_accuracy": 0.8254626989364624,
- "num_tokens": 8782220.0,
- "step": 967
- },
- {
- "epoch": 0.7355623100303952,
- "grad_norm": 2.9977786540985107,
- "learning_rate": 4.498897223077582e-06,
- "loss": 0.4061415195465088,
- "mean_token_accuracy": 0.8752427101135254,
- "num_tokens": 8786120.0,
- "step": 968
- },
- {
- "epoch": 0.736322188449848,
- "grad_norm": 2.2636303901672363,
- "learning_rate": 4.49763865384055e-06,
- "loss": 0.5062161087989807,
- "mean_token_accuracy": 0.8171653747558594,
- "num_tokens": 8792459.0,
- "step": 969
- },
- {
- "epoch": 0.7370820668693009,
- "grad_norm": 1.8850842714309692,
- "learning_rate": 4.496378682583813e-06,
- "loss": 0.5014280676841736,
- "mean_token_accuracy": 0.8547511100769043,
- "num_tokens": 8800675.0,
- "step": 970
- },
- {
- "epoch": 0.7378419452887538,
- "grad_norm": 1.191985011100769,
- "learning_rate": 4.495117310191667e-06,
- "loss": 0.4713883101940155,
- "mean_token_accuracy": 0.8213596343994141,
- "num_tokens": 8820740.0,
- "step": 971
- },
- {
- "epoch": 0.7386018237082067,
- "grad_norm": 1.823000192642212,
- "learning_rate": 4.493854537549393e-06,
- "loss": 0.46332645416259766,
- "mean_token_accuracy": 0.8359860777854919,
- "num_tokens": 8828884.0,
- "step": 972
- },
- {
- "epoch": 0.7393617021276596,
- "grad_norm": 2.590446949005127,
- "learning_rate": 4.492590365543253e-06,
- "loss": 0.49074703454971313,
- "mean_token_accuracy": 0.8433758020401001,
- "num_tokens": 8833859.0,
- "step": 973
- },
- {
- "epoch": 0.7401215805471124,
- "grad_norm": 2.2762670516967773,
- "learning_rate": 4.491324795060491e-06,
- "loss": 0.39465656876564026,
- "mean_token_accuracy": 0.8734766244888306,
- "num_tokens": 8839350.0,
- "step": 974
- },
- {
- "epoch": 0.7408814589665653,
- "grad_norm": 2.698725461959839,
- "learning_rate": 4.490057826989333e-06,
- "loss": 0.5552085041999817,
- "mean_token_accuracy": 0.8132266998291016,
- "num_tokens": 8844373.0,
- "step": 975
- },
- {
- "epoch": 0.7416413373860182,
- "grad_norm": 2.704606294631958,
- "learning_rate": 4.488789462218988e-06,
- "loss": 0.3447791635990143,
- "mean_token_accuracy": 0.8736170530319214,
- "num_tokens": 8848236.0,
- "step": 976
- },
- {
- "epoch": 0.7424012158054711,
- "grad_norm": 3.1260716915130615,
- "learning_rate": 4.487519701639641e-06,
- "loss": 0.5945233702659607,
- "mean_token_accuracy": 0.7997599840164185,
- "num_tokens": 8852935.0,
- "step": 977
- },
- {
- "epoch": 0.743161094224924,
- "grad_norm": 1.6895452737808228,
- "learning_rate": 4.486248546142459e-06,
- "loss": 0.4823892116546631,
- "mean_token_accuracy": 0.8279662132263184,
- "num_tokens": 8861743.0,
- "step": 978
- },
- {
- "epoch": 0.743920972644377,
- "grad_norm": 1.9161452054977417,
- "learning_rate": 4.4849759966195885e-06,
- "loss": 0.5266581773757935,
- "mean_token_accuracy": 0.8218623399734497,
- "num_tokens": 8870601.0,
- "step": 979
- },
- {
- "epoch": 0.7446808510638298,
- "grad_norm": 1.6894301176071167,
- "learning_rate": 4.483702053964154e-06,
- "loss": 0.4186219573020935,
- "mean_token_accuracy": 0.8471781015396118,
- "num_tokens": 8885617.0,
- "step": 980
- },
- {
- "epoch": 0.7454407294832827,
- "grad_norm": 1.6319992542266846,
- "learning_rate": 4.482426719070258e-06,
- "loss": 0.541317880153656,
- "mean_token_accuracy": 0.8216162323951721,
- "num_tokens": 8897595.0,
- "step": 981
- },
- {
- "epoch": 0.7462006079027356,
- "grad_norm": 5.102413177490234,
- "learning_rate": 4.4811499928329775e-06,
- "loss": 0.3928517699241638,
- "mean_token_accuracy": 0.858033299446106,
- "num_tokens": 8901682.0,
- "step": 982
- },
- {
- "epoch": 0.7469604863221885,
- "grad_norm": 2.213860273361206,
- "learning_rate": 4.479871876148368e-06,
- "loss": 0.4276347756385803,
- "mean_token_accuracy": 0.8529798984527588,
- "num_tokens": 8908088.0,
- "step": 983
- },
- {
- "epoch": 0.7477203647416414,
- "grad_norm": 1.2180038690567017,
- "learning_rate": 4.478592369913464e-06,
- "loss": 0.3941590189933777,
- "mean_token_accuracy": 0.8608149290084839,
- "num_tokens": 8925876.0,
- "step": 984
- },
- {
- "epoch": 0.7484802431610942,
- "grad_norm": 2.849802255630493,
- "learning_rate": 4.477311475026271e-06,
- "loss": 0.42190325260162354,
- "mean_token_accuracy": 0.860505223274231,
- "num_tokens": 8930190.0,
- "step": 985
- },
- {
- "epoch": 0.7492401215805471,
- "grad_norm": 1.704128384590149,
- "learning_rate": 4.476029192385769e-06,
- "loss": 0.4786282777786255,
- "mean_token_accuracy": 0.8302322626113892,
- "num_tokens": 8938340.0,
- "step": 986
- },
- {
- "epoch": 0.75,
- "grad_norm": 2.06322979927063,
- "learning_rate": 4.474745522891915e-06,
- "loss": 0.4648786187171936,
- "mean_token_accuracy": 0.8366481065750122,
- "num_tokens": 8944633.0,
- "step": 987
- },
- {
- "epoch": 0.7507598784194529,
- "grad_norm": 2.0745396614074707,
- "learning_rate": 4.473460467445637e-06,
- "loss": 0.5744885206222534,
- "mean_token_accuracy": 0.8357284069061279,
- "num_tokens": 8954457.0,
- "step": 988
- },
- {
- "epoch": 0.7515197568389058,
- "grad_norm": 1.9281407594680786,
- "learning_rate": 4.472174026948836e-06,
- "loss": 0.528974175453186,
- "mean_token_accuracy": 0.8083580732345581,
- "num_tokens": 8962701.0,
- "step": 989
- },
- {
- "epoch": 0.7522796352583586,
- "grad_norm": 3.012381076812744,
- "learning_rate": 4.470886202304385e-06,
- "loss": 0.48754751682281494,
- "mean_token_accuracy": 0.8368391990661621,
- "num_tokens": 8967272.0,
- "step": 990
- },
- {
- "epoch": 0.7530395136778115,
- "grad_norm": 1.691826581954956,
- "learning_rate": 4.469596994416131e-06,
- "loss": 0.484740674495697,
- "mean_token_accuracy": 0.8500643968582153,
- "num_tokens": 8976615.0,
- "step": 991
- },
- {
- "epoch": 0.7537993920972644,
- "grad_norm": 2.4961965084075928,
- "learning_rate": 4.468306404188887e-06,
- "loss": 0.50777268409729,
- "mean_token_accuracy": 0.8168395757675171,
- "num_tokens": 8983235.0,
- "step": 992
- },
- {
- "epoch": 0.7545592705167173,
- "grad_norm": 1.512007713317871,
- "learning_rate": 4.467014432528441e-06,
- "loss": 0.4583340287208557,
- "mean_token_accuracy": 0.8465162515640259,
- "num_tokens": 8993815.0,
- "step": 993
- },
- {
- "epoch": 0.7553191489361702,
- "grad_norm": 1.9362257719039917,
- "learning_rate": 4.465721080341547e-06,
- "loss": 0.6027892827987671,
- "mean_token_accuracy": 0.8052380084991455,
- "num_tokens": 9002697.0,
- "step": 994
- },
- {
- "epoch": 0.756079027355623,
- "grad_norm": 2.473632335662842,
- "learning_rate": 4.4644263485359316e-06,
- "loss": 0.5394320487976074,
- "mean_token_accuracy": 0.834665834903717,
- "num_tokens": 9007428.0,
- "step": 995
- },
- {
- "epoch": 0.756838905775076,
- "grad_norm": 2.2527434825897217,
- "learning_rate": 4.463130238020284e-06,
- "loss": 0.5485198497772217,
- "mean_token_accuracy": 0.8090173006057739,
- "num_tokens": 9013570.0,
- "step": 996
- },
- {
- "epoch": 0.7575987841945289,
- "grad_norm": 1.4130940437316895,
- "learning_rate": 4.4618327497042676e-06,
- "loss": 0.37994423508644104,
- "mean_token_accuracy": 0.8625167012214661,
- "num_tokens": 9025485.0,
- "step": 997
- },
- {
- "epoch": 0.7583586626139818,
- "grad_norm": 2.685115098953247,
- "learning_rate": 4.460533884498509e-06,
- "loss": 0.447973370552063,
- "mean_token_accuracy": 0.8564165234565735,
- "num_tokens": 9030355.0,
- "step": 998
- },
- {
- "epoch": 0.7591185410334347,
- "grad_norm": 3.2743139266967773,
- "learning_rate": 4.4592336433146e-06,
- "loss": 0.45275989174842834,
- "mean_token_accuracy": 0.8462578058242798,
- "num_tokens": 9034406.0,
- "step": 999
- },
- {
- "epoch": 0.7598784194528876,
- "grad_norm": 1.9383049011230469,
- "learning_rate": 4.457932027065102e-06,
- "loss": 0.5387729406356812,
- "mean_token_accuracy": 0.8357330560684204,
- "num_tokens": 9041502.0,
- "step": 1000
- },
- {
- "epoch": 0.7606382978723404,
- "grad_norm": 2.7348275184631348,
- "learning_rate": 4.456629036663537e-06,
- "loss": 0.4448447823524475,
- "mean_token_accuracy": 0.8453642129898071,
- "num_tokens": 9046088.0,
- "step": 1001
- },
- {
- "epoch": 0.7613981762917933,
- "grad_norm": 1.8477401733398438,
- "learning_rate": 4.455324673024396e-06,
- "loss": 0.5766505002975464,
- "mean_token_accuracy": 0.8074213862419128,
- "num_tokens": 9055678.0,
- "step": 1002
- },
- {
- "epoch": 0.7621580547112462,
- "grad_norm": 3.134481430053711,
- "learning_rate": 4.4540189370631315e-06,
- "loss": 0.5690872669219971,
- "mean_token_accuracy": 0.8414670825004578,
- "num_tokens": 9062006.0,
- "step": 1003
- },
- {
- "epoch": 0.7629179331306991,
- "grad_norm": 1.7933398485183716,
- "learning_rate": 4.452711829696158e-06,
- "loss": 0.4898291826248169,
- "mean_token_accuracy": 0.8259007930755615,
- "num_tokens": 9070754.0,
- "step": 1004
- },
- {
- "epoch": 0.763677811550152,
- "grad_norm": 1.2552275657653809,
- "learning_rate": 4.451403351840855e-06,
- "loss": 0.4280198812484741,
- "mean_token_accuracy": 0.8409112691879272,
- "num_tokens": 9085306.0,
- "step": 1005
- },
- {
- "epoch": 0.7644376899696048,
- "grad_norm": 1.6749331951141357,
- "learning_rate": 4.450093504415562e-06,
- "loss": 0.3723178505897522,
- "mean_token_accuracy": 0.8545734882354736,
- "num_tokens": 9102453.0,
- "step": 1006
- },
- {
- "epoch": 0.7651975683890577,
- "grad_norm": 2.7514500617980957,
- "learning_rate": 4.44878228833958e-06,
- "loss": 0.5463190674781799,
- "mean_token_accuracy": 0.8121639490127563,
- "num_tokens": 9108342.0,
- "step": 1007
- },
- {
- "epoch": 0.7659574468085106,
- "grad_norm": 1.3322733640670776,
- "learning_rate": 4.447469704533172e-06,
- "loss": 0.573723316192627,
- "mean_token_accuracy": 0.8065711259841919,
- "num_tokens": 9123712.0,
- "step": 1008
- },
- {
- "epoch": 0.7667173252279635,
- "grad_norm": 2.6893765926361084,
- "learning_rate": 4.446155753917559e-06,
- "loss": 0.6856257915496826,
- "mean_token_accuracy": 0.7718256711959839,
- "num_tokens": 9130728.0,
- "step": 1009
- },
- {
- "epoch": 0.7674772036474165,
- "grad_norm": 1.792765498161316,
- "learning_rate": 4.444840437414923e-06,
- "loss": 0.48203110694885254,
- "mean_token_accuracy": 0.8419194221496582,
- "num_tokens": 9137983.0,
- "step": 1010
- },
- {
- "epoch": 0.7682370820668692,
- "grad_norm": 1.4957399368286133,
- "learning_rate": 4.443523755948401e-06,
- "loss": 0.4372181296348572,
- "mean_token_accuracy": 0.8491764664649963,
- "num_tokens": 9148081.0,
- "step": 1011
- },
- {
- "epoch": 0.7689969604863222,
- "grad_norm": 1.7294867038726807,
- "learning_rate": 4.442205710442095e-06,
- "loss": 0.54277503490448,
- "mean_token_accuracy": 0.8196806907653809,
- "num_tokens": 9158407.0,
- "step": 1012
- },
- {
- "epoch": 0.7697568389057751,
- "grad_norm": 2.2091221809387207,
- "learning_rate": 4.4408863018210564e-06,
- "loss": 0.4888187646865845,
- "mean_token_accuracy": 0.8384175300598145,
- "num_tokens": 9164754.0,
- "step": 1013
- },
- {
- "epoch": 0.770516717325228,
- "grad_norm": 1.7615830898284912,
- "learning_rate": 4.439565531011299e-06,
- "loss": 0.4640008211135864,
- "mean_token_accuracy": 0.8424701690673828,
- "num_tokens": 9172715.0,
- "step": 1014
- },
- {
- "epoch": 0.7712765957446809,
- "grad_norm": 1.6796128749847412,
- "learning_rate": 4.43824339893979e-06,
- "loss": 0.5227609276771545,
- "mean_token_accuracy": 0.8135923743247986,
- "num_tokens": 9183214.0,
- "step": 1015
- },
- {
- "epoch": 0.7720364741641338,
- "grad_norm": 2.1485698223114014,
- "learning_rate": 4.436919906534452e-06,
- "loss": 0.4857056140899658,
- "mean_token_accuracy": 0.8323013782501221,
- "num_tokens": 9190360.0,
- "step": 1016
- },
- {
- "epoch": 0.7727963525835866,
- "grad_norm": 2.7842206954956055,
- "learning_rate": 4.4355950547241645e-06,
- "loss": 0.46406883001327515,
- "mean_token_accuracy": 0.859869122505188,
- "num_tokens": 9194523.0,
- "step": 1017
- },
- {
- "epoch": 0.7735562310030395,
- "grad_norm": 2.3774640560150146,
- "learning_rate": 4.434268844438758e-06,
- "loss": 0.5625549554824829,
- "mean_token_accuracy": 0.8188897371292114,
- "num_tokens": 9201155.0,
- "step": 1018
- },
- {
- "epoch": 0.7743161094224924,
- "grad_norm": 2.004427909851074,
- "learning_rate": 4.432941276609018e-06,
- "loss": 0.5164387226104736,
- "mean_token_accuracy": 0.829569935798645,
- "num_tokens": 9209269.0,
- "step": 1019
- },
- {
- "epoch": 0.7750759878419453,
- "grad_norm": 1.7218989133834839,
- "learning_rate": 4.431612352166684e-06,
- "loss": 0.481005996465683,
- "mean_token_accuracy": 0.8359906673431396,
- "num_tokens": 9220860.0,
- "step": 1020
- },
- {
- "epoch": 0.7758358662613982,
- "grad_norm": 2.197108507156372,
- "learning_rate": 4.4302820720444454e-06,
- "loss": 0.440413236618042,
- "mean_token_accuracy": 0.8412867784500122,
- "num_tokens": 9226414.0,
- "step": 1021
- },
- {
- "epoch": 0.776595744680851,
- "grad_norm": 2.6995162963867188,
- "learning_rate": 4.428950437175944e-06,
- "loss": 0.3884299397468567,
- "mean_token_accuracy": 0.8696021437644958,
- "num_tokens": 9230898.0,
- "step": 1022
- },
- {
- "epoch": 0.7773556231003039,
- "grad_norm": 2.1671667098999023,
- "learning_rate": 4.427617448495772e-06,
- "loss": 0.5747478008270264,
- "mean_token_accuracy": 0.7842930555343628,
- "num_tokens": 9238479.0,
- "step": 1023
- },
- {
- "epoch": 0.7781155015197568,
- "grad_norm": 1.6299028396606445,
- "learning_rate": 4.426283106939474e-06,
- "loss": 0.39478403329849243,
- "mean_token_accuracy": 0.8685503602027893,
- "num_tokens": 9248263.0,
- "step": 1024
- },
- {
- "epoch": 0.7788753799392097,
- "grad_norm": 2.2621798515319824,
- "learning_rate": 4.424947413443539e-06,
- "loss": 0.4582178592681885,
- "mean_token_accuracy": 0.8312377333641052,
- "num_tokens": 9254168.0,
- "step": 1025
- },
- {
- "epoch": 0.7796352583586627,
- "grad_norm": 2.121091365814209,
- "learning_rate": 4.423610368945411e-06,
- "loss": 0.5315121412277222,
- "mean_token_accuracy": 0.8121483325958252,
- "num_tokens": 9261808.0,
- "step": 1026
- },
- {
- "epoch": 0.7803951367781155,
- "grad_norm": 1.8558297157287598,
- "learning_rate": 4.422271974383479e-06,
- "loss": 0.4299176037311554,
- "mean_token_accuracy": 0.8452648520469666,
- "num_tokens": 9269264.0,
- "step": 1027
- },
- {
- "epoch": 0.7811550151975684,
- "grad_norm": 1.9089949131011963,
- "learning_rate": 4.420932230697079e-06,
- "loss": 0.43876272439956665,
- "mean_token_accuracy": 0.8434094190597534,
- "num_tokens": 9277381.0,
- "step": 1028
- },
- {
- "epoch": 0.7819148936170213,
- "grad_norm": 1.8619649410247803,
- "learning_rate": 4.419591138826495e-06,
- "loss": 0.48798668384552,
- "mean_token_accuracy": 0.8281317353248596,
- "num_tokens": 9285413.0,
- "step": 1029
- },
- {
- "epoch": 0.7826747720364742,
- "grad_norm": 1.3273087739944458,
- "learning_rate": 4.418248699712955e-06,
- "loss": 0.4611460864543915,
- "mean_token_accuracy": 0.8233213424682617,
- "num_tokens": 9300805.0,
- "step": 1030
- },
- {
- "epoch": 0.7834346504559271,
- "grad_norm": 1.0473746061325073,
- "learning_rate": 4.416904914298637e-06,
- "loss": 0.36537665128707886,
- "mean_token_accuracy": 0.8671857118606567,
- "num_tokens": 9320035.0,
- "step": 1031
- },
- {
- "epoch": 0.78419452887538,
- "grad_norm": 1.9130918979644775,
- "learning_rate": 4.415559783526661e-06,
- "loss": 0.4916655123233795,
- "mean_token_accuracy": 0.8266351222991943,
- "num_tokens": 9326795.0,
- "step": 1032
- },
- {
- "epoch": 0.7849544072948328,
- "grad_norm": 2.0001816749572754,
- "learning_rate": 4.414213308341092e-06,
- "loss": 0.5711008310317993,
- "mean_token_accuracy": 0.8093076348304749,
- "num_tokens": 9335625.0,
- "step": 1033
- },
- {
- "epoch": 0.7857142857142857,
- "grad_norm": 3.933542251586914,
- "learning_rate": 4.412865489686936e-06,
- "loss": 0.621616542339325,
- "mean_token_accuracy": 0.7938898801803589,
- "num_tokens": 9339080.0,
- "step": 1034
- },
- {
- "epoch": 0.7864741641337386,
- "grad_norm": 2.061558961868286,
- "learning_rate": 4.411516328510145e-06,
- "loss": 0.583686113357544,
- "mean_token_accuracy": 0.8216883540153503,
- "num_tokens": 9348581.0,
- "step": 1035
- },
- {
- "epoch": 0.7872340425531915,
- "grad_norm": 1.9401264190673828,
- "learning_rate": 4.410165825757613e-06,
- "loss": 0.4905240535736084,
- "mean_token_accuracy": 0.8229951858520508,
- "num_tokens": 9356032.0,
- "step": 1036
- },
- {
- "epoch": 0.7879939209726444,
- "grad_norm": 3.620547294616699,
- "learning_rate": 4.408813982377175e-06,
- "loss": 0.4269888997077942,
- "mean_token_accuracy": 0.8713940978050232,
- "num_tokens": 9359061.0,
- "step": 1037
- },
- {
- "epoch": 0.7887537993920972,
- "grad_norm": 1.2027851343154907,
- "learning_rate": 4.407460799317605e-06,
- "loss": 0.39972418546676636,
- "mean_token_accuracy": 0.8610097765922546,
- "num_tokens": 9377068.0,
- "step": 1038
- },
- {
- "epoch": 0.7895136778115501,
- "grad_norm": 2.566753387451172,
- "learning_rate": 4.40610627752862e-06,
- "loss": 0.45267152786254883,
- "mean_token_accuracy": 0.83243328332901,
- "num_tokens": 9383604.0,
- "step": 1039
- },
- {
- "epoch": 0.790273556231003,
- "grad_norm": 2.940094470977783,
- "learning_rate": 4.404750417960876e-06,
- "loss": 0.42862242460250854,
- "mean_token_accuracy": 0.8582849502563477,
- "num_tokens": 9387541.0,
- "step": 1040
- },
- {
- "epoch": 0.791033434650456,
- "grad_norm": 2.0223944187164307,
- "learning_rate": 4.403393221565966e-06,
- "loss": 0.4349963665008545,
- "mean_token_accuracy": 0.8453047871589661,
- "num_tokens": 9394382.0,
- "step": 1041
- },
- {
- "epoch": 0.7917933130699089,
- "grad_norm": 2.9399030208587646,
- "learning_rate": 4.402034689296425e-06,
- "loss": 0.32197174429893494,
- "mean_token_accuracy": 0.8953392505645752,
- "num_tokens": 9397741.0,
- "step": 1042
- },
- {
- "epoch": 0.7925531914893617,
- "grad_norm": 2.819016456604004,
- "learning_rate": 4.400674822105721e-06,
- "loss": 0.6790289878845215,
- "mean_token_accuracy": 0.8135063648223877,
- "num_tokens": 9403509.0,
- "step": 1043
- },
- {
- "epoch": 0.7933130699088146,
- "grad_norm": 1.3225977420806885,
- "learning_rate": 4.399313620948262e-06,
- "loss": 0.42203834652900696,
- "mean_token_accuracy": 0.8399381637573242,
- "num_tokens": 9418870.0,
- "step": 1044
- },
- {
- "epoch": 0.7940729483282675,
- "grad_norm": 1.7822176218032837,
- "learning_rate": 4.397951086779392e-06,
- "loss": 0.4666554927825928,
- "mean_token_accuracy": 0.8364764451980591,
- "num_tokens": 9427640.0,
- "step": 1045
- },
- {
- "epoch": 0.7948328267477204,
- "grad_norm": 3.186439037322998,
- "learning_rate": 4.396587220555389e-06,
- "loss": 0.6048363447189331,
- "mean_token_accuracy": 0.7806557416915894,
- "num_tokens": 9431927.0,
- "step": 1046
- },
- {
- "epoch": 0.7955927051671733,
- "grad_norm": 3.0804805755615234,
- "learning_rate": 4.395222023233467e-06,
- "loss": 0.445969820022583,
- "mean_token_accuracy": 0.850671112537384,
- "num_tokens": 9436136.0,
- "step": 1047
- },
- {
- "epoch": 0.7963525835866262,
- "grad_norm": 1.675968885421753,
- "learning_rate": 4.393855495771774e-06,
- "loss": 0.4311422109603882,
- "mean_token_accuracy": 0.8449079990386963,
- "num_tokens": 9445189.0,
- "step": 1048
- },
- {
- "epoch": 0.797112462006079,
- "grad_norm": 2.342410087585449,
- "learning_rate": 4.3924876391293915e-06,
- "loss": 0.5733606219291687,
- "mean_token_accuracy": 0.8156592845916748,
- "num_tokens": 9451939.0,
- "step": 1049
- },
- {
- "epoch": 0.7978723404255319,
- "grad_norm": 1.5967470407485962,
- "learning_rate": 4.391118454266335e-06,
- "loss": 0.46664729714393616,
- "mean_token_accuracy": 0.8091695308685303,
- "num_tokens": 9463968.0,
- "step": 1050
- },
- {
- "epoch": 0.7986322188449848,
- "grad_norm": 1.5777863264083862,
- "learning_rate": 4.389747942143549e-06,
- "loss": 0.46028903126716614,
- "mean_token_accuracy": 0.8347330093383789,
- "num_tokens": 9475561.0,
- "step": 1051
- },
- {
- "epoch": 0.7993920972644377,
- "grad_norm": 2.7630488872528076,
- "learning_rate": 4.388376103722914e-06,
- "loss": 0.5618188977241516,
- "mean_token_accuracy": 0.8273467421531677,
- "num_tokens": 9480661.0,
- "step": 1052
- },
- {
- "epoch": 0.8001519756838906,
- "grad_norm": 2.093397378921509,
- "learning_rate": 4.387002939967237e-06,
- "loss": 0.2998353838920593,
- "mean_token_accuracy": 0.8905231952667236,
- "num_tokens": 9485924.0,
- "step": 1053
- },
- {
- "epoch": 0.8009118541033434,
- "grad_norm": 1.4385871887207031,
- "learning_rate": 4.38562845184026e-06,
- "loss": 0.4944111704826355,
- "mean_token_accuracy": 0.8403056263923645,
- "num_tokens": 9500128.0,
- "step": 1054
- },
- {
- "epoch": 0.8016717325227963,
- "grad_norm": 1.6393156051635742,
- "learning_rate": 4.384252640306649e-06,
- "loss": 0.5727907419204712,
- "mean_token_accuracy": 0.7849414348602295,
- "num_tokens": 9511569.0,
- "step": 1055
- },
- {
- "epoch": 0.8024316109422492,
- "grad_norm": 2.3909664154052734,
- "learning_rate": 4.382875506332002e-06,
- "loss": 0.4760419726371765,
- "mean_token_accuracy": 0.8408266305923462,
- "num_tokens": 9517244.0,
- "step": 1056
- },
- {
- "epoch": 0.8031914893617021,
- "grad_norm": 1.7288594245910645,
- "learning_rate": 4.381497050882845e-06,
- "loss": 0.5375926494598389,
- "mean_token_accuracy": 0.8138614892959595,
- "num_tokens": 9528736.0,
- "step": 1057
- },
- {
- "epoch": 0.8039513677811551,
- "grad_norm": 2.093407392501831,
- "learning_rate": 4.380117274926632e-06,
- "loss": 0.46659404039382935,
- "mean_token_accuracy": 0.8450702428817749,
- "num_tokens": 9536200.0,
- "step": 1058
- },
- {
- "epoch": 0.8047112462006079,
- "grad_norm": 1.6835898160934448,
- "learning_rate": 4.3787361794317405e-06,
- "loss": 0.43157699704170227,
- "mean_token_accuracy": 0.8279973268508911,
- "num_tokens": 9546314.0,
- "step": 1059
- },
- {
- "epoch": 0.8054711246200608,
- "grad_norm": 1.983067512512207,
- "learning_rate": 4.377353765367479e-06,
- "loss": 0.5021739602088928,
- "mean_token_accuracy": 0.8274815082550049,
- "num_tokens": 9554375.0,
- "step": 1060
- },
- {
- "epoch": 0.8062310030395137,
- "grad_norm": 2.0472030639648438,
- "learning_rate": 4.375970033704078e-06,
- "loss": 0.34298190474510193,
- "mean_token_accuracy": 0.8900876045227051,
- "num_tokens": 9560230.0,
- "step": 1061
- },
- {
- "epoch": 0.8069908814589666,
- "grad_norm": 1.9613717794418335,
- "learning_rate": 4.374584985412692e-06,
- "loss": 0.3826758861541748,
- "mean_token_accuracy": 0.839923620223999,
- "num_tokens": 9566809.0,
- "step": 1062
- },
- {
- "epoch": 0.8077507598784195,
- "grad_norm": 1.991289496421814,
- "learning_rate": 4.373198621465405e-06,
- "loss": 0.5492525100708008,
- "mean_token_accuracy": 0.8153272867202759,
- "num_tokens": 9576810.0,
- "step": 1063
- },
- {
- "epoch": 0.8085106382978723,
- "grad_norm": 2.421370506286621,
- "learning_rate": 4.3718109428352155e-06,
- "loss": 0.5240297317504883,
- "mean_token_accuracy": 0.8087242245674133,
- "num_tokens": 9582906.0,
- "step": 1064
- },
- {
- "epoch": 0.8092705167173252,
- "grad_norm": 3.697765588760376,
- "learning_rate": 4.370421950496055e-06,
- "loss": 0.6096476912498474,
- "mean_token_accuracy": 0.787585973739624,
- "num_tokens": 9586920.0,
- "step": 1065
- },
- {
- "epoch": 0.8100303951367781,
- "grad_norm": 2.0767786502838135,
- "learning_rate": 4.369031645422768e-06,
- "loss": 0.41120079159736633,
- "mean_token_accuracy": 0.8513731956481934,
- "num_tokens": 9593902.0,
- "step": 1066
- },
- {
- "epoch": 0.810790273556231,
- "grad_norm": 2.5968732833862305,
- "learning_rate": 4.367640028591126e-06,
- "loss": 0.3364982008934021,
- "mean_token_accuracy": 0.8786963224411011,
- "num_tokens": 9597745.0,
- "step": 1067
- },
- {
- "epoch": 0.8115501519756839,
- "grad_norm": 2.165742874145508,
- "learning_rate": 4.366247100977818e-06,
- "loss": 0.406129390001297,
- "mean_token_accuracy": 0.868243932723999,
- "num_tokens": 9603496.0,
- "step": 1068
- },
- {
- "epoch": 0.8123100303951368,
- "grad_norm": 2.0493404865264893,
- "learning_rate": 4.364852863560456e-06,
- "loss": 0.5356296300888062,
- "mean_token_accuracy": 0.8191947340965271,
- "num_tokens": 9610898.0,
- "step": 1069
- },
- {
- "epoch": 0.8130699088145896,
- "grad_norm": 2.3224308490753174,
- "learning_rate": 4.363457317317568e-06,
- "loss": 0.41461923718452454,
- "mean_token_accuracy": 0.8537945747375488,
- "num_tokens": 9616626.0,
- "step": 1070
- },
- {
- "epoch": 0.8138297872340425,
- "grad_norm": 1.7387986183166504,
- "learning_rate": 4.362060463228603e-06,
- "loss": 0.5134786367416382,
- "mean_token_accuracy": 0.8511737585067749,
- "num_tokens": 9626223.0,
- "step": 1071
- },
- {
- "epoch": 0.8145896656534954,
- "grad_norm": 3.0270655155181885,
- "learning_rate": 4.360662302273926e-06,
- "loss": 0.3410695791244507,
- "mean_token_accuracy": 0.8746449947357178,
- "num_tokens": 9629455.0,
- "step": 1072
- },
- {
- "epoch": 0.8153495440729484,
- "grad_norm": 1.7727062702178955,
- "learning_rate": 4.35926283543482e-06,
- "loss": 0.4610968828201294,
- "mean_token_accuracy": 0.8444793224334717,
- "num_tokens": 9638070.0,
- "step": 1073
- },
- {
- "epoch": 0.8161094224924013,
- "grad_norm": 3.6333565711975098,
- "learning_rate": 4.357862063693486e-06,
- "loss": 0.3881273865699768,
- "mean_token_accuracy": 0.8757344484329224,
- "num_tokens": 9641028.0,
- "step": 1074
- },
- {
- "epoch": 0.8168693009118541,
- "grad_norm": 3.024042844772339,
- "learning_rate": 4.356459988033039e-06,
- "loss": 0.3853808641433716,
- "mean_token_accuracy": 0.8602254390716553,
- "num_tokens": 9645730.0,
- "step": 1075
- },
- {
- "epoch": 0.817629179331307,
- "grad_norm": 2.3359482288360596,
- "learning_rate": 4.355056609437509e-06,
- "loss": 0.4852045476436615,
- "mean_token_accuracy": 0.8502728343009949,
- "num_tokens": 9650975.0,
- "step": 1076
- },
- {
- "epoch": 0.8183890577507599,
- "grad_norm": 2.2390685081481934,
- "learning_rate": 4.353651928891842e-06,
- "loss": 0.5287341475486755,
- "mean_token_accuracy": 0.8247801065444946,
- "num_tokens": 9657471.0,
- "step": 1077
- },
- {
- "epoch": 0.8191489361702128,
- "grad_norm": 2.3809144496917725,
- "learning_rate": 4.352245947381897e-06,
- "loss": 0.5218510627746582,
- "mean_token_accuracy": 0.8149170875549316,
- "num_tokens": 9664108.0,
- "step": 1078
- },
- {
- "epoch": 0.8199088145896657,
- "grad_norm": 1.7072309255599976,
- "learning_rate": 4.3508386658944455e-06,
- "loss": 0.46481168270111084,
- "mean_token_accuracy": 0.834963321685791,
- "num_tokens": 9673175.0,
- "step": 1079
- },
- {
- "epoch": 0.8206686930091185,
- "grad_norm": 1.7383702993392944,
- "learning_rate": 4.349430085417171e-06,
- "loss": 0.4505952000617981,
- "mean_token_accuracy": 0.8507769107818604,
- "num_tokens": 9682800.0,
- "step": 1080
- },
- {
- "epoch": 0.8214285714285714,
- "grad_norm": 2.4308547973632812,
- "learning_rate": 4.348020206938672e-06,
- "loss": 0.4832455515861511,
- "mean_token_accuracy": 0.8538393974304199,
- "num_tokens": 9688123.0,
- "step": 1081
- },
- {
- "epoch": 0.8221884498480243,
- "grad_norm": 2.2686192989349365,
- "learning_rate": 4.3466090314484526e-06,
- "loss": 0.5112563371658325,
- "mean_token_accuracy": 0.8308460712432861,
- "num_tokens": 9694299.0,
- "step": 1082
- },
- {
- "epoch": 0.8229483282674772,
- "grad_norm": 2.806093454360962,
- "learning_rate": 4.345196559936931e-06,
- "loss": 0.4818246364593506,
- "mean_token_accuracy": 0.86617112159729,
- "num_tokens": 9698471.0,
- "step": 1083
- },
- {
- "epoch": 0.8237082066869301,
- "grad_norm": 1.7340706586837769,
- "learning_rate": 4.343782793395435e-06,
- "loss": 0.38246971368789673,
- "mean_token_accuracy": 0.8675198554992676,
- "num_tokens": 9706444.0,
- "step": 1084
- },
- {
- "epoch": 0.824468085106383,
- "grad_norm": 1.664942741394043,
- "learning_rate": 4.3423677328162e-06,
- "loss": 0.498797208070755,
- "mean_token_accuracy": 0.8447319865226746,
- "num_tokens": 9716765.0,
- "step": 1085
- },
- {
- "epoch": 0.8252279635258358,
- "grad_norm": 1.3608235120773315,
- "learning_rate": 4.340951379192369e-06,
- "loss": 0.41961491107940674,
- "mean_token_accuracy": 0.8339346647262573,
- "num_tokens": 9729564.0,
- "step": 1086
- },
- {
- "epoch": 0.8259878419452887,
- "grad_norm": 1.642503261566162,
- "learning_rate": 4.3395337335179945e-06,
- "loss": 0.5477945804595947,
- "mean_token_accuracy": 0.8117889761924744,
- "num_tokens": 9741217.0,
- "step": 1087
- },
- {
- "epoch": 0.8267477203647416,
- "grad_norm": 3.0345044136047363,
- "learning_rate": 4.338114796788035e-06,
- "loss": 0.5024623870849609,
- "mean_token_accuracy": 0.8333141207695007,
- "num_tokens": 9744941.0,
- "step": 1088
- },
- {
- "epoch": 0.8275075987841946,
- "grad_norm": 1.3096630573272705,
- "learning_rate": 4.336694569998354e-06,
- "loss": 0.44169723987579346,
- "mean_token_accuracy": 0.859926700592041,
- "num_tokens": 9757854.0,
- "step": 1089
- },
- {
- "epoch": 0.8282674772036475,
- "grad_norm": 2.203279495239258,
- "learning_rate": 4.3352730541457215e-06,
- "loss": 0.5283265113830566,
- "mean_token_accuracy": 0.8053759932518005,
- "num_tokens": 9764096.0,
- "step": 1090
- },
- {
- "epoch": 0.8290273556231003,
- "grad_norm": 1.3774312734603882,
- "learning_rate": 4.333850250227814e-06,
- "loss": 0.4584103226661682,
- "mean_token_accuracy": 0.8342611193656921,
- "num_tokens": 9777768.0,
- "step": 1091
- },
- {
- "epoch": 0.8297872340425532,
- "grad_norm": 1.822637915611267,
- "learning_rate": 4.332426159243206e-06,
- "loss": 0.5432791709899902,
- "mean_token_accuracy": 0.8136210441589355,
- "num_tokens": 9791276.0,
- "step": 1092
- },
- {
- "epoch": 0.8305471124620061,
- "grad_norm": 3.0190067291259766,
- "learning_rate": 4.331000782191384e-06,
- "loss": 0.5018150806427002,
- "mean_token_accuracy": 0.8234807252883911,
- "num_tokens": 9794902.0,
- "step": 1093
- },
- {
- "epoch": 0.831306990881459,
- "grad_norm": 2.09987735748291,
- "learning_rate": 4.329574120072728e-06,
- "loss": 0.4270891547203064,
- "mean_token_accuracy": 0.8544977903366089,
- "num_tokens": 9800903.0,
- "step": 1094
- },
- {
- "epoch": 0.8320668693009119,
- "grad_norm": 1.969549536705017,
- "learning_rate": 4.328146173888528e-06,
- "loss": 0.45801427960395813,
- "mean_token_accuracy": 0.8334714770317078,
- "num_tokens": 9808719.0,
- "step": 1095
- },
- {
- "epoch": 0.8328267477203647,
- "grad_norm": 1.4565571546554565,
- "learning_rate": 4.32671694464097e-06,
- "loss": 0.34864288568496704,
- "mean_token_accuracy": 0.8689061999320984,
- "num_tokens": 9818262.0,
- "step": 1096
- },
- {
- "epoch": 0.8335866261398176,
- "grad_norm": 1.2163832187652588,
- "learning_rate": 4.3252864333331424e-06,
- "loss": 0.37953704595565796,
- "mean_token_accuracy": 0.866554856300354,
- "num_tokens": 9833942.0,
- "step": 1097
- },
- {
- "epoch": 0.8343465045592705,
- "grad_norm": 1.6112010478973389,
- "learning_rate": 4.323854640969033e-06,
- "loss": 0.5442801713943481,
- "mean_token_accuracy": 0.8190416097640991,
- "num_tokens": 9844765.0,
- "step": 1098
- },
- {
- "epoch": 0.8351063829787234,
- "grad_norm": 1.8190315961837769,
- "learning_rate": 4.322421568553529e-06,
- "loss": 0.48271381855010986,
- "mean_token_accuracy": 0.8203652501106262,
- "num_tokens": 9852625.0,
- "step": 1099
- },
- {
- "epoch": 0.8358662613981763,
- "grad_norm": 2.7897756099700928,
- "learning_rate": 4.320987217092416e-06,
- "loss": 0.4086323380470276,
- "mean_token_accuracy": 0.8504934310913086,
- "num_tokens": 9856888.0,
- "step": 1100
- },
- {
- "epoch": 0.8366261398176292,
- "grad_norm": 1.7035977840423584,
- "learning_rate": 4.319551587592377e-06,
- "loss": 0.6325064301490784,
- "mean_token_accuracy": 0.788190484046936,
- "num_tokens": 9869419.0,
- "step": 1101
- },
- {
- "epoch": 0.837386018237082,
- "grad_norm": 2.609731912612915,
- "learning_rate": 4.318114681060989e-06,
- "loss": 0.519314706325531,
- "mean_token_accuracy": 0.8469992280006409,
- "num_tokens": 9874553.0,
- "step": 1102
- },
- {
- "epoch": 0.8381458966565349,
- "grad_norm": 1.2519766092300415,
- "learning_rate": 4.316676498506735e-06,
- "loss": 0.3566005825996399,
- "mean_token_accuracy": 0.8588439226150513,
- "num_tokens": 9886498.0,
- "step": 1103
- },
- {
- "epoch": 0.8389057750759878,
- "grad_norm": 1.430892825126648,
- "learning_rate": 4.3152370409389795e-06,
- "loss": 0.5250182747840881,
- "mean_token_accuracy": 0.8164948225021362,
- "num_tokens": 9900256.0,
- "step": 1104
- },
- {
- "epoch": 0.8396656534954408,
- "grad_norm": 3.1245436668395996,
- "learning_rate": 4.3137963093679945e-06,
- "loss": 0.3173971176147461,
- "mean_token_accuracy": 0.8835347890853882,
- "num_tokens": 9903899.0,
- "step": 1105
- },
- {
- "epoch": 0.8404255319148937,
- "grad_norm": 3.131812572479248,
- "learning_rate": 4.3123543048049395e-06,
- "loss": 0.6567763090133667,
- "mean_token_accuracy": 0.8233605027198792,
- "num_tokens": 9908798.0,
- "step": 1106
- },
- {
- "epoch": 0.8411854103343465,
- "grad_norm": 1.3551725149154663,
- "learning_rate": 4.310911028261867e-06,
- "loss": 0.3993729054927826,
- "mean_token_accuracy": 0.8529655933380127,
- "num_tokens": 9922577.0,
- "step": 1107
- },
- {
- "epoch": 0.8419452887537994,
- "grad_norm": 2.572533130645752,
- "learning_rate": 4.309466480751726e-06,
- "loss": 0.40906503796577454,
- "mean_token_accuracy": 0.8630726933479309,
- "num_tokens": 9926890.0,
- "step": 1108
- },
- {
- "epoch": 0.8427051671732523,
- "grad_norm": 1.9146469831466675,
- "learning_rate": 4.308020663288356e-06,
- "loss": 0.48423194885253906,
- "mean_token_accuracy": 0.8370280861854553,
- "num_tokens": 9934293.0,
- "step": 1109
- },
- {
- "epoch": 0.8434650455927052,
- "grad_norm": 1.6178001165390015,
- "learning_rate": 4.306573576886485e-06,
- "loss": 0.4262213408946991,
- "mean_token_accuracy": 0.839401125907898,
- "num_tokens": 9944513.0,
- "step": 1110
- },
- {
- "epoch": 0.8442249240121581,
- "grad_norm": 2.4444572925567627,
- "learning_rate": 4.305125222561736e-06,
- "loss": 0.5199950933456421,
- "mean_token_accuracy": 0.8507720232009888,
- "num_tokens": 9949512.0,
- "step": 1111
- },
- {
- "epoch": 0.8449848024316109,
- "grad_norm": 1.7983134984970093,
- "learning_rate": 4.303675601330618e-06,
- "loss": 0.36155956983566284,
- "mean_token_accuracy": 0.8568712472915649,
- "num_tokens": 9956402.0,
- "step": 1112
- },
- {
- "epoch": 0.8457446808510638,
- "grad_norm": 2.391096353530884,
- "learning_rate": 4.302224714210532e-06,
- "loss": 0.5391949415206909,
- "mean_token_accuracy": 0.8183057308197021,
- "num_tokens": 9961606.0,
- "step": 1113
- },
- {
- "epoch": 0.8465045592705167,
- "grad_norm": 1.8520214557647705,
- "learning_rate": 4.3007725622197675e-06,
- "loss": 0.5758882761001587,
- "mean_token_accuracy": 0.7924330234527588,
- "num_tokens": 9971473.0,
- "step": 1114
- },
- {
- "epoch": 0.8472644376899696,
- "grad_norm": 2.436640739440918,
- "learning_rate": 4.2993191463775e-06,
- "loss": 0.3837985396385193,
- "mean_token_accuracy": 0.8620110750198364,
- "num_tokens": 9976333.0,
- "step": 1115
- },
- {
- "epoch": 0.8480243161094225,
- "grad_norm": 1.7287120819091797,
- "learning_rate": 4.29786446770379e-06,
- "loss": 0.40066856145858765,
- "mean_token_accuracy": 0.8618333339691162,
- "num_tokens": 9985617.0,
- "step": 1116
- },
- {
- "epoch": 0.8487841945288754,
- "grad_norm": 2.0310518741607666,
- "learning_rate": 4.296408527219592e-06,
- "loss": 0.5465943217277527,
- "mean_token_accuracy": 0.812044620513916,
- "num_tokens": 9995363.0,
- "step": 1117
- },
- {
- "epoch": 0.8495440729483282,
- "grad_norm": 1.4858589172363281,
- "learning_rate": 4.294951325946737e-06,
- "loss": 0.45840176939964294,
- "mean_token_accuracy": 0.8432979583740234,
- "num_tokens": 10006400.0,
- "step": 1118
- },
- {
- "epoch": 0.8503039513677811,
- "grad_norm": 1.6153514385223389,
- "learning_rate": 4.293492864907947e-06,
- "loss": 0.5225611925125122,
- "mean_token_accuracy": 0.8180211186408997,
- "num_tokens": 10018352.0,
- "step": 1119
- },
- {
- "epoch": 0.851063829787234,
- "grad_norm": 2.1178412437438965,
- "learning_rate": 4.2920331451268246e-06,
- "loss": 0.5580621361732483,
- "mean_token_accuracy": 0.8211709260940552,
- "num_tokens": 10025614.0,
- "step": 1120
- },
- {
- "epoch": 0.851823708206687,
- "grad_norm": 2.036839246749878,
- "learning_rate": 4.2905721676278585e-06,
- "loss": 0.4658433198928833,
- "mean_token_accuracy": 0.8380423784255981,
- "num_tokens": 10032489.0,
- "step": 1121
- },
- {
- "epoch": 0.8525835866261399,
- "grad_norm": 2.0056262016296387,
- "learning_rate": 4.28910993343642e-06,
- "loss": 0.47023308277130127,
- "mean_token_accuracy": 0.8340359926223755,
- "num_tokens": 10040050.0,
- "step": 1122
- },
- {
- "epoch": 0.8533434650455927,
- "grad_norm": 2.540024518966675,
- "learning_rate": 4.2876464435787576e-06,
- "loss": 0.502303957939148,
- "mean_token_accuracy": 0.8288739919662476,
- "num_tokens": 10045042.0,
- "step": 1123
- },
- {
- "epoch": 0.8541033434650456,
- "grad_norm": 1.7894693613052368,
- "learning_rate": 4.286181699082008e-06,
- "loss": 0.4732973575592041,
- "mean_token_accuracy": 0.8340568542480469,
- "num_tokens": 10054424.0,
- "step": 1124
- },
- {
- "epoch": 0.8548632218844985,
- "grad_norm": 1.5601223707199097,
- "learning_rate": 4.284715700974186e-06,
- "loss": 0.472471684217453,
- "mean_token_accuracy": 0.8274722695350647,
- "num_tokens": 10065523.0,
- "step": 1125
- },
- {
- "epoch": 0.8556231003039514,
- "grad_norm": 1.7326055765151978,
- "learning_rate": 4.283248450284182e-06,
- "loss": 0.5924872159957886,
- "mean_token_accuracy": 0.7943467497825623,
- "num_tokens": 10076839.0,
- "step": 1126
- },
- {
- "epoch": 0.8563829787234043,
- "grad_norm": 1.5165479183197021,
- "learning_rate": 4.281779948041772e-06,
- "loss": 0.44768425822257996,
- "mean_token_accuracy": 0.8394696712493896,
- "num_tokens": 10088168.0,
- "step": 1127
- },
- {
- "epoch": 0.8571428571428571,
- "grad_norm": 1.5448920726776123,
- "learning_rate": 4.280310195277606e-06,
- "loss": 0.4458175003528595,
- "mean_token_accuracy": 0.835773229598999,
- "num_tokens": 10100306.0,
- "step": 1128
- },
- {
- "epoch": 0.85790273556231,
- "grad_norm": 1.6311609745025635,
- "learning_rate": 4.278839193023214e-06,
- "loss": 0.4158072769641876,
- "mean_token_accuracy": 0.8482539653778076,
- "num_tokens": 10110581.0,
- "step": 1129
- },
- {
- "epoch": 0.8586626139817629,
- "grad_norm": 1.6714754104614258,
- "learning_rate": 4.277366942311001e-06,
- "loss": 0.3686875104904175,
- "mean_token_accuracy": 0.8681533336639404,
- "num_tokens": 10118799.0,
- "step": 1130
- },
- {
- "epoch": 0.8594224924012158,
- "grad_norm": 2.1604413986206055,
- "learning_rate": 4.2758934441742494e-06,
- "loss": 0.37267982959747314,
- "mean_token_accuracy": 0.8520427346229553,
- "num_tokens": 10124734.0,
- "step": 1131
- },
- {
- "epoch": 0.8601823708206687,
- "grad_norm": 2.123013973236084,
- "learning_rate": 4.274418699647117e-06,
- "loss": 0.49963313341140747,
- "mean_token_accuracy": 0.8248758912086487,
- "num_tokens": 10131965.0,
- "step": 1132
- },
- {
- "epoch": 0.8609422492401215,
- "grad_norm": 1.4308786392211914,
- "learning_rate": 4.272942709764638e-06,
- "loss": 0.48666873574256897,
- "mean_token_accuracy": 0.8304717540740967,
- "num_tokens": 10145164.0,
- "step": 1133
- },
- {
- "epoch": 0.8617021276595744,
- "grad_norm": 1.7952618598937988,
- "learning_rate": 4.271465475562716e-06,
- "loss": 0.5536223649978638,
- "mean_token_accuracy": 0.8093959093093872,
- "num_tokens": 10154083.0,
- "step": 1134
- },
- {
- "epoch": 0.8624620060790273,
- "grad_norm": 2.0622456073760986,
- "learning_rate": 4.269986998078132e-06,
- "loss": 0.5173629522323608,
- "mean_token_accuracy": 0.8285619020462036,
- "num_tokens": 10161889.0,
- "step": 1135
- },
- {
- "epoch": 0.8632218844984803,
- "grad_norm": 2.0707509517669678,
- "learning_rate": 4.268507278348539e-06,
- "loss": 0.5871608257293701,
- "mean_token_accuracy": 0.7827386856079102,
- "num_tokens": 10170726.0,
- "step": 1136
- },
- {
- "epoch": 0.8639817629179332,
- "grad_norm": 2.054368257522583,
- "learning_rate": 4.2670263174124615e-06,
- "loss": 0.5788969993591309,
- "mean_token_accuracy": 0.7967237234115601,
- "num_tokens": 10178474.0,
- "step": 1137
- },
- {
- "epoch": 0.8647416413373861,
- "grad_norm": 1.901846170425415,
- "learning_rate": 4.265544116309294e-06,
- "loss": 0.5405587553977966,
- "mean_token_accuracy": 0.8151819705963135,
- "num_tokens": 10187013.0,
- "step": 1138
- },
- {
- "epoch": 0.8655015197568389,
- "grad_norm": 2.901285409927368,
- "learning_rate": 4.264060676079302e-06,
- "loss": 0.44101861119270325,
- "mean_token_accuracy": 0.8433429002761841,
- "num_tokens": 10191517.0,
- "step": 1139
- },
- {
- "epoch": 0.8662613981762918,
- "grad_norm": 2.4168388843536377,
- "learning_rate": 4.262575997763622e-06,
- "loss": 0.4686204195022583,
- "mean_token_accuracy": 0.8505309820175171,
- "num_tokens": 10196948.0,
- "step": 1140
- },
- {
- "epoch": 0.8670212765957447,
- "grad_norm": 1.9588396549224854,
- "learning_rate": 4.2610900824042575e-06,
- "loss": 0.47056013345718384,
- "mean_token_accuracy": 0.8280024528503418,
- "num_tokens": 10204292.0,
- "step": 1141
- },
- {
- "epoch": 0.8677811550151976,
- "grad_norm": 2.569150924682617,
- "learning_rate": 4.2596029310440826e-06,
- "loss": 0.573108434677124,
- "mean_token_accuracy": 0.8108246326446533,
- "num_tokens": 10209571.0,
- "step": 1142
- },
- {
- "epoch": 0.8685410334346505,
- "grad_norm": 2.038032293319702,
- "learning_rate": 4.258114544726835e-06,
- "loss": 0.40545332431793213,
- "mean_token_accuracy": 0.8611703515052795,
- "num_tokens": 10215716.0,
- "step": 1143
- },
- {
- "epoch": 0.8693009118541033,
- "grad_norm": 1.9884231090545654,
- "learning_rate": 4.256624924497124e-06,
- "loss": 0.40085992217063904,
- "mean_token_accuracy": 0.8615031242370605,
- "num_tokens": 10222775.0,
- "step": 1144
- },
- {
- "epoch": 0.8700607902735562,
- "grad_norm": 1.912842035293579,
- "learning_rate": 4.25513407140042e-06,
- "loss": 0.41022324562072754,
- "mean_token_accuracy": 0.8459607362747192,
- "num_tokens": 10229589.0,
- "step": 1145
- },
- {
- "epoch": 0.8708206686930091,
- "grad_norm": 1.9190576076507568,
- "learning_rate": 4.253641986483063e-06,
- "loss": 0.5541447401046753,
- "mean_token_accuracy": 0.8256468772888184,
- "num_tokens": 10240633.0,
- "step": 1146
- },
- {
- "epoch": 0.871580547112462,
- "grad_norm": 1.3742294311523438,
- "learning_rate": 4.2521486707922545e-06,
- "loss": 0.3680543899536133,
- "mean_token_accuracy": 0.8654477596282959,
- "num_tokens": 10251252.0,
- "step": 1147
- },
- {
- "epoch": 0.8723404255319149,
- "grad_norm": 1.4438525438308716,
- "learning_rate": 4.250654125376062e-06,
- "loss": 0.45830875635147095,
- "mean_token_accuracy": 0.8433834314346313,
- "num_tokens": 10263980.0,
- "step": 1148
- },
- {
- "epoch": 0.8731003039513677,
- "grad_norm": 2.1273653507232666,
- "learning_rate": 4.249158351283414e-06,
- "loss": 0.4129376709461212,
- "mean_token_accuracy": 0.861556351184845,
- "num_tokens": 10270426.0,
- "step": 1149
- },
- {
- "epoch": 0.8738601823708206,
- "grad_norm": 2.598440647125244,
- "learning_rate": 4.247661349564103e-06,
- "loss": 0.418030709028244,
- "mean_token_accuracy": 0.86553955078125,
- "num_tokens": 10275493.0,
- "step": 1150
- },
- {
- "epoch": 0.8746200607902735,
- "grad_norm": 1.6852490901947021,
- "learning_rate": 4.246163121268782e-06,
- "loss": 0.6403408050537109,
- "mean_token_accuracy": 0.7966094017028809,
- "num_tokens": 10287989.0,
- "step": 1151
- },
- {
- "epoch": 0.8753799392097265,
- "grad_norm": 2.5013794898986816,
- "learning_rate": 4.244663667448965e-06,
- "loss": 0.49922505021095276,
- "mean_token_accuracy": 0.8318735361099243,
- "num_tokens": 10293360.0,
- "step": 1152
- },
- {
- "epoch": 0.8761398176291794,
- "grad_norm": 1.2022709846496582,
- "learning_rate": 4.243162989157027e-06,
- "loss": 0.4414965510368347,
- "mean_token_accuracy": 0.8338693380355835,
- "num_tokens": 10310558.0,
- "step": 1153
- },
- {
- "epoch": 0.8768996960486323,
- "grad_norm": 1.9903281927108765,
- "learning_rate": 4.241661087446202e-06,
- "loss": 0.4277610778808594,
- "mean_token_accuracy": 0.8560749292373657,
- "num_tokens": 10316983.0,
- "step": 1154
- },
- {
- "epoch": 0.8776595744680851,
- "grad_norm": 2.104923725128174,
- "learning_rate": 4.240157963370583e-06,
- "loss": 0.44431713223457336,
- "mean_token_accuracy": 0.8785282969474792,
- "num_tokens": 10323294.0,
- "step": 1155
- },
- {
- "epoch": 0.878419452887538,
- "grad_norm": 2.8364813327789307,
- "learning_rate": 4.2386536179851175e-06,
- "loss": 0.49948397278785706,
- "mean_token_accuracy": 0.8305255174636841,
- "num_tokens": 10327662.0,
- "step": 1156
- },
- {
- "epoch": 0.8791793313069909,
- "grad_norm": 1.9493682384490967,
- "learning_rate": 4.2371480523456156e-06,
- "loss": 0.45867404341697693,
- "mean_token_accuracy": 0.8373264074325562,
- "num_tokens": 10335699.0,
- "step": 1157
- },
- {
- "epoch": 0.8799392097264438,
- "grad_norm": 2.268616199493408,
- "learning_rate": 4.235641267508741e-06,
- "loss": 0.4547857940196991,
- "mean_token_accuracy": 0.8252766132354736,
- "num_tokens": 10342464.0,
- "step": 1158
- },
- {
- "epoch": 0.8806990881458967,
- "grad_norm": 2.1334283351898193,
- "learning_rate": 4.234133264532012e-06,
- "loss": 0.39503124356269836,
- "mean_token_accuracy": 0.8648351430892944,
- "num_tokens": 10347514.0,
- "step": 1159
- },
- {
- "epoch": 0.8814589665653495,
- "grad_norm": 1.2775357961654663,
- "learning_rate": 4.232624044473805e-06,
- "loss": 0.39945733547210693,
- "mean_token_accuracy": 0.8369829654693604,
- "num_tokens": 10363316.0,
- "step": 1160
- },
- {
- "epoch": 0.8822188449848024,
- "grad_norm": 2.458413600921631,
- "learning_rate": 4.231113608393348e-06,
- "loss": 0.5020045638084412,
- "mean_token_accuracy": 0.8295938968658447,
- "num_tokens": 10368401.0,
- "step": 1161
- },
- {
- "epoch": 0.8829787234042553,
- "grad_norm": 1.7464948892593384,
- "learning_rate": 4.229601957350722e-06,
- "loss": 0.5335392951965332,
- "mean_token_accuracy": 0.8134858012199402,
- "num_tokens": 10378337.0,
- "step": 1162
- },
- {
- "epoch": 0.8837386018237082,
- "grad_norm": 3.1152119636535645,
- "learning_rate": 4.228089092406863e-06,
- "loss": 0.4811682105064392,
- "mean_token_accuracy": 0.8460187315940857,
- "num_tokens": 10382362.0,
- "step": 1163
- },
- {
- "epoch": 0.8844984802431611,
- "grad_norm": 2.190847158432007,
- "learning_rate": 4.226575014623557e-06,
- "loss": 0.4428049921989441,
- "mean_token_accuracy": 0.8382467031478882,
- "num_tokens": 10388211.0,
- "step": 1164
- },
- {
- "epoch": 0.8852583586626139,
- "grad_norm": 1.860153079032898,
- "learning_rate": 4.225059725063444e-06,
- "loss": 0.5265918970108032,
- "mean_token_accuracy": 0.8181334733963013,
- "num_tokens": 10398873.0,
- "step": 1165
- },
- {
- "epoch": 0.8860182370820668,
- "grad_norm": 1.3372713327407837,
- "learning_rate": 4.22354322479001e-06,
- "loss": 0.43202850222587585,
- "mean_token_accuracy": 0.8432420492172241,
- "num_tokens": 10413158.0,
- "step": 1166
- },
- {
- "epoch": 0.8867781155015197,
- "grad_norm": 1.3653379678726196,
- "learning_rate": 4.222025514867596e-06,
- "loss": 0.43780991435050964,
- "mean_token_accuracy": 0.8441485166549683,
- "num_tokens": 10428137.0,
- "step": 1167
- },
- {
- "epoch": 0.8875379939209727,
- "grad_norm": 3.0230672359466553,
- "learning_rate": 4.220506596361387e-06,
- "loss": 0.6039337515830994,
- "mean_token_accuracy": 0.8274872303009033,
- "num_tokens": 10432586.0,
- "step": 1168
- },
- {
- "epoch": 0.8882978723404256,
- "grad_norm": 2.2180392742156982,
- "learning_rate": 4.218986470337419e-06,
- "loss": 0.5453792810440063,
- "mean_token_accuracy": 0.8127184510231018,
- "num_tokens": 10439471.0,
- "step": 1169
- },
- {
- "epoch": 0.8890577507598785,
- "grad_norm": 1.8519103527069092,
- "learning_rate": 4.217465137862575e-06,
- "loss": 0.5145469903945923,
- "mean_token_accuracy": 0.8178654909133911,
- "num_tokens": 10450471.0,
- "step": 1170
- },
- {
- "epoch": 0.8898176291793313,
- "grad_norm": 2.034008026123047,
- "learning_rate": 4.215942600004586e-06,
- "loss": 0.44061461091041565,
- "mean_token_accuracy": 0.8572084307670593,
- "num_tokens": 10457382.0,
- "step": 1171
- },
- {
- "epoch": 0.8905775075987842,
- "grad_norm": 3.4304304122924805,
- "learning_rate": 4.214418857832025e-06,
- "loss": 0.44397830963134766,
- "mean_token_accuracy": 0.842149019241333,
- "num_tokens": 10460650.0,
- "step": 1172
- },
- {
- "epoch": 0.8913373860182371,
- "grad_norm": 1.9021750688552856,
- "learning_rate": 4.212893912414316e-06,
- "loss": 0.3769867420196533,
- "mean_token_accuracy": 0.8806171417236328,
- "num_tokens": 10468214.0,
- "step": 1173
- },
- {
- "epoch": 0.89209726443769,
- "grad_norm": 1.9704062938690186,
- "learning_rate": 4.211367764821722e-06,
- "loss": 0.5501819849014282,
- "mean_token_accuracy": 0.8176811337471008,
- "num_tokens": 10476739.0,
- "step": 1174
- },
- {
- "epoch": 0.8928571428571429,
- "grad_norm": 1.4350415468215942,
- "learning_rate": 4.209840416125353e-06,
- "loss": 0.41897401213645935,
- "mean_token_accuracy": 0.8498011827468872,
- "num_tokens": 10491769.0,
- "step": 1175
- },
- {
- "epoch": 0.8936170212765957,
- "grad_norm": 3.8237783908843994,
- "learning_rate": 4.208311867397162e-06,
- "loss": 0.5296977162361145,
- "mean_token_accuracy": 0.8168715834617615,
- "num_tokens": 10494958.0,
- "step": 1176
- },
- {
- "epoch": 0.8943768996960486,
- "grad_norm": 2.04784893989563,
- "learning_rate": 4.206782119709942e-06,
- "loss": 0.476105272769928,
- "mean_token_accuracy": 0.834011435508728,
- "num_tokens": 10502077.0,
- "step": 1177
- },
- {
- "epoch": 0.8951367781155015,
- "grad_norm": 1.8839610815048218,
- "learning_rate": 4.205251174137329e-06,
- "loss": 0.49628815054893494,
- "mean_token_accuracy": 0.8212119936943054,
- "num_tokens": 10510077.0,
- "step": 1178
- },
- {
- "epoch": 0.8958966565349544,
- "grad_norm": 1.2100634574890137,
- "learning_rate": 4.2037190317538e-06,
- "loss": 0.4931519329547882,
- "mean_token_accuracy": 0.8170043230056763,
- "num_tokens": 10528373.0,
- "step": 1179
- },
- {
- "epoch": 0.8966565349544073,
- "grad_norm": 1.884637713432312,
- "learning_rate": 4.202185693634671e-06,
- "loss": 0.4913347363471985,
- "mean_token_accuracy": 0.8234949707984924,
- "num_tokens": 10537108.0,
- "step": 1180
- },
- {
- "epoch": 0.8974164133738601,
- "grad_norm": 1.5062434673309326,
- "learning_rate": 4.200651160856099e-06,
- "loss": 0.4160492420196533,
- "mean_token_accuracy": 0.845937192440033,
- "num_tokens": 10547577.0,
- "step": 1181
- },
- {
- "epoch": 0.898176291793313,
- "grad_norm": 2.331169605255127,
- "learning_rate": 4.1991154344950755e-06,
- "loss": 0.6532632112503052,
- "mean_token_accuracy": 0.7743191123008728,
- "num_tokens": 10556328.0,
- "step": 1182
- },
- {
- "epoch": 0.898936170212766,
- "grad_norm": 1.3538362979888916,
- "learning_rate": 4.197578515629435e-06,
- "loss": 0.4437566101551056,
- "mean_token_accuracy": 0.8427901268005371,
- "num_tokens": 10570026.0,
- "step": 1183
- },
- {
- "epoch": 0.8996960486322189,
- "grad_norm": 2.3828957080841064,
- "learning_rate": 4.196040405337846e-06,
- "loss": 0.6185290217399597,
- "mean_token_accuracy": 0.7969824075698853,
- "num_tokens": 10576465.0,
- "step": 1184
- },
- {
- "epoch": 0.9004559270516718,
- "grad_norm": 2.4759042263031006,
- "learning_rate": 4.194501104699813e-06,
- "loss": 0.46489226818084717,
- "mean_token_accuracy": 0.8472316265106201,
- "num_tokens": 10582034.0,
- "step": 1185
- },
- {
- "epoch": 0.9012158054711246,
- "grad_norm": 1.9215164184570312,
- "learning_rate": 4.192960614795676e-06,
- "loss": 0.48001551628112793,
- "mean_token_accuracy": 0.8371596336364746,
- "num_tokens": 10590556.0,
- "step": 1186
- },
- {
- "epoch": 0.9019756838905775,
- "grad_norm": 2.2717080116271973,
- "learning_rate": 4.19141893670661e-06,
- "loss": 0.40083563327789307,
- "mean_token_accuracy": 0.8464195728302002,
- "num_tokens": 10595661.0,
- "step": 1187
- },
- {
- "epoch": 0.9027355623100304,
- "grad_norm": 2.187122344970703,
- "learning_rate": 4.189876071514624e-06,
- "loss": 0.4942901134490967,
- "mean_token_accuracy": 0.8186990022659302,
- "num_tokens": 10603366.0,
- "step": 1188
- },
- {
- "epoch": 0.9034954407294833,
- "grad_norm": 1.542414665222168,
- "learning_rate": 4.188332020302561e-06,
- "loss": 0.4731982946395874,
- "mean_token_accuracy": 0.8487229347229004,
- "num_tokens": 10616203.0,
- "step": 1189
- },
- {
- "epoch": 0.9042553191489362,
- "grad_norm": 0.9957579970359802,
- "learning_rate": 4.186786784154096e-06,
- "loss": 0.33211836218833923,
- "mean_token_accuracy": 0.870644748210907,
- "num_tokens": 10633294.0,
- "step": 1190
- },
- {
- "epoch": 0.9050151975683891,
- "grad_norm": 2.593867540359497,
- "learning_rate": 4.1852403641537344e-06,
- "loss": 0.6825464963912964,
- "mean_token_accuracy": 0.7716869115829468,
- "num_tokens": 10640615.0,
- "step": 1191
- },
- {
- "epoch": 0.9057750759878419,
- "grad_norm": 2.0424516201019287,
- "learning_rate": 4.183692761386813e-06,
- "loss": 0.5672709941864014,
- "mean_token_accuracy": 0.7973801493644714,
- "num_tokens": 10649845.0,
- "step": 1192
- },
- {
- "epoch": 0.9065349544072948,
- "grad_norm": 1.429018259048462,
- "learning_rate": 4.1821439769395e-06,
- "loss": 0.5427846908569336,
- "mean_token_accuracy": 0.8200292587280273,
- "num_tokens": 10665898.0,
- "step": 1193
- },
- {
- "epoch": 0.9072948328267477,
- "grad_norm": 1.9764264822006226,
- "learning_rate": 4.180594011898791e-06,
- "loss": 0.4784567356109619,
- "mean_token_accuracy": 0.82924485206604,
- "num_tokens": 10673595.0,
- "step": 1194
- },
- {
- "epoch": 0.9080547112462006,
- "grad_norm": 1.4004309177398682,
- "learning_rate": 4.1790428673525104e-06,
- "loss": 0.4791432023048401,
- "mean_token_accuracy": 0.8334879875183105,
- "num_tokens": 10687892.0,
- "step": 1195
- },
- {
- "epoch": 0.9088145896656535,
- "grad_norm": 2.2207727432250977,
- "learning_rate": 4.177490544389313e-06,
- "loss": 0.5089365243911743,
- "mean_token_accuracy": 0.8270776271820068,
- "num_tokens": 10694911.0,
- "step": 1196
- },
- {
- "epoch": 0.9095744680851063,
- "grad_norm": 2.2890450954437256,
- "learning_rate": 4.175937044098678e-06,
- "loss": 0.5152267813682556,
- "mean_token_accuracy": 0.8527299165725708,
- "num_tokens": 10700512.0,
- "step": 1197
- },
- {
- "epoch": 0.9103343465045592,
- "grad_norm": 1.7938050031661987,
- "learning_rate": 4.1743823675709115e-06,
- "loss": 0.3507300615310669,
- "mean_token_accuracy": 0.8694599866867065,
- "num_tokens": 10707953.0,
- "step": 1198
- },
- {
- "epoch": 0.9110942249240122,
- "grad_norm": 1.4368808269500732,
- "learning_rate": 4.172826515897146e-06,
- "loss": 0.407418429851532,
- "mean_token_accuracy": 0.8432893753051758,
- "num_tokens": 10717485.0,
- "step": 1199
- },
- {
- "epoch": 0.9118541033434651,
- "grad_norm": 1.735339879989624,
- "learning_rate": 4.171269490169337e-06,
- "loss": 0.46996885538101196,
- "mean_token_accuracy": 0.8331948518753052,
- "num_tokens": 10726160.0,
- "step": 1200
- },
- {
- "epoch": 0.912613981762918,
- "grad_norm": 1.7859221696853638,
- "learning_rate": 4.1697112914802665e-06,
- "loss": 0.5325199365615845,
- "mean_token_accuracy": 0.8179605007171631,
- "num_tokens": 10736284.0,
- "step": 1201
- },
- {
- "epoch": 0.9133738601823708,
- "grad_norm": 2.6394896507263184,
- "learning_rate": 4.168151920923536e-06,
- "loss": 0.4039744734764099,
- "mean_token_accuracy": 0.8545527458190918,
- "num_tokens": 10740673.0,
- "step": 1202
- },
- {
- "epoch": 0.9141337386018237,
- "grad_norm": 1.910988211631775,
- "learning_rate": 4.1665913795935755e-06,
- "loss": 0.5190291404724121,
- "mean_token_accuracy": 0.8203921318054199,
- "num_tokens": 10751946.0,
- "step": 1203
- },
- {
- "epoch": 0.9148936170212766,
- "grad_norm": 3.0006964206695557,
- "learning_rate": 4.16502966858563e-06,
- "loss": 0.5856777429580688,
- "mean_token_accuracy": 0.8061224222183228,
- "num_tokens": 10756795.0,
- "step": 1204
- },
- {
- "epoch": 0.9156534954407295,
- "grad_norm": 1.7396167516708374,
- "learning_rate": 4.163466788995768e-06,
- "loss": 0.54935222864151,
- "mean_token_accuracy": 0.8052443265914917,
- "num_tokens": 10767202.0,
- "step": 1205
- },
- {
- "epoch": 0.9164133738601824,
- "grad_norm": 2.143735885620117,
- "learning_rate": 4.161902741920881e-06,
- "loss": 0.5020298361778259,
- "mean_token_accuracy": 0.8249630928039551,
- "num_tokens": 10774329.0,
- "step": 1206
- },
- {
- "epoch": 0.9171732522796353,
- "grad_norm": 2.8871893882751465,
- "learning_rate": 4.160337528458676e-06,
- "loss": 0.5154489278793335,
- "mean_token_accuracy": 0.8276848793029785,
- "num_tokens": 10778929.0,
- "step": 1207
- },
- {
- "epoch": 0.9179331306990881,
- "grad_norm": 1.4642788171768188,
- "learning_rate": 4.15877114970768e-06,
- "loss": 0.5033774375915527,
- "mean_token_accuracy": 0.8296241164207458,
- "num_tokens": 10790928.0,
- "step": 1208
- },
- {
- "epoch": 0.918693009118541,
- "grad_norm": 1.8313497304916382,
- "learning_rate": 4.1572036067672386e-06,
- "loss": 0.5674909353256226,
- "mean_token_accuracy": 0.7975562214851379,
- "num_tokens": 10801372.0,
- "step": 1209
- },
- {
- "epoch": 0.9194528875379939,
- "grad_norm": 2.005958080291748,
- "learning_rate": 4.155634900737513e-06,
- "loss": 0.5557019114494324,
- "mean_token_accuracy": 0.8141391277313232,
- "num_tokens": 10809150.0,
- "step": 1210
- },
- {
- "epoch": 0.9202127659574468,
- "grad_norm": 2.333519697189331,
- "learning_rate": 4.154065032719482e-06,
- "loss": 0.6990420818328857,
- "mean_token_accuracy": 0.7565394043922424,
- "num_tokens": 10816612.0,
- "step": 1211
- },
- {
- "epoch": 0.9209726443768997,
- "grad_norm": 1.4472655057907104,
- "learning_rate": 4.152494003814939e-06,
- "loss": 0.541398286819458,
- "mean_token_accuracy": 0.8027358055114746,
- "num_tokens": 10833840.0,
- "step": 1212
- },
- {
- "epoch": 0.9217325227963525,
- "grad_norm": 1.6183619499206543,
- "learning_rate": 4.150921815126493e-06,
- "loss": 0.6096762418746948,
- "mean_token_accuracy": 0.7994354963302612,
- "num_tokens": 10846367.0,
- "step": 1213
- },
- {
- "epoch": 0.9224924012158054,
- "grad_norm": 2.614919900894165,
- "learning_rate": 4.149348467757566e-06,
- "loss": 0.41846764087677,
- "mean_token_accuracy": 0.8555068969726562,
- "num_tokens": 10850836.0,
- "step": 1214
- },
- {
- "epoch": 0.9232522796352584,
- "grad_norm": 1.4419831037521362,
- "learning_rate": 4.147773962812393e-06,
- "loss": 0.4139535427093506,
- "mean_token_accuracy": 0.845671534538269,
- "num_tokens": 10864228.0,
- "step": 1215
- },
- {
- "epoch": 0.9240121580547113,
- "grad_norm": 2.3868865966796875,
- "learning_rate": 4.146198301396025e-06,
- "loss": 0.3357275128364563,
- "mean_token_accuracy": 0.8829520344734192,
- "num_tokens": 10868920.0,
- "step": 1216
- },
- {
- "epoch": 0.9247720364741642,
- "grad_norm": 1.7685474157333374,
- "learning_rate": 4.14462148461432e-06,
- "loss": 0.45333072543144226,
- "mean_token_accuracy": 0.8505891561508179,
- "num_tokens": 10877286.0,
- "step": 1217
- },
- {
- "epoch": 0.925531914893617,
- "grad_norm": 1.7627625465393066,
- "learning_rate": 4.143043513573949e-06,
- "loss": 0.5028705596923828,
- "mean_token_accuracy": 0.825471043586731,
- "num_tokens": 10887047.0,
- "step": 1218
- },
- {
- "epoch": 0.9262917933130699,
- "grad_norm": 1.3168725967407227,
- "learning_rate": 4.141464389382392e-06,
- "loss": 0.5494637489318848,
- "mean_token_accuracy": 0.8121747970581055,
- "num_tokens": 10903599.0,
- "step": 1219
- },
- {
- "epoch": 0.9270516717325228,
- "grad_norm": 2.5180399417877197,
- "learning_rate": 4.13988411314794e-06,
- "loss": 0.6134277582168579,
- "mean_token_accuracy": 0.7983006834983826,
- "num_tokens": 10909791.0,
- "step": 1220
- },
- {
- "epoch": 0.9278115501519757,
- "grad_norm": 1.1889166831970215,
- "learning_rate": 4.13830268597969e-06,
- "loss": 0.36713096499443054,
- "mean_token_accuracy": 0.8416121006011963,
- "num_tokens": 10925794.0,
- "step": 1221
- },
- {
- "epoch": 0.9285714285714286,
- "grad_norm": 2.142422676086426,
- "learning_rate": 4.136720108987552e-06,
- "loss": 0.4427933096885681,
- "mean_token_accuracy": 0.8427745699882507,
- "num_tokens": 10931622.0,
- "step": 1222
- },
- {
- "epoch": 0.9293313069908815,
- "grad_norm": 1.908564567565918,
- "learning_rate": 4.1351363832822364e-06,
- "loss": 0.5088109374046326,
- "mean_token_accuracy": 0.8309272527694702,
- "num_tokens": 10940843.0,
- "step": 1223
- },
- {
- "epoch": 0.9300911854103343,
- "grad_norm": 1.2862322330474854,
- "learning_rate": 4.133551509975264e-06,
- "loss": 0.3963761329650879,
- "mean_token_accuracy": 0.8602159023284912,
- "num_tokens": 10954481.0,
- "step": 1224
- },
- {
- "epoch": 0.9308510638297872,
- "grad_norm": 1.5876200199127197,
- "learning_rate": 4.13196549017896e-06,
- "loss": 0.4311184287071228,
- "mean_token_accuracy": 0.8460899591445923,
- "num_tokens": 10963501.0,
- "step": 1225
- },
- {
- "epoch": 0.9316109422492401,
- "grad_norm": 2.459878444671631,
- "learning_rate": 4.130378325006453e-06,
- "loss": 0.5016295313835144,
- "mean_token_accuracy": 0.8125218152999878,
- "num_tokens": 10968850.0,
- "step": 1226
- },
- {
- "epoch": 0.932370820668693,
- "grad_norm": 2.059718370437622,
- "learning_rate": 4.128790015571679e-06,
- "loss": 0.48982277512550354,
- "mean_token_accuracy": 0.8327049016952515,
- "num_tokens": 10976642.0,
- "step": 1227
- },
- {
- "epoch": 0.9331306990881459,
- "grad_norm": 1.3719185590744019,
- "learning_rate": 4.127200562989372e-06,
- "loss": 0.38778752088546753,
- "mean_token_accuracy": 0.8623501062393188,
- "num_tokens": 10988703.0,
- "step": 1228
- },
- {
- "epoch": 0.9338905775075987,
- "grad_norm": 1.302140712738037,
- "learning_rate": 4.125609968375073e-06,
- "loss": 0.4887842535972595,
- "mean_token_accuracy": 0.8322232961654663,
- "num_tokens": 11005981.0,
- "step": 1229
- },
- {
- "epoch": 0.9346504559270516,
- "grad_norm": 1.819624423980713,
- "learning_rate": 4.12401823284512e-06,
- "loss": 0.49825209379196167,
- "mean_token_accuracy": 0.8278916478157043,
- "num_tokens": 11014145.0,
- "step": 1230
- },
- {
- "epoch": 0.9354103343465046,
- "grad_norm": 1.2762807607650757,
- "learning_rate": 4.122425357516658e-06,
- "loss": 0.433994323015213,
- "mean_token_accuracy": 0.853028416633606,
- "num_tokens": 11029232.0,
- "step": 1231
- },
- {
- "epoch": 0.9361702127659575,
- "grad_norm": 2.2171671390533447,
- "learning_rate": 4.1208313435076255e-06,
- "loss": 0.38436949253082275,
- "mean_token_accuracy": 0.8616260290145874,
- "num_tokens": 11034743.0,
- "step": 1232
- },
- {
- "epoch": 0.9369300911854104,
- "grad_norm": 1.355879545211792,
- "learning_rate": 4.119236191936764e-06,
- "loss": 0.5378084182739258,
- "mean_token_accuracy": 0.8256701231002808,
- "num_tokens": 11048149.0,
- "step": 1233
- },
- {
- "epoch": 0.9376899696048632,
- "grad_norm": 2.66812801361084,
- "learning_rate": 4.117639903923611e-06,
- "loss": 0.5236451625823975,
- "mean_token_accuracy": 0.8431973457336426,
- "num_tokens": 11052295.0,
- "step": 1234
- },
- {
- "epoch": 0.9384498480243161,
- "grad_norm": 1.5740545988082886,
- "learning_rate": 4.116042480588505e-06,
- "loss": 0.44322824478149414,
- "mean_token_accuracy": 0.8436908721923828,
- "num_tokens": 11062066.0,
- "step": 1235
- },
- {
- "epoch": 0.939209726443769,
- "grad_norm": 1.230706810951233,
- "learning_rate": 4.114443923052577e-06,
- "loss": 0.3325323462486267,
- "mean_token_accuracy": 0.8674666881561279,
- "num_tokens": 11074300.0,
- "step": 1236
- },
- {
- "epoch": 0.9399696048632219,
- "grad_norm": 1.9870070219039917,
- "learning_rate": 4.112844232437757e-06,
- "loss": 0.5711548328399658,
- "mean_token_accuracy": 0.8081738948822021,
- "num_tokens": 11082297.0,
- "step": 1237
- },
- {
- "epoch": 0.9407294832826748,
- "grad_norm": 1.3020970821380615,
- "learning_rate": 4.11124340986677e-06,
- "loss": 0.4187922477722168,
- "mean_token_accuracy": 0.8566171526908875,
- "num_tokens": 11096810.0,
- "step": 1238
- },
- {
- "epoch": 0.9414893617021277,
- "grad_norm": 2.1399197578430176,
- "learning_rate": 4.109641456463135e-06,
- "loss": 0.5293116569519043,
- "mean_token_accuracy": 0.8176157474517822,
- "num_tokens": 11102761.0,
- "step": 1239
- },
- {
- "epoch": 0.9422492401215805,
- "grad_norm": 1.3503763675689697,
- "learning_rate": 4.108038373351163e-06,
- "loss": 0.4907652735710144,
- "mean_token_accuracy": 0.8204987049102783,
- "num_tokens": 11118480.0,
- "step": 1240
- },
- {
- "epoch": 0.9430091185410334,
- "grad_norm": 1.9571399688720703,
- "learning_rate": 4.106434161655962e-06,
- "loss": 0.4709656536579132,
- "mean_token_accuracy": 0.8371885418891907,
- "num_tokens": 11126265.0,
- "step": 1241
- },
- {
- "epoch": 0.9437689969604863,
- "grad_norm": 2.1277313232421875,
- "learning_rate": 4.104828822503427e-06,
- "loss": 0.4010283350944519,
- "mean_token_accuracy": 0.8586333990097046,
- "num_tokens": 11133022.0,
- "step": 1242
- },
- {
- "epoch": 0.9445288753799392,
- "grad_norm": 1.6745036840438843,
- "learning_rate": 4.103222357020248e-06,
- "loss": 0.562545657157898,
- "mean_token_accuracy": 0.8052060604095459,
- "num_tokens": 11145255.0,
- "step": 1243
- },
- {
- "epoch": 0.9452887537993921,
- "grad_norm": 2.3616299629211426,
- "learning_rate": 4.101614766333904e-06,
- "loss": 0.5878340601921082,
- "mean_token_accuracy": 0.796745777130127,
- "num_tokens": 11152020.0,
- "step": 1244
- },
- {
- "epoch": 0.9460486322188449,
- "grad_norm": 1.6182078123092651,
- "learning_rate": 4.100006051572664e-06,
- "loss": 0.5357589721679688,
- "mean_token_accuracy": 0.8089962005615234,
- "num_tokens": 11163112.0,
- "step": 1245
- },
- {
- "epoch": 0.9468085106382979,
- "grad_norm": 1.911770224571228,
- "learning_rate": 4.098396213865587e-06,
- "loss": 0.49805426597595215,
- "mean_token_accuracy": 0.8289647102355957,
- "num_tokens": 11171768.0,
- "step": 1246
- },
- {
- "epoch": 0.9475683890577508,
- "grad_norm": 1.649155616760254,
- "learning_rate": 4.096785254342518e-06,
- "loss": 0.5756166577339172,
- "mean_token_accuracy": 0.807680606842041,
- "num_tokens": 11183527.0,
- "step": 1247
- },
- {
- "epoch": 0.9483282674772037,
- "grad_norm": 1.8922761678695679,
- "learning_rate": 4.095173174134091e-06,
- "loss": 0.44688963890075684,
- "mean_token_accuracy": 0.8375608921051025,
- "num_tokens": 11191494.0,
- "step": 1248
- },
- {
- "epoch": 0.9490881458966566,
- "grad_norm": 2.9044547080993652,
- "learning_rate": 4.093559974371725e-06,
- "loss": 0.48609739542007446,
- "mean_token_accuracy": 0.8404892086982727,
- "num_tokens": 11195837.0,
- "step": 1249
- },
- {
- "epoch": 0.9498480243161094,
- "grad_norm": 2.287506580352783,
- "learning_rate": 4.091945656187626e-06,
- "loss": 0.5260225534439087,
- "mean_token_accuracy": 0.8181945085525513,
- "num_tokens": 11202174.0,
- "step": 1250
- },
- {
- "epoch": 0.9506079027355623,
- "grad_norm": 1.7908886671066284,
- "learning_rate": 4.090330220714785e-06,
- "loss": 0.4207724928855896,
- "mean_token_accuracy": 0.8616912364959717,
- "num_tokens": 11209995.0,
- "step": 1251
- },
- {
- "epoch": 0.9513677811550152,
- "grad_norm": 2.905418634414673,
- "learning_rate": 4.0887136690869774e-06,
- "loss": 0.4209241271018982,
- "mean_token_accuracy": 0.8561323285102844,
- "num_tokens": 11213799.0,
- "step": 1252
- },
- {
- "epoch": 0.9521276595744681,
- "grad_norm": 2.814150333404541,
- "learning_rate": 4.08709600243876e-06,
- "loss": 0.36855608224868774,
- "mean_token_accuracy": 0.8764539361000061,
- "num_tokens": 11217643.0,
- "step": 1253
- },
- {
- "epoch": 0.952887537993921,
- "grad_norm": 1.9385707378387451,
- "learning_rate": 4.0854772219054735e-06,
- "loss": 0.531031608581543,
- "mean_token_accuracy": 0.80600905418396,
- "num_tokens": 11225871.0,
- "step": 1254
- },
- {
- "epoch": 0.9536474164133738,
- "grad_norm": 2.103058099746704,
- "learning_rate": 4.083857328623243e-06,
- "loss": 0.4576364755630493,
- "mean_token_accuracy": 0.8447524905204773,
- "num_tokens": 11231829.0,
- "step": 1255
- },
- {
- "epoch": 0.9544072948328267,
- "grad_norm": 1.7518818378448486,
- "learning_rate": 4.082236323728969e-06,
- "loss": 0.5386767983436584,
- "mean_token_accuracy": 0.8055596351623535,
- "num_tokens": 11240977.0,
- "step": 1256
- },
- {
- "epoch": 0.9551671732522796,
- "grad_norm": 1.8434966802597046,
- "learning_rate": 4.0806142083603365e-06,
- "loss": 0.5415925979614258,
- "mean_token_accuracy": 0.809962272644043,
- "num_tokens": 11249616.0,
- "step": 1257
- },
- {
- "epoch": 0.9559270516717325,
- "grad_norm": 1.7341015338897705,
- "learning_rate": 4.078990983655807e-06,
- "loss": 0.4621101915836334,
- "mean_token_accuracy": 0.8330386877059937,
- "num_tokens": 11258616.0,
- "step": 1258
- },
- {
- "epoch": 0.9566869300911854,
- "grad_norm": 1.8589727878570557,
- "learning_rate": 4.077366650754624e-06,
- "loss": 0.4031238555908203,
- "mean_token_accuracy": 0.842434287071228,
- "num_tokens": 11266006.0,
- "step": 1259
- },
- {
- "epoch": 0.9574468085106383,
- "grad_norm": 1.657175898551941,
- "learning_rate": 4.075741210796806e-06,
- "loss": 0.41686388850212097,
- "mean_token_accuracy": 0.8443650007247925,
- "num_tokens": 11275601.0,
- "step": 1260
- },
- {
- "epoch": 0.9582066869300911,
- "grad_norm": 2.4303717613220215,
- "learning_rate": 4.07411466492315e-06,
- "loss": 0.4554435610771179,
- "mean_token_accuracy": 0.853043794631958,
- "num_tokens": 11280650.0,
- "step": 1261
- },
- {
- "epoch": 0.958966565349544,
- "grad_norm": 2.3653745651245117,
- "learning_rate": 4.072487014275228e-06,
- "loss": 0.4304995536804199,
- "mean_token_accuracy": 0.8462260961532593,
- "num_tokens": 11285637.0,
- "step": 1262
- },
- {
- "epoch": 0.959726443768997,
- "grad_norm": 1.6689718961715698,
- "learning_rate": 4.070858259995388e-06,
- "loss": 0.5290807485580444,
- "mean_token_accuracy": 0.8176917433738708,
- "num_tokens": 11299110.0,
- "step": 1263
- },
- {
- "epoch": 0.9604863221884499,
- "grad_norm": 2.103879451751709,
- "learning_rate": 4.069228403226751e-06,
- "loss": 0.4620879888534546,
- "mean_token_accuracy": 0.835270345211029,
- "num_tokens": 11305564.0,
- "step": 1264
- },
- {
- "epoch": 0.9612462006079028,
- "grad_norm": 2.139012575149536,
- "learning_rate": 4.067597445113216e-06,
- "loss": 0.5143396258354187,
- "mean_token_accuracy": 0.8191739320755005,
- "num_tokens": 11311870.0,
- "step": 1265
- },
- {
- "epoch": 0.9620060790273556,
- "grad_norm": 1.3971210718154907,
- "learning_rate": 4.06596538679945e-06,
- "loss": 0.472080260515213,
- "mean_token_accuracy": 0.8321092128753662,
- "num_tokens": 11323970.0,
- "step": 1266
- },
- {
- "epoch": 0.9627659574468085,
- "grad_norm": 1.4965174198150635,
- "learning_rate": 4.064332229430895e-06,
- "loss": 0.359701007604599,
- "mean_token_accuracy": 0.8903120160102844,
- "num_tokens": 11333412.0,
- "step": 1267
- },
- {
- "epoch": 0.9635258358662614,
- "grad_norm": 1.1898726224899292,
- "learning_rate": 4.062697974153764e-06,
- "loss": 0.3423798084259033,
- "mean_token_accuracy": 0.8661491870880127,
- "num_tokens": 11347657.0,
- "step": 1268
- },
- {
- "epoch": 0.9642857142857143,
- "grad_norm": 1.4952168464660645,
- "learning_rate": 4.06106262211504e-06,
- "loss": 0.4214417338371277,
- "mean_token_accuracy": 0.8362159729003906,
- "num_tokens": 11357786.0,
- "step": 1269
- },
- {
- "epoch": 0.9650455927051672,
- "grad_norm": 1.7949583530426025,
- "learning_rate": 4.059426174462476e-06,
- "loss": 0.59087735414505,
- "mean_token_accuracy": 0.7965556979179382,
- "num_tokens": 11370561.0,
- "step": 1270
- },
- {
- "epoch": 0.96580547112462,
- "grad_norm": 1.8973214626312256,
- "learning_rate": 4.057788632344594e-06,
- "loss": 0.47525322437286377,
- "mean_token_accuracy": 0.8317365050315857,
- "num_tokens": 11378507.0,
- "step": 1271
- },
- {
- "epoch": 0.9665653495440729,
- "grad_norm": 1.8665250539779663,
- "learning_rate": 4.056149996910683e-06,
- "loss": 0.3537125587463379,
- "mean_token_accuracy": 0.8921569585800171,
- "num_tokens": 11385186.0,
- "step": 1272
- },
- {
- "epoch": 0.9673252279635258,
- "grad_norm": 1.5072317123413086,
- "learning_rate": 4.054510269310803e-06,
- "loss": 0.5145624876022339,
- "mean_token_accuracy": 0.8265488147735596,
- "num_tokens": 11397125.0,
- "step": 1273
- },
- {
- "epoch": 0.9680851063829787,
- "grad_norm": 1.520525574684143,
- "learning_rate": 4.052869450695776e-06,
- "loss": 0.44322293996810913,
- "mean_token_accuracy": 0.8403642177581787,
- "num_tokens": 11409919.0,
- "step": 1274
- },
- {
- "epoch": 0.9688449848024316,
- "grad_norm": 1.3764475584030151,
- "learning_rate": 4.051227542217192e-06,
- "loss": 0.5774400234222412,
- "mean_token_accuracy": 0.804118275642395,
- "num_tokens": 11425900.0,
- "step": 1275
- },
- {
- "epoch": 0.9696048632218845,
- "grad_norm": 1.3922648429870605,
- "learning_rate": 4.049584545027406e-06,
- "loss": 0.42727944254875183,
- "mean_token_accuracy": 0.8654505014419556,
- "num_tokens": 11438787.0,
- "step": 1276
- },
- {
- "epoch": 0.9703647416413373,
- "grad_norm": 1.8505840301513672,
- "learning_rate": 4.047940460279537e-06,
- "loss": 0.490803062915802,
- "mean_token_accuracy": 0.8340574502944946,
- "num_tokens": 11447997.0,
- "step": 1277
- },
- {
- "epoch": 0.9711246200607903,
- "grad_norm": 2.28271222114563,
- "learning_rate": 4.046295289127466e-06,
- "loss": 0.588828444480896,
- "mean_token_accuracy": 0.833497166633606,
- "num_tokens": 11454072.0,
- "step": 1278
- },
- {
- "epoch": 0.9718844984802432,
- "grad_norm": 2.4242560863494873,
- "learning_rate": 4.044649032725836e-06,
- "loss": 0.5128831267356873,
- "mean_token_accuracy": 0.8225122690200806,
- "num_tokens": 11460211.0,
- "step": 1279
- },
- {
- "epoch": 0.9726443768996961,
- "grad_norm": 2.1738455295562744,
- "learning_rate": 4.0430016922300566e-06,
- "loss": 0.441631942987442,
- "mean_token_accuracy": 0.841723620891571,
- "num_tokens": 11466814.0,
- "step": 1280
- },
- {
- "epoch": 0.973404255319149,
- "grad_norm": 2.541599988937378,
- "learning_rate": 4.0413532687962926e-06,
- "loss": 0.5062629580497742,
- "mean_token_accuracy": 0.8013502359390259,
- "num_tokens": 11472371.0,
- "step": 1281
- },
- {
- "epoch": 0.9741641337386018,
- "grad_norm": 2.8011014461517334,
- "learning_rate": 4.039703763581472e-06,
- "loss": 0.5061966776847839,
- "mean_token_accuracy": 0.829810380935669,
- "num_tokens": 11476672.0,
- "step": 1282
- },
- {
- "epoch": 0.9749240121580547,
- "grad_norm": 2.4505462646484375,
- "learning_rate": 4.038053177743279e-06,
- "loss": 0.43407535552978516,
- "mean_token_accuracy": 0.8428469896316528,
- "num_tokens": 11481297.0,
- "step": 1283
- },
- {
- "epoch": 0.9756838905775076,
- "grad_norm": 2.1618378162384033,
- "learning_rate": 4.036401512440161e-06,
- "loss": 0.6056663393974304,
- "mean_token_accuracy": 0.7977457642555237,
- "num_tokens": 11488657.0,
- "step": 1284
- },
- {
- "epoch": 0.9764437689969605,
- "grad_norm": 1.9192147254943848,
- "learning_rate": 4.034748768831319e-06,
- "loss": 0.524390697479248,
- "mean_token_accuracy": 0.8120636940002441,
- "num_tokens": 11496485.0,
- "step": 1285
- },
- {
- "epoch": 0.9772036474164134,
- "grad_norm": 2.766435384750366,
- "learning_rate": 4.033094948076713e-06,
- "loss": 0.5494908690452576,
- "mean_token_accuracy": 0.8141890168190002,
- "num_tokens": 11501341.0,
- "step": 1286
- },
- {
- "epoch": 0.9779635258358662,
- "grad_norm": 1.3519539833068848,
- "learning_rate": 4.031440051337056e-06,
- "loss": 0.4339691400527954,
- "mean_token_accuracy": 0.8400131464004517,
- "num_tokens": 11512843.0,
- "step": 1287
- },
- {
- "epoch": 0.9787234042553191,
- "grad_norm": 1.2492141723632812,
- "learning_rate": 4.02978407977382e-06,
- "loss": 0.4433518052101135,
- "mean_token_accuracy": 0.8432940244674683,
- "num_tokens": 11530227.0,
- "step": 1288
- },
- {
- "epoch": 0.979483282674772,
- "grad_norm": 1.6597715616226196,
- "learning_rate": 4.02812703454923e-06,
- "loss": 0.602222204208374,
- "mean_token_accuracy": 0.786965548992157,
- "num_tokens": 11543955.0,
- "step": 1289
- },
- {
- "epoch": 0.9802431610942249,
- "grad_norm": 1.6621816158294678,
- "learning_rate": 4.026468916826262e-06,
- "loss": 0.35662174224853516,
- "mean_token_accuracy": 0.8716133832931519,
- "num_tokens": 11552064.0,
- "step": 1290
- },
- {
- "epoch": 0.9810030395136778,
- "grad_norm": 4.539844989776611,
- "learning_rate": 4.024809727768648e-06,
- "loss": 0.543423593044281,
- "mean_token_accuracy": 0.8293194770812988,
- "num_tokens": 11555595.0,
- "step": 1291
- },
- {
- "epoch": 0.9817629179331308,
- "grad_norm": 1.4026556015014648,
- "learning_rate": 4.023149468540871e-06,
- "loss": 0.4301237165927887,
- "mean_token_accuracy": 0.8358224630355835,
- "num_tokens": 11572275.0,
- "step": 1292
- },
- {
- "epoch": 0.9825227963525835,
- "grad_norm": 1.611262321472168,
- "learning_rate": 4.021488140308165e-06,
- "loss": 0.5378580689430237,
- "mean_token_accuracy": 0.8173760771751404,
- "num_tokens": 11584299.0,
- "step": 1293
- },
- {
- "epoch": 0.9832826747720365,
- "grad_norm": 4.138631820678711,
- "learning_rate": 4.019825744236514e-06,
- "loss": 0.40272149443626404,
- "mean_token_accuracy": 0.8648844957351685,
- "num_tokens": 11586705.0,
- "step": 1294
- },
- {
- "epoch": 0.9840425531914894,
- "grad_norm": 3.177703619003296,
- "learning_rate": 4.018162281492651e-06,
- "loss": 0.5320103168487549,
- "mean_token_accuracy": 0.8250276446342468,
- "num_tokens": 11590689.0,
- "step": 1295
- },
- {
- "epoch": 0.9848024316109423,
- "grad_norm": 2.727597713470459,
- "learning_rate": 4.016497753244058e-06,
- "loss": 0.5662774443626404,
- "mean_token_accuracy": 0.8074625730514526,
- "num_tokens": 11596092.0,
- "step": 1296
- },
- {
- "epoch": 0.9855623100303952,
- "grad_norm": 1.485139012336731,
- "learning_rate": 4.014832160658966e-06,
- "loss": 0.5414972305297852,
- "mean_token_accuracy": 0.8082696199417114,
- "num_tokens": 11613785.0,
- "step": 1297
- },
- {
- "epoch": 0.986322188449848,
- "grad_norm": 2.4025990962982178,
- "learning_rate": 4.013165504906352e-06,
- "loss": 0.6556503772735596,
- "mean_token_accuracy": 0.7785214781761169,
- "num_tokens": 11620421.0,
- "step": 1298
- },
- {
- "epoch": 0.9870820668693009,
- "grad_norm": 1.878273606300354,
- "learning_rate": 4.011497787155938e-06,
- "loss": 0.4221133887767792,
- "mean_token_accuracy": 0.850035548210144,
- "num_tokens": 11627998.0,
- "step": 1299
- },
- {
- "epoch": 0.9878419452887538,
- "grad_norm": 2.0430715084075928,
- "learning_rate": 4.009829008578192e-06,
- "loss": 0.5205984711647034,
- "mean_token_accuracy": 0.819183349609375,
- "num_tokens": 11636279.0,
- "step": 1300
- },
- {
- "epoch": 0.9886018237082067,
- "grad_norm": 3.4769439697265625,
- "learning_rate": 4.00815917034433e-06,
- "loss": 0.5449948310852051,
- "mean_token_accuracy": 0.8240023851394653,
- "num_tokens": 11639638.0,
- "step": 1301
- },
- {
- "epoch": 0.9893617021276596,
- "grad_norm": 2.4783987998962402,
- "learning_rate": 4.006488273626307e-06,
- "loss": 0.4316832423210144,
- "mean_token_accuracy": 0.8474695086479187,
- "num_tokens": 11645463.0,
- "step": 1302
- },
- {
- "epoch": 0.9901215805471124,
- "grad_norm": 1.881475567817688,
- "learning_rate": 4.004816319596822e-06,
- "loss": 0.5157331824302673,
- "mean_token_accuracy": 0.826042652130127,
- "num_tokens": 11653955.0,
- "step": 1303
- },
- {
- "epoch": 0.9908814589665653,
- "grad_norm": 2.6569254398345947,
- "learning_rate": 4.003143309429317e-06,
- "loss": 0.46492767333984375,
- "mean_token_accuracy": 0.8320850133895874,
- "num_tokens": 11659357.0,
- "step": 1304
- },
- {
- "epoch": 0.9916413373860182,
- "grad_norm": 2.4917593002319336,
- "learning_rate": 4.0014692442979756e-06,
- "loss": 0.459585040807724,
- "mean_token_accuracy": 0.8457611799240112,
- "num_tokens": 11664207.0,
- "step": 1305
- },
- {
- "epoch": 0.9924012158054711,
- "grad_norm": 2.6885526180267334,
- "learning_rate": 3.999794125377721e-06,
- "loss": 0.4677402973175049,
- "mean_token_accuracy": 0.8307361602783203,
- "num_tokens": 11668879.0,
- "step": 1306
- },
- {
- "epoch": 0.993161094224924,
- "grad_norm": 1.9737319946289062,
- "learning_rate": 3.998117953844215e-06,
- "loss": 0.44684839248657227,
- "mean_token_accuracy": 0.8367687463760376,
- "num_tokens": 11676081.0,
- "step": 1307
- },
- {
- "epoch": 0.993920972644377,
- "grad_norm": 1.4333021640777588,
- "learning_rate": 3.996440730873861e-06,
- "loss": 0.526146650314331,
- "mean_token_accuracy": 0.816251814365387,
- "num_tokens": 11689333.0,
- "step": 1308
- },
- {
- "epoch": 0.9946808510638298,
- "grad_norm": 1.3689230680465698,
- "learning_rate": 3.9947624576437975e-06,
- "loss": 0.40214329957962036,
- "mean_token_accuracy": 0.8610327839851379,
- "num_tokens": 11701540.0,
- "step": 1309
- },
- {
- "epoch": 0.9954407294832827,
- "grad_norm": 1.2435375452041626,
- "learning_rate": 3.9930831353319025e-06,
- "loss": 0.4532913267612457,
- "mean_token_accuracy": 0.8415389060974121,
- "num_tokens": 11717920.0,
- "step": 1310
- },
- {
- "epoch": 0.9962006079027356,
- "grad_norm": 1.9968011379241943,
- "learning_rate": 3.9914027651167866e-06,
- "loss": 0.46954160928726196,
- "mean_token_accuracy": 0.8351103663444519,
- "num_tokens": 11724999.0,
- "step": 1311
- },
- {
- "epoch": 0.9969604863221885,
- "grad_norm": 1.9521311521530151,
- "learning_rate": 3.989721348177801e-06,
- "loss": 0.5068016052246094,
- "mean_token_accuracy": 0.8220845460891724,
- "num_tokens": 11732569.0,
- "step": 1312
- },
- {
- "epoch": 0.9977203647416414,
- "grad_norm": 2.7332582473754883,
- "learning_rate": 3.988038885695028e-06,
- "loss": 0.4154692590236664,
- "mean_token_accuracy": 0.8493857383728027,
- "num_tokens": 11736759.0,
- "step": 1313
- },
- {
- "epoch": 0.9984802431610942,
- "grad_norm": 1.8656952381134033,
- "learning_rate": 3.986355378849284e-06,
- "loss": 0.4151354134082794,
- "mean_token_accuracy": 0.83440101146698,
- "num_tokens": 11743827.0,
- "step": 1314
- },
- {
- "epoch": 0.9992401215805471,
- "grad_norm": 1.304006576538086,
- "learning_rate": 3.984670828822118e-06,
- "loss": 0.4926128089427948,
- "mean_token_accuracy": 0.8603005409240723,
- "num_tokens": 11757707.0,
- "step": 1315
- },
- {
- "epoch": 1.0,
- "grad_norm": 1.497079610824585,
- "learning_rate": 3.982985236795815e-06,
- "loss": 0.43342477083206177,
- "mean_token_accuracy": 0.8550825119018555,
- "num_tokens": 11769678.0,
- "step": 1316
- },
- {
- "epoch": 1.000759878419453,
- "grad_norm": 2.870274543762207,
- "learning_rate": 3.981298603953385e-06,
- "loss": 0.3723528981208801,
- "mean_token_accuracy": 0.8745899796485901,
- "num_tokens": 11773290.0,
- "step": 1317
- },
- {
- "epoch": 1.0015197568389058,
- "grad_norm": 1.3442503213882446,
- "learning_rate": 3.979610931478574e-06,
- "loss": 0.34688329696655273,
- "mean_token_accuracy": 0.8749074935913086,
- "num_tokens": 11786400.0,
- "step": 1318
- },
- {
- "epoch": 1.0022796352583587,
- "grad_norm": 1.7272238731384277,
- "learning_rate": 3.977922220555855e-06,
- "loss": 0.28274932503700256,
- "mean_token_accuracy": 0.896713137626648,
- "num_tokens": 11793059.0,
- "step": 1319
- },
- {
- "epoch": 1.0030395136778116,
- "grad_norm": 1.7362451553344727,
- "learning_rate": 3.976232472370431e-06,
- "loss": 0.5494794845581055,
- "mean_token_accuracy": 0.8341718912124634,
- "num_tokens": 11802593.0,
- "step": 1320
- },
- {
- "epoch": 1.0037993920972645,
- "grad_norm": 1.3316494226455688,
- "learning_rate": 3.97454168810823e-06,
- "loss": 0.41505366563796997,
- "mean_token_accuracy": 0.8581969738006592,
- "num_tokens": 11813925.0,
- "step": 1321
- },
- {
- "epoch": 1.0045592705167172,
- "grad_norm": 1.6152615547180176,
- "learning_rate": 3.972849868955913e-06,
- "loss": 0.44761013984680176,
- "mean_token_accuracy": 0.8413045406341553,
- "num_tokens": 11825709.0,
- "step": 1322
- },
- {
- "epoch": 1.0053191489361701,
- "grad_norm": 2.1172471046447754,
- "learning_rate": 3.97115701610086e-06,
- "loss": 0.3903353810310364,
- "mean_token_accuracy": 0.8662760257720947,
- "num_tokens": 11832070.0,
- "step": 1323
- },
- {
- "epoch": 1.006079027355623,
- "grad_norm": 1.5923868417739868,
- "learning_rate": 3.969463130731183e-06,
- "loss": 0.4491051137447357,
- "mean_token_accuracy": 0.8677828311920166,
- "num_tokens": 11843154.0,
- "step": 1324
- },
- {
- "epoch": 1.006838905775076,
- "grad_norm": 1.6848995685577393,
- "learning_rate": 3.967768214035716e-06,
- "loss": 0.45765817165374756,
- "mean_token_accuracy": 0.8401060104370117,
- "num_tokens": 11854826.0,
- "step": 1325
- },
- {
- "epoch": 1.0075987841945289,
- "grad_norm": 2.3739020824432373,
- "learning_rate": 3.966072267204014e-06,
- "loss": 0.4482722580432892,
- "mean_token_accuracy": 0.8368916511535645,
- "num_tokens": 11860559.0,
- "step": 1326
- },
- {
- "epoch": 1.0083586626139818,
- "grad_norm": 1.5403034687042236,
- "learning_rate": 3.964375291426361e-06,
- "loss": 0.35589972138404846,
- "mean_token_accuracy": 0.8728118538856506,
- "num_tokens": 11871959.0,
- "step": 1327
- },
- {
- "epoch": 1.0091185410334347,
- "grad_norm": 1.6750119924545288,
- "learning_rate": 3.962677287893758e-06,
- "loss": 0.35873427987098694,
- "mean_token_accuracy": 0.9027186632156372,
- "num_tokens": 11881818.0,
- "step": 1328
- },
- {
- "epoch": 1.0098784194528876,
- "grad_norm": 1.5489170551300049,
- "learning_rate": 3.9609782577979305e-06,
- "loss": 0.3634672462940216,
- "mean_token_accuracy": 0.8582607507705688,
- "num_tokens": 11891084.0,
- "step": 1329
- },
- {
- "epoch": 1.0106382978723405,
- "grad_norm": 2.43859601020813,
- "learning_rate": 3.959278202331323e-06,
- "loss": 0.3640799820423126,
- "mean_token_accuracy": 0.88062584400177,
- "num_tokens": 11896032.0,
- "step": 1330
- },
- {
- "epoch": 1.0113981762917934,
- "grad_norm": 3.612184524536133,
- "learning_rate": 3.9575771226870986e-06,
- "loss": 0.3733130097389221,
- "mean_token_accuracy": 0.8946067094802856,
- "num_tokens": 11899479.0,
- "step": 1331
- },
- {
- "epoch": 1.012158054711246,
- "grad_norm": 1.541355848312378,
- "learning_rate": 3.955875020059141e-06,
- "loss": 0.320593923330307,
- "mean_token_accuracy": 0.9057406783103943,
- "num_tokens": 11910179.0,
- "step": 1332
- },
- {
- "epoch": 1.012917933130699,
- "grad_norm": 2.0565030574798584,
- "learning_rate": 3.954171895642052e-06,
- "loss": 0.3341682553291321,
- "mean_token_accuracy": 0.8829344511032104,
- "num_tokens": 11916489.0,
- "step": 1333
- },
- {
- "epoch": 1.013677811550152,
- "grad_norm": 2.9732539653778076,
- "learning_rate": 3.9524677506311505e-06,
- "loss": 0.38488566875457764,
- "mean_token_accuracy": 0.8752974271774292,
- "num_tokens": 11920682.0,
- "step": 1334
- },
- {
- "epoch": 1.0144376899696048,
- "grad_norm": 2.7697458267211914,
- "learning_rate": 3.950762586222469e-06,
- "loss": 0.39864760637283325,
- "mean_token_accuracy": 0.8593167662620544,
- "num_tokens": 11925233.0,
- "step": 1335
- },
- {
- "epoch": 1.0151975683890577,
- "grad_norm": 2.2302119731903076,
- "learning_rate": 3.949056403612758e-06,
- "loss": 0.3985682725906372,
- "mean_token_accuracy": 0.8677899837493896,
- "num_tokens": 11932000.0,
- "step": 1336
- },
- {
- "epoch": 1.0159574468085106,
- "grad_norm": 2.360572576522827,
- "learning_rate": 3.947349203999485e-06,
- "loss": 0.36940714716911316,
- "mean_token_accuracy": 0.8760676383972168,
- "num_tokens": 11937569.0,
- "step": 1337
- },
- {
- "epoch": 1.0167173252279635,
- "grad_norm": 1.3383921384811401,
- "learning_rate": 3.945640988580824e-06,
- "loss": 0.40628793835639954,
- "mean_token_accuracy": 0.866442084312439,
- "num_tokens": 11955679.0,
- "step": 1338
- },
- {
- "epoch": 1.0174772036474165,
- "grad_norm": 2.1502623558044434,
- "learning_rate": 3.943931758555669e-06,
- "loss": 0.4493565559387207,
- "mean_token_accuracy": 0.8307522535324097,
- "num_tokens": 11962734.0,
- "step": 1339
- },
- {
- "epoch": 1.0182370820668694,
- "grad_norm": 2.4737331867218018,
- "learning_rate": 3.942221515123624e-06,
- "loss": 0.28508758544921875,
- "mean_token_accuracy": 0.8967142105102539,
- "num_tokens": 11967783.0,
- "step": 1340
- },
- {
- "epoch": 1.0189969604863223,
- "grad_norm": 2.4525370597839355,
- "learning_rate": 3.940510259485002e-06,
- "loss": 0.40227818489074707,
- "mean_token_accuracy": 0.8618967533111572,
- "num_tokens": 11972918.0,
- "step": 1341
- },
- {
- "epoch": 1.0197568389057752,
- "grad_norm": 1.7299731969833374,
- "learning_rate": 3.938797992840828e-06,
- "loss": 0.26339593529701233,
- "mean_token_accuracy": 0.9004406929016113,
- "num_tokens": 11981250.0,
- "step": 1342
- },
- {
- "epoch": 1.0205167173252279,
- "grad_norm": 2.8756747245788574,
- "learning_rate": 3.937084716392839e-06,
- "loss": 0.47792482376098633,
- "mean_token_accuracy": 0.8440839052200317,
- "num_tokens": 11986356.0,
- "step": 1343
- },
- {
- "epoch": 1.0212765957446808,
- "grad_norm": 2.104473114013672,
- "learning_rate": 3.935370431343475e-06,
- "loss": 0.36723971366882324,
- "mean_token_accuracy": 0.8831232786178589,
- "num_tokens": 11994495.0,
- "step": 1344
- },
- {
- "epoch": 1.0220364741641337,
- "grad_norm": 1.9173074960708618,
- "learning_rate": 3.933655138895889e-06,
- "loss": 0.409319669008255,
- "mean_token_accuracy": 0.8632645606994629,
- "num_tokens": 12002060.0,
- "step": 1345
- },
- {
- "epoch": 1.0227963525835866,
- "grad_norm": 2.958311080932617,
- "learning_rate": 3.9319388402539395e-06,
- "loss": 0.5390093922615051,
- "mean_token_accuracy": 0.8204828500747681,
- "num_tokens": 12007588.0,
- "step": 1346
- },
- {
- "epoch": 1.0235562310030395,
- "grad_norm": 1.6470831632614136,
- "learning_rate": 3.930221536622192e-06,
- "loss": 0.4524633288383484,
- "mean_token_accuracy": 0.8516575694084167,
- "num_tokens": 12018831.0,
- "step": 1347
- },
- {
- "epoch": 1.0243161094224924,
- "grad_norm": 1.3160780668258667,
- "learning_rate": 3.928503229205913e-06,
- "loss": 0.4180558919906616,
- "mean_token_accuracy": 0.8495022058486938,
- "num_tokens": 12033947.0,
- "step": 1348
- },
- {
- "epoch": 1.0250759878419453,
- "grad_norm": 1.9686089754104614,
- "learning_rate": 3.92678391921108e-06,
- "loss": 0.41927334666252136,
- "mean_token_accuracy": 0.8462997674942017,
- "num_tokens": 12042005.0,
- "step": 1349
- },
- {
- "epoch": 1.0258358662613982,
- "grad_norm": 2.351778507232666,
- "learning_rate": 3.92506360784437e-06,
- "loss": 0.2946245074272156,
- "mean_token_accuracy": 0.9170923233032227,
- "num_tokens": 12046579.0,
- "step": 1350
- },
- {
- "epoch": 1.0265957446808511,
- "grad_norm": 2.0636913776397705,
- "learning_rate": 3.923342296313162e-06,
- "loss": 0.3422774076461792,
- "mean_token_accuracy": 0.8809213638305664,
- "num_tokens": 12053214.0,
- "step": 1351
- },
- {
- "epoch": 1.027355623100304,
- "grad_norm": 1.7272592782974243,
- "learning_rate": 3.92161998582554e-06,
- "loss": 0.5864541530609131,
- "mean_token_accuracy": 0.7986117601394653,
- "num_tokens": 12068522.0,
- "step": 1352
- },
- {
- "epoch": 1.028115501519757,
- "grad_norm": 0.8980231881141663,
- "learning_rate": 3.919896677590289e-06,
- "loss": 0.2964550256729126,
- "mean_token_accuracy": 0.8911845088005066,
- "num_tokens": 12093834.0,
- "step": 1353
- },
- {
- "epoch": 1.0288753799392096,
- "grad_norm": 1.6031712293624878,
- "learning_rate": 3.918172372816892e-06,
- "loss": 0.37254488468170166,
- "mean_token_accuracy": 0.8615843057632446,
- "num_tokens": 12104393.0,
- "step": 1354
- },
- {
- "epoch": 1.0296352583586625,
- "grad_norm": 1.282134771347046,
- "learning_rate": 3.916447072715531e-06,
- "loss": 0.3522927761077881,
- "mean_token_accuracy": 0.8713657259941101,
- "num_tokens": 12118671.0,
- "step": 1355
- },
- {
- "epoch": 1.0303951367781155,
- "grad_norm": 2.1986680030822754,
- "learning_rate": 3.914720778497091e-06,
- "loss": 0.3716316223144531,
- "mean_token_accuracy": 0.8661249279975891,
- "num_tokens": 12125178.0,
- "step": 1356
- },
- {
- "epoch": 1.0311550151975684,
- "grad_norm": 1.5937882661819458,
- "learning_rate": 3.91299349137315e-06,
- "loss": 0.48067355155944824,
- "mean_token_accuracy": 0.8284252882003784,
- "num_tokens": 12136785.0,
- "step": 1357
- },
- {
- "epoch": 1.0319148936170213,
- "grad_norm": 1.6743099689483643,
- "learning_rate": 3.9112652125559845e-06,
- "loss": 0.4461551308631897,
- "mean_token_accuracy": 0.8381845355033875,
- "num_tokens": 12150066.0,
- "step": 1358
- },
- {
- "epoch": 1.0326747720364742,
- "grad_norm": 2.2346715927124023,
- "learning_rate": 3.909535943258567e-06,
- "loss": 0.3148220181465149,
- "mean_token_accuracy": 0.8797591924667358,
- "num_tokens": 12155506.0,
- "step": 1359
- },
- {
- "epoch": 1.033434650455927,
- "grad_norm": 1.9608992338180542,
- "learning_rate": 3.907805684694567e-06,
- "loss": 0.32598960399627686,
- "mean_token_accuracy": 0.8819410800933838,
- "num_tokens": 12163261.0,
- "step": 1360
- },
- {
- "epoch": 1.03419452887538,
- "grad_norm": 2.413477897644043,
- "learning_rate": 3.906074438078343e-06,
- "loss": 0.38179588317871094,
- "mean_token_accuracy": 0.8739585876464844,
- "num_tokens": 12169254.0,
- "step": 1361
- },
- {
- "epoch": 1.034954407294833,
- "grad_norm": 2.0258278846740723,
- "learning_rate": 3.904342204624955e-06,
- "loss": 0.33240315318107605,
- "mean_token_accuracy": 0.8808181285858154,
- "num_tokens": 12175379.0,
- "step": 1362
- },
- {
- "epoch": 1.0357142857142858,
- "grad_norm": 2.4111437797546387,
- "learning_rate": 3.9026089855501475e-06,
- "loss": 0.412802517414093,
- "mean_token_accuracy": 0.8504396677017212,
- "num_tokens": 12182007.0,
- "step": 1363
- },
- {
- "epoch": 1.0364741641337385,
- "grad_norm": 2.0424840450286865,
- "learning_rate": 3.900874782070362e-06,
- "loss": 0.2914797067642212,
- "mean_token_accuracy": 0.8731886148452759,
- "num_tokens": 12187743.0,
- "step": 1364
- },
- {
- "epoch": 1.0372340425531914,
- "grad_norm": 2.9248716831207275,
- "learning_rate": 3.899139595402729e-06,
- "loss": 0.34071338176727295,
- "mean_token_accuracy": 0.8736443519592285,
- "num_tokens": 12191830.0,
- "step": 1365
- },
- {
- "epoch": 1.0379939209726443,
- "grad_norm": 2.240220785140991,
- "learning_rate": 3.8974034267650695e-06,
- "loss": 0.23049014806747437,
- "mean_token_accuracy": 0.9000070691108704,
- "num_tokens": 12196460.0,
- "step": 1366
- },
- {
- "epoch": 1.0387537993920972,
- "grad_norm": 1.5038460493087769,
- "learning_rate": 3.895666277375892e-06,
- "loss": 0.32255327701568604,
- "mean_token_accuracy": 0.873004674911499,
- "num_tokens": 12206230.0,
- "step": 1367
- },
- {
- "epoch": 1.0395136778115501,
- "grad_norm": 1.2339142560958862,
- "learning_rate": 3.893928148454398e-06,
- "loss": 0.4069131314754486,
- "mean_token_accuracy": 0.8461740016937256,
- "num_tokens": 12226502.0,
- "step": 1368
- },
- {
- "epoch": 1.040273556231003,
- "grad_norm": 2.531553268432617,
- "learning_rate": 3.89218904122047e-06,
- "loss": 0.43681037425994873,
- "mean_token_accuracy": 0.8497104048728943,
- "num_tokens": 12232241.0,
- "step": 1369
- },
- {
- "epoch": 1.041033434650456,
- "grad_norm": 3.8404815196990967,
- "learning_rate": 3.890448956894682e-06,
- "loss": 0.3241814970970154,
- "mean_token_accuracy": 0.884732723236084,
- "num_tokens": 12235126.0,
- "step": 1370
- },
- {
- "epoch": 1.0417933130699089,
- "grad_norm": 2.9608030319213867,
- "learning_rate": 3.888707896698293e-06,
- "loss": 0.4641021490097046,
- "mean_token_accuracy": 0.8496800661087036,
- "num_tokens": 12240630.0,
- "step": 1371
- },
- {
- "epoch": 1.0425531914893618,
- "grad_norm": 2.1166417598724365,
- "learning_rate": 3.886965861853243e-06,
- "loss": 0.42038479447364807,
- "mean_token_accuracy": 0.8512747287750244,
- "num_tokens": 12247969.0,
- "step": 1372
- },
- {
- "epoch": 1.0433130699088147,
- "grad_norm": 2.5918161869049072,
- "learning_rate": 3.885222853582163e-06,
- "loss": 0.2871917188167572,
- "mean_token_accuracy": 0.9129709601402283,
- "num_tokens": 12252161.0,
- "step": 1373
- },
- {
- "epoch": 1.0440729483282676,
- "grad_norm": 2.4261348247528076,
- "learning_rate": 3.88347887310836e-06,
- "loss": 0.4003123342990875,
- "mean_token_accuracy": 0.8570356369018555,
- "num_tokens": 12258135.0,
- "step": 1374
- },
- {
- "epoch": 1.0448328267477203,
- "grad_norm": 1.3439548015594482,
- "learning_rate": 3.881733921655829e-06,
- "loss": 0.3278140425682068,
- "mean_token_accuracy": 0.8831373453140259,
- "num_tokens": 12272849.0,
- "step": 1375
- },
- {
- "epoch": 1.0455927051671732,
- "grad_norm": 1.527989387512207,
- "learning_rate": 3.879988000449243e-06,
- "loss": 0.33789363503456116,
- "mean_token_accuracy": 0.8825669884681702,
- "num_tokens": 12283281.0,
- "step": 1376
- },
- {
- "epoch": 1.046352583586626,
- "grad_norm": 1.6755503416061401,
- "learning_rate": 3.878241110713957e-06,
- "loss": 0.4816160798072815,
- "mean_token_accuracy": 0.8193758726119995,
- "num_tokens": 12295422.0,
- "step": 1377
- },
- {
- "epoch": 1.047112462006079,
- "grad_norm": 2.8110361099243164,
- "learning_rate": 3.876493253676004e-06,
- "loss": 0.38662949204444885,
- "mean_token_accuracy": 0.8611986637115479,
- "num_tokens": 12299806.0,
- "step": 1378
- },
- {
- "epoch": 1.047872340425532,
- "grad_norm": 1.86097252368927,
- "learning_rate": 3.8747444305621e-06,
- "loss": 0.27612629532814026,
- "mean_token_accuracy": 0.8984048366546631,
- "num_tokens": 12306599.0,
- "step": 1379
- },
- {
- "epoch": 1.0486322188449848,
- "grad_norm": 2.361828565597534,
- "learning_rate": 3.872994642599635e-06,
- "loss": 0.469953715801239,
- "mean_token_accuracy": 0.8464452028274536,
- "num_tokens": 12314249.0,
- "step": 1380
- },
- {
- "epoch": 1.0493920972644377,
- "grad_norm": 1.9524794816970825,
- "learning_rate": 3.871243891016676e-06,
- "loss": 0.5419625043869019,
- "mean_token_accuracy": 0.8468329906463623,
- "num_tokens": 12324987.0,
- "step": 1381
- },
- {
- "epoch": 1.0501519756838906,
- "grad_norm": 1.6931511163711548,
- "learning_rate": 3.869492177041971e-06,
- "loss": 0.3791416883468628,
- "mean_token_accuracy": 0.8692882061004639,
- "num_tokens": 12336864.0,
- "step": 1382
- },
- {
- "epoch": 1.0509118541033435,
- "grad_norm": 1.909692406654358,
- "learning_rate": 3.867739501904938e-06,
- "loss": 0.27974557876586914,
- "mean_token_accuracy": 0.9004636406898499,
- "num_tokens": 12343093.0,
- "step": 1383
- },
- {
- "epoch": 1.0516717325227964,
- "grad_norm": 1.415162205696106,
- "learning_rate": 3.8659858668356735e-06,
- "loss": 0.38928335905075073,
- "mean_token_accuracy": 0.8491984009742737,
- "num_tokens": 12356613.0,
- "step": 1384
- },
- {
- "epoch": 1.0524316109422491,
- "grad_norm": 1.8195741176605225,
- "learning_rate": 3.864231273064944e-06,
- "loss": 0.3798758089542389,
- "mean_token_accuracy": 0.8728072047233582,
- "num_tokens": 12364860.0,
- "step": 1385
- },
- {
- "epoch": 1.053191489361702,
- "grad_norm": 1.8481454849243164,
- "learning_rate": 3.862475721824193e-06,
- "loss": 0.269635945558548,
- "mean_token_accuracy": 0.899247407913208,
- "num_tokens": 12371841.0,
- "step": 1386
- },
- {
- "epoch": 1.053951367781155,
- "grad_norm": 1.7838784456253052,
- "learning_rate": 3.8607192143455325e-06,
- "loss": 0.36971768736839294,
- "mean_token_accuracy": 0.8833638429641724,
- "num_tokens": 12380685.0,
- "step": 1387
- },
- {
- "epoch": 1.0547112462006079,
- "grad_norm": 1.333358645439148,
- "learning_rate": 3.858961751861748e-06,
- "loss": 0.4039418399333954,
- "mean_token_accuracy": 0.8541078567504883,
- "num_tokens": 12394072.0,
- "step": 1388
- },
- {
- "epoch": 1.0554711246200608,
- "grad_norm": 2.1600265502929688,
- "learning_rate": 3.857203335606294e-06,
- "loss": 0.38211894035339355,
- "mean_token_accuracy": 0.8549972772598267,
- "num_tokens": 12400449.0,
- "step": 1389
- },
- {
- "epoch": 1.0562310030395137,
- "grad_norm": 2.914902687072754,
- "learning_rate": 3.855443966813295e-06,
- "loss": 0.2237374186515808,
- "mean_token_accuracy": 0.9253600835800171,
- "num_tokens": 12403758.0,
- "step": 1390
- },
- {
- "epoch": 1.0569908814589666,
- "grad_norm": 2.2361080646514893,
- "learning_rate": 3.853683646717543e-06,
- "loss": 0.3359566926956177,
- "mean_token_accuracy": 0.898173451423645,
- "num_tokens": 12410374.0,
- "step": 1391
- },
- {
- "epoch": 1.0577507598784195,
- "grad_norm": 2.3639304637908936,
- "learning_rate": 3.8519223765544985e-06,
- "loss": 0.3844943046569824,
- "mean_token_accuracy": 0.863599419593811,
- "num_tokens": 12416016.0,
- "step": 1392
- },
- {
- "epoch": 1.0585106382978724,
- "grad_norm": 2.202971935272217,
- "learning_rate": 3.85016015756029e-06,
- "loss": 0.3546281158924103,
- "mean_token_accuracy": 0.8907540440559387,
- "num_tokens": 12422026.0,
- "step": 1393
- },
- {
- "epoch": 1.0592705167173253,
- "grad_norm": 1.1279661655426025,
- "learning_rate": 3.848396990971709e-06,
- "loss": 0.31522464752197266,
- "mean_token_accuracy": 0.8662257194519043,
- "num_tokens": 12439964.0,
- "step": 1394
- },
- {
- "epoch": 1.0600303951367782,
- "grad_norm": 2.4731740951538086,
- "learning_rate": 3.846632878026214e-06,
- "loss": 0.456442266702652,
- "mean_token_accuracy": 0.8516958951950073,
- "num_tokens": 12446231.0,
- "step": 1395
- },
- {
- "epoch": 1.060790273556231,
- "grad_norm": 1.7631878852844238,
- "learning_rate": 3.844867819961928e-06,
- "loss": 0.487227201461792,
- "mean_token_accuracy": 0.8466947078704834,
- "num_tokens": 12459989.0,
- "step": 1396
- },
- {
- "epoch": 1.0615501519756838,
- "grad_norm": 2.4468278884887695,
- "learning_rate": 3.843101818017637e-06,
- "loss": 0.3367291986942291,
- "mean_token_accuracy": 0.8734689950942993,
- "num_tokens": 12465741.0,
- "step": 1397
- },
- {
- "epoch": 1.0623100303951367,
- "grad_norm": 1.9045145511627197,
- "learning_rate": 3.841334873432789e-06,
- "loss": 0.4652615487575531,
- "mean_token_accuracy": 0.8333107233047485,
- "num_tokens": 12474963.0,
- "step": 1398
- },
- {
- "epoch": 1.0630699088145896,
- "grad_norm": 1.6816917657852173,
- "learning_rate": 3.839566987447492e-06,
- "loss": 0.4144279956817627,
- "mean_token_accuracy": 0.8472539186477661,
- "num_tokens": 12485521.0,
- "step": 1399
- },
- {
- "epoch": 1.0638297872340425,
- "grad_norm": 1.8990092277526855,
- "learning_rate": 3.837798161302518e-06,
- "loss": 0.4040985405445099,
- "mean_token_accuracy": 0.8514704704284668,
- "num_tokens": 12493495.0,
- "step": 1400
- },
- {
- "epoch": 1.0645896656534954,
- "grad_norm": 2.27785325050354,
- "learning_rate": 3.836028396239297e-06,
- "loss": 0.43425723910331726,
- "mean_token_accuracy": 0.8795069456100464,
- "num_tokens": 12499789.0,
- "step": 1401
- },
- {
- "epoch": 1.0653495440729484,
- "grad_norm": 2.5130882263183594,
- "learning_rate": 3.8342576934999184e-06,
- "loss": 0.33892524242401123,
- "mean_token_accuracy": 0.8717449903488159,
- "num_tokens": 12504885.0,
- "step": 1402
- },
- {
- "epoch": 1.0661094224924013,
- "grad_norm": 2.650040864944458,
- "learning_rate": 3.832486054327131e-06,
- "loss": 0.4200317859649658,
- "mean_token_accuracy": 0.8616159558296204,
- "num_tokens": 12509783.0,
- "step": 1403
- },
- {
- "epoch": 1.0668693009118542,
- "grad_norm": 2.9176881313323975,
- "learning_rate": 3.830713479964335e-06,
- "loss": 0.37018489837646484,
- "mean_token_accuracy": 0.8676021695137024,
- "num_tokens": 12514441.0,
- "step": 1404
- },
- {
- "epoch": 1.067629179331307,
- "grad_norm": 1.6430318355560303,
- "learning_rate": 3.828939971655595e-06,
- "loss": 0.27539193630218506,
- "mean_token_accuracy": 0.9077831506729126,
- "num_tokens": 12523677.0,
- "step": 1405
- },
- {
- "epoch": 1.06838905775076,
- "grad_norm": 1.3683708906173706,
- "learning_rate": 3.827165530645627e-06,
- "loss": 0.4085099697113037,
- "mean_token_accuracy": 0.8579255938529968,
- "num_tokens": 12540104.0,
- "step": 1406
- },
- {
- "epoch": 1.0691489361702127,
- "grad_norm": 2.528465747833252,
- "learning_rate": 3.825390158179802e-06,
- "loss": 0.42462456226348877,
- "mean_token_accuracy": 0.852813720703125,
- "num_tokens": 12548239.0,
- "step": 1407
- },
- {
- "epoch": 1.0699088145896656,
- "grad_norm": 1.8288795948028564,
- "learning_rate": 3.823613855504144e-06,
- "loss": 0.412417471408844,
- "mean_token_accuracy": 0.8622130751609802,
- "num_tokens": 12557316.0,
- "step": 1408
- },
- {
- "epoch": 1.0706686930091185,
- "grad_norm": 2.341794490814209,
- "learning_rate": 3.82183662386533e-06,
- "loss": 0.2996668815612793,
- "mean_token_accuracy": 0.8964041471481323,
- "num_tokens": 12562377.0,
- "step": 1409
- },
- {
- "epoch": 1.0714285714285714,
- "grad_norm": 2.555877208709717,
- "learning_rate": 3.82005846451069e-06,
- "loss": 0.4184221625328064,
- "mean_token_accuracy": 0.8678828477859497,
- "num_tokens": 12568516.0,
- "step": 1410
- },
- {
- "epoch": 1.0721884498480243,
- "grad_norm": 2.081308126449585,
- "learning_rate": 3.8182793786882065e-06,
- "loss": 0.4376835823059082,
- "mean_token_accuracy": 0.8409077525138855,
- "num_tokens": 12576598.0,
- "step": 1411
- },
- {
- "epoch": 1.0729483282674772,
- "grad_norm": 2.0272316932678223,
- "learning_rate": 3.816499367646508e-06,
- "loss": 0.3630060851573944,
- "mean_token_accuracy": 0.8762413263320923,
- "num_tokens": 12584587.0,
- "step": 1412
- },
- {
- "epoch": 1.0737082066869301,
- "grad_norm": 2.6382484436035156,
- "learning_rate": 3.814718432634877e-06,
- "loss": 0.4244990348815918,
- "mean_token_accuracy": 0.8509312272071838,
- "num_tokens": 12590028.0,
- "step": 1413
- },
- {
- "epoch": 1.074468085106383,
- "grad_norm": 2.429800271987915,
- "learning_rate": 3.8129365749032398e-06,
- "loss": 0.36990004777908325,
- "mean_token_accuracy": 0.8749774098396301,
- "num_tokens": 12594984.0,
- "step": 1414
- },
- {
- "epoch": 1.075227963525836,
- "grad_norm": 3.5939090251922607,
- "learning_rate": 3.8111537957021736e-06,
- "loss": 0.4245661199092865,
- "mean_token_accuracy": 0.8481623530387878,
- "num_tokens": 12598494.0,
- "step": 1415
- },
- {
- "epoch": 1.0759878419452888,
- "grad_norm": 2.705955982208252,
- "learning_rate": 3.809370096282903e-06,
- "loss": 0.41851678490638733,
- "mean_token_accuracy": 0.8548051714897156,
- "num_tokens": 12603876.0,
- "step": 1416
- },
- {
- "epoch": 1.0767477203647418,
- "grad_norm": 1.7812079191207886,
- "learning_rate": 3.807585477897296e-06,
- "loss": 0.47113919258117676,
- "mean_token_accuracy": 0.8346904516220093,
- "num_tokens": 12613402.0,
- "step": 1417
- },
- {
- "epoch": 1.0775075987841944,
- "grad_norm": 1.4335212707519531,
- "learning_rate": 3.8057999417978654e-06,
- "loss": 0.3802063465118408,
- "mean_token_accuracy": 0.8563423156738281,
- "num_tokens": 12626865.0,
- "step": 1418
- },
- {
- "epoch": 1.0782674772036474,
- "grad_norm": 1.9171305894851685,
- "learning_rate": 3.8040134892377702e-06,
- "loss": 0.20898357033729553,
- "mean_token_accuracy": 0.9189738035202026,
- "num_tokens": 12632593.0,
- "step": 1419
- },
- {
- "epoch": 1.0790273556231003,
- "grad_norm": 1.4996821880340576,
- "learning_rate": 3.802226121470811e-06,
- "loss": 0.4203261137008667,
- "mean_token_accuracy": 0.8479211330413818,
- "num_tokens": 12646395.0,
- "step": 1420
- },
- {
- "epoch": 1.0797872340425532,
- "grad_norm": 2.2007253170013428,
- "learning_rate": 3.800437839751432e-06,
- "loss": 0.40370577573776245,
- "mean_token_accuracy": 0.8427679538726807,
- "num_tokens": 12653508.0,
- "step": 1421
- },
- {
- "epoch": 1.080547112462006,
- "grad_norm": 1.7266581058502197,
- "learning_rate": 3.7986486453347183e-06,
- "loss": 0.46750491857528687,
- "mean_token_accuracy": 0.8429205417633057,
- "num_tokens": 12666329.0,
- "step": 1422
- },
- {
- "epoch": 1.081306990881459,
- "grad_norm": 1.4716318845748901,
- "learning_rate": 3.796858539476394e-06,
- "loss": 0.3330317735671997,
- "mean_token_accuracy": 0.879012942314148,
- "num_tokens": 12676741.0,
- "step": 1423
- },
- {
- "epoch": 1.082066869300912,
- "grad_norm": 2.652127265930176,
- "learning_rate": 3.795067523432826e-06,
- "loss": 0.35365715622901917,
- "mean_token_accuracy": 0.8796792030334473,
- "num_tokens": 12681479.0,
- "step": 1424
- },
- {
- "epoch": 1.0828267477203648,
- "grad_norm": 1.2937829494476318,
- "learning_rate": 3.793275598461017e-06,
- "loss": 0.25272446870803833,
- "mean_token_accuracy": 0.9231734275817871,
- "num_tokens": 12694238.0,
- "step": 1425
- },
- {
- "epoch": 1.0835866261398177,
- "grad_norm": 1.3831220865249634,
- "learning_rate": 3.7914827658186104e-06,
- "loss": 0.4935331344604492,
- "mean_token_accuracy": 0.8417420387268066,
- "num_tokens": 12712857.0,
- "step": 1426
- },
- {
- "epoch": 1.0843465045592706,
- "grad_norm": 3.059525728225708,
- "learning_rate": 3.7896890267638832e-06,
- "loss": 0.2592190206050873,
- "mean_token_accuracy": 0.9040263295173645,
- "num_tokens": 12716766.0,
- "step": 1427
- },
- {
- "epoch": 1.0851063829787233,
- "grad_norm": 2.8399202823638916,
- "learning_rate": 3.787894382555752e-06,
- "loss": 0.32098138332366943,
- "mean_token_accuracy": 0.8838302493095398,
- "num_tokens": 12720774.0,
- "step": 1428
- },
- {
- "epoch": 1.0858662613981762,
- "grad_norm": 2.618479013442993,
- "learning_rate": 3.7860988344537664e-06,
- "loss": 0.425255686044693,
- "mean_token_accuracy": 0.8564130067825317,
- "num_tokens": 12726506.0,
- "step": 1429
- },
- {
- "epoch": 1.0866261398176291,
- "grad_norm": 1.3108669519424438,
- "learning_rate": 3.7843023837181126e-06,
- "loss": 0.40220165252685547,
- "mean_token_accuracy": 0.8588873147964478,
- "num_tokens": 12742814.0,
- "step": 1430
- },
- {
- "epoch": 1.087386018237082,
- "grad_norm": 2.2083566188812256,
- "learning_rate": 3.782505031609607e-06,
- "loss": 0.318379282951355,
- "mean_token_accuracy": 0.8887606859207153,
- "num_tokens": 12748388.0,
- "step": 1431
- },
- {
- "epoch": 1.088145896656535,
- "grad_norm": 1.922358751296997,
- "learning_rate": 3.7807067793897006e-06,
- "loss": 0.2519589364528656,
- "mean_token_accuracy": 0.8936764001846313,
- "num_tokens": 12754761.0,
- "step": 1432
- },
- {
- "epoch": 1.0889057750759878,
- "grad_norm": 1.7367439270019531,
- "learning_rate": 3.778907628320477e-06,
- "loss": 0.3970367908477783,
- "mean_token_accuracy": 0.858735203742981,
- "num_tokens": 12764016.0,
- "step": 1433
- },
- {
- "epoch": 1.0896656534954408,
- "grad_norm": 2.1931066513061523,
- "learning_rate": 3.77710757966465e-06,
- "loss": 0.5250554084777832,
- "mean_token_accuracy": 0.8356746435165405,
- "num_tokens": 12772272.0,
- "step": 1434
- },
- {
- "epoch": 1.0904255319148937,
- "grad_norm": 1.718337893486023,
- "learning_rate": 3.775306634685562e-06,
- "loss": 0.283231645822525,
- "mean_token_accuracy": 0.9009919166564941,
- "num_tokens": 12780706.0,
- "step": 1435
- },
- {
- "epoch": 1.0911854103343466,
- "grad_norm": 2.1985926628112793,
- "learning_rate": 3.773504794647187e-06,
- "loss": 0.3913170397281647,
- "mean_token_accuracy": 0.8909255266189575,
- "num_tokens": 12787052.0,
- "step": 1436
- },
- {
- "epoch": 1.0919452887537995,
- "grad_norm": 2.8687937259674072,
- "learning_rate": 3.771702060814123e-06,
- "loss": 0.3135771155357361,
- "mean_token_accuracy": 0.9016125202178955,
- "num_tokens": 12791854.0,
- "step": 1437
- },
- {
- "epoch": 1.0927051671732522,
- "grad_norm": 4.203946590423584,
- "learning_rate": 3.7698984344516e-06,
- "loss": 0.3642737865447998,
- "mean_token_accuracy": 0.8842349052429199,
- "num_tokens": 12794969.0,
- "step": 1438
- },
- {
- "epoch": 1.093465045592705,
- "grad_norm": 1.5134642124176025,
- "learning_rate": 3.7680939168254733e-06,
- "loss": 0.3732057213783264,
- "mean_token_accuracy": 0.8671083450317383,
- "num_tokens": 12808480.0,
- "step": 1439
- },
- {
- "epoch": 1.094224924012158,
- "grad_norm": 3.2103970050811768,
- "learning_rate": 3.7662885092022206e-06,
- "loss": 0.3556194603443146,
- "mean_token_accuracy": 0.8786529302597046,
- "num_tokens": 12812654.0,
- "step": 1440
- },
- {
- "epoch": 1.094984802431611,
- "grad_norm": 2.2774064540863037,
- "learning_rate": 3.7644822128489476e-06,
- "loss": 0.38409674167633057,
- "mean_token_accuracy": 0.866563081741333,
- "num_tokens": 12819854.0,
- "step": 1441
- },
- {
- "epoch": 1.0957446808510638,
- "grad_norm": 1.8250885009765625,
- "learning_rate": 3.7626750290333824e-06,
- "loss": 0.3812350034713745,
- "mean_token_accuracy": 0.8676212430000305,
- "num_tokens": 12830338.0,
- "step": 1442
- },
- {
- "epoch": 1.0965045592705167,
- "grad_norm": 1.8337891101837158,
- "learning_rate": 3.7608669590238765e-06,
- "loss": 0.3892471194267273,
- "mean_token_accuracy": 0.8616238832473755,
- "num_tokens": 12840340.0,
- "step": 1443
- },
- {
- "epoch": 1.0972644376899696,
- "grad_norm": 1.5300254821777344,
- "learning_rate": 3.7590580040894025e-06,
- "loss": 0.35288217663764954,
- "mean_token_accuracy": 0.8625509738922119,
- "num_tokens": 12853144.0,
- "step": 1444
- },
- {
- "epoch": 1.0980243161094225,
- "grad_norm": 2.152683734893799,
- "learning_rate": 3.7572481654995554e-06,
- "loss": 0.4004772901535034,
- "mean_token_accuracy": 0.858427107334137,
- "num_tokens": 12859970.0,
- "step": 1445
- },
- {
- "epoch": 1.0987841945288754,
- "grad_norm": 1.532832145690918,
- "learning_rate": 3.755437444524548e-06,
- "loss": 0.46820127964019775,
- "mean_token_accuracy": 0.8585472106933594,
- "num_tokens": 12875243.0,
- "step": 1446
- },
- {
- "epoch": 1.0995440729483283,
- "grad_norm": 1.6485342979431152,
- "learning_rate": 3.7536258424352164e-06,
- "loss": 0.46329325437545776,
- "mean_token_accuracy": 0.8376060724258423,
- "num_tokens": 12886383.0,
- "step": 1447
- },
- {
- "epoch": 1.1003039513677813,
- "grad_norm": 2.402256488800049,
- "learning_rate": 3.75181336050301e-06,
- "loss": 0.43916207551956177,
- "mean_token_accuracy": 0.8448786735534668,
- "num_tokens": 12892613.0,
- "step": 1448
- },
- {
- "epoch": 1.101063829787234,
- "grad_norm": 1.3893651962280273,
- "learning_rate": 3.7500000000000005e-06,
- "loss": 0.3919021785259247,
- "mean_token_accuracy": 0.8495820760726929,
- "num_tokens": 12905523.0,
- "step": 1449
- },
- {
- "epoch": 1.1018237082066868,
- "grad_norm": 1.5519827604293823,
- "learning_rate": 3.7481857621988734e-06,
- "loss": 0.4710700809955597,
- "mean_token_accuracy": 0.8387632369995117,
- "num_tokens": 12918236.0,
- "step": 1450
- },
- {
- "epoch": 1.1025835866261398,
- "grad_norm": 2.0141353607177734,
- "learning_rate": 3.74637064837293e-06,
- "loss": 0.30866751074790955,
- "mean_token_accuracy": 0.9059321880340576,
- "num_tokens": 12924391.0,
- "step": 1451
- },
- {
- "epoch": 1.1033434650455927,
- "grad_norm": 1.2201496362686157,
- "learning_rate": 3.7445546597960882e-06,
- "loss": 0.3938257396221161,
- "mean_token_accuracy": 0.8726630210876465,
- "num_tokens": 12943338.0,
- "step": 1452
- },
- {
- "epoch": 1.1041033434650456,
- "grad_norm": 2.29434871673584,
- "learning_rate": 3.742737797742878e-06,
- "loss": 0.4347776174545288,
- "mean_token_accuracy": 0.840569257736206,
- "num_tokens": 12950636.0,
- "step": 1453
- },
- {
- "epoch": 1.1048632218844985,
- "grad_norm": 2.3875105381011963,
- "learning_rate": 3.7409200634884425e-06,
- "loss": 0.48353564739227295,
- "mean_token_accuracy": 0.8207056522369385,
- "num_tokens": 12957635.0,
- "step": 1454
- },
- {
- "epoch": 1.1056231003039514,
- "grad_norm": 2.3539648056030273,
- "learning_rate": 3.7391014583085384e-06,
- "loss": 0.3532431721687317,
- "mean_token_accuracy": 0.8903788924217224,
- "num_tokens": 12963032.0,
- "step": 1455
- },
- {
- "epoch": 1.1063829787234043,
- "grad_norm": 1.5611135959625244,
- "learning_rate": 3.737281983479534e-06,
- "loss": 0.4734863042831421,
- "mean_token_accuracy": 0.8413879871368408,
- "num_tokens": 12977170.0,
- "step": 1456
- },
- {
- "epoch": 1.1071428571428572,
- "grad_norm": 1.474320411682129,
- "learning_rate": 3.735461640278404e-06,
- "loss": 0.41854286193847656,
- "mean_token_accuracy": 0.8499876856803894,
- "num_tokens": 12993750.0,
- "step": 1457
- },
- {
- "epoch": 1.1079027355623101,
- "grad_norm": 2.6873273849487305,
- "learning_rate": 3.733640429982738e-06,
- "loss": 0.47637903690338135,
- "mean_token_accuracy": 0.83599853515625,
- "num_tokens": 12999058.0,
- "step": 1458
- },
- {
- "epoch": 1.108662613981763,
- "grad_norm": 1.4575026035308838,
- "learning_rate": 3.731818353870729e-06,
- "loss": 0.38441652059555054,
- "mean_token_accuracy": 0.8582364320755005,
- "num_tokens": 13013864.0,
- "step": 1459
- },
- {
- "epoch": 1.1094224924012157,
- "grad_norm": 1.7722690105438232,
- "learning_rate": 3.729995413221183e-06,
- "loss": 0.4224998950958252,
- "mean_token_accuracy": 0.8511888384819031,
- "num_tokens": 13023714.0,
- "step": 1460
- },
- {
- "epoch": 1.1101823708206686,
- "grad_norm": 2.625760555267334,
- "learning_rate": 3.7281716093135068e-06,
- "loss": 0.3487582802772522,
- "mean_token_accuracy": 0.8834779262542725,
- "num_tokens": 13028608.0,
- "step": 1461
- },
- {
- "epoch": 1.1109422492401215,
- "grad_norm": 1.2554056644439697,
- "learning_rate": 3.726346943427719e-06,
- "loss": 0.33312469720840454,
- "mean_token_accuracy": 0.8704153299331665,
- "num_tokens": 13044901.0,
- "step": 1462
- },
- {
- "epoch": 1.1117021276595744,
- "grad_norm": 2.1109910011291504,
- "learning_rate": 3.7245214168444388e-06,
- "loss": 0.387290894985199,
- "mean_token_accuracy": 0.860816240310669,
- "num_tokens": 13051452.0,
- "step": 1463
- },
- {
- "epoch": 1.1124620060790273,
- "grad_norm": 3.159201145172119,
- "learning_rate": 3.722695030844891e-06,
- "loss": 0.37690871953964233,
- "mean_token_accuracy": 0.8717561960220337,
- "num_tokens": 13055131.0,
- "step": 1464
- },
- {
- "epoch": 1.1132218844984803,
- "grad_norm": 1.3810011148452759,
- "learning_rate": 3.7208677867109042e-06,
- "loss": 0.36598485708236694,
- "mean_token_accuracy": 0.8683375120162964,
- "num_tokens": 13069798.0,
- "step": 1465
- },
- {
- "epoch": 1.1139817629179332,
- "grad_norm": 2.500849485397339,
- "learning_rate": 3.7190396857249087e-06,
- "loss": 0.2781746983528137,
- "mean_token_accuracy": 0.9026005268096924,
- "num_tokens": 13075127.0,
- "step": 1466
- },
- {
- "epoch": 1.114741641337386,
- "grad_norm": 1.7445712089538574,
- "learning_rate": 3.7172107291699356e-06,
- "loss": 0.5055314302444458,
- "mean_token_accuracy": 0.8252174258232117,
- "num_tokens": 13084843.0,
- "step": 1467
- },
- {
- "epoch": 1.115501519756839,
- "grad_norm": 1.6386256217956543,
- "learning_rate": 3.7153809183296174e-06,
- "loss": 0.38478314876556396,
- "mean_token_accuracy": 0.8600847721099854,
- "num_tokens": 13096517.0,
- "step": 1468
- },
- {
- "epoch": 1.1162613981762919,
- "grad_norm": 2.3818395137786865,
- "learning_rate": 3.713550254488185e-06,
- "loss": 0.40308547019958496,
- "mean_token_accuracy": 0.8628184795379639,
- "num_tokens": 13102324.0,
- "step": 1469
- },
- {
- "epoch": 1.1170212765957448,
- "grad_norm": 1.73163640499115,
- "learning_rate": 3.7117187389304703e-06,
- "loss": 0.5035421848297119,
- "mean_token_accuracy": 0.8229597210884094,
- "num_tokens": 13113763.0,
- "step": 1470
- },
- {
- "epoch": 1.1177811550151975,
- "grad_norm": 3.147177219390869,
- "learning_rate": 3.7098863729418997e-06,
- "loss": 0.557449221611023,
- "mean_token_accuracy": 0.8266849517822266,
- "num_tokens": 13118849.0,
- "step": 1471
- },
- {
- "epoch": 1.1185410334346504,
- "grad_norm": 1.5061391592025757,
- "learning_rate": 3.7080531578085e-06,
- "loss": 0.3759554922580719,
- "mean_token_accuracy": 0.8541903495788574,
- "num_tokens": 13131337.0,
- "step": 1472
- },
- {
- "epoch": 1.1193009118541033,
- "grad_norm": 2.172346353530884,
- "learning_rate": 3.7062190948168906e-06,
- "loss": 0.41491609811782837,
- "mean_token_accuracy": 0.8531454801559448,
- "num_tokens": 13139767.0,
- "step": 1473
- },
- {
- "epoch": 1.1200607902735562,
- "grad_norm": 2.1527154445648193,
- "learning_rate": 3.7043841852542884e-06,
- "loss": 0.4309239387512207,
- "mean_token_accuracy": 0.8327745199203491,
- "num_tokens": 13147210.0,
- "step": 1474
- },
- {
- "epoch": 1.1208206686930091,
- "grad_norm": 1.8342832326889038,
- "learning_rate": 3.7025484304085035e-06,
- "loss": 0.34393298625946045,
- "mean_token_accuracy": 0.8948153257369995,
- "num_tokens": 13154831.0,
- "step": 1475
- },
- {
- "epoch": 1.121580547112462,
- "grad_norm": 2.509291172027588,
- "learning_rate": 3.7007118315679384e-06,
- "loss": 0.4479471445083618,
- "mean_token_accuracy": 0.8280234336853027,
- "num_tokens": 13161040.0,
- "step": 1476
- },
- {
- "epoch": 1.122340425531915,
- "grad_norm": 2.914710521697998,
- "learning_rate": 3.6988743900215895e-06,
- "loss": 0.3724832832813263,
- "mean_token_accuracy": 0.863893985748291,
- "num_tokens": 13164975.0,
- "step": 1477
- },
- {
- "epoch": 1.1231003039513678,
- "grad_norm": 3.274808645248413,
- "learning_rate": 3.6970361070590443e-06,
- "loss": 0.4088161885738373,
- "mean_token_accuracy": 0.8474822044372559,
- "num_tokens": 13168826.0,
- "step": 1478
- },
- {
- "epoch": 1.1238601823708207,
- "grad_norm": 2.861546277999878,
- "learning_rate": 3.695196983970481e-06,
- "loss": 0.45837992429733276,
- "mean_token_accuracy": 0.8579759001731873,
- "num_tokens": 13173794.0,
- "step": 1479
- },
- {
- "epoch": 1.1246200607902737,
- "grad_norm": 1.9491597414016724,
- "learning_rate": 3.6933570220466654e-06,
- "loss": 0.4333910346031189,
- "mean_token_accuracy": 0.8444236516952515,
- "num_tokens": 13181598.0,
- "step": 1480
- },
- {
- "epoch": 1.1253799392097266,
- "grad_norm": 1.329848051071167,
- "learning_rate": 3.6915162225789546e-06,
- "loss": 0.36404621601104736,
- "mean_token_accuracy": 0.8694117069244385,
- "num_tokens": 13196381.0,
- "step": 1481
- },
- {
- "epoch": 1.1261398176291793,
- "grad_norm": 1.8854197263717651,
- "learning_rate": 3.6896745868592924e-06,
- "loss": 0.4085756838321686,
- "mean_token_accuracy": 0.855188250541687,
- "num_tokens": 13205236.0,
- "step": 1482
- },
- {
- "epoch": 1.1268996960486322,
- "grad_norm": 3.01684832572937,
- "learning_rate": 3.6878321161802106e-06,
- "loss": 0.28105655312538147,
- "mean_token_accuracy": 0.9009426236152649,
- "num_tokens": 13209380.0,
- "step": 1483
- },
- {
- "epoch": 1.127659574468085,
- "grad_norm": 1.8051308393478394,
- "learning_rate": 3.685988811834823e-06,
- "loss": 0.3314531147480011,
- "mean_token_accuracy": 0.8805814385414124,
- "num_tokens": 13217714.0,
- "step": 1484
- },
- {
- "epoch": 1.128419452887538,
- "grad_norm": 1.61757493019104,
- "learning_rate": 3.684144675116836e-06,
- "loss": 0.4543863534927368,
- "mean_token_accuracy": 0.8400536775588989,
- "num_tokens": 13229330.0,
- "step": 1485
- },
- {
- "epoch": 1.1291793313069909,
- "grad_norm": 1.602686882019043,
- "learning_rate": 3.682299707320532e-06,
- "loss": 0.3653204143047333,
- "mean_token_accuracy": 0.8655825853347778,
- "num_tokens": 13242872.0,
- "step": 1486
- },
- {
- "epoch": 1.1299392097264438,
- "grad_norm": 2.3093113899230957,
- "learning_rate": 3.680453909740782e-06,
- "loss": 0.4383693039417267,
- "mean_token_accuracy": 0.839782178401947,
- "num_tokens": 13248976.0,
- "step": 1487
- },
- {
- "epoch": 1.1306990881458967,
- "grad_norm": 1.180559754371643,
- "learning_rate": 3.6786072836730376e-06,
- "loss": 0.5354755520820618,
- "mean_token_accuracy": 0.8151205778121948,
- "num_tokens": 13272896.0,
- "step": 1488
- },
- {
- "epoch": 1.1314589665653496,
- "grad_norm": 1.9554040431976318,
- "learning_rate": 3.6767598304133325e-06,
- "loss": 0.4485316872596741,
- "mean_token_accuracy": 0.8399936556816101,
- "num_tokens": 13280757.0,
- "step": 1489
- },
- {
- "epoch": 1.1322188449848025,
- "grad_norm": 2.236471176147461,
- "learning_rate": 3.674911551258279e-06,
- "loss": 0.45594364404678345,
- "mean_token_accuracy": 0.8552400469779968,
- "num_tokens": 13287328.0,
- "step": 1490
- },
- {
- "epoch": 1.1329787234042552,
- "grad_norm": 2.5228686332702637,
- "learning_rate": 3.673062447505072e-06,
- "loss": 0.4048641622066498,
- "mean_token_accuracy": 0.8617376685142517,
- "num_tokens": 13292716.0,
- "step": 1491
- },
- {
- "epoch": 1.1337386018237081,
- "grad_norm": 1.1274473667144775,
- "learning_rate": 3.6712125204514836e-06,
- "loss": 0.3848876357078552,
- "mean_token_accuracy": 0.8672975301742554,
- "num_tokens": 13313403.0,
- "step": 1492
- },
- {
- "epoch": 1.134498480243161,
- "grad_norm": 2.349541425704956,
- "learning_rate": 3.6693617713958633e-06,
- "loss": 0.3166058361530304,
- "mean_token_accuracy": 0.8896721601486206,
- "num_tokens": 13318720.0,
- "step": 1493
- },
- {
- "epoch": 1.135258358662614,
- "grad_norm": 2.2438278198242188,
- "learning_rate": 3.6675102016371387e-06,
- "loss": 0.5418218970298767,
- "mean_token_accuracy": 0.8256527185440063,
- "num_tokens": 13325360.0,
- "step": 1494
- },
- {
- "epoch": 1.1360182370820668,
- "grad_norm": 2.21268892288208,
- "learning_rate": 3.665657812474812e-06,
- "loss": 0.48603951930999756,
- "mean_token_accuracy": 0.8273470401763916,
- "num_tokens": 13333217.0,
- "step": 1495
- },
- {
- "epoch": 1.1367781155015197,
- "grad_norm": 2.6105997562408447,
- "learning_rate": 3.6638046052089614e-06,
- "loss": 0.31221291422843933,
- "mean_token_accuracy": 0.888375997543335,
- "num_tokens": 13338413.0,
- "step": 1496
- },
- {
- "epoch": 1.1375379939209727,
- "grad_norm": 3.655658483505249,
- "learning_rate": 3.661950581140239e-06,
- "loss": 0.3609023988246918,
- "mean_token_accuracy": 0.8838576078414917,
- "num_tokens": 13341499.0,
- "step": 1497
- },
- {
- "epoch": 1.1382978723404256,
- "grad_norm": 2.242009162902832,
- "learning_rate": 3.660095741569871e-06,
- "loss": 0.40022802352905273,
- "mean_token_accuracy": 0.8559960722923279,
- "num_tokens": 13347917.0,
- "step": 1498
- },
- {
- "epoch": 1.1390577507598785,
- "grad_norm": 1.7958979606628418,
- "learning_rate": 3.658240087799655e-06,
- "loss": 0.499157190322876,
- "mean_token_accuracy": 0.8423802256584167,
- "num_tokens": 13361570.0,
- "step": 1499
- },
- {
- "epoch": 1.1398176291793314,
- "grad_norm": 2.5406908988952637,
- "learning_rate": 3.6563836211319593e-06,
- "loss": 0.4090137481689453,
- "mean_token_accuracy": 0.8769663572311401,
- "num_tokens": 13367183.0,
- "step": 1500
- },
- {
- "epoch": 1.1405775075987843,
- "grad_norm": 1.9861716032028198,
- "learning_rate": 3.654526342869724e-06,
- "loss": 0.5125207304954529,
- "mean_token_accuracy": 0.8315266370773315,
- "num_tokens": 13376767.0,
- "step": 1501
- },
- {
- "epoch": 1.141337386018237,
- "grad_norm": 1.731188178062439,
- "learning_rate": 3.65266825431646e-06,
- "loss": 0.39452576637268066,
- "mean_token_accuracy": 0.8585706353187561,
- "num_tokens": 13388437.0,
- "step": 1502
- },
- {
- "epoch": 1.1420972644376899,
- "grad_norm": 1.5203773975372314,
- "learning_rate": 3.6508093567762425e-06,
- "loss": 0.39466819167137146,
- "mean_token_accuracy": 0.8584027886390686,
- "num_tokens": 13399727.0,
- "step": 1503
- },
- {
- "epoch": 1.1428571428571428,
- "grad_norm": 2.606462001800537,
- "learning_rate": 3.6489496515537204e-06,
- "loss": 0.4521079361438751,
- "mean_token_accuracy": 0.8413360118865967,
- "num_tokens": 13408426.0,
- "step": 1504
- },
- {
- "epoch": 1.1436170212765957,
- "grad_norm": 2.6207993030548096,
- "learning_rate": 3.647089139954104e-06,
- "loss": 0.4709353446960449,
- "mean_token_accuracy": 0.8397113084793091,
- "num_tokens": 13413506.0,
- "step": 1505
- },
- {
- "epoch": 1.1443768996960486,
- "grad_norm": 1.7214165925979614,
- "learning_rate": 3.6452278232831734e-06,
- "loss": 0.45506367087364197,
- "mean_token_accuracy": 0.8466023206710815,
- "num_tokens": 13424592.0,
- "step": 1506
- },
- {
- "epoch": 1.1451367781155015,
- "grad_norm": 1.7111759185791016,
- "learning_rate": 3.643365702847272e-06,
- "loss": 0.5016278624534607,
- "mean_token_accuracy": 0.8196234703063965,
- "num_tokens": 13434421.0,
- "step": 1507
- },
- {
- "epoch": 1.1458966565349544,
- "grad_norm": 1.7528148889541626,
- "learning_rate": 3.641502779953307e-06,
- "loss": 0.5020896196365356,
- "mean_token_accuracy": 0.826249361038208,
- "num_tokens": 13445286.0,
- "step": 1508
- },
- {
- "epoch": 1.1466565349544073,
- "grad_norm": 1.3470909595489502,
- "learning_rate": 3.639639055908751e-06,
- "loss": 0.45765724778175354,
- "mean_token_accuracy": 0.8380560278892517,
- "num_tokens": 13465030.0,
- "step": 1509
- },
- {
- "epoch": 1.1474164133738602,
- "grad_norm": 2.4846835136413574,
- "learning_rate": 3.6377745320216346e-06,
- "loss": 0.46488267183303833,
- "mean_token_accuracy": 0.8393925428390503,
- "num_tokens": 13470883.0,
- "step": 1510
- },
- {
- "epoch": 1.1481762917933132,
- "grad_norm": 1.770201563835144,
- "learning_rate": 3.635909209600555e-06,
- "loss": 0.5262179374694824,
- "mean_token_accuracy": 0.8201162815093994,
- "num_tokens": 13482558.0,
- "step": 1511
- },
- {
- "epoch": 1.148936170212766,
- "grad_norm": 1.5955098867416382,
- "learning_rate": 3.6340430899546656e-06,
- "loss": 0.430621862411499,
- "mean_token_accuracy": 0.8488553762435913,
- "num_tokens": 13493003.0,
- "step": 1512
- },
- {
- "epoch": 1.1496960486322187,
- "grad_norm": 2.846176862716675,
- "learning_rate": 3.632176174393682e-06,
- "loss": 0.23461638391017914,
- "mean_token_accuracy": 0.9218817353248596,
- "num_tokens": 13496566.0,
- "step": 1513
- },
- {
- "epoch": 1.1504559270516717,
- "grad_norm": 1.9606610536575317,
- "learning_rate": 3.630308464227877e-06,
- "loss": 0.4940161108970642,
- "mean_token_accuracy": 0.8474864959716797,
- "num_tokens": 13504843.0,
- "step": 1514
- },
- {
- "epoch": 1.1512158054711246,
- "grad_norm": 1.1588608026504517,
- "learning_rate": 3.628439960768082e-06,
- "loss": 0.32650992274284363,
- "mean_token_accuracy": 0.8797246217727661,
- "num_tokens": 13521513.0,
- "step": 1515
- },
- {
- "epoch": 1.1519756838905775,
- "grad_norm": 1.3566495180130005,
- "learning_rate": 3.6265706653256837e-06,
- "loss": 0.4359064996242523,
- "mean_token_accuracy": 0.8379859328269958,
- "num_tokens": 13540608.0,
- "step": 1516
- },
- {
- "epoch": 1.1527355623100304,
- "grad_norm": 1.4728609323501587,
- "learning_rate": 3.624700579212626e-06,
- "loss": 0.29939693212509155,
- "mean_token_accuracy": 0.8831408023834229,
- "num_tokens": 13550641.0,
- "step": 1517
- },
- {
- "epoch": 1.1534954407294833,
- "grad_norm": 2.162325382232666,
- "learning_rate": 3.6228297037414077e-06,
- "loss": 0.4097636938095093,
- "mean_token_accuracy": 0.8575425148010254,
- "num_tokens": 13556931.0,
- "step": 1518
- },
- {
- "epoch": 1.1542553191489362,
- "grad_norm": 1.754439353942871,
- "learning_rate": 3.6209580402250816e-06,
- "loss": 0.400202214717865,
- "mean_token_accuracy": 0.8569821119308472,
- "num_tokens": 13565491.0,
- "step": 1519
- },
- {
- "epoch": 1.155015197568389,
- "grad_norm": 1.5250083208084106,
- "learning_rate": 3.619085589977251e-06,
- "loss": 0.43330419063568115,
- "mean_token_accuracy": 0.8492985963821411,
- "num_tokens": 13577147.0,
- "step": 1520
- },
- {
- "epoch": 1.155775075987842,
- "grad_norm": 1.9108905792236328,
- "learning_rate": 3.617212354312076e-06,
- "loss": 0.30567464232444763,
- "mean_token_accuracy": 0.8850164413452148,
- "num_tokens": 13584366.0,
- "step": 1521
- },
- {
- "epoch": 1.156534954407295,
- "grad_norm": 2.2574243545532227,
- "learning_rate": 3.615338334544265e-06,
- "loss": 0.4391738772392273,
- "mean_token_accuracy": 0.839765727519989,
- "num_tokens": 13591816.0,
- "step": 1522
- },
- {
- "epoch": 1.1572948328267478,
- "grad_norm": 2.1235218048095703,
- "learning_rate": 3.6134635319890763e-06,
- "loss": 0.45043107867240906,
- "mean_token_accuracy": 0.8385299444198608,
- "num_tokens": 13599736.0,
- "step": 1523
- },
- {
- "epoch": 1.1580547112462005,
- "grad_norm": 2.2274110317230225,
- "learning_rate": 3.611587947962319e-06,
- "loss": 0.3623226284980774,
- "mean_token_accuracy": 0.8724044561386108,
- "num_tokens": 13605354.0,
- "step": 1524
- },
- {
- "epoch": 1.1588145896656534,
- "grad_norm": 3.414236545562744,
- "learning_rate": 3.6097115837803504e-06,
- "loss": 0.30060696601867676,
- "mean_token_accuracy": 0.8971061706542969,
- "num_tokens": 13608851.0,
- "step": 1525
- },
- {
- "epoch": 1.1595744680851063,
- "grad_norm": 2.496264696121216,
- "learning_rate": 3.6078344407600744e-06,
- "loss": 0.3567180037498474,
- "mean_token_accuracy": 0.8596180081367493,
- "num_tokens": 13614339.0,
- "step": 1526
- },
- {
- "epoch": 1.1603343465045592,
- "grad_norm": 2.0191843509674072,
- "learning_rate": 3.6059565202189433e-06,
- "loss": 0.43206095695495605,
- "mean_token_accuracy": 0.8464000821113586,
- "num_tokens": 13622395.0,
- "step": 1527
- },
- {
- "epoch": 1.1610942249240122,
- "grad_norm": 1.5475906133651733,
- "learning_rate": 3.604077823474954e-06,
- "loss": 0.4535648226737976,
- "mean_token_accuracy": 0.8391586542129517,
- "num_tokens": 13635356.0,
- "step": 1528
- },
- {
- "epoch": 1.161854103343465,
- "grad_norm": 2.1348211765289307,
- "learning_rate": 3.6021983518466468e-06,
- "loss": 0.2733963429927826,
- "mean_token_accuracy": 0.9007417559623718,
- "num_tokens": 13640641.0,
- "step": 1529
- },
- {
- "epoch": 1.162613981762918,
- "grad_norm": 2.8452792167663574,
- "learning_rate": 3.600318106653108e-06,
- "loss": 0.29591235518455505,
- "mean_token_accuracy": 0.8934413194656372,
- "num_tokens": 13644995.0,
- "step": 1530
- },
- {
- "epoch": 1.1633738601823709,
- "grad_norm": 2.342907190322876,
- "learning_rate": 3.5984370892139663e-06,
- "loss": 0.4675130248069763,
- "mean_token_accuracy": 0.8352028131484985,
- "num_tokens": 13652695.0,
- "step": 1531
- },
- {
- "epoch": 1.1641337386018238,
- "grad_norm": 2.3480238914489746,
- "learning_rate": 3.5965553008493924e-06,
- "loss": 0.3114515542984009,
- "mean_token_accuracy": 0.8845353126525879,
- "num_tokens": 13658101.0,
- "step": 1532
- },
- {
- "epoch": 1.1648936170212765,
- "grad_norm": 1.8608155250549316,
- "learning_rate": 3.594672742880097e-06,
- "loss": 0.3864145278930664,
- "mean_token_accuracy": 0.867354154586792,
- "num_tokens": 13666042.0,
- "step": 1533
- },
- {
- "epoch": 1.1656534954407296,
- "grad_norm": 1.4756088256835938,
- "learning_rate": 3.5927894166273324e-06,
- "loss": 0.3671600818634033,
- "mean_token_accuracy": 0.8695988655090332,
- "num_tokens": 13678253.0,
- "step": 1534
- },
- {
- "epoch": 1.1664133738601823,
- "grad_norm": 2.8831355571746826,
- "learning_rate": 3.5909053234128893e-06,
- "loss": 0.267184317111969,
- "mean_token_accuracy": 0.9008115530014038,
- "num_tokens": 13681790.0,
- "step": 1535
- },
- {
- "epoch": 1.1671732522796352,
- "grad_norm": 2.1984763145446777,
- "learning_rate": 3.5890204645590964e-06,
- "loss": 0.4431505799293518,
- "mean_token_accuracy": 0.8623673915863037,
- "num_tokens": 13688444.0,
- "step": 1536
- },
- {
- "epoch": 1.167933130699088,
- "grad_norm": 1.8271523714065552,
- "learning_rate": 3.5871348413888207e-06,
- "loss": 0.3861040771007538,
- "mean_token_accuracy": 0.8624277114868164,
- "num_tokens": 13696872.0,
- "step": 1537
- },
- {
- "epoch": 1.168693009118541,
- "grad_norm": 1.6313756704330444,
- "learning_rate": 3.585248455225466e-06,
- "loss": 0.3775154948234558,
- "mean_token_accuracy": 0.8624461889266968,
- "num_tokens": 13706167.0,
- "step": 1538
- },
- {
- "epoch": 1.169452887537994,
- "grad_norm": 2.4377901554107666,
- "learning_rate": 3.5833613073929684e-06,
- "loss": 0.2308957427740097,
- "mean_token_accuracy": 0.920600175857544,
- "num_tokens": 13710367.0,
- "step": 1539
- },
- {
- "epoch": 1.1702127659574468,
- "grad_norm": 2.2621750831604004,
- "learning_rate": 3.5814733992158025e-06,
- "loss": 0.33167219161987305,
- "mean_token_accuracy": 0.8963261842727661,
- "num_tokens": 13716384.0,
- "step": 1540
- },
- {
- "epoch": 1.1709726443768997,
- "grad_norm": 1.3178150653839111,
- "learning_rate": 3.579584732018975e-06,
- "loss": 0.3276631832122803,
- "mean_token_accuracy": 0.8853521347045898,
- "num_tokens": 13731031.0,
- "step": 1541
- },
- {
- "epoch": 1.1717325227963526,
- "grad_norm": 2.177750587463379,
- "learning_rate": 3.577695307128024e-06,
- "loss": 0.48177266120910645,
- "mean_token_accuracy": 0.830329418182373,
- "num_tokens": 13737925.0,
- "step": 1542
- },
- {
- "epoch": 1.1724924012158056,
- "grad_norm": 2.2268829345703125,
- "learning_rate": 3.5758051258690223e-06,
- "loss": 0.48843517899513245,
- "mean_token_accuracy": 0.8310644030570984,
- "num_tokens": 13746039.0,
- "step": 1543
- },
- {
- "epoch": 1.1732522796352582,
- "grad_norm": 1.498701572418213,
- "learning_rate": 3.5739141895685708e-06,
- "loss": 0.4542962312698364,
- "mean_token_accuracy": 0.8500330448150635,
- "num_tokens": 13765002.0,
- "step": 1544
- },
- {
- "epoch": 1.1740121580547112,
- "grad_norm": 1.786670446395874,
- "learning_rate": 3.5720224995538023e-06,
- "loss": 0.27367928624153137,
- "mean_token_accuracy": 0.8916142582893372,
- "num_tokens": 13774113.0,
- "step": 1545
- },
- {
- "epoch": 1.174772036474164,
- "grad_norm": 2.0311272144317627,
- "learning_rate": 3.5701300571523757e-06,
- "loss": 0.559987485408783,
- "mean_token_accuracy": 0.8266973495483398,
- "num_tokens": 13783912.0,
- "step": 1546
- },
- {
- "epoch": 1.175531914893617,
- "grad_norm": 1.8732186555862427,
- "learning_rate": 3.5682368636924825e-06,
- "loss": 0.5184751152992249,
- "mean_token_accuracy": 0.8450918197631836,
- "num_tokens": 13792728.0,
- "step": 1547
- },
- {
- "epoch": 1.1762917933130699,
- "grad_norm": 1.4410661458969116,
- "learning_rate": 3.566342920502837e-06,
- "loss": 0.383536696434021,
- "mean_token_accuracy": 0.8672217726707458,
- "num_tokens": 13813590.0,
- "step": 1548
- },
- {
- "epoch": 1.1770516717325228,
- "grad_norm": 3.06056547164917,
- "learning_rate": 3.564448228912682e-06,
- "loss": 0.3941686153411865,
- "mean_token_accuracy": 0.8696402311325073,
- "num_tokens": 13817704.0,
- "step": 1549
- },
- {
- "epoch": 1.1778115501519757,
- "grad_norm": 1.6150329113006592,
- "learning_rate": 3.562552790251785e-06,
- "loss": 0.41606605052948,
- "mean_token_accuracy": 0.8488572835922241,
- "num_tokens": 13831303.0,
- "step": 1550
- },
- {
- "epoch": 1.1785714285714286,
- "grad_norm": 2.1199934482574463,
- "learning_rate": 3.5606566058504377e-06,
- "loss": 0.3974752426147461,
- "mean_token_accuracy": 0.8686345219612122,
- "num_tokens": 13837613.0,
- "step": 1551
- },
- {
- "epoch": 1.1793313069908815,
- "grad_norm": 1.5683876276016235,
- "learning_rate": 3.558759677039455e-06,
- "loss": 0.35225993394851685,
- "mean_token_accuracy": 0.8710784316062927,
- "num_tokens": 13846779.0,
- "step": 1552
- },
- {
- "epoch": 1.1800911854103344,
- "grad_norm": 1.4644675254821777,
- "learning_rate": 3.5568620051501755e-06,
- "loss": 0.38400042057037354,
- "mean_token_accuracy": 0.8548328876495361,
- "num_tokens": 13860713.0,
- "step": 1553
- },
- {
- "epoch": 1.1808510638297873,
- "grad_norm": 1.461491346359253,
- "learning_rate": 3.5549635915144578e-06,
- "loss": 0.4572640061378479,
- "mean_token_accuracy": 0.8506045937538147,
- "num_tokens": 13877289.0,
- "step": 1554
- },
- {
- "epoch": 1.18161094224924,
- "grad_norm": 2.6364715099334717,
- "learning_rate": 3.553064437464682e-06,
- "loss": 0.3954341411590576,
- "mean_token_accuracy": 0.8561649322509766,
- "num_tokens": 13882064.0,
- "step": 1555
- },
- {
- "epoch": 1.182370820668693,
- "grad_norm": 2.027273654937744,
- "learning_rate": 3.551164544333745e-06,
- "loss": 0.47625732421875,
- "mean_token_accuracy": 0.8349384069442749,
- "num_tokens": 13890306.0,
- "step": 1556
- },
- {
- "epoch": 1.1831306990881458,
- "grad_norm": 2.8427743911743164,
- "learning_rate": 3.549263913455069e-06,
- "loss": 0.4273033142089844,
- "mean_token_accuracy": 0.8541387319564819,
- "num_tokens": 13894882.0,
- "step": 1557
- },
- {
- "epoch": 1.1838905775075987,
- "grad_norm": 1.6298975944519043,
- "learning_rate": 3.5473625461625884e-06,
- "loss": 0.4378639757633209,
- "mean_token_accuracy": 0.8634963631629944,
- "num_tokens": 13906152.0,
- "step": 1558
- },
- {
- "epoch": 1.1846504559270516,
- "grad_norm": 2.4098947048187256,
- "learning_rate": 3.5454604437907535e-06,
- "loss": 0.47236716747283936,
- "mean_token_accuracy": 0.8646864891052246,
- "num_tokens": 13911803.0,
- "step": 1559
- },
- {
- "epoch": 1.1854103343465046,
- "grad_norm": 1.5972497463226318,
- "learning_rate": 3.543557607674537e-06,
- "loss": 0.3001407980918884,
- "mean_token_accuracy": 0.8927055597305298,
- "num_tokens": 13921304.0,
- "step": 1560
- },
- {
- "epoch": 1.1861702127659575,
- "grad_norm": 2.1140005588531494,
- "learning_rate": 3.54165403914942e-06,
- "loss": 0.41898271441459656,
- "mean_token_accuracy": 0.8542245626449585,
- "num_tokens": 13929434.0,
- "step": 1561
- },
- {
- "epoch": 1.1869300911854104,
- "grad_norm": 1.8733803033828735,
- "learning_rate": 3.539749739551401e-06,
- "loss": 0.35469961166381836,
- "mean_token_accuracy": 0.8805290460586548,
- "num_tokens": 13937781.0,
- "step": 1562
- },
- {
- "epoch": 1.1876899696048633,
- "grad_norm": 2.2805802822113037,
- "learning_rate": 3.53784471021699e-06,
- "loss": 0.44496792554855347,
- "mean_token_accuracy": 0.8454172611236572,
- "num_tokens": 13944394.0,
- "step": 1563
- },
- {
- "epoch": 1.1884498480243162,
- "grad_norm": 0.9728449583053589,
- "learning_rate": 3.535938952483211e-06,
- "loss": 0.3156968355178833,
- "mean_token_accuracy": 0.8739837408065796,
- "num_tokens": 13966712.0,
- "step": 1564
- },
- {
- "epoch": 1.189209726443769,
- "grad_norm": 3.025338888168335,
- "learning_rate": 3.534032467687597e-06,
- "loss": 0.30036938190460205,
- "mean_token_accuracy": 0.9058252573013306,
- "num_tokens": 13970183.0,
- "step": 1565
- },
- {
- "epoch": 1.1899696048632218,
- "grad_norm": 2.0659425258636475,
- "learning_rate": 3.532125257168193e-06,
- "loss": 0.30619731545448303,
- "mean_token_accuracy": 0.9041587710380554,
- "num_tokens": 13976657.0,
- "step": 1566
- },
- {
- "epoch": 1.1907294832826747,
- "grad_norm": 3.2036776542663574,
- "learning_rate": 3.5302173222635526e-06,
- "loss": 0.4145944118499756,
- "mean_token_accuracy": 0.8502328395843506,
- "num_tokens": 13981198.0,
- "step": 1567
- },
- {
- "epoch": 1.1914893617021276,
- "grad_norm": 1.7767539024353027,
- "learning_rate": 3.5283086643127396e-06,
- "loss": 0.437128484249115,
- "mean_token_accuracy": 0.8965631723403931,
- "num_tokens": 13990259.0,
- "step": 1568
- },
- {
- "epoch": 1.1922492401215805,
- "grad_norm": 1.7777384519577026,
- "learning_rate": 3.5263992846553203e-06,
- "loss": 0.33831220865249634,
- "mean_token_accuracy": 0.8734279870986938,
- "num_tokens": 13999363.0,
- "step": 1569
- },
- {
- "epoch": 1.1930091185410334,
- "grad_norm": 1.6710708141326904,
- "learning_rate": 3.5244891846313733e-06,
- "loss": 0.4005590081214905,
- "mean_token_accuracy": 0.8820298314094543,
- "num_tokens": 14008719.0,
- "step": 1570
- },
- {
- "epoch": 1.1937689969604863,
- "grad_norm": 1.0378777980804443,
- "learning_rate": 3.5225783655814798e-06,
- "loss": 0.3174915313720703,
- "mean_token_accuracy": 0.8894162774085999,
- "num_tokens": 14025806.0,
- "step": 1571
- },
- {
- "epoch": 1.1945288753799392,
- "grad_norm": 1.2647521495819092,
- "learning_rate": 3.520666828846726e-06,
- "loss": 0.4173050820827484,
- "mean_token_accuracy": 0.8437265157699585,
- "num_tokens": 14046445.0,
- "step": 1572
- },
- {
- "epoch": 1.1952887537993921,
- "grad_norm": 2.8625528812408447,
- "learning_rate": 3.518754575768702e-06,
- "loss": 0.37182557582855225,
- "mean_token_accuracy": 0.8660947680473328,
- "num_tokens": 14051197.0,
- "step": 1573
- },
- {
- "epoch": 1.196048632218845,
- "grad_norm": 1.1213171482086182,
- "learning_rate": 3.516841607689501e-06,
- "loss": 0.332731157541275,
- "mean_token_accuracy": 0.8573278784751892,
- "num_tokens": 14070817.0,
- "step": 1574
- },
- {
- "epoch": 1.196808510638298,
- "grad_norm": 1.197508692741394,
- "learning_rate": 3.5149279259517165e-06,
- "loss": 0.34058472514152527,
- "mean_token_accuracy": 0.8603571653366089,
- "num_tokens": 14085301.0,
- "step": 1575
- },
- {
- "epoch": 1.1975683890577509,
- "grad_norm": 4.019949913024902,
- "learning_rate": 3.5130135318984454e-06,
- "loss": 0.3094622492790222,
- "mean_token_accuracy": 0.8905094861984253,
- "num_tokens": 14088107.0,
- "step": 1576
- },
- {
- "epoch": 1.1983282674772036,
- "grad_norm": 2.591181755065918,
- "learning_rate": 3.5110984268732827e-06,
- "loss": 0.3407078981399536,
- "mean_token_accuracy": 0.880385160446167,
- "num_tokens": 14092887.0,
- "step": 1577
- },
- {
- "epoch": 1.1990881458966565,
- "grad_norm": 1.3069331645965576,
- "learning_rate": 3.509182612220322e-06,
- "loss": 0.3761988878250122,
- "mean_token_accuracy": 0.862013041973114,
- "num_tokens": 14109216.0,
- "step": 1578
- },
- {
- "epoch": 1.1998480243161094,
- "grad_norm": 1.7802022695541382,
- "learning_rate": 3.507266089284157e-06,
- "loss": 0.3824652135372162,
- "mean_token_accuracy": 0.8707721829414368,
- "num_tokens": 14119645.0,
- "step": 1579
- },
- {
- "epoch": 1.2006079027355623,
- "grad_norm": 2.7937185764312744,
- "learning_rate": 3.5053488594098763e-06,
- "loss": 0.33828890323638916,
- "mean_token_accuracy": 0.8765541315078735,
- "num_tokens": 14124628.0,
- "step": 1580
- },
- {
- "epoch": 1.2013677811550152,
- "grad_norm": 1.892671823501587,
- "learning_rate": 3.5034309239430664e-06,
- "loss": 0.3476094603538513,
- "mean_token_accuracy": 0.9053795337677002,
- "num_tokens": 14131756.0,
- "step": 1581
- },
- {
- "epoch": 1.202127659574468,
- "grad_norm": 1.6857695579528809,
- "learning_rate": 3.501512284229807e-06,
- "loss": 0.5397108793258667,
- "mean_token_accuracy": 0.8173421025276184,
- "num_tokens": 14143024.0,
- "step": 1582
- },
- {
- "epoch": 1.202887537993921,
- "grad_norm": 2.501737117767334,
- "learning_rate": 3.4995929416166756e-06,
- "loss": 0.4192458391189575,
- "mean_token_accuracy": 0.8558136224746704,
- "num_tokens": 14149499.0,
- "step": 1583
- },
- {
- "epoch": 1.203647416413374,
- "grad_norm": 2.0133907794952393,
- "learning_rate": 3.4976728974507387e-06,
- "loss": 0.4791576564311981,
- "mean_token_accuracy": 0.8253597021102905,
- "num_tokens": 14158381.0,
- "step": 1584
- },
- {
- "epoch": 1.2044072948328268,
- "grad_norm": 2.984611988067627,
- "learning_rate": 3.4957521530795576e-06,
- "loss": 0.3040750026702881,
- "mean_token_accuracy": 0.8902391791343689,
- "num_tokens": 14162419.0,
- "step": 1585
- },
- {
- "epoch": 1.2051671732522795,
- "grad_norm": 1.518591284751892,
- "learning_rate": 3.493830709851185e-06,
- "loss": 0.35539618134498596,
- "mean_token_accuracy": 0.8737183809280396,
- "num_tokens": 14173048.0,
- "step": 1586
- },
- {
- "epoch": 1.2059270516717326,
- "grad_norm": 2.628758192062378,
- "learning_rate": 3.4919085691141636e-06,
- "loss": 0.33340200781822205,
- "mean_token_accuracy": 0.8705098628997803,
- "num_tokens": 14178255.0,
- "step": 1587
- },
- {
- "epoch": 1.2066869300911853,
- "grad_norm": 2.5565974712371826,
- "learning_rate": 3.4899857322175252e-06,
- "loss": 0.44939476251602173,
- "mean_token_accuracy": 0.8315504193305969,
- "num_tokens": 14183808.0,
- "step": 1588
- },
- {
- "epoch": 1.2074468085106382,
- "grad_norm": 1.7521045207977295,
- "learning_rate": 3.4880622005107916e-06,
- "loss": 0.3168621063232422,
- "mean_token_accuracy": 0.8824669122695923,
- "num_tokens": 14192186.0,
- "step": 1589
- },
- {
- "epoch": 1.2082066869300911,
- "grad_norm": 1.9816104173660278,
- "learning_rate": 3.486137975343971e-06,
- "loss": 0.3892582058906555,
- "mean_token_accuracy": 0.8524188995361328,
- "num_tokens": 14200512.0,
- "step": 1590
- },
- {
- "epoch": 1.208966565349544,
- "grad_norm": 1.459800124168396,
- "learning_rate": 3.484213058067559e-06,
- "loss": 0.45930033922195435,
- "mean_token_accuracy": 0.8408471345901489,
- "num_tokens": 14215232.0,
- "step": 1591
- },
- {
- "epoch": 1.209726443768997,
- "grad_norm": 2.015493154525757,
- "learning_rate": 3.482287450032536e-06,
- "loss": 0.5514016151428223,
- "mean_token_accuracy": 0.8456779718399048,
- "num_tokens": 14225402.0,
- "step": 1592
- },
- {
- "epoch": 1.2104863221884499,
- "grad_norm": 3.4511911869049072,
- "learning_rate": 3.4803611525903687e-06,
- "loss": 0.4772771894931793,
- "mean_token_accuracy": 0.8558698892593384,
- "num_tokens": 14229038.0,
- "step": 1593
- },
- {
- "epoch": 1.2112462006079028,
- "grad_norm": 2.2247982025146484,
- "learning_rate": 3.4784341670930067e-06,
- "loss": 0.4042825996875763,
- "mean_token_accuracy": 0.8635870218276978,
- "num_tokens": 14237057.0,
- "step": 1594
- },
- {
- "epoch": 1.2120060790273557,
- "grad_norm": 2.0534820556640625,
- "learning_rate": 3.4765064948928813e-06,
- "loss": 0.34057414531707764,
- "mean_token_accuracy": 0.8800770044326782,
- "num_tokens": 14243013.0,
- "step": 1595
- },
- {
- "epoch": 1.2127659574468086,
- "grad_norm": 2.594703197479248,
- "learning_rate": 3.474578137342909e-06,
- "loss": 0.4997410774230957,
- "mean_token_accuracy": 0.8302106261253357,
- "num_tokens": 14251210.0,
- "step": 1596
- },
- {
- "epoch": 1.2135258358662613,
- "grad_norm": 2.517833948135376,
- "learning_rate": 3.4726490957964836e-06,
- "loss": 0.3630390465259552,
- "mean_token_accuracy": 0.8679884672164917,
- "num_tokens": 14255893.0,
- "step": 1597
- },
- {
- "epoch": 1.2142857142857142,
- "grad_norm": 1.5177065134048462,
- "learning_rate": 3.4707193716074816e-06,
- "loss": 0.36218544840812683,
- "mean_token_accuracy": 0.879178524017334,
- "num_tokens": 14268143.0,
- "step": 1598
- },
- {
- "epoch": 1.215045592705167,
- "grad_norm": 2.215291738510132,
- "learning_rate": 3.4687889661302577e-06,
- "loss": 0.4166645407676697,
- "mean_token_accuracy": 0.8495793342590332,
- "num_tokens": 14276794.0,
- "step": 1599
- },
- {
- "epoch": 1.21580547112462,
- "grad_norm": 1.534294843673706,
- "learning_rate": 3.466857880719645e-06,
- "loss": 0.2635883092880249,
- "mean_token_accuracy": 0.8971712589263916,
- "num_tokens": 14287000.0,
- "step": 1600
- },
- {
- "epoch": 1.216565349544073,
- "grad_norm": 1.2338658571243286,
- "learning_rate": 3.464926116730953e-06,
- "loss": 0.339110404253006,
- "mean_token_accuracy": 0.895592987537384,
- "num_tokens": 14303217.0,
- "step": 1601
- },
- {
- "epoch": 1.2173252279635258,
- "grad_norm": 1.8717178106307983,
- "learning_rate": 3.462993675519968e-06,
- "loss": 0.41204726696014404,
- "mean_token_accuracy": 0.8560728430747986,
- "num_tokens": 14311372.0,
- "step": 1602
- },
- {
- "epoch": 1.2180851063829787,
- "grad_norm": 2.844160795211792,
- "learning_rate": 3.4610605584429526e-06,
- "loss": 0.4129520058631897,
- "mean_token_accuracy": 0.8555002212524414,
- "num_tokens": 14316244.0,
- "step": 1603
- },
- {
- "epoch": 1.2188449848024316,
- "grad_norm": 1.099926471710205,
- "learning_rate": 3.4591267668566412e-06,
- "loss": 0.35783132910728455,
- "mean_token_accuracy": 0.8693175315856934,
- "num_tokens": 14338414.0,
- "step": 1604
- },
- {
- "epoch": 1.2196048632218845,
- "grad_norm": 1.6448384523391724,
- "learning_rate": 3.457192302118244e-06,
- "loss": 0.42060258984565735,
- "mean_token_accuracy": 0.8557323217391968,
- "num_tokens": 14349143.0,
- "step": 1605
- },
- {
- "epoch": 1.2203647416413375,
- "grad_norm": 2.097529888153076,
- "learning_rate": 3.455257165585444e-06,
- "loss": 0.5227499008178711,
- "mean_token_accuracy": 0.828961968421936,
- "num_tokens": 14360032.0,
- "step": 1606
- },
- {
- "epoch": 1.2211246200607904,
- "grad_norm": 1.602988600730896,
- "learning_rate": 3.453321358616393e-06,
- "loss": 0.3537187874317169,
- "mean_token_accuracy": 0.8776708841323853,
- "num_tokens": 14370005.0,
- "step": 1607
- },
- {
- "epoch": 1.221884498480243,
- "grad_norm": 2.358971357345581,
- "learning_rate": 3.4513848825697145e-06,
- "loss": 0.3448919653892517,
- "mean_token_accuracy": 0.8887944221496582,
- "num_tokens": 14375718.0,
- "step": 1608
- },
- {
- "epoch": 1.222644376899696,
- "grad_norm": 1.72306227684021,
- "learning_rate": 3.4494477388045035e-06,
- "loss": 0.36985084414482117,
- "mean_token_accuracy": 0.859595537185669,
- "num_tokens": 14385016.0,
- "step": 1609
- },
- {
- "epoch": 1.2234042553191489,
- "grad_norm": 1.5494085550308228,
- "learning_rate": 3.4475099286803204e-06,
- "loss": 0.49003708362579346,
- "mean_token_accuracy": 0.8701964616775513,
- "num_tokens": 14399277.0,
- "step": 1610
- },
- {
- "epoch": 1.2241641337386018,
- "grad_norm": 2.6874046325683594,
- "learning_rate": 3.445571453557196e-06,
- "loss": 0.3424490690231323,
- "mean_token_accuracy": 0.8835943937301636,
- "num_tokens": 14404182.0,
- "step": 1611
- },
- {
- "epoch": 1.2249240121580547,
- "grad_norm": 2.2163190841674805,
- "learning_rate": 3.443632314795627e-06,
- "loss": 0.40944457054138184,
- "mean_token_accuracy": 0.8649888038635254,
- "num_tokens": 14410158.0,
- "step": 1612
- },
- {
- "epoch": 1.2256838905775076,
- "grad_norm": 2.7961158752441406,
- "learning_rate": 3.4416925137565756e-06,
- "loss": 0.17890746891498566,
- "mean_token_accuracy": 0.9439430832862854,
- "num_tokens": 14413285.0,
- "step": 1613
- },
- {
- "epoch": 1.2264437689969605,
- "grad_norm": 1.421451210975647,
- "learning_rate": 3.439752051801467e-06,
- "loss": 0.33948683738708496,
- "mean_token_accuracy": 0.8754585981369019,
- "num_tokens": 14424674.0,
- "step": 1614
- },
- {
- "epoch": 1.2272036474164134,
- "grad_norm": 2.105196237564087,
- "learning_rate": 3.4378109302921946e-06,
- "loss": 0.40009379386901855,
- "mean_token_accuracy": 0.8600341081619263,
- "num_tokens": 14432400.0,
- "step": 1615
- },
- {
- "epoch": 1.2279635258358663,
- "grad_norm": 2.004122734069824,
- "learning_rate": 3.4358691505911105e-06,
- "loss": 0.46013444662094116,
- "mean_token_accuracy": 0.8400925993919373,
- "num_tokens": 14440741.0,
- "step": 1616
- },
- {
- "epoch": 1.2287234042553192,
- "grad_norm": 1.8407535552978516,
- "learning_rate": 3.4339267140610317e-06,
- "loss": 0.38828906416893005,
- "mean_token_accuracy": 0.8582802414894104,
- "num_tokens": 14448698.0,
- "step": 1617
- },
- {
- "epoch": 1.2294832826747721,
- "grad_norm": 2.4285924434661865,
- "learning_rate": 3.4319836220652334e-06,
- "loss": 0.3109283447265625,
- "mean_token_accuracy": 0.8888344764709473,
- "num_tokens": 14453674.0,
- "step": 1618
- },
- {
- "epoch": 1.2302431610942248,
- "grad_norm": 1.6322550773620605,
- "learning_rate": 3.430039875967454e-06,
- "loss": 0.5222204327583313,
- "mean_token_accuracy": 0.825019121170044,
- "num_tokens": 14465736.0,
- "step": 1619
- },
- {
- "epoch": 1.2310030395136777,
- "grad_norm": 2.307573080062866,
- "learning_rate": 3.428095477131888e-06,
- "loss": 0.29477375745773315,
- "mean_token_accuracy": 0.8899064660072327,
- "num_tokens": 14471266.0,
- "step": 1620
- },
- {
- "epoch": 1.2317629179331306,
- "grad_norm": 1.8044531345367432,
- "learning_rate": 3.4261504269231904e-06,
- "loss": 0.4883342981338501,
- "mean_token_accuracy": 0.8310165405273438,
- "num_tokens": 14481679.0,
- "step": 1621
- },
- {
- "epoch": 1.2325227963525835,
- "grad_norm": 2.7585411071777344,
- "learning_rate": 3.4242047267064714e-06,
- "loss": 0.45369645953178406,
- "mean_token_accuracy": 0.8432134985923767,
- "num_tokens": 14487299.0,
- "step": 1622
- },
- {
- "epoch": 1.2332826747720365,
- "grad_norm": 2.687490701675415,
- "learning_rate": 3.4222583778472997e-06,
- "loss": 0.5627540349960327,
- "mean_token_accuracy": 0.8186438083648682,
- "num_tokens": 14494254.0,
- "step": 1623
- },
- {
- "epoch": 1.2340425531914894,
- "grad_norm": 2.622443199157715,
- "learning_rate": 3.4203113817116955e-06,
- "loss": 0.28697147965431213,
- "mean_token_accuracy": 0.8861737847328186,
- "num_tokens": 14498632.0,
- "step": 1624
- },
- {
- "epoch": 1.2348024316109423,
- "grad_norm": 2.6943359375,
- "learning_rate": 3.4183637396661372e-06,
- "loss": 0.25273287296295166,
- "mean_token_accuracy": 0.9104914665222168,
- "num_tokens": 14502797.0,
- "step": 1625
- },
- {
- "epoch": 1.2355623100303952,
- "grad_norm": 2.428189992904663,
- "learning_rate": 3.4164154530775552e-06,
- "loss": 0.4213451147079468,
- "mean_token_accuracy": 0.851524293422699,
- "num_tokens": 14508503.0,
- "step": 1626
- },
- {
- "epoch": 1.236322188449848,
- "grad_norm": 2.1722824573516846,
- "learning_rate": 3.4144665233133318e-06,
- "loss": 0.35238856077194214,
- "mean_token_accuracy": 0.8730837106704712,
- "num_tokens": 14516126.0,
- "step": 1627
- },
- {
- "epoch": 1.237082066869301,
- "grad_norm": 2.291365146636963,
- "learning_rate": 3.4125169517413005e-06,
- "loss": 0.43963465094566345,
- "mean_token_accuracy": 0.8525444865226746,
- "num_tokens": 14522507.0,
- "step": 1628
- },
- {
- "epoch": 1.237841945288754,
- "grad_norm": 1.6181648969650269,
- "learning_rate": 3.410566739729746e-06,
- "loss": 0.2799680233001709,
- "mean_token_accuracy": 0.8915654420852661,
- "num_tokens": 14531025.0,
- "step": 1629
- },
- {
- "epoch": 1.2386018237082066,
- "grad_norm": 1.4039218425750732,
- "learning_rate": 3.408615888647402e-06,
- "loss": 0.29756587743759155,
- "mean_token_accuracy": 0.8951715230941772,
- "num_tokens": 14543770.0,
- "step": 1630
- },
- {
- "epoch": 1.2393617021276595,
- "grad_norm": 2.148325204849243,
- "learning_rate": 3.4066643998634506e-06,
- "loss": 0.3983418345451355,
- "mean_token_accuracy": 0.8635951280593872,
- "num_tokens": 14550896.0,
- "step": 1631
- },
- {
- "epoch": 1.2401215805471124,
- "grad_norm": 1.5225859880447388,
- "learning_rate": 3.4047122747475227e-06,
- "loss": 0.3247569799423218,
- "mean_token_accuracy": 0.8727027177810669,
- "num_tokens": 14562181.0,
- "step": 1632
- },
- {
- "epoch": 1.2408814589665653,
- "grad_norm": 3.99835467338562,
- "learning_rate": 3.402759514669694e-06,
- "loss": 0.4317352771759033,
- "mean_token_accuracy": 0.8488142490386963,
- "num_tokens": 14565521.0,
- "step": 1633
- },
- {
- "epoch": 1.2416413373860182,
- "grad_norm": 1.7306902408599854,
- "learning_rate": 3.4008061210004872e-06,
- "loss": 0.389854371547699,
- "mean_token_accuracy": 0.8553084135055542,
- "num_tokens": 14574633.0,
- "step": 1634
- },
- {
- "epoch": 1.2424012158054711,
- "grad_norm": 2.3614673614501953,
- "learning_rate": 3.3988520951108683e-06,
- "loss": 0.3150152564048767,
- "mean_token_accuracy": 0.8865959644317627,
- "num_tokens": 14580240.0,
- "step": 1635
- },
- {
- "epoch": 1.243161094224924,
- "grad_norm": 1.5625747442245483,
- "learning_rate": 3.3968974383722497e-06,
- "loss": 0.43160033226013184,
- "mean_token_accuracy": 0.840155839920044,
- "num_tokens": 14594255.0,
- "step": 1636
- },
- {
- "epoch": 1.243920972644377,
- "grad_norm": 1.871620535850525,
- "learning_rate": 3.3949421521564825e-06,
- "loss": 0.49550193548202515,
- "mean_token_accuracy": 0.8315126299858093,
- "num_tokens": 14605416.0,
- "step": 1637
- },
- {
- "epoch": 1.2446808510638299,
- "grad_norm": 2.111304759979248,
- "learning_rate": 3.392986237835863e-06,
- "loss": 0.2794899046421051,
- "mean_token_accuracy": 0.9049773216247559,
- "num_tokens": 14611711.0,
- "step": 1638
- },
- {
- "epoch": 1.2454407294832828,
- "grad_norm": 3.7479894161224365,
- "learning_rate": 3.391029696783127e-06,
- "loss": 0.469397634267807,
- "mean_token_accuracy": 0.8352956771850586,
- "num_tokens": 14615536.0,
- "step": 1639
- },
- {
- "epoch": 1.2462006079027357,
- "grad_norm": 3.277726650238037,
- "learning_rate": 3.389072530371451e-06,
- "loss": 0.35431790351867676,
- "mean_token_accuracy": 0.8822286128997803,
- "num_tokens": 14619390.0,
- "step": 1640
- },
- {
- "epoch": 1.2469604863221884,
- "grad_norm": 1.9583072662353516,
- "learning_rate": 3.3871147399744482e-06,
- "loss": 0.3708694577217102,
- "mean_token_accuracy": 0.8720351457595825,
- "num_tokens": 14626573.0,
- "step": 1641
- },
- {
- "epoch": 1.2477203647416413,
- "grad_norm": 1.8734042644500732,
- "learning_rate": 3.385156326966173e-06,
- "loss": 0.48163774609565735,
- "mean_token_accuracy": 0.8479621410369873,
- "num_tokens": 14636382.0,
- "step": 1642
- },
- {
- "epoch": 1.2484802431610942,
- "grad_norm": 2.0085532665252686,
- "learning_rate": 3.383197292721114e-06,
- "loss": 0.4893198311328888,
- "mean_token_accuracy": 0.838238000869751,
- "num_tokens": 14645083.0,
- "step": 1643
- },
- {
- "epoch": 1.249240121580547,
- "grad_norm": 2.0874593257904053,
- "learning_rate": 3.3812376386141966e-06,
- "loss": 0.4610505700111389,
- "mean_token_accuracy": 0.8441368341445923,
- "num_tokens": 14654048.0,
- "step": 1644
- },
- {
- "epoch": 1.25,
- "grad_norm": 1.6887420415878296,
- "learning_rate": 3.379277366020782e-06,
- "loss": 0.3628596067428589,
- "mean_token_accuracy": 0.8838590383529663,
- "num_tokens": 14662317.0,
- "step": 1645
- },
- {
- "epoch": 1.250759878419453,
- "grad_norm": 2.389002561569214,
- "learning_rate": 3.3773164763166653e-06,
- "loss": 0.21903495490550995,
- "mean_token_accuracy": 0.9249413013458252,
- "num_tokens": 14666394.0,
- "step": 1646
- },
- {
- "epoch": 1.2515197568389058,
- "grad_norm": 1.7091087102890015,
- "learning_rate": 3.3753549708780736e-06,
- "loss": 0.37802332639694214,
- "mean_token_accuracy": 0.8644627332687378,
- "num_tokens": 14676214.0,
- "step": 1647
- },
- {
- "epoch": 1.2522796352583587,
- "grad_norm": 2.5717999935150146,
- "learning_rate": 3.3733928510816677e-06,
- "loss": 0.4236462116241455,
- "mean_token_accuracy": 0.8519910573959351,
- "num_tokens": 14681681.0,
- "step": 1648
- },
- {
- "epoch": 1.2530395136778116,
- "grad_norm": 1.958856463432312,
- "learning_rate": 3.3714301183045382e-06,
- "loss": 0.3923419415950775,
- "mean_token_accuracy": 0.8720202445983887,
- "num_tokens": 14690419.0,
- "step": 1649
- },
- {
- "epoch": 1.2537993920972643,
- "grad_norm": 1.5900038480758667,
- "learning_rate": 3.369466773924207e-06,
- "loss": 0.4182325601577759,
- "mean_token_accuracy": 0.8515387177467346,
- "num_tokens": 14699790.0,
- "step": 1650
- },
- {
- "epoch": 1.2545592705167175,
- "grad_norm": 1.260547161102295,
- "learning_rate": 3.3675028193186243e-06,
- "loss": 0.3915718197822571,
- "mean_token_accuracy": 0.8536830544471741,
- "num_tokens": 14717502.0,
- "step": 1651
- },
- {
- "epoch": 1.2553191489361701,
- "grad_norm": 1.8152283430099487,
- "learning_rate": 3.365538255866169e-06,
- "loss": 0.424524188041687,
- "mean_token_accuracy": 0.8434420824050903,
- "num_tokens": 14726591.0,
- "step": 1652
- },
- {
- "epoch": 1.256079027355623,
- "grad_norm": 1.3357285261154175,
- "learning_rate": 3.3635730849456484e-06,
- "loss": 0.2949739396572113,
- "mean_token_accuracy": 0.8868321180343628,
- "num_tokens": 14739911.0,
- "step": 1653
- },
- {
- "epoch": 1.256838905775076,
- "grad_norm": 1.1770358085632324,
- "learning_rate": 3.3616073079362925e-06,
- "loss": 0.29939576983451843,
- "mean_token_accuracy": 0.8923654556274414,
- "num_tokens": 14755521.0,
- "step": 1654
- },
- {
- "epoch": 1.2575987841945289,
- "grad_norm": 2.059162139892578,
- "learning_rate": 3.3596409262177633e-06,
- "loss": 0.4562555253505707,
- "mean_token_accuracy": 0.8585271239280701,
- "num_tokens": 14764173.0,
- "step": 1655
- },
- {
- "epoch": 1.2583586626139818,
- "grad_norm": 1.430752158164978,
- "learning_rate": 3.357673941170139e-06,
- "loss": 0.35301265120506287,
- "mean_token_accuracy": 0.8920517563819885,
- "num_tokens": 14775596.0,
- "step": 1656
- },
- {
- "epoch": 1.2591185410334347,
- "grad_norm": 1.6066302061080933,
- "learning_rate": 3.3557063541739283e-06,
- "loss": 0.41129636764526367,
- "mean_token_accuracy": 0.8512256145477295,
- "num_tokens": 14786289.0,
- "step": 1657
- },
- {
- "epoch": 1.2598784194528876,
- "grad_norm": 1.5471590757369995,
- "learning_rate": 3.353738166610058e-06,
- "loss": 0.3935067057609558,
- "mean_token_accuracy": 0.8514131903648376,
- "num_tokens": 14798672.0,
- "step": 1658
- },
- {
- "epoch": 1.2606382978723405,
- "grad_norm": 1.3455181121826172,
- "learning_rate": 3.35176937985988e-06,
- "loss": 0.3486790657043457,
- "mean_token_accuracy": 0.8644362688064575,
- "num_tokens": 14811603.0,
- "step": 1659
- },
- {
- "epoch": 1.2613981762917934,
- "grad_norm": 1.891432762145996,
- "learning_rate": 3.349799995305162e-06,
- "loss": 0.3325638175010681,
- "mean_token_accuracy": 0.8844645023345947,
- "num_tokens": 14819256.0,
- "step": 1660
- },
- {
- "epoch": 1.262158054711246,
- "grad_norm": 2.600614309310913,
- "learning_rate": 3.3478300143280946e-06,
- "loss": 0.30310919880867004,
- "mean_token_accuracy": 0.9103429317474365,
- "num_tokens": 14823706.0,
- "step": 1661
- },
- {
- "epoch": 1.2629179331306992,
- "grad_norm": 3.8636202812194824,
- "learning_rate": 3.3458594383112868e-06,
- "loss": 0.28377676010131836,
- "mean_token_accuracy": 0.9047091007232666,
- "num_tokens": 14826688.0,
- "step": 1662
- },
- {
- "epoch": 1.263677811550152,
- "grad_norm": 2.3100268840789795,
- "learning_rate": 3.343888268637765e-06,
- "loss": 0.4723394513130188,
- "mean_token_accuracy": 0.8306777477264404,
- "num_tokens": 14835471.0,
- "step": 1663
- },
- {
- "epoch": 1.2644376899696048,
- "grad_norm": 1.7582160234451294,
- "learning_rate": 3.341916506690971e-06,
- "loss": 0.48168784379959106,
- "mean_token_accuracy": 0.8281306028366089,
- "num_tokens": 14846513.0,
- "step": 1664
- },
- {
- "epoch": 1.2651975683890577,
- "grad_norm": 2.166055917739868,
- "learning_rate": 3.3399441538547638e-06,
- "loss": 0.4626024067401886,
- "mean_token_accuracy": 0.8377980589866638,
- "num_tokens": 14853408.0,
- "step": 1665
- },
- {
- "epoch": 1.2659574468085106,
- "grad_norm": 2.23038911819458,
- "learning_rate": 3.337971211513417e-06,
- "loss": 0.38434159755706787,
- "mean_token_accuracy": 0.8708412647247314,
- "num_tokens": 14859919.0,
- "step": 1666
- },
- {
- "epoch": 1.2667173252279635,
- "grad_norm": 2.092505693435669,
- "learning_rate": 3.3359976810516164e-06,
- "loss": 0.35072219371795654,
- "mean_token_accuracy": 0.8761640191078186,
- "num_tokens": 14865624.0,
- "step": 1667
- },
- {
- "epoch": 1.2674772036474165,
- "grad_norm": 1.8255130052566528,
- "learning_rate": 3.3340235638544633e-06,
- "loss": 0.4404270648956299,
- "mean_token_accuracy": 0.836356520652771,
- "num_tokens": 14874181.0,
- "step": 1668
- },
- {
- "epoch": 1.2682370820668694,
- "grad_norm": 1.9889036417007446,
- "learning_rate": 3.332048861307467e-06,
- "loss": 0.4199368357658386,
- "mean_token_accuracy": 0.8508217334747314,
- "num_tokens": 14882275.0,
- "step": 1669
- },
- {
- "epoch": 1.2689969604863223,
- "grad_norm": 4.050281047821045,
- "learning_rate": 3.330073574796551e-06,
- "loss": 0.4271625280380249,
- "mean_token_accuracy": 0.8471108675003052,
- "num_tokens": 14893633.0,
- "step": 1670
- },
- {
- "epoch": 1.2697568389057752,
- "grad_norm": 1.998838186264038,
- "learning_rate": 3.328097705708047e-06,
- "loss": 0.34743767976760864,
- "mean_token_accuracy": 0.8771528005599976,
- "num_tokens": 14899859.0,
- "step": 1671
- },
- {
- "epoch": 1.2705167173252279,
- "grad_norm": 1.7989062070846558,
- "learning_rate": 3.3261212554286977e-06,
- "loss": 0.5267184376716614,
- "mean_token_accuracy": 0.8323302268981934,
- "num_tokens": 14911131.0,
- "step": 1672
- },
- {
- "epoch": 1.2712765957446808,
- "grad_norm": 1.312070369720459,
- "learning_rate": 3.324144225345649e-06,
- "loss": 0.4675425887107849,
- "mean_token_accuracy": 0.8157106637954712,
- "num_tokens": 14928955.0,
- "step": 1673
- },
- {
- "epoch": 1.2720364741641337,
- "grad_norm": 2.0547919273376465,
- "learning_rate": 3.3221666168464584e-06,
- "loss": 0.33704331517219543,
- "mean_token_accuracy": 0.8621441125869751,
- "num_tokens": 14935536.0,
- "step": 1674
- },
- {
- "epoch": 1.2727963525835866,
- "grad_norm": 2.810413122177124,
- "learning_rate": 3.320188431319088e-06,
- "loss": 0.4007563292980194,
- "mean_token_accuracy": 0.8649672269821167,
- "num_tokens": 14940219.0,
- "step": 1675
- },
- {
- "epoch": 1.2735562310030395,
- "grad_norm": 1.3516674041748047,
- "learning_rate": 3.318209670151904e-06,
- "loss": 0.3457040786743164,
- "mean_token_accuracy": 0.8698287010192871,
- "num_tokens": 14952904.0,
- "step": 1676
- },
- {
- "epoch": 1.2743161094224924,
- "grad_norm": 2.440643310546875,
- "learning_rate": 3.3162303347336765e-06,
- "loss": 0.5195086002349854,
- "mean_token_accuracy": 0.8348199129104614,
- "num_tokens": 14958623.0,
- "step": 1677
- },
- {
- "epoch": 1.2750759878419453,
- "grad_norm": 1.3264343738555908,
- "learning_rate": 3.3142504264535808e-06,
- "loss": 0.2990425229072571,
- "mean_token_accuracy": 0.8961933851242065,
- "num_tokens": 14971494.0,
- "step": 1678
- },
- {
- "epoch": 1.2758358662613982,
- "grad_norm": 1.3106894493103027,
- "learning_rate": 3.3122699467011913e-06,
- "loss": 0.291853666305542,
- "mean_token_accuracy": 0.893449068069458,
- "num_tokens": 14985239.0,
- "step": 1679
- },
- {
- "epoch": 1.2765957446808511,
- "grad_norm": 2.5387396812438965,
- "learning_rate": 3.3102888968664857e-06,
- "loss": 0.4336916208267212,
- "mean_token_accuracy": 0.8447890877723694,
- "num_tokens": 14991453.0,
- "step": 1680
- },
- {
- "epoch": 1.2773556231003038,
- "grad_norm": 2.7052135467529297,
- "learning_rate": 3.308307278339842e-06,
- "loss": 0.3279378116130829,
- "mean_token_accuracy": 0.8935879468917847,
- "num_tokens": 14995428.0,
- "step": 1681
- },
- {
- "epoch": 1.278115501519757,
- "grad_norm": 1.6251261234283447,
- "learning_rate": 3.306325092512034e-06,
- "loss": 0.32066458463668823,
- "mean_token_accuracy": 0.8909799456596375,
- "num_tokens": 15004841.0,
- "step": 1682
- },
- {
- "epoch": 1.2788753799392096,
- "grad_norm": 2.3014605045318604,
- "learning_rate": 3.3043423407742374e-06,
- "loss": 0.3523373603820801,
- "mean_token_accuracy": 0.8810735940933228,
- "num_tokens": 15010742.0,
- "step": 1683
- },
- {
- "epoch": 1.2796352583586625,
- "grad_norm": 2.9563019275665283,
- "learning_rate": 3.3023590245180237e-06,
- "loss": 0.39715707302093506,
- "mean_token_accuracy": 0.8779881000518799,
- "num_tokens": 15015357.0,
- "step": 1684
- },
- {
- "epoch": 1.2803951367781155,
- "grad_norm": 1.5787957906723022,
- "learning_rate": 3.300375145135361e-06,
- "loss": 0.44630166888237,
- "mean_token_accuracy": 0.8400174975395203,
- "num_tokens": 15031360.0,
- "step": 1685
- },
- {
- "epoch": 1.2811550151975684,
- "grad_norm": 1.6753438711166382,
- "learning_rate": 3.2983907040186112e-06,
- "loss": 0.3235800862312317,
- "mean_token_accuracy": 0.8938044309616089,
- "num_tokens": 15040276.0,
- "step": 1686
- },
- {
- "epoch": 1.2819148936170213,
- "grad_norm": 1.7331148386001587,
- "learning_rate": 3.296405702560532e-06,
- "loss": 0.39061424136161804,
- "mean_token_accuracy": 0.8599754571914673,
- "num_tokens": 15049725.0,
- "step": 1687
- },
- {
- "epoch": 1.2826747720364742,
- "grad_norm": 2.2029430866241455,
- "learning_rate": 3.294420142154274e-06,
- "loss": 0.43598297238349915,
- "mean_token_accuracy": 0.8663698434829712,
- "num_tokens": 15058182.0,
- "step": 1688
- },
- {
- "epoch": 1.283434650455927,
- "grad_norm": 2.943964958190918,
- "learning_rate": 3.29243402419338e-06,
- "loss": 0.405210942029953,
- "mean_token_accuracy": 0.854996919631958,
- "num_tokens": 15062920.0,
- "step": 1689
- },
- {
- "epoch": 1.28419452887538,
- "grad_norm": 1.9343379735946655,
- "learning_rate": 3.2904473500717826e-06,
- "loss": 0.35011449456214905,
- "mean_token_accuracy": 0.8745867013931274,
- "num_tokens": 15070298.0,
- "step": 1690
- },
- {
- "epoch": 1.284954407294833,
- "grad_norm": 2.559859037399292,
- "learning_rate": 3.2884601211838087e-06,
- "loss": 0.38816407322883606,
- "mean_token_accuracy": 0.854763388633728,
- "num_tokens": 15075667.0,
- "step": 1691
- },
- {
- "epoch": 1.2857142857142856,
- "grad_norm": 1.4357839822769165,
- "learning_rate": 3.2864723389241697e-06,
- "loss": 0.4512745141983032,
- "mean_token_accuracy": 0.8398592472076416,
- "num_tokens": 15090291.0,
- "step": 1692
- },
- {
- "epoch": 1.2864741641337387,
- "grad_norm": 1.7643728256225586,
- "learning_rate": 3.284484004687969e-06,
- "loss": 0.3536742627620697,
- "mean_token_accuracy": 0.8726381063461304,
- "num_tokens": 15099325.0,
- "step": 1693
- },
- {
- "epoch": 1.2872340425531914,
- "grad_norm": 1.853173017501831,
- "learning_rate": 3.2824951198706958e-06,
- "loss": 0.36579740047454834,
- "mean_token_accuracy": 0.8988048434257507,
- "num_tokens": 15107090.0,
- "step": 1694
- },
- {
- "epoch": 1.2879939209726443,
- "grad_norm": 1.6526862382888794,
- "learning_rate": 3.280505685868226e-06,
- "loss": 0.3853636682033539,
- "mean_token_accuracy": 0.8743607997894287,
- "num_tokens": 15117818.0,
- "step": 1695
- },
- {
- "epoch": 1.2887537993920972,
- "grad_norm": 2.790398597717285,
- "learning_rate": 3.278515704076821e-06,
- "loss": 0.2707311511039734,
- "mean_token_accuracy": 0.9034668803215027,
- "num_tokens": 15121641.0,
- "step": 1696
- },
- {
- "epoch": 1.2895136778115501,
- "grad_norm": 1.69557523727417,
- "learning_rate": 3.276525175893126e-06,
- "loss": 0.3707970082759857,
- "mean_token_accuracy": 0.8617855906486511,
- "num_tokens": 15130414.0,
- "step": 1697
- },
- {
- "epoch": 1.290273556231003,
- "grad_norm": 1.1360478401184082,
- "learning_rate": 3.274534102714172e-06,
- "loss": 0.3368082344532013,
- "mean_token_accuracy": 0.8781654834747314,
- "num_tokens": 15148307.0,
- "step": 1698
- },
- {
- "epoch": 1.291033434650456,
- "grad_norm": 1.5894653797149658,
- "learning_rate": 3.272542485937369e-06,
- "loss": 0.3870658278465271,
- "mean_token_accuracy": 0.8830926418304443,
- "num_tokens": 15161841.0,
- "step": 1699
- },
- {
- "epoch": 1.2917933130699089,
- "grad_norm": 2.3735709190368652,
- "learning_rate": 3.270550326960511e-06,
- "loss": 0.3873991370201111,
- "mean_token_accuracy": 0.8729057908058167,
- "num_tokens": 15167733.0,
- "step": 1700
- },
- {
- "epoch": 1.2925531914893618,
- "grad_norm": 1.3739598989486694,
- "learning_rate": 3.268557627181772e-06,
- "loss": 0.30831626057624817,
- "mean_token_accuracy": 0.8695719242095947,
- "num_tokens": 15180861.0,
- "step": 1701
- },
- {
- "epoch": 1.2933130699088147,
- "grad_norm": 1.7526969909667969,
- "learning_rate": 3.2665643879997054e-06,
- "loss": 0.4716024398803711,
- "mean_token_accuracy": 0.8303275108337402,
- "num_tokens": 15191642.0,
- "step": 1702
- },
- {
- "epoch": 1.2940729483282674,
- "grad_norm": 2.7866084575653076,
- "learning_rate": 3.2645706108132426e-06,
- "loss": 0.33337634801864624,
- "mean_token_accuracy": 0.8790726065635681,
- "num_tokens": 15196038.0,
- "step": 1703
- },
- {
- "epoch": 1.2948328267477205,
- "grad_norm": 2.319765090942383,
- "learning_rate": 3.2625762970216944e-06,
- "loss": 0.3999716639518738,
- "mean_token_accuracy": 0.8693568706512451,
- "num_tokens": 15202075.0,
- "step": 1704
- },
- {
- "epoch": 1.2955927051671732,
- "grad_norm": 3.18292498588562,
- "learning_rate": 3.2605814480247454e-06,
- "loss": 0.4579541087150574,
- "mean_token_accuracy": 0.8516187071800232,
- "num_tokens": 15206886.0,
- "step": 1705
- },
- {
- "epoch": 1.296352583586626,
- "grad_norm": 2.1816933155059814,
- "learning_rate": 3.258586065222459e-06,
- "loss": 0.5198885202407837,
- "mean_token_accuracy": 0.8170592784881592,
- "num_tokens": 15214088.0,
- "step": 1706
- },
- {
- "epoch": 1.297112462006079,
- "grad_norm": 1.9076340198516846,
- "learning_rate": 3.2565901500152702e-06,
- "loss": 0.49752360582351685,
- "mean_token_accuracy": 0.8681992292404175,
- "num_tokens": 15226046.0,
- "step": 1707
- },
- {
- "epoch": 1.297872340425532,
- "grad_norm": 2.0223331451416016,
- "learning_rate": 3.2545937038039904e-06,
- "loss": 0.4515793025493622,
- "mean_token_accuracy": 0.8429619073867798,
- "num_tokens": 15234993.0,
- "step": 1708
- },
- {
- "epoch": 1.2986322188449848,
- "grad_norm": 2.5089669227600098,
- "learning_rate": 3.2525967279898017e-06,
- "loss": 0.43628376722335815,
- "mean_token_accuracy": 0.8493682146072388,
- "num_tokens": 15240575.0,
- "step": 1709
- },
- {
- "epoch": 1.2993920972644377,
- "grad_norm": 2.8347091674804688,
- "learning_rate": 3.2505992239742582e-06,
- "loss": 0.25112441182136536,
- "mean_token_accuracy": 0.908825159072876,
- "num_tokens": 15244085.0,
- "step": 1710
- },
- {
- "epoch": 1.3001519756838906,
- "grad_norm": 2.3157572746276855,
- "learning_rate": 3.2486011931592863e-06,
- "loss": 0.482818067073822,
- "mean_token_accuracy": 0.8305923938751221,
- "num_tokens": 15250377.0,
- "step": 1711
- },
- {
- "epoch": 1.3009118541033435,
- "grad_norm": 3.169052839279175,
- "learning_rate": 3.2466026369471804e-06,
- "loss": 0.3493242561817169,
- "mean_token_accuracy": 0.86913001537323,
- "num_tokens": 15255041.0,
- "step": 1712
- },
- {
- "epoch": 1.3016717325227964,
- "grad_norm": 1.4475083351135254,
- "learning_rate": 3.2446035567406033e-06,
- "loss": 0.4177290201187134,
- "mean_token_accuracy": 0.8497589826583862,
- "num_tokens": 15266946.0,
- "step": 1713
- },
- {
- "epoch": 1.3024316109422491,
- "grad_norm": 1.6473008394241333,
- "learning_rate": 3.2426039539425875e-06,
- "loss": 0.5272886753082275,
- "mean_token_accuracy": 0.8440133333206177,
- "num_tokens": 15279263.0,
- "step": 1714
- },
- {
- "epoch": 1.3031914893617023,
- "grad_norm": 2.3996543884277344,
- "learning_rate": 3.240603829956531e-06,
- "loss": 0.4272066652774811,
- "mean_token_accuracy": 0.8495640754699707,
- "num_tokens": 15285213.0,
- "step": 1715
- },
- {
- "epoch": 1.303951367781155,
- "grad_norm": 1.63034987449646,
- "learning_rate": 3.238603186186198e-06,
- "loss": 0.4034635126590729,
- "mean_token_accuracy": 0.8638584613800049,
- "num_tokens": 15295974.0,
- "step": 1716
- },
- {
- "epoch": 1.3047112462006079,
- "grad_norm": 2.153608798980713,
- "learning_rate": 3.2366020240357166e-06,
- "loss": 0.30712565779685974,
- "mean_token_accuracy": 0.8863866329193115,
- "num_tokens": 15302220.0,
- "step": 1717
- },
- {
- "epoch": 1.3054711246200608,
- "grad_norm": 2.9814558029174805,
- "learning_rate": 3.2346003449095803e-06,
- "loss": 0.3922840356826782,
- "mean_token_accuracy": 0.868030309677124,
- "num_tokens": 15306747.0,
- "step": 1718
- },
- {
- "epoch": 1.3062310030395137,
- "grad_norm": 3.3417985439300537,
- "learning_rate": 3.2325981502126434e-06,
- "loss": 0.30750396847724915,
- "mean_token_accuracy": 0.9065356850624084,
- "num_tokens": 15310309.0,
- "step": 1719
- },
- {
- "epoch": 1.3069908814589666,
- "grad_norm": 2.237682819366455,
- "learning_rate": 3.2305954413501252e-06,
- "loss": 0.35068294405937195,
- "mean_token_accuracy": 0.8887614011764526,
- "num_tokens": 15316463.0,
- "step": 1720
- },
- {
- "epoch": 1.3077507598784195,
- "grad_norm": 1.9526605606079102,
- "learning_rate": 3.228592219727602e-06,
- "loss": 0.42061835527420044,
- "mean_token_accuracy": 0.8456839323043823,
- "num_tokens": 15323984.0,
- "step": 1721
- },
- {
- "epoch": 1.3085106382978724,
- "grad_norm": 1.6454212665557861,
- "learning_rate": 3.226588486751012e-06,
- "loss": 0.5189976692199707,
- "mean_token_accuracy": 0.8187375068664551,
- "num_tokens": 15338807.0,
- "step": 1722
- },
- {
- "epoch": 1.3092705167173253,
- "grad_norm": 1.4521609544754028,
- "learning_rate": 3.2245842438266526e-06,
- "loss": 0.329673171043396,
- "mean_token_accuracy": 0.853867769241333,
- "num_tokens": 15350400.0,
- "step": 1723
- },
- {
- "epoch": 1.3100303951367782,
- "grad_norm": 1.8750989437103271,
- "learning_rate": 3.222579492361179e-06,
- "loss": 0.4635341167449951,
- "mean_token_accuracy": 0.8393422365188599,
- "num_tokens": 15360557.0,
- "step": 1724
- },
- {
- "epoch": 1.310790273556231,
- "grad_norm": 1.2728849649429321,
- "learning_rate": 3.220574233761603e-06,
- "loss": 0.3255572021007538,
- "mean_token_accuracy": 0.8989741802215576,
- "num_tokens": 15376548.0,
- "step": 1725
- },
- {
- "epoch": 1.3115501519756838,
- "grad_norm": 3.5155694484710693,
- "learning_rate": 3.2185684694352913e-06,
- "loss": 0.34204089641571045,
- "mean_token_accuracy": 0.8781906366348267,
- "num_tokens": 15380304.0,
- "step": 1726
- },
- {
- "epoch": 1.3123100303951367,
- "grad_norm": 2.059800148010254,
- "learning_rate": 3.216562200789968e-06,
- "loss": 0.36288338899612427,
- "mean_token_accuracy": 0.8595278263092041,
- "num_tokens": 15387653.0,
- "step": 1727
- },
- {
- "epoch": 1.3130699088145896,
- "grad_norm": 3.5388240814208984,
- "learning_rate": 3.214555429233707e-06,
- "loss": 0.5434849858283997,
- "mean_token_accuracy": 0.8074631690979004,
- "num_tokens": 15391662.0,
- "step": 1728
- },
- {
- "epoch": 1.3138297872340425,
- "grad_norm": 2.8595592975616455,
- "learning_rate": 3.2125481561749406e-06,
- "loss": 0.5113687515258789,
- "mean_token_accuracy": 0.8448649644851685,
- "num_tokens": 15397536.0,
- "step": 1729
- },
- {
- "epoch": 1.3145896656534954,
- "grad_norm": 2.50386905670166,
- "learning_rate": 3.210540383022449e-06,
- "loss": 0.5293697118759155,
- "mean_token_accuracy": 0.8096445798873901,
- "num_tokens": 15403478.0,
- "step": 1730
- },
- {
- "epoch": 1.3153495440729484,
- "grad_norm": 1.880035400390625,
- "learning_rate": 3.208532111185365e-06,
- "loss": 0.5344835519790649,
- "mean_token_accuracy": 0.8172965049743652,
- "num_tokens": 15413812.0,
- "step": 1731
- },
- {
- "epoch": 1.3161094224924013,
- "grad_norm": 1.3688768148422241,
- "learning_rate": 3.2065233420731717e-06,
- "loss": 0.2577427327632904,
- "mean_token_accuracy": 0.9142681360244751,
- "num_tokens": 15423583.0,
- "step": 1732
- },
- {
- "epoch": 1.3168693009118542,
- "grad_norm": 1.7945705652236938,
- "learning_rate": 3.2045140770956987e-06,
- "loss": 0.3983926773071289,
- "mean_token_accuracy": 0.8652000427246094,
- "num_tokens": 15432473.0,
- "step": 1733
- },
- {
- "epoch": 1.3176291793313069,
- "grad_norm": 1.8243350982666016,
- "learning_rate": 3.2025043176631283e-06,
- "loss": 0.48644185066223145,
- "mean_token_accuracy": 0.8319193124771118,
- "num_tokens": 15445463.0,
- "step": 1734
- },
- {
- "epoch": 1.31838905775076,
- "grad_norm": 2.000094175338745,
- "learning_rate": 3.2004940651859844e-06,
- "loss": 0.43567317724227905,
- "mean_token_accuracy": 0.8857482671737671,
- "num_tokens": 15452382.0,
- "step": 1735
- },
- {
- "epoch": 1.3191489361702127,
- "grad_norm": 2.379974365234375,
- "learning_rate": 3.198483321075141e-06,
- "loss": 0.5153506398200989,
- "mean_token_accuracy": 0.8295865654945374,
- "num_tokens": 15458740.0,
- "step": 1736
- },
- {
- "epoch": 1.3199088145896656,
- "grad_norm": 1.6564184427261353,
- "learning_rate": 3.196472086741815e-06,
- "loss": 0.508430540561676,
- "mean_token_accuracy": 0.8181540369987488,
- "num_tokens": 15471844.0,
- "step": 1737
- },
- {
- "epoch": 1.3206686930091185,
- "grad_norm": 2.006925344467163,
- "learning_rate": 3.194460363597569e-06,
- "loss": 0.34542378783226013,
- "mean_token_accuracy": 0.8827437162399292,
- "num_tokens": 15478414.0,
- "step": 1738
- },
- {
- "epoch": 1.3214285714285714,
- "grad_norm": 3.589045763015747,
- "learning_rate": 3.192448153054306e-06,
- "loss": 0.4385780096054077,
- "mean_token_accuracy": 0.8480287790298462,
- "num_tokens": 15482063.0,
- "step": 1739
- },
- {
- "epoch": 1.3221884498480243,
- "grad_norm": 1.9797427654266357,
- "learning_rate": 3.190435456524275e-06,
- "loss": 0.4330386519432068,
- "mean_token_accuracy": 0.8458058834075928,
- "num_tokens": 15489803.0,
- "step": 1740
- },
- {
- "epoch": 1.3229483282674772,
- "grad_norm": 1.4777411222457886,
- "learning_rate": 3.188422275420063e-06,
- "loss": 0.3997895419597626,
- "mean_token_accuracy": 0.8639512062072754,
- "num_tokens": 15501103.0,
- "step": 1741
- },
- {
- "epoch": 1.3237082066869301,
- "grad_norm": 2.882338523864746,
- "learning_rate": 3.186408611154597e-06,
- "loss": 0.2336438149213791,
- "mean_token_accuracy": 0.9176726937294006,
- "num_tokens": 15504854.0,
- "step": 1742
- },
- {
- "epoch": 1.324468085106383,
- "grad_norm": 2.353503704071045,
- "learning_rate": 3.184394465141146e-06,
- "loss": 0.4107069671154022,
- "mean_token_accuracy": 0.8677014112472534,
- "num_tokens": 15510662.0,
- "step": 1743
- },
- {
- "epoch": 1.325227963525836,
- "grad_norm": 2.6551976203918457,
- "learning_rate": 3.1823798387933134e-06,
- "loss": 0.3862302899360657,
- "mean_token_accuracy": 0.8819445371627808,
- "num_tokens": 15515681.0,
- "step": 1744
- },
- {
- "epoch": 1.3259878419452886,
- "grad_norm": 1.478572964668274,
- "learning_rate": 3.180364733525043e-06,
- "loss": 0.43972986936569214,
- "mean_token_accuracy": 0.832388162612915,
- "num_tokens": 15529542.0,
- "step": 1745
- },
- {
- "epoch": 1.3267477203647418,
- "grad_norm": 1.6003550291061401,
- "learning_rate": 3.178349150750612e-06,
- "loss": 0.3404902219772339,
- "mean_token_accuracy": 0.8764007091522217,
- "num_tokens": 15538865.0,
- "step": 1746
- },
- {
- "epoch": 1.3275075987841944,
- "grad_norm": 2.130689859390259,
- "learning_rate": 3.1763330918846347e-06,
- "loss": 0.383136510848999,
- "mean_token_accuracy": 0.8652247190475464,
- "num_tokens": 15545567.0,
- "step": 1747
- },
- {
- "epoch": 1.3282674772036474,
- "grad_norm": 2.395937442779541,
- "learning_rate": 3.1743165583420586e-06,
- "loss": 0.3870319128036499,
- "mean_token_accuracy": 0.8618065118789673,
- "num_tokens": 15551090.0,
- "step": 1748
- },
- {
- "epoch": 1.3290273556231003,
- "grad_norm": 2.0841057300567627,
- "learning_rate": 3.1722995515381644e-06,
- "loss": 0.4838739335536957,
- "mean_token_accuracy": 0.8548711538314819,
- "num_tokens": 15558913.0,
- "step": 1749
- },
- {
- "epoch": 1.3297872340425532,
- "grad_norm": 1.4237847328186035,
- "learning_rate": 3.1702820728885657e-06,
- "loss": 0.40350261330604553,
- "mean_token_accuracy": 0.858984649181366,
- "num_tokens": 15572045.0,
- "step": 1750
- },
- {
- "epoch": 1.330547112462006,
- "grad_norm": 2.2641282081604004,
- "learning_rate": 3.1682641238092064e-06,
- "loss": 0.5117636919021606,
- "mean_token_accuracy": 0.8078924417495728,
- "num_tokens": 15579753.0,
- "step": 1751
- },
- {
- "epoch": 1.331306990881459,
- "grad_norm": 1.0010309219360352,
- "learning_rate": 3.1662457057163603e-06,
- "loss": 0.3220978379249573,
- "mean_token_accuracy": 0.8786559104919434,
- "num_tokens": 15602823.0,
- "step": 1752
- },
- {
- "epoch": 1.332066869300912,
- "grad_norm": 2.441230535507202,
- "learning_rate": 3.164226820026632e-06,
- "loss": 0.37529727816581726,
- "mean_token_accuracy": 0.8886898756027222,
- "num_tokens": 15608473.0,
- "step": 1753
- },
- {
- "epoch": 1.3328267477203648,
- "grad_norm": 1.2960991859436035,
- "learning_rate": 3.162207468156952e-06,
- "loss": 0.3393767476081848,
- "mean_token_accuracy": 0.8766993284225464,
- "num_tokens": 15620893.0,
- "step": 1754
- },
- {
- "epoch": 1.3335866261398177,
- "grad_norm": 2.0806996822357178,
- "learning_rate": 3.16018765152458e-06,
- "loss": 0.38034507632255554,
- "mean_token_accuracy": 0.8854838609695435,
- "num_tokens": 15627068.0,
- "step": 1755
- },
- {
- "epoch": 1.3343465045592704,
- "grad_norm": 1.4316699504852295,
- "learning_rate": 3.1581673715471007e-06,
- "loss": 0.3665890693664551,
- "mean_token_accuracy": 0.870919406414032,
- "num_tokens": 15641070.0,
- "step": 1756
- },
- {
- "epoch": 1.3351063829787235,
- "grad_norm": 1.3466622829437256,
- "learning_rate": 3.1561466296424247e-06,
- "loss": 0.37387198209762573,
- "mean_token_accuracy": 0.8633951544761658,
- "num_tokens": 15653777.0,
- "step": 1757
- },
- {
- "epoch": 1.3358662613981762,
- "grad_norm": 1.8108628988265991,
- "learning_rate": 3.154125427228786e-06,
- "loss": 0.38428938388824463,
- "mean_token_accuracy": 0.85402512550354,
- "num_tokens": 15662494.0,
- "step": 1758
- },
- {
- "epoch": 1.3366261398176291,
- "grad_norm": 1.3221700191497803,
- "learning_rate": 3.152103765724743e-06,
- "loss": 0.42825520038604736,
- "mean_token_accuracy": 0.8435465097427368,
- "num_tokens": 15677552.0,
- "step": 1759
- },
- {
- "epoch": 1.337386018237082,
- "grad_norm": 2.6247692108154297,
- "learning_rate": 3.150081646549174e-06,
- "loss": 0.36186715960502625,
- "mean_token_accuracy": 0.8767328262329102,
- "num_tokens": 15682103.0,
- "step": 1760
- },
- {
- "epoch": 1.338145896656535,
- "grad_norm": 2.1469814777374268,
- "learning_rate": 3.1480590711212823e-06,
- "loss": 0.3734385669231415,
- "mean_token_accuracy": 0.8711104393005371,
- "num_tokens": 15689182.0,
- "step": 1761
- },
- {
- "epoch": 1.3389057750759878,
- "grad_norm": 2.1702585220336914,
- "learning_rate": 3.1460360408605866e-06,
- "loss": 0.2795315086841583,
- "mean_token_accuracy": 0.8892190456390381,
- "num_tokens": 15694272.0,
- "step": 1762
- },
- {
- "epoch": 1.3396656534954408,
- "grad_norm": 1.918797254562378,
- "learning_rate": 3.144012557186931e-06,
- "loss": 0.4363473057746887,
- "mean_token_accuracy": 0.8573931455612183,
- "num_tokens": 15703532.0,
- "step": 1763
- },
- {
- "epoch": 1.3404255319148937,
- "grad_norm": 2.5579960346221924,
- "learning_rate": 3.14198862152047e-06,
- "loss": 0.406247079372406,
- "mean_token_accuracy": 0.8617593050003052,
- "num_tokens": 15708652.0,
- "step": 1764
- },
- {
- "epoch": 1.3411854103343466,
- "grad_norm": 2.3617870807647705,
- "learning_rate": 3.1399642352816825e-06,
- "loss": 0.2839522659778595,
- "mean_token_accuracy": 0.8996064066886902,
- "num_tokens": 15713598.0,
- "step": 1765
- },
- {
- "epoch": 1.3419452887537995,
- "grad_norm": 1.248302936553955,
- "learning_rate": 3.1379393998913594e-06,
- "loss": 0.2922290861606598,
- "mean_token_accuracy": 0.8948773145675659,
- "num_tokens": 15726693.0,
- "step": 1766
- },
- {
- "epoch": 1.3427051671732522,
- "grad_norm": 2.143599510192871,
- "learning_rate": 3.135914116770609e-06,
- "loss": 0.32176223397254944,
- "mean_token_accuracy": 0.8808754682540894,
- "num_tokens": 15731901.0,
- "step": 1767
- },
- {
- "epoch": 1.3434650455927053,
- "grad_norm": 4.226369857788086,
- "learning_rate": 3.1338883873408517e-06,
- "loss": 0.4682556390762329,
- "mean_token_accuracy": 0.8566025495529175,
- "num_tokens": 15735029.0,
- "step": 1768
- },
- {
- "epoch": 1.344224924012158,
- "grad_norm": 1.8695988655090332,
- "learning_rate": 3.1318622130238237e-06,
- "loss": 0.4297192394733429,
- "mean_token_accuracy": 0.8419148921966553,
- "num_tokens": 15744310.0,
- "step": 1769
- },
- {
- "epoch": 1.344984802431611,
- "grad_norm": 2.4321305751800537,
- "learning_rate": 3.1298355952415714e-06,
- "loss": 0.36076444387435913,
- "mean_token_accuracy": 0.8826035261154175,
- "num_tokens": 15749337.0,
- "step": 1770
- },
- {
- "epoch": 1.3457446808510638,
- "grad_norm": 1.5500011444091797,
- "learning_rate": 3.127808535416454e-06,
- "loss": 0.48664039373397827,
- "mean_token_accuracy": 0.844344437122345,
- "num_tokens": 15761096.0,
- "step": 1771
- },
- {
- "epoch": 1.3465045592705167,
- "grad_norm": 2.1498289108276367,
- "learning_rate": 3.1257810349711388e-06,
- "loss": 0.4841752052307129,
- "mean_token_accuracy": 0.8324567079544067,
- "num_tokens": 15768646.0,
- "step": 1772
- },
- {
- "epoch": 1.3472644376899696,
- "grad_norm": 1.2995187044143677,
- "learning_rate": 3.1237530953286046e-06,
- "loss": 0.492019385099411,
- "mean_token_accuracy": 0.8285316228866577,
- "num_tokens": 15788401.0,
- "step": 1773
- },
- {
- "epoch": 1.3480243161094225,
- "grad_norm": 2.324819803237915,
- "learning_rate": 3.121724717912138e-06,
- "loss": 0.33166298270225525,
- "mean_token_accuracy": 0.8856451511383057,
- "num_tokens": 15794097.0,
- "step": 1774
- },
- {
- "epoch": 1.3487841945288754,
- "grad_norm": 1.9611430168151855,
- "learning_rate": 3.11969590414533e-06,
- "loss": 0.3974284827709198,
- "mean_token_accuracy": 0.8751305937767029,
- "num_tokens": 15801065.0,
- "step": 1775
- },
- {
- "epoch": 1.3495440729483283,
- "grad_norm": 1.7084417343139648,
- "learning_rate": 3.1176666554520827e-06,
- "loss": 0.38729435205459595,
- "mean_token_accuracy": 0.8680770397186279,
- "num_tokens": 15810353.0,
- "step": 1776
- },
- {
- "epoch": 1.3503039513677813,
- "grad_norm": 1.7616240978240967,
- "learning_rate": 3.1156369732566006e-06,
- "loss": 0.4271578788757324,
- "mean_token_accuracy": 0.843730092048645,
- "num_tokens": 15821889.0,
- "step": 1777
- },
- {
- "epoch": 1.351063829787234,
- "grad_norm": 2.030747413635254,
- "learning_rate": 3.113606858983391e-06,
- "loss": 0.361891508102417,
- "mean_token_accuracy": 0.8522407412528992,
- "num_tokens": 15830800.0,
- "step": 1778
- },
- {
- "epoch": 1.3518237082066868,
- "grad_norm": 1.4842649698257446,
- "learning_rate": 3.1115763140572686e-06,
- "loss": 0.466334730386734,
- "mean_token_accuracy": 0.8433995246887207,
- "num_tokens": 15849422.0,
- "step": 1779
- },
- {
- "epoch": 1.3525835866261398,
- "grad_norm": 1.6595379114151,
- "learning_rate": 3.109545339903347e-06,
- "loss": 0.4622533321380615,
- "mean_token_accuracy": 0.8526314496994019,
- "num_tokens": 15860431.0,
- "step": 1780
- },
- {
- "epoch": 1.3533434650455927,
- "grad_norm": 2.1235809326171875,
- "learning_rate": 3.107513937947041e-06,
- "loss": 0.42694270610809326,
- "mean_token_accuracy": 0.854864239692688,
- "num_tokens": 15869044.0,
- "step": 1781
- },
- {
- "epoch": 1.3541033434650456,
- "grad_norm": 1.5889263153076172,
- "learning_rate": 3.1054821096140675e-06,
- "loss": 0.41838499903678894,
- "mean_token_accuracy": 0.8671513795852661,
- "num_tokens": 15878598.0,
- "step": 1782
- },
- {
- "epoch": 1.3548632218844985,
- "grad_norm": 2.2261741161346436,
- "learning_rate": 3.1034498563304435e-06,
- "loss": 0.4045066237449646,
- "mean_token_accuracy": 0.843826949596405,
- "num_tokens": 15885167.0,
- "step": 1783
- },
- {
- "epoch": 1.3556231003039514,
- "grad_norm": 2.2569329738616943,
- "learning_rate": 3.1014171795224794e-06,
- "loss": 0.36677104234695435,
- "mean_token_accuracy": 0.8747833967208862,
- "num_tokens": 15891308.0,
- "step": 1784
- },
- {
- "epoch": 1.3563829787234043,
- "grad_norm": 2.1027088165283203,
- "learning_rate": 3.0993840806167884e-06,
- "loss": 0.437946081161499,
- "mean_token_accuracy": 0.8370785117149353,
- "num_tokens": 15898952.0,
- "step": 1785
- },
- {
- "epoch": 1.3571428571428572,
- "grad_norm": 1.8768929243087769,
- "learning_rate": 3.0973505610402767e-06,
- "loss": 0.4201734662055969,
- "mean_token_accuracy": 0.8474810123443604,
- "num_tokens": 15907340.0,
- "step": 1786
- },
- {
- "epoch": 1.35790273556231,
- "grad_norm": 1.7216229438781738,
- "learning_rate": 3.0953166222201474e-06,
- "loss": 0.4225231409072876,
- "mean_token_accuracy": 0.8437749147415161,
- "num_tokens": 15917852.0,
- "step": 1787
- },
- {
- "epoch": 1.358662613981763,
- "grad_norm": 2.6256966590881348,
- "learning_rate": 3.093282265583895e-06,
- "loss": 0.435439795255661,
- "mean_token_accuracy": 0.8452040553092957,
- "num_tokens": 15923739.0,
- "step": 1788
- },
- {
- "epoch": 1.3594224924012157,
- "grad_norm": 2.90028977394104,
- "learning_rate": 3.0912474925593124e-06,
- "loss": 0.3730456829071045,
- "mean_token_accuracy": 0.8766646385192871,
- "num_tokens": 15927943.0,
- "step": 1789
- },
- {
- "epoch": 1.3601823708206686,
- "grad_norm": 1.5966626405715942,
- "learning_rate": 3.0892123045744787e-06,
- "loss": 0.42150455713272095,
- "mean_token_accuracy": 0.854656457901001,
- "num_tokens": 15939922.0,
- "step": 1790
- },
- {
- "epoch": 1.3609422492401215,
- "grad_norm": 1.8069748878479004,
- "learning_rate": 3.0871767030577686e-06,
- "loss": 0.4954872131347656,
- "mean_token_accuracy": 0.8289790153503418,
- "num_tokens": 15950095.0,
- "step": 1791
- },
- {
- "epoch": 1.3617021276595744,
- "grad_norm": 2.0855250358581543,
- "learning_rate": 3.085140689437846e-06,
- "loss": 0.41999945044517517,
- "mean_token_accuracy": 0.8517382144927979,
- "num_tokens": 15957972.0,
- "step": 1792
- },
- {
- "epoch": 1.3624620060790273,
- "grad_norm": 2.108659267425537,
- "learning_rate": 3.0831042651436634e-06,
- "loss": 0.3668023645877838,
- "mean_token_accuracy": 0.8710855841636658,
- "num_tokens": 15965614.0,
- "step": 1793
- },
- {
- "epoch": 1.3632218844984803,
- "grad_norm": 1.3799632787704468,
- "learning_rate": 3.0810674316044602e-06,
- "loss": 0.351409375667572,
- "mean_token_accuracy": 0.870837390422821,
- "num_tokens": 15978854.0,
- "step": 1794
- },
- {
- "epoch": 1.3639817629179332,
- "grad_norm": 1.540397047996521,
- "learning_rate": 3.0790301902497664e-06,
- "loss": 0.403600811958313,
- "mean_token_accuracy": 0.8485002517700195,
- "num_tokens": 15993324.0,
- "step": 1795
- },
- {
- "epoch": 1.364741641337386,
- "grad_norm": 1.946882963180542,
- "learning_rate": 3.076992542509396e-06,
- "loss": 0.40118327736854553,
- "mean_token_accuracy": 0.8607497811317444,
- "num_tokens": 16001937.0,
- "step": 1796
- },
- {
- "epoch": 1.365501519756839,
- "grad_norm": 2.0464305877685547,
- "learning_rate": 3.0749544898134487e-06,
- "loss": 0.31742292642593384,
- "mean_token_accuracy": 0.8878391981124878,
- "num_tokens": 16009277.0,
- "step": 1797
- },
- {
- "epoch": 1.3662613981762917,
- "grad_norm": 2.091754913330078,
- "learning_rate": 3.072916033592307e-06,
- "loss": 0.31580421328544617,
- "mean_token_accuracy": 0.8875244855880737,
- "num_tokens": 16015756.0,
- "step": 1798
- },
- {
- "epoch": 1.3670212765957448,
- "grad_norm": 3.4449212551116943,
- "learning_rate": 3.0708771752766397e-06,
- "loss": 0.4692591726779938,
- "mean_token_accuracy": 0.8456202149391174,
- "num_tokens": 16019912.0,
- "step": 1799
- },
- {
- "epoch": 1.3677811550151975,
- "grad_norm": 1.600419521331787,
- "learning_rate": 3.068837916297396e-06,
- "loss": 0.40389442443847656,
- "mean_token_accuracy": 0.8378961086273193,
- "num_tokens": 16032637.0,
- "step": 1800
- },
- {
- "epoch": 1.3685410334346504,
- "grad_norm": 1.5282686948776245,
- "learning_rate": 3.0667982580858047e-06,
- "loss": 0.379841685295105,
- "mean_token_accuracy": 0.8752143383026123,
- "num_tokens": 16045205.0,
- "step": 1801
- },
- {
- "epoch": 1.3693009118541033,
- "grad_norm": 2.486079454421997,
- "learning_rate": 3.0647582020733773e-06,
- "loss": 0.41060030460357666,
- "mean_token_accuracy": 0.8575131893157959,
- "num_tokens": 16051189.0,
- "step": 1802
- },
- {
- "epoch": 1.3700607902735562,
- "grad_norm": 1.9458621740341187,
- "learning_rate": 3.062717749691904e-06,
- "loss": 0.4442213773727417,
- "mean_token_accuracy": 0.8451495170593262,
- "num_tokens": 16059700.0,
- "step": 1803
- },
- {
- "epoch": 1.3708206686930091,
- "grad_norm": 1.4333001375198364,
- "learning_rate": 3.0606769023734535e-06,
- "loss": 0.39132001996040344,
- "mean_token_accuracy": 0.8609901666641235,
- "num_tokens": 16072458.0,
- "step": 1804
- },
- {
- "epoch": 1.371580547112462,
- "grad_norm": 1.490355372428894,
- "learning_rate": 3.0586356615503693e-06,
- "loss": 0.4108564257621765,
- "mean_token_accuracy": 0.8871046304702759,
- "num_tokens": 16083142.0,
- "step": 1805
- },
- {
- "epoch": 1.372340425531915,
- "grad_norm": 1.7765129804611206,
- "learning_rate": 3.056594028655274e-06,
- "loss": 0.3850266635417938,
- "mean_token_accuracy": 0.8923365473747253,
- "num_tokens": 16092519.0,
- "step": 1806
- },
- {
- "epoch": 1.3731003039513678,
- "grad_norm": 1.955661416053772,
- "learning_rate": 3.0545520051210637e-06,
- "loss": 0.4665378928184509,
- "mean_token_accuracy": 0.837419867515564,
- "num_tokens": 16100618.0,
- "step": 1807
- },
- {
- "epoch": 1.3738601823708207,
- "grad_norm": 3.259265422821045,
- "learning_rate": 3.052509592380909e-06,
- "loss": 0.24722981452941895,
- "mean_token_accuracy": 0.9106054306030273,
- "num_tokens": 16103836.0,
- "step": 1808
- },
- {
- "epoch": 1.3746200607902734,
- "grad_norm": 1.7995736598968506,
- "learning_rate": 3.050466791868254e-06,
- "loss": 0.4982220530509949,
- "mean_token_accuracy": 0.8298169374465942,
- "num_tokens": 16114727.0,
- "step": 1809
- },
- {
- "epoch": 1.3753799392097266,
- "grad_norm": 1.9643093347549438,
- "learning_rate": 3.048423605016815e-06,
- "loss": 0.5076829195022583,
- "mean_token_accuracy": 0.8303098678588867,
- "num_tokens": 16129491.0,
- "step": 1810
- },
- {
- "epoch": 1.3761398176291793,
- "grad_norm": 3.505594491958618,
- "learning_rate": 3.0463800332605787e-06,
- "loss": 0.27466052770614624,
- "mean_token_accuracy": 0.9018045663833618,
- "num_tokens": 16132640.0,
- "step": 1811
- },
- {
- "epoch": 1.3768996960486322,
- "grad_norm": 1.798437237739563,
- "learning_rate": 3.0443360780338034e-06,
- "loss": 0.4004853069782257,
- "mean_token_accuracy": 0.8569544553756714,
- "num_tokens": 16143317.0,
- "step": 1812
- },
- {
- "epoch": 1.377659574468085,
- "grad_norm": 2.276740789413452,
- "learning_rate": 3.042291740771014e-06,
- "loss": 0.3823797106742859,
- "mean_token_accuracy": 0.8764113783836365,
- "num_tokens": 16148898.0,
- "step": 1813
- },
- {
- "epoch": 1.378419452887538,
- "grad_norm": 2.5051357746124268,
- "learning_rate": 3.0402470229070057e-06,
- "loss": 0.40365856885910034,
- "mean_token_accuracy": 0.8809891939163208,
- "num_tokens": 16153815.0,
- "step": 1814
- },
- {
- "epoch": 1.3791793313069909,
- "grad_norm": 1.2379236221313477,
- "learning_rate": 3.03820192587684e-06,
- "loss": 0.3955119848251343,
- "mean_token_accuracy": 0.8536627292633057,
- "num_tokens": 16167783.0,
- "step": 1815
- },
- {
- "epoch": 1.3799392097264438,
- "grad_norm": 2.2286343574523926,
- "learning_rate": 3.036156451115846e-06,
- "loss": 0.39647501707077026,
- "mean_token_accuracy": 0.8621993064880371,
- "num_tokens": 16174707.0,
- "step": 1816
- },
- {
- "epoch": 1.3806990881458967,
- "grad_norm": 1.884639024734497,
- "learning_rate": 3.034110600059616e-06,
- "loss": 0.31612110137939453,
- "mean_token_accuracy": 0.8942475318908691,
- "num_tokens": 16181919.0,
- "step": 1817
- },
- {
- "epoch": 1.3814589665653496,
- "grad_norm": 1.891312599182129,
- "learning_rate": 3.0320643741440052e-06,
- "loss": 0.46209126710891724,
- "mean_token_accuracy": 0.8374713659286499,
- "num_tokens": 16189276.0,
- "step": 1818
- },
- {
- "epoch": 1.3822188449848025,
- "grad_norm": 2.507478713989258,
- "learning_rate": 3.0300177748051375e-06,
- "loss": 0.37601593136787415,
- "mean_token_accuracy": 0.8633589148521423,
- "num_tokens": 16194346.0,
- "step": 1819
- },
- {
- "epoch": 1.3829787234042552,
- "grad_norm": 1.5046696662902832,
- "learning_rate": 3.0279708034793907e-06,
- "loss": 0.3284982144832611,
- "mean_token_accuracy": 0.8792630434036255,
- "num_tokens": 16205457.0,
- "step": 1820
- },
- {
- "epoch": 1.3837386018237083,
- "grad_norm": 2.4244449138641357,
- "learning_rate": 3.025923461603412e-06,
- "loss": 0.40939009189605713,
- "mean_token_accuracy": 0.8596426248550415,
- "num_tokens": 16211866.0,
- "step": 1821
- },
- {
- "epoch": 1.384498480243161,
- "grad_norm": 2.8656933307647705,
- "learning_rate": 3.0238757506141013e-06,
- "loss": 0.4397110044956207,
- "mean_token_accuracy": 0.8597331047058105,
- "num_tokens": 16216607.0,
- "step": 1822
- },
- {
- "epoch": 1.385258358662614,
- "grad_norm": 2.0718610286712646,
- "learning_rate": 3.0218276719486245e-06,
- "loss": 0.49057573080062866,
- "mean_token_accuracy": 0.8325331211090088,
- "num_tokens": 16224014.0,
- "step": 1823
- },
- {
- "epoch": 1.3860182370820668,
- "grad_norm": 1.054450273513794,
- "learning_rate": 3.019779227044398e-06,
- "loss": 0.3758106827735901,
- "mean_token_accuracy": 0.8689473867416382,
- "num_tokens": 16248627.0,
- "step": 1824
- },
- {
- "epoch": 1.3867781155015197,
- "grad_norm": 2.1115148067474365,
- "learning_rate": 3.0177304173391038e-06,
- "loss": 0.502967119216919,
- "mean_token_accuracy": 0.823198676109314,
- "num_tokens": 16256255.0,
- "step": 1825
- },
- {
- "epoch": 1.3875379939209727,
- "grad_norm": 2.207277297973633,
- "learning_rate": 3.015681244270672e-06,
- "loss": 0.3458971083164215,
- "mean_token_accuracy": 0.8930196762084961,
- "num_tokens": 16261823.0,
- "step": 1826
- },
- {
- "epoch": 1.3882978723404256,
- "grad_norm": 1.289669156074524,
- "learning_rate": 3.0136317092772923e-06,
- "loss": 0.4422765374183655,
- "mean_token_accuracy": 0.8358346819877625,
- "num_tokens": 16280659.0,
- "step": 1827
- },
- {
- "epoch": 1.3890577507598785,
- "grad_norm": 2.233865737915039,
- "learning_rate": 3.0115818137974066e-06,
- "loss": 0.3643006384372711,
- "mean_token_accuracy": 0.8682862520217896,
- "num_tokens": 16286356.0,
- "step": 1828
- },
- {
- "epoch": 1.3898176291793314,
- "grad_norm": 1.0950042009353638,
- "learning_rate": 3.0095315592697126e-06,
- "loss": 0.34712421894073486,
- "mean_token_accuracy": 0.8578766584396362,
- "num_tokens": 16307298.0,
- "step": 1829
- },
- {
- "epoch": 1.3905775075987843,
- "grad_norm": 1.1708037853240967,
- "learning_rate": 3.007480947133155e-06,
- "loss": 0.33152541518211365,
- "mean_token_accuracy": 0.894973874092102,
- "num_tokens": 16323232.0,
- "step": 1830
- },
- {
- "epoch": 1.391337386018237,
- "grad_norm": 1.2226970195770264,
- "learning_rate": 3.0054299788269343e-06,
- "loss": 0.3915635943412781,
- "mean_token_accuracy": 0.8575779795646667,
- "num_tokens": 16339273.0,
- "step": 1831
- },
- {
- "epoch": 1.39209726443769,
- "grad_norm": 1.2226042747497559,
- "learning_rate": 3.0033786557904982e-06,
- "loss": 0.45846253633499146,
- "mean_token_accuracy": 0.8290432691574097,
- "num_tokens": 16360145.0,
- "step": 1832
- },
- {
- "epoch": 1.3928571428571428,
- "grad_norm": 2.0117406845092773,
- "learning_rate": 3.001326979463545e-06,
- "loss": 0.3837882876396179,
- "mean_token_accuracy": 0.8941739797592163,
- "num_tokens": 16366602.0,
- "step": 1833
- },
- {
- "epoch": 1.3936170212765957,
- "grad_norm": 1.8419997692108154,
- "learning_rate": 2.9992749512860177e-06,
- "loss": 0.40777021646499634,
- "mean_token_accuracy": 0.854655385017395,
- "num_tokens": 16375611.0,
- "step": 1834
- },
- {
- "epoch": 1.3943768996960486,
- "grad_norm": 1.9405122995376587,
- "learning_rate": 2.9972225726981114e-06,
- "loss": 0.46685922145843506,
- "mean_token_accuracy": 0.8493201732635498,
- "num_tokens": 16384878.0,
- "step": 1835
- },
- {
- "epoch": 1.3951367781155015,
- "grad_norm": 1.2425674200057983,
- "learning_rate": 2.995169845140264e-06,
- "loss": 0.394692063331604,
- "mean_token_accuracy": 0.851348876953125,
- "num_tokens": 16404452.0,
- "step": 1836
- },
- {
- "epoch": 1.3958966565349544,
- "grad_norm": 1.2215365171432495,
- "learning_rate": 2.9931167700531575e-06,
- "loss": 0.31412452459335327,
- "mean_token_accuracy": 0.882760763168335,
- "num_tokens": 16419358.0,
- "step": 1837
- },
- {
- "epoch": 1.3966565349544073,
- "grad_norm": 1.912168025970459,
- "learning_rate": 2.9910633488777198e-06,
- "loss": 0.5065487623214722,
- "mean_token_accuracy": 0.8524355292320251,
- "num_tokens": 16430418.0,
- "step": 1838
- },
- {
- "epoch": 1.3974164133738602,
- "grad_norm": 2.2173948287963867,
- "learning_rate": 2.989009583055121e-06,
- "loss": 0.4290938377380371,
- "mean_token_accuracy": 0.8381836414337158,
- "num_tokens": 16438267.0,
- "step": 1839
- },
- {
- "epoch": 1.3981762917933132,
- "grad_norm": 1.8293484449386597,
- "learning_rate": 2.9869554740267726e-06,
- "loss": 0.41683733463287354,
- "mean_token_accuracy": 0.8548779487609863,
- "num_tokens": 16447382.0,
- "step": 1840
- },
- {
- "epoch": 1.398936170212766,
- "grad_norm": 1.835015892982483,
- "learning_rate": 2.9849010232343274e-06,
- "loss": 0.5080599784851074,
- "mean_token_accuracy": 0.8193596601486206,
- "num_tokens": 16458541.0,
- "step": 1841
- },
- {
- "epoch": 1.3996960486322187,
- "grad_norm": 2.031339645385742,
- "learning_rate": 2.982846232119679e-06,
- "loss": 0.5168882012367249,
- "mean_token_accuracy": 0.8525956869125366,
- "num_tokens": 16467747.0,
- "step": 1842
- },
- {
- "epoch": 1.4004559270516717,
- "grad_norm": 1.5554167032241821,
- "learning_rate": 2.9807911021249573e-06,
- "loss": 0.35098958015441895,
- "mean_token_accuracy": 0.888373851776123,
- "num_tokens": 16479319.0,
- "step": 1843
- },
- {
- "epoch": 1.4012158054711246,
- "grad_norm": 1.7183740139007568,
- "learning_rate": 2.9787356346925327e-06,
- "loss": 0.41263148188591003,
- "mean_token_accuracy": 0.8478364944458008,
- "num_tokens": 16489952.0,
- "step": 1844
- },
- {
- "epoch": 1.4019756838905775,
- "grad_norm": 1.7743209600448608,
- "learning_rate": 2.9766798312650112e-06,
- "loss": 0.4211183190345764,
- "mean_token_accuracy": 0.8641136884689331,
- "num_tokens": 16498655.0,
- "step": 1845
- },
- {
- "epoch": 1.4027355623100304,
- "grad_norm": 2.141300916671753,
- "learning_rate": 2.9746236932852355e-06,
- "loss": 0.49548980593681335,
- "mean_token_accuracy": 0.8304252028465271,
- "num_tokens": 16506348.0,
- "step": 1846
- },
- {
- "epoch": 1.4034954407294833,
- "grad_norm": 2.341571807861328,
- "learning_rate": 2.9725672221962804e-06,
- "loss": 0.40804803371429443,
- "mean_token_accuracy": 0.8545800447463989,
- "num_tokens": 16513091.0,
- "step": 1847
- },
- {
- "epoch": 1.4042553191489362,
- "grad_norm": 1.934428095817566,
- "learning_rate": 2.9705104194414587e-06,
- "loss": 0.30029812455177307,
- "mean_token_accuracy": 0.9032052755355835,
- "num_tokens": 16519455.0,
- "step": 1848
- },
- {
- "epoch": 1.405015197568389,
- "grad_norm": 1.420804500579834,
- "learning_rate": 2.9684532864643123e-06,
- "loss": 0.4384060502052307,
- "mean_token_accuracy": 0.8465110063552856,
- "num_tokens": 16533222.0,
- "step": 1849
- },
- {
- "epoch": 1.405775075987842,
- "grad_norm": 2.1180737018585205,
- "learning_rate": 2.9663958247086165e-06,
- "loss": 0.3915565609931946,
- "mean_token_accuracy": 0.8633890748023987,
- "num_tokens": 16539489.0,
- "step": 1850
- },
- {
- "epoch": 1.4065349544072947,
- "grad_norm": 1.408048152923584,
- "learning_rate": 2.964338035618378e-06,
- "loss": 0.46166157722473145,
- "mean_token_accuracy": 0.8305013179779053,
- "num_tokens": 16555785.0,
- "step": 1851
- },
- {
- "epoch": 1.4072948328267478,
- "grad_norm": 1.3418530225753784,
- "learning_rate": 2.9622799206378306e-06,
- "loss": 0.5314373970031738,
- "mean_token_accuracy": 0.81779944896698,
- "num_tokens": 16578111.0,
- "step": 1852
- },
- {
- "epoch": 1.4080547112462005,
- "grad_norm": 1.4634262323379517,
- "learning_rate": 2.9602214812114414e-06,
- "loss": 0.4859408140182495,
- "mean_token_accuracy": 0.8261818885803223,
- "num_tokens": 16591976.0,
- "step": 1853
- },
- {
- "epoch": 1.4088145896656534,
- "grad_norm": 1.4840295314788818,
- "learning_rate": 2.9581627187838997e-06,
- "loss": 0.4079628586769104,
- "mean_token_accuracy": 0.8549603223800659,
- "num_tokens": 16603631.0,
- "step": 1854
- },
- {
- "epoch": 1.4095744680851063,
- "grad_norm": 2.1474642753601074,
- "learning_rate": 2.956103634800126e-06,
- "loss": 0.32997995615005493,
- "mean_token_accuracy": 0.8836915493011475,
- "num_tokens": 16609875.0,
- "step": 1855
- },
- {
- "epoch": 1.4103343465045592,
- "grad_norm": 2.627460241317749,
- "learning_rate": 2.9540442307052643e-06,
- "loss": 0.3229186236858368,
- "mean_token_accuracy": 0.8852157592773438,
- "num_tokens": 16614113.0,
- "step": 1856
- },
- {
- "epoch": 1.4110942249240122,
- "grad_norm": 1.9569811820983887,
- "learning_rate": 2.9519845079446824e-06,
- "loss": 0.5057883858680725,
- "mean_token_accuracy": 0.8585711717605591,
- "num_tokens": 16624611.0,
- "step": 1857
- },
- {
- "epoch": 1.411854103343465,
- "grad_norm": 2.0604090690612793,
- "learning_rate": 2.949924467963975e-06,
- "loss": 0.4681510329246521,
- "mean_token_accuracy": 0.8390560150146484,
- "num_tokens": 16632938.0,
- "step": 1858
- },
- {
- "epoch": 1.412613981762918,
- "grad_norm": 2.5430450439453125,
- "learning_rate": 2.9478641122089563e-06,
- "loss": 0.3090999126434326,
- "mean_token_accuracy": 0.8943990468978882,
- "num_tokens": 16637135.0,
- "step": 1859
- },
- {
- "epoch": 1.4133738601823709,
- "grad_norm": 1.3275387287139893,
- "learning_rate": 2.945803442125663e-06,
- "loss": 0.3592180013656616,
- "mean_token_accuracy": 0.8678265810012817,
- "num_tokens": 16650322.0,
- "step": 1860
- },
- {
- "epoch": 1.4141337386018238,
- "grad_norm": 1.9070929288864136,
- "learning_rate": 2.943742459160354e-06,
- "loss": 0.5332518815994263,
- "mean_token_accuracy": 0.8475706577301025,
- "num_tokens": 16660240.0,
- "step": 1861
- },
- {
- "epoch": 1.4148936170212765,
- "grad_norm": 2.8724546432495117,
- "learning_rate": 2.9416811647595052e-06,
- "loss": 0.5052884817123413,
- "mean_token_accuracy": 0.8363175392150879,
- "num_tokens": 16665481.0,
- "step": 1862
- },
- {
- "epoch": 1.4156534954407296,
- "grad_norm": 4.203817844390869,
- "learning_rate": 2.939619560369813e-06,
- "loss": 0.546925961971283,
- "mean_token_accuracy": 0.834044337272644,
- "num_tokens": 16669615.0,
- "step": 1863
- },
- {
- "epoch": 1.4164133738601823,
- "grad_norm": 1.6466281414031982,
- "learning_rate": 2.9375576474381907e-06,
- "loss": 0.3474533259868622,
- "mean_token_accuracy": 0.8571163415908813,
- "num_tokens": 16678893.0,
- "step": 1864
- },
- {
- "epoch": 1.4171732522796352,
- "grad_norm": 1.8885842561721802,
- "learning_rate": 2.9354954274117683e-06,
- "loss": 0.3726021349430084,
- "mean_token_accuracy": 0.8629094958305359,
- "num_tokens": 16685939.0,
- "step": 1865
- },
- {
- "epoch": 1.417933130699088,
- "grad_norm": 2.830599784851074,
- "learning_rate": 2.9334329017378898e-06,
- "loss": 0.4138668477535248,
- "mean_token_accuracy": 0.8670746088027954,
- "num_tokens": 16690012.0,
- "step": 1866
- },
- {
- "epoch": 1.418693009118541,
- "grad_norm": 1.6838961839675903,
- "learning_rate": 2.9313700718641167e-06,
- "loss": 0.33954259753227234,
- "mean_token_accuracy": 0.8660278916358948,
- "num_tokens": 16700061.0,
- "step": 1867
- },
- {
- "epoch": 1.419452887537994,
- "grad_norm": 2.8767011165618896,
- "learning_rate": 2.9293069392382224e-06,
- "loss": 0.4650302827358246,
- "mean_token_accuracy": 0.8448452949523926,
- "num_tokens": 16705072.0,
- "step": 1868
- },
- {
- "epoch": 1.4202127659574468,
- "grad_norm": 1.5901305675506592,
- "learning_rate": 2.927243505308192e-06,
- "loss": 0.40838998556137085,
- "mean_token_accuracy": 0.8560664653778076,
- "num_tokens": 16714763.0,
- "step": 1869
- },
- {
- "epoch": 1.4209726443768997,
- "grad_norm": 1.3293657302856445,
- "learning_rate": 2.925179771522223e-06,
- "loss": 0.34712862968444824,
- "mean_token_accuracy": 0.8633697032928467,
- "num_tokens": 16729575.0,
- "step": 1870
- },
- {
- "epoch": 1.4217325227963526,
- "grad_norm": 1.7465964555740356,
- "learning_rate": 2.9231157393287234e-06,
- "loss": 0.48190903663635254,
- "mean_token_accuracy": 0.8255834579467773,
- "num_tokens": 16742529.0,
- "step": 1871
- },
- {
- "epoch": 1.4224924012158056,
- "grad_norm": 1.865749716758728,
- "learning_rate": 2.9210514101763116e-06,
- "loss": 0.4912028908729553,
- "mean_token_accuracy": 0.8309572339057922,
- "num_tokens": 16753989.0,
- "step": 1872
- },
- {
- "epoch": 1.4232522796352582,
- "grad_norm": 2.55780291557312,
- "learning_rate": 2.9189867855138103e-06,
- "loss": 0.4550635814666748,
- "mean_token_accuracy": 0.8584091067314148,
- "num_tokens": 16758906.0,
- "step": 1873
- },
- {
- "epoch": 1.4240121580547114,
- "grad_norm": 1.867530107498169,
- "learning_rate": 2.9169218667902562e-06,
- "loss": 0.3524911105632782,
- "mean_token_accuracy": 0.8715004920959473,
- "num_tokens": 16765969.0,
- "step": 1874
- },
- {
- "epoch": 1.424772036474164,
- "grad_norm": 1.8886862993240356,
- "learning_rate": 2.9148566554548857e-06,
- "loss": 0.37144535779953003,
- "mean_token_accuracy": 0.8640961050987244,
- "num_tokens": 16773935.0,
- "step": 1875
- },
- {
- "epoch": 1.425531914893617,
- "grad_norm": 1.266065239906311,
- "learning_rate": 2.912791152957145e-06,
- "loss": 0.3341747522354126,
- "mean_token_accuracy": 0.8929134607315063,
- "num_tokens": 16787780.0,
- "step": 1876
- },
- {
- "epoch": 1.4262917933130699,
- "grad_norm": 2.524888753890991,
- "learning_rate": 2.9107253607466833e-06,
- "loss": 0.33709171414375305,
- "mean_token_accuracy": 0.8857531547546387,
- "num_tokens": 16792753.0,
- "step": 1877
- },
- {
- "epoch": 1.4270516717325228,
- "grad_norm": 1.9269018173217773,
- "learning_rate": 2.908659280273354e-06,
- "loss": 0.32599249482154846,
- "mean_token_accuracy": 0.8777773380279541,
- "num_tokens": 16799904.0,
- "step": 1878
- },
- {
- "epoch": 1.4278115501519757,
- "grad_norm": 1.9844375848770142,
- "learning_rate": 2.9065929129872097e-06,
- "loss": 0.4086732268333435,
- "mean_token_accuracy": 0.8505409955978394,
- "num_tokens": 16807774.0,
- "step": 1879
- },
- {
- "epoch": 1.4285714285714286,
- "grad_norm": 4.0958662033081055,
- "learning_rate": 2.9045262603385073e-06,
- "loss": 0.3838827610015869,
- "mean_token_accuracy": 0.877601146697998,
- "num_tokens": 16810908.0,
- "step": 1880
- },
- {
- "epoch": 1.4293313069908815,
- "grad_norm": 1.7323768138885498,
- "learning_rate": 2.902459323777704e-06,
- "loss": 0.37459003925323486,
- "mean_token_accuracy": 0.8655836582183838,
- "num_tokens": 16819494.0,
- "step": 1881
- },
- {
- "epoch": 1.4300911854103344,
- "grad_norm": 2.608043670654297,
- "learning_rate": 2.900392104755455e-06,
- "loss": 0.5798726677894592,
- "mean_token_accuracy": 0.8382592797279358,
- "num_tokens": 16827745.0,
- "step": 1882
- },
- {
- "epoch": 1.4308510638297873,
- "grad_norm": 1.3262078762054443,
- "learning_rate": 2.8983246047226137e-06,
- "loss": 0.3724595904350281,
- "mean_token_accuracy": 0.8651963472366333,
- "num_tokens": 16844171.0,
- "step": 1883
- },
- {
- "epoch": 1.43161094224924,
- "grad_norm": 1.7250545024871826,
- "learning_rate": 2.8962568251302327e-06,
- "loss": 0.3478979468345642,
- "mean_token_accuracy": 0.8807886242866516,
- "num_tokens": 16852838.0,
- "step": 1884
- },
- {
- "epoch": 1.4323708206686931,
- "grad_norm": 2.114525318145752,
- "learning_rate": 2.8941887674295573e-06,
- "loss": 0.5156140327453613,
- "mean_token_accuracy": 0.825178861618042,
- "num_tokens": 16861087.0,
- "step": 1885
- },
- {
- "epoch": 1.4331306990881458,
- "grad_norm": 2.400829792022705,
- "learning_rate": 2.892120433072031e-06,
- "loss": 0.2807392477989197,
- "mean_token_accuracy": 0.8907361030578613,
- "num_tokens": 16866557.0,
- "step": 1886
- },
- {
- "epoch": 1.4338905775075987,
- "grad_norm": 2.490880012512207,
- "learning_rate": 2.8900518235092908e-06,
- "loss": 0.2615952491760254,
- "mean_token_accuracy": 0.9152894020080566,
- "num_tokens": 16871357.0,
- "step": 1887
- },
- {
- "epoch": 1.4346504559270516,
- "grad_norm": 1.9058431386947632,
- "learning_rate": 2.887982940193165e-06,
- "loss": 0.43623363971710205,
- "mean_token_accuracy": 0.84696364402771,
- "num_tokens": 16879016.0,
- "step": 1888
- },
- {
- "epoch": 1.4354103343465046,
- "grad_norm": 1.4520210027694702,
- "learning_rate": 2.8859137845756785e-06,
- "loss": 0.3961856961250305,
- "mean_token_accuracy": 0.8518897294998169,
- "num_tokens": 16892254.0,
- "step": 1889
- },
- {
- "epoch": 1.4361702127659575,
- "grad_norm": 2.500274896621704,
- "learning_rate": 2.8838443581090415e-06,
- "loss": 0.41457289457321167,
- "mean_token_accuracy": 0.8751448392868042,
- "num_tokens": 16897156.0,
- "step": 1890
- },
- {
- "epoch": 1.4369300911854104,
- "grad_norm": 2.9312057495117188,
- "learning_rate": 2.8817746622456585e-06,
- "loss": 0.45875269174575806,
- "mean_token_accuracy": 0.8411039113998413,
- "num_tokens": 16902291.0,
- "step": 1891
- },
- {
- "epoch": 1.4376899696048633,
- "grad_norm": 2.367419481277466,
- "learning_rate": 2.879704698438121e-06,
- "loss": 0.3643629848957062,
- "mean_token_accuracy": 0.8771071434020996,
- "num_tokens": 16908128.0,
- "step": 1892
- },
- {
- "epoch": 1.4384498480243162,
- "grad_norm": 1.9907705783843994,
- "learning_rate": 2.8776344681392106e-06,
- "loss": 0.3206835389137268,
- "mean_token_accuracy": 0.879996657371521,
- "num_tokens": 16914918.0,
- "step": 1893
- },
- {
- "epoch": 1.439209726443769,
- "grad_norm": 3.536956310272217,
- "learning_rate": 2.875563972801893e-06,
- "loss": 0.3640141785144806,
- "mean_token_accuracy": 0.8814959526062012,
- "num_tokens": 16918187.0,
- "step": 1894
- },
- {
- "epoch": 1.4399696048632218,
- "grad_norm": 1.3451156616210938,
- "learning_rate": 2.8734932138793226e-06,
- "loss": 0.3427346348762512,
- "mean_token_accuracy": 0.8835382461547852,
- "num_tokens": 16931135.0,
- "step": 1895
- },
- {
- "epoch": 1.4407294832826747,
- "grad_norm": 2.0735955238342285,
- "learning_rate": 2.871422192824837e-06,
- "loss": 0.4265315532684326,
- "mean_token_accuracy": 0.8452677726745605,
- "num_tokens": 16937995.0,
- "step": 1896
- },
- {
- "epoch": 1.4414893617021276,
- "grad_norm": 1.5124932527542114,
- "learning_rate": 2.8693509110919597e-06,
- "loss": 0.497121661901474,
- "mean_token_accuracy": 0.815092921257019,
- "num_tokens": 16952743.0,
- "step": 1897
- },
- {
- "epoch": 1.4422492401215805,
- "grad_norm": 3.716669797897339,
- "learning_rate": 2.867279370134395e-06,
- "loss": 0.5452651381492615,
- "mean_token_accuracy": 0.8150380849838257,
- "num_tokens": 16956797.0,
- "step": 1898
- },
- {
- "epoch": 1.4430091185410334,
- "grad_norm": 1.3571398258209229,
- "learning_rate": 2.8652075714060296e-06,
- "loss": 0.4249724745750427,
- "mean_token_accuracy": 0.8675867915153503,
- "num_tokens": 16974494.0,
- "step": 1899
- },
- {
- "epoch": 1.4437689969604863,
- "grad_norm": 2.310673475265503,
- "learning_rate": 2.863135516360932e-06,
- "loss": 0.39368677139282227,
- "mean_token_accuracy": 0.878392219543457,
- "num_tokens": 16980612.0,
- "step": 1900
- },
- {
- "epoch": 1.4445288753799392,
- "grad_norm": 1.9025533199310303,
- "learning_rate": 2.8610632064533517e-06,
- "loss": 0.4786127805709839,
- "mean_token_accuracy": 0.8720556497573853,
- "num_tokens": 16992262.0,
- "step": 1901
- },
- {
- "epoch": 1.4452887537993921,
- "grad_norm": 2.528564453125,
- "learning_rate": 2.8589906431377133e-06,
- "loss": 0.4223094582557678,
- "mean_token_accuracy": 0.8513246178627014,
- "num_tokens": 16997717.0,
- "step": 1902
- },
- {
- "epoch": 1.446048632218845,
- "grad_norm": 1.010425329208374,
- "learning_rate": 2.8569178278686222e-06,
- "loss": 0.3908255696296692,
- "mean_token_accuracy": 0.8620463609695435,
- "num_tokens": 17020903.0,
- "step": 1903
- },
- {
- "epoch": 1.4468085106382977,
- "grad_norm": 1.5760232210159302,
- "learning_rate": 2.8548447621008614e-06,
- "loss": 0.4134044051170349,
- "mean_token_accuracy": 0.8472093343734741,
- "num_tokens": 17035250.0,
- "step": 1904
- },
- {
- "epoch": 1.4475683890577509,
- "grad_norm": 2.0668535232543945,
- "learning_rate": 2.8527714472893866e-06,
- "loss": 0.44095730781555176,
- "mean_token_accuracy": 0.881983757019043,
- "num_tokens": 17042170.0,
- "step": 1905
- },
- {
- "epoch": 1.4483282674772036,
- "grad_norm": 1.1620599031448364,
- "learning_rate": 2.85069788488933e-06,
- "loss": 0.3607163429260254,
- "mean_token_accuracy": 0.8684282898902893,
- "num_tokens": 17061937.0,
- "step": 1906
- },
- {
- "epoch": 1.4490881458966565,
- "grad_norm": 2.1316568851470947,
- "learning_rate": 2.8486240763559984e-06,
- "loss": 0.3478124141693115,
- "mean_token_accuracy": 0.8772403001785278,
- "num_tokens": 17068628.0,
- "step": 1907
- },
- {
- "epoch": 1.4498480243161094,
- "grad_norm": 2.4756391048431396,
- "learning_rate": 2.8465500231448707e-06,
- "loss": 0.46441152691841125,
- "mean_token_accuracy": 0.8436450958251953,
- "num_tokens": 17075495.0,
- "step": 1908
- },
- {
- "epoch": 1.4506079027355623,
- "grad_norm": 2.249720573425293,
- "learning_rate": 2.844475726711595e-06,
- "loss": 0.41565513610839844,
- "mean_token_accuracy": 0.8525094985961914,
- "num_tokens": 17080940.0,
- "step": 1909
- },
- {
- "epoch": 1.4513677811550152,
- "grad_norm": 2.3081841468811035,
- "learning_rate": 2.8424011885119956e-06,
- "loss": 0.49903199076652527,
- "mean_token_accuracy": 0.8212426900863647,
- "num_tokens": 17092024.0,
- "step": 1910
- },
- {
- "epoch": 1.452127659574468,
- "grad_norm": 1.2929959297180176,
- "learning_rate": 2.8403264100020613e-06,
- "loss": 0.47038257122039795,
- "mean_token_accuracy": 0.8319816589355469,
- "num_tokens": 17108840.0,
- "step": 1911
- },
- {
- "epoch": 1.452887537993921,
- "grad_norm": 1.6476463079452515,
- "learning_rate": 2.8382513926379508e-06,
- "loss": 0.42287829518318176,
- "mean_token_accuracy": 0.8555682897567749,
- "num_tokens": 17119704.0,
- "step": 1912
- },
- {
- "epoch": 1.453647416413374,
- "grad_norm": 1.759998083114624,
- "learning_rate": 2.836176137875993e-06,
- "loss": 0.40904951095581055,
- "mean_token_accuracy": 0.8698266744613647,
- "num_tokens": 17130676.0,
- "step": 1913
- },
- {
- "epoch": 1.4544072948328268,
- "grad_norm": 1.510909914970398,
- "learning_rate": 2.8341006471726817e-06,
- "loss": 0.47834792733192444,
- "mean_token_accuracy": 0.8335825204849243,
- "num_tokens": 17146304.0,
- "step": 1914
- },
- {
- "epoch": 1.4551671732522795,
- "grad_norm": 3.538071632385254,
- "learning_rate": 2.832024921984674e-06,
- "loss": 0.34059035778045654,
- "mean_token_accuracy": 0.8769031763076782,
- "num_tokens": 17150458.0,
- "step": 1915
- },
- {
- "epoch": 1.4559270516717326,
- "grad_norm": 2.3368659019470215,
- "learning_rate": 2.8299489637687955e-06,
- "loss": 0.43068382143974304,
- "mean_token_accuracy": 0.845360517501831,
- "num_tokens": 17157368.0,
- "step": 1916
- },
- {
- "epoch": 1.4566869300911853,
- "grad_norm": 1.8720396757125854,
- "learning_rate": 2.8278727739820334e-06,
- "loss": 0.37013399600982666,
- "mean_token_accuracy": 0.854241132736206,
- "num_tokens": 17166325.0,
- "step": 1917
- },
- {
- "epoch": 1.4574468085106382,
- "grad_norm": 1.6706892251968384,
- "learning_rate": 2.825796354081537e-06,
- "loss": 0.5397020578384399,
- "mean_token_accuracy": 0.8309713006019592,
- "num_tokens": 17178920.0,
- "step": 1918
- },
- {
- "epoch": 1.4582066869300911,
- "grad_norm": 2.729210376739502,
- "learning_rate": 2.8237197055246175e-06,
- "loss": 0.25137859582901,
- "mean_token_accuracy": 0.9148792028427124,
- "num_tokens": 17183107.0,
- "step": 1919
- },
- {
- "epoch": 1.458966565349544,
- "grad_norm": 3.023500680923462,
- "learning_rate": 2.821642829768748e-06,
- "loss": 0.43312495946884155,
- "mean_token_accuracy": 0.8481811285018921,
- "num_tokens": 17187853.0,
- "step": 1920
- },
- {
- "epoch": 1.459726443768997,
- "grad_norm": 1.8108519315719604,
- "learning_rate": 2.8195657282715595e-06,
- "loss": 0.5101792216300964,
- "mean_token_accuracy": 0.8315553069114685,
- "num_tokens": 17199247.0,
- "step": 1921
- },
- {
- "epoch": 1.4604863221884499,
- "grad_norm": 2.0262672901153564,
- "learning_rate": 2.817488402490841e-06,
- "loss": 0.4449934959411621,
- "mean_token_accuracy": 0.8634527325630188,
- "num_tokens": 17206348.0,
- "step": 1922
- },
- {
- "epoch": 1.4612462006079028,
- "grad_norm": 2.6163926124572754,
- "learning_rate": 2.8154108538845405e-06,
- "loss": 0.43052345514297485,
- "mean_token_accuracy": 0.8375401496887207,
- "num_tokens": 17211702.0,
- "step": 1923
- },
- {
- "epoch": 1.4620060790273557,
- "grad_norm": 2.0854408740997314,
- "learning_rate": 2.813333083910761e-06,
- "loss": 0.5011380910873413,
- "mean_token_accuracy": 0.8359915018081665,
- "num_tokens": 17219096.0,
- "step": 1924
- },
- {
- "epoch": 1.4627659574468086,
- "grad_norm": 2.2081687450408936,
- "learning_rate": 2.8112550940277615e-06,
- "loss": 0.5239193439483643,
- "mean_token_accuracy": 0.8499593734741211,
- "num_tokens": 17229266.0,
- "step": 1925
- },
- {
- "epoch": 1.4635258358662613,
- "grad_norm": 1.798343539237976,
- "learning_rate": 2.809176885693956e-06,
- "loss": 0.4515029191970825,
- "mean_token_accuracy": 0.8400485515594482,
- "num_tokens": 17239280.0,
- "step": 1926
- },
- {
- "epoch": 1.4642857142857144,
- "grad_norm": 1.897887945175171,
- "learning_rate": 2.807098460367911e-06,
- "loss": 0.35935714840888977,
- "mean_token_accuracy": 0.8776072263717651,
- "num_tokens": 17247132.0,
- "step": 1927
- },
- {
- "epoch": 1.465045592705167,
- "grad_norm": 2.705836296081543,
- "learning_rate": 2.8050198195083445e-06,
- "loss": 0.3728443682193756,
- "mean_token_accuracy": 0.8649885654449463,
- "num_tokens": 17251865.0,
- "step": 1928
- },
- {
- "epoch": 1.46580547112462,
- "grad_norm": 1.841178059577942,
- "learning_rate": 2.802940964574127e-06,
- "loss": 0.40604841709136963,
- "mean_token_accuracy": 0.8537783622741699,
- "num_tokens": 17260163.0,
- "step": 1929
- },
- {
- "epoch": 1.466565349544073,
- "grad_norm": 2.7393605709075928,
- "learning_rate": 2.800861897024279e-06,
- "loss": 0.39346879720687866,
- "mean_token_accuracy": 0.8628787994384766,
- "num_tokens": 17264876.0,
- "step": 1930
- },
- {
- "epoch": 1.4673252279635258,
- "grad_norm": 1.84367835521698,
- "learning_rate": 2.798782618317971e-06,
- "loss": 0.37411895394325256,
- "mean_token_accuracy": 0.8605265617370605,
- "num_tokens": 17273049.0,
- "step": 1931
- },
- {
- "epoch": 1.4680851063829787,
- "grad_norm": 1.6546733379364014,
- "learning_rate": 2.796703129914519e-06,
- "loss": 0.4997844099998474,
- "mean_token_accuracy": 0.8267433643341064,
- "num_tokens": 17285074.0,
- "step": 1932
- },
- {
- "epoch": 1.4688449848024316,
- "grad_norm": 2.2749221324920654,
- "learning_rate": 2.79462343327339e-06,
- "loss": 0.35453367233276367,
- "mean_token_accuracy": 0.8746850490570068,
- "num_tokens": 17290273.0,
- "step": 1933
- },
- {
- "epoch": 1.4696048632218845,
- "grad_norm": 1.7142518758773804,
- "learning_rate": 2.7925435298541944e-06,
- "loss": 0.345878541469574,
- "mean_token_accuracy": 0.8600981831550598,
- "num_tokens": 17301045.0,
- "step": 1934
- },
- {
- "epoch": 1.4703647416413375,
- "grad_norm": 3.163342237472534,
- "learning_rate": 2.7904634211166877e-06,
- "loss": 0.4356975853443146,
- "mean_token_accuracy": 0.8460350036621094,
- "num_tokens": 17305108.0,
- "step": 1935
- },
- {
- "epoch": 1.4711246200607904,
- "grad_norm": 1.6377612352371216,
- "learning_rate": 2.7883831085207707e-06,
- "loss": 0.4459729790687561,
- "mean_token_accuracy": 0.8463394641876221,
- "num_tokens": 17315479.0,
- "step": 1936
- },
- {
- "epoch": 1.471884498480243,
- "grad_norm": 1.865268588066101,
- "learning_rate": 2.7863025935264876e-06,
- "loss": 0.394723117351532,
- "mean_token_accuracy": 0.864177942276001,
- "num_tokens": 17324795.0,
- "step": 1937
- },
- {
- "epoch": 1.4726443768996962,
- "grad_norm": 1.241937518119812,
- "learning_rate": 2.784221877594024e-06,
- "loss": 0.2752220630645752,
- "mean_token_accuracy": 0.8998259902000427,
- "num_tokens": 17338000.0,
- "step": 1938
- },
- {
- "epoch": 1.4734042553191489,
- "grad_norm": 1.8013651371002197,
- "learning_rate": 2.7821409621837042e-06,
- "loss": 0.4251005947589874,
- "mean_token_accuracy": 0.8518919348716736,
- "num_tokens": 17347351.0,
- "step": 1939
- },
- {
- "epoch": 1.4741641337386018,
- "grad_norm": 1.2902207374572754,
- "learning_rate": 2.7800598487559976e-06,
- "loss": 0.3640727400779724,
- "mean_token_accuracy": 0.8592870235443115,
- "num_tokens": 17362335.0,
- "step": 1940
- },
- {
- "epoch": 1.4749240121580547,
- "grad_norm": 2.5427513122558594,
- "learning_rate": 2.777978538771508e-06,
- "loss": 0.38166797161102295,
- "mean_token_accuracy": 0.8653234839439392,
- "num_tokens": 17367733.0,
- "step": 1941
- },
- {
- "epoch": 1.4756838905775076,
- "grad_norm": 1.7793641090393066,
- "learning_rate": 2.7758970336909795e-06,
- "loss": 0.3113783895969391,
- "mean_token_accuracy": 0.8812868595123291,
- "num_tokens": 17375267.0,
- "step": 1942
- },
- {
- "epoch": 1.4764437689969605,
- "grad_norm": 3.4031741619110107,
- "learning_rate": 2.7738153349752923e-06,
- "loss": 0.4800986647605896,
- "mean_token_accuracy": 0.8336698412895203,
- "num_tokens": 17379549.0,
- "step": 1943
- },
- {
- "epoch": 1.4772036474164134,
- "grad_norm": 1.3451651334762573,
- "learning_rate": 2.7717334440854634e-06,
- "loss": 0.3115345239639282,
- "mean_token_accuracy": 0.908623218536377,
- "num_tokens": 17394455.0,
- "step": 1944
- },
- {
- "epoch": 1.4779635258358663,
- "grad_norm": 1.980919599533081,
- "learning_rate": 2.7696513624826422e-06,
- "loss": 0.391154944896698,
- "mean_token_accuracy": 0.8650267720222473,
- "num_tokens": 17401931.0,
- "step": 1945
- },
- {
- "epoch": 1.4787234042553192,
- "grad_norm": 1.0118765830993652,
- "learning_rate": 2.7675690916281158e-06,
- "loss": 0.3157956600189209,
- "mean_token_accuracy": 0.8827471733093262,
- "num_tokens": 17424144.0,
- "step": 1946
- },
- {
- "epoch": 1.4794832826747721,
- "grad_norm": 1.579654335975647,
- "learning_rate": 2.7654866329833e-06,
- "loss": 0.4578486382961273,
- "mean_token_accuracy": 0.8361750245094299,
- "num_tokens": 17435769.0,
- "step": 1947
- },
- {
- "epoch": 1.4802431610942248,
- "grad_norm": 1.7706717252731323,
- "learning_rate": 2.763403988009746e-06,
- "loss": 0.3564416170120239,
- "mean_token_accuracy": 0.8689201474189758,
- "num_tokens": 17444088.0,
- "step": 1948
- },
- {
- "epoch": 1.4810030395136777,
- "grad_norm": 1.2264244556427002,
- "learning_rate": 2.761321158169134e-06,
- "loss": 0.30763837695121765,
- "mean_token_accuracy": 0.8960219621658325,
- "num_tokens": 17458096.0,
- "step": 1949
- },
- {
- "epoch": 1.4817629179331306,
- "grad_norm": 1.214431881904602,
- "learning_rate": 2.759238144923274e-06,
- "loss": 0.49099457263946533,
- "mean_token_accuracy": 0.8279136419296265,
- "num_tokens": 17481062.0,
- "step": 1950
- },
- {
- "epoch": 1.4825227963525835,
- "grad_norm": 1.593892216682434,
- "learning_rate": 2.7571549497341044e-06,
- "loss": 0.3745320737361908,
- "mean_token_accuracy": 0.8690779209136963,
- "num_tokens": 17490874.0,
- "step": 1951
- },
- {
- "epoch": 1.4832826747720365,
- "grad_norm": 2.409924268722534,
- "learning_rate": 2.755071574063692e-06,
- "loss": 0.4310247600078583,
- "mean_token_accuracy": 0.8521159291267395,
- "num_tokens": 17496942.0,
- "step": 1952
- },
- {
- "epoch": 1.4840425531914894,
- "grad_norm": 1.2557463645935059,
- "learning_rate": 2.7529880193742297e-06,
- "loss": 0.34304720163345337,
- "mean_token_accuracy": 0.8748183250427246,
- "num_tokens": 17514391.0,
- "step": 1953
- },
- {
- "epoch": 1.4848024316109423,
- "grad_norm": 1.17310631275177,
- "learning_rate": 2.7509042871280373e-06,
- "loss": 0.3835817277431488,
- "mean_token_accuracy": 0.8853274583816528,
- "num_tokens": 17533289.0,
- "step": 1954
- },
- {
- "epoch": 1.4855623100303952,
- "grad_norm": 1.5261479616165161,
- "learning_rate": 2.748820378787558e-06,
- "loss": 0.4799988865852356,
- "mean_token_accuracy": 0.8252149820327759,
- "num_tokens": 17544118.0,
- "step": 1955
- },
- {
- "epoch": 1.486322188449848,
- "grad_norm": 2.030930757522583,
- "learning_rate": 2.7467362958153585e-06,
- "loss": 0.35690805315971375,
- "mean_token_accuracy": 0.8959587216377258,
- "num_tokens": 17550431.0,
- "step": 1956
- },
- {
- "epoch": 1.4870820668693008,
- "grad_norm": 2.376520872116089,
- "learning_rate": 2.7446520396741293e-06,
- "loss": 0.262234091758728,
- "mean_token_accuracy": 0.9054547548294067,
- "num_tokens": 17554853.0,
- "step": 1957
- },
- {
- "epoch": 1.487841945288754,
- "grad_norm": 1.6944479942321777,
- "learning_rate": 2.742567611826681e-06,
- "loss": 0.529259979724884,
- "mean_token_accuracy": 0.8195339441299438,
- "num_tokens": 17568016.0,
- "step": 1958
- },
- {
- "epoch": 1.4886018237082066,
- "grad_norm": 2.833029270172119,
- "learning_rate": 2.7404830137359445e-06,
- "loss": 0.30229634046554565,
- "mean_token_accuracy": 0.8933001756668091,
- "num_tokens": 17572587.0,
- "step": 1959
- },
- {
- "epoch": 1.4893617021276595,
- "grad_norm": 1.7040144205093384,
- "learning_rate": 2.7383982468649715e-06,
- "loss": 0.3166356682777405,
- "mean_token_accuracy": 0.8871906399726868,
- "num_tokens": 17580966.0,
- "step": 1960
- },
- {
- "epoch": 1.4901215805471124,
- "grad_norm": 1.7539052963256836,
- "learning_rate": 2.7363133126769326e-06,
- "loss": 0.4231064021587372,
- "mean_token_accuracy": 0.8708304166793823,
- "num_tokens": 17590907.0,
- "step": 1961
- },
- {
- "epoch": 1.4908814589665653,
- "grad_norm": 1.6198650598526,
- "learning_rate": 2.7342282126351145e-06,
- "loss": 0.4198967218399048,
- "mean_token_accuracy": 0.8723280429840088,
- "num_tokens": 17604291.0,
- "step": 1962
- },
- {
- "epoch": 1.4916413373860182,
- "grad_norm": 1.8437711000442505,
- "learning_rate": 2.73214294820292e-06,
- "loss": 0.38923323154449463,
- "mean_token_accuracy": 0.8697006106376648,
- "num_tokens": 17612291.0,
- "step": 1963
- },
- {
- "epoch": 1.4924012158054711,
- "grad_norm": 1.1129369735717773,
- "learning_rate": 2.7300575208438684e-06,
- "loss": 0.3107512593269348,
- "mean_token_accuracy": 0.878618597984314,
- "num_tokens": 17630073.0,
- "step": 1964
- },
- {
- "epoch": 1.493161094224924,
- "grad_norm": 3.0210442543029785,
- "learning_rate": 2.7279719320215924e-06,
- "loss": 0.4630751609802246,
- "mean_token_accuracy": 0.8567075729370117,
- "num_tokens": 17634758.0,
- "step": 1965
- },
- {
- "epoch": 1.493920972644377,
- "grad_norm": 2.8825972080230713,
- "learning_rate": 2.725886183199839e-06,
- "loss": 0.35351765155792236,
- "mean_token_accuracy": 0.8711981773376465,
- "num_tokens": 17639613.0,
- "step": 1966
- },
- {
- "epoch": 1.4946808510638299,
- "grad_norm": 2.111238718032837,
- "learning_rate": 2.723800275842468e-06,
- "loss": 0.3529569208621979,
- "mean_token_accuracy": 0.8679244518280029,
- "num_tokens": 17645308.0,
- "step": 1967
- },
- {
- "epoch": 1.4954407294832825,
- "grad_norm": 2.080509901046753,
- "learning_rate": 2.7217142114134466e-06,
- "loss": 0.43321219086647034,
- "mean_token_accuracy": 0.8848220109939575,
- "num_tokens": 17652292.0,
- "step": 1968
- },
- {
- "epoch": 1.4962006079027357,
- "grad_norm": 2.8686363697052,
- "learning_rate": 2.7196279913768587e-06,
- "loss": 0.417035311460495,
- "mean_token_accuracy": 0.8724601864814758,
- "num_tokens": 17656908.0,
- "step": 1969
- },
- {
- "epoch": 1.4969604863221884,
- "grad_norm": 3.294193744659424,
- "learning_rate": 2.717541617196891e-06,
- "loss": 0.3551934063434601,
- "mean_token_accuracy": 0.8838565349578857,
- "num_tokens": 17660590.0,
- "step": 1970
- },
- {
- "epoch": 1.4977203647416413,
- "grad_norm": 1.766292929649353,
- "learning_rate": 2.7154550903378425e-06,
- "loss": 0.36521971225738525,
- "mean_token_accuracy": 0.8810199499130249,
- "num_tokens": 17668214.0,
- "step": 1971
- },
- {
- "epoch": 1.4984802431610942,
- "grad_norm": 1.2127676010131836,
- "learning_rate": 2.713368412264118e-06,
- "loss": 0.35184425115585327,
- "mean_token_accuracy": 0.8672580718994141,
- "num_tokens": 17684736.0,
- "step": 1972
- },
- {
- "epoch": 1.499240121580547,
- "grad_norm": 2.268256664276123,
- "learning_rate": 2.711281584440228e-06,
- "loss": 0.40115267038345337,
- "mean_token_accuracy": 0.8517841100692749,
- "num_tokens": 17691510.0,
- "step": 1973
- },
- {
- "epoch": 1.5,
- "grad_norm": 2.7196054458618164,
- "learning_rate": 2.70919460833079e-06,
- "loss": 0.3819037675857544,
- "mean_token_accuracy": 0.8765411376953125,
- "num_tokens": 17696179.0,
- "step": 1974
- },
- {
- "epoch": 1.500759878419453,
- "grad_norm": 2.969406843185425,
- "learning_rate": 2.7071074854005206e-06,
- "loss": 0.3922455608844757,
- "mean_token_accuracy": 0.8796037435531616,
- "num_tokens": 17700597.0,
- "step": 1975
- },
- {
- "epoch": 1.5015197568389058,
- "grad_norm": 2.2965853214263916,
- "learning_rate": 2.705020217114248e-06,
- "loss": 0.5433666110038757,
- "mean_token_accuracy": 0.809639036655426,
- "num_tokens": 17708895.0,
- "step": 1976
- },
- {
- "epoch": 1.5022796352583585,
- "grad_norm": 1.5584394931793213,
- "learning_rate": 2.7029328049368942e-06,
- "loss": 0.4736343324184418,
- "mean_token_accuracy": 0.8197190761566162,
- "num_tokens": 17725202.0,
- "step": 1977
- },
- {
- "epoch": 1.5030395136778116,
- "grad_norm": 1.3903142213821411,
- "learning_rate": 2.700845250333486e-06,
- "loss": 0.4471571445465088,
- "mean_token_accuracy": 0.839043140411377,
- "num_tokens": 17742835.0,
- "step": 1978
- },
- {
- "epoch": 1.5037993920972643,
- "grad_norm": 3.080716609954834,
- "learning_rate": 2.69875755476915e-06,
- "loss": 0.45760005712509155,
- "mean_token_accuracy": 0.8366328477859497,
- "num_tokens": 17747324.0,
- "step": 1979
- },
- {
- "epoch": 1.5045592705167175,
- "grad_norm": 1.0150405168533325,
- "learning_rate": 2.696669719709111e-06,
- "loss": 0.33638954162597656,
- "mean_token_accuracy": 0.8591676354408264,
- "num_tokens": 17765565.0,
- "step": 1980
- },
- {
- "epoch": 1.5053191489361701,
- "grad_norm": 2.402927875518799,
- "learning_rate": 2.694581746618691e-06,
- "loss": 0.4086601436138153,
- "mean_token_accuracy": 0.8769911527633667,
- "num_tokens": 17771275.0,
- "step": 1981
- },
- {
- "epoch": 1.506079027355623,
- "grad_norm": 2.030583381652832,
- "learning_rate": 2.6924936369633126e-06,
- "loss": 0.5115457773208618,
- "mean_token_accuracy": 0.8054746389389038,
- "num_tokens": 17779999.0,
- "step": 1982
- },
- {
- "epoch": 1.506838905775076,
- "grad_norm": 2.575199604034424,
- "learning_rate": 2.6904053922084893e-06,
- "loss": 0.363183856010437,
- "mean_token_accuracy": 0.8716042637825012,
- "num_tokens": 17785473.0,
- "step": 1983
- },
- {
- "epoch": 1.5075987841945289,
- "grad_norm": 1.8497480154037476,
- "learning_rate": 2.688317013819832e-06,
- "loss": 0.4254384934902191,
- "mean_token_accuracy": 0.8549597263336182,
- "num_tokens": 17793812.0,
- "step": 1984
- },
- {
- "epoch": 1.5083586626139818,
- "grad_norm": 1.7786511182785034,
- "learning_rate": 2.686228503263045e-06,
- "loss": 0.33400774002075195,
- "mean_token_accuracy": 0.9027615189552307,
- "num_tokens": 17801783.0,
- "step": 1985
- },
- {
- "epoch": 1.5091185410334347,
- "grad_norm": 1.8365367650985718,
- "learning_rate": 2.684139862003927e-06,
- "loss": 0.35765063762664795,
- "mean_token_accuracy": 0.8663736581802368,
- "num_tokens": 17809562.0,
- "step": 1986
- },
- {
- "epoch": 1.5098784194528876,
- "grad_norm": 1.8817477226257324,
- "learning_rate": 2.682051091508365e-06,
- "loss": 0.4627506732940674,
- "mean_token_accuracy": 0.8358862400054932,
- "num_tokens": 17819094.0,
- "step": 1987
- },
- {
- "epoch": 1.5106382978723403,
- "grad_norm": 2.221547842025757,
- "learning_rate": 2.679962193242338e-06,
- "loss": 0.577020525932312,
- "mean_token_accuracy": 0.80013108253479,
- "num_tokens": 17826666.0,
- "step": 1988
- },
- {
- "epoch": 1.5113981762917934,
- "grad_norm": 2.6618270874023438,
- "learning_rate": 2.6778731686719177e-06,
- "loss": 0.44632256031036377,
- "mean_token_accuracy": 0.8611289262771606,
- "num_tokens": 17833172.0,
- "step": 1989
- },
- {
- "epoch": 1.512158054711246,
- "grad_norm": 2.9495689868927,
- "learning_rate": 2.67578401926326e-06,
- "loss": 0.3482511043548584,
- "mean_token_accuracy": 0.8703314661979675,
- "num_tokens": 17837220.0,
- "step": 1990
- },
- {
- "epoch": 1.5129179331306992,
- "grad_norm": 2.0943644046783447,
- "learning_rate": 2.6736947464826107e-06,
- "loss": 0.2354314625263214,
- "mean_token_accuracy": 0.9137634038925171,
- "num_tokens": 17842712.0,
- "step": 1991
- },
- {
- "epoch": 1.513677811550152,
- "grad_norm": 1.1303033828735352,
- "learning_rate": 2.671605351796302e-06,
- "loss": 0.3624761700630188,
- "mean_token_accuracy": 0.8769594430923462,
- "num_tokens": 17860902.0,
- "step": 1992
- },
- {
- "epoch": 1.5144376899696048,
- "grad_norm": 2.8921146392822266,
- "learning_rate": 2.6695158366707526e-06,
- "loss": 0.2517220973968506,
- "mean_token_accuracy": 0.8974182605743408,
- "num_tokens": 17865160.0,
- "step": 1993
- },
- {
- "epoch": 1.5151975683890577,
- "grad_norm": 2.320587158203125,
- "learning_rate": 2.667426202572463e-06,
- "loss": 0.4589889943599701,
- "mean_token_accuracy": 0.8379613161087036,
- "num_tokens": 17871994.0,
- "step": 1994
- },
- {
- "epoch": 1.5159574468085106,
- "grad_norm": 1.1407674551010132,
- "learning_rate": 2.665336450968019e-06,
- "loss": 0.34412115812301636,
- "mean_token_accuracy": 0.8776306509971619,
- "num_tokens": 17889941.0,
- "step": 1995
- },
- {
- "epoch": 1.5167173252279635,
- "grad_norm": 2.069814920425415,
- "learning_rate": 2.6632465833240895e-06,
- "loss": 0.47524404525756836,
- "mean_token_accuracy": 0.830310046672821,
- "num_tokens": 17898447.0,
- "step": 1996
- },
- {
- "epoch": 1.5174772036474165,
- "grad_norm": 1.822415828704834,
- "learning_rate": 2.661156601107424e-06,
- "loss": 0.4541318416595459,
- "mean_token_accuracy": 0.8856616020202637,
- "num_tokens": 17908729.0,
- "step": 1997
- },
- {
- "epoch": 1.5182370820668694,
- "grad_norm": 2.851428985595703,
- "learning_rate": 2.659066505784852e-06,
- "loss": 0.41761666536331177,
- "mean_token_accuracy": 0.8710572719573975,
- "num_tokens": 17913860.0,
- "step": 1998
- },
- {
- "epoch": 1.518996960486322,
- "grad_norm": 1.8483710289001465,
- "learning_rate": 2.6569762988232838e-06,
- "loss": 0.45517268776893616,
- "mean_token_accuracy": 0.8411115407943726,
- "num_tokens": 17923497.0,
- "step": 1999
- },
- {
- "epoch": 1.5197568389057752,
- "grad_norm": 1.9044219255447388,
- "learning_rate": 2.654885981689706e-06,
- "loss": 0.42533189058303833,
- "mean_token_accuracy": 0.8597894906997681,
- "num_tokens": 17932670.0,
- "step": 2000
- },
- {
- "epoch": 1.5205167173252279,
- "grad_norm": 1.8170348405838013,
- "learning_rate": 2.652795555851184e-06,
- "loss": 0.4009692072868347,
- "mean_token_accuracy": 0.8553036451339722,
- "num_tokens": 17941616.0,
- "step": 2001
- },
- {
- "epoch": 1.521276595744681,
- "grad_norm": 1.4704090356826782,
- "learning_rate": 2.6507050227748595e-06,
- "loss": 0.3732764720916748,
- "mean_token_accuracy": 0.8788566589355469,
- "num_tokens": 17957187.0,
- "step": 2002
- },
- {
- "epoch": 1.5220364741641337,
- "grad_norm": 1.6681534051895142,
- "learning_rate": 2.648614383927949e-06,
- "loss": 0.341326504945755,
- "mean_token_accuracy": 0.874875545501709,
- "num_tokens": 17966668.0,
- "step": 2003
- },
- {
- "epoch": 1.5227963525835866,
- "grad_norm": 1.8578619956970215,
- "learning_rate": 2.646523640777741e-06,
- "loss": 0.3937399983406067,
- "mean_token_accuracy": 0.8656851053237915,
- "num_tokens": 17976194.0,
- "step": 2004
- },
- {
- "epoch": 1.5235562310030395,
- "grad_norm": 1.7520431280136108,
- "learning_rate": 2.6444327947916037e-06,
- "loss": 0.3392767906188965,
- "mean_token_accuracy": 0.8799679279327393,
- "num_tokens": 17984492.0,
- "step": 2005
- },
- {
- "epoch": 1.5243161094224924,
- "grad_norm": 3.4649906158447266,
- "learning_rate": 2.6423418474369707e-06,
- "loss": 0.3451516032218933,
- "mean_token_accuracy": 0.8753262758255005,
- "num_tokens": 17988240.0,
- "step": 2006
- },
- {
- "epoch": 1.5250759878419453,
- "grad_norm": 1.8037052154541016,
- "learning_rate": 2.64025080018135e-06,
- "loss": 0.34428173303604126,
- "mean_token_accuracy": 0.8719067573547363,
- "num_tokens": 17996644.0,
- "step": 2007
- },
- {
- "epoch": 1.5258358662613982,
- "grad_norm": 1.743722677230835,
- "learning_rate": 2.6381596544923184e-06,
- "loss": 0.4446655213832855,
- "mean_token_accuracy": 0.8612518906593323,
- "num_tokens": 18005109.0,
- "step": 2008
- },
- {
- "epoch": 1.5265957446808511,
- "grad_norm": 1.3357981443405151,
- "learning_rate": 2.636068411837523e-06,
- "loss": 0.38647788763046265,
- "mean_token_accuracy": 0.858294665813446,
- "num_tokens": 18018193.0,
- "step": 2009
- },
- {
- "epoch": 1.5273556231003038,
- "grad_norm": 1.4848440885543823,
- "learning_rate": 2.6339770736846794e-06,
- "loss": 0.3597261607646942,
- "mean_token_accuracy": 0.8760983943939209,
- "num_tokens": 18028959.0,
- "step": 2010
- },
- {
- "epoch": 1.528115501519757,
- "grad_norm": 2.356933832168579,
- "learning_rate": 2.6318856415015664e-06,
- "loss": 0.2697138488292694,
- "mean_token_accuracy": 0.9078473448753357,
- "num_tokens": 18033946.0,
- "step": 2011
- },
- {
- "epoch": 1.5288753799392096,
- "grad_norm": 1.964368224143982,
- "learning_rate": 2.629794116756035e-06,
- "loss": 0.41349685192108154,
- "mean_token_accuracy": 0.8567900657653809,
- "num_tokens": 18042724.0,
- "step": 2012
- },
- {
- "epoch": 1.5296352583586628,
- "grad_norm": 1.5630402565002441,
- "learning_rate": 2.627702500915995e-06,
- "loss": 0.49310681223869324,
- "mean_token_accuracy": 0.8229681253433228,
- "num_tokens": 18054396.0,
- "step": 2013
- },
- {
- "epoch": 1.5303951367781155,
- "grad_norm": 1.6657718420028687,
- "learning_rate": 2.625610795449424e-06,
- "loss": 0.4263935387134552,
- "mean_token_accuracy": 0.8634918332099915,
- "num_tokens": 18064347.0,
- "step": 2014
- },
- {
- "epoch": 1.5311550151975684,
- "grad_norm": 1.3684180974960327,
- "learning_rate": 2.6235190018243623e-06,
- "loss": 0.2903984487056732,
- "mean_token_accuracy": 0.8930408358573914,
- "num_tokens": 18076826.0,
- "step": 2015
- },
- {
- "epoch": 1.5319148936170213,
- "grad_norm": 1.635044813156128,
- "learning_rate": 2.6214271215089106e-06,
- "loss": 0.3066539168357849,
- "mean_token_accuracy": 0.8912158012390137,
- "num_tokens": 18085761.0,
- "step": 2016
- },
- {
- "epoch": 1.5326747720364742,
- "grad_norm": 2.431518316268921,
- "learning_rate": 2.6193351559712294e-06,
- "loss": 0.31123271584510803,
- "mean_token_accuracy": 0.8865828514099121,
- "num_tokens": 18091715.0,
- "step": 2017
- },
- {
- "epoch": 1.533434650455927,
- "grad_norm": 1.8317419290542603,
- "learning_rate": 2.6172431066795428e-06,
- "loss": 0.5042020082473755,
- "mean_token_accuracy": 0.8245081901550293,
- "num_tokens": 18102095.0,
- "step": 2018
- },
- {
- "epoch": 1.53419452887538,
- "grad_norm": 3.4221980571746826,
- "learning_rate": 2.6151509751021307e-06,
- "loss": 0.2885819971561432,
- "mean_token_accuracy": 0.8997149467468262,
- "num_tokens": 18105456.0,
- "step": 2019
- },
- {
- "epoch": 1.534954407294833,
- "grad_norm": 1.4435855150222778,
- "learning_rate": 2.6130587627073315e-06,
- "loss": 0.45573529601097107,
- "mean_token_accuracy": 0.837191104888916,
- "num_tokens": 18119039.0,
- "step": 2020
- },
- {
- "epoch": 1.5357142857142856,
- "grad_norm": 1.5748237371444702,
- "learning_rate": 2.6109664709635413e-06,
- "loss": 0.4561889171600342,
- "mean_token_accuracy": 0.8334558010101318,
- "num_tokens": 18132150.0,
- "step": 2021
- },
- {
- "epoch": 1.5364741641337387,
- "grad_norm": 2.8278751373291016,
- "learning_rate": 2.60887410133921e-06,
- "loss": 0.3495104908943176,
- "mean_token_accuracy": 0.8926796913146973,
- "num_tokens": 18136528.0,
- "step": 2022
- },
- {
- "epoch": 1.5372340425531914,
- "grad_norm": 2.5045573711395264,
- "learning_rate": 2.606781655302843e-06,
- "loss": 0.45362481474876404,
- "mean_token_accuracy": 0.8379551768302917,
- "num_tokens": 18142581.0,
- "step": 2023
- },
- {
- "epoch": 1.5379939209726445,
- "grad_norm": 2.5984106063842773,
- "learning_rate": 2.604689134322999e-06,
- "loss": 0.4210243821144104,
- "mean_token_accuracy": 0.8571645021438599,
- "num_tokens": 18148152.0,
- "step": 2024
- },
- {
- "epoch": 1.5387537993920972,
- "grad_norm": 1.7180702686309814,
- "learning_rate": 2.602596539868292e-06,
- "loss": 0.2478562295436859,
- "mean_token_accuracy": 0.9227135181427002,
- "num_tokens": 18155435.0,
- "step": 2025
- },
- {
- "epoch": 1.5395136778115501,
- "grad_norm": 2.3721933364868164,
- "learning_rate": 2.6005038734073833e-06,
- "loss": 0.3820664584636688,
- "mean_token_accuracy": 0.8788443803787231,
- "num_tokens": 18161403.0,
- "step": 2026
- },
- {
- "epoch": 1.540273556231003,
- "grad_norm": 1.4967509508132935,
- "learning_rate": 2.5984111364089875e-06,
- "loss": 0.34247124195098877,
- "mean_token_accuracy": 0.8809049129486084,
- "num_tokens": 18173724.0,
- "step": 2027
- },
- {
- "epoch": 1.541033434650456,
- "grad_norm": 2.5226845741271973,
- "learning_rate": 2.5963183303418682e-06,
- "loss": 0.2647642493247986,
- "mean_token_accuracy": 0.8988642692565918,
- "num_tokens": 18178927.0,
- "step": 2028
- },
- {
- "epoch": 1.5417933130699089,
- "grad_norm": 2.217228412628174,
- "learning_rate": 2.594225456674837e-06,
- "loss": 0.37754058837890625,
- "mean_token_accuracy": 0.8660204410552979,
- "num_tokens": 18185268.0,
- "step": 2029
- },
- {
- "epoch": 1.5425531914893615,
- "grad_norm": 2.336409091949463,
- "learning_rate": 2.592132516876753e-06,
- "loss": 0.45098528265953064,
- "mean_token_accuracy": 0.842115044593811,
- "num_tokens": 18192372.0,
- "step": 2030
- },
- {
- "epoch": 1.5433130699088147,
- "grad_norm": 3.5437142848968506,
- "learning_rate": 2.5900395124165216e-06,
- "loss": 0.5326460003852844,
- "mean_token_accuracy": 0.8125103712081909,
- "num_tokens": 18199182.0,
- "step": 2031
- },
- {
- "epoch": 1.5440729483282674,
- "grad_norm": 1.5785651206970215,
- "learning_rate": 2.5879464447630947e-06,
- "loss": 0.3714991509914398,
- "mean_token_accuracy": 0.8711390495300293,
- "num_tokens": 18209045.0,
- "step": 2032
- },
- {
- "epoch": 1.5448328267477205,
- "grad_norm": 2.3616182804107666,
- "learning_rate": 2.5858533153854676e-06,
- "loss": 0.4548399746417999,
- "mean_token_accuracy": 0.8411449193954468,
- "num_tokens": 18215487.0,
- "step": 2033
- },
- {
- "epoch": 1.5455927051671732,
- "grad_norm": 2.0750479698181152,
- "learning_rate": 2.583760125752679e-06,
- "loss": 0.3980535566806793,
- "mean_token_accuracy": 0.8603327870368958,
- "num_tokens": 18222606.0,
- "step": 2034
- },
- {
- "epoch": 1.5463525835866263,
- "grad_norm": 2.609295129776001,
- "learning_rate": 2.58166687733381e-06,
- "loss": 0.40177756547927856,
- "mean_token_accuracy": 0.8652099370956421,
- "num_tokens": 18227341.0,
- "step": 2035
- },
- {
- "epoch": 1.547112462006079,
- "grad_norm": 2.1621339321136475,
- "learning_rate": 2.5795735715979826e-06,
- "loss": 0.45104342699050903,
- "mean_token_accuracy": 0.8481369018554688,
- "num_tokens": 18235820.0,
- "step": 2036
- },
- {
- "epoch": 1.547872340425532,
- "grad_norm": 1.0381370782852173,
- "learning_rate": 2.577480210014359e-06,
- "loss": 0.32621103525161743,
- "mean_token_accuracy": 0.8867391347885132,
- "num_tokens": 18258307.0,
- "step": 2037
- },
- {
- "epoch": 1.5486322188449848,
- "grad_norm": 1.7634375095367432,
- "learning_rate": 2.575386794052142e-06,
- "loss": 0.5115169882774353,
- "mean_token_accuracy": 0.818779468536377,
- "num_tokens": 18272782.0,
- "step": 2038
- },
- {
- "epoch": 1.5493920972644377,
- "grad_norm": 1.874875545501709,
- "learning_rate": 2.5732933251805716e-06,
- "loss": 0.4381459951400757,
- "mean_token_accuracy": 0.8594684600830078,
- "num_tokens": 18282618.0,
- "step": 2039
- },
- {
- "epoch": 1.5501519756838906,
- "grad_norm": 2.1316351890563965,
- "learning_rate": 2.571199804868923e-06,
- "loss": 0.5410124063491821,
- "mean_token_accuracy": 0.8247587084770203,
- "num_tokens": 18289750.0,
- "step": 2040
- },
- {
- "epoch": 1.5509118541033433,
- "grad_norm": 1.7574573755264282,
- "learning_rate": 2.569106234586511e-06,
- "loss": 0.29967373609542847,
- "mean_token_accuracy": 0.8913218975067139,
- "num_tokens": 18298110.0,
- "step": 2041
- },
- {
- "epoch": 1.5516717325227964,
- "grad_norm": 1.929626703262329,
- "learning_rate": 2.5670126158026843e-06,
- "loss": 0.3287760019302368,
- "mean_token_accuracy": 0.8870488405227661,
- "num_tokens": 18305702.0,
- "step": 2042
- },
- {
- "epoch": 1.5524316109422491,
- "grad_norm": 3.020153284072876,
- "learning_rate": 2.5649189499868233e-06,
- "loss": 0.38523542881011963,
- "mean_token_accuracy": 0.854824960231781,
- "num_tokens": 18309830.0,
- "step": 2043
- },
- {
- "epoch": 1.5531914893617023,
- "grad_norm": 1.6378421783447266,
- "learning_rate": 2.5628252386083443e-06,
- "loss": 0.47371378540992737,
- "mean_token_accuracy": 0.8627713918685913,
- "num_tokens": 18322820.0,
- "step": 2044
- },
- {
- "epoch": 1.553951367781155,
- "grad_norm": 1.3711130619049072,
- "learning_rate": 2.560731483136694e-06,
- "loss": 0.3319293260574341,
- "mean_token_accuracy": 0.8704103231430054,
- "num_tokens": 18335074.0,
- "step": 2045
- },
- {
- "epoch": 1.5547112462006079,
- "grad_norm": 1.7589185237884521,
- "learning_rate": 2.558637685041352e-06,
- "loss": 0.4446021020412445,
- "mean_token_accuracy": 0.8446722626686096,
- "num_tokens": 18344115.0,
- "step": 2046
- },
- {
- "epoch": 1.5554711246200608,
- "grad_norm": 2.5249195098876953,
- "learning_rate": 2.5565438457918247e-06,
- "loss": 0.4625541865825653,
- "mean_token_accuracy": 0.8451195359230042,
- "num_tokens": 18349235.0,
- "step": 2047
- },
- {
- "epoch": 1.5562310030395137,
- "grad_norm": 1.0562543869018555,
- "learning_rate": 2.5544499668576508e-06,
- "loss": 0.33747735619544983,
- "mean_token_accuracy": 0.8503615856170654,
- "num_tokens": 18368253.0,
- "step": 2048
- },
- {
- "epoch": 1.5569908814589666,
- "grad_norm": 2.9451215267181396,
- "learning_rate": 2.5523560497083927e-06,
- "loss": 0.3958815932273865,
- "mean_token_accuracy": 0.8393744826316833,
- "num_tokens": 18372887.0,
- "step": 2049
- },
- {
- "epoch": 1.5577507598784195,
- "grad_norm": 1.3597660064697266,
- "learning_rate": 2.5502620958136444e-06,
- "loss": 0.46281275153160095,
- "mean_token_accuracy": 0.8269470930099487,
- "num_tokens": 18388074.0,
- "step": 2050
- },
- {
- "epoch": 1.5585106382978724,
- "grad_norm": 3.269068717956543,
- "learning_rate": 2.548168106643022e-06,
- "loss": 0.2309008538722992,
- "mean_token_accuracy": 0.9178205728530884,
- "num_tokens": 18391406.0,
- "step": 2051
- },
- {
- "epoch": 1.559270516717325,
- "grad_norm": 2.1459391117095947,
- "learning_rate": 2.546074083666169e-06,
- "loss": 0.4006733298301697,
- "mean_token_accuracy": 0.8631902933120728,
- "num_tokens": 18397497.0,
- "step": 2052
- },
- {
- "epoch": 1.5600303951367782,
- "grad_norm": 1.4614566564559937,
- "learning_rate": 2.5439800283527495e-06,
- "loss": 0.40810418128967285,
- "mean_token_accuracy": 0.8473483920097351,
- "num_tokens": 18409474.0,
- "step": 2053
- },
- {
- "epoch": 1.560790273556231,
- "grad_norm": 2.084808826446533,
- "learning_rate": 2.541885942172454e-06,
- "loss": 0.34967708587646484,
- "mean_token_accuracy": 0.8707003593444824,
- "num_tokens": 18416400.0,
- "step": 2054
- },
- {
- "epoch": 1.561550151975684,
- "grad_norm": 1.90664541721344,
- "learning_rate": 2.539791826594991e-06,
- "loss": 0.37694251537323,
- "mean_token_accuracy": 0.8704941272735596,
- "num_tokens": 18424206.0,
- "step": 2055
- },
- {
- "epoch": 1.5623100303951367,
- "grad_norm": 1.880176305770874,
- "learning_rate": 2.537697683090093e-06,
- "loss": 0.32510411739349365,
- "mean_token_accuracy": 0.8848961591720581,
- "num_tokens": 18431676.0,
- "step": 2056
- },
- {
- "epoch": 1.5630699088145896,
- "grad_norm": 2.133375406265259,
- "learning_rate": 2.5356035131275096e-06,
- "loss": 0.30538493394851685,
- "mean_token_accuracy": 0.8890067338943481,
- "num_tokens": 18438014.0,
- "step": 2057
- },
- {
- "epoch": 1.5638297872340425,
- "grad_norm": 2.3495655059814453,
- "learning_rate": 2.5335093181770105e-06,
- "loss": 0.3126775324344635,
- "mean_token_accuracy": 0.8865689039230347,
- "num_tokens": 18443604.0,
- "step": 2058
- },
- {
- "epoch": 1.5645896656534954,
- "grad_norm": 2.37949538230896,
- "learning_rate": 2.531415099708382e-06,
- "loss": 0.3257793188095093,
- "mean_token_accuracy": 0.8809669017791748,
- "num_tokens": 18448654.0,
- "step": 2059
- },
- {
- "epoch": 1.5653495440729484,
- "grad_norm": 1.8285472393035889,
- "learning_rate": 2.5293208591914265e-06,
- "loss": 0.32376936078071594,
- "mean_token_accuracy": 0.8816431760787964,
- "num_tokens": 18456619.0,
- "step": 2060
- },
- {
- "epoch": 1.5661094224924013,
- "grad_norm": 2.3238534927368164,
- "learning_rate": 2.5272265980959644e-06,
- "loss": 0.40366506576538086,
- "mean_token_accuracy": 0.8496750593185425,
- "num_tokens": 18462788.0,
- "step": 2061
- },
- {
- "epoch": 1.5668693009118542,
- "grad_norm": 1.8954942226409912,
- "learning_rate": 2.525132317891827e-06,
- "loss": 0.3405473828315735,
- "mean_token_accuracy": 0.8849360942840576,
- "num_tokens": 18470719.0,
- "step": 2062
- },
- {
- "epoch": 1.5676291793313069,
- "grad_norm": 1.6268190145492554,
- "learning_rate": 2.523038020048861e-06,
- "loss": 0.3662685751914978,
- "mean_token_accuracy": 0.8865662813186646,
- "num_tokens": 18482095.0,
- "step": 2063
- },
- {
- "epoch": 1.56838905775076,
- "grad_norm": 2.5198733806610107,
- "learning_rate": 2.5209437060369266e-06,
- "loss": 0.3968311548233032,
- "mean_token_accuracy": 0.8643308281898499,
- "num_tokens": 18488069.0,
- "step": 2064
- },
- {
- "epoch": 1.5691489361702127,
- "grad_norm": 2.9197335243225098,
- "learning_rate": 2.518849377325893e-06,
- "loss": 0.24738386273384094,
- "mean_token_accuracy": 0.91959547996521,
- "num_tokens": 18491762.0,
- "step": 2065
- },
- {
- "epoch": 1.5699088145896658,
- "grad_norm": 1.5914254188537598,
- "learning_rate": 2.51675503538564e-06,
- "loss": 0.33473581075668335,
- "mean_token_accuracy": 0.8794662952423096,
- "num_tokens": 18501316.0,
- "step": 2066
- },
- {
- "epoch": 1.5706686930091185,
- "grad_norm": 2.5130460262298584,
- "learning_rate": 2.5146606816860597e-06,
- "loss": 0.4067240357398987,
- "mean_token_accuracy": 0.8564209342002869,
- "num_tokens": 18507169.0,
- "step": 2067
- },
- {
- "epoch": 1.5714285714285714,
- "grad_norm": 2.093353509902954,
- "learning_rate": 2.5125663176970475e-06,
- "loss": 0.4312136769294739,
- "mean_token_accuracy": 0.8540225028991699,
- "num_tokens": 18514536.0,
- "step": 2068
- },
- {
- "epoch": 1.5721884498480243,
- "grad_norm": 1.284495234489441,
- "learning_rate": 2.5104719448885103e-06,
- "loss": 0.3813856542110443,
- "mean_token_accuracy": 0.8435653448104858,
- "num_tokens": 18529947.0,
- "step": 2069
- },
- {
- "epoch": 1.5729483282674772,
- "grad_norm": 2.0383973121643066,
- "learning_rate": 2.5083775647303583e-06,
- "loss": 0.4428079426288605,
- "mean_token_accuracy": 0.8841741681098938,
- "num_tokens": 18537109.0,
- "step": 2070
- },
- {
- "epoch": 1.5737082066869301,
- "grad_norm": 1.7991697788238525,
- "learning_rate": 2.5062831786925102e-06,
- "loss": 0.460052490234375,
- "mean_token_accuracy": 0.8459943532943726,
- "num_tokens": 18547108.0,
- "step": 2071
- },
- {
- "epoch": 1.574468085106383,
- "grad_norm": 2.2168822288513184,
- "learning_rate": 2.5041887882448845e-06,
- "loss": 0.2863885462284088,
- "mean_token_accuracy": 0.906816840171814,
- "num_tokens": 18552357.0,
- "step": 2072
- },
- {
- "epoch": 1.575227963525836,
- "grad_norm": 3.918499708175659,
- "learning_rate": 2.5020943948574056e-06,
- "loss": 0.3439999222755432,
- "mean_token_accuracy": 0.8742123246192932,
- "num_tokens": 18555272.0,
- "step": 2073
- },
- {
- "epoch": 1.5759878419452886,
- "grad_norm": 1.773869514465332,
- "learning_rate": 2.5e-06,
- "loss": 0.2815646827220917,
- "mean_token_accuracy": 0.8939872980117798,
- "num_tokens": 18562989.0,
- "step": 2074
- },
- {
- "epoch": 1.5767477203647418,
- "grad_norm": 1.8675572872161865,
- "learning_rate": 2.497905605142595e-06,
- "loss": 0.5005829930305481,
- "mean_token_accuracy": 0.8242729902267456,
- "num_tokens": 18575587.0,
- "step": 2075
- },
- {
- "epoch": 1.5775075987841944,
- "grad_norm": 2.3143508434295654,
- "learning_rate": 2.4958112117551163e-06,
- "loss": 0.42472895979881287,
- "mean_token_accuracy": 0.8540043830871582,
- "num_tokens": 18581666.0,
- "step": 2076
- },
- {
- "epoch": 1.5782674772036476,
- "grad_norm": 2.529740333557129,
- "learning_rate": 2.4937168213074906e-06,
- "loss": 0.24539905786514282,
- "mean_token_accuracy": 0.9041235446929932,
- "num_tokens": 18585773.0,
- "step": 2077
- },
- {
- "epoch": 1.5790273556231003,
- "grad_norm": 2.5188395977020264,
- "learning_rate": 2.491622435269642e-06,
- "loss": 0.23059265315532684,
- "mean_token_accuracy": 0.9204603433609009,
- "num_tokens": 18589915.0,
- "step": 2078
- },
- {
- "epoch": 1.5797872340425532,
- "grad_norm": 2.7752444744110107,
- "learning_rate": 2.489528055111491e-06,
- "loss": 0.452225923538208,
- "mean_token_accuracy": 0.8444918990135193,
- "num_tokens": 18595488.0,
- "step": 2079
- },
- {
- "epoch": 1.580547112462006,
- "grad_norm": 1.174774408340454,
- "learning_rate": 2.487433682302953e-06,
- "loss": 0.3399246633052826,
- "mean_token_accuracy": 0.8608446717262268,
- "num_tokens": 18613756.0,
- "step": 2080
- },
- {
- "epoch": 1.581306990881459,
- "grad_norm": 1.515575647354126,
- "learning_rate": 2.485339318313941e-06,
- "loss": 0.45886170864105225,
- "mean_token_accuracy": 0.8479131460189819,
- "num_tokens": 18629610.0,
- "step": 2081
- },
- {
- "epoch": 1.582066869300912,
- "grad_norm": 1.7039403915405273,
- "learning_rate": 2.4832449646143605e-06,
- "loss": 0.349803626537323,
- "mean_token_accuracy": 0.8721815347671509,
- "num_tokens": 18637523.0,
- "step": 2082
- },
- {
- "epoch": 1.5828267477203646,
- "grad_norm": 3.2289421558380127,
- "learning_rate": 2.4811506226741077e-06,
- "loss": 0.4967171549797058,
- "mean_token_accuracy": 0.8303675651550293,
- "num_tokens": 18641826.0,
- "step": 2083
- },
- {
- "epoch": 1.5835866261398177,
- "grad_norm": 1.71235990524292,
- "learning_rate": 2.4790562939630738e-06,
- "loss": 0.4202485680580139,
- "mean_token_accuracy": 0.8581224679946899,
- "num_tokens": 18653146.0,
- "step": 2084
- },
- {
- "epoch": 1.5843465045592704,
- "grad_norm": 1.710036277770996,
- "learning_rate": 2.4769619799511392e-06,
- "loss": 0.3942421078681946,
- "mean_token_accuracy": 0.8553562164306641,
- "num_tokens": 18663826.0,
- "step": 2085
- },
- {
- "epoch": 1.5851063829787235,
- "grad_norm": 1.464859127998352,
- "learning_rate": 2.474867682108174e-06,
- "loss": 0.4093329906463623,
- "mean_token_accuracy": 0.8598780632019043,
- "num_tokens": 18675325.0,
- "step": 2086
- },
- {
- "epoch": 1.5858662613981762,
- "grad_norm": 2.083707809448242,
- "learning_rate": 2.472773401904037e-06,
- "loss": 0.4252093434333801,
- "mean_token_accuracy": 0.8433356881141663,
- "num_tokens": 18682416.0,
- "step": 2087
- },
- {
- "epoch": 1.5866261398176293,
- "grad_norm": 1.5577973127365112,
- "learning_rate": 2.470679140808574e-06,
- "loss": 0.3680085241794586,
- "mean_token_accuracy": 0.8609116077423096,
- "num_tokens": 18694445.0,
- "step": 2088
- },
- {
- "epoch": 1.587386018237082,
- "grad_norm": 2.1617276668548584,
- "learning_rate": 2.4685849002916184e-06,
- "loss": 0.40488749742507935,
- "mean_token_accuracy": 0.8429721593856812,
- "num_tokens": 18701204.0,
- "step": 2089
- },
- {
- "epoch": 1.588145896656535,
- "grad_norm": 2.046678304672241,
- "learning_rate": 2.4664906818229903e-06,
- "loss": 0.329141229391098,
- "mean_token_accuracy": 0.8830771446228027,
- "num_tokens": 18708354.0,
- "step": 2090
- },
- {
- "epoch": 1.5889057750759878,
- "grad_norm": 2.7741200923919678,
- "learning_rate": 2.4643964868724916e-06,
- "loss": 0.42294493317604065,
- "mean_token_accuracy": 0.8612706065177917,
- "num_tokens": 18713017.0,
- "step": 2091
- },
- {
- "epoch": 1.5896656534954408,
- "grad_norm": 2.085151433944702,
- "learning_rate": 2.4623023169099074e-06,
- "loss": 0.39038220047950745,
- "mean_token_accuracy": 0.861169695854187,
- "num_tokens": 18721423.0,
- "step": 2092
- },
- {
- "epoch": 1.5904255319148937,
- "grad_norm": 2.8721165657043457,
- "learning_rate": 2.4602081734050093e-06,
- "loss": 0.27753859758377075,
- "mean_token_accuracy": 0.8959167003631592,
- "num_tokens": 18725044.0,
- "step": 2093
- },
- {
- "epoch": 1.5911854103343464,
- "grad_norm": 1.7388207912445068,
- "learning_rate": 2.4581140578275473e-06,
- "loss": 0.3570033311843872,
- "mean_token_accuracy": 0.8715590238571167,
- "num_tokens": 18733891.0,
- "step": 2094
- },
- {
- "epoch": 1.5919452887537995,
- "grad_norm": 2.3645241260528564,
- "learning_rate": 2.456019971647251e-06,
- "loss": 0.38982006907463074,
- "mean_token_accuracy": 0.8734139800071716,
- "num_tokens": 18740464.0,
- "step": 2095
- },
- {
- "epoch": 1.5927051671732522,
- "grad_norm": 3.674072027206421,
- "learning_rate": 2.4539259163338317e-06,
- "loss": 0.4068281650543213,
- "mean_token_accuracy": 0.8397839069366455,
- "num_tokens": 18744857.0,
- "step": 2096
- },
- {
- "epoch": 1.5934650455927053,
- "grad_norm": 1.8209186792373657,
- "learning_rate": 2.4518318933569786e-06,
- "loss": 0.3471015691757202,
- "mean_token_accuracy": 0.8709044456481934,
- "num_tokens": 18752414.0,
- "step": 2097
- },
- {
- "epoch": 1.594224924012158,
- "grad_norm": 1.8138704299926758,
- "learning_rate": 2.449737904186357e-06,
- "loss": 0.3438487648963928,
- "mean_token_accuracy": 0.8766711950302124,
- "num_tokens": 18760587.0,
- "step": 2098
- },
- {
- "epoch": 1.594984802431611,
- "grad_norm": 1.7893842458724976,
- "learning_rate": 2.447643950291608e-06,
- "loss": 0.43519508838653564,
- "mean_token_accuracy": 0.8682907819747925,
- "num_tokens": 18770293.0,
- "step": 2099
- },
- {
- "epoch": 1.5957446808510638,
- "grad_norm": 1.4305094480514526,
- "learning_rate": 2.4455500331423505e-06,
- "loss": 0.37106508016586304,
- "mean_token_accuracy": 0.8611354827880859,
- "num_tokens": 18782456.0,
- "step": 2100
- },
- {
- "epoch": 1.5965045592705167,
- "grad_norm": 2.0797057151794434,
- "learning_rate": 2.4434561542081765e-06,
- "loss": 0.43942689895629883,
- "mean_token_accuracy": 0.8477288484573364,
- "num_tokens": 18789547.0,
- "step": 2101
- },
- {
- "epoch": 1.5972644376899696,
- "grad_norm": 1.2983288764953613,
- "learning_rate": 2.441362314958649e-06,
- "loss": 0.46385765075683594,
- "mean_token_accuracy": 0.8340978622436523,
- "num_tokens": 18809456.0,
- "step": 2102
- },
- {
- "epoch": 1.5980243161094225,
- "grad_norm": 2.60866641998291,
- "learning_rate": 2.439268516863306e-06,
- "loss": 0.3106239140033722,
- "mean_token_accuracy": 0.8859497308731079,
- "num_tokens": 18813781.0,
- "step": 2103
- },
- {
- "epoch": 1.5987841945288754,
- "grad_norm": 3.389376163482666,
- "learning_rate": 2.4371747613916566e-06,
- "loss": 0.44926169514656067,
- "mean_token_accuracy": 0.8664819002151489,
- "num_tokens": 18817666.0,
- "step": 2104
- },
- {
- "epoch": 1.5995440729483281,
- "grad_norm": 3.3417351245880127,
- "learning_rate": 2.4350810500131776e-06,
- "loss": 0.4786076545715332,
- "mean_token_accuracy": 0.8357523679733276,
- "num_tokens": 18823717.0,
- "step": 2105
- },
- {
- "epoch": 1.6003039513677813,
- "grad_norm": 1.5215197801589966,
- "learning_rate": 2.4329873841973174e-06,
- "loss": 0.4123923182487488,
- "mean_token_accuracy": 0.853337287902832,
- "num_tokens": 18835163.0,
- "step": 2106
- },
- {
- "epoch": 1.601063829787234,
- "grad_norm": 1.8798415660858154,
- "learning_rate": 2.4308937654134893e-06,
- "loss": 0.45594000816345215,
- "mean_token_accuracy": 0.8553717732429504,
- "num_tokens": 18843923.0,
- "step": 2107
- },
- {
- "epoch": 1.601823708206687,
- "grad_norm": 2.1012487411499023,
- "learning_rate": 2.428800195131078e-06,
- "loss": 0.4340161085128784,
- "mean_token_accuracy": 0.8448120355606079,
- "num_tokens": 18851852.0,
- "step": 2108
- },
- {
- "epoch": 1.6025835866261398,
- "grad_norm": 2.827080726623535,
- "learning_rate": 2.4267066748194297e-06,
- "loss": 0.25922513008117676,
- "mean_token_accuracy": 0.9024698734283447,
- "num_tokens": 18856113.0,
- "step": 2109
- },
- {
- "epoch": 1.6033434650455927,
- "grad_norm": 1.641032338142395,
- "learning_rate": 2.4246132059478582e-06,
- "loss": 0.591558575630188,
- "mean_token_accuracy": 0.7960667610168457,
- "num_tokens": 18870618.0,
- "step": 2110
- },
- {
- "epoch": 1.6041033434650456,
- "grad_norm": 2.600771188735962,
- "learning_rate": 2.4225197899856416e-06,
- "loss": 0.382815957069397,
- "mean_token_accuracy": 0.8654585480690002,
- "num_tokens": 18875456.0,
- "step": 2111
- },
- {
- "epoch": 1.6048632218844985,
- "grad_norm": 1.5125449895858765,
- "learning_rate": 2.4204264284020182e-06,
- "loss": 0.4643454849720001,
- "mean_token_accuracy": 0.837038516998291,
- "num_tokens": 18887979.0,
- "step": 2112
- },
- {
- "epoch": 1.6056231003039514,
- "grad_norm": 1.7571941614151,
- "learning_rate": 2.4183331226661913e-06,
- "loss": 0.30713701248168945,
- "mean_token_accuracy": 0.8856921195983887,
- "num_tokens": 18896143.0,
- "step": 2113
- },
- {
- "epoch": 1.6063829787234043,
- "grad_norm": 2.124593496322632,
- "learning_rate": 2.4162398742473216e-06,
- "loss": 0.2873607575893402,
- "mean_token_accuracy": 0.8986717462539673,
- "num_tokens": 18902364.0,
- "step": 2114
- },
- {
- "epoch": 1.6071428571428572,
- "grad_norm": 2.3496272563934326,
- "learning_rate": 2.4141466846145332e-06,
- "loss": 0.33715200424194336,
- "mean_token_accuracy": 0.8816461563110352,
- "num_tokens": 18908038.0,
- "step": 2115
- },
- {
- "epoch": 1.60790273556231,
- "grad_norm": 1.2783573865890503,
- "learning_rate": 2.4120535552369057e-06,
- "loss": 0.45153388381004333,
- "mean_token_accuracy": 0.8345640897750854,
- "num_tokens": 18926687.0,
- "step": 2116
- },
- {
- "epoch": 1.608662613981763,
- "grad_norm": 2.1481080055236816,
- "learning_rate": 2.4099604875834796e-06,
- "loss": 0.43976694345474243,
- "mean_token_accuracy": 0.847899317741394,
- "num_tokens": 18932974.0,
- "step": 2117
- },
- {
- "epoch": 1.6094224924012157,
- "grad_norm": 1.8669065237045288,
- "learning_rate": 2.407867483123248e-06,
- "loss": 0.4649358093738556,
- "mean_token_accuracy": 0.8310785293579102,
- "num_tokens": 18942551.0,
- "step": 2118
- },
- {
- "epoch": 1.6101823708206688,
- "grad_norm": 2.7667746543884277,
- "learning_rate": 2.4057745433251637e-06,
- "loss": 0.4542210102081299,
- "mean_token_accuracy": 0.8450086116790771,
- "num_tokens": 18947525.0,
- "step": 2119
- },
- {
- "epoch": 1.6109422492401215,
- "grad_norm": 2.2865076065063477,
- "learning_rate": 2.4036816696581326e-06,
- "loss": 0.34291431307792664,
- "mean_token_accuracy": 0.8741394281387329,
- "num_tokens": 18952967.0,
- "step": 2120
- },
- {
- "epoch": 1.6117021276595744,
- "grad_norm": 3.055197238922119,
- "learning_rate": 2.401588863591013e-06,
- "loss": 0.4686807692050934,
- "mean_token_accuracy": 0.8440030217170715,
- "num_tokens": 18958257.0,
- "step": 2121
- },
- {
- "epoch": 1.6124620060790273,
- "grad_norm": 2.268456220626831,
- "learning_rate": 2.3994961265926166e-06,
- "loss": 0.440069317817688,
- "mean_token_accuracy": 0.8534891605377197,
- "num_tokens": 18964745.0,
- "step": 2122
- },
- {
- "epoch": 1.6132218844984803,
- "grad_norm": 2.061185359954834,
- "learning_rate": 2.3974034601317085e-06,
- "loss": 0.4383159279823303,
- "mean_token_accuracy": 0.8484808802604675,
- "num_tokens": 18972136.0,
- "step": 2123
- },
- {
- "epoch": 1.6139817629179332,
- "grad_norm": 1.5121275186538696,
- "learning_rate": 2.3953108656770018e-06,
- "loss": 0.42403632402420044,
- "mean_token_accuracy": 0.8467602133750916,
- "num_tokens": 18985353.0,
- "step": 2124
- },
- {
- "epoch": 1.614741641337386,
- "grad_norm": 1.9965397119522095,
- "learning_rate": 2.3932183446971584e-06,
- "loss": 0.3915751576423645,
- "mean_token_accuracy": 0.8622956275939941,
- "num_tokens": 18992017.0,
- "step": 2125
- },
- {
- "epoch": 1.615501519756839,
- "grad_norm": 1.6688618659973145,
- "learning_rate": 2.3911258986607907e-06,
- "loss": 0.468288391828537,
- "mean_token_accuracy": 0.8372251987457275,
- "num_tokens": 19001930.0,
- "step": 2126
- },
- {
- "epoch": 1.6162613981762917,
- "grad_norm": 1.8984699249267578,
- "learning_rate": 2.3890335290364596e-06,
- "loss": 0.3082895278930664,
- "mean_token_accuracy": 0.8815990686416626,
- "num_tokens": 19009712.0,
- "step": 2127
- },
- {
- "epoch": 1.6170212765957448,
- "grad_norm": 2.6934773921966553,
- "learning_rate": 2.386941237292669e-06,
- "loss": 0.48406022787094116,
- "mean_token_accuracy": 0.8300775289535522,
- "num_tokens": 19015212.0,
- "step": 2128
- },
- {
- "epoch": 1.6177811550151975,
- "grad_norm": 1.6615487337112427,
- "learning_rate": 2.3848490248978693e-06,
- "loss": 0.45227736234664917,
- "mean_token_accuracy": 0.8421006798744202,
- "num_tokens": 19027115.0,
- "step": 2129
- },
- {
- "epoch": 1.6185410334346506,
- "grad_norm": 1.4625248908996582,
- "learning_rate": 2.3827568933204576e-06,
- "loss": 0.4141014814376831,
- "mean_token_accuracy": 0.8479453325271606,
- "num_tokens": 19041103.0,
- "step": 2130
- },
- {
- "epoch": 1.6193009118541033,
- "grad_norm": 1.856701135635376,
- "learning_rate": 2.3806648440287715e-06,
- "loss": 0.3440483808517456,
- "mean_token_accuracy": 0.8978210687637329,
- "num_tokens": 19048124.0,
- "step": 2131
- },
- {
- "epoch": 1.6200607902735562,
- "grad_norm": 1.7056550979614258,
- "learning_rate": 2.378572878491091e-06,
- "loss": 0.4136195182800293,
- "mean_token_accuracy": 0.8579289317131042,
- "num_tokens": 19057113.0,
- "step": 2132
- },
- {
- "epoch": 1.6208206686930091,
- "grad_norm": 1.4673033952713013,
- "learning_rate": 2.376480998175638e-06,
- "loss": 0.40176504850387573,
- "mean_token_accuracy": 0.8677150011062622,
- "num_tokens": 19068258.0,
- "step": 2133
- },
- {
- "epoch": 1.621580547112462,
- "grad_norm": 2.12859845161438,
- "learning_rate": 2.3743892045505764e-06,
- "loss": 0.39754825830459595,
- "mean_token_accuracy": 0.8486959934234619,
- "num_tokens": 19075469.0,
- "step": 2134
- },
- {
- "epoch": 1.622340425531915,
- "grad_norm": 1.474247694015503,
- "learning_rate": 2.372297499084006e-06,
- "loss": 0.3546760678291321,
- "mean_token_accuracy": 0.8767229318618774,
- "num_tokens": 19086744.0,
- "step": 2135
- },
- {
- "epoch": 1.6231003039513676,
- "grad_norm": 1.9945709705352783,
- "learning_rate": 2.3702058832439667e-06,
- "loss": 0.4200798273086548,
- "mean_token_accuracy": 0.8435655832290649,
- "num_tokens": 19095903.0,
- "step": 2136
- },
- {
- "epoch": 1.6238601823708207,
- "grad_norm": 2.71991229057312,
- "learning_rate": 2.368114358498434e-06,
- "loss": 0.44925457239151,
- "mean_token_accuracy": 0.8348450660705566,
- "num_tokens": 19100864.0,
- "step": 2137
- },
- {
- "epoch": 1.6246200607902734,
- "grad_norm": 2.817664623260498,
- "learning_rate": 2.366022926315322e-06,
- "loss": 0.44386279582977295,
- "mean_token_accuracy": 0.8739628791809082,
- "num_tokens": 19105355.0,
- "step": 2138
- },
- {
- "epoch": 1.6253799392097266,
- "grad_norm": 1.3673229217529297,
- "learning_rate": 2.3639315881624776e-06,
- "loss": 0.3693230152130127,
- "mean_token_accuracy": 0.8698620796203613,
- "num_tokens": 19116748.0,
- "step": 2139
- },
- {
- "epoch": 1.6261398176291793,
- "grad_norm": 2.712531805038452,
- "learning_rate": 2.361840345507683e-06,
- "loss": 0.4442938268184662,
- "mean_token_accuracy": 0.8433241844177246,
- "num_tokens": 19121437.0,
- "step": 2140
- },
- {
- "epoch": 1.6268996960486324,
- "grad_norm": 2.2885231971740723,
- "learning_rate": 2.359749199818651e-06,
- "loss": 0.4021872878074646,
- "mean_token_accuracy": 0.8605252504348755,
- "num_tokens": 19127633.0,
- "step": 2141
- },
- {
- "epoch": 1.627659574468085,
- "grad_norm": 1.9257299900054932,
- "learning_rate": 2.3576581525630297e-06,
- "loss": 0.3577788472175598,
- "mean_token_accuracy": 0.8691596388816833,
- "num_tokens": 19134450.0,
- "step": 2142
- },
- {
- "epoch": 1.628419452887538,
- "grad_norm": 1.5035467147827148,
- "learning_rate": 2.355567205208397e-06,
- "loss": 0.3800235986709595,
- "mean_token_accuracy": 0.867794394493103,
- "num_tokens": 19146149.0,
- "step": 2143
- },
- {
- "epoch": 1.6291793313069909,
- "grad_norm": 2.110445737838745,
- "learning_rate": 2.353476359222259e-06,
- "loss": 0.34394145011901855,
- "mean_token_accuracy": 0.8777303695678711,
- "num_tokens": 19152017.0,
- "step": 2144
- },
- {
- "epoch": 1.6299392097264438,
- "grad_norm": 1.1713787317276,
- "learning_rate": 2.351385616072052e-06,
- "loss": 0.4060516357421875,
- "mean_token_accuracy": 0.8411345481872559,
- "num_tokens": 19172089.0,
- "step": 2145
- },
- {
- "epoch": 1.6306990881458967,
- "grad_norm": 1.7600529193878174,
- "learning_rate": 2.3492949772251418e-06,
- "loss": 0.5299694538116455,
- "mean_token_accuracy": 0.8218191862106323,
- "num_tokens": 19184041.0,
- "step": 2146
- },
- {
- "epoch": 1.6314589665653494,
- "grad_norm": 1.7126617431640625,
- "learning_rate": 2.3472044441488175e-06,
- "loss": 0.38628721237182617,
- "mean_token_accuracy": 0.8526935577392578,
- "num_tokens": 19193101.0,
- "step": 2147
- },
- {
- "epoch": 1.6322188449848025,
- "grad_norm": 1.210344672203064,
- "learning_rate": 2.345114018310295e-06,
- "loss": 0.2732373774051666,
- "mean_token_accuracy": 0.8903822898864746,
- "num_tokens": 19206697.0,
- "step": 2148
- },
- {
- "epoch": 1.6329787234042552,
- "grad_norm": 1.6693075895309448,
- "learning_rate": 2.3430237011767166e-06,
- "loss": 0.3472709655761719,
- "mean_token_accuracy": 0.8767187595367432,
- "num_tokens": 19217008.0,
- "step": 2149
- },
- {
- "epoch": 1.6337386018237083,
- "grad_norm": 1.5242515802383423,
- "learning_rate": 2.3409334942151485e-06,
- "loss": 0.4345507025718689,
- "mean_token_accuracy": 0.8481311202049255,
- "num_tokens": 19231573.0,
- "step": 2150
- },
- {
- "epoch": 1.634498480243161,
- "grad_norm": 2.470122814178467,
- "learning_rate": 2.3388433988925767e-06,
- "loss": 0.4453052878379822,
- "mean_token_accuracy": 0.8411355018615723,
- "num_tokens": 19237076.0,
- "step": 2151
- },
- {
- "epoch": 1.635258358662614,
- "grad_norm": 2.4177467823028564,
- "learning_rate": 2.3367534166759105e-06,
- "loss": 0.454534113407135,
- "mean_token_accuracy": 0.8635509014129639,
- "num_tokens": 19242890.0,
- "step": 2152
- },
- {
- "epoch": 1.6360182370820668,
- "grad_norm": 2.8036744594573975,
- "learning_rate": 2.3346635490319815e-06,
- "loss": 0.4396413564682007,
- "mean_token_accuracy": 0.8491836786270142,
- "num_tokens": 19247492.0,
- "step": 2153
- },
- {
- "epoch": 1.6367781155015197,
- "grad_norm": 1.9286335706710815,
- "learning_rate": 2.3325737974275382e-06,
- "loss": 0.34988659620285034,
- "mean_token_accuracy": 0.8704243898391724,
- "num_tokens": 19254966.0,
- "step": 2154
- },
- {
- "epoch": 1.6375379939209727,
- "grad_norm": 1.8929904699325562,
- "learning_rate": 2.3304841633292487e-06,
- "loss": 0.4195491671562195,
- "mean_token_accuracy": 0.857181966304779,
- "num_tokens": 19263324.0,
- "step": 2155
- },
- {
- "epoch": 1.6382978723404256,
- "grad_norm": 2.2598466873168945,
- "learning_rate": 2.328394648203698e-06,
- "loss": 0.37977826595306396,
- "mean_token_accuracy": 0.8626722097396851,
- "num_tokens": 19269363.0,
- "step": 2156
- },
- {
- "epoch": 1.6390577507598785,
- "grad_norm": 1.8118126392364502,
- "learning_rate": 2.32630525351739e-06,
- "loss": 0.3532063364982605,
- "mean_token_accuracy": 0.8677854537963867,
- "num_tokens": 19277360.0,
- "step": 2157
- },
- {
- "epoch": 1.6398176291793312,
- "grad_norm": 1.5216798782348633,
- "learning_rate": 2.324215980736741e-06,
- "loss": 0.38609349727630615,
- "mean_token_accuracy": 0.8685325980186462,
- "num_tokens": 19292159.0,
- "step": 2158
- },
- {
- "epoch": 1.6405775075987843,
- "grad_norm": 3.0511462688446045,
- "learning_rate": 2.3221268313280836e-06,
- "loss": 0.21988365054130554,
- "mean_token_accuracy": 0.9172534942626953,
- "num_tokens": 19295735.0,
- "step": 2159
- },
- {
- "epoch": 1.641337386018237,
- "grad_norm": 1.957828164100647,
- "learning_rate": 2.320037806757662e-06,
- "loss": 0.3868909478187561,
- "mean_token_accuracy": 0.8605331182479858,
- "num_tokens": 19303287.0,
- "step": 2160
- },
- {
- "epoch": 1.64209726443769,
- "grad_norm": 2.590040922164917,
- "learning_rate": 2.317948908491636e-06,
- "loss": 0.3940129578113556,
- "mean_token_accuracy": 0.8814224004745483,
- "num_tokens": 19308101.0,
- "step": 2161
- },
- {
- "epoch": 1.6428571428571428,
- "grad_norm": 2.859248161315918,
- "learning_rate": 2.315860137996074e-06,
- "loss": 0.3437344431877136,
- "mean_token_accuracy": 0.8789017200469971,
- "num_tokens": 19313026.0,
- "step": 2162
- },
- {
- "epoch": 1.6436170212765957,
- "grad_norm": 1.1788666248321533,
- "learning_rate": 2.3137714967369544e-06,
- "loss": 0.3976179361343384,
- "mean_token_accuracy": 0.8383771181106567,
- "num_tokens": 19331103.0,
- "step": 2163
- },
- {
- "epoch": 1.6443768996960486,
- "grad_norm": 1.8409802913665771,
- "learning_rate": 2.3116829861801687e-06,
- "loss": 0.41898879408836365,
- "mean_token_accuracy": 0.8575010299682617,
- "num_tokens": 19340866.0,
- "step": 2164
- },
- {
- "epoch": 1.6451367781155015,
- "grad_norm": 1.4124691486358643,
- "learning_rate": 2.3095946077915115e-06,
- "loss": 0.333813339471817,
- "mean_token_accuracy": 0.8766071796417236,
- "num_tokens": 19353673.0,
- "step": 2165
- },
- {
- "epoch": 1.6458966565349544,
- "grad_norm": 1.76325261592865,
- "learning_rate": 2.307506363036688e-06,
- "loss": 0.4158991575241089,
- "mean_token_accuracy": 0.8522704839706421,
- "num_tokens": 19363635.0,
- "step": 2166
- },
- {
- "epoch": 1.6466565349544073,
- "grad_norm": 1.758833885192871,
- "learning_rate": 2.305418253381309e-06,
- "loss": 0.298480749130249,
- "mean_token_accuracy": 0.888424277305603,
- "num_tokens": 19372291.0,
- "step": 2167
- },
- {
- "epoch": 1.6474164133738602,
- "grad_norm": 1.6387488842010498,
- "learning_rate": 2.3033302802908895e-06,
- "loss": 0.4309447109699249,
- "mean_token_accuracy": 0.8672212362289429,
- "num_tokens": 19383480.0,
- "step": 2168
- },
- {
- "epoch": 1.648176291793313,
- "grad_norm": 1.5251084566116333,
- "learning_rate": 2.301242445230851e-06,
- "loss": 0.44890880584716797,
- "mean_token_accuracy": 0.847392737865448,
- "num_tokens": 19394810.0,
- "step": 2169
- },
- {
- "epoch": 1.648936170212766,
- "grad_norm": 1.6106950044631958,
- "learning_rate": 2.299154749666515e-06,
- "loss": 0.4403916597366333,
- "mean_token_accuracy": 0.8379756212234497,
- "num_tokens": 19405551.0,
- "step": 2170
- },
- {
- "epoch": 1.6496960486322187,
- "grad_norm": 1.4238437414169312,
- "learning_rate": 2.2970671950631066e-06,
- "loss": 0.4015567898750305,
- "mean_token_accuracy": 0.851482629776001,
- "num_tokens": 19418621.0,
- "step": 2171
- },
- {
- "epoch": 1.6504559270516719,
- "grad_norm": 1.3026156425476074,
- "learning_rate": 2.2949797828857527e-06,
- "loss": 0.3680947422981262,
- "mean_token_accuracy": 0.8641397953033447,
- "num_tokens": 19432118.0,
- "step": 2172
- },
- {
- "epoch": 1.6512158054711246,
- "grad_norm": 2.1265358924865723,
- "learning_rate": 2.2928925145994798e-06,
- "loss": 0.43980664014816284,
- "mean_token_accuracy": 0.8358430862426758,
- "num_tokens": 19439069.0,
- "step": 2173
- },
- {
- "epoch": 1.6519756838905775,
- "grad_norm": 1.8399443626403809,
- "learning_rate": 2.290805391669212e-06,
- "loss": 0.29801061749458313,
- "mean_token_accuracy": 0.8773187398910522,
- "num_tokens": 19446745.0,
- "step": 2174
- },
- {
- "epoch": 1.6527355623100304,
- "grad_norm": 1.8680047988891602,
- "learning_rate": 2.2887184155597725e-06,
- "loss": 0.3235543966293335,
- "mean_token_accuracy": 0.8754611015319824,
- "num_tokens": 19455266.0,
- "step": 2175
- },
- {
- "epoch": 1.6534954407294833,
- "grad_norm": 2.3048481941223145,
- "learning_rate": 2.286631587735883e-06,
- "loss": 0.4011988043785095,
- "mean_token_accuracy": 0.8531811237335205,
- "num_tokens": 19461049.0,
- "step": 2176
- },
- {
- "epoch": 1.6542553191489362,
- "grad_norm": 2.6067066192626953,
- "learning_rate": 2.2845449096621583e-06,
- "loss": 0.4957500696182251,
- "mean_token_accuracy": 0.8255549073219299,
- "num_tokens": 19466884.0,
- "step": 2177
- },
- {
- "epoch": 1.655015197568389,
- "grad_norm": 1.5211488008499146,
- "learning_rate": 2.282458382803109e-06,
- "loss": 0.32245099544525146,
- "mean_token_accuracy": 0.8865629434585571,
- "num_tokens": 19477294.0,
- "step": 2178
- },
- {
- "epoch": 1.655775075987842,
- "grad_norm": 2.245542526245117,
- "learning_rate": 2.280372008623142e-06,
- "loss": 0.3790864944458008,
- "mean_token_accuracy": 0.8766552209854126,
- "num_tokens": 19483385.0,
- "step": 2179
- },
- {
- "epoch": 1.6565349544072947,
- "grad_norm": 2.1158151626586914,
- "learning_rate": 2.2782857885865538e-06,
- "loss": 0.4726812243461609,
- "mean_token_accuracy": 0.8384029865264893,
- "num_tokens": 19491367.0,
- "step": 2180
- },
- {
- "epoch": 1.6572948328267478,
- "grad_norm": 3.301389694213867,
- "learning_rate": 2.2761997241575335e-06,
- "loss": 0.37664809823036194,
- "mean_token_accuracy": 0.8913813829421997,
- "num_tokens": 19494876.0,
- "step": 2181
- },
- {
- "epoch": 1.6580547112462005,
- "grad_norm": 2.2964162826538086,
- "learning_rate": 2.274113816800161e-06,
- "loss": 0.4110721945762634,
- "mean_token_accuracy": 0.8551756143569946,
- "num_tokens": 19500546.0,
- "step": 2182
- },
- {
- "epoch": 1.6588145896656536,
- "grad_norm": 3.368161916732788,
- "learning_rate": 2.272028067978408e-06,
- "loss": 0.39089250564575195,
- "mean_token_accuracy": 0.8786845207214355,
- "num_tokens": 19504142.0,
- "step": 2183
- },
- {
- "epoch": 1.6595744680851063,
- "grad_norm": 1.7299834489822388,
- "learning_rate": 2.2699424791561324e-06,
- "loss": 0.5205090641975403,
- "mean_token_accuracy": 0.8394201993942261,
- "num_tokens": 19514523.0,
- "step": 2184
- },
- {
- "epoch": 1.6603343465045592,
- "grad_norm": 2.045919418334961,
- "learning_rate": 2.267857051797081e-06,
- "loss": 0.49093255400657654,
- "mean_token_accuracy": 0.8338311910629272,
- "num_tokens": 19522439.0,
- "step": 2185
- },
- {
- "epoch": 1.6610942249240122,
- "grad_norm": 1.2035714387893677,
- "learning_rate": 2.265771787364886e-06,
- "loss": 0.37247753143310547,
- "mean_token_accuracy": 0.8873692750930786,
- "num_tokens": 19536717.0,
- "step": 2186
- },
- {
- "epoch": 1.661854103343465,
- "grad_norm": 2.6186633110046387,
- "learning_rate": 2.263686687323068e-06,
- "loss": 0.3318040370941162,
- "mean_token_accuracy": 0.8720577955245972,
- "num_tokens": 19541966.0,
- "step": 2187
- },
- {
- "epoch": 1.662613981762918,
- "grad_norm": 2.6845929622650146,
- "learning_rate": 2.261601753135029e-06,
- "loss": 0.32441991567611694,
- "mean_token_accuracy": 0.8700553178787231,
- "num_tokens": 19546644.0,
- "step": 2188
- },
- {
- "epoch": 1.6633738601823707,
- "grad_norm": 2.078998327255249,
- "learning_rate": 2.259516986264057e-06,
- "loss": 0.3424156904220581,
- "mean_token_accuracy": 0.8707810044288635,
- "num_tokens": 19553472.0,
- "step": 2189
- },
- {
- "epoch": 1.6641337386018238,
- "grad_norm": 2.380747079849243,
- "learning_rate": 2.2574323881733202e-06,
- "loss": 0.4994799494743347,
- "mean_token_accuracy": 0.817003607749939,
- "num_tokens": 19560502.0,
- "step": 2190
- },
- {
- "epoch": 1.6648936170212765,
- "grad_norm": 1.2984378337860107,
- "learning_rate": 2.255347960325871e-06,
- "loss": 0.33139657974243164,
- "mean_token_accuracy": 0.8763977289199829,
- "num_tokens": 19575624.0,
- "step": 2191
- },
- {
- "epoch": 1.6656534954407296,
- "grad_norm": 1.3232799768447876,
- "learning_rate": 2.2532637041846423e-06,
- "loss": 0.32994017004966736,
- "mean_token_accuracy": 0.8790634274482727,
- "num_tokens": 19588636.0,
- "step": 2192
- },
- {
- "epoch": 1.6664133738601823,
- "grad_norm": 2.11212158203125,
- "learning_rate": 2.2511796212124424e-06,
- "loss": 0.3140082359313965,
- "mean_token_accuracy": 0.8946622014045715,
- "num_tokens": 19594917.0,
- "step": 2193
- },
- {
- "epoch": 1.6671732522796354,
- "grad_norm": 2.7206521034240723,
- "learning_rate": 2.2490957128719627e-06,
- "loss": 0.3723612427711487,
- "mean_token_accuracy": 0.8781955242156982,
- "num_tokens": 19599310.0,
- "step": 2194
- },
- {
- "epoch": 1.667933130699088,
- "grad_norm": 2.6681952476501465,
- "learning_rate": 2.247011980625771e-06,
- "loss": 0.3740317225456238,
- "mean_token_accuracy": 0.8780536651611328,
- "num_tokens": 19604172.0,
- "step": 2195
- },
- {
- "epoch": 1.668693009118541,
- "grad_norm": 1.8933384418487549,
- "learning_rate": 2.2449284259363093e-06,
- "loss": 0.3359421491622925,
- "mean_token_accuracy": 0.8785334825515747,
- "num_tokens": 19612030.0,
- "step": 2196
- },
- {
- "epoch": 1.669452887537994,
- "grad_norm": 2.4779889583587646,
- "learning_rate": 2.2428450502658964e-06,
- "loss": 0.3724144399166107,
- "mean_token_accuracy": 0.8739810585975647,
- "num_tokens": 19617800.0,
- "step": 2197
- },
- {
- "epoch": 1.6702127659574468,
- "grad_norm": 3.0661120414733887,
- "learning_rate": 2.240761855076727e-06,
- "loss": 0.3627531826496124,
- "mean_token_accuracy": 0.865296483039856,
- "num_tokens": 19621885.0,
- "step": 2198
- },
- {
- "epoch": 1.6709726443768997,
- "grad_norm": 2.431708574295044,
- "learning_rate": 2.238678841830867e-06,
- "loss": 0.31396129727363586,
- "mean_token_accuracy": 0.9026765823364258,
- "num_tokens": 19627122.0,
- "step": 2199
- },
- {
- "epoch": 1.6717325227963524,
- "grad_norm": 2.5498745441436768,
- "learning_rate": 2.2365960119902543e-06,
- "loss": 0.3193191885948181,
- "mean_token_accuracy": 0.8750600218772888,
- "num_tokens": 19631771.0,
- "step": 2200
- },
- {
- "epoch": 1.6724924012158056,
- "grad_norm": 2.0419046878814697,
- "learning_rate": 2.2345133670167e-06,
- "loss": 0.32747960090637207,
- "mean_token_accuracy": 0.8603148460388184,
- "num_tokens": 19638972.0,
- "step": 2201
- },
- {
- "epoch": 1.6732522796352582,
- "grad_norm": 2.0412306785583496,
- "learning_rate": 2.232430908371885e-06,
- "loss": 0.4701780676841736,
- "mean_token_accuracy": 0.8318476676940918,
- "num_tokens": 19647968.0,
- "step": 2202
- },
- {
- "epoch": 1.6740121580547114,
- "grad_norm": 2.054070472717285,
- "learning_rate": 2.2303486375173586e-06,
- "loss": 0.33284813165664673,
- "mean_token_accuracy": 0.8760920763015747,
- "num_tokens": 19654032.0,
- "step": 2203
- },
- {
- "epoch": 1.674772036474164,
- "grad_norm": 1.6053217649459839,
- "learning_rate": 2.228266555914538e-06,
- "loss": 0.34431374073028564,
- "mean_token_accuracy": 0.8764770030975342,
- "num_tokens": 19663785.0,
- "step": 2204
- },
- {
- "epoch": 1.675531914893617,
- "grad_norm": 1.474494457244873,
- "learning_rate": 2.2261846650247077e-06,
- "loss": 0.3541037440299988,
- "mean_token_accuracy": 0.8782497644424438,
- "num_tokens": 19675498.0,
- "step": 2205
- },
- {
- "epoch": 1.6762917933130699,
- "grad_norm": 1.9318026304244995,
- "learning_rate": 2.224102966309021e-06,
- "loss": 0.4291660189628601,
- "mean_token_accuracy": 0.8424201607704163,
- "num_tokens": 19684576.0,
- "step": 2206
- },
- {
- "epoch": 1.6770516717325228,
- "grad_norm": 2.2150020599365234,
- "learning_rate": 2.2220214612284925e-06,
- "loss": 0.46187907457351685,
- "mean_token_accuracy": 0.840459942817688,
- "num_tokens": 19690412.0,
- "step": 2207
- },
- {
- "epoch": 1.6778115501519757,
- "grad_norm": 1.667281150817871,
- "learning_rate": 2.2199401512440037e-06,
- "loss": 0.37440744042396545,
- "mean_token_accuracy": 0.8694081902503967,
- "num_tokens": 19699600.0,
- "step": 2208
- },
- {
- "epoch": 1.6785714285714286,
- "grad_norm": 2.6446619033813477,
- "learning_rate": 2.2178590378162957e-06,
- "loss": 0.3301953077316284,
- "mean_token_accuracy": 0.8992182016372681,
- "num_tokens": 19704162.0,
- "step": 2209
- },
- {
- "epoch": 1.6793313069908815,
- "grad_norm": 1.4266780614852905,
- "learning_rate": 2.215778122405977e-06,
- "loss": 0.3811204135417938,
- "mean_token_accuracy": 0.861638069152832,
- "num_tokens": 19716511.0,
- "step": 2210
- },
- {
- "epoch": 1.6800911854103342,
- "grad_norm": 1.826087474822998,
- "learning_rate": 2.2136974064735132e-06,
- "loss": 0.4790012836456299,
- "mean_token_accuracy": 0.8404909372329712,
- "num_tokens": 19726645.0,
- "step": 2211
- },
- {
- "epoch": 1.6808510638297873,
- "grad_norm": 1.8551808595657349,
- "learning_rate": 2.2116168914792293e-06,
- "loss": 0.40999075770378113,
- "mean_token_accuracy": 0.8419463634490967,
- "num_tokens": 19735601.0,
- "step": 2212
- },
- {
- "epoch": 1.68161094224924,
- "grad_norm": 2.560124158859253,
- "learning_rate": 2.209536578883313e-06,
- "loss": 0.43428558111190796,
- "mean_token_accuracy": 0.8689159750938416,
- "num_tokens": 19741138.0,
- "step": 2213
- },
- {
- "epoch": 1.6823708206686931,
- "grad_norm": 2.0154869556427,
- "learning_rate": 2.207456470145807e-06,
- "loss": 0.43633338809013367,
- "mean_token_accuracy": 0.8646916151046753,
- "num_tokens": 19751929.0,
- "step": 2214
- },
- {
- "epoch": 1.6831306990881458,
- "grad_norm": 1.3583155870437622,
- "learning_rate": 2.205376566726611e-06,
- "loss": 0.3050280511379242,
- "mean_token_accuracy": 0.8998798727989197,
- "num_tokens": 19764012.0,
- "step": 2215
- },
- {
- "epoch": 1.6838905775075987,
- "grad_norm": 1.266262173652649,
- "learning_rate": 2.2032968700854813e-06,
- "loss": 0.4039713144302368,
- "mean_token_accuracy": 0.8571382164955139,
- "num_tokens": 19780683.0,
- "step": 2216
- },
- {
- "epoch": 1.6846504559270516,
- "grad_norm": 1.864356517791748,
- "learning_rate": 2.2012173816820297e-06,
- "loss": 0.361503541469574,
- "mean_token_accuracy": 0.868161678314209,
- "num_tokens": 19788907.0,
- "step": 2217
- },
- {
- "epoch": 1.6854103343465046,
- "grad_norm": 1.320155382156372,
- "learning_rate": 2.1991381029757216e-06,
- "loss": 0.28228244185447693,
- "mean_token_accuracy": 0.8945217132568359,
- "num_tokens": 19800354.0,
- "step": 2218
- },
- {
- "epoch": 1.6861702127659575,
- "grad_norm": 1.9706367254257202,
- "learning_rate": 2.1970590354258745e-06,
- "loss": 0.2849377989768982,
- "mean_token_accuracy": 0.9065699577331543,
- "num_tokens": 19806735.0,
- "step": 2219
- },
- {
- "epoch": 1.6869300911854104,
- "grad_norm": 1.9150370359420776,
- "learning_rate": 2.1949801804916563e-06,
- "loss": 0.4125257730484009,
- "mean_token_accuracy": 0.8642163872718811,
- "num_tokens": 19814056.0,
- "step": 2220
- },
- {
- "epoch": 1.6876899696048633,
- "grad_norm": 2.062589645385742,
- "learning_rate": 2.19290153963209e-06,
- "loss": 0.451707124710083,
- "mean_token_accuracy": 0.8311163187026978,
- "num_tokens": 19821263.0,
- "step": 2221
- },
- {
- "epoch": 1.688449848024316,
- "grad_norm": 1.3959208726882935,
- "learning_rate": 2.190823114306045e-06,
- "loss": 0.3326707184314728,
- "mean_token_accuracy": 0.9037837982177734,
- "num_tokens": 19835163.0,
- "step": 2222
- },
- {
- "epoch": 1.689209726443769,
- "grad_norm": 2.09995698928833,
- "learning_rate": 2.188744905972239e-06,
- "loss": 0.4144105315208435,
- "mean_token_accuracy": 0.8512029051780701,
- "num_tokens": 19843164.0,
- "step": 2223
- },
- {
- "epoch": 1.6899696048632218,
- "grad_norm": 1.4759427309036255,
- "learning_rate": 2.186666916089239e-06,
- "loss": 0.4707002639770508,
- "mean_token_accuracy": 0.8371601104736328,
- "num_tokens": 19858551.0,
- "step": 2224
- },
- {
- "epoch": 1.690729483282675,
- "grad_norm": 2.3398702144622803,
- "learning_rate": 2.1845891461154604e-06,
- "loss": 0.34672820568084717,
- "mean_token_accuracy": 0.879936695098877,
- "num_tokens": 19864348.0,
- "step": 2225
- },
- {
- "epoch": 1.6914893617021276,
- "grad_norm": 1.6283963918685913,
- "learning_rate": 2.1825115975091594e-06,
- "loss": 0.31835079193115234,
- "mean_token_accuracy": 0.8695961833000183,
- "num_tokens": 19873560.0,
- "step": 2226
- },
- {
- "epoch": 1.6922492401215805,
- "grad_norm": 2.035759687423706,
- "learning_rate": 2.1804342717284414e-06,
- "loss": 0.43110257387161255,
- "mean_token_accuracy": 0.8593922853469849,
- "num_tokens": 19880796.0,
- "step": 2227
- },
- {
- "epoch": 1.6930091185410334,
- "grad_norm": 2.1340725421905518,
- "learning_rate": 2.1783571702312523e-06,
- "loss": 0.46967440843582153,
- "mean_token_accuracy": 0.8839266300201416,
- "num_tokens": 19887911.0,
- "step": 2228
- },
- {
- "epoch": 1.6937689969604863,
- "grad_norm": 1.710340142250061,
- "learning_rate": 2.176280294475383e-06,
- "loss": 0.4167519807815552,
- "mean_token_accuracy": 0.8526116609573364,
- "num_tokens": 19896674.0,
- "step": 2229
- },
- {
- "epoch": 1.6945288753799392,
- "grad_norm": 1.7793304920196533,
- "learning_rate": 2.174203645918464e-06,
- "loss": 0.3875434994697571,
- "mean_token_accuracy": 0.8637192249298096,
- "num_tokens": 19904825.0,
- "step": 2230
- },
- {
- "epoch": 1.6952887537993921,
- "grad_norm": 1.7908778190612793,
- "learning_rate": 2.172127226017967e-06,
- "loss": 0.42065349221229553,
- "mean_token_accuracy": 0.850834846496582,
- "num_tokens": 19914377.0,
- "step": 2231
- },
- {
- "epoch": 1.696048632218845,
- "grad_norm": 3.0943970680236816,
- "learning_rate": 2.1700510362312053e-06,
- "loss": 0.44845050573349,
- "mean_token_accuracy": 0.8460367918014526,
- "num_tokens": 19918929.0,
- "step": 2232
- },
- {
- "epoch": 1.6968085106382977,
- "grad_norm": 1.5586018562316895,
- "learning_rate": 2.1679750780153265e-06,
- "loss": 0.4723482131958008,
- "mean_token_accuracy": 0.871384859085083,
- "num_tokens": 19932738.0,
- "step": 2233
- },
- {
- "epoch": 1.6975683890577509,
- "grad_norm": 2.014230728149414,
- "learning_rate": 2.1658993528273196e-06,
- "loss": 0.43307146430015564,
- "mean_token_accuracy": 0.8677935600280762,
- "num_tokens": 19940246.0,
- "step": 2234
- },
- {
- "epoch": 1.6983282674772036,
- "grad_norm": 1.528979778289795,
- "learning_rate": 2.163823862124007e-06,
- "loss": 0.3897377550601959,
- "mean_token_accuracy": 0.8737689256668091,
- "num_tokens": 19951187.0,
- "step": 2235
- },
- {
- "epoch": 1.6990881458966567,
- "grad_norm": 1.9856207370758057,
- "learning_rate": 2.1617486073620496e-06,
- "loss": 0.4285745620727539,
- "mean_token_accuracy": 0.8744081258773804,
- "num_tokens": 19957768.0,
- "step": 2236
- },
- {
- "epoch": 1.6998480243161094,
- "grad_norm": 2.130525827407837,
- "learning_rate": 2.15967358999794e-06,
- "loss": 0.405293732881546,
- "mean_token_accuracy": 0.8588452935218811,
- "num_tokens": 19965354.0,
- "step": 2237
- },
- {
- "epoch": 1.7006079027355623,
- "grad_norm": 1.665329098701477,
- "learning_rate": 2.1575988114880057e-06,
- "loss": 0.42987754940986633,
- "mean_token_accuracy": 0.846322238445282,
- "num_tokens": 19975780.0,
- "step": 2238
- },
- {
- "epoch": 1.7013677811550152,
- "grad_norm": 1.0725677013397217,
- "learning_rate": 2.155524273288405e-06,
- "loss": 0.31892159581184387,
- "mean_token_accuracy": 0.8692483305931091,
- "num_tokens": 19995875.0,
- "step": 2239
- },
- {
- "epoch": 1.702127659574468,
- "grad_norm": 2.282604455947876,
- "learning_rate": 2.15344997685513e-06,
- "loss": 0.4460654556751251,
- "mean_token_accuracy": 0.8623759746551514,
- "num_tokens": 20001466.0,
- "step": 2240
- },
- {
- "epoch": 1.702887537993921,
- "grad_norm": 1.1385949850082397,
- "learning_rate": 2.1513759236440024e-06,
- "loss": 0.37046104669570923,
- "mean_token_accuracy": 0.8637164831161499,
- "num_tokens": 20020998.0,
- "step": 2241
- },
- {
- "epoch": 1.7036474164133737,
- "grad_norm": 1.5521315336227417,
- "learning_rate": 2.1493021151106704e-06,
- "loss": 0.4526556134223938,
- "mean_token_accuracy": 0.8675785064697266,
- "num_tokens": 20032750.0,
- "step": 2242
- },
- {
- "epoch": 1.7044072948328268,
- "grad_norm": 1.7777446508407593,
- "learning_rate": 2.147228552710614e-06,
- "loss": 0.41294580698013306,
- "mean_token_accuracy": 0.8597785234451294,
- "num_tokens": 20041901.0,
- "step": 2243
- },
- {
- "epoch": 1.7051671732522795,
- "grad_norm": 1.5157700777053833,
- "learning_rate": 2.145155237899139e-06,
- "loss": 0.4158926010131836,
- "mean_token_accuracy": 0.8512611985206604,
- "num_tokens": 20053705.0,
- "step": 2244
- },
- {
- "epoch": 1.7059270516717326,
- "grad_norm": 1.5116809606552124,
- "learning_rate": 2.143082172131378e-06,
- "loss": 0.43943172693252563,
- "mean_token_accuracy": 0.8429899215698242,
- "num_tokens": 20069468.0,
- "step": 2245
- },
- {
- "epoch": 1.7066869300911853,
- "grad_norm": 1.6095285415649414,
- "learning_rate": 2.141009356862288e-06,
- "loss": 0.41325604915618896,
- "mean_token_accuracy": 0.8832963705062866,
- "num_tokens": 20080596.0,
- "step": 2246
- },
- {
- "epoch": 1.7074468085106385,
- "grad_norm": 1.39210844039917,
- "learning_rate": 2.138936793546649e-06,
- "loss": 0.3945302963256836,
- "mean_token_accuracy": 0.8698325753211975,
- "num_tokens": 20094158.0,
- "step": 2247
- },
- {
- "epoch": 1.7082066869300911,
- "grad_norm": 2.9576594829559326,
- "learning_rate": 2.1368644836390684e-06,
- "loss": 0.16507276892662048,
- "mean_token_accuracy": 0.9410445690155029,
- "num_tokens": 20097002.0,
- "step": 2248
- },
- {
- "epoch": 1.708966565349544,
- "grad_norm": 1.7631266117095947,
- "learning_rate": 2.134792428593971e-06,
- "loss": 0.519780695438385,
- "mean_token_accuracy": 0.8276066780090332,
- "num_tokens": 20107947.0,
- "step": 2249
- },
- {
- "epoch": 1.709726443768997,
- "grad_norm": 2.144636869430542,
- "learning_rate": 2.1327206298656055e-06,
- "loss": 0.32923734188079834,
- "mean_token_accuracy": 0.8766019344329834,
- "num_tokens": 20113676.0,
- "step": 2250
- },
- {
- "epoch": 1.7104863221884499,
- "grad_norm": 1.9511034488677979,
- "learning_rate": 2.130649088908041e-06,
- "loss": 0.4043842554092407,
- "mean_token_accuracy": 0.8525843620300293,
- "num_tokens": 20120787.0,
- "step": 2251
- },
- {
- "epoch": 1.7112462006079028,
- "grad_norm": 1.5001336336135864,
- "learning_rate": 2.1285778071751638e-06,
- "loss": 0.4800187051296234,
- "mean_token_accuracy": 0.8398486375808716,
- "num_tokens": 20133534.0,
- "step": 2252
- },
- {
- "epoch": 1.7120060790273555,
- "grad_norm": 1.435195803642273,
- "learning_rate": 2.126506786120678e-06,
- "loss": 0.44489604234695435,
- "mean_token_accuracy": 0.8444881439208984,
- "num_tokens": 20151787.0,
- "step": 2253
- },
- {
- "epoch": 1.7127659574468086,
- "grad_norm": 1.3056137561798096,
- "learning_rate": 2.1244360271981073e-06,
- "loss": 0.300567090511322,
- "mean_token_accuracy": 0.8903113007545471,
- "num_tokens": 20163390.0,
- "step": 2254
- },
- {
- "epoch": 1.7135258358662613,
- "grad_norm": 1.7347925901412964,
- "learning_rate": 2.1223655318607907e-06,
- "loss": 0.30601179599761963,
- "mean_token_accuracy": 0.8845717906951904,
- "num_tokens": 20171354.0,
- "step": 2255
- },
- {
- "epoch": 1.7142857142857144,
- "grad_norm": 1.316306471824646,
- "learning_rate": 2.1202953015618794e-06,
- "loss": 0.3972984552383423,
- "mean_token_accuracy": 0.845410943031311,
- "num_tokens": 20184464.0,
- "step": 2256
- },
- {
- "epoch": 1.715045592705167,
- "grad_norm": 2.1052892208099365,
- "learning_rate": 2.1182253377543428e-06,
- "loss": 0.3357020616531372,
- "mean_token_accuracy": 0.8853542804718018,
- "num_tokens": 20190539.0,
- "step": 2257
- },
- {
- "epoch": 1.71580547112462,
- "grad_norm": 1.4192553758621216,
- "learning_rate": 2.116155641890959e-06,
- "loss": 0.3881692588329315,
- "mean_token_accuracy": 0.8442144989967346,
- "num_tokens": 20204570.0,
- "step": 2258
- },
- {
- "epoch": 1.716565349544073,
- "grad_norm": 2.134113311767578,
- "learning_rate": 2.1140862154243223e-06,
- "loss": 0.37803274393081665,
- "mean_token_accuracy": 0.8703107237815857,
- "num_tokens": 20210535.0,
- "step": 2259
- },
- {
- "epoch": 1.7173252279635258,
- "grad_norm": 2.9149155616760254,
- "learning_rate": 2.1120170598068353e-06,
- "loss": 0.34860676527023315,
- "mean_token_accuracy": 0.8734345436096191,
- "num_tokens": 20214375.0,
- "step": 2260
- },
- {
- "epoch": 1.7180851063829787,
- "grad_norm": 1.6855589151382446,
- "learning_rate": 2.109948176490711e-06,
- "loss": 0.3676984906196594,
- "mean_token_accuracy": 0.8531560301780701,
- "num_tokens": 20223791.0,
- "step": 2261
- },
- {
- "epoch": 1.7188449848024316,
- "grad_norm": 2.09671950340271,
- "learning_rate": 2.10787956692797e-06,
- "loss": 0.41744115948677063,
- "mean_token_accuracy": 0.8570001125335693,
- "num_tokens": 20231254.0,
- "step": 2262
- },
- {
- "epoch": 1.7196048632218845,
- "grad_norm": 3.148813009262085,
- "learning_rate": 2.1058112325704436e-06,
- "loss": 0.20556189119815826,
- "mean_token_accuracy": 0.926898717880249,
- "num_tokens": 20234470.0,
- "step": 2263
- },
- {
- "epoch": 1.7203647416413372,
- "grad_norm": 1.9707107543945312,
- "learning_rate": 2.103743174869769e-06,
- "loss": 0.40733110904693604,
- "mean_token_accuracy": 0.8740406036376953,
- "num_tokens": 20242286.0,
- "step": 2264
- },
- {
- "epoch": 1.7211246200607904,
- "grad_norm": 1.2756069898605347,
- "learning_rate": 2.1016753952773867e-06,
- "loss": 0.3940718173980713,
- "mean_token_accuracy": 0.860906720161438,
- "num_tokens": 20260382.0,
- "step": 2265
- },
- {
- "epoch": 1.721884498480243,
- "grad_norm": 1.5074653625488281,
- "learning_rate": 2.0996078952445453e-06,
- "loss": 0.3353617191314697,
- "mean_token_accuracy": 0.8809853792190552,
- "num_tokens": 20271665.0,
- "step": 2266
- },
- {
- "epoch": 1.7226443768996962,
- "grad_norm": 1.4331210851669312,
- "learning_rate": 2.0975406762222966e-06,
- "loss": 0.32260069251060486,
- "mean_token_accuracy": 0.901330828666687,
- "num_tokens": 20283122.0,
- "step": 2267
- },
- {
- "epoch": 1.7234042553191489,
- "grad_norm": 2.2378969192504883,
- "learning_rate": 2.095473739661494e-06,
- "loss": 0.39086243510246277,
- "mean_token_accuracy": 0.8681687116622925,
- "num_tokens": 20289243.0,
- "step": 2268
- },
- {
- "epoch": 1.7241641337386018,
- "grad_norm": 2.754582405090332,
- "learning_rate": 2.093407087012791e-06,
- "loss": 0.42927244305610657,
- "mean_token_accuracy": 0.8594136834144592,
- "num_tokens": 20294537.0,
- "step": 2269
- },
- {
- "epoch": 1.7249240121580547,
- "grad_norm": 2.2721824645996094,
- "learning_rate": 2.091340719726647e-06,
- "loss": 0.42479783296585083,
- "mean_token_accuracy": 0.8411722183227539,
- "num_tokens": 20301502.0,
- "step": 2270
- },
- {
- "epoch": 1.7256838905775076,
- "grad_norm": 2.3230299949645996,
- "learning_rate": 2.089274639253317e-06,
- "loss": 0.4218963384628296,
- "mean_token_accuracy": 0.8498032093048096,
- "num_tokens": 20307710.0,
- "step": 2271
- },
- {
- "epoch": 1.7264437689969605,
- "grad_norm": 2.3499748706817627,
- "learning_rate": 2.0872088470428553e-06,
- "loss": 0.4472277760505676,
- "mean_token_accuracy": 0.8487255573272705,
- "num_tokens": 20313945.0,
- "step": 2272
- },
- {
- "epoch": 1.7272036474164134,
- "grad_norm": 1.3709690570831299,
- "learning_rate": 2.0851433445451142e-06,
- "loss": 0.38701117038726807,
- "mean_token_accuracy": 0.8592075109481812,
- "num_tokens": 20328023.0,
- "step": 2273
- },
- {
- "epoch": 1.7279635258358663,
- "grad_norm": 1.1293425559997559,
- "learning_rate": 2.0830781332097446e-06,
- "loss": 0.34000539779663086,
- "mean_token_accuracy": 0.8779317140579224,
- "num_tokens": 20346767.0,
- "step": 2274
- },
- {
- "epoch": 1.728723404255319,
- "grad_norm": 2.9770123958587646,
- "learning_rate": 2.08101321448619e-06,
- "loss": 0.4437636733055115,
- "mean_token_accuracy": 0.8398602604866028,
- "num_tokens": 20352306.0,
- "step": 2275
- },
- {
- "epoch": 1.7294832826747721,
- "grad_norm": 3.510955572128296,
- "learning_rate": 2.0789485898236897e-06,
- "loss": 0.3359706401824951,
- "mean_token_accuracy": 0.8872498273849487,
- "num_tokens": 20355560.0,
- "step": 2276
- },
- {
- "epoch": 1.7302431610942248,
- "grad_norm": 2.0873279571533203,
- "learning_rate": 2.076884260671276e-06,
- "loss": 0.38720619678497314,
- "mean_token_accuracy": 0.865881621837616,
- "num_tokens": 20362802.0,
- "step": 2277
- },
- {
- "epoch": 1.731003039513678,
- "grad_norm": 2.4871230125427246,
- "learning_rate": 2.0748202284777775e-06,
- "loss": 0.3250775933265686,
- "mean_token_accuracy": 0.8867610692977905,
- "num_tokens": 20367080.0,
- "step": 2278
- },
- {
- "epoch": 1.7317629179331306,
- "grad_norm": 3.5603582859039307,
- "learning_rate": 2.072756494691809e-06,
- "loss": 0.35600754618644714,
- "mean_token_accuracy": 0.8781189918518066,
- "num_tokens": 20370625.0,
- "step": 2279
- },
- {
- "epoch": 1.7325227963525835,
- "grad_norm": 2.0948755741119385,
- "learning_rate": 2.070693060761779e-06,
- "loss": 0.3558604419231415,
- "mean_token_accuracy": 0.902066707611084,
- "num_tokens": 20376835.0,
- "step": 2280
- },
- {
- "epoch": 1.7332826747720365,
- "grad_norm": 2.391188859939575,
- "learning_rate": 2.0686299281358837e-06,
- "loss": 0.36596938967704773,
- "mean_token_accuracy": 0.8741272687911987,
- "num_tokens": 20382282.0,
- "step": 2281
- },
- {
- "epoch": 1.7340425531914894,
- "grad_norm": 1.6906369924545288,
- "learning_rate": 2.0665670982621107e-06,
- "loss": 0.5241266489028931,
- "mean_token_accuracy": 0.8091107606887817,
- "num_tokens": 20393736.0,
- "step": 2282
- },
- {
- "epoch": 1.7348024316109423,
- "grad_norm": 1.7578394412994385,
- "learning_rate": 2.0645045725882334e-06,
- "loss": 0.37041786313056946,
- "mean_token_accuracy": 0.8907113075256348,
- "num_tokens": 20402715.0,
- "step": 2283
- },
- {
- "epoch": 1.7355623100303952,
- "grad_norm": 2.191727638244629,
- "learning_rate": 2.0624423525618097e-06,
- "loss": 0.43301627039909363,
- "mean_token_accuracy": 0.8706433773040771,
- "num_tokens": 20409976.0,
- "step": 2284
- },
- {
- "epoch": 1.736322188449848,
- "grad_norm": 1.958005666732788,
- "learning_rate": 2.0603804396301875e-06,
- "loss": 0.29002684354782104,
- "mean_token_accuracy": 0.8914110660552979,
- "num_tokens": 20417099.0,
- "step": 2285
- },
- {
- "epoch": 1.7370820668693008,
- "grad_norm": 2.477837085723877,
- "learning_rate": 2.058318835240495e-06,
- "loss": 0.2953898310661316,
- "mean_token_accuracy": 0.8975275754928589,
- "num_tokens": 20422251.0,
- "step": 2286
- },
- {
- "epoch": 1.737841945288754,
- "grad_norm": 2.156764268875122,
- "learning_rate": 2.0562575408396475e-06,
- "loss": 0.4063698649406433,
- "mean_token_accuracy": 0.8497642278671265,
- "num_tokens": 20429338.0,
- "step": 2287
- },
- {
- "epoch": 1.7386018237082066,
- "grad_norm": 1.6748939752578735,
- "learning_rate": 2.0541965578743373e-06,
- "loss": 0.3272587060928345,
- "mean_token_accuracy": 0.8646700382232666,
- "num_tokens": 20439680.0,
- "step": 2288
- },
- {
- "epoch": 1.7393617021276597,
- "grad_norm": 1.9948776960372925,
- "learning_rate": 2.0521358877910446e-06,
- "loss": 0.36843347549438477,
- "mean_token_accuracy": 0.8613901138305664,
- "num_tokens": 20448492.0,
- "step": 2289
- },
- {
- "epoch": 1.7401215805471124,
- "grad_norm": 2.231428623199463,
- "learning_rate": 2.0500755320360263e-06,
- "loss": 0.3905152380466461,
- "mean_token_accuracy": 0.8980990052223206,
- "num_tokens": 20453945.0,
- "step": 2290
- },
- {
- "epoch": 1.7408814589665653,
- "grad_norm": 2.2187650203704834,
- "learning_rate": 2.048015492055319e-06,
- "loss": 0.45920854806900024,
- "mean_token_accuracy": 0.8282852172851562,
- "num_tokens": 20462378.0,
- "step": 2291
- },
- {
- "epoch": 1.7416413373860182,
- "grad_norm": 2.0668466091156006,
- "learning_rate": 2.045955769294737e-06,
- "loss": 0.3227751553058624,
- "mean_token_accuracy": 0.8805934190750122,
- "num_tokens": 20469822.0,
- "step": 2292
- },
- {
- "epoch": 1.7424012158054711,
- "grad_norm": 1.9162774085998535,
- "learning_rate": 2.0438963651998747e-06,
- "loss": 0.4604800343513489,
- "mean_token_accuracy": 0.8441175818443298,
- "num_tokens": 20479099.0,
- "step": 2293
- },
- {
- "epoch": 1.743161094224924,
- "grad_norm": 2.645329713821411,
- "learning_rate": 2.0418372812161015e-06,
- "loss": 0.3239654004573822,
- "mean_token_accuracy": 0.8888648748397827,
- "num_tokens": 20483926.0,
- "step": 2294
- },
- {
- "epoch": 1.743920972644377,
- "grad_norm": 1.39468514919281,
- "learning_rate": 2.03977851878856e-06,
- "loss": 0.4003690183162689,
- "mean_token_accuracy": 0.8769714832305908,
- "num_tokens": 20496501.0,
- "step": 2295
- },
- {
- "epoch": 1.7446808510638299,
- "grad_norm": 3.509174346923828,
- "learning_rate": 2.0377200793621694e-06,
- "loss": 0.2948213517665863,
- "mean_token_accuracy": 0.8972329497337341,
- "num_tokens": 20500000.0,
- "step": 2296
- },
- {
- "epoch": 1.7454407294832825,
- "grad_norm": 1.5033894777297974,
- "learning_rate": 2.0356619643816234e-06,
- "loss": 0.40694737434387207,
- "mean_token_accuracy": 0.8607243895530701,
- "num_tokens": 20513473.0,
- "step": 2297
- },
- {
- "epoch": 1.7462006079027357,
- "grad_norm": 1.4324895143508911,
- "learning_rate": 2.0336041752913843e-06,
- "loss": 0.3899157643318176,
- "mean_token_accuracy": 0.858935534954071,
- "num_tokens": 20524516.0,
- "step": 2298
- },
- {
- "epoch": 1.7469604863221884,
- "grad_norm": 2.359544277191162,
- "learning_rate": 2.031546713535688e-06,
- "loss": 0.369213342666626,
- "mean_token_accuracy": 0.8741403818130493,
- "num_tokens": 20530421.0,
- "step": 2299
- },
- {
- "epoch": 1.7477203647416415,
- "grad_norm": 2.282637357711792,
- "learning_rate": 2.029489580558542e-06,
- "loss": 0.3255441188812256,
- "mean_token_accuracy": 0.9045462608337402,
- "num_tokens": 20535954.0,
- "step": 2300
- },
- {
- "epoch": 1.7484802431610942,
- "grad_norm": 1.7367198467254639,
- "learning_rate": 2.0274327778037204e-06,
- "loss": 0.43890488147735596,
- "mean_token_accuracy": 0.8494667410850525,
- "num_tokens": 20548638.0,
- "step": 2301
- },
- {
- "epoch": 1.749240121580547,
- "grad_norm": 1.6236488819122314,
- "learning_rate": 2.0253763067147657e-06,
- "loss": 0.4440777897834778,
- "mean_token_accuracy": 0.8414230942726135,
- "num_tokens": 20559263.0,
- "step": 2302
- },
- {
- "epoch": 1.75,
- "grad_norm": 1.3755455017089844,
- "learning_rate": 2.0233201687349888e-06,
- "loss": 0.3473797142505646,
- "mean_token_accuracy": 0.8742472529411316,
- "num_tokens": 20573109.0,
- "step": 2303
- },
- {
- "epoch": 1.750759878419453,
- "grad_norm": 3.271153688430786,
- "learning_rate": 2.0212643653074677e-06,
- "loss": 0.4965784549713135,
- "mean_token_accuracy": 0.8596988916397095,
- "num_tokens": 20578525.0,
- "step": 2304
- },
- {
- "epoch": 1.7515197568389058,
- "grad_norm": 2.6341168880462646,
- "learning_rate": 2.019208897875043e-06,
- "loss": 0.37775442004203796,
- "mean_token_accuracy": 0.8721816539764404,
- "num_tokens": 20583641.0,
- "step": 2305
- },
- {
- "epoch": 1.7522796352583585,
- "grad_norm": 1.8308569192886353,
- "learning_rate": 2.0171537678803222e-06,
- "loss": 0.3243415355682373,
- "mean_token_accuracy": 0.8837124109268188,
- "num_tokens": 20591725.0,
- "step": 2306
- },
- {
- "epoch": 1.7530395136778116,
- "grad_norm": 2.4362998008728027,
- "learning_rate": 2.015098976765673e-06,
- "loss": 0.3738787770271301,
- "mean_token_accuracy": 0.8974303007125854,
- "num_tokens": 20596587.0,
- "step": 2307
- },
- {
- "epoch": 1.7537993920972643,
- "grad_norm": 3.2920920848846436,
- "learning_rate": 2.0130445259732282e-06,
- "loss": 0.33901530504226685,
- "mean_token_accuracy": 0.9019063115119934,
- "num_tokens": 20600379.0,
- "step": 2308
- },
- {
- "epoch": 1.7545592705167175,
- "grad_norm": 1.290475606918335,
- "learning_rate": 2.01099041694488e-06,
- "loss": 0.37150678038597107,
- "mean_token_accuracy": 0.8542044758796692,
- "num_tokens": 20614340.0,
- "step": 2309
- },
- {
- "epoch": 1.7553191489361701,
- "grad_norm": 2.7794933319091797,
- "learning_rate": 2.0089366511222815e-06,
- "loss": 0.3746095895767212,
- "mean_token_accuracy": 0.8653185367584229,
- "num_tokens": 20622056.0,
- "step": 2310
- },
- {
- "epoch": 1.756079027355623,
- "grad_norm": 2.2112278938293457,
- "learning_rate": 2.006883229946843e-06,
- "loss": 0.35793858766555786,
- "mean_token_accuracy": 0.875727653503418,
- "num_tokens": 20628930.0,
- "step": 2311
- },
- {
- "epoch": 1.756838905775076,
- "grad_norm": 1.5240603685379028,
- "learning_rate": 2.0048301548597365e-06,
- "loss": 0.512831449508667,
- "mean_token_accuracy": 0.8139172792434692,
- "num_tokens": 20643159.0,
- "step": 2312
- },
- {
- "epoch": 1.7575987841945289,
- "grad_norm": 1.810485601425171,
- "learning_rate": 2.0027774273018894e-06,
- "loss": 0.43870818614959717,
- "mean_token_accuracy": 0.8313089609146118,
- "num_tokens": 20651914.0,
- "step": 2313
- },
- {
- "epoch": 1.7583586626139818,
- "grad_norm": 1.748178243637085,
- "learning_rate": 2.0007250487139827e-06,
- "loss": 0.42277514934539795,
- "mean_token_accuracy": 0.8463197946548462,
- "num_tokens": 20660054.0,
- "step": 2314
- },
- {
- "epoch": 1.7591185410334347,
- "grad_norm": 1.511717677116394,
- "learning_rate": 1.998673020536456e-06,
- "loss": 0.38304439187049866,
- "mean_token_accuracy": 0.8508470058441162,
- "num_tokens": 20673371.0,
- "step": 2315
- },
- {
- "epoch": 1.7598784194528876,
- "grad_norm": 1.7790700197219849,
- "learning_rate": 1.996621344209503e-06,
- "loss": 0.3838311433792114,
- "mean_token_accuracy": 0.8676829934120178,
- "num_tokens": 20682072.0,
- "step": 2316
- },
- {
- "epoch": 1.7606382978723403,
- "grad_norm": 1.9128468036651611,
- "learning_rate": 1.994570021173067e-06,
- "loss": 0.40384364128112793,
- "mean_token_accuracy": 0.8747294545173645,
- "num_tokens": 20689000.0,
- "step": 2317
- },
- {
- "epoch": 1.7613981762917934,
- "grad_norm": 3.286569118499756,
- "learning_rate": 1.9925190528668455e-06,
- "loss": 0.38019680976867676,
- "mean_token_accuracy": 0.8678069114685059,
- "num_tokens": 20692763.0,
- "step": 2318
- },
- {
- "epoch": 1.762158054711246,
- "grad_norm": 1.6108927726745605,
- "learning_rate": 1.990468440730288e-06,
- "loss": 0.3144170045852661,
- "mean_token_accuracy": 0.8695170879364014,
- "num_tokens": 20702620.0,
- "step": 2319
- },
- {
- "epoch": 1.7629179331306992,
- "grad_norm": 3.185225009918213,
- "learning_rate": 1.9884181862025938e-06,
- "loss": 0.41619348526000977,
- "mean_token_accuracy": 0.8543670177459717,
- "num_tokens": 20706857.0,
- "step": 2320
- },
- {
- "epoch": 1.763677811550152,
- "grad_norm": 2.3699469566345215,
- "learning_rate": 1.986368290722709e-06,
- "loss": 0.5115842819213867,
- "mean_token_accuracy": 0.8141909837722778,
- "num_tokens": 20713997.0,
- "step": 2321
- },
- {
- "epoch": 1.7644376899696048,
- "grad_norm": 1.4449706077575684,
- "learning_rate": 1.9843187557293286e-06,
- "loss": 0.419655442237854,
- "mean_token_accuracy": 0.8545533418655396,
- "num_tokens": 20726548.0,
- "step": 2322
- },
- {
- "epoch": 1.7651975683890577,
- "grad_norm": 2.127614974975586,
- "learning_rate": 1.9822695826608975e-06,
- "loss": 0.43722522258758545,
- "mean_token_accuracy": 0.8542283773422241,
- "num_tokens": 20733469.0,
- "step": 2323
- },
- {
- "epoch": 1.7659574468085106,
- "grad_norm": 3.3081557750701904,
- "learning_rate": 1.9802207729556023e-06,
- "loss": 0.30904972553253174,
- "mean_token_accuracy": 0.8896352648735046,
- "num_tokens": 20737190.0,
- "step": 2324
- },
- {
- "epoch": 1.7667173252279635,
- "grad_norm": 2.603506326675415,
- "learning_rate": 1.978172328051377e-06,
- "loss": 0.30952537059783936,
- "mean_token_accuracy": 0.8868587017059326,
- "num_tokens": 20741780.0,
- "step": 2325
- },
- {
- "epoch": 1.7674772036474165,
- "grad_norm": 2.576824903488159,
- "learning_rate": 1.9761242493858987e-06,
- "loss": 0.29593953490257263,
- "mean_token_accuracy": 0.888198733329773,
- "num_tokens": 20746324.0,
- "step": 2326
- },
- {
- "epoch": 1.7682370820668694,
- "grad_norm": 1.6168320178985596,
- "learning_rate": 1.9740765383965894e-06,
- "loss": 0.5093998908996582,
- "mean_token_accuracy": 0.8301646709442139,
- "num_tokens": 20760140.0,
- "step": 2327
- },
- {
- "epoch": 1.768996960486322,
- "grad_norm": 2.1162400245666504,
- "learning_rate": 1.9720291965206097e-06,
- "loss": 0.36714404821395874,
- "mean_token_accuracy": 0.8699671626091003,
- "num_tokens": 20766961.0,
- "step": 2328
- },
- {
- "epoch": 1.7697568389057752,
- "grad_norm": 1.046911597251892,
- "learning_rate": 1.969982225194864e-06,
- "loss": 0.40783989429473877,
- "mean_token_accuracy": 0.8474892377853394,
- "num_tokens": 20786737.0,
- "step": 2329
- },
- {
- "epoch": 1.7705167173252279,
- "grad_norm": 1.7059568166732788,
- "learning_rate": 1.9679356258559943e-06,
- "loss": 0.44083845615386963,
- "mean_token_accuracy": 0.841221034526825,
- "num_tokens": 20798907.0,
- "step": 2330
- },
- {
- "epoch": 1.771276595744681,
- "grad_norm": 1.5157767534255981,
- "learning_rate": 1.9658893999403847e-06,
- "loss": 0.4671107828617096,
- "mean_token_accuracy": 0.8252813816070557,
- "num_tokens": 20814304.0,
- "step": 2331
- },
- {
- "epoch": 1.7720364741641337,
- "grad_norm": 2.1340525150299072,
- "learning_rate": 1.9638435488841543e-06,
- "loss": 0.4088709354400635,
- "mean_token_accuracy": 0.8595127463340759,
- "num_tokens": 20821827.0,
- "step": 2332
- },
- {
- "epoch": 1.7727963525835866,
- "grad_norm": 1.948072910308838,
- "learning_rate": 1.96179807412316e-06,
- "loss": 0.3692860007286072,
- "mean_token_accuracy": 0.8678920269012451,
- "num_tokens": 20828612.0,
- "step": 2333
- },
- {
- "epoch": 1.7735562310030395,
- "grad_norm": 1.5731977224349976,
- "learning_rate": 1.959752977092995e-06,
- "loss": 0.3743135929107666,
- "mean_token_accuracy": 0.8723479509353638,
- "num_tokens": 20838497.0,
- "step": 2334
- },
- {
- "epoch": 1.7743161094224924,
- "grad_norm": 1.5506012439727783,
- "learning_rate": 1.957708259228987e-06,
- "loss": 0.4403391182422638,
- "mean_token_accuracy": 0.854604959487915,
- "num_tokens": 20851603.0,
- "step": 2335
- },
- {
- "epoch": 1.7750759878419453,
- "grad_norm": 1.154336929321289,
- "learning_rate": 1.9556639219661983e-06,
- "loss": 0.5281188488006592,
- "mean_token_accuracy": 0.8101300001144409,
- "num_tokens": 20875661.0,
- "step": 2336
- },
- {
- "epoch": 1.7758358662613982,
- "grad_norm": 4.720771312713623,
- "learning_rate": 1.9536199667394217e-06,
- "loss": 0.44419822096824646,
- "mean_token_accuracy": 0.8740090131759644,
- "num_tokens": 20886971.0,
- "step": 2337
- },
- {
- "epoch": 1.7765957446808511,
- "grad_norm": 1.5492230653762817,
- "learning_rate": 1.9515763949831852e-06,
- "loss": 0.4538637697696686,
- "mean_token_accuracy": 0.8362185955047607,
- "num_tokens": 20899212.0,
- "step": 2338
- },
- {
- "epoch": 1.7773556231003038,
- "grad_norm": 1.354101538658142,
- "learning_rate": 1.9495332081317466e-06,
- "loss": 0.4341534376144409,
- "mean_token_accuracy": 0.8380170464515686,
- "num_tokens": 20913065.0,
- "step": 2339
- },
- {
- "epoch": 1.778115501519757,
- "grad_norm": 1.5805599689483643,
- "learning_rate": 1.947490407619092e-06,
- "loss": 0.40928739309310913,
- "mean_token_accuracy": 0.8524469137191772,
- "num_tokens": 20922919.0,
- "step": 2340
- },
- {
- "epoch": 1.7788753799392096,
- "grad_norm": 2.097221851348877,
- "learning_rate": 1.945447994878937e-06,
- "loss": 0.4816104769706726,
- "mean_token_accuracy": 0.888654351234436,
- "num_tokens": 20931350.0,
- "step": 2341
- },
- {
- "epoch": 1.7796352583586628,
- "grad_norm": 1.7193297147750854,
- "learning_rate": 1.9434059713447264e-06,
- "loss": 0.44925639033317566,
- "mean_token_accuracy": 0.8500319123268127,
- "num_tokens": 20940546.0,
- "step": 2342
- },
- {
- "epoch": 1.7803951367781155,
- "grad_norm": 1.5971747636795044,
- "learning_rate": 1.9413643384496315e-06,
- "loss": 0.29559412598609924,
- "mean_token_accuracy": 0.8871279954910278,
- "num_tokens": 20950604.0,
- "step": 2343
- },
- {
- "epoch": 1.7811550151975684,
- "grad_norm": 2.788029670715332,
- "learning_rate": 1.9393230976265478e-06,
- "loss": 0.31713539361953735,
- "mean_token_accuracy": 0.8866176605224609,
- "num_tokens": 20955296.0,
- "step": 2344
- },
- {
- "epoch": 1.7819148936170213,
- "grad_norm": 1.5747952461242676,
- "learning_rate": 1.937282250308096e-06,
- "loss": 0.41813358664512634,
- "mean_token_accuracy": 0.8418053984642029,
- "num_tokens": 20967664.0,
- "step": 2345
- },
- {
- "epoch": 1.7826747720364742,
- "grad_norm": 2.0813145637512207,
- "learning_rate": 1.935241797926623e-06,
- "loss": 0.39056286215782166,
- "mean_token_accuracy": 0.8601781129837036,
- "num_tokens": 20975895.0,
- "step": 2346
- },
- {
- "epoch": 1.783434650455927,
- "grad_norm": 2.143022298812866,
- "learning_rate": 1.933201741914196e-06,
- "loss": 0.40797823667526245,
- "mean_token_accuracy": 0.8846398591995239,
- "num_tokens": 20983683.0,
- "step": 2347
- },
- {
- "epoch": 1.78419452887538,
- "grad_norm": 1.8451775312423706,
- "learning_rate": 1.931162083702606e-06,
- "loss": 0.34083136916160583,
- "mean_token_accuracy": 0.8643462657928467,
- "num_tokens": 20992621.0,
- "step": 2348
- },
- {
- "epoch": 1.784954407294833,
- "grad_norm": 1.8603935241699219,
- "learning_rate": 1.9291228247233607e-06,
- "loss": 0.4860231280326843,
- "mean_token_accuracy": 0.8391251564025879,
- "num_tokens": 21002427.0,
- "step": 2349
- },
- {
- "epoch": 1.7857142857142856,
- "grad_norm": 2.751711845397949,
- "learning_rate": 1.9270839664076937e-06,
- "loss": 0.30588358640670776,
- "mean_token_accuracy": 0.8836315274238586,
- "num_tokens": 21006898.0,
- "step": 2350
- },
- {
- "epoch": 1.7864741641337387,
- "grad_norm": 1.0335345268249512,
- "learning_rate": 1.9250455101865526e-06,
- "loss": 0.3119634985923767,
- "mean_token_accuracy": 0.8912283182144165,
- "num_tokens": 21024930.0,
- "step": 2351
- },
- {
- "epoch": 1.7872340425531914,
- "grad_norm": 2.4693806171417236,
- "learning_rate": 1.9230074574906043e-06,
- "loss": 0.1976669877767563,
- "mean_token_accuracy": 0.928974986076355,
- "num_tokens": 21029027.0,
- "step": 2352
- },
- {
- "epoch": 1.7879939209726445,
- "grad_norm": 1.2892690896987915,
- "learning_rate": 1.920969809750234e-06,
- "loss": 0.46008217334747314,
- "mean_token_accuracy": 0.8299605846405029,
- "num_tokens": 21047671.0,
- "step": 2353
- },
- {
- "epoch": 1.7887537993920972,
- "grad_norm": 3.162534713745117,
- "learning_rate": 1.91893256839554e-06,
- "loss": 0.2916071116924286,
- "mean_token_accuracy": 0.8932807445526123,
- "num_tokens": 21051555.0,
- "step": 2354
- },
- {
- "epoch": 1.7895136778115501,
- "grad_norm": 1.7627713680267334,
- "learning_rate": 1.916895734856338e-06,
- "loss": 0.3223535120487213,
- "mean_token_accuracy": 0.8852578401565552,
- "num_tokens": 21060056.0,
- "step": 2355
- },
- {
- "epoch": 1.790273556231003,
- "grad_norm": 1.9448071718215942,
- "learning_rate": 1.9148593105621542e-06,
- "loss": 0.3650452196598053,
- "mean_token_accuracy": 0.8709862232208252,
- "num_tokens": 21067190.0,
- "step": 2356
- },
- {
- "epoch": 1.791033434650456,
- "grad_norm": 2.026644229888916,
- "learning_rate": 1.9128232969422318e-06,
- "loss": 0.3620566427707672,
- "mean_token_accuracy": 0.865707516670227,
- "num_tokens": 21075197.0,
- "step": 2357
- },
- {
- "epoch": 1.7917933130699089,
- "grad_norm": 2.2628564834594727,
- "learning_rate": 1.9107876954255217e-06,
- "loss": 0.353444367647171,
- "mean_token_accuracy": 0.8590385913848877,
- "num_tokens": 21080823.0,
- "step": 2358
- },
- {
- "epoch": 1.7925531914893615,
- "grad_norm": 2.5959067344665527,
- "learning_rate": 1.908752507440689e-06,
- "loss": 0.43711763620376587,
- "mean_token_accuracy": 0.8539710640907288,
- "num_tokens": 21086016.0,
- "step": 2359
- },
- {
- "epoch": 1.7933130699088147,
- "grad_norm": 1.6228864192962646,
- "learning_rate": 1.906717734416105e-06,
- "loss": 0.38630396127700806,
- "mean_token_accuracy": 0.8611987829208374,
- "num_tokens": 21096573.0,
- "step": 2360
- },
- {
- "epoch": 1.7940729483282674,
- "grad_norm": 1.8471404314041138,
- "learning_rate": 1.9046833777798534e-06,
- "loss": 0.46608641743659973,
- "mean_token_accuracy": 0.8782031536102295,
- "num_tokens": 21105817.0,
- "step": 2361
- },
- {
- "epoch": 1.7948328267477205,
- "grad_norm": 2.6532235145568848,
- "learning_rate": 1.9026494389597239e-06,
- "loss": 0.3310372829437256,
- "mean_token_accuracy": 0.8781720399856567,
- "num_tokens": 21111192.0,
- "step": 2362
- },
- {
- "epoch": 1.7955927051671732,
- "grad_norm": 2.172534942626953,
- "learning_rate": 1.9006159193832124e-06,
- "loss": 0.49921661615371704,
- "mean_token_accuracy": 0.8215196132659912,
- "num_tokens": 21117878.0,
- "step": 2363
- },
- {
- "epoch": 1.7963525835866263,
- "grad_norm": 1.6507720947265625,
- "learning_rate": 1.8985828204775206e-06,
- "loss": 0.4189162850379944,
- "mean_token_accuracy": 0.8520572185516357,
- "num_tokens": 21128287.0,
- "step": 2364
- },
- {
- "epoch": 1.797112462006079,
- "grad_norm": 1.5932034254074097,
- "learning_rate": 1.8965501436695578e-06,
- "loss": 0.45531854033470154,
- "mean_token_accuracy": 0.8391242027282715,
- "num_tokens": 21140605.0,
- "step": 2365
- },
- {
- "epoch": 1.797872340425532,
- "grad_norm": 2.4680638313293457,
- "learning_rate": 1.894517890385933e-06,
- "loss": 0.41174983978271484,
- "mean_token_accuracy": 0.8616886138916016,
- "num_tokens": 21147045.0,
- "step": 2366
- },
- {
- "epoch": 1.7986322188449848,
- "grad_norm": 1.61875319480896,
- "learning_rate": 1.8924860620529594e-06,
- "loss": 0.47573935985565186,
- "mean_token_accuracy": 0.8347671031951904,
- "num_tokens": 21157253.0,
- "step": 2367
- },
- {
- "epoch": 1.7993920972644377,
- "grad_norm": 3.4389333724975586,
- "learning_rate": 1.8904546600966539e-06,
- "loss": 0.34975939989089966,
- "mean_token_accuracy": 0.8915865421295166,
- "num_tokens": 21160486.0,
- "step": 2368
- },
- {
- "epoch": 1.8001519756838906,
- "grad_norm": 2.0069527626037598,
- "learning_rate": 1.888423685942732e-06,
- "loss": 0.379585325717926,
- "mean_token_accuracy": 0.8605983257293701,
- "num_tokens": 21168016.0,
- "step": 2369
- },
- {
- "epoch": 1.8009118541033433,
- "grad_norm": 3.0740530490875244,
- "learning_rate": 1.886393141016609e-06,
- "loss": 0.5244829058647156,
- "mean_token_accuracy": 0.8282772302627563,
- "num_tokens": 21172851.0,
- "step": 2370
- },
- {
- "epoch": 1.8016717325227964,
- "grad_norm": 1.5724968910217285,
- "learning_rate": 1.8843630267434e-06,
- "loss": 0.2020694762468338,
- "mean_token_accuracy": 0.8882503509521484,
- "num_tokens": 21179866.0,
- "step": 2371
- },
- {
- "epoch": 1.8024316109422491,
- "grad_norm": 2.1539509296417236,
- "learning_rate": 1.8823333445479175e-06,
- "loss": 0.37903186678886414,
- "mean_token_accuracy": 0.8525497317314148,
- "num_tokens": 21186941.0,
- "step": 2372
- },
- {
- "epoch": 1.8031914893617023,
- "grad_norm": 2.0247764587402344,
- "learning_rate": 1.8803040958546708e-06,
- "loss": 0.293364018201828,
- "mean_token_accuracy": 0.8954306244850159,
- "num_tokens": 21193659.0,
- "step": 2373
- },
- {
- "epoch": 1.803951367781155,
- "grad_norm": 1.7034926414489746,
- "learning_rate": 1.8782752820878636e-06,
- "loss": 0.33828210830688477,
- "mean_token_accuracy": 0.9032940864562988,
- "num_tokens": 21201399.0,
- "step": 2374
- },
- {
- "epoch": 1.8047112462006079,
- "grad_norm": 1.7864601612091064,
- "learning_rate": 1.8762469046713954e-06,
- "loss": 0.3165147006511688,
- "mean_token_accuracy": 0.8997465372085571,
- "num_tokens": 21209105.0,
- "step": 2375
- },
- {
- "epoch": 1.8054711246200608,
- "grad_norm": 2.3371729850769043,
- "learning_rate": 1.8742189650288617e-06,
- "loss": 0.4036901593208313,
- "mean_token_accuracy": 0.8549420833587646,
- "num_tokens": 21215429.0,
- "step": 2376
- },
- {
- "epoch": 1.8062310030395137,
- "grad_norm": 1.7922348976135254,
- "learning_rate": 1.872191464583547e-06,
- "loss": 0.4366671144962311,
- "mean_token_accuracy": 0.8614166975021362,
- "num_tokens": 21226823.0,
- "step": 2377
- },
- {
- "epoch": 1.8069908814589666,
- "grad_norm": 2.1667943000793457,
- "learning_rate": 1.8701644047584294e-06,
- "loss": 0.3543647825717926,
- "mean_token_accuracy": 0.9031318426132202,
- "num_tokens": 21232823.0,
- "step": 2378
- },
- {
- "epoch": 1.8077507598784195,
- "grad_norm": 1.7554421424865723,
- "learning_rate": 1.868137786976177e-06,
- "loss": 0.32704365253448486,
- "mean_token_accuracy": 0.8990532755851746,
- "num_tokens": 21242036.0,
- "step": 2379
- },
- {
- "epoch": 1.8085106382978724,
- "grad_norm": 1.6723839044570923,
- "learning_rate": 1.8661116126591492e-06,
- "loss": 0.3665752410888672,
- "mean_token_accuracy": 0.8828305006027222,
- "num_tokens": 21251290.0,
- "step": 2380
- },
- {
- "epoch": 1.809270516717325,
- "grad_norm": 1.5078409910202026,
- "learning_rate": 1.8640858832293924e-06,
- "loss": 0.368108332157135,
- "mean_token_accuracy": 0.8720884323120117,
- "num_tokens": 21263510.0,
- "step": 2381
- },
- {
- "epoch": 1.8100303951367782,
- "grad_norm": 2.245553493499756,
- "learning_rate": 1.8620606001086423e-06,
- "loss": 0.3189915716648102,
- "mean_token_accuracy": 0.9015103578567505,
- "num_tokens": 21269690.0,
- "step": 2382
- },
- {
- "epoch": 1.810790273556231,
- "grad_norm": 1.780027151107788,
- "learning_rate": 1.8600357647183188e-06,
- "loss": 0.40369710326194763,
- "mean_token_accuracy": 0.8539618253707886,
- "num_tokens": 21278523.0,
- "step": 2383
- },
- {
- "epoch": 1.811550151975684,
- "grad_norm": 2.1727912425994873,
- "learning_rate": 1.8580113784795306e-06,
- "loss": 0.29285651445388794,
- "mean_token_accuracy": 0.8954071998596191,
- "num_tokens": 21284717.0,
- "step": 2384
- },
- {
- "epoch": 1.8123100303951367,
- "grad_norm": 2.310225248336792,
- "learning_rate": 1.8559874428130708e-06,
- "loss": 0.3090948760509491,
- "mean_token_accuracy": 0.8853784203529358,
- "num_tokens": 21290484.0,
- "step": 2385
- },
- {
- "epoch": 1.8130699088145896,
- "grad_norm": 1.6556873321533203,
- "learning_rate": 1.8539639591394131e-06,
- "loss": 0.4425269663333893,
- "mean_token_accuracy": 0.8488757610321045,
- "num_tokens": 21302588.0,
- "step": 2386
- },
- {
- "epoch": 1.8138297872340425,
- "grad_norm": 1.9238256216049194,
- "learning_rate": 1.8519409288787182e-06,
- "loss": 0.4781329929828644,
- "mean_token_accuracy": 0.8392970561981201,
- "num_tokens": 21310598.0,
- "step": 2387
- },
- {
- "epoch": 1.8145896656534954,
- "grad_norm": 1.4976142644882202,
- "learning_rate": 1.8499183534508263e-06,
- "loss": 0.36829859018325806,
- "mean_token_accuracy": 0.8687542676925659,
- "num_tokens": 21322668.0,
- "step": 2388
- },
- {
- "epoch": 1.8153495440729484,
- "grad_norm": 2.0216941833496094,
- "learning_rate": 1.8478962342752584e-06,
- "loss": 0.385962575674057,
- "mean_token_accuracy": 0.8908089399337769,
- "num_tokens": 21330378.0,
- "step": 2389
- },
- {
- "epoch": 1.8161094224924013,
- "grad_norm": 1.647863507270813,
- "learning_rate": 1.8458745727712142e-06,
- "loss": 0.30903705954551697,
- "mean_token_accuracy": 0.8914397954940796,
- "num_tokens": 21339932.0,
- "step": 2390
- },
- {
- "epoch": 1.8168693009118542,
- "grad_norm": 1.5832399129867554,
- "learning_rate": 1.8438533703575757e-06,
- "loss": 0.3636384606361389,
- "mean_token_accuracy": 0.8611595630645752,
- "num_tokens": 21351557.0,
- "step": 2391
- },
- {
- "epoch": 1.8176291793313069,
- "grad_norm": 3.0069241523742676,
- "learning_rate": 1.8418326284528997e-06,
- "loss": 0.37970617413520813,
- "mean_token_accuracy": 0.8620643615722656,
- "num_tokens": 21355704.0,
- "step": 2392
- },
- {
- "epoch": 1.81838905775076,
- "grad_norm": 2.004526376724243,
- "learning_rate": 1.8398123484754204e-06,
- "loss": 0.5333225131034851,
- "mean_token_accuracy": 0.8062554597854614,
- "num_tokens": 21364640.0,
- "step": 2393
- },
- {
- "epoch": 1.8191489361702127,
- "grad_norm": 1.449981689453125,
- "learning_rate": 1.8377925318430478e-06,
- "loss": 0.3736325800418854,
- "mean_token_accuracy": 0.858788251876831,
- "num_tokens": 21377025.0,
- "step": 2394
- },
- {
- "epoch": 1.8199088145896658,
- "grad_norm": 1.1959524154663086,
- "learning_rate": 1.8357731799733686e-06,
- "loss": 0.3272058963775635,
- "mean_token_accuracy": 0.8840590715408325,
- "num_tokens": 21395378.0,
- "step": 2395
- },
- {
- "epoch": 1.8206686930091185,
- "grad_norm": 2.134742498397827,
- "learning_rate": 1.8337542942836406e-06,
- "loss": 0.3737856149673462,
- "mean_token_accuracy": 0.8674061298370361,
- "num_tokens": 21402106.0,
- "step": 2396
- },
- {
- "epoch": 1.8214285714285714,
- "grad_norm": 2.2179460525512695,
- "learning_rate": 1.8317358761907945e-06,
- "loss": 0.37301796674728394,
- "mean_token_accuracy": 0.8605623245239258,
- "num_tokens": 21408367.0,
- "step": 2397
- },
- {
- "epoch": 1.8221884498480243,
- "grad_norm": 2.1718010902404785,
- "learning_rate": 1.8297179271114345e-06,
- "loss": 0.2772231101989746,
- "mean_token_accuracy": 0.8997501730918884,
- "num_tokens": 21414274.0,
- "step": 2398
- },
- {
- "epoch": 1.8229483282674772,
- "grad_norm": 1.410933494567871,
- "learning_rate": 1.827700448461836e-06,
- "loss": 0.4834601581096649,
- "mean_token_accuracy": 0.8382522463798523,
- "num_tokens": 21429120.0,
- "step": 2399
- },
- {
- "epoch": 1.8237082066869301,
- "grad_norm": 3.4779679775238037,
- "learning_rate": 1.8256834416579423e-06,
- "loss": 0.44643428921699524,
- "mean_token_accuracy": 0.8308249711990356,
- "num_tokens": 21432437.0,
- "step": 2400
- },
- {
- "epoch": 1.824468085106383,
- "grad_norm": 1.374484658241272,
- "learning_rate": 1.8236669081153657e-06,
- "loss": 0.3947869837284088,
- "mean_token_accuracy": 0.8605848550796509,
- "num_tokens": 21445656.0,
- "step": 2401
- },
- {
- "epoch": 1.825227963525836,
- "grad_norm": 1.9599316120147705,
- "learning_rate": 1.8216508492493887e-06,
- "loss": 0.49040719866752625,
- "mean_token_accuracy": 0.839459240436554,
- "num_tokens": 21452889.0,
- "step": 2402
- },
- {
- "epoch": 1.8259878419452886,
- "grad_norm": 2.1267881393432617,
- "learning_rate": 1.8196352664749578e-06,
- "loss": 0.3233179450035095,
- "mean_token_accuracy": 0.8841243386268616,
- "num_tokens": 21458788.0,
- "step": 2403
- },
- {
- "epoch": 1.8267477203647418,
- "grad_norm": 2.6356115341186523,
- "learning_rate": 1.8176201612066874e-06,
- "loss": 0.43436336517333984,
- "mean_token_accuracy": 0.850265622138977,
- "num_tokens": 21464305.0,
- "step": 2404
- },
- {
- "epoch": 1.8275075987841944,
- "grad_norm": 2.0232386589050293,
- "learning_rate": 1.8156055348588548e-06,
- "loss": 0.37281763553619385,
- "mean_token_accuracy": 0.8616300821304321,
- "num_tokens": 21471722.0,
- "step": 2405
- },
- {
- "epoch": 1.8282674772036476,
- "grad_norm": 3.2616260051727295,
- "learning_rate": 1.8135913888454034e-06,
- "loss": 0.2882898151874542,
- "mean_token_accuracy": 0.9001147747039795,
- "num_tokens": 21475400.0,
- "step": 2406
- },
- {
- "epoch": 1.8290273556231003,
- "grad_norm": 2.1665611267089844,
- "learning_rate": 1.8115777245799383e-06,
- "loss": 0.45269185304641724,
- "mean_token_accuracy": 0.8420798778533936,
- "num_tokens": 21481827.0,
- "step": 2407
- },
- {
- "epoch": 1.8297872340425532,
- "grad_norm": 1.4406569004058838,
- "learning_rate": 1.8095645434757261e-06,
- "loss": 0.43665701150894165,
- "mean_token_accuracy": 0.8401381969451904,
- "num_tokens": 21496441.0,
- "step": 2408
- },
- {
- "epoch": 1.830547112462006,
- "grad_norm": 1.6756342649459839,
- "learning_rate": 1.8075518469456944e-06,
- "loss": 0.3521783947944641,
- "mean_token_accuracy": 0.8737466335296631,
- "num_tokens": 21505568.0,
- "step": 2409
- },
- {
- "epoch": 1.831306990881459,
- "grad_norm": 1.6623140573501587,
- "learning_rate": 1.8055396364024318e-06,
- "loss": 0.344537615776062,
- "mean_token_accuracy": 0.886972188949585,
- "num_tokens": 21513252.0,
- "step": 2410
- },
- {
- "epoch": 1.832066869300912,
- "grad_norm": 2.064835548400879,
- "learning_rate": 1.803527913258186e-06,
- "loss": 0.3252706229686737,
- "mean_token_accuracy": 0.885245680809021,
- "num_tokens": 21520242.0,
- "step": 2411
- },
- {
- "epoch": 1.8328267477203646,
- "grad_norm": 1.9969112873077393,
- "learning_rate": 1.8015166789248606e-06,
- "loss": 0.34694376587867737,
- "mean_token_accuracy": 0.8818766474723816,
- "num_tokens": 21527524.0,
- "step": 2412
- },
- {
- "epoch": 1.8335866261398177,
- "grad_norm": 2.086148977279663,
- "learning_rate": 1.7995059348140165e-06,
- "loss": 0.23109188675880432,
- "mean_token_accuracy": 0.912773609161377,
- "num_tokens": 21532829.0,
- "step": 2413
- },
- {
- "epoch": 1.8343465045592704,
- "grad_norm": 1.80828058719635,
- "learning_rate": 1.7974956823368728e-06,
- "loss": 0.5422223210334778,
- "mean_token_accuracy": 0.8058640956878662,
- "num_tokens": 21544440.0,
- "step": 2414
- },
- {
- "epoch": 1.8351063829787235,
- "grad_norm": 1.8121788501739502,
- "learning_rate": 1.7954859229043017e-06,
- "loss": 0.3674035668373108,
- "mean_token_accuracy": 0.8628277778625488,
- "num_tokens": 21553160.0,
- "step": 2415
- },
- {
- "epoch": 1.8358662613981762,
- "grad_norm": 1.9307979345321655,
- "learning_rate": 1.7934766579268292e-06,
- "loss": 0.4528796672821045,
- "mean_token_accuracy": 0.8328302502632141,
- "num_tokens": 21563485.0,
- "step": 2416
- },
- {
- "epoch": 1.8366261398176293,
- "grad_norm": 1.2312756776809692,
- "learning_rate": 1.7914678888146347e-06,
- "loss": 0.40424543619155884,
- "mean_token_accuracy": 0.8571025133132935,
- "num_tokens": 21582662.0,
- "step": 2417
- },
- {
- "epoch": 1.837386018237082,
- "grad_norm": 1.6305770874023438,
- "learning_rate": 1.7894596169775514e-06,
- "loss": 0.36575305461883545,
- "mean_token_accuracy": 0.8768579959869385,
- "num_tokens": 21592930.0,
- "step": 2418
- },
- {
- "epoch": 1.838145896656535,
- "grad_norm": 1.8107178211212158,
- "learning_rate": 1.7874518438250598e-06,
- "loss": 0.3260963261127472,
- "mean_token_accuracy": 0.896018385887146,
- "num_tokens": 21600509.0,
- "step": 2419
- },
- {
- "epoch": 1.8389057750759878,
- "grad_norm": 2.7195847034454346,
- "learning_rate": 1.785444570766293e-06,
- "loss": 0.2728347182273865,
- "mean_token_accuracy": 0.9178709983825684,
- "num_tokens": 21604489.0,
- "step": 2420
- },
- {
- "epoch": 1.8396656534954408,
- "grad_norm": 1.9783591032028198,
- "learning_rate": 1.7834377992100332e-06,
- "loss": 0.3136378526687622,
- "mean_token_accuracy": 0.8844017386436462,
- "num_tokens": 21612060.0,
- "step": 2421
- },
- {
- "epoch": 1.8404255319148937,
- "grad_norm": 2.1911418437957764,
- "learning_rate": 1.7814315305647095e-06,
- "loss": 0.39013993740081787,
- "mean_token_accuracy": 0.8688976764678955,
- "num_tokens": 21618778.0,
- "step": 2422
- },
- {
- "epoch": 1.8411854103343464,
- "grad_norm": 1.9143604040145874,
- "learning_rate": 1.779425766238398e-06,
- "loss": 0.5113036632537842,
- "mean_token_accuracy": 0.8329141139984131,
- "num_tokens": 21628976.0,
- "step": 2423
- },
- {
- "epoch": 1.8419452887537995,
- "grad_norm": 1.4184197187423706,
- "learning_rate": 1.7774205076388207e-06,
- "loss": 0.3821067810058594,
- "mean_token_accuracy": 0.8604007959365845,
- "num_tokens": 21643145.0,
- "step": 2424
- },
- {
- "epoch": 1.8427051671732522,
- "grad_norm": 2.45896577835083,
- "learning_rate": 1.7754157561733476e-06,
- "loss": 0.3004961311817169,
- "mean_token_accuracy": 0.89884352684021,
- "num_tokens": 21647441.0,
- "step": 2425
- },
- {
- "epoch": 1.8434650455927053,
- "grad_norm": 1.7999277114868164,
- "learning_rate": 1.7734115132489887e-06,
- "loss": 0.42533132433891296,
- "mean_token_accuracy": 0.8838746547698975,
- "num_tokens": 21657445.0,
- "step": 2426
- },
- {
- "epoch": 1.844224924012158,
- "grad_norm": 2.099728584289551,
- "learning_rate": 1.7714077802723994e-06,
- "loss": 0.36200380325317383,
- "mean_token_accuracy": 0.86548912525177,
- "num_tokens": 21663966.0,
- "step": 2427
- },
- {
- "epoch": 1.844984802431611,
- "grad_norm": 2.1970369815826416,
- "learning_rate": 1.7694045586498754e-06,
- "loss": 0.34944331645965576,
- "mean_token_accuracy": 0.8670865297317505,
- "num_tokens": 21670051.0,
- "step": 2428
- },
- {
- "epoch": 1.8457446808510638,
- "grad_norm": 2.2928519248962402,
- "learning_rate": 1.7674018497873568e-06,
- "loss": 0.39500880241394043,
- "mean_token_accuracy": 0.8744652271270752,
- "num_tokens": 21676054.0,
- "step": 2429
- },
- {
- "epoch": 1.8465045592705167,
- "grad_norm": 1.7598960399627686,
- "learning_rate": 1.7653996550904208e-06,
- "loss": 0.40113672614097595,
- "mean_token_accuracy": 0.8552819490432739,
- "num_tokens": 21685514.0,
- "step": 2430
- },
- {
- "epoch": 1.8472644376899696,
- "grad_norm": 2.0529749393463135,
- "learning_rate": 1.7633979759642844e-06,
- "loss": 0.47586584091186523,
- "mean_token_accuracy": 0.8412872552871704,
- "num_tokens": 21693282.0,
- "step": 2431
- },
- {
- "epoch": 1.8480243161094225,
- "grad_norm": 2.2423181533813477,
- "learning_rate": 1.7613968138138027e-06,
- "loss": 0.2757381796836853,
- "mean_token_accuracy": 0.8992017507553101,
- "num_tokens": 21698439.0,
- "step": 2432
- },
- {
- "epoch": 1.8487841945288754,
- "grad_norm": 1.3280467987060547,
- "learning_rate": 1.7593961700434692e-06,
- "loss": 0.29535043239593506,
- "mean_token_accuracy": 0.8943840861320496,
- "num_tokens": 21711823.0,
- "step": 2433
- },
- {
- "epoch": 1.8495440729483281,
- "grad_norm": 2.589221715927124,
- "learning_rate": 1.7573960460574133e-06,
- "loss": 0.46775516867637634,
- "mean_token_accuracy": 0.8654797673225403,
- "num_tokens": 21717180.0,
- "step": 2434
- },
- {
- "epoch": 1.8503039513677813,
- "grad_norm": 2.1137642860412598,
- "learning_rate": 1.7553964432593976e-06,
- "loss": 0.3808780610561371,
- "mean_token_accuracy": 0.8759565353393555,
- "num_tokens": 21723980.0,
- "step": 2435
- },
- {
- "epoch": 1.851063829787234,
- "grad_norm": 2.386967182159424,
- "learning_rate": 1.75339736305282e-06,
- "loss": 0.42688336968421936,
- "mean_token_accuracy": 0.8488960266113281,
- "num_tokens": 21730411.0,
- "step": 2436
- },
- {
- "epoch": 1.851823708206687,
- "grad_norm": 1.586552619934082,
- "learning_rate": 1.7513988068407145e-06,
- "loss": 0.33497530221939087,
- "mean_token_accuracy": 0.8809621334075928,
- "num_tokens": 21740228.0,
- "step": 2437
- },
- {
- "epoch": 1.8525835866261398,
- "grad_norm": 2.107167959213257,
- "learning_rate": 1.7494007760257428e-06,
- "loss": 0.3801528513431549,
- "mean_token_accuracy": 0.8666986227035522,
- "num_tokens": 21746718.0,
- "step": 2438
- },
- {
- "epoch": 1.8533434650455927,
- "grad_norm": 2.514514684677124,
- "learning_rate": 1.7474032720101991e-06,
- "loss": 0.285498708486557,
- "mean_token_accuracy": 0.901540219783783,
- "num_tokens": 21751009.0,
- "step": 2439
- },
- {
- "epoch": 1.8541033434650456,
- "grad_norm": 1.8152034282684326,
- "learning_rate": 1.7454062961960102e-06,
- "loss": 0.3704795241355896,
- "mean_token_accuracy": 0.8630262613296509,
- "num_tokens": 21760164.0,
- "step": 2440
- },
- {
- "epoch": 1.8548632218844985,
- "grad_norm": 2.714531183242798,
- "learning_rate": 1.7434098499847308e-06,
- "loss": 0.5070809125900269,
- "mean_token_accuracy": 0.8408594131469727,
- "num_tokens": 21765602.0,
- "step": 2441
- },
- {
- "epoch": 1.8556231003039514,
- "grad_norm": 2.173832893371582,
- "learning_rate": 1.7414139347775423e-06,
- "loss": 0.3500945568084717,
- "mean_token_accuracy": 0.8733699321746826,
- "num_tokens": 21772029.0,
- "step": 2442
- },
- {
- "epoch": 1.8563829787234043,
- "grad_norm": 1.580376148223877,
- "learning_rate": 1.7394185519752546e-06,
- "loss": 0.5137908458709717,
- "mean_token_accuracy": 0.8141944408416748,
- "num_tokens": 21784531.0,
- "step": 2443
- },
- {
- "epoch": 1.8571428571428572,
- "grad_norm": 2.079318046569824,
- "learning_rate": 1.7374237029783064e-06,
- "loss": 0.41820770502090454,
- "mean_token_accuracy": 0.8513275384902954,
- "num_tokens": 21792047.0,
- "step": 2444
- },
- {
- "epoch": 1.85790273556231,
- "grad_norm": 2.6890387535095215,
- "learning_rate": 1.7354293891867582e-06,
- "loss": 0.3810037672519684,
- "mean_token_accuracy": 0.8790096044540405,
- "num_tokens": 21796634.0,
- "step": 2445
- },
- {
- "epoch": 1.858662613981763,
- "grad_norm": 2.161081552505493,
- "learning_rate": 1.7334356120002956e-06,
- "loss": 0.48064762353897095,
- "mean_token_accuracy": 0.8329977989196777,
- "num_tokens": 21803509.0,
- "step": 2446
- },
- {
- "epoch": 1.8594224924012157,
- "grad_norm": 1.9201551675796509,
- "learning_rate": 1.7314423728182283e-06,
- "loss": 0.36369895935058594,
- "mean_token_accuracy": 0.8713955879211426,
- "num_tokens": 21810528.0,
- "step": 2447
- },
- {
- "epoch": 1.8601823708206688,
- "grad_norm": 1.8095223903656006,
- "learning_rate": 1.7294496730394897e-06,
- "loss": 0.41493499279022217,
- "mean_token_accuracy": 0.855312705039978,
- "num_tokens": 21821176.0,
- "step": 2448
- },
- {
- "epoch": 1.8609422492401215,
- "grad_norm": 2.172389507293701,
- "learning_rate": 1.7274575140626318e-06,
- "loss": 0.3467463552951813,
- "mean_token_accuracy": 0.8801594972610474,
- "num_tokens": 21827486.0,
- "step": 2449
- },
- {
- "epoch": 1.8617021276595744,
- "grad_norm": 2.8139185905456543,
- "learning_rate": 1.7254658972858293e-06,
- "loss": 0.35121995210647583,
- "mean_token_accuracy": 0.8741901516914368,
- "num_tokens": 21831915.0,
- "step": 2450
- },
- {
- "epoch": 1.8624620060790273,
- "grad_norm": 1.2572762966156006,
- "learning_rate": 1.7234748241068742e-06,
- "loss": 0.3775328993797302,
- "mean_token_accuracy": 0.8547425866127014,
- "num_tokens": 21849623.0,
- "step": 2451
- },
- {
- "epoch": 1.8632218844984803,
- "grad_norm": 1.2357900142669678,
- "learning_rate": 1.7214842959231796e-06,
- "loss": 0.28715917468070984,
- "mean_token_accuracy": 0.9034290313720703,
- "num_tokens": 21864507.0,
- "step": 2452
- },
- {
- "epoch": 1.8639817629179332,
- "grad_norm": 1.2349165678024292,
- "learning_rate": 1.719494314131775e-06,
- "loss": 0.27918580174446106,
- "mean_token_accuracy": 0.9073119759559631,
- "num_tokens": 21878519.0,
- "step": 2453
- },
- {
- "epoch": 1.864741641337386,
- "grad_norm": 1.960353136062622,
- "learning_rate": 1.7175048801293042e-06,
- "loss": 0.49304282665252686,
- "mean_token_accuracy": 0.8193954229354858,
- "num_tokens": 21886861.0,
- "step": 2454
- },
- {
- "epoch": 1.865501519756839,
- "grad_norm": 1.480118751525879,
- "learning_rate": 1.7155159953120315e-06,
- "loss": 0.39433127641677856,
- "mean_token_accuracy": 0.8674266338348389,
- "num_tokens": 21899131.0,
- "step": 2455
- },
- {
- "epoch": 1.8662613981762917,
- "grad_norm": 2.3136367797851562,
- "learning_rate": 1.7135276610758309e-06,
- "loss": 0.40943437814712524,
- "mean_token_accuracy": 0.8511340022087097,
- "num_tokens": 21905550.0,
- "step": 2456
- },
- {
- "epoch": 1.8670212765957448,
- "grad_norm": 1.3622872829437256,
- "learning_rate": 1.7115398788161923e-06,
- "loss": 0.4255254566669464,
- "mean_token_accuracy": 0.8457357883453369,
- "num_tokens": 21919943.0,
- "step": 2457
- },
- {
- "epoch": 1.8677811550151975,
- "grad_norm": 1.8197853565216064,
- "learning_rate": 1.7095526499282172e-06,
- "loss": 0.33384573459625244,
- "mean_token_accuracy": 0.8757365942001343,
- "num_tokens": 21928368.0,
- "step": 2458
- },
- {
- "epoch": 1.8685410334346506,
- "grad_norm": 1.8771090507507324,
- "learning_rate": 1.7075659758066207e-06,
- "loss": 0.38854318857192993,
- "mean_token_accuracy": 0.8565001487731934,
- "num_tokens": 21936624.0,
- "step": 2459
- },
- {
- "epoch": 1.8693009118541033,
- "grad_norm": 1.449811577796936,
- "learning_rate": 1.7055798578457267e-06,
- "loss": 0.45504286885261536,
- "mean_token_accuracy": 0.8338158130645752,
- "num_tokens": 21952192.0,
- "step": 2460
- },
- {
- "epoch": 1.8700607902735562,
- "grad_norm": 2.253678321838379,
- "learning_rate": 1.703594297439469e-06,
- "loss": 0.44300752878189087,
- "mean_token_accuracy": 0.8451106548309326,
- "num_tokens": 21959107.0,
- "step": 2461
- },
- {
- "epoch": 1.8708206686930091,
- "grad_norm": 2.5431747436523438,
- "learning_rate": 1.7016092959813892e-06,
- "loss": 0.34692925214767456,
- "mean_token_accuracy": 0.8823766708374023,
- "num_tokens": 21964543.0,
- "step": 2462
- },
- {
- "epoch": 1.871580547112462,
- "grad_norm": 2.7001953125,
- "learning_rate": 1.6996248548646393e-06,
- "loss": 0.5270686745643616,
- "mean_token_accuracy": 0.8366886377334595,
- "num_tokens": 21970157.0,
- "step": 2463
- },
- {
- "epoch": 1.872340425531915,
- "grad_norm": 2.3855581283569336,
- "learning_rate": 1.6976409754819767e-06,
- "loss": 0.40109893679618835,
- "mean_token_accuracy": 0.8477234840393066,
- "num_tokens": 21976046.0,
- "step": 2464
- },
- {
- "epoch": 1.8731003039513676,
- "grad_norm": 1.6014364957809448,
- "learning_rate": 1.6956576592257635e-06,
- "loss": 0.4344262480735779,
- "mean_token_accuracy": 0.8464433550834656,
- "num_tokens": 21986299.0,
- "step": 2465
- },
- {
- "epoch": 1.8738601823708207,
- "grad_norm": 2.221372127532959,
- "learning_rate": 1.6936749074879663e-06,
- "loss": 0.24239015579223633,
- "mean_token_accuracy": 0.9185566306114197,
- "num_tokens": 21991541.0,
- "step": 2466
- },
- {
- "epoch": 1.8746200607902734,
- "grad_norm": 1.6672178506851196,
- "learning_rate": 1.6916927216601593e-06,
- "loss": 0.35219496488571167,
- "mean_token_accuracy": 0.8668237328529358,
- "num_tokens": 22000797.0,
- "step": 2467
- },
- {
- "epoch": 1.8753799392097266,
- "grad_norm": 1.364131212234497,
- "learning_rate": 1.6897111031335145e-06,
- "loss": 0.4456409513950348,
- "mean_token_accuracy": 0.8350487947463989,
- "num_tokens": 22018297.0,
- "step": 2468
- },
- {
- "epoch": 1.8761398176291793,
- "grad_norm": 1.4535794258117676,
- "learning_rate": 1.6877300532988095e-06,
- "loss": 0.395782470703125,
- "mean_token_accuracy": 0.8482908010482788,
- "num_tokens": 22030096.0,
- "step": 2469
- },
- {
- "epoch": 1.8768996960486324,
- "grad_norm": 2.0192270278930664,
- "learning_rate": 1.6857495735464196e-06,
- "loss": 0.31406813859939575,
- "mean_token_accuracy": 0.889453649520874,
- "num_tokens": 22036082.0,
- "step": 2470
- },
- {
- "epoch": 1.877659574468085,
- "grad_norm": 2.159257173538208,
- "learning_rate": 1.6837696652663244e-06,
- "loss": 0.43942126631736755,
- "mean_token_accuracy": 0.8518660068511963,
- "num_tokens": 22043413.0,
- "step": 2471
- },
- {
- "epoch": 1.878419452887538,
- "grad_norm": 1.9774882793426514,
- "learning_rate": 1.681790329848097e-06,
- "loss": 0.42464935779571533,
- "mean_token_accuracy": 0.8545591831207275,
- "num_tokens": 22050290.0,
- "step": 2472
- },
- {
- "epoch": 1.8791793313069909,
- "grad_norm": 1.0219167470932007,
- "learning_rate": 1.6798115686809125e-06,
- "loss": 0.36917346715927124,
- "mean_token_accuracy": 0.8650286197662354,
- "num_tokens": 22070408.0,
- "step": 2473
- },
- {
- "epoch": 1.8799392097264438,
- "grad_norm": 1.2943378686904907,
- "learning_rate": 1.677833383153542e-06,
- "loss": 0.3434808850288391,
- "mean_token_accuracy": 0.878541111946106,
- "num_tokens": 22083567.0,
- "step": 2474
- },
- {
- "epoch": 1.8806990881458967,
- "grad_norm": 3.582855224609375,
- "learning_rate": 1.6758557746543518e-06,
- "loss": 0.39738911390304565,
- "mean_token_accuracy": 0.8951535224914551,
- "num_tokens": 22086886.0,
- "step": 2475
- },
- {
- "epoch": 1.8814589665653494,
- "grad_norm": 1.680220365524292,
- "learning_rate": 1.673878744571304e-06,
- "loss": 0.38146206736564636,
- "mean_token_accuracy": 0.8596681356430054,
- "num_tokens": 22095564.0,
- "step": 2476
- },
- {
- "epoch": 1.8822188449848025,
- "grad_norm": 1.448194146156311,
- "learning_rate": 1.6719022942919527e-06,
- "loss": 0.43309977650642395,
- "mean_token_accuracy": 0.8669528961181641,
- "num_tokens": 22109333.0,
- "step": 2477
- },
- {
- "epoch": 1.8829787234042552,
- "grad_norm": 1.5353537797927856,
- "learning_rate": 1.6699264252034498e-06,
- "loss": 0.4479079842567444,
- "mean_token_accuracy": 0.8379873037338257,
- "num_tokens": 22124735.0,
- "step": 2478
- },
- {
- "epoch": 1.8837386018237083,
- "grad_norm": 1.1744320392608643,
- "learning_rate": 1.6679511386925337e-06,
- "loss": 0.31951260566711426,
- "mean_token_accuracy": 0.8792685270309448,
- "num_tokens": 22140882.0,
- "step": 2479
- },
- {
- "epoch": 1.884498480243161,
- "grad_norm": 2.1996841430664062,
- "learning_rate": 1.6659764361455383e-06,
- "loss": 0.39045992493629456,
- "mean_token_accuracy": 0.8587675094604492,
- "num_tokens": 22146843.0,
- "step": 2480
- },
- {
- "epoch": 1.885258358662614,
- "grad_norm": 3.494931697845459,
- "learning_rate": 1.6640023189483836e-06,
- "loss": 0.44756871461868286,
- "mean_token_accuracy": 0.8643628358840942,
- "num_tokens": 22150504.0,
- "step": 2481
- },
- {
- "epoch": 1.8860182370820668,
- "grad_norm": 2.2455973625183105,
- "learning_rate": 1.6620287884865831e-06,
- "loss": 0.3308878540992737,
- "mean_token_accuracy": 0.8748078942298889,
- "num_tokens": 22156537.0,
- "step": 2482
- },
- {
- "epoch": 1.8867781155015197,
- "grad_norm": 2.31868314743042,
- "learning_rate": 1.6600558461452368e-06,
- "loss": 0.46583569049835205,
- "mean_token_accuracy": 0.8438903093338013,
- "num_tokens": 22163501.0,
- "step": 2483
- },
- {
- "epoch": 1.8875379939209727,
- "grad_norm": 1.5695412158966064,
- "learning_rate": 1.65808349330903e-06,
- "loss": 0.351986825466156,
- "mean_token_accuracy": 0.8707568645477295,
- "num_tokens": 22173880.0,
- "step": 2484
- },
- {
- "epoch": 1.8882978723404256,
- "grad_norm": 1.4109563827514648,
- "learning_rate": 1.656111731362236e-06,
- "loss": 0.36058586835861206,
- "mean_token_accuracy": 0.8606001138687134,
- "num_tokens": 22189000.0,
- "step": 2485
- },
- {
- "epoch": 1.8890577507598785,
- "grad_norm": 1.0398776531219482,
- "learning_rate": 1.6541405616887138e-06,
- "loss": 0.36524999141693115,
- "mean_token_accuracy": 0.8690586090087891,
- "num_tokens": 22209187.0,
- "step": 2486
- },
- {
- "epoch": 1.8898176291793312,
- "grad_norm": 2.1050004959106445,
- "learning_rate": 1.6521699856719065e-06,
- "loss": 0.2988269329071045,
- "mean_token_accuracy": 0.8887280225753784,
- "num_tokens": 22215539.0,
- "step": 2487
- },
- {
- "epoch": 1.8905775075987843,
- "grad_norm": 2.5606791973114014,
- "learning_rate": 1.650200004694839e-06,
- "loss": 0.41077330708503723,
- "mean_token_accuracy": 0.8436049818992615,
- "num_tokens": 22221133.0,
- "step": 2488
- },
- {
- "epoch": 1.891337386018237,
- "grad_norm": 1.5786094665527344,
- "learning_rate": 1.6482306201401211e-06,
- "loss": 0.4217292368412018,
- "mean_token_accuracy": 0.859939455986023,
- "num_tokens": 22231578.0,
- "step": 2489
- },
- {
- "epoch": 1.89209726443769,
- "grad_norm": 1.7131884098052979,
- "learning_rate": 1.6462618333899422e-06,
- "loss": 0.3945464789867401,
- "mean_token_accuracy": 0.8679244518280029,
- "num_tokens": 22241252.0,
- "step": 2490
- },
- {
- "epoch": 1.8928571428571428,
- "grad_norm": 2.8350300788879395,
- "learning_rate": 1.6442936458260723e-06,
- "loss": 0.3992699384689331,
- "mean_token_accuracy": 0.8717275857925415,
- "num_tokens": 22246226.0,
- "step": 2491
- },
- {
- "epoch": 1.8936170212765957,
- "grad_norm": 2.2180120944976807,
- "learning_rate": 1.6423260588298608e-06,
- "loss": 0.3381099998950958,
- "mean_token_accuracy": 0.8968075513839722,
- "num_tokens": 22252355.0,
- "step": 2492
- },
- {
- "epoch": 1.8943768996960486,
- "grad_norm": 2.6498866081237793,
- "learning_rate": 1.6403590737822378e-06,
- "loss": 0.36339250206947327,
- "mean_token_accuracy": 0.8633373379707336,
- "num_tokens": 22257407.0,
- "step": 2493
- },
- {
- "epoch": 1.8951367781155015,
- "grad_norm": 2.634241819381714,
- "learning_rate": 1.6383926920637077e-06,
- "loss": 0.2562698721885681,
- "mean_token_accuracy": 0.8999600410461426,
- "num_tokens": 22261858.0,
- "step": 2494
- },
- {
- "epoch": 1.8958966565349544,
- "grad_norm": 2.0163333415985107,
- "learning_rate": 1.6364269150543533e-06,
- "loss": 0.3413389027118683,
- "mean_token_accuracy": 0.8718398809432983,
- "num_tokens": 22268517.0,
- "step": 2495
- },
- {
- "epoch": 1.8966565349544073,
- "grad_norm": 2.8333005905151367,
- "learning_rate": 1.6344617441338311e-06,
- "loss": 0.4354540705680847,
- "mean_token_accuracy": 0.8491238355636597,
- "num_tokens": 22273648.0,
- "step": 2496
- },
- {
- "epoch": 1.8974164133738602,
- "grad_norm": 1.6280957460403442,
- "learning_rate": 1.6324971806813766e-06,
- "loss": 0.3015792965888977,
- "mean_token_accuracy": 0.8937206268310547,
- "num_tokens": 22282521.0,
- "step": 2497
- },
- {
- "epoch": 1.898176291793313,
- "grad_norm": 1.2246302366256714,
- "learning_rate": 1.6305332260757937e-06,
- "loss": 0.26619502902030945,
- "mean_token_accuracy": 0.8886681199073792,
- "num_tokens": 22295179.0,
- "step": 2498
- },
- {
- "epoch": 1.898936170212766,
- "grad_norm": 2.4014432430267334,
- "learning_rate": 1.6285698816954626e-06,
- "loss": 0.3735058903694153,
- "mean_token_accuracy": 0.8693109750747681,
- "num_tokens": 22300681.0,
- "step": 2499
- },
- {
- "epoch": 1.8996960486322187,
- "grad_norm": 1.4447300434112549,
- "learning_rate": 1.6266071489183327e-06,
- "loss": 0.40768876671791077,
- "mean_token_accuracy": 0.8556059002876282,
- "num_tokens": 22312442.0,
- "step": 2500
- },
- {
- "epoch": 1.9004559270516719,
- "grad_norm": 2.1339821815490723,
- "learning_rate": 1.6246450291219268e-06,
- "loss": 0.33442017436027527,
- "mean_token_accuracy": 0.8837105631828308,
- "num_tokens": 22318779.0,
- "step": 2501
- },
- {
- "epoch": 1.9012158054711246,
- "grad_norm": 2.8564913272857666,
- "learning_rate": 1.6226835236833356e-06,
- "loss": 0.36013197898864746,
- "mean_token_accuracy": 0.8810569047927856,
- "num_tokens": 22323390.0,
- "step": 2502
- },
- {
- "epoch": 1.9019756838905775,
- "grad_norm": 2.1201915740966797,
- "learning_rate": 1.620722633979219e-06,
- "loss": 0.4587489664554596,
- "mean_token_accuracy": 0.8517274856567383,
- "num_tokens": 22330275.0,
- "step": 2503
- },
- {
- "epoch": 1.9027355623100304,
- "grad_norm": 2.211402177810669,
- "learning_rate": 1.6187623613858038e-06,
- "loss": 0.3698349595069885,
- "mean_token_accuracy": 0.8768182992935181,
- "num_tokens": 22336041.0,
- "step": 2504
- },
- {
- "epoch": 1.9034954407294833,
- "grad_norm": 1.421604871749878,
- "learning_rate": 1.6168027072788868e-06,
- "loss": 0.38086453080177307,
- "mean_token_accuracy": 0.8622198104858398,
- "num_tokens": 22349310.0,
- "step": 2505
- },
- {
- "epoch": 1.9042553191489362,
- "grad_norm": 2.4304113388061523,
- "learning_rate": 1.6148436730338279e-06,
- "loss": 0.34694477915763855,
- "mean_token_accuracy": 0.8833136558532715,
- "num_tokens": 22355069.0,
- "step": 2506
- },
- {
- "epoch": 1.905015197568389,
- "grad_norm": 2.1076772212982178,
- "learning_rate": 1.6128852600255518e-06,
- "loss": 0.4973800778388977,
- "mean_token_accuracy": 0.851190984249115,
- "num_tokens": 22362402.0,
- "step": 2507
- },
- {
- "epoch": 1.905775075987842,
- "grad_norm": 3.0934200286865234,
- "learning_rate": 1.6109274696285496e-06,
- "loss": 0.46498024463653564,
- "mean_token_accuracy": 0.8436626195907593,
- "num_tokens": 22367390.0,
- "step": 2508
- },
- {
- "epoch": 1.9065349544072947,
- "grad_norm": 2.0114359855651855,
- "learning_rate": 1.6089703032168736e-06,
- "loss": 0.45143815875053406,
- "mean_token_accuracy": 0.852748692035675,
- "num_tokens": 22377032.0,
- "step": 2509
- },
- {
- "epoch": 1.9072948328267478,
- "grad_norm": 1.8780893087387085,
- "learning_rate": 1.6070137621641382e-06,
- "loss": 0.3977179527282715,
- "mean_token_accuracy": 0.8556262850761414,
- "num_tokens": 22386880.0,
- "step": 2510
- },
- {
- "epoch": 1.9080547112462005,
- "grad_norm": 1.6748069524765015,
- "learning_rate": 1.6050578478435184e-06,
- "loss": 0.35590440034866333,
- "mean_token_accuracy": 0.8702141046524048,
- "num_tokens": 22396616.0,
- "step": 2511
- },
- {
- "epoch": 1.9088145896656536,
- "grad_norm": 0.9799401760101318,
- "learning_rate": 1.6031025616277512e-06,
- "loss": 0.3325427770614624,
- "mean_token_accuracy": 0.8771291971206665,
- "num_tokens": 22419580.0,
- "step": 2512
- },
- {
- "epoch": 1.9095744680851063,
- "grad_norm": 1.5084866285324097,
- "learning_rate": 1.6011479048891323e-06,
- "loss": 0.44336390495300293,
- "mean_token_accuracy": 0.8786209225654602,
- "num_tokens": 22434235.0,
- "step": 2513
- },
- {
- "epoch": 1.9103343465045592,
- "grad_norm": 1.8544305562973022,
- "learning_rate": 1.5991938789995138e-06,
- "loss": 0.3055306375026703,
- "mean_token_accuracy": 0.9043174982070923,
- "num_tokens": 22442003.0,
- "step": 2514
- },
- {
- "epoch": 1.9110942249240122,
- "grad_norm": 4.29932165145874,
- "learning_rate": 1.5972404853303061e-06,
- "loss": 0.386760413646698,
- "mean_token_accuracy": 0.8914207220077515,
- "num_tokens": 22444787.0,
- "step": 2515
- },
- {
- "epoch": 1.911854103343465,
- "grad_norm": 1.7560505867004395,
- "learning_rate": 1.595287725252478e-06,
- "loss": 0.4141422510147095,
- "mean_token_accuracy": 0.862310528755188,
- "num_tokens": 22453625.0,
- "step": 2516
- },
- {
- "epoch": 1.912613981762918,
- "grad_norm": 2.685443878173828,
- "learning_rate": 1.5933356001365502e-06,
- "loss": 0.36217260360717773,
- "mean_token_accuracy": 0.868883490562439,
- "num_tokens": 22458597.0,
- "step": 2517
- },
- {
- "epoch": 1.9133738601823707,
- "grad_norm": 2.2587239742279053,
- "learning_rate": 1.591384111352599e-06,
- "loss": 0.5298880934715271,
- "mean_token_accuracy": 0.821168839931488,
- "num_tokens": 22466091.0,
- "step": 2518
- },
- {
- "epoch": 1.9141337386018238,
- "grad_norm": 2.273380756378174,
- "learning_rate": 1.5894332602702545e-06,
- "loss": 0.3194117546081543,
- "mean_token_accuracy": 0.8849239945411682,
- "num_tokens": 22471785.0,
- "step": 2519
- },
- {
- "epoch": 1.9148936170212765,
- "grad_norm": 2.314634084701538,
- "learning_rate": 1.5874830482587003e-06,
- "loss": 0.457550585269928,
- "mean_token_accuracy": 0.8367670774459839,
- "num_tokens": 22479091.0,
- "step": 2520
- },
- {
- "epoch": 1.9156534954407296,
- "grad_norm": 2.16206693649292,
- "learning_rate": 1.585533476686669e-06,
- "loss": 0.43055859208106995,
- "mean_token_accuracy": 0.8659856915473938,
- "num_tokens": 22487379.0,
- "step": 2521
- },
- {
- "epoch": 1.9164133738601823,
- "grad_norm": 2.2091798782348633,
- "learning_rate": 1.5835845469224447e-06,
- "loss": 0.45421302318573,
- "mean_token_accuracy": 0.8418087959289551,
- "num_tokens": 22493755.0,
- "step": 2522
- },
- {
- "epoch": 1.9171732522796354,
- "grad_norm": 1.6166985034942627,
- "learning_rate": 1.5816362603338632e-06,
- "loss": 0.5211667418479919,
- "mean_token_accuracy": 0.809440016746521,
- "num_tokens": 22506648.0,
- "step": 2523
- },
- {
- "epoch": 1.917933130699088,
- "grad_norm": 2.4998703002929688,
- "learning_rate": 1.5796886182883053e-06,
- "loss": 0.45915648341178894,
- "mean_token_accuracy": 0.833067774772644,
- "num_tokens": 22513216.0,
- "step": 2524
- },
- {
- "epoch": 1.918693009118541,
- "grad_norm": 1.492928147315979,
- "learning_rate": 1.577741622152702e-06,
- "loss": 0.45581498742103577,
- "mean_token_accuracy": 0.8531479835510254,
- "num_tokens": 22524908.0,
- "step": 2525
- },
- {
- "epoch": 1.919452887537994,
- "grad_norm": 2.0502207279205322,
- "learning_rate": 1.5757952732935288e-06,
- "loss": 0.4156759977340698,
- "mean_token_accuracy": 0.8677599430084229,
- "num_tokens": 22532275.0,
- "step": 2526
- },
- {
- "epoch": 1.9202127659574468,
- "grad_norm": 2.4572031497955322,
- "learning_rate": 1.5738495730768104e-06,
- "loss": 0.43373313546180725,
- "mean_token_accuracy": 0.8435516357421875,
- "num_tokens": 22538272.0,
- "step": 2527
- },
- {
- "epoch": 1.9209726443768997,
- "grad_norm": 2.071903705596924,
- "learning_rate": 1.5719045228681127e-06,
- "loss": 0.3211413621902466,
- "mean_token_accuracy": 0.87841796875,
- "num_tokens": 22545487.0,
- "step": 2528
- },
- {
- "epoch": 1.9217325227963524,
- "grad_norm": 1.6742064952850342,
- "learning_rate": 1.5699601240325474e-06,
- "loss": 0.3704240322113037,
- "mean_token_accuracy": 0.8646563291549683,
- "num_tokens": 22554840.0,
- "step": 2529
- },
- {
- "epoch": 1.9224924012158056,
- "grad_norm": 1.0941399335861206,
- "learning_rate": 1.5680163779347668e-06,
- "loss": 0.3595704436302185,
- "mean_token_accuracy": 0.8680597543716431,
- "num_tokens": 22572627.0,
- "step": 2530
- },
- {
- "epoch": 1.9232522796352582,
- "grad_norm": 2.9815237522125244,
- "learning_rate": 1.5660732859389687e-06,
- "loss": 0.2941335141658783,
- "mean_token_accuracy": 0.8847303986549377,
- "num_tokens": 22576851.0,
- "step": 2531
- },
- {
- "epoch": 1.9240121580547114,
- "grad_norm": 2.898106813430786,
- "learning_rate": 1.5641308494088903e-06,
- "loss": 0.4066317081451416,
- "mean_token_accuracy": 0.8469538688659668,
- "num_tokens": 22581431.0,
- "step": 2532
- },
- {
- "epoch": 1.924772036474164,
- "grad_norm": 1.6757515668869019,
- "learning_rate": 1.5621890697078069e-06,
- "loss": 0.33923569321632385,
- "mean_token_accuracy": 0.8790708184242249,
- "num_tokens": 22590648.0,
- "step": 2533
- },
- {
- "epoch": 1.925531914893617,
- "grad_norm": 1.747314214706421,
- "learning_rate": 1.5602479481985333e-06,
- "loss": 0.4865703582763672,
- "mean_token_accuracy": 0.8314566612243652,
- "num_tokens": 22600153.0,
- "step": 2534
- },
- {
- "epoch": 1.9262917933130699,
- "grad_norm": 2.7927849292755127,
- "learning_rate": 1.5583074862434254e-06,
- "loss": 0.335658460855484,
- "mean_token_accuracy": 0.8769067525863647,
- "num_tokens": 22604864.0,
- "step": 2535
- },
- {
- "epoch": 1.9270516717325228,
- "grad_norm": 2.2553000450134277,
- "learning_rate": 1.5563676852043738e-06,
- "loss": 0.4442562460899353,
- "mean_token_accuracy": 0.8381515145301819,
- "num_tokens": 22611102.0,
- "step": 2536
- },
- {
- "epoch": 1.9278115501519757,
- "grad_norm": 1.1937638521194458,
- "learning_rate": 1.5544285464428044e-06,
- "loss": 0.38608425855636597,
- "mean_token_accuracy": 0.8589644432067871,
- "num_tokens": 22627781.0,
- "step": 2537
- },
- {
- "epoch": 1.9285714285714286,
- "grad_norm": 3.282639980316162,
- "learning_rate": 1.55249007131968e-06,
- "loss": 0.31231993436813354,
- "mean_token_accuracy": 0.8917703032493591,
- "num_tokens": 22632341.0,
- "step": 2538
- },
- {
- "epoch": 1.9293313069908815,
- "grad_norm": 2.3212976455688477,
- "learning_rate": 1.5505522611954977e-06,
- "loss": 0.34952571988105774,
- "mean_token_accuracy": 0.8752106428146362,
- "num_tokens": 22638572.0,
- "step": 2539
- },
- {
- "epoch": 1.9300911854103342,
- "grad_norm": 1.389098882675171,
- "learning_rate": 1.548615117430286e-06,
- "loss": 0.4298851788043976,
- "mean_token_accuracy": 0.871698260307312,
- "num_tokens": 22651875.0,
- "step": 2540
- },
- {
- "epoch": 1.9308510638297873,
- "grad_norm": 1.5333977937698364,
- "learning_rate": 1.5466786413836077e-06,
- "loss": 0.45540744066238403,
- "mean_token_accuracy": 0.8409075736999512,
- "num_tokens": 22662903.0,
- "step": 2541
- },
- {
- "epoch": 1.93161094224924,
- "grad_norm": 1.7833251953125,
- "learning_rate": 1.5447428344145565e-06,
- "loss": 0.333247572183609,
- "mean_token_accuracy": 0.8796100616455078,
- "num_tokens": 22671125.0,
- "step": 2542
- },
- {
- "epoch": 1.9323708206686931,
- "grad_norm": 1.5165303945541382,
- "learning_rate": 1.5428076978817564e-06,
- "loss": 0.3085063099861145,
- "mean_token_accuracy": 0.888705849647522,
- "num_tokens": 22681482.0,
- "step": 2543
- },
- {
- "epoch": 1.9331306990881458,
- "grad_norm": 2.3556196689605713,
- "learning_rate": 1.5408732331433596e-06,
- "loss": 0.44008776545524597,
- "mean_token_accuracy": 0.8578170537948608,
- "num_tokens": 22686952.0,
- "step": 2544
- },
- {
- "epoch": 1.9338905775075987,
- "grad_norm": 2.9572882652282715,
- "learning_rate": 1.538939441557048e-06,
- "loss": 0.3779261112213135,
- "mean_token_accuracy": 0.8657241463661194,
- "num_tokens": 22691211.0,
- "step": 2545
- },
- {
- "epoch": 1.9346504559270516,
- "grad_norm": 2.373473644256592,
- "learning_rate": 1.5370063244800326e-06,
- "loss": 0.4113072454929352,
- "mean_token_accuracy": 0.872116208076477,
- "num_tokens": 22697442.0,
- "step": 2546
- },
- {
- "epoch": 1.9354103343465046,
- "grad_norm": 2.270207643508911,
- "learning_rate": 1.5350738832690479e-06,
- "loss": 0.4021070897579193,
- "mean_token_accuracy": 0.8750372529029846,
- "num_tokens": 22703693.0,
- "step": 2547
- },
- {
- "epoch": 1.9361702127659575,
- "grad_norm": 2.429445266723633,
- "learning_rate": 1.5331421192803565e-06,
- "loss": 0.40210235118865967,
- "mean_token_accuracy": 0.8593704104423523,
- "num_tokens": 22709285.0,
- "step": 2548
- },
- {
- "epoch": 1.9369300911854104,
- "grad_norm": 1.4576458930969238,
- "learning_rate": 1.5312110338697427e-06,
- "loss": 0.44822201132774353,
- "mean_token_accuracy": 0.8737322688102722,
- "num_tokens": 22723743.0,
- "step": 2549
- },
- {
- "epoch": 1.9376899696048633,
- "grad_norm": 2.1008098125457764,
- "learning_rate": 1.5292806283925192e-06,
- "loss": 0.3514235019683838,
- "mean_token_accuracy": 0.8689005374908447,
- "num_tokens": 22730135.0,
- "step": 2550
- },
- {
- "epoch": 1.938449848024316,
- "grad_norm": 1.9786806106567383,
- "learning_rate": 1.5273509042035172e-06,
- "loss": 0.4483771324157715,
- "mean_token_accuracy": 0.8353633880615234,
- "num_tokens": 22738717.0,
- "step": 2551
- },
- {
- "epoch": 1.939209726443769,
- "grad_norm": 1.0649693012237549,
- "learning_rate": 1.5254218626570927e-06,
- "loss": 0.30712205171585083,
- "mean_token_accuracy": 0.8802675008773804,
- "num_tokens": 22757346.0,
- "step": 2552
- },
- {
- "epoch": 1.9399696048632218,
- "grad_norm": 3.0401108264923096,
- "learning_rate": 1.5234935051071193e-06,
- "loss": 0.5213959217071533,
- "mean_token_accuracy": 0.8249514102935791,
- "num_tokens": 22762169.0,
- "step": 2553
- },
- {
- "epoch": 1.940729483282675,
- "grad_norm": 2.892486572265625,
- "learning_rate": 1.521565832906994e-06,
- "loss": 0.5694394111633301,
- "mean_token_accuracy": 0.8139263391494751,
- "num_tokens": 22767824.0,
- "step": 2554
- },
- {
- "epoch": 1.9414893617021276,
- "grad_norm": 1.6187207698822021,
- "learning_rate": 1.519638847409632e-06,
- "loss": 0.46748271584510803,
- "mean_token_accuracy": 0.8541051149368286,
- "num_tokens": 22778195.0,
- "step": 2555
- },
- {
- "epoch": 1.9422492401215805,
- "grad_norm": 1.3857731819152832,
- "learning_rate": 1.5177125499674639e-06,
- "loss": 0.35661786794662476,
- "mean_token_accuracy": 0.8711516857147217,
- "num_tokens": 22792353.0,
- "step": 2556
- },
- {
- "epoch": 1.9430091185410334,
- "grad_norm": 1.108441710472107,
- "learning_rate": 1.515786941932441e-06,
- "loss": 0.3537200391292572,
- "mean_token_accuracy": 0.8739079833030701,
- "num_tokens": 22813185.0,
- "step": 2557
- },
- {
- "epoch": 1.9437689969604863,
- "grad_norm": 2.0528404712677,
- "learning_rate": 1.5138620246560295e-06,
- "loss": 0.4161028265953064,
- "mean_token_accuracy": 0.8385938405990601,
- "num_tokens": 22821227.0,
- "step": 2558
- },
- {
- "epoch": 1.9445288753799392,
- "grad_norm": 1.5123628377914429,
- "learning_rate": 1.5119377994892095e-06,
- "loss": 0.4420986473560333,
- "mean_token_accuracy": 0.8664361834526062,
- "num_tokens": 22835064.0,
- "step": 2559
- },
- {
- "epoch": 1.9452887537993921,
- "grad_norm": 2.5354838371276855,
- "learning_rate": 1.5100142677824752e-06,
- "loss": 0.3837323784828186,
- "mean_token_accuracy": 0.8607655763626099,
- "num_tokens": 22840455.0,
- "step": 2560
- },
- {
- "epoch": 1.946048632218845,
- "grad_norm": 1.1354057788848877,
- "learning_rate": 1.5080914308858375e-06,
- "loss": 0.39776813983917236,
- "mean_token_accuracy": 0.8586497902870178,
- "num_tokens": 22858828.0,
- "step": 2561
- },
- {
- "epoch": 1.9468085106382977,
- "grad_norm": 1.576740026473999,
- "learning_rate": 1.5061692901488161e-06,
- "loss": 0.3167848289012909,
- "mean_token_accuracy": 0.8876185417175293,
- "num_tokens": 22868674.0,
- "step": 2562
- },
- {
- "epoch": 1.9475683890577509,
- "grad_norm": 1.4835401773452759,
- "learning_rate": 1.5042478469204437e-06,
- "loss": 0.44950318336486816,
- "mean_token_accuracy": 0.8526639342308044,
- "num_tokens": 22883019.0,
- "step": 2563
- },
- {
- "epoch": 1.9483282674772036,
- "grad_norm": 1.617073655128479,
- "learning_rate": 1.502327102549262e-06,
- "loss": 0.45711010694503784,
- "mean_token_accuracy": 0.834361732006073,
- "num_tokens": 22896834.0,
- "step": 2564
- },
- {
- "epoch": 1.9490881458966567,
- "grad_norm": 1.3348414897918701,
- "learning_rate": 1.5004070583833252e-06,
- "loss": 0.3691314458847046,
- "mean_token_accuracy": 0.8779371380805969,
- "num_tokens": 22912350.0,
- "step": 2565
- },
- {
- "epoch": 1.9498480243161094,
- "grad_norm": 1.711234450340271,
- "learning_rate": 1.4984877157701932e-06,
- "loss": 0.38726937770843506,
- "mean_token_accuracy": 0.8704015016555786,
- "num_tokens": 22922575.0,
- "step": 2566
- },
- {
- "epoch": 1.9506079027355623,
- "grad_norm": 2.4587950706481934,
- "learning_rate": 1.4965690760569346e-06,
- "loss": 0.4455464482307434,
- "mean_token_accuracy": 0.8481032252311707,
- "num_tokens": 22928717.0,
- "step": 2567
- },
- {
- "epoch": 1.9513677811550152,
- "grad_norm": 2.4189560413360596,
- "learning_rate": 1.4946511405901237e-06,
- "loss": 0.4120418429374695,
- "mean_token_accuracy": 0.8519487380981445,
- "num_tokens": 22934977.0,
- "step": 2568
- },
- {
- "epoch": 1.952127659574468,
- "grad_norm": 1.2503050565719604,
- "learning_rate": 1.4927339107158437e-06,
- "loss": 0.4434332251548767,
- "mean_token_accuracy": 0.8448144793510437,
- "num_tokens": 22950061.0,
- "step": 2569
- },
- {
- "epoch": 1.952887537993921,
- "grad_norm": 1.788493275642395,
- "learning_rate": 1.4908173877796784e-06,
- "loss": 0.49203023314476013,
- "mean_token_accuracy": 0.8601495623588562,
- "num_tokens": 22961838.0,
- "step": 2570
- },
- {
- "epoch": 1.9536474164133737,
- "grad_norm": 1.4260050058364868,
- "learning_rate": 1.4889015731267186e-06,
- "loss": 0.3286570906639099,
- "mean_token_accuracy": 0.882429838180542,
- "num_tokens": 22973192.0,
- "step": 2571
- },
- {
- "epoch": 1.9544072948328268,
- "grad_norm": 1.6754822731018066,
- "learning_rate": 1.486986468101555e-06,
- "loss": 0.34655290842056274,
- "mean_token_accuracy": 0.8807861804962158,
- "num_tokens": 22983661.0,
- "step": 2572
- },
- {
- "epoch": 1.9551671732522795,
- "grad_norm": 1.9064570665359497,
- "learning_rate": 1.4850720740482842e-06,
- "loss": 0.34020254015922546,
- "mean_token_accuracy": 0.86677086353302,
- "num_tokens": 22991231.0,
- "step": 2573
- },
- {
- "epoch": 1.9559270516717326,
- "grad_norm": 1.977444052696228,
- "learning_rate": 1.4831583923105e-06,
- "loss": 0.21505260467529297,
- "mean_token_accuracy": 0.921241819858551,
- "num_tokens": 22996828.0,
- "step": 2574
- },
- {
- "epoch": 1.9566869300911853,
- "grad_norm": 1.1019235849380493,
- "learning_rate": 1.481245424231298e-06,
- "loss": 0.3804295063018799,
- "mean_token_accuracy": 0.8582668900489807,
- "num_tokens": 23016018.0,
- "step": 2575
- },
- {
- "epoch": 1.9574468085106385,
- "grad_norm": 1.7943179607391357,
- "learning_rate": 1.4793331711532743e-06,
- "loss": 0.38565245270729065,
- "mean_token_accuracy": 0.8599048256874084,
- "num_tokens": 23024461.0,
- "step": 2576
- },
- {
- "epoch": 1.9582066869300911,
- "grad_norm": 2.273824453353882,
- "learning_rate": 1.4774216344185204e-06,
- "loss": 0.46297723054885864,
- "mean_token_accuracy": 0.8294345140457153,
- "num_tokens": 23031687.0,
- "step": 2577
- },
- {
- "epoch": 1.958966565349544,
- "grad_norm": 2.308509111404419,
- "learning_rate": 1.4755108153686275e-06,
- "loss": 0.4366525411605835,
- "mean_token_accuracy": 0.8515903949737549,
- "num_tokens": 23037072.0,
- "step": 2578
- },
- {
- "epoch": 1.959726443768997,
- "grad_norm": 2.069028377532959,
- "learning_rate": 1.4736007153446803e-06,
- "loss": 0.33900877833366394,
- "mean_token_accuracy": 0.8937177658081055,
- "num_tokens": 23043207.0,
- "step": 2579
- },
- {
- "epoch": 1.9604863221884499,
- "grad_norm": 2.905163288116455,
- "learning_rate": 1.4716913356872614e-06,
- "loss": 0.3708382844924927,
- "mean_token_accuracy": 0.8936747312545776,
- "num_tokens": 23047020.0,
- "step": 2580
- },
- {
- "epoch": 1.9612462006079028,
- "grad_norm": 2.4153175354003906,
- "learning_rate": 1.4697826777364478e-06,
- "loss": 0.473562091588974,
- "mean_token_accuracy": 0.8350275158882141,
- "num_tokens": 23053282.0,
- "step": 2581
- },
- {
- "epoch": 1.9620060790273555,
- "grad_norm": 2.21589994430542,
- "learning_rate": 1.467874742831808e-06,
- "loss": 0.3812660276889801,
- "mean_token_accuracy": 0.8623865842819214,
- "num_tokens": 23059399.0,
- "step": 2582
- },
- {
- "epoch": 1.9627659574468086,
- "grad_norm": 1.0847623348236084,
- "learning_rate": 1.4659675323124037e-06,
- "loss": 0.3846944570541382,
- "mean_token_accuracy": 0.8633466958999634,
- "num_tokens": 23081005.0,
- "step": 2583
- },
- {
- "epoch": 1.9635258358662613,
- "grad_norm": 1.8754645586013794,
- "learning_rate": 1.46406104751679e-06,
- "loss": 0.3460300862789154,
- "mean_token_accuracy": 0.8757443428039551,
- "num_tokens": 23088710.0,
- "step": 2584
- },
- {
- "epoch": 1.9642857142857144,
- "grad_norm": 2.13075852394104,
- "learning_rate": 1.462155289783011e-06,
- "loss": 0.3060935139656067,
- "mean_token_accuracy": 0.9070644378662109,
- "num_tokens": 23094862.0,
- "step": 2585
- },
- {
- "epoch": 1.965045592705167,
- "grad_norm": 2.9674458503723145,
- "learning_rate": 1.4602502604486e-06,
- "loss": 0.4464406371116638,
- "mean_token_accuracy": 0.8497441411018372,
- "num_tokens": 23099821.0,
- "step": 2586
- },
- {
- "epoch": 1.96580547112462,
- "grad_norm": 1.9171007871627808,
- "learning_rate": 1.45834596085058e-06,
- "loss": 0.3905114531517029,
- "mean_token_accuracy": 0.8564352989196777,
- "num_tokens": 23107804.0,
- "step": 2587
- },
- {
- "epoch": 1.966565349544073,
- "grad_norm": 2.0817408561706543,
- "learning_rate": 1.456442392325463e-06,
- "loss": 0.3903818130493164,
- "mean_token_accuracy": 0.8671162128448486,
- "num_tokens": 23115224.0,
- "step": 2588
- },
- {
- "epoch": 1.9673252279635258,
- "grad_norm": 2.6379549503326416,
- "learning_rate": 1.4545395562092467e-06,
- "loss": 0.22965987026691437,
- "mean_token_accuracy": 0.9160916805267334,
- "num_tokens": 23119184.0,
- "step": 2589
- },
- {
- "epoch": 1.9680851063829787,
- "grad_norm": 2.525221824645996,
- "learning_rate": 1.4526374538374133e-06,
- "loss": 0.4132574498653412,
- "mean_token_accuracy": 0.8486990332603455,
- "num_tokens": 23124679.0,
- "step": 2590
- },
- {
- "epoch": 1.9688449848024316,
- "grad_norm": 2.0362391471862793,
- "learning_rate": 1.4507360865449318e-06,
- "loss": 0.29624345898628235,
- "mean_token_accuracy": 0.888127863407135,
- "num_tokens": 23130756.0,
- "step": 2591
- },
- {
- "epoch": 1.9696048632218845,
- "grad_norm": 1.5150481462478638,
- "learning_rate": 1.4488354556662553e-06,
- "loss": 0.3852264881134033,
- "mean_token_accuracy": 0.8532775640487671,
- "num_tokens": 23141597.0,
- "step": 2592
- },
- {
- "epoch": 1.9703647416413372,
- "grad_norm": 1.5255193710327148,
- "learning_rate": 1.4469355625353199e-06,
- "loss": 0.37015780806541443,
- "mean_token_accuracy": 0.8669752478599548,
- "num_tokens": 23152487.0,
- "step": 2593
- },
- {
- "epoch": 1.9711246200607904,
- "grad_norm": 1.1780041456222534,
- "learning_rate": 1.4450364084855433e-06,
- "loss": 0.34421291947364807,
- "mean_token_accuracy": 0.8593694567680359,
- "num_tokens": 23168769.0,
- "step": 2594
- },
- {
- "epoch": 1.971884498480243,
- "grad_norm": 2.4549946784973145,
- "learning_rate": 1.4431379948498254e-06,
- "loss": 0.4000544548034668,
- "mean_token_accuracy": 0.8551953434944153,
- "num_tokens": 23175428.0,
- "step": 2595
- },
- {
- "epoch": 1.9726443768996962,
- "grad_norm": 2.374192476272583,
- "learning_rate": 1.4412403229605453e-06,
- "loss": 0.31329840421676636,
- "mean_token_accuracy": 0.8917277455329895,
- "num_tokens": 23180678.0,
- "step": 2596
- },
- {
- "epoch": 1.9734042553191489,
- "grad_norm": 1.268515706062317,
- "learning_rate": 1.4393433941495638e-06,
- "loss": 0.34808623790740967,
- "mean_token_accuracy": 0.8726245164871216,
- "num_tokens": 23194733.0,
- "step": 2597
- },
- {
- "epoch": 1.9741641337386018,
- "grad_norm": 2.0898988246917725,
- "learning_rate": 1.4374472097482156e-06,
- "loss": 0.45849233865737915,
- "mean_token_accuracy": 0.8414266109466553,
- "num_tokens": 23202211.0,
- "step": 2598
- },
- {
- "epoch": 1.9749240121580547,
- "grad_norm": 2.1497802734375,
- "learning_rate": 1.4355517710873184e-06,
- "loss": 0.4304521977901459,
- "mean_token_accuracy": 0.8502874374389648,
- "num_tokens": 23209623.0,
- "step": 2599
- },
- {
- "epoch": 1.9756838905775076,
- "grad_norm": 1.821786880493164,
- "learning_rate": 1.4336570794971643e-06,
- "loss": 0.3910462558269501,
- "mean_token_accuracy": 0.8962477445602417,
- "num_tokens": 23218904.0,
- "step": 2600
- },
- {
- "epoch": 1.9764437689969605,
- "grad_norm": 2.2523093223571777,
- "learning_rate": 1.4317631363075186e-06,
- "loss": 0.3456020951271057,
- "mean_token_accuracy": 0.8703117370605469,
- "num_tokens": 23225602.0,
- "step": 2601
- },
- {
- "epoch": 1.9772036474164134,
- "grad_norm": 1.6920030117034912,
- "learning_rate": 1.4298699428476236e-06,
- "loss": 0.4629668593406677,
- "mean_token_accuracy": 0.841956615447998,
- "num_tokens": 23236812.0,
- "step": 2602
- },
- {
- "epoch": 1.9779635258358663,
- "grad_norm": 1.8796344995498657,
- "learning_rate": 1.427977500446199e-06,
- "loss": 0.3302173316478729,
- "mean_token_accuracy": 0.8769404888153076,
- "num_tokens": 23245851.0,
- "step": 2603
- },
- {
- "epoch": 1.978723404255319,
- "grad_norm": 2.4003775119781494,
- "learning_rate": 1.4260858104314299e-06,
- "loss": 0.48402607440948486,
- "mean_token_accuracy": 0.8477497100830078,
- "num_tokens": 23252429.0,
- "step": 2604
- },
- {
- "epoch": 1.9794832826747721,
- "grad_norm": 3.576800584793091,
- "learning_rate": 1.4241948741309783e-06,
- "loss": 0.2943669259548187,
- "mean_token_accuracy": 0.8933546543121338,
- "num_tokens": 23255431.0,
- "step": 2605
- },
- {
- "epoch": 1.9802431610942248,
- "grad_norm": 2.7589938640594482,
- "learning_rate": 1.4223046928719764e-06,
- "loss": 0.5138746500015259,
- "mean_token_accuracy": 0.817468523979187,
- "num_tokens": 23261351.0,
- "step": 2606
- },
- {
- "epoch": 1.981003039513678,
- "grad_norm": 1.6950130462646484,
- "learning_rate": 1.420415267981026e-06,
- "loss": 0.2744991183280945,
- "mean_token_accuracy": 0.9005721211433411,
- "num_tokens": 23269482.0,
- "step": 2607
- },
- {
- "epoch": 1.9817629179331306,
- "grad_norm": 1.5962934494018555,
- "learning_rate": 1.418526600784198e-06,
- "loss": 0.4629114270210266,
- "mean_token_accuracy": 0.8337699174880981,
- "num_tokens": 23279796.0,
- "step": 2608
- },
- {
- "epoch": 1.9825227963525835,
- "grad_norm": 1.4962197542190552,
- "learning_rate": 1.4166386926070322e-06,
- "loss": 0.4217689633369446,
- "mean_token_accuracy": 0.8445580005645752,
- "num_tokens": 23293050.0,
- "step": 2609
- },
- {
- "epoch": 1.9832826747720365,
- "grad_norm": 1.4243721961975098,
- "learning_rate": 1.414751544774535e-06,
- "loss": 0.4888152480125427,
- "mean_token_accuracy": 0.8298524022102356,
- "num_tokens": 23308501.0,
- "step": 2610
- },
- {
- "epoch": 1.9840425531914894,
- "grad_norm": 1.5776121616363525,
- "learning_rate": 1.412865158611179e-06,
- "loss": 0.3156965970993042,
- "mean_token_accuracy": 0.8773540258407593,
- "num_tokens": 23317401.0,
- "step": 2611
- },
- {
- "epoch": 1.9848024316109423,
- "grad_norm": 1.4690552949905396,
- "learning_rate": 1.4109795354409045e-06,
- "loss": 0.35854774713516235,
- "mean_token_accuracy": 0.869156002998352,
- "num_tokens": 23328891.0,
- "step": 2612
- },
- {
- "epoch": 1.9855623100303952,
- "grad_norm": 1.5036180019378662,
- "learning_rate": 1.4090946765871105e-06,
- "loss": 0.3579009771347046,
- "mean_token_accuracy": 0.8698509931564331,
- "num_tokens": 23340473.0,
- "step": 2613
- },
- {
- "epoch": 1.986322188449848,
- "grad_norm": 2.0811538696289062,
- "learning_rate": 1.4072105833726685e-06,
- "loss": 0.2905905246734619,
- "mean_token_accuracy": 0.9131759405136108,
- "num_tokens": 23346480.0,
- "step": 2614
- },
- {
- "epoch": 1.9870820668693008,
- "grad_norm": 1.2866275310516357,
- "learning_rate": 1.4053272571199037e-06,
- "loss": 0.4091147184371948,
- "mean_token_accuracy": 0.8537255525588989,
- "num_tokens": 23361957.0,
- "step": 2615
- },
- {
- "epoch": 1.987841945288754,
- "grad_norm": 1.439497470855713,
- "learning_rate": 1.4034446991506084e-06,
- "loss": 0.4888972342014313,
- "mean_token_accuracy": 0.8451695442199707,
- "num_tokens": 23374936.0,
- "step": 2616
- },
- {
- "epoch": 1.9886018237082066,
- "grad_norm": 1.758204698562622,
- "learning_rate": 1.401562910786034e-06,
- "loss": 0.4976118803024292,
- "mean_token_accuracy": 0.8346713781356812,
- "num_tokens": 23386102.0,
- "step": 2617
- },
- {
- "epoch": 1.9893617021276597,
- "grad_norm": 1.436486840248108,
- "learning_rate": 1.3996818933468926e-06,
- "loss": 0.42407113313674927,
- "mean_token_accuracy": 0.8529444932937622,
- "num_tokens": 23398645.0,
- "step": 2618
- },
- {
- "epoch": 1.9901215805471124,
- "grad_norm": 2.1466588973999023,
- "learning_rate": 1.397801648153354e-06,
- "loss": 0.45519331097602844,
- "mean_token_accuracy": 0.8460411429405212,
- "num_tokens": 23406162.0,
- "step": 2619
- },
- {
- "epoch": 1.9908814589665653,
- "grad_norm": 2.0492005348205566,
- "learning_rate": 1.395922176525047e-06,
- "loss": 0.31093084812164307,
- "mean_token_accuracy": 0.8927264213562012,
- "num_tokens": 23412051.0,
- "step": 2620
- },
- {
- "epoch": 1.9916413373860182,
- "grad_norm": 2.2639048099517822,
- "learning_rate": 1.3940434797810567e-06,
- "loss": 0.3804079592227936,
- "mean_token_accuracy": 0.8720212578773499,
- "num_tokens": 23418252.0,
- "step": 2621
- },
- {
- "epoch": 1.9924012158054711,
- "grad_norm": 1.9541687965393066,
- "learning_rate": 1.3921655592399256e-06,
- "loss": 0.38776344060897827,
- "mean_token_accuracy": 0.858753502368927,
- "num_tokens": 23425901.0,
- "step": 2622
- },
- {
- "epoch": 1.993161094224924,
- "grad_norm": 1.5119032859802246,
- "learning_rate": 1.3902884162196509e-06,
- "loss": 0.39581215381622314,
- "mean_token_accuracy": 0.8539663553237915,
- "num_tokens": 23439390.0,
- "step": 2623
- },
- {
- "epoch": 1.993920972644377,
- "grad_norm": 2.1608591079711914,
- "learning_rate": 1.388412052037682e-06,
- "loss": 0.41801220178604126,
- "mean_token_accuracy": 0.8703387975692749,
- "num_tokens": 23445725.0,
- "step": 2624
- },
- {
- "epoch": 1.9946808510638299,
- "grad_norm": 2.463165521621704,
- "learning_rate": 1.3865364680109239e-06,
- "loss": 0.3252835273742676,
- "mean_token_accuracy": 0.9031686186790466,
- "num_tokens": 23451122.0,
- "step": 2625
- },
- {
- "epoch": 1.9954407294832825,
- "grad_norm": 1.1901201009750366,
- "learning_rate": 1.384661665455736e-06,
- "loss": 0.3358447253704071,
- "mean_token_accuracy": 0.8767676949501038,
- "num_tokens": 23467381.0,
- "step": 2626
- },
- {
- "epoch": 1.9962006079027357,
- "grad_norm": 1.3035757541656494,
- "learning_rate": 1.3827876456879247e-06,
- "loss": 0.3736562430858612,
- "mean_token_accuracy": 0.849855899810791,
- "num_tokens": 23482192.0,
- "step": 2627
- },
- {
- "epoch": 1.9969604863221884,
- "grad_norm": 1.8807034492492676,
- "learning_rate": 1.3809144100227483e-06,
- "loss": 0.45943766832351685,
- "mean_token_accuracy": 0.8456380367279053,
- "num_tokens": 23495167.0,
- "step": 2628
- },
- {
- "epoch": 1.9977203647416415,
- "grad_norm": 2.3645784854888916,
- "learning_rate": 1.3790419597749198e-06,
- "loss": 0.4271511435508728,
- "mean_token_accuracy": 0.846099853515625,
- "num_tokens": 23500790.0,
- "step": 2629
- },
- {
- "epoch": 1.9984802431610942,
- "grad_norm": 1.8451792001724243,
- "learning_rate": 1.3771702962585928e-06,
- "loss": 0.38092344999313354,
- "mean_token_accuracy": 0.8641276359558105,
- "num_tokens": 23508845.0,
- "step": 2630
- },
- {
- "epoch": 1.999240121580547,
- "grad_norm": 1.1115045547485352,
- "learning_rate": 1.3752994207873743e-06,
- "loss": 0.35954269766807556,
- "mean_token_accuracy": 0.8642125129699707,
- "num_tokens": 23527929.0,
- "step": 2631
- },
- {
- "epoch": 2.0,
- "grad_norm": 1.406253457069397,
- "learning_rate": 1.373429334674317e-06,
- "loss": 0.33467042446136475,
- "mean_token_accuracy": 0.8713197708129883,
- "num_tokens": 23539356.0,
- "step": 2632
- },
- {
- "epoch": 2.0007598784194527,
- "grad_norm": 2.8150978088378906,
- "learning_rate": 1.3715600392319186e-06,
- "loss": 0.22929656505584717,
- "mean_token_accuracy": 0.9197485446929932,
- "num_tokens": 23543746.0,
- "step": 2633
- },
- {
- "epoch": 2.001519756838906,
- "grad_norm": 2.6291964054107666,
- "learning_rate": 1.369691535772123e-06,
- "loss": 0.290000855922699,
- "mean_token_accuracy": 0.8979663848876953,
- "num_tokens": 23548633.0,
- "step": 2634
- },
- {
- "epoch": 2.0022796352583585,
- "grad_norm": 1.724357008934021,
- "learning_rate": 1.3678238256063193e-06,
- "loss": 0.3717018663883209,
- "mean_token_accuracy": 0.8743406534194946,
- "num_tokens": 23557187.0,
- "step": 2635
- },
- {
- "epoch": 2.0030395136778116,
- "grad_norm": 2.3801965713500977,
- "learning_rate": 1.3659569100453346e-06,
- "loss": 0.3452329635620117,
- "mean_token_accuracy": 0.8799462914466858,
- "num_tokens": 23563321.0,
- "step": 2636
- },
- {
- "epoch": 2.0037993920972643,
- "grad_norm": 1.8925955295562744,
- "learning_rate": 1.3640907903994455e-06,
- "loss": 0.32880955934524536,
- "mean_token_accuracy": 0.888347864151001,
- "num_tokens": 23570571.0,
- "step": 2637
- },
- {
- "epoch": 2.0045592705167175,
- "grad_norm": 1.0761849880218506,
- "learning_rate": 1.3622254679783665e-06,
- "loss": 0.395224004983902,
- "mean_token_accuracy": 0.8637001514434814,
- "num_tokens": 23589504.0,
- "step": 2638
- },
- {
- "epoch": 2.00531914893617,
- "grad_norm": 2.1172127723693848,
- "learning_rate": 1.3603609440912508e-06,
- "loss": 0.32195356488227844,
- "mean_token_accuracy": 0.8984324932098389,
- "num_tokens": 23595586.0,
- "step": 2639
- },
- {
- "epoch": 2.0060790273556233,
- "grad_norm": 2.127723217010498,
- "learning_rate": 1.3584972200466936e-06,
- "loss": 0.4710606634616852,
- "mean_token_accuracy": 0.8563182950019836,
- "num_tokens": 23602747.0,
- "step": 2640
- },
- {
- "epoch": 2.006838905775076,
- "grad_norm": 1.9752192497253418,
- "learning_rate": 1.356634297152729e-06,
- "loss": 0.24204617738723755,
- "mean_token_accuracy": 0.9082983136177063,
- "num_tokens": 23609005.0,
- "step": 2641
- },
- {
- "epoch": 2.007598784194529,
- "grad_norm": 2.5435397624969482,
- "learning_rate": 1.3547721767168273e-06,
- "loss": 0.16702288389205933,
- "mean_token_accuracy": 0.9353867769241333,
- "num_tokens": 23612852.0,
- "step": 2642
- },
- {
- "epoch": 2.0083586626139818,
- "grad_norm": 1.8113304376602173,
- "learning_rate": 1.3529108600458967e-06,
- "loss": 0.4245433509349823,
- "mean_token_accuracy": 0.8446527719497681,
- "num_tokens": 23621462.0,
- "step": 2643
- },
- {
- "epoch": 2.0091185410334345,
- "grad_norm": 1.0438088178634644,
- "learning_rate": 1.3510503484462807e-06,
- "loss": 0.3710743188858032,
- "mean_token_accuracy": 0.8731123208999634,
- "num_tokens": 23642029.0,
- "step": 2644
- },
- {
- "epoch": 2.0098784194528876,
- "grad_norm": 1.9650516510009766,
- "learning_rate": 1.349190643223758e-06,
- "loss": 0.32384324073791504,
- "mean_token_accuracy": 0.8859044313430786,
- "num_tokens": 23648970.0,
- "step": 2645
- },
- {
- "epoch": 2.0106382978723403,
- "grad_norm": 1.4213180541992188,
- "learning_rate": 1.347331745683542e-06,
- "loss": 0.42391857504844666,
- "mean_token_accuracy": 0.8568997383117676,
- "num_tokens": 23663012.0,
- "step": 2646
- },
- {
- "epoch": 2.0113981762917934,
- "grad_norm": 1.852386236190796,
- "learning_rate": 1.3454736571302761e-06,
- "loss": 0.37283188104629517,
- "mean_token_accuracy": 0.9096506834030151,
- "num_tokens": 23671632.0,
- "step": 2647
- },
- {
- "epoch": 2.012158054711246,
- "grad_norm": 1.8350872993469238,
- "learning_rate": 1.3436163788680411e-06,
- "loss": 0.21148793399333954,
- "mean_token_accuracy": 0.9306647181510925,
- "num_tokens": 23678554.0,
- "step": 2648
- },
- {
- "epoch": 2.012917933130699,
- "grad_norm": 1.8285188674926758,
- "learning_rate": 1.3417599122003464e-06,
- "loss": 0.2638583183288574,
- "mean_token_accuracy": 0.904695987701416,
- "num_tokens": 23686905.0,
- "step": 2649
- },
- {
- "epoch": 2.013677811550152,
- "grad_norm": 1.1955424547195435,
- "learning_rate": 1.3399042584301298e-06,
- "loss": 0.30598434805870056,
- "mean_token_accuracy": 0.8953701257705688,
- "num_tokens": 23702734.0,
- "step": 2650
- },
- {
- "epoch": 2.014437689969605,
- "grad_norm": 1.5378512144088745,
- "learning_rate": 1.3380494188597603e-06,
- "loss": 0.33754611015319824,
- "mean_token_accuracy": 0.9063926935195923,
- "num_tokens": 23715891.0,
- "step": 2651
- },
- {
- "epoch": 2.0151975683890577,
- "grad_norm": 1.6957111358642578,
- "learning_rate": 1.3361953947910394e-06,
- "loss": 0.26302939653396606,
- "mean_token_accuracy": 0.90192711353302,
- "num_tokens": 23724034.0,
- "step": 2652
- },
- {
- "epoch": 2.015957446808511,
- "grad_norm": 1.1756837368011475,
- "learning_rate": 1.334342187525189e-06,
- "loss": 0.3312695622444153,
- "mean_token_accuracy": 0.870500385761261,
- "num_tokens": 23741241.0,
- "step": 2653
- },
- {
- "epoch": 2.0167173252279635,
- "grad_norm": 1.027145266532898,
- "learning_rate": 1.3324897983628621e-06,
- "loss": 0.2534530758857727,
- "mean_token_accuracy": 0.894199550151825,
- "num_tokens": 23758399.0,
- "step": 2654
- },
- {
- "epoch": 2.0174772036474162,
- "grad_norm": 2.2585113048553467,
- "learning_rate": 1.330638228604137e-06,
- "loss": 0.4558389186859131,
- "mean_token_accuracy": 0.8372241258621216,
- "num_tokens": 23766871.0,
- "step": 2655
- },
- {
- "epoch": 2.0182370820668694,
- "grad_norm": 1.886893630027771,
- "learning_rate": 1.3287874795485168e-06,
- "loss": 0.29894912242889404,
- "mean_token_accuracy": 0.9086098670959473,
- "num_tokens": 23774935.0,
- "step": 2656
- },
- {
- "epoch": 2.018996960486322,
- "grad_norm": 2.082537889480591,
- "learning_rate": 1.3269375524949286e-06,
- "loss": 0.39323803782463074,
- "mean_token_accuracy": 0.8598287105560303,
- "num_tokens": 23781303.0,
- "step": 2657
- },
- {
- "epoch": 2.019756838905775,
- "grad_norm": 1.7059803009033203,
- "learning_rate": 1.3250884487417227e-06,
- "loss": 0.17909850180149078,
- "mean_token_accuracy": 0.9276094436645508,
- "num_tokens": 23789148.0,
- "step": 2658
- },
- {
- "epoch": 2.020516717325228,
- "grad_norm": 2.150275945663452,
- "learning_rate": 1.3232401695866686e-06,
- "loss": 0.3707781434059143,
- "mean_token_accuracy": 0.8587700128555298,
- "num_tokens": 23795484.0,
- "step": 2659
- },
- {
- "epoch": 2.021276595744681,
- "grad_norm": 2.0554518699645996,
- "learning_rate": 1.321392716326963e-06,
- "loss": 0.33217954635620117,
- "mean_token_accuracy": 0.874828577041626,
- "num_tokens": 23802968.0,
- "step": 2660
- },
- {
- "epoch": 2.0220364741641337,
- "grad_norm": 2.4556071758270264,
- "learning_rate": 1.3195460902592193e-06,
- "loss": 0.2790899872779846,
- "mean_token_accuracy": 0.9071618914604187,
- "num_tokens": 23807788.0,
- "step": 2661
- },
- {
- "epoch": 2.022796352583587,
- "grad_norm": 1.7501509189605713,
- "learning_rate": 1.3177002926794685e-06,
- "loss": 0.3080750107765198,
- "mean_token_accuracy": 0.8942672610282898,
- "num_tokens": 23816023.0,
- "step": 2662
- },
- {
- "epoch": 2.0235562310030395,
- "grad_norm": 1.3934804201126099,
- "learning_rate": 1.3158553248831658e-06,
- "loss": 0.286912202835083,
- "mean_token_accuracy": 0.9284837245941162,
- "num_tokens": 23827186.0,
- "step": 2663
- },
- {
- "epoch": 2.024316109422492,
- "grad_norm": 1.2530465126037598,
- "learning_rate": 1.3140111881651773e-06,
- "loss": 0.2630627155303955,
- "mean_token_accuracy": 0.9029854536056519,
- "num_tokens": 23841399.0,
- "step": 2664
- },
- {
- "epoch": 2.0250759878419453,
- "grad_norm": 1.3417384624481201,
- "learning_rate": 1.312167883819791e-06,
- "loss": 0.37794870138168335,
- "mean_token_accuracy": 0.8722256422042847,
- "num_tokens": 23856061.0,
- "step": 2665
- },
- {
- "epoch": 2.025835866261398,
- "grad_norm": 2.234257698059082,
- "learning_rate": 1.3103254131407082e-06,
- "loss": 0.2739933133125305,
- "mean_token_accuracy": 0.9055665135383606,
- "num_tokens": 23861865.0,
- "step": 2666
- },
- {
- "epoch": 2.026595744680851,
- "grad_norm": 1.4187006950378418,
- "learning_rate": 1.308483777421046e-06,
- "loss": 0.24370817840099335,
- "mean_token_accuracy": 0.9145886301994324,
- "num_tokens": 23873632.0,
- "step": 2667
- },
- {
- "epoch": 2.027355623100304,
- "grad_norm": 2.3645882606506348,
- "learning_rate": 1.3066429779533352e-06,
- "loss": 0.23659822344779968,
- "mean_token_accuracy": 0.9209753274917603,
- "num_tokens": 23878866.0,
- "step": 2668
- },
- {
- "epoch": 2.028115501519757,
- "grad_norm": 1.4782226085662842,
- "learning_rate": 1.3048030160295196e-06,
- "loss": 0.3353138267993927,
- "mean_token_accuracy": 0.8747807741165161,
- "num_tokens": 23891089.0,
- "step": 2669
- },
- {
- "epoch": 2.0288753799392096,
- "grad_norm": 2.051754951477051,
- "learning_rate": 1.3029638929409555e-06,
- "loss": 0.2905973196029663,
- "mean_token_accuracy": 0.887441873550415,
- "num_tokens": 23897653.0,
- "step": 2670
- },
- {
- "epoch": 2.0296352583586628,
- "grad_norm": 1.322279453277588,
- "learning_rate": 1.3011256099784103e-06,
- "loss": 0.3938416540622711,
- "mean_token_accuracy": 0.8911079168319702,
- "num_tokens": 23912525.0,
- "step": 2671
- },
- {
- "epoch": 2.0303951367781155,
- "grad_norm": 1.87980318069458,
- "learning_rate": 1.2992881684320627e-06,
- "loss": 0.16637520492076874,
- "mean_token_accuracy": 0.9472321271896362,
- "num_tokens": 23918752.0,
- "step": 2672
- },
- {
- "epoch": 2.0311550151975686,
- "grad_norm": 2.0867233276367188,
- "learning_rate": 1.297451569591498e-06,
- "loss": 0.37282776832580566,
- "mean_token_accuracy": 0.8688399195671082,
- "num_tokens": 23925918.0,
- "step": 2673
- },
- {
- "epoch": 2.0319148936170213,
- "grad_norm": 1.129468560218811,
- "learning_rate": 1.2956158147457116e-06,
- "loss": 0.33072173595428467,
- "mean_token_accuracy": 0.8788217306137085,
- "num_tokens": 23944702.0,
- "step": 2674
- },
- {
- "epoch": 2.032674772036474,
- "grad_norm": 3.6016290187835693,
- "learning_rate": 1.2937809051831102e-06,
- "loss": 0.28343498706817627,
- "mean_token_accuracy": 0.911794900894165,
- "num_tokens": 23948417.0,
- "step": 2675
- },
- {
- "epoch": 2.033434650455927,
- "grad_norm": 1.4904811382293701,
- "learning_rate": 1.2919468421915008e-06,
- "loss": 0.4072638750076294,
- "mean_token_accuracy": 0.8615934252738953,
- "num_tokens": 23963654.0,
- "step": 2676
- },
- {
- "epoch": 2.0341945288753798,
- "grad_norm": 2.90740704536438,
- "learning_rate": 1.2901136270580994e-06,
- "loss": 0.3685106635093689,
- "mean_token_accuracy": 0.8923419713973999,
- "num_tokens": 23968608.0,
- "step": 2677
- },
- {
- "epoch": 2.034954407294833,
- "grad_norm": 1.8772104978561401,
- "learning_rate": 1.2882812610695305e-06,
- "loss": 0.2947828471660614,
- "mean_token_accuracy": 0.9065762758255005,
- "num_tokens": 23978298.0,
- "step": 2678
- },
- {
- "epoch": 2.0357142857142856,
- "grad_norm": 1.2135536670684814,
- "learning_rate": 1.2864497455118152e-06,
- "loss": 0.36015012860298157,
- "mean_token_accuracy": 0.8481813073158264,
- "num_tokens": 23995784.0,
- "step": 2679
- },
- {
- "epoch": 2.0364741641337387,
- "grad_norm": 1.941889762878418,
- "learning_rate": 1.2846190816703836e-06,
- "loss": 0.3004198670387268,
- "mean_token_accuracy": 0.8843618631362915,
- "num_tokens": 24002651.0,
- "step": 2680
- },
- {
- "epoch": 2.0372340425531914,
- "grad_norm": 1.8905075788497925,
- "learning_rate": 1.2827892708300648e-06,
- "loss": 0.26640570163726807,
- "mean_token_accuracy": 0.9079146385192871,
- "num_tokens": 24010400.0,
- "step": 2681
- },
- {
- "epoch": 2.0379939209726445,
- "grad_norm": 1.2975934743881226,
- "learning_rate": 1.280960314275092e-06,
- "loss": 0.19093887507915497,
- "mean_token_accuracy": 0.9277223348617554,
- "num_tokens": 24021528.0,
- "step": 2682
- },
- {
- "epoch": 2.038753799392097,
- "grad_norm": 1.6483098268508911,
- "learning_rate": 1.279132213289096e-06,
- "loss": 0.29260069131851196,
- "mean_token_accuracy": 0.892486572265625,
- "num_tokens": 24030470.0,
- "step": 2683
- },
- {
- "epoch": 2.0395136778115504,
- "grad_norm": 1.6875916719436646,
- "learning_rate": 1.2773049691551103e-06,
- "loss": 0.3784627914428711,
- "mean_token_accuracy": 0.8682783842086792,
- "num_tokens": 24041608.0,
- "step": 2684
- },
- {
- "epoch": 2.040273556231003,
- "grad_norm": 2.1055848598480225,
- "learning_rate": 1.2754785831555617e-06,
- "loss": 0.14676237106323242,
- "mean_token_accuracy": 0.9532995223999023,
- "num_tokens": 24046687.0,
- "step": 2685
- },
- {
- "epoch": 2.0410334346504557,
- "grad_norm": 1.3862961530685425,
- "learning_rate": 1.273653056572282e-06,
- "loss": 0.34408485889434814,
- "mean_token_accuracy": 0.8748919367790222,
- "num_tokens": 24059147.0,
- "step": 2686
- },
- {
- "epoch": 2.041793313069909,
- "grad_norm": 2.936876058578491,
- "learning_rate": 1.2718283906864939e-06,
- "loss": 0.2471027672290802,
- "mean_token_accuracy": 0.9177526235580444,
- "num_tokens": 24062963.0,
- "step": 2687
- },
- {
- "epoch": 2.0425531914893615,
- "grad_norm": 1.3992520570755005,
- "learning_rate": 1.2700045867788184e-06,
- "loss": 0.421109139919281,
- "mean_token_accuracy": 0.8664785623550415,
- "num_tokens": 24077912.0,
- "step": 2688
- },
- {
- "epoch": 2.0433130699088147,
- "grad_norm": 3.0531985759735107,
- "learning_rate": 1.2681816461292715e-06,
- "loss": 0.292591392993927,
- "mean_token_accuracy": 0.8992351293563843,
- "num_tokens": 24082058.0,
- "step": 2689
- },
- {
- "epoch": 2.0440729483282674,
- "grad_norm": 1.4562251567840576,
- "learning_rate": 1.2663595700172631e-06,
- "loss": 0.39367130398750305,
- "mean_token_accuracy": 0.8894597887992859,
- "num_tokens": 24093954.0,
- "step": 2690
- },
- {
- "epoch": 2.0448328267477205,
- "grad_norm": 1.9354028701782227,
- "learning_rate": 1.2645383597215965e-06,
- "loss": 0.28203579783439636,
- "mean_token_accuracy": 0.9011955261230469,
- "num_tokens": 24100590.0,
- "step": 2691
- },
- {
- "epoch": 2.045592705167173,
- "grad_norm": 1.5010690689086914,
- "learning_rate": 1.2627180165204671e-06,
- "loss": 0.3463609516620636,
- "mean_token_accuracy": 0.8978298306465149,
- "num_tokens": 24111104.0,
- "step": 2692
- },
- {
- "epoch": 2.0463525835866263,
- "grad_norm": 2.585813045501709,
- "learning_rate": 1.2608985416914616e-06,
- "loss": 0.2142711877822876,
- "mean_token_accuracy": 0.9260460138320923,
- "num_tokens": 24115301.0,
- "step": 2693
- },
- {
- "epoch": 2.047112462006079,
- "grad_norm": 2.317268133163452,
- "learning_rate": 1.259079936511558e-06,
- "loss": 0.14454546570777893,
- "mean_token_accuracy": 0.9498077034950256,
- "num_tokens": 24120295.0,
- "step": 2694
- },
- {
- "epoch": 2.047872340425532,
- "grad_norm": 1.966550350189209,
- "learning_rate": 1.257262202257124e-06,
- "loss": 0.20745311677455902,
- "mean_token_accuracy": 0.9157166481018066,
- "num_tokens": 24127158.0,
- "step": 2695
- },
- {
- "epoch": 2.048632218844985,
- "grad_norm": 1.6521401405334473,
- "learning_rate": 1.2554453402039124e-06,
- "loss": 0.2547406256198883,
- "mean_token_accuracy": 0.9356101751327515,
- "num_tokens": 24135620.0,
- "step": 2696
- },
- {
- "epoch": 2.0493920972644375,
- "grad_norm": 2.341756582260132,
- "learning_rate": 1.2536293516270704e-06,
- "loss": 0.35540008544921875,
- "mean_token_accuracy": 0.874363899230957,
- "num_tokens": 24141766.0,
- "step": 2697
- },
- {
- "epoch": 2.0501519756838906,
- "grad_norm": 1.7938716411590576,
- "learning_rate": 1.251814237801128e-06,
- "loss": 0.37250861525535583,
- "mean_token_accuracy": 0.8644422292709351,
- "num_tokens": 24149997.0,
- "step": 2698
- },
- {
- "epoch": 2.0509118541033433,
- "grad_norm": 2.0868122577667236,
- "learning_rate": 1.2500000000000007e-06,
- "loss": 0.44527092576026917,
- "mean_token_accuracy": 0.8510264158248901,
- "num_tokens": 24158208.0,
- "step": 2699
- },
- {
- "epoch": 2.0516717325227964,
- "grad_norm": 2.412604808807373,
- "learning_rate": 1.24818663949699e-06,
- "loss": 0.19276219606399536,
- "mean_token_accuracy": 0.9317681789398193,
- "num_tokens": 24162905.0,
- "step": 2700
- },
- {
- "epoch": 2.052431610942249,
- "grad_norm": 1.4488455057144165,
- "learning_rate": 1.246374157564785e-06,
- "loss": 0.3493705093860626,
- "mean_token_accuracy": 0.9016396999359131,
- "num_tokens": 24175852.0,
- "step": 2701
- },
- {
- "epoch": 2.0531914893617023,
- "grad_norm": 2.1629185676574707,
- "learning_rate": 1.2445625554754526e-06,
- "loss": 0.30588388442993164,
- "mean_token_accuracy": 0.8871392011642456,
- "num_tokens": 24181507.0,
- "step": 2702
- },
- {
- "epoch": 2.053951367781155,
- "grad_norm": 2.0489449501037598,
- "learning_rate": 1.2427518345004459e-06,
- "loss": 0.4578161835670471,
- "mean_token_accuracy": 0.8498104214668274,
- "num_tokens": 24191918.0,
- "step": 2703
- },
- {
- "epoch": 2.054711246200608,
- "grad_norm": 2.063019037246704,
- "learning_rate": 1.2409419959105981e-06,
- "loss": 0.31680572032928467,
- "mean_token_accuracy": 0.8809083700180054,
- "num_tokens": 24199336.0,
- "step": 2704
- },
- {
- "epoch": 2.0554711246200608,
- "grad_norm": 2.4594223499298096,
- "learning_rate": 1.239133040976124e-06,
- "loss": 0.3048282265663147,
- "mean_token_accuracy": 0.8897095322608948,
- "num_tokens": 24205118.0,
- "step": 2705
- },
- {
- "epoch": 2.056231003039514,
- "grad_norm": 1.6359999179840088,
- "learning_rate": 1.237324970966618e-06,
- "loss": 0.4312370717525482,
- "mean_token_accuracy": 0.8526142835617065,
- "num_tokens": 24215792.0,
- "step": 2706
- },
- {
- "epoch": 2.0569908814589666,
- "grad_norm": 1.5534536838531494,
- "learning_rate": 1.2355177871510538e-06,
- "loss": 0.3647908568382263,
- "mean_token_accuracy": 0.8680631518363953,
- "num_tokens": 24235325.0,
- "step": 2707
- },
- {
- "epoch": 2.0577507598784193,
- "grad_norm": 2.4902515411376953,
- "learning_rate": 1.2337114907977798e-06,
- "loss": 0.3605276942253113,
- "mean_token_accuracy": 0.8776376843452454,
- "num_tokens": 24241502.0,
- "step": 2708
- },
- {
- "epoch": 2.0585106382978724,
- "grad_norm": 1.7282993793487549,
- "learning_rate": 1.2319060831745273e-06,
- "loss": 0.38326722383499146,
- "mean_token_accuracy": 0.8531644344329834,
- "num_tokens": 24252665.0,
- "step": 2709
- },
- {
- "epoch": 2.059270516717325,
- "grad_norm": 1.4213361740112305,
- "learning_rate": 1.2301015655484006e-06,
- "loss": 0.32221150398254395,
- "mean_token_accuracy": 0.8890664577484131,
- "num_tokens": 24266409.0,
- "step": 2710
- },
- {
- "epoch": 2.060030395136778,
- "grad_norm": 2.6412453651428223,
- "learning_rate": 1.2282979391858767e-06,
- "loss": 0.20225220918655396,
- "mean_token_accuracy": 0.9287782311439514,
- "num_tokens": 24271069.0,
- "step": 2711
- },
- {
- "epoch": 2.060790273556231,
- "grad_norm": 3.2601654529571533,
- "learning_rate": 1.2264952053528145e-06,
- "loss": 0.23259003460407257,
- "mean_token_accuracy": 0.9290606379508972,
- "num_tokens": 24274992.0,
- "step": 2712
- },
- {
- "epoch": 2.061550151975684,
- "grad_norm": 1.6633410453796387,
- "learning_rate": 1.2246933653144386e-06,
- "loss": 0.355314165353775,
- "mean_token_accuracy": 0.870380163192749,
- "num_tokens": 24284917.0,
- "step": 2713
- },
- {
- "epoch": 2.0623100303951367,
- "grad_norm": 2.9081318378448486,
- "learning_rate": 1.2228924203353507e-06,
- "loss": 0.38050833344459534,
- "mean_token_accuracy": 0.8879997730255127,
- "num_tokens": 24289694.0,
- "step": 2714
- },
- {
- "epoch": 2.06306990881459,
- "grad_norm": 3.2404227256774902,
- "learning_rate": 1.2210923716795233e-06,
- "loss": 0.2502570152282715,
- "mean_token_accuracy": 0.9150978922843933,
- "num_tokens": 24293254.0,
- "step": 2715
- },
- {
- "epoch": 2.0638297872340425,
- "grad_norm": 1.9262174367904663,
- "learning_rate": 1.2192932206103e-06,
- "loss": 0.26763200759887695,
- "mean_token_accuracy": 0.9203122854232788,
- "num_tokens": 24300881.0,
- "step": 2716
- },
- {
- "epoch": 2.0645896656534957,
- "grad_norm": 1.6790109872817993,
- "learning_rate": 1.2174949683903943e-06,
- "loss": 0.22275440394878387,
- "mean_token_accuracy": 0.9212621450424194,
- "num_tokens": 24309288.0,
- "step": 2717
- },
- {
- "epoch": 2.0653495440729484,
- "grad_norm": 1.8272414207458496,
- "learning_rate": 1.2156976162818895e-06,
- "loss": 0.3183424472808838,
- "mean_token_accuracy": 0.8813169002532959,
- "num_tokens": 24316980.0,
- "step": 2718
- },
- {
- "epoch": 2.066109422492401,
- "grad_norm": 2.7388651371002197,
- "learning_rate": 1.2139011655462338e-06,
- "loss": 0.24794816970825195,
- "mean_token_accuracy": 0.9109550714492798,
- "num_tokens": 24321867.0,
- "step": 2719
- },
- {
- "epoch": 2.066869300911854,
- "grad_norm": 1.4866925477981567,
- "learning_rate": 1.2121056174442484e-06,
- "loss": 0.24177205562591553,
- "mean_token_accuracy": 0.9102780818939209,
- "num_tokens": 24332874.0,
- "step": 2720
- },
- {
- "epoch": 2.067629179331307,
- "grad_norm": 1.6006059646606445,
- "learning_rate": 1.2103109732361178e-06,
- "loss": 0.29220807552337646,
- "mean_token_accuracy": 0.8947570323944092,
- "num_tokens": 24342790.0,
- "step": 2721
- },
- {
- "epoch": 2.06838905775076,
- "grad_norm": 2.2688677310943604,
- "learning_rate": 1.208517234181391e-06,
- "loss": 0.39247143268585205,
- "mean_token_accuracy": 0.8514304161071777,
- "num_tokens": 24349329.0,
- "step": 2722
- },
- {
- "epoch": 2.0691489361702127,
- "grad_norm": 2.404534339904785,
- "learning_rate": 1.2067244015389829e-06,
- "loss": 0.4461793303489685,
- "mean_token_accuracy": 0.8531662821769714,
- "num_tokens": 24356287.0,
- "step": 2723
- },
- {
- "epoch": 2.069908814589666,
- "grad_norm": 1.813341498374939,
- "learning_rate": 1.204932476567175e-06,
- "loss": 0.38300177454948425,
- "mean_token_accuracy": 0.8597674369812012,
- "num_tokens": 24366181.0,
- "step": 2724
- },
- {
- "epoch": 2.0706686930091185,
- "grad_norm": 3.49125337600708,
- "learning_rate": 1.2031414605236066e-06,
- "loss": 0.33281540870666504,
- "mean_token_accuracy": 0.8774969577789307,
- "num_tokens": 24370362.0,
- "step": 2725
- },
- {
- "epoch": 2.0714285714285716,
- "grad_norm": 1.7682114839553833,
- "learning_rate": 1.2013513546652827e-06,
- "loss": 0.3001813590526581,
- "mean_token_accuracy": 0.8840254545211792,
- "num_tokens": 24380469.0,
- "step": 2726
- },
- {
- "epoch": 2.0721884498480243,
- "grad_norm": 2.3688952922821045,
- "learning_rate": 1.1995621602485685e-06,
- "loss": 0.20055249333381653,
- "mean_token_accuracy": 0.9246129989624023,
- "num_tokens": 24385474.0,
- "step": 2727
- },
- {
- "epoch": 2.072948328267477,
- "grad_norm": 2.3368382453918457,
- "learning_rate": 1.1977738785291894e-06,
- "loss": 0.18379954993724823,
- "mean_token_accuracy": 0.9385529160499573,
- "num_tokens": 24390002.0,
- "step": 2728
- },
- {
- "epoch": 2.07370820668693,
- "grad_norm": 1.857473373413086,
- "learning_rate": 1.1959865107622306e-06,
- "loss": 0.4606894552707672,
- "mean_token_accuracy": 0.8437427282333374,
- "num_tokens": 24400880.0,
- "step": 2729
- },
- {
- "epoch": 2.074468085106383,
- "grad_norm": 1.2714136838912964,
- "learning_rate": 1.1942000582021355e-06,
- "loss": 0.21171459555625916,
- "mean_token_accuracy": 0.9216019511222839,
- "num_tokens": 24413113.0,
- "step": 2730
- },
- {
- "epoch": 2.075227963525836,
- "grad_norm": 2.2025210857391357,
- "learning_rate": 1.1924145221027048e-06,
- "loss": 0.44211941957473755,
- "mean_token_accuracy": 0.8538386821746826,
- "num_tokens": 24420504.0,
- "step": 2731
- },
- {
- "epoch": 2.0759878419452886,
- "grad_norm": 1.6706589460372925,
- "learning_rate": 1.190629903717097e-06,
- "loss": 0.35163265466690063,
- "mean_token_accuracy": 0.8716240525245667,
- "num_tokens": 24430203.0,
- "step": 2732
- },
- {
- "epoch": 2.0767477203647418,
- "grad_norm": 2.299182176589966,
- "learning_rate": 1.1888462042978268e-06,
- "loss": 0.30983975529670715,
- "mean_token_accuracy": 0.8859797716140747,
- "num_tokens": 24437387.0,
- "step": 2733
- },
- {
- "epoch": 2.0775075987841944,
- "grad_norm": 2.975123167037964,
- "learning_rate": 1.1870634250967606e-06,
- "loss": 0.23585952818393707,
- "mean_token_accuracy": 0.9167368412017822,
- "num_tokens": 24441176.0,
- "step": 2734
- },
- {
- "epoch": 2.0782674772036476,
- "grad_norm": 1.1052464246749878,
- "learning_rate": 1.1852815673651246e-06,
- "loss": 0.24136316776275635,
- "mean_token_accuracy": 0.8897353410720825,
- "num_tokens": 24457092.0,
- "step": 2735
- },
- {
- "epoch": 2.0790273556231003,
- "grad_norm": 1.5531870126724243,
- "learning_rate": 1.1835006323534926e-06,
- "loss": 0.302223265171051,
- "mean_token_accuracy": 0.8940514326095581,
- "num_tokens": 24467643.0,
- "step": 2736
- },
- {
- "epoch": 2.0797872340425534,
- "grad_norm": 1.706140398979187,
- "learning_rate": 1.1817206213117943e-06,
- "loss": 0.39235255122184753,
- "mean_token_accuracy": 0.8615218997001648,
- "num_tokens": 24477715.0,
- "step": 2737
- },
- {
- "epoch": 2.080547112462006,
- "grad_norm": 2.1109750270843506,
- "learning_rate": 1.1799415354893103e-06,
- "loss": 0.2526751756668091,
- "mean_token_accuracy": 0.9108465909957886,
- "num_tokens": 24484248.0,
- "step": 2738
- },
- {
- "epoch": 2.0813069908814588,
- "grad_norm": 1.9943277835845947,
- "learning_rate": 1.178163376134671e-06,
- "loss": 0.3540172874927521,
- "mean_token_accuracy": 0.9131139516830444,
- "num_tokens": 24492207.0,
- "step": 2739
- },
- {
- "epoch": 2.082066869300912,
- "grad_norm": 1.9536099433898926,
- "learning_rate": 1.1763861444958573e-06,
- "loss": 0.3902950584888458,
- "mean_token_accuracy": 0.8611530065536499,
- "num_tokens": 24501567.0,
- "step": 2740
- },
- {
- "epoch": 2.0828267477203646,
- "grad_norm": 3.146925926208496,
- "learning_rate": 1.1746098418201987e-06,
- "loss": 0.43440669775009155,
- "mean_token_accuracy": 0.8709320425987244,
- "num_tokens": 24506684.0,
- "step": 2741
- },
- {
- "epoch": 2.0835866261398177,
- "grad_norm": 2.763427495956421,
- "learning_rate": 1.172834469354373e-06,
- "loss": 0.3513452410697937,
- "mean_token_accuracy": 0.8774256110191345,
- "num_tokens": 24511509.0,
- "step": 2742
- },
- {
- "epoch": 2.0843465045592704,
- "grad_norm": 2.773829221725464,
- "learning_rate": 1.1710600283444048e-06,
- "loss": 0.24668049812316895,
- "mean_token_accuracy": 0.9146889448165894,
- "num_tokens": 24516030.0,
- "step": 2743
- },
- {
- "epoch": 2.0851063829787235,
- "grad_norm": 1.666471242904663,
- "learning_rate": 1.169286520035666e-06,
- "loss": 0.36206915974617004,
- "mean_token_accuracy": 0.8711973428726196,
- "num_tokens": 24526656.0,
- "step": 2744
- },
- {
- "epoch": 2.085866261398176,
- "grad_norm": 2.818890333175659,
- "learning_rate": 1.1675139456728702e-06,
- "loss": 0.32967281341552734,
- "mean_token_accuracy": 0.880983829498291,
- "num_tokens": 24531625.0,
- "step": 2745
- },
- {
- "epoch": 2.0866261398176293,
- "grad_norm": 1.09058678150177,
- "learning_rate": 1.1657423065000811e-06,
- "loss": 0.36224377155303955,
- "mean_token_accuracy": 0.8708326816558838,
- "num_tokens": 24557123.0,
- "step": 2746
- },
- {
- "epoch": 2.087386018237082,
- "grad_norm": 1.1434987783432007,
- "learning_rate": 1.1639716037607036e-06,
- "loss": 0.26490458846092224,
- "mean_token_accuracy": 0.9131897687911987,
- "num_tokens": 24573223.0,
- "step": 2747
- },
- {
- "epoch": 2.088145896656535,
- "grad_norm": 2.437505006790161,
- "learning_rate": 1.1622018386974829e-06,
- "loss": 0.18964408338069916,
- "mean_token_accuracy": 0.9271818399429321,
- "num_tokens": 24578306.0,
- "step": 2748
- },
- {
- "epoch": 2.088905775075988,
- "grad_norm": 1.797308325767517,
- "learning_rate": 1.160433012552508e-06,
- "loss": 0.3090781569480896,
- "mean_token_accuracy": 0.8960750102996826,
- "num_tokens": 24587562.0,
- "step": 2749
- },
- {
- "epoch": 2.0896656534954405,
- "grad_norm": 2.4050841331481934,
- "learning_rate": 1.1586651265672122e-06,
- "loss": 0.4001041054725647,
- "mean_token_accuracy": 0.8588370084762573,
- "num_tokens": 24594223.0,
- "step": 2750
- },
- {
- "epoch": 2.0904255319148937,
- "grad_norm": 1.8757156133651733,
- "learning_rate": 1.1568981819823636e-06,
- "loss": 0.37845075130462646,
- "mean_token_accuracy": 0.866146445274353,
- "num_tokens": 24602556.0,
- "step": 2751
- },
- {
- "epoch": 2.0911854103343464,
- "grad_norm": 1.8205114603042603,
- "learning_rate": 1.1551321800380722e-06,
- "loss": 0.24738016724586487,
- "mean_token_accuracy": 0.923284113407135,
- "num_tokens": 24611627.0,
- "step": 2752
- },
- {
- "epoch": 2.0919452887537995,
- "grad_norm": 2.107512950897217,
- "learning_rate": 1.153367121973786e-06,
- "loss": 0.3062688410282135,
- "mean_token_accuracy": 0.8909003734588623,
- "num_tokens": 24619569.0,
- "step": 2753
- },
- {
- "epoch": 2.092705167173252,
- "grad_norm": 1.93110191822052,
- "learning_rate": 1.1516030090282915e-06,
- "loss": 0.38658422231674194,
- "mean_token_accuracy": 0.869437038898468,
- "num_tokens": 24628869.0,
- "step": 2754
- },
- {
- "epoch": 2.0934650455927053,
- "grad_norm": 2.3618004322052,
- "learning_rate": 1.1498398424397106e-06,
- "loss": 0.19193072617053986,
- "mean_token_accuracy": 0.9329519271850586,
- "num_tokens": 24633724.0,
- "step": 2755
- },
- {
- "epoch": 2.094224924012158,
- "grad_norm": 2.274510622024536,
- "learning_rate": 1.1480776234455024e-06,
- "loss": 0.24939998984336853,
- "mean_token_accuracy": 0.9104958772659302,
- "num_tokens": 24642762.0,
- "step": 2756
- },
- {
- "epoch": 2.094984802431611,
- "grad_norm": 1.7468934059143066,
- "learning_rate": 1.1463163532824572e-06,
- "loss": 0.3876607418060303,
- "mean_token_accuracy": 0.8540539145469666,
- "num_tokens": 24652138.0,
- "step": 2757
- },
- {
- "epoch": 2.095744680851064,
- "grad_norm": 2.905381441116333,
- "learning_rate": 1.1445560331867054e-06,
- "loss": 0.33666878938674927,
- "mean_token_accuracy": 0.8805598616600037,
- "num_tokens": 24656612.0,
- "step": 2758
- },
- {
- "epoch": 2.096504559270517,
- "grad_norm": 1.5513007640838623,
- "learning_rate": 1.142796664393707e-06,
- "loss": 0.25168463587760925,
- "mean_token_accuracy": 0.925534725189209,
- "num_tokens": 24667132.0,
- "step": 2759
- },
- {
- "epoch": 2.0972644376899696,
- "grad_norm": 1.6804249286651611,
- "learning_rate": 1.141038248138253e-06,
- "loss": 0.3862859010696411,
- "mean_token_accuracy": 0.8686253428459167,
- "num_tokens": 24679274.0,
- "step": 2760
- },
- {
- "epoch": 2.0980243161094223,
- "grad_norm": 1.7432880401611328,
- "learning_rate": 1.1392807856544682e-06,
- "loss": 0.3200700879096985,
- "mean_token_accuracy": 0.9188123941421509,
- "num_tokens": 24688628.0,
- "step": 2761
- },
- {
- "epoch": 2.0987841945288754,
- "grad_norm": 1.8734468221664429,
- "learning_rate": 1.1375242781758077e-06,
- "loss": 0.34758424758911133,
- "mean_token_accuracy": 0.8724187016487122,
- "num_tokens": 24698159.0,
- "step": 2762
- },
- {
- "epoch": 2.099544072948328,
- "grad_norm": 3.7156829833984375,
- "learning_rate": 1.1357687269350564e-06,
- "loss": 0.30014732480049133,
- "mean_token_accuracy": 0.9021577835083008,
- "num_tokens": 24701797.0,
- "step": 2763
- },
- {
- "epoch": 2.1003039513677813,
- "grad_norm": 1.5196985006332397,
- "learning_rate": 1.1340141331643276e-06,
- "loss": 0.45747464895248413,
- "mean_token_accuracy": 0.839891791343689,
- "num_tokens": 24716468.0,
- "step": 2764
- },
- {
- "epoch": 2.101063829787234,
- "grad_norm": 1.978009581565857,
- "learning_rate": 1.132260498095062e-06,
- "loss": 0.3130183815956116,
- "mean_token_accuracy": 0.90610271692276,
- "num_tokens": 24723211.0,
- "step": 2765
- },
- {
- "epoch": 2.101823708206687,
- "grad_norm": 1.5883251428604126,
- "learning_rate": 1.1305078229580294e-06,
- "loss": 0.30493029952049255,
- "mean_token_accuracy": 0.8889745473861694,
- "num_tokens": 24733839.0,
- "step": 2766
- },
- {
- "epoch": 2.1025835866261398,
- "grad_norm": 1.2397783994674683,
- "learning_rate": 1.128756108983325e-06,
- "loss": 0.2606407105922699,
- "mean_token_accuracy": 0.9061247110366821,
- "num_tokens": 24747488.0,
- "step": 2767
- },
- {
- "epoch": 2.103343465045593,
- "grad_norm": 1.3046784400939941,
- "learning_rate": 1.1270053574003658e-06,
- "loss": 0.38750404119491577,
- "mean_token_accuracy": 0.8777017593383789,
- "num_tokens": 24763893.0,
- "step": 2768
- },
- {
- "epoch": 2.1041033434650456,
- "grad_norm": 1.499266266822815,
- "learning_rate": 1.1252555694379005e-06,
- "loss": 0.4804937243461609,
- "mean_token_accuracy": 0.8344086408615112,
- "num_tokens": 24779323.0,
- "step": 2769
- },
- {
- "epoch": 2.1048632218844983,
- "grad_norm": 1.211094856262207,
- "learning_rate": 1.123506746323997e-06,
- "loss": 0.3579246997833252,
- "mean_token_accuracy": 0.8705919981002808,
- "num_tokens": 24794965.0,
- "step": 2770
- },
- {
- "epoch": 2.1056231003039514,
- "grad_norm": 2.490551471710205,
- "learning_rate": 1.1217588892860446e-06,
- "loss": 0.4084790349006653,
- "mean_token_accuracy": 0.8553222417831421,
- "num_tokens": 24800614.0,
- "step": 2771
- },
- {
- "epoch": 2.106382978723404,
- "grad_norm": 1.5249632596969604,
- "learning_rate": 1.1200119995507572e-06,
- "loss": 0.36853182315826416,
- "mean_token_accuracy": 0.8847414255142212,
- "num_tokens": 24812886.0,
- "step": 2772
- },
- {
- "epoch": 2.107142857142857,
- "grad_norm": 1.8510968685150146,
- "learning_rate": 1.1182660783441719e-06,
- "loss": 0.2918103337287903,
- "mean_token_accuracy": 0.8898224830627441,
- "num_tokens": 24821545.0,
- "step": 2773
- },
- {
- "epoch": 2.10790273556231,
- "grad_norm": 1.7721803188323975,
- "learning_rate": 1.11652112689164e-06,
- "loss": 0.2920452654361725,
- "mean_token_accuracy": 0.8879085779190063,
- "num_tokens": 24831526.0,
- "step": 2774
- },
- {
- "epoch": 2.108662613981763,
- "grad_norm": 1.3987336158752441,
- "learning_rate": 1.1147771464178378e-06,
- "loss": 0.4407062828540802,
- "mean_token_accuracy": 0.8472493886947632,
- "num_tokens": 24845847.0,
- "step": 2775
- },
- {
- "epoch": 2.1094224924012157,
- "grad_norm": 1.8927375078201294,
- "learning_rate": 1.1130341381467569e-06,
- "loss": 0.36293038725852966,
- "mean_token_accuracy": 0.8881135582923889,
- "num_tokens": 24854760.0,
- "step": 2776
- },
- {
- "epoch": 2.110182370820669,
- "grad_norm": 3.0480666160583496,
- "learning_rate": 1.111292103301708e-06,
- "loss": 0.30395108461380005,
- "mean_token_accuracy": 0.9036306142807007,
- "num_tokens": 24859051.0,
- "step": 2777
- },
- {
- "epoch": 2.1109422492401215,
- "grad_norm": 1.5833618640899658,
- "learning_rate": 1.1095510431053176e-06,
- "loss": 0.26424330472946167,
- "mean_token_accuracy": 0.9020674824714661,
- "num_tokens": 24869853.0,
- "step": 2778
- },
- {
- "epoch": 2.1117021276595747,
- "grad_norm": 1.645459532737732,
- "learning_rate": 1.1078109587795311e-06,
- "loss": 0.3563994765281677,
- "mean_token_accuracy": 0.8732106685638428,
- "num_tokens": 24880184.0,
- "step": 2779
- },
- {
- "epoch": 2.1124620060790273,
- "grad_norm": 2.2964093685150146,
- "learning_rate": 1.1060718515456022e-06,
- "loss": 0.19739922881126404,
- "mean_token_accuracy": 0.9273765087127686,
- "num_tokens": 24885398.0,
- "step": 2780
- },
- {
- "epoch": 2.11322188449848,
- "grad_norm": 2.094024181365967,
- "learning_rate": 1.1043337226241075e-06,
- "loss": 0.3321923315525055,
- "mean_token_accuracy": 0.8865819573402405,
- "num_tokens": 24893908.0,
- "step": 2781
- },
- {
- "epoch": 2.113981762917933,
- "grad_norm": 1.9787025451660156,
- "learning_rate": 1.1025965732349318e-06,
- "loss": 0.37631168961524963,
- "mean_token_accuracy": 0.8808693885803223,
- "num_tokens": 24901270.0,
- "step": 2782
- },
- {
- "epoch": 2.114741641337386,
- "grad_norm": 2.376060724258423,
- "learning_rate": 1.100860404597271e-06,
- "loss": 0.2591894268989563,
- "mean_token_accuracy": 0.9174780249595642,
- "num_tokens": 24906578.0,
- "step": 2783
- },
- {
- "epoch": 2.115501519756839,
- "grad_norm": 1.0967903137207031,
- "learning_rate": 1.0991252179296389e-06,
- "loss": 0.26626938581466675,
- "mean_token_accuracy": 0.9305505752563477,
- "num_tokens": 24922329.0,
- "step": 2784
- },
- {
- "epoch": 2.1162613981762917,
- "grad_norm": 3.3701183795928955,
- "learning_rate": 1.0973910144498534e-06,
- "loss": 0.2710079848766327,
- "mean_token_accuracy": 0.9095271825790405,
- "num_tokens": 24925777.0,
- "step": 2785
- },
- {
- "epoch": 2.117021276595745,
- "grad_norm": 1.636264681816101,
- "learning_rate": 1.0956577953750461e-06,
- "loss": 0.2995981276035309,
- "mean_token_accuracy": 0.8988568782806396,
- "num_tokens": 24934230.0,
- "step": 2786
- },
- {
- "epoch": 2.1177811550151975,
- "grad_norm": 2.3107731342315674,
- "learning_rate": 1.093925561921657e-06,
- "loss": 0.3424459397792816,
- "mean_token_accuracy": 0.9100210070610046,
- "num_tokens": 24939830.0,
- "step": 2787
- },
- {
- "epoch": 2.1185410334346506,
- "grad_norm": 1.814764380455017,
- "learning_rate": 1.0921943153054343e-06,
- "loss": 0.3182154893875122,
- "mean_token_accuracy": 0.883027195930481,
- "num_tokens": 24947764.0,
- "step": 2788
- },
- {
- "epoch": 2.1193009118541033,
- "grad_norm": 1.693555235862732,
- "learning_rate": 1.0904640567414332e-06,
- "loss": 0.3685447573661804,
- "mean_token_accuracy": 0.8900846242904663,
- "num_tokens": 24957680.0,
- "step": 2789
- },
- {
- "epoch": 2.1200607902735564,
- "grad_norm": 1.0726022720336914,
- "learning_rate": 1.088734787444017e-06,
- "loss": 0.28461548686027527,
- "mean_token_accuracy": 0.9026681184768677,
- "num_tokens": 24975181.0,
- "step": 2790
- },
- {
- "epoch": 2.120820668693009,
- "grad_norm": 1.3013874292373657,
- "learning_rate": 1.0870065086268506e-06,
- "loss": 0.28222548961639404,
- "mean_token_accuracy": 0.9041857719421387,
- "num_tokens": 24993211.0,
- "step": 2791
- },
- {
- "epoch": 2.121580547112462,
- "grad_norm": 2.592106580734253,
- "learning_rate": 1.085279221502909e-06,
- "loss": 0.31733593344688416,
- "mean_token_accuracy": 0.90151047706604,
- "num_tokens": 24998151.0,
- "step": 2792
- },
- {
- "epoch": 2.122340425531915,
- "grad_norm": 2.649210214614868,
- "learning_rate": 1.0835529272844694e-06,
- "loss": 0.341595321893692,
- "mean_token_accuracy": 0.8989696502685547,
- "num_tokens": 25003399.0,
- "step": 2793
- },
- {
- "epoch": 2.1231003039513676,
- "grad_norm": 2.376619577407837,
- "learning_rate": 1.0818276271831094e-06,
- "loss": 0.2770065665245056,
- "mean_token_accuracy": 0.8967875242233276,
- "num_tokens": 25009686.0,
- "step": 2794
- },
- {
- "epoch": 2.1238601823708207,
- "grad_norm": 2.1539604663848877,
- "learning_rate": 1.080103322409711e-06,
- "loss": 0.37501147389411926,
- "mean_token_accuracy": 0.8768513202667236,
- "num_tokens": 25016339.0,
- "step": 2795
- },
- {
- "epoch": 2.1246200607902734,
- "grad_norm": 2.5727670192718506,
- "learning_rate": 1.0783800141744607e-06,
- "loss": 0.31852903962135315,
- "mean_token_accuracy": 0.8897477388381958,
- "num_tokens": 25021410.0,
- "step": 2796
- },
- {
- "epoch": 2.1253799392097266,
- "grad_norm": 2.1428916454315186,
- "learning_rate": 1.0766577036868395e-06,
- "loss": 0.2348000407218933,
- "mean_token_accuracy": 0.9012142419815063,
- "num_tokens": 25027375.0,
- "step": 2797
- },
- {
- "epoch": 2.1261398176291793,
- "grad_norm": 2.4231064319610596,
- "learning_rate": 1.074936392155631e-06,
- "loss": 0.30580806732177734,
- "mean_token_accuracy": 0.8963108658790588,
- "num_tokens": 25033211.0,
- "step": 2798
- },
- {
- "epoch": 2.1268996960486324,
- "grad_norm": 2.1027259826660156,
- "learning_rate": 1.073216080788921e-06,
- "loss": 0.2508814334869385,
- "mean_token_accuracy": 0.9095165729522705,
- "num_tokens": 25040316.0,
- "step": 2799
- },
- {
- "epoch": 2.127659574468085,
- "grad_norm": 1.6513079404830933,
- "learning_rate": 1.0714967707940876e-06,
- "loss": 0.40694183111190796,
- "mean_token_accuracy": 0.8895826935768127,
- "num_tokens": 25054978.0,
- "step": 2800
- },
- {
- "epoch": 2.128419452887538,
- "grad_norm": 2.0551133155822754,
- "learning_rate": 1.0697784633778093e-06,
- "loss": 0.3452662229537964,
- "mean_token_accuracy": 0.8710684776306152,
- "num_tokens": 25062755.0,
- "step": 2801
- },
- {
- "epoch": 2.129179331306991,
- "grad_norm": 2.1780688762664795,
- "learning_rate": 1.0680611597460607e-06,
- "loss": 0.2918209135532379,
- "mean_token_accuracy": 0.8689337968826294,
- "num_tokens": 25069453.0,
- "step": 2802
- },
- {
- "epoch": 2.1299392097264436,
- "grad_norm": 1.7905635833740234,
- "learning_rate": 1.0663448611041114e-06,
- "loss": 0.3535313308238983,
- "mean_token_accuracy": 0.8762770295143127,
- "num_tokens": 25080004.0,
- "step": 2803
- },
- {
- "epoch": 2.1306990881458967,
- "grad_norm": 1.6187241077423096,
- "learning_rate": 1.0646295686565258e-06,
- "loss": 0.3042716681957245,
- "mean_token_accuracy": 0.884156346321106,
- "num_tokens": 25089652.0,
- "step": 2804
- },
- {
- "epoch": 2.1314589665653494,
- "grad_norm": 2.667459011077881,
- "learning_rate": 1.0629152836071633e-06,
- "loss": 0.3904019892215729,
- "mean_token_accuracy": 0.8603606224060059,
- "num_tokens": 25095556.0,
- "step": 2805
- },
- {
- "epoch": 2.1322188449848025,
- "grad_norm": 1.4227970838546753,
- "learning_rate": 1.0612020071591722e-06,
- "loss": 0.3765299320220947,
- "mean_token_accuracy": 0.8655093908309937,
- "num_tokens": 25108963.0,
- "step": 2806
- },
- {
- "epoch": 2.132978723404255,
- "grad_norm": 2.262726068496704,
- "learning_rate": 1.0594897405149994e-06,
- "loss": 0.2727298140525818,
- "mean_token_accuracy": 0.9005513191223145,
- "num_tokens": 25115135.0,
- "step": 2807
- },
- {
- "epoch": 2.1337386018237083,
- "grad_norm": 2.0810186862945557,
- "learning_rate": 1.0577784848763773e-06,
- "loss": 0.4001343250274658,
- "mean_token_accuracy": 0.8537896871566772,
- "num_tokens": 25123079.0,
- "step": 2808
- },
- {
- "epoch": 2.134498480243161,
- "grad_norm": 1.6573376655578613,
- "learning_rate": 1.0560682414443315e-06,
- "loss": 0.4197486340999603,
- "mean_token_accuracy": 0.8549862504005432,
- "num_tokens": 25135398.0,
- "step": 2809
- },
- {
- "epoch": 2.135258358662614,
- "grad_norm": 2.200150489807129,
- "learning_rate": 1.0543590114191768e-06,
- "loss": 0.32026296854019165,
- "mean_token_accuracy": 0.8797904253005981,
- "num_tokens": 25141382.0,
- "step": 2810
- },
- {
- "epoch": 2.136018237082067,
- "grad_norm": 2.678558111190796,
- "learning_rate": 1.0526507960005164e-06,
- "loss": 0.30048054456710815,
- "mean_token_accuracy": 0.8849201202392578,
- "num_tokens": 25146235.0,
- "step": 2811
- },
- {
- "epoch": 2.13677811550152,
- "grad_norm": 1.5207500457763672,
- "learning_rate": 1.0509435963872422e-06,
- "loss": 0.3706427216529846,
- "mean_token_accuracy": 0.8740214109420776,
- "num_tokens": 25157108.0,
- "step": 2812
- },
- {
- "epoch": 2.1375379939209727,
- "grad_norm": 1.4632720947265625,
- "learning_rate": 1.049237413777532e-06,
- "loss": 0.27156776189804077,
- "mean_token_accuracy": 0.8950715661048889,
- "num_tokens": 25167937.0,
- "step": 2813
- },
- {
- "epoch": 2.1382978723404253,
- "grad_norm": 2.101048469543457,
- "learning_rate": 1.0475322493688506e-06,
- "loss": 0.366736501455307,
- "mean_token_accuracy": 0.8700850009918213,
- "num_tokens": 25177043.0,
- "step": 2814
- },
- {
- "epoch": 2.1390577507598785,
- "grad_norm": 2.54221248626709,
- "learning_rate": 1.0458281043579482e-06,
- "loss": 0.20383943617343903,
- "mean_token_accuracy": 0.9226665496826172,
- "num_tokens": 25182105.0,
- "step": 2815
- },
- {
- "epoch": 2.139817629179331,
- "grad_norm": 1.7742674350738525,
- "learning_rate": 1.04412497994086e-06,
- "loss": 0.26852455735206604,
- "mean_token_accuracy": 0.8987031579017639,
- "num_tokens": 25190178.0,
- "step": 2816
- },
- {
- "epoch": 2.1405775075987843,
- "grad_norm": 3.2856075763702393,
- "learning_rate": 1.0424228773129019e-06,
- "loss": 0.24643859267234802,
- "mean_token_accuracy": 0.9189155101776123,
- "num_tokens": 25194105.0,
- "step": 2817
- },
- {
- "epoch": 2.141337386018237,
- "grad_norm": 3.374311923980713,
- "learning_rate": 1.0407217976686777e-06,
- "loss": 0.2575511336326599,
- "mean_token_accuracy": 0.9143530130386353,
- "num_tokens": 25197787.0,
- "step": 2818
- },
- {
- "epoch": 2.14209726443769,
- "grad_norm": 1.4967217445373535,
- "learning_rate": 1.03902174220207e-06,
- "loss": 0.3054750859737396,
- "mean_token_accuracy": 0.8989205360412598,
- "num_tokens": 25209150.0,
- "step": 2819
- },
- {
- "epoch": 2.142857142857143,
- "grad_norm": 2.654459238052368,
- "learning_rate": 1.0373227121062423e-06,
- "loss": 0.27398061752319336,
- "mean_token_accuracy": 0.9181102514266968,
- "num_tokens": 25214015.0,
- "step": 2820
- },
- {
- "epoch": 2.143617021276596,
- "grad_norm": 1.3205828666687012,
- "learning_rate": 1.0356247085736388e-06,
- "loss": 0.4085468053817749,
- "mean_token_accuracy": 0.8745299577713013,
- "num_tokens": 25230588.0,
- "step": 2821
- },
- {
- "epoch": 2.1443768996960486,
- "grad_norm": 1.6965736150741577,
- "learning_rate": 1.0339277327959863e-06,
- "loss": 0.27269643545150757,
- "mean_token_accuracy": 0.9001271724700928,
- "num_tokens": 25239298.0,
- "step": 2822
- },
- {
- "epoch": 2.1451367781155017,
- "grad_norm": 2.789114236831665,
- "learning_rate": 1.0322317859642852e-06,
- "loss": 0.2319176197052002,
- "mean_token_accuracy": 0.9237110614776611,
- "num_tokens": 25243286.0,
- "step": 2823
- },
- {
- "epoch": 2.1458966565349544,
- "grad_norm": 1.8817718029022217,
- "learning_rate": 1.0305368692688175e-06,
- "loss": 0.2917990982532501,
- "mean_token_accuracy": 0.9211062788963318,
- "num_tokens": 25250575.0,
- "step": 2824
- },
- {
- "epoch": 2.146656534954407,
- "grad_norm": 2.1824984550476074,
- "learning_rate": 1.0288429838991405e-06,
- "loss": 0.39010798931121826,
- "mean_token_accuracy": 0.8887852430343628,
- "num_tokens": 25257947.0,
- "step": 2825
- },
- {
- "epoch": 2.1474164133738602,
- "grad_norm": 1.302579641342163,
- "learning_rate": 1.0271501310440882e-06,
- "loss": 0.3511282503604889,
- "mean_token_accuracy": 0.8728797435760498,
- "num_tokens": 25272846.0,
- "step": 2826
- },
- {
- "epoch": 2.148176291793313,
- "grad_norm": 1.691807746887207,
- "learning_rate": 1.0254583118917699e-06,
- "loss": 0.34246695041656494,
- "mean_token_accuracy": 0.8743435144424438,
- "num_tokens": 25283004.0,
- "step": 2827
- },
- {
- "epoch": 2.148936170212766,
- "grad_norm": 1.2483569383621216,
- "learning_rate": 1.0237675276295709e-06,
- "loss": 0.3346659243106842,
- "mean_token_accuracy": 0.8823951482772827,
- "num_tokens": 25297786.0,
- "step": 2828
- },
- {
- "epoch": 2.1496960486322187,
- "grad_norm": 3.7242841720581055,
- "learning_rate": 1.022077779444145e-06,
- "loss": 0.25516486167907715,
- "mean_token_accuracy": 0.9189130663871765,
- "num_tokens": 25301524.0,
- "step": 2829
- },
- {
- "epoch": 2.150455927051672,
- "grad_norm": 2.5851144790649414,
- "learning_rate": 1.020389068521426e-06,
- "loss": 0.3543069362640381,
- "mean_token_accuracy": 0.8942399621009827,
- "num_tokens": 25307277.0,
- "step": 2830
- },
- {
- "epoch": 2.1512158054711246,
- "grad_norm": 1.3453631401062012,
- "learning_rate": 1.018701396046616e-06,
- "loss": 0.2900702953338623,
- "mean_token_accuracy": 0.8847548365592957,
- "num_tokens": 25321366.0,
- "step": 2831
- },
- {
- "epoch": 2.1519756838905777,
- "grad_norm": 1.6905686855316162,
- "learning_rate": 1.0170147632041858e-06,
- "loss": 0.24844832718372345,
- "mean_token_accuracy": 0.9167388677597046,
- "num_tokens": 25328916.0,
- "step": 2832
- },
- {
- "epoch": 2.1527355623100304,
- "grad_norm": 2.6469411849975586,
- "learning_rate": 1.0153291711778825e-06,
- "loss": 0.18566903471946716,
- "mean_token_accuracy": 0.9346771836280823,
- "num_tokens": 25332871.0,
- "step": 2833
- },
- {
- "epoch": 2.1534954407294835,
- "grad_norm": 1.3880906105041504,
- "learning_rate": 1.0136446211507175e-06,
- "loss": 0.37413570284843445,
- "mean_token_accuracy": 0.8685535788536072,
- "num_tokens": 25347447.0,
- "step": 2834
- },
- {
- "epoch": 2.154255319148936,
- "grad_norm": 1.1376656293869019,
- "learning_rate": 1.0119611143049731e-06,
- "loss": 0.2844143509864807,
- "mean_token_accuracy": 0.8910006284713745,
- "num_tokens": 25365930.0,
- "step": 2835
- },
- {
- "epoch": 2.155015197568389,
- "grad_norm": 2.259666919708252,
- "learning_rate": 1.0102786518221997e-06,
- "loss": 0.3148176074028015,
- "mean_token_accuracy": 0.8851165175437927,
- "num_tokens": 25373047.0,
- "step": 2836
- },
- {
- "epoch": 2.155775075987842,
- "grad_norm": 3.304095506668091,
- "learning_rate": 1.0085972348832138e-06,
- "loss": 0.2042517364025116,
- "mean_token_accuracy": 0.9247308969497681,
- "num_tokens": 25376348.0,
- "step": 2837
- },
- {
- "epoch": 2.1565349544072947,
- "grad_norm": 1.9856120347976685,
- "learning_rate": 1.0069168646680985e-06,
- "loss": 0.3547414541244507,
- "mean_token_accuracy": 0.8941285610198975,
- "num_tokens": 25384675.0,
- "step": 2838
- },
- {
- "epoch": 2.157294832826748,
- "grad_norm": 2.8482213020324707,
- "learning_rate": 1.0052375423562038e-06,
- "loss": 0.3530133366584778,
- "mean_token_accuracy": 0.8789700269699097,
- "num_tokens": 25389631.0,
- "step": 2839
- },
- {
- "epoch": 2.1580547112462005,
- "grad_norm": 1.4270408153533936,
- "learning_rate": 1.0035592691261395e-06,
- "loss": 0.34078776836395264,
- "mean_token_accuracy": 0.8648165464401245,
- "num_tokens": 25403746.0,
- "step": 2840
- },
- {
- "epoch": 2.1588145896656536,
- "grad_norm": 0.9342723488807678,
- "learning_rate": 1.0018820461557852e-06,
- "loss": 0.2615935504436493,
- "mean_token_accuracy": 0.9082236289978027,
- "num_tokens": 25424695.0,
- "step": 2841
- },
- {
- "epoch": 2.1595744680851063,
- "grad_norm": 2.695632219314575,
- "learning_rate": 1.0002058746222807e-06,
- "loss": 0.2202145904302597,
- "mean_token_accuracy": 0.9221563339233398,
- "num_tokens": 25428783.0,
- "step": 2842
- },
- {
- "epoch": 2.1603343465045595,
- "grad_norm": 1.5679794549942017,
- "learning_rate": 9.985307557020257e-07,
- "loss": 0.24275024235248566,
- "mean_token_accuracy": 0.9363338351249695,
- "num_tokens": 25439104.0,
- "step": 2843
- },
- {
- "epoch": 2.161094224924012,
- "grad_norm": 1.5985528230667114,
- "learning_rate": 9.968566905706833e-07,
- "loss": 0.2541901171207428,
- "mean_token_accuracy": 0.9040743112564087,
- "num_tokens": 25448829.0,
- "step": 2844
- },
- {
- "epoch": 2.161854103343465,
- "grad_norm": 2.6022164821624756,
- "learning_rate": 9.951836804031795e-07,
- "loss": 0.24492180347442627,
- "mean_token_accuracy": 0.9109418392181396,
- "num_tokens": 25453902.0,
- "step": 2845
- },
- {
- "epoch": 2.162613981762918,
- "grad_norm": 1.6719969511032104,
- "learning_rate": 9.935117263736943e-07,
- "loss": 0.43255117535591125,
- "mean_token_accuracy": 0.868374228477478,
- "num_tokens": 25465538.0,
- "step": 2846
- },
- {
- "epoch": 2.1633738601823707,
- "grad_norm": 1.8284894227981567,
- "learning_rate": 9.918408296556706e-07,
- "loss": 0.32285982370376587,
- "mean_token_accuracy": 0.9016412496566772,
- "num_tokens": 25473721.0,
- "step": 2847
- },
- {
- "epoch": 2.164133738601824,
- "grad_norm": 1.4488024711608887,
- "learning_rate": 9.90170991421808e-07,
- "loss": 0.35639309883117676,
- "mean_token_accuracy": 0.8861881494522095,
- "num_tokens": 25487535.0,
- "step": 2848
- },
- {
- "epoch": 2.1648936170212765,
- "grad_norm": 2.089930534362793,
- "learning_rate": 9.88502212844063e-07,
- "loss": 0.2588546574115753,
- "mean_token_accuracy": 0.9029642939567566,
- "num_tokens": 25494567.0,
- "step": 2849
- },
- {
- "epoch": 2.1656534954407296,
- "grad_norm": 1.1274315118789673,
- "learning_rate": 9.86834495093649e-07,
- "loss": 0.37268880009651184,
- "mean_token_accuracy": 0.859347939491272,
- "num_tokens": 25518278.0,
- "step": 2850
- },
- {
- "epoch": 2.1664133738601823,
- "grad_norm": 2.3886640071868896,
- "learning_rate": 9.851678393410343e-07,
- "loss": 0.34938913583755493,
- "mean_token_accuracy": 0.8724287748336792,
- "num_tokens": 25524001.0,
- "step": 2851
- },
- {
- "epoch": 2.1671732522796354,
- "grad_norm": 2.521230459213257,
- "learning_rate": 9.83502246755942e-07,
- "loss": 0.34781408309936523,
- "mean_token_accuracy": 0.8970093131065369,
- "num_tokens": 25529982.0,
- "step": 2852
- },
- {
- "epoch": 2.167933130699088,
- "grad_norm": 2.467618942260742,
- "learning_rate": 9.818377185073493e-07,
- "loss": 0.29725387692451477,
- "mean_token_accuracy": 0.8991899490356445,
- "num_tokens": 25535356.0,
- "step": 2853
- },
- {
- "epoch": 2.1686930091185412,
- "grad_norm": 2.335873603820801,
- "learning_rate": 9.801742557634872e-07,
- "loss": 0.39603036642074585,
- "mean_token_accuracy": 0.8755916357040405,
- "num_tokens": 25542526.0,
- "step": 2854
- },
- {
- "epoch": 2.169452887537994,
- "grad_norm": 1.8388596773147583,
- "learning_rate": 9.78511859691835e-07,
- "loss": 0.3414672017097473,
- "mean_token_accuracy": 0.8951467275619507,
- "num_tokens": 25551904.0,
- "step": 2855
- },
- {
- "epoch": 2.1702127659574466,
- "grad_norm": 1.86272394657135,
- "learning_rate": 9.768505314591295e-07,
- "loss": 0.45748448371887207,
- "mean_token_accuracy": 0.8614354133605957,
- "num_tokens": 25562197.0,
- "step": 2856
- },
- {
- "epoch": 2.1709726443768997,
- "grad_norm": 1.9142264127731323,
- "learning_rate": 9.751902722313527e-07,
- "loss": 0.20877259969711304,
- "mean_token_accuracy": 0.9316688179969788,
- "num_tokens": 25569403.0,
- "step": 2857
- },
- {
- "epoch": 2.1717325227963524,
- "grad_norm": 2.1138272285461426,
- "learning_rate": 9.73531083173739e-07,
- "loss": 0.37058722972869873,
- "mean_token_accuracy": 0.8654135465621948,
- "num_tokens": 25577200.0,
- "step": 2858
- },
- {
- "epoch": 2.1724924012158056,
- "grad_norm": 1.973467469215393,
- "learning_rate": 9.718729654507713e-07,
- "loss": 0.4106993079185486,
- "mean_token_accuracy": 0.8958662152290344,
- "num_tokens": 25585694.0,
- "step": 2859
- },
- {
- "epoch": 2.1732522796352582,
- "grad_norm": 1.957513451576233,
- "learning_rate": 9.702159202261802e-07,
- "loss": 0.2067333608865738,
- "mean_token_accuracy": 0.9413473606109619,
- "num_tokens": 25591604.0,
- "step": 2860
- },
- {
- "epoch": 2.1740121580547114,
- "grad_norm": 2.7639806270599365,
- "learning_rate": 9.685599486629444e-07,
- "loss": 0.3446827232837677,
- "mean_token_accuracy": 0.8837845325469971,
- "num_tokens": 25596528.0,
- "step": 2861
- },
- {
- "epoch": 2.174772036474164,
- "grad_norm": 2.483734607696533,
- "learning_rate": 9.669050519232875e-07,
- "loss": 0.21230249106884003,
- "mean_token_accuracy": 0.9334918856620789,
- "num_tokens": 25601182.0,
- "step": 2862
- },
- {
- "epoch": 2.175531914893617,
- "grad_norm": 1.7194870710372925,
- "learning_rate": 9.65251231168681e-07,
- "loss": 0.2657586932182312,
- "mean_token_accuracy": 0.9035707712173462,
- "num_tokens": 25610561.0,
- "step": 2863
- },
- {
- "epoch": 2.17629179331307,
- "grad_norm": 2.6709611415863037,
- "learning_rate": 9.63598487559839e-07,
- "loss": 0.3673030138015747,
- "mean_token_accuracy": 0.8976202011108398,
- "num_tokens": 25615822.0,
- "step": 2864
- },
- {
- "epoch": 2.1770516717325226,
- "grad_norm": 1.6646889448165894,
- "learning_rate": 9.619468222567216e-07,
- "loss": 0.2796666622161865,
- "mean_token_accuracy": 0.8698215484619141,
- "num_tokens": 25626148.0,
- "step": 2865
- },
- {
- "epoch": 2.1778115501519757,
- "grad_norm": 1.8341799974441528,
- "learning_rate": 9.602962364185286e-07,
- "loss": 0.44835132360458374,
- "mean_token_accuracy": 0.84391850233078,
- "num_tokens": 25636305.0,
- "step": 2866
- },
- {
- "epoch": 2.1785714285714284,
- "grad_norm": 2.3579823970794678,
- "learning_rate": 9.586467312037076e-07,
- "loss": 0.2875673472881317,
- "mean_token_accuracy": 0.889403223991394,
- "num_tokens": 25642593.0,
- "step": 2867
- },
- {
- "epoch": 2.1793313069908815,
- "grad_norm": 1.1284339427947998,
- "learning_rate": 9.569983077699447e-07,
- "loss": 0.3402171730995178,
- "mean_token_accuracy": 0.8795222043991089,
- "num_tokens": 25663734.0,
- "step": 2868
- },
- {
- "epoch": 2.180091185410334,
- "grad_norm": 1.4705578088760376,
- "learning_rate": 9.553509672741646e-07,
- "loss": 0.4216107726097107,
- "mean_token_accuracy": 0.845354437828064,
- "num_tokens": 25678197.0,
- "step": 2869
- },
- {
- "epoch": 2.1808510638297873,
- "grad_norm": 2.6181085109710693,
- "learning_rate": 9.53704710872535e-07,
- "loss": 0.2777765393257141,
- "mean_token_accuracy": 0.8884872198104858,
- "num_tokens": 25683808.0,
- "step": 2870
- },
- {
- "epoch": 2.18161094224924,
- "grad_norm": 2.7285003662109375,
- "learning_rate": 9.520595397204643e-07,
- "loss": 0.33339786529541016,
- "mean_token_accuracy": 0.8892828226089478,
- "num_tokens": 25690125.0,
- "step": 2871
- },
- {
- "epoch": 2.182370820668693,
- "grad_norm": 2.200571298599243,
- "learning_rate": 9.504154549725944e-07,
- "loss": 0.46546393632888794,
- "mean_token_accuracy": 0.8389996290206909,
- "num_tokens": 25697279.0,
- "step": 2872
- },
- {
- "epoch": 2.183130699088146,
- "grad_norm": 3.491392135620117,
- "learning_rate": 9.487724577828081e-07,
- "loss": 0.17026299238204956,
- "mean_token_accuracy": 0.9410334825515747,
- "num_tokens": 25700263.0,
- "step": 2873
- },
- {
- "epoch": 2.183890577507599,
- "grad_norm": 2.7800233364105225,
- "learning_rate": 9.471305493042243e-07,
- "loss": 0.2309894859790802,
- "mean_token_accuracy": 0.9233936071395874,
- "num_tokens": 25704486.0,
- "step": 2874
- },
- {
- "epoch": 2.1846504559270516,
- "grad_norm": 2.6505582332611084,
- "learning_rate": 9.454897306891972e-07,
- "loss": 0.4378674328327179,
- "mean_token_accuracy": 0.8846660852432251,
- "num_tokens": 25710115.0,
- "step": 2875
- },
- {
- "epoch": 2.1854103343465043,
- "grad_norm": 1.5393849611282349,
- "learning_rate": 9.438500030893166e-07,
- "loss": 0.42081019282341003,
- "mean_token_accuracy": 0.8672939538955688,
- "num_tokens": 25724598.0,
- "step": 2876
- },
- {
- "epoch": 2.1861702127659575,
- "grad_norm": 1.911198377609253,
- "learning_rate": 9.422113676554073e-07,
- "loss": 0.19115394353866577,
- "mean_token_accuracy": 0.9201297163963318,
- "num_tokens": 25731040.0,
- "step": 2877
- },
- {
- "epoch": 2.18693009118541,
- "grad_norm": 1.371443748474121,
- "learning_rate": 9.405738255375243e-07,
- "loss": 0.3639947772026062,
- "mean_token_accuracy": 0.8653393983840942,
- "num_tokens": 25745335.0,
- "step": 2878
- },
- {
- "epoch": 2.1876899696048633,
- "grad_norm": 3.216238498687744,
- "learning_rate": 9.389373778849612e-07,
- "loss": 0.2623414397239685,
- "mean_token_accuracy": 0.9046015739440918,
- "num_tokens": 25749223.0,
- "step": 2879
- },
- {
- "epoch": 2.188449848024316,
- "grad_norm": 2.7558846473693848,
- "learning_rate": 9.37302025846237e-07,
- "loss": 0.31921297311782837,
- "mean_token_accuracy": 0.8903186321258545,
- "num_tokens": 25754341.0,
- "step": 2880
- },
- {
- "epoch": 2.189209726443769,
- "grad_norm": 2.06365704536438,
- "learning_rate": 9.356677705691058e-07,
- "loss": 0.357482373714447,
- "mean_token_accuracy": 0.8661626577377319,
- "num_tokens": 25761199.0,
- "step": 2881
- },
- {
- "epoch": 2.189969604863222,
- "grad_norm": 3.240328550338745,
- "learning_rate": 9.340346132005507e-07,
- "loss": 0.3157888650894165,
- "mean_token_accuracy": 0.8948285579681396,
- "num_tokens": 25765099.0,
- "step": 2882
- },
- {
- "epoch": 2.190729483282675,
- "grad_norm": 1.4671967029571533,
- "learning_rate": 9.324025548867849e-07,
- "loss": 0.32077109813690186,
- "mean_token_accuracy": 0.8813248872756958,
- "num_tokens": 25777636.0,
- "step": 2883
- },
- {
- "epoch": 2.1914893617021276,
- "grad_norm": 2.6475353240966797,
- "learning_rate": 9.307715967732492e-07,
- "loss": 0.35567623376846313,
- "mean_token_accuracy": 0.8738130331039429,
- "num_tokens": 25783737.0,
- "step": 2884
- },
- {
- "epoch": 2.1922492401215807,
- "grad_norm": 1.791491150856018,
- "learning_rate": 9.29141740004613e-07,
- "loss": 0.2556282877922058,
- "mean_token_accuracy": 0.9223519563674927,
- "num_tokens": 25792069.0,
- "step": 2885
- },
- {
- "epoch": 2.1930091185410334,
- "grad_norm": 2.3944389820098877,
- "learning_rate": 9.275129857247722e-07,
- "loss": 0.3145869970321655,
- "mean_token_accuracy": 0.8938079476356506,
- "num_tokens": 25798400.0,
- "step": 2886
- },
- {
- "epoch": 2.193768996960486,
- "grad_norm": 2.0802059173583984,
- "learning_rate": 9.258853350768499e-07,
- "loss": 0.37343069911003113,
- "mean_token_accuracy": 0.8705670833587646,
- "num_tokens": 25806567.0,
- "step": 2887
- },
- {
- "epoch": 2.1945288753799392,
- "grad_norm": 2.10831880569458,
- "learning_rate": 9.242587892031945e-07,
- "loss": 0.1989251971244812,
- "mean_token_accuracy": 0.931064248085022,
- "num_tokens": 25812715.0,
- "step": 2888
- },
- {
- "epoch": 2.195288753799392,
- "grad_norm": 2.1305530071258545,
- "learning_rate": 9.226333492453759e-07,
- "loss": 0.29377204179763794,
- "mean_token_accuracy": 0.8942701816558838,
- "num_tokens": 25819988.0,
- "step": 2889
- },
- {
- "epoch": 2.196048632218845,
- "grad_norm": 2.179025411605835,
- "learning_rate": 9.210090163441928e-07,
- "loss": 0.37565115094184875,
- "mean_token_accuracy": 0.8700202703475952,
- "num_tokens": 25827777.0,
- "step": 2890
- },
- {
- "epoch": 2.1968085106382977,
- "grad_norm": 3.177180290222168,
- "learning_rate": 9.19385791639665e-07,
- "loss": 0.16646479070186615,
- "mean_token_accuracy": 0.9426749348640442,
- "num_tokens": 25831724.0,
- "step": 2891
- },
- {
- "epoch": 2.197568389057751,
- "grad_norm": 1.103196620941162,
- "learning_rate": 9.177636762710321e-07,
- "loss": 0.29140013456344604,
- "mean_token_accuracy": 0.8789779543876648,
- "num_tokens": 25854707.0,
- "step": 2892
- },
- {
- "epoch": 2.1983282674772036,
- "grad_norm": 1.597692847251892,
- "learning_rate": 9.161426713767574e-07,
- "loss": 0.37799614667892456,
- "mean_token_accuracy": 0.8623079061508179,
- "num_tokens": 25868429.0,
- "step": 2893
- },
- {
- "epoch": 2.1990881458966567,
- "grad_norm": 2.227132558822632,
- "learning_rate": 9.145227780945265e-07,
- "loss": 0.2683261036872864,
- "mean_token_accuracy": 0.9092563390731812,
- "num_tokens": 25875367.0,
- "step": 2894
- },
- {
- "epoch": 2.1998480243161094,
- "grad_norm": 3.1229634284973145,
- "learning_rate": 9.129039975612408e-07,
- "loss": 0.21859994530677795,
- "mean_token_accuracy": 0.9187530875205994,
- "num_tokens": 25879456.0,
- "step": 2895
- },
- {
- "epoch": 2.2006079027355625,
- "grad_norm": 2.3224828243255615,
- "learning_rate": 9.112863309130235e-07,
- "loss": 0.3557605743408203,
- "mean_token_accuracy": 0.8735873103141785,
- "num_tokens": 25886477.0,
- "step": 2896
- },
- {
- "epoch": 2.201367781155015,
- "grad_norm": 1.7784863710403442,
- "learning_rate": 9.096697792852155e-07,
- "loss": 0.334577351808548,
- "mean_token_accuracy": 0.8948780298233032,
- "num_tokens": 25894977.0,
- "step": 2897
- },
- {
- "epoch": 2.202127659574468,
- "grad_norm": 2.34066104888916,
- "learning_rate": 9.080543438123746e-07,
- "loss": 0.16479721665382385,
- "mean_token_accuracy": 0.9405456781387329,
- "num_tokens": 25900015.0,
- "step": 2898
- },
- {
- "epoch": 2.202887537993921,
- "grad_norm": 1.944082498550415,
- "learning_rate": 9.064400256282757e-07,
- "loss": 0.40259572863578796,
- "mean_token_accuracy": 0.8632713556289673,
- "num_tokens": 25908749.0,
- "step": 2899
- },
- {
- "epoch": 2.2036474164133737,
- "grad_norm": 1.2758828401565552,
- "learning_rate": 9.048268258659098e-07,
- "loss": 0.3939874470233917,
- "mean_token_accuracy": 0.8652969598770142,
- "num_tokens": 25924972.0,
- "step": 2900
- },
- {
- "epoch": 2.204407294832827,
- "grad_norm": 1.4483891725540161,
- "learning_rate": 9.032147456574822e-07,
- "loss": 0.4132935404777527,
- "mean_token_accuracy": 0.868486762046814,
- "num_tokens": 25939785.0,
- "step": 2901
- },
- {
- "epoch": 2.2051671732522795,
- "grad_norm": 1.4866713285446167,
- "learning_rate": 9.01603786134413e-07,
- "loss": 0.3644951581954956,
- "mean_token_accuracy": 0.8750203847885132,
- "num_tokens": 25952648.0,
- "step": 2902
- },
- {
- "epoch": 2.2059270516717326,
- "grad_norm": 1.6555454730987549,
- "learning_rate": 8.999939484273362e-07,
- "loss": 0.48656779527664185,
- "mean_token_accuracy": 0.8372372984886169,
- "num_tokens": 25965062.0,
- "step": 2903
- },
- {
- "epoch": 2.2066869300911853,
- "grad_norm": 2.3154168128967285,
- "learning_rate": 8.983852336660959e-07,
- "loss": 0.3768891990184784,
- "mean_token_accuracy": 0.8614999055862427,
- "num_tokens": 25972152.0,
- "step": 2904
- },
- {
- "epoch": 2.2074468085106385,
- "grad_norm": 2.3618056774139404,
- "learning_rate": 8.967776429797529e-07,
- "loss": 0.24905793368816376,
- "mean_token_accuracy": 0.9170958995819092,
- "num_tokens": 25977808.0,
- "step": 2905
- },
- {
- "epoch": 2.208206686930091,
- "grad_norm": 1.929051399230957,
- "learning_rate": 8.951711774965741e-07,
- "loss": 0.38099539279937744,
- "mean_token_accuracy": 0.8812143802642822,
- "num_tokens": 25987871.0,
- "step": 2906
- },
- {
- "epoch": 2.2089665653495443,
- "grad_norm": 1.6529620885849,
- "learning_rate": 8.93565838344039e-07,
- "loss": 0.31784749031066895,
- "mean_token_accuracy": 0.8929437398910522,
- "num_tokens": 25997777.0,
- "step": 2907
- },
- {
- "epoch": 2.209726443768997,
- "grad_norm": 2.1413469314575195,
- "learning_rate": 8.919616266488373e-07,
- "loss": 0.4043882191181183,
- "mean_token_accuracy": 0.8937146663665771,
- "num_tokens": 26005213.0,
- "step": 2908
- },
- {
- "epoch": 2.2104863221884496,
- "grad_norm": 1.3838988542556763,
- "learning_rate": 8.903585435368658e-07,
- "loss": 0.2858969569206238,
- "mean_token_accuracy": 0.9084860682487488,
- "num_tokens": 26018371.0,
- "step": 2909
- },
- {
- "epoch": 2.211246200607903,
- "grad_norm": 1.2853319644927979,
- "learning_rate": 8.887565901332304e-07,
- "loss": 0.3178713619709015,
- "mean_token_accuracy": 0.872230589389801,
- "num_tokens": 26034136.0,
- "step": 2910
- },
- {
- "epoch": 2.2120060790273555,
- "grad_norm": 2.9032399654388428,
- "learning_rate": 8.871557675622442e-07,
- "loss": 0.20348960161209106,
- "mean_token_accuracy": 0.9275314807891846,
- "num_tokens": 26038299.0,
- "step": 2911
- },
- {
- "epoch": 2.2127659574468086,
- "grad_norm": 2.4349892139434814,
- "learning_rate": 8.855560769474237e-07,
- "loss": 0.24282032251358032,
- "mean_token_accuracy": 0.9103988409042358,
- "num_tokens": 26043427.0,
- "step": 2912
- },
- {
- "epoch": 2.2135258358662613,
- "grad_norm": 2.324664831161499,
- "learning_rate": 8.839575194114958e-07,
- "loss": 0.3808317184448242,
- "mean_token_accuracy": 0.8598989844322205,
- "num_tokens": 26049667.0,
- "step": 2913
- },
- {
- "epoch": 2.2142857142857144,
- "grad_norm": 2.594947576522827,
- "learning_rate": 8.823600960763901e-07,
- "loss": 0.39623332023620605,
- "mean_token_accuracy": 0.8738477230072021,
- "num_tokens": 26055428.0,
- "step": 2914
- },
- {
- "epoch": 2.215045592705167,
- "grad_norm": 1.674308180809021,
- "learning_rate": 8.807638080632375e-07,
- "loss": 0.2641369104385376,
- "mean_token_accuracy": 0.9119734764099121,
- "num_tokens": 26064355.0,
- "step": 2915
- },
- {
- "epoch": 2.2158054711246202,
- "grad_norm": 2.9884912967681885,
- "learning_rate": 8.791686564923746e-07,
- "loss": 0.19229236245155334,
- "mean_token_accuracy": 0.9388723969459534,
- "num_tokens": 26067563.0,
- "step": 2916
- },
- {
- "epoch": 2.216565349544073,
- "grad_norm": 1.8513846397399902,
- "learning_rate": 8.775746424833428e-07,
- "loss": 0.3076218366622925,
- "mean_token_accuracy": 0.9165210723876953,
- "num_tokens": 26075609.0,
- "step": 2917
- },
- {
- "epoch": 2.217325227963526,
- "grad_norm": 1.229604721069336,
- "learning_rate": 8.759817671548801e-07,
- "loss": 0.2727023959159851,
- "mean_token_accuracy": 0.8931418061256409,
- "num_tokens": 26091183.0,
- "step": 2918
- },
- {
- "epoch": 2.2180851063829787,
- "grad_norm": 2.384413957595825,
- "learning_rate": 8.743900316249273e-07,
- "loss": 0.27312609553337097,
- "mean_token_accuracy": 0.8972288370132446,
- "num_tokens": 26096677.0,
- "step": 2919
- },
- {
- "epoch": 2.2188449848024314,
- "grad_norm": 2.186370611190796,
- "learning_rate": 8.727994370106288e-07,
- "loss": 0.36045557260513306,
- "mean_token_accuracy": 0.8788503408432007,
- "num_tokens": 26104464.0,
- "step": 2920
- },
- {
- "epoch": 2.2196048632218845,
- "grad_norm": 2.769796848297119,
- "learning_rate": 8.71209984428322e-07,
- "loss": 0.3427591919898987,
- "mean_token_accuracy": 0.892108678817749,
- "num_tokens": 26109571.0,
- "step": 2921
- },
- {
- "epoch": 2.2203647416413372,
- "grad_norm": 2.9888014793395996,
- "learning_rate": 8.696216749935471e-07,
- "loss": 0.20137615501880646,
- "mean_token_accuracy": 0.9366025924682617,
- "num_tokens": 26113165.0,
- "step": 2922
- },
- {
- "epoch": 2.2211246200607904,
- "grad_norm": 1.484858751296997,
- "learning_rate": 8.680345098210408e-07,
- "loss": 0.2884698510169983,
- "mean_token_accuracy": 0.8992507457733154,
- "num_tokens": 26124385.0,
- "step": 2923
- },
- {
- "epoch": 2.221884498480243,
- "grad_norm": 1.690119981765747,
- "learning_rate": 8.664484900247363e-07,
- "loss": 0.34275567531585693,
- "mean_token_accuracy": 0.8682634234428406,
- "num_tokens": 26134944.0,
- "step": 2924
- },
- {
- "epoch": 2.222644376899696,
- "grad_norm": 1.6171982288360596,
- "learning_rate": 8.64863616717764e-07,
- "loss": 0.256338506937027,
- "mean_token_accuracy": 0.9281957745552063,
- "num_tokens": 26143586.0,
- "step": 2925
- },
- {
- "epoch": 2.223404255319149,
- "grad_norm": 2.4853835105895996,
- "learning_rate": 8.632798910124493e-07,
- "loss": 0.26290056109428406,
- "mean_token_accuracy": 0.9119559526443481,
- "num_tokens": 26148931.0,
- "step": 2926
- },
- {
- "epoch": 2.224164133738602,
- "grad_norm": 2.0014333724975586,
- "learning_rate": 8.616973140203097e-07,
- "loss": 0.33400261402130127,
- "mean_token_accuracy": 0.8796782493591309,
- "num_tokens": 26156246.0,
- "step": 2927
- },
- {
- "epoch": 2.2249240121580547,
- "grad_norm": 1.4637027978897095,
- "learning_rate": 8.601158868520617e-07,
- "loss": 0.24374958872795105,
- "mean_token_accuracy": 0.9116952419281006,
- "num_tokens": 26166431.0,
- "step": 2928
- },
- {
- "epoch": 2.225683890577508,
- "grad_norm": 2.2056987285614014,
- "learning_rate": 8.585356106176093e-07,
- "loss": 0.3419337570667267,
- "mean_token_accuracy": 0.8703858852386475,
- "num_tokens": 26173974.0,
- "step": 2929
- },
- {
- "epoch": 2.2264437689969605,
- "grad_norm": 1.3687927722930908,
- "learning_rate": 8.569564864260524e-07,
- "loss": 0.43176111578941345,
- "mean_token_accuracy": 0.8616900444030762,
- "num_tokens": 26191632.0,
- "step": 2930
- },
- {
- "epoch": 2.227203647416413,
- "grad_norm": 1.4975634813308716,
- "learning_rate": 8.553785153856809e-07,
- "loss": 0.38525745272636414,
- "mean_token_accuracy": 0.8611687421798706,
- "num_tokens": 26203300.0,
- "step": 2931
- },
- {
- "epoch": 2.2279635258358663,
- "grad_norm": 1.970109462738037,
- "learning_rate": 8.538016986039751e-07,
- "loss": 0.31731468439102173,
- "mean_token_accuracy": 0.884365975856781,
- "num_tokens": 26210037.0,
- "step": 2932
- },
- {
- "epoch": 2.228723404255319,
- "grad_norm": 2.681717872619629,
- "learning_rate": 8.522260371876068e-07,
- "loss": 0.2770140767097473,
- "mean_token_accuracy": 0.9020107984542847,
- "num_tokens": 26215460.0,
- "step": 2933
- },
- {
- "epoch": 2.229483282674772,
- "grad_norm": 2.2324795722961426,
- "learning_rate": 8.506515322424349e-07,
- "loss": 0.30599141120910645,
- "mean_token_accuracy": 0.8939633965492249,
- "num_tokens": 26221260.0,
- "step": 2934
- },
- {
- "epoch": 2.230243161094225,
- "grad_norm": 2.08915376663208,
- "learning_rate": 8.49078184873508e-07,
- "loss": 0.3609209954738617,
- "mean_token_accuracy": 0.8776482343673706,
- "num_tokens": 26228397.0,
- "step": 2935
- },
- {
- "epoch": 2.231003039513678,
- "grad_norm": 1.641366958618164,
- "learning_rate": 8.475059961850617e-07,
- "loss": 0.2969125509262085,
- "mean_token_accuracy": 0.8949217796325684,
- "num_tokens": 26238533.0,
- "step": 2936
- },
- {
- "epoch": 2.2317629179331306,
- "grad_norm": 1.082148551940918,
- "learning_rate": 8.459349672805198e-07,
- "loss": 0.23957109451293945,
- "mean_token_accuracy": 0.9255712032318115,
- "num_tokens": 26254154.0,
- "step": 2937
- },
- {
- "epoch": 2.2325227963525838,
- "grad_norm": 2.495208740234375,
- "learning_rate": 8.443650992624877e-07,
- "loss": 0.2879767417907715,
- "mean_token_accuracy": 0.8911515474319458,
- "num_tokens": 26260812.0,
- "step": 2938
- },
- {
- "epoch": 2.2332826747720365,
- "grad_norm": 3.566549062728882,
- "learning_rate": 8.427963932327621e-07,
- "loss": 0.31420570611953735,
- "mean_token_accuracy": 0.8888009190559387,
- "num_tokens": 26264592.0,
- "step": 2939
- },
- {
- "epoch": 2.2340425531914896,
- "grad_norm": 2.217177391052246,
- "learning_rate": 8.412288502923211e-07,
- "loss": 0.30547618865966797,
- "mean_token_accuracy": 0.9065294861793518,
- "num_tokens": 26270729.0,
- "step": 2940
- },
- {
- "epoch": 2.2348024316109423,
- "grad_norm": 1.404260277748108,
- "learning_rate": 8.396624715413251e-07,
- "loss": 0.32485032081604004,
- "mean_token_accuracy": 0.8799532651901245,
- "num_tokens": 26284280.0,
- "step": 2941
- },
- {
- "epoch": 2.235562310030395,
- "grad_norm": 1.5519827604293823,
- "learning_rate": 8.380972580791191e-07,
- "loss": 0.3330575227737427,
- "mean_token_accuracy": 0.8865892887115479,
- "num_tokens": 26293635.0,
- "step": 2942
- },
- {
- "epoch": 2.236322188449848,
- "grad_norm": 2.604766845703125,
- "learning_rate": 8.365332110042323e-07,
- "loss": 0.18986842036247253,
- "mean_token_accuracy": 0.9276989102363586,
- "num_tokens": 26298553.0,
- "step": 2943
- },
- {
- "epoch": 2.237082066869301,
- "grad_norm": 2.1750004291534424,
- "learning_rate": 8.349703314143712e-07,
- "loss": 0.3661153018474579,
- "mean_token_accuracy": 0.8879489302635193,
- "num_tokens": 26305697.0,
- "step": 2944
- },
- {
- "epoch": 2.237841945288754,
- "grad_norm": 2.247069835662842,
- "learning_rate": 8.334086204064254e-07,
- "loss": 0.3127560615539551,
- "mean_token_accuracy": 0.8846344351768494,
- "num_tokens": 26312347.0,
- "step": 2945
- },
- {
- "epoch": 2.2386018237082066,
- "grad_norm": 1.905275821685791,
- "learning_rate": 8.318480790764638e-07,
- "loss": 0.44245776534080505,
- "mean_token_accuracy": 0.87440425157547,
- "num_tokens": 26322787.0,
- "step": 2946
- },
- {
- "epoch": 2.2393617021276597,
- "grad_norm": 1.8596254587173462,
- "learning_rate": 8.302887085197342e-07,
- "loss": 0.30068373680114746,
- "mean_token_accuracy": 0.8847110271453857,
- "num_tokens": 26330437.0,
- "step": 2947
- },
- {
- "epoch": 2.2401215805471124,
- "grad_norm": 2.0028860569000244,
- "learning_rate": 8.28730509830663e-07,
- "loss": 0.4276006817817688,
- "mean_token_accuracy": 0.8406014442443848,
- "num_tokens": 26340100.0,
- "step": 2948
- },
- {
- "epoch": 2.2408814589665655,
- "grad_norm": 2.494434356689453,
- "learning_rate": 8.271734841028553e-07,
- "loss": 0.3874223232269287,
- "mean_token_accuracy": 0.8782174587249756,
- "num_tokens": 26345750.0,
- "step": 2949
- },
- {
- "epoch": 2.2416413373860182,
- "grad_norm": 1.955613613128662,
- "learning_rate": 8.256176324290885e-07,
- "loss": 0.28770074248313904,
- "mean_token_accuracy": 0.9004360437393188,
- "num_tokens": 26353342.0,
- "step": 2950
- },
- {
- "epoch": 2.2424012158054714,
- "grad_norm": 1.7579785585403442,
- "learning_rate": 8.240629559013222e-07,
- "loss": 0.2277943640947342,
- "mean_token_accuracy": 0.9145861864089966,
- "num_tokens": 26361348.0,
- "step": 2951
- },
- {
- "epoch": 2.243161094224924,
- "grad_norm": 1.5848479270935059,
- "learning_rate": 8.22509455610688e-07,
- "loss": 0.32944542169570923,
- "mean_token_accuracy": 0.8662827014923096,
- "num_tokens": 26372006.0,
- "step": 2952
- },
- {
- "epoch": 2.2439209726443767,
- "grad_norm": 2.6263222694396973,
- "learning_rate": 8.209571326474897e-07,
- "loss": 0.34646326303482056,
- "mean_token_accuracy": 0.8817736506462097,
- "num_tokens": 26377664.0,
- "step": 2953
- },
- {
- "epoch": 2.24468085106383,
- "grad_norm": 2.407590627670288,
- "learning_rate": 8.194059881012107e-07,
- "loss": 0.41302192211151123,
- "mean_token_accuracy": 0.8898757696151733,
- "num_tokens": 26384225.0,
- "step": 2954
- },
- {
- "epoch": 2.2454407294832825,
- "grad_norm": 2.5156402587890625,
- "learning_rate": 8.178560230605012e-07,
- "loss": 0.3468608558177948,
- "mean_token_accuracy": 0.8879599571228027,
- "num_tokens": 26389374.0,
- "step": 2955
- },
- {
- "epoch": 2.2462006079027357,
- "grad_norm": 1.5076090097427368,
- "learning_rate": 8.163072386131876e-07,
- "loss": 0.3750625550746918,
- "mean_token_accuracy": 0.8712738752365112,
- "num_tokens": 26402674.0,
- "step": 2956
- },
- {
- "epoch": 2.2469604863221884,
- "grad_norm": 1.5181068181991577,
- "learning_rate": 8.147596358462662e-07,
- "loss": 0.19113478064537048,
- "mean_token_accuracy": 0.9323463439941406,
- "num_tokens": 26411626.0,
- "step": 2957
- },
- {
- "epoch": 2.2477203647416415,
- "grad_norm": 1.0806915760040283,
- "learning_rate": 8.132132158459044e-07,
- "loss": 0.3411233425140381,
- "mean_token_accuracy": 0.8736830949783325,
- "num_tokens": 26435891.0,
- "step": 2958
- },
- {
- "epoch": 2.248480243161094,
- "grad_norm": 1.5527247190475464,
- "learning_rate": 8.116679796974389e-07,
- "loss": 0.425741970539093,
- "mean_token_accuracy": 0.8448845148086548,
- "num_tokens": 26448134.0,
- "step": 2959
- },
- {
- "epoch": 2.2492401215805473,
- "grad_norm": 1.2390631437301636,
- "learning_rate": 8.10123928485377e-07,
- "loss": 0.38084933161735535,
- "mean_token_accuracy": 0.8656617999076843,
- "num_tokens": 26467213.0,
- "step": 2960
- },
- {
- "epoch": 2.25,
- "grad_norm": 3.0672852993011475,
- "learning_rate": 8.08581063293391e-07,
- "loss": 0.29300111532211304,
- "mean_token_accuracy": 0.8933638334274292,
- "num_tokens": 26471599.0,
- "step": 2961
- },
- {
- "epoch": 2.250759878419453,
- "grad_norm": 1.2359145879745483,
- "learning_rate": 8.070393852043251e-07,
- "loss": 0.41337621212005615,
- "mean_token_accuracy": 0.854198694229126,
- "num_tokens": 26488461.0,
- "step": 2962
- },
- {
- "epoch": 2.251519756838906,
- "grad_norm": 1.8551225662231445,
- "learning_rate": 8.054988953001889e-07,
- "loss": 0.3036419153213501,
- "mean_token_accuracy": 0.8883144855499268,
- "num_tokens": 26496398.0,
- "step": 2963
- },
- {
- "epoch": 2.2522796352583585,
- "grad_norm": 1.3691812753677368,
- "learning_rate": 8.039595946621551e-07,
- "loss": 0.3286219835281372,
- "mean_token_accuracy": 0.892130434513092,
- "num_tokens": 26510493.0,
- "step": 2964
- },
- {
- "epoch": 2.2530395136778116,
- "grad_norm": 1.7371556758880615,
- "learning_rate": 8.024214843705647e-07,
- "loss": 0.4105026125907898,
- "mean_token_accuracy": 0.8889180421829224,
- "num_tokens": 26519148.0,
- "step": 2965
- },
- {
- "epoch": 2.2537993920972643,
- "grad_norm": 2.211665630340576,
- "learning_rate": 8.00884565504925e-07,
- "loss": 0.3912196159362793,
- "mean_token_accuracy": 0.8632891774177551,
- "num_tokens": 26526314.0,
- "step": 2966
- },
- {
- "epoch": 2.2545592705167175,
- "grad_norm": 2.476206064224243,
- "learning_rate": 7.993488391439025e-07,
- "loss": 0.20462508499622345,
- "mean_token_accuracy": 0.9276266098022461,
- "num_tokens": 26531781.0,
- "step": 2967
- },
- {
- "epoch": 2.25531914893617,
- "grad_norm": 1.4944102764129639,
- "learning_rate": 7.978143063653296e-07,
- "loss": 0.2694895267486572,
- "mean_token_accuracy": 0.9033881425857544,
- "num_tokens": 26543780.0,
- "step": 2968
- },
- {
- "epoch": 2.2560790273556233,
- "grad_norm": 1.7570104598999023,
- "learning_rate": 7.962809682462008e-07,
- "loss": 0.3060353100299835,
- "mean_token_accuracy": 0.8908290863037109,
- "num_tokens": 26551978.0,
- "step": 2969
- },
- {
- "epoch": 2.256838905775076,
- "grad_norm": 2.215514898300171,
- "learning_rate": 7.947488258626718e-07,
- "loss": 0.2930528521537781,
- "mean_token_accuracy": 0.8989757299423218,
- "num_tokens": 26558267.0,
- "step": 2970
- },
- {
- "epoch": 2.2575987841945286,
- "grad_norm": 2.3069000244140625,
- "learning_rate": 7.93217880290059e-07,
- "loss": 0.18501774966716766,
- "mean_token_accuracy": 0.931271493434906,
- "num_tokens": 26563286.0,
- "step": 2971
- },
- {
- "epoch": 2.2583586626139818,
- "grad_norm": 1.6555116176605225,
- "learning_rate": 7.916881326028387e-07,
- "loss": 0.3178265392780304,
- "mean_token_accuracy": 0.9016884565353394,
- "num_tokens": 26572087.0,
- "step": 2972
- },
- {
- "epoch": 2.2591185410334345,
- "grad_norm": 2.222161054611206,
- "learning_rate": 7.901595838746471e-07,
- "loss": 0.3013504445552826,
- "mean_token_accuracy": 0.8942798376083374,
- "num_tokens": 26578159.0,
- "step": 2973
- },
- {
- "epoch": 2.2598784194528876,
- "grad_norm": 1.979411005973816,
- "learning_rate": 7.886322351782782e-07,
- "loss": 0.42746615409851074,
- "mean_token_accuracy": 0.85303795337677,
- "num_tokens": 26586252.0,
- "step": 2974
- },
- {
- "epoch": 2.2606382978723403,
- "grad_norm": 1.4925786256790161,
- "learning_rate": 7.871060875856854e-07,
- "loss": 0.33495625853538513,
- "mean_token_accuracy": 0.8911026120185852,
- "num_tokens": 26599921.0,
- "step": 2975
- },
- {
- "epoch": 2.2613981762917934,
- "grad_norm": 1.9037046432495117,
- "learning_rate": 7.855811421679746e-07,
- "loss": 0.31471866369247437,
- "mean_token_accuracy": 0.9007552862167358,
- "num_tokens": 26607954.0,
- "step": 2976
- },
- {
- "epoch": 2.262158054711246,
- "grad_norm": 2.2751407623291016,
- "learning_rate": 7.840573999954154e-07,
- "loss": 0.26972368359565735,
- "mean_token_accuracy": 0.8992317914962769,
- "num_tokens": 26614036.0,
- "step": 2977
- },
- {
- "epoch": 2.262917933130699,
- "grad_norm": 2.680572271347046,
- "learning_rate": 7.825348621374257e-07,
- "loss": 0.4264066219329834,
- "mean_token_accuracy": 0.8547691106796265,
- "num_tokens": 26619545.0,
- "step": 2978
- },
- {
- "epoch": 2.263677811550152,
- "grad_norm": 2.3535876274108887,
- "learning_rate": 7.810135296625817e-07,
- "loss": 0.37871062755584717,
- "mean_token_accuracy": 0.8621708750724792,
- "num_tokens": 26626248.0,
- "step": 2979
- },
- {
- "epoch": 2.264437689969605,
- "grad_norm": 1.2249537706375122,
- "learning_rate": 7.794934036386139e-07,
- "loss": 0.3877285122871399,
- "mean_token_accuracy": 0.8593572378158569,
- "num_tokens": 26648023.0,
- "step": 2980
- },
- {
- "epoch": 2.2651975683890577,
- "grad_norm": 2.43371844291687,
- "learning_rate": 7.779744851324048e-07,
- "loss": 0.37463510036468506,
- "mean_token_accuracy": 0.8646193742752075,
- "num_tokens": 26654016.0,
- "step": 2981
- },
- {
- "epoch": 2.2659574468085104,
- "grad_norm": 1.7429327964782715,
- "learning_rate": 7.7645677520999e-07,
- "loss": 0.4033060669898987,
- "mean_token_accuracy": 0.8644014596939087,
- "num_tokens": 26664447.0,
- "step": 2982
- },
- {
- "epoch": 2.2667173252279635,
- "grad_norm": 2.4090006351470947,
- "learning_rate": 7.749402749365573e-07,
- "loss": 0.2981206774711609,
- "mean_token_accuracy": 0.8886175751686096,
- "num_tokens": 26670355.0,
- "step": 2983
- },
- {
- "epoch": 2.2674772036474162,
- "grad_norm": 1.3855396509170532,
- "learning_rate": 7.734249853764428e-07,
- "loss": 0.35967472195625305,
- "mean_token_accuracy": 0.8652631044387817,
- "num_tokens": 26685385.0,
- "step": 2984
- },
- {
- "epoch": 2.2682370820668694,
- "grad_norm": 1.328214168548584,
- "learning_rate": 7.719109075931375e-07,
- "loss": 0.3571951389312744,
- "mean_token_accuracy": 0.8894522190093994,
- "num_tokens": 26703265.0,
- "step": 2985
- },
- {
- "epoch": 2.268996960486322,
- "grad_norm": 2.5001046657562256,
- "learning_rate": 7.703980426492791e-07,
- "loss": 0.3512844741344452,
- "mean_token_accuracy": 0.887405514717102,
- "num_tokens": 26709095.0,
- "step": 2986
- },
- {
- "epoch": 2.269756838905775,
- "grad_norm": 1.8704569339752197,
- "learning_rate": 7.688863916066524e-07,
- "loss": 0.2746743857860565,
- "mean_token_accuracy": 0.903412401676178,
- "num_tokens": 26716815.0,
- "step": 2987
- },
- {
- "epoch": 2.270516717325228,
- "grad_norm": 2.1134285926818848,
- "learning_rate": 7.673759555261947e-07,
- "loss": 0.38385504484176636,
- "mean_token_accuracy": 0.8759124279022217,
- "num_tokens": 26724046.0,
- "step": 2988
- },
- {
- "epoch": 2.271276595744681,
- "grad_norm": 1.2651840448379517,
- "learning_rate": 7.65866735467988e-07,
- "loss": 0.3499506413936615,
- "mean_token_accuracy": 0.8704953193664551,
- "num_tokens": 26743024.0,
- "step": 2989
- },
- {
- "epoch": 2.2720364741641337,
- "grad_norm": 1.7289817333221436,
- "learning_rate": 7.643587324912597e-07,
- "loss": 0.3768725097179413,
- "mean_token_accuracy": 0.8623670339584351,
- "num_tokens": 26754336.0,
- "step": 2990
- },
- {
- "epoch": 2.272796352583587,
- "grad_norm": 1.6121667623519897,
- "learning_rate": 7.628519476543839e-07,
- "loss": 0.42746737599372864,
- "mean_token_accuracy": 0.8425478935241699,
- "num_tokens": 26766813.0,
- "step": 2991
- },
- {
- "epoch": 2.2735562310030395,
- "grad_norm": 2.705442428588867,
- "learning_rate": 7.613463820148831e-07,
- "loss": 0.27137982845306396,
- "mean_token_accuracy": 0.9014253616333008,
- "num_tokens": 26772565.0,
- "step": 2992
- },
- {
- "epoch": 2.274316109422492,
- "grad_norm": 1.3811960220336914,
- "learning_rate": 7.598420366294185e-07,
- "loss": 0.2957465350627899,
- "mean_token_accuracy": 0.8935354351997375,
- "num_tokens": 26787325.0,
- "step": 2993
- },
- {
- "epoch": 2.2750759878419453,
- "grad_norm": 2.469336986541748,
- "learning_rate": 7.583389125537982e-07,
- "loss": 0.2811780273914337,
- "mean_token_accuracy": 0.8956634998321533,
- "num_tokens": 26793457.0,
- "step": 2994
- },
- {
- "epoch": 2.275835866261398,
- "grad_norm": 2.945681571960449,
- "learning_rate": 7.568370108429732e-07,
- "loss": 0.3186708092689514,
- "mean_token_accuracy": 0.8817545175552368,
- "num_tokens": 26797867.0,
- "step": 2995
- },
- {
- "epoch": 2.276595744680851,
- "grad_norm": 1.7748228311538696,
- "learning_rate": 7.553363325510355e-07,
- "loss": 0.3279818892478943,
- "mean_token_accuracy": 0.884396493434906,
- "num_tokens": 26806656.0,
- "step": 2996
- },
- {
- "epoch": 2.277355623100304,
- "grad_norm": 1.312500238418579,
- "learning_rate": 7.538368787312186e-07,
- "loss": 0.3754822611808777,
- "mean_token_accuracy": 0.8653179407119751,
- "num_tokens": 26823126.0,
- "step": 2997
- },
- {
- "epoch": 2.278115501519757,
- "grad_norm": 3.1305344104766846,
- "learning_rate": 7.523386504358984e-07,
- "loss": 0.3293214440345764,
- "mean_token_accuracy": 0.8908799886703491,
- "num_tokens": 26828250.0,
- "step": 2998
- },
- {
- "epoch": 2.2788753799392096,
- "grad_norm": 2.6449344158172607,
- "learning_rate": 7.508416487165862e-07,
- "loss": 0.23732036352157593,
- "mean_token_accuracy": 0.9029837846755981,
- "num_tokens": 26833123.0,
- "step": 2999
- },
- {
- "epoch": 2.2796352583586628,
- "grad_norm": 2.04388427734375,
- "learning_rate": 7.49345874623939e-07,
- "loss": 0.31240373849868774,
- "mean_token_accuracy": 0.8860392570495605,
- "num_tokens": 26840878.0,
- "step": 3000
- },
- {
- "epoch": 2.2803951367781155,
- "grad_norm": 1.1828604936599731,
- "learning_rate": 7.478513292077463e-07,
- "loss": 0.32127636671066284,
- "mean_token_accuracy": 0.8938446044921875,
- "num_tokens": 26858916.0,
- "step": 3001
- },
- {
- "epoch": 2.2811550151975686,
- "grad_norm": 2.5061612129211426,
- "learning_rate": 7.46358013516938e-07,
- "loss": 0.30558091402053833,
- "mean_token_accuracy": 0.8819161653518677,
- "num_tokens": 26864218.0,
- "step": 3002
- },
- {
- "epoch": 2.2819148936170213,
- "grad_norm": 2.424044609069824,
- "learning_rate": 7.448659285995808e-07,
- "loss": 0.3008216917514801,
- "mean_token_accuracy": 0.8751994371414185,
- "num_tokens": 26869646.0,
- "step": 3003
- },
- {
- "epoch": 2.282674772036474,
- "grad_norm": 1.3576173782348633,
- "learning_rate": 7.433750755028774e-07,
- "loss": 0.3001647889614105,
- "mean_token_accuracy": 0.8996933698654175,
- "num_tokens": 26884385.0,
- "step": 3004
- },
- {
- "epoch": 2.283434650455927,
- "grad_norm": 2.237589120864868,
- "learning_rate": 7.418854552731655e-07,
- "loss": 0.3126741051673889,
- "mean_token_accuracy": 0.8910979628562927,
- "num_tokens": 26891109.0,
- "step": 3005
- },
- {
- "epoch": 2.2841945288753798,
- "grad_norm": 2.1947414875030518,
- "learning_rate": 7.403970689559184e-07,
- "loss": 0.29793858528137207,
- "mean_token_accuracy": 0.9057353734970093,
- "num_tokens": 26897905.0,
- "step": 3006
- },
- {
- "epoch": 2.284954407294833,
- "grad_norm": 1.4252705574035645,
- "learning_rate": 7.389099175957426e-07,
- "loss": 0.2873227298259735,
- "mean_token_accuracy": 0.8910978436470032,
- "num_tokens": 26910322.0,
- "step": 3007
- },
- {
- "epoch": 2.2857142857142856,
- "grad_norm": 1.2200649976730347,
- "learning_rate": 7.374240022363785e-07,
- "loss": 0.2782876491546631,
- "mean_token_accuracy": 0.8948163390159607,
- "num_tokens": 26927253.0,
- "step": 3008
- },
- {
- "epoch": 2.2864741641337387,
- "grad_norm": 2.1249423027038574,
- "learning_rate": 7.359393239206991e-07,
- "loss": 0.4046584367752075,
- "mean_token_accuracy": 0.8653120994567871,
- "num_tokens": 26934798.0,
- "step": 3009
- },
- {
- "epoch": 2.2872340425531914,
- "grad_norm": 1.6851856708526611,
- "learning_rate": 7.344558836907067e-07,
- "loss": 0.3814213275909424,
- "mean_token_accuracy": 0.8618872165679932,
- "num_tokens": 26944984.0,
- "step": 3010
- },
- {
- "epoch": 2.2879939209726445,
- "grad_norm": 1.5802191495895386,
- "learning_rate": 7.329736825875388e-07,
- "loss": 0.28643855452537537,
- "mean_token_accuracy": 0.9038295745849609,
- "num_tokens": 26957832.0,
- "step": 3011
- },
- {
- "epoch": 2.288753799392097,
- "grad_norm": 1.6257383823394775,
- "learning_rate": 7.314927216514617e-07,
- "loss": 0.264072448015213,
- "mean_token_accuracy": 0.9089190363883972,
- "num_tokens": 26967621.0,
- "step": 3012
- },
- {
- "epoch": 2.2895136778115504,
- "grad_norm": 2.107192039489746,
- "learning_rate": 7.300130019218688e-07,
- "loss": 0.2772635817527771,
- "mean_token_accuracy": 0.9071067571640015,
- "num_tokens": 26974669.0,
- "step": 3013
- },
- {
- "epoch": 2.290273556231003,
- "grad_norm": 1.496505618095398,
- "learning_rate": 7.285345244372843e-07,
- "loss": 0.2936630845069885,
- "mean_token_accuracy": 0.8946818113327026,
- "num_tokens": 26985942.0,
- "step": 3014
- },
- {
- "epoch": 2.2910334346504557,
- "grad_norm": 1.6122950315475464,
- "learning_rate": 7.270572902353634e-07,
- "loss": 0.2819349765777588,
- "mean_token_accuracy": 0.8909854888916016,
- "num_tokens": 26996231.0,
- "step": 3015
- },
- {
- "epoch": 2.291793313069909,
- "grad_norm": 1.9463475942611694,
- "learning_rate": 7.255813003528834e-07,
- "loss": 0.2584724426269531,
- "mean_token_accuracy": 0.9069744348526001,
- "num_tokens": 27003253.0,
- "step": 3016
- },
- {
- "epoch": 2.2925531914893615,
- "grad_norm": 2.1707770824432373,
- "learning_rate": 7.241065558257513e-07,
- "loss": 0.17524898052215576,
- "mean_token_accuracy": 0.926141083240509,
- "num_tokens": 27009501.0,
- "step": 3017
- },
- {
- "epoch": 2.2933130699088147,
- "grad_norm": 2.1424882411956787,
- "learning_rate": 7.226330576889998e-07,
- "loss": 0.26512211561203003,
- "mean_token_accuracy": 0.9059023857116699,
- "num_tokens": 27016096.0,
- "step": 3018
- },
- {
- "epoch": 2.2940729483282674,
- "grad_norm": 3.50669264793396,
- "learning_rate": 7.211608069767867e-07,
- "loss": 0.24738222360610962,
- "mean_token_accuracy": 0.9179760217666626,
- "num_tokens": 27019810.0,
- "step": 3019
- },
- {
- "epoch": 2.2948328267477205,
- "grad_norm": 1.5426064729690552,
- "learning_rate": 7.196898047223943e-07,
- "loss": 0.2762960195541382,
- "mean_token_accuracy": 0.8937389850616455,
- "num_tokens": 27031952.0,
- "step": 3020
- },
- {
- "epoch": 2.295592705167173,
- "grad_norm": 2.469064712524414,
- "learning_rate": 7.182200519582283e-07,
- "loss": 0.2877562940120697,
- "mean_token_accuracy": 0.9252556562423706,
- "num_tokens": 27036673.0,
- "step": 3021
- },
- {
- "epoch": 2.2963525835866263,
- "grad_norm": 3.289813756942749,
- "learning_rate": 7.167515497158179e-07,
- "loss": 0.2837294340133667,
- "mean_token_accuracy": 0.9070497155189514,
- "num_tokens": 27041001.0,
- "step": 3022
- },
- {
- "epoch": 2.297112462006079,
- "grad_norm": 1.7201104164123535,
- "learning_rate": 7.152842990258147e-07,
- "loss": 0.44239580631256104,
- "mean_token_accuracy": 0.8443326354026794,
- "num_tokens": 27052265.0,
- "step": 3023
- },
- {
- "epoch": 2.297872340425532,
- "grad_norm": 1.3710078001022339,
- "learning_rate": 7.138183009179922e-07,
- "loss": 0.40450236201286316,
- "mean_token_accuracy": 0.87160724401474,
- "num_tokens": 27068475.0,
- "step": 3024
- },
- {
- "epoch": 2.298632218844985,
- "grad_norm": 2.1379098892211914,
- "learning_rate": 7.123535564212419e-07,
- "loss": 0.3432690501213074,
- "mean_token_accuracy": 0.8736584186553955,
- "num_tokens": 27075548.0,
- "step": 3025
- },
- {
- "epoch": 2.2993920972644375,
- "grad_norm": 2.423079252243042,
- "learning_rate": 7.108900665635815e-07,
- "loss": 0.27869731187820435,
- "mean_token_accuracy": 0.9046810865402222,
- "num_tokens": 27081560.0,
- "step": 3026
- },
- {
- "epoch": 2.3001519756838906,
- "grad_norm": 1.2137898206710815,
- "learning_rate": 7.094278323721418e-07,
- "loss": 0.41351836919784546,
- "mean_token_accuracy": 0.8553295135498047,
- "num_tokens": 27098346.0,
- "step": 3027
- },
- {
- "epoch": 2.3009118541033433,
- "grad_norm": 1.371337890625,
- "learning_rate": 7.079668548731757e-07,
- "loss": 0.29800572991371155,
- "mean_token_accuracy": 0.9219756126403809,
- "num_tokens": 27111678.0,
- "step": 3028
- },
- {
- "epoch": 2.3016717325227964,
- "grad_norm": 3.133449077606201,
- "learning_rate": 7.065071350920538e-07,
- "loss": 0.39177340269088745,
- "mean_token_accuracy": 0.8742524981498718,
- "num_tokens": 27116496.0,
- "step": 3029
- },
- {
- "epoch": 2.302431610942249,
- "grad_norm": 1.4038591384887695,
- "learning_rate": 7.050486740532633e-07,
- "loss": 0.2862081825733185,
- "mean_token_accuracy": 0.8894703984260559,
- "num_tokens": 27130806.0,
- "step": 3030
- },
- {
- "epoch": 2.3031914893617023,
- "grad_norm": 1.806132197380066,
- "learning_rate": 7.035914727804085e-07,
- "loss": 0.42546606063842773,
- "mean_token_accuracy": 0.876154363155365,
- "num_tokens": 27143687.0,
- "step": 3031
- },
- {
- "epoch": 2.303951367781155,
- "grad_norm": 1.8565905094146729,
- "learning_rate": 7.021355322962103e-07,
- "loss": 0.304633229970932,
- "mean_token_accuracy": 0.896949052810669,
- "num_tokens": 27152532.0,
- "step": 3032
- },
- {
- "epoch": 2.304711246200608,
- "grad_norm": 2.8857851028442383,
- "learning_rate": 7.006808536225009e-07,
- "loss": 0.3943948745727539,
- "mean_token_accuracy": 0.8629783391952515,
- "num_tokens": 27157824.0,
- "step": 3033
- },
- {
- "epoch": 2.3054711246200608,
- "grad_norm": 1.7708746194839478,
- "learning_rate": 6.992274377802328e-07,
- "loss": 0.46951010823249817,
- "mean_token_accuracy": 0.8334795236587524,
- "num_tokens": 27169445.0,
- "step": 3034
- },
- {
- "epoch": 2.306231003039514,
- "grad_norm": 2.5275487899780273,
- "learning_rate": 6.977752857894684e-07,
- "loss": 0.3764885365962982,
- "mean_token_accuracy": 0.8665527105331421,
- "num_tokens": 27176545.0,
- "step": 3035
- },
- {
- "epoch": 2.3069908814589666,
- "grad_norm": 1.9251405000686646,
- "learning_rate": 6.963243986693832e-07,
- "loss": 0.44473910331726074,
- "mean_token_accuracy": 0.8828103542327881,
- "num_tokens": 27187808.0,
- "step": 3036
- },
- {
- "epoch": 2.3077507598784193,
- "grad_norm": 2.1559739112854004,
- "learning_rate": 6.94874777438265e-07,
- "loss": 0.35055795311927795,
- "mean_token_accuracy": 0.8815537691116333,
- "num_tokens": 27195493.0,
- "step": 3037
- },
- {
- "epoch": 2.3085106382978724,
- "grad_norm": 1.2242814302444458,
- "learning_rate": 6.934264231135163e-07,
- "loss": 0.38762199878692627,
- "mean_token_accuracy": 0.8607999086380005,
- "num_tokens": 27213291.0,
- "step": 3038
- },
- {
- "epoch": 2.309270516717325,
- "grad_norm": 3.787707805633545,
- "learning_rate": 6.919793367116453e-07,
- "loss": 0.299210786819458,
- "mean_token_accuracy": 0.8993752002716064,
- "num_tokens": 27216930.0,
- "step": 3039
- },
- {
- "epoch": 2.310030395136778,
- "grad_norm": 1.4088979959487915,
- "learning_rate": 6.905335192482734e-07,
- "loss": 0.337495893239975,
- "mean_token_accuracy": 0.8903428912162781,
- "num_tokens": 27229441.0,
- "step": 3040
- },
- {
- "epoch": 2.310790273556231,
- "grad_norm": 2.0042521953582764,
- "learning_rate": 6.890889717381333e-07,
- "loss": 0.2732951045036316,
- "mean_token_accuracy": 0.8986722826957703,
- "num_tokens": 27237525.0,
- "step": 3041
- },
- {
- "epoch": 2.311550151975684,
- "grad_norm": 2.4301047325134277,
- "learning_rate": 6.876456951950614e-07,
- "loss": 0.25528258085250854,
- "mean_token_accuracy": 0.9083898663520813,
- "num_tokens": 27243073.0,
- "step": 3042
- },
- {
- "epoch": 2.3123100303951367,
- "grad_norm": 1.4725151062011719,
- "learning_rate": 6.862036906320055e-07,
- "loss": 0.3366362452507019,
- "mean_token_accuracy": 0.8746060729026794,
- "num_tokens": 27255151.0,
- "step": 3043
- },
- {
- "epoch": 2.31306990881459,
- "grad_norm": 2.687649965286255,
- "learning_rate": 6.847629590610202e-07,
- "loss": 0.30955633521080017,
- "mean_token_accuracy": 0.8862895369529724,
- "num_tokens": 27259909.0,
- "step": 3044
- },
- {
- "epoch": 2.3138297872340425,
- "grad_norm": 1.9105106592178345,
- "learning_rate": 6.833235014932662e-07,
- "loss": 0.3366878628730774,
- "mean_token_accuracy": 0.8920552134513855,
- "num_tokens": 27268003.0,
- "step": 3045
- },
- {
- "epoch": 2.3145896656534957,
- "grad_norm": 2.278108596801758,
- "learning_rate": 6.818853189390104e-07,
- "loss": 0.41192957758903503,
- "mean_token_accuracy": 0.8558850288391113,
- "num_tokens": 27275447.0,
- "step": 3046
- },
- {
- "epoch": 2.3153495440729484,
- "grad_norm": 3.114295482635498,
- "learning_rate": 6.804484124076249e-07,
- "loss": 0.16981825232505798,
- "mean_token_accuracy": 0.9305338859558105,
- "num_tokens": 27279348.0,
- "step": 3047
- },
- {
- "epoch": 2.316109422492401,
- "grad_norm": 1.188263177871704,
- "learning_rate": 6.790127829075843e-07,
- "loss": 0.3003719747066498,
- "mean_token_accuracy": 0.8945091366767883,
- "num_tokens": 27296576.0,
- "step": 3048
- },
- {
- "epoch": 2.316869300911854,
- "grad_norm": 1.4627037048339844,
- "learning_rate": 6.775784314464717e-07,
- "loss": 0.42125576734542847,
- "mean_token_accuracy": 0.85997474193573,
- "num_tokens": 27310603.0,
- "step": 3049
- },
- {
- "epoch": 2.317629179331307,
- "grad_norm": 1.86640465259552,
- "learning_rate": 6.761453590309675e-07,
- "loss": 0.27236056327819824,
- "mean_token_accuracy": 0.8952003717422485,
- "num_tokens": 27320635.0,
- "step": 3050
- },
- {
- "epoch": 2.31838905775076,
- "grad_norm": 2.3250787258148193,
- "learning_rate": 6.747135666668581e-07,
- "loss": 0.35650634765625,
- "mean_token_accuracy": 0.8870455026626587,
- "num_tokens": 27326778.0,
- "step": 3051
- },
- {
- "epoch": 2.3191489361702127,
- "grad_norm": 1.493028163909912,
- "learning_rate": 6.732830553590305e-07,
- "loss": 0.3086358308792114,
- "mean_token_accuracy": 0.8837405443191528,
- "num_tokens": 27341792.0,
- "step": 3052
- },
- {
- "epoch": 2.319908814589666,
- "grad_norm": 1.9723037481307983,
- "learning_rate": 6.718538261114727e-07,
- "loss": 0.2970390021800995,
- "mean_token_accuracy": 0.8897635340690613,
- "num_tokens": 27349764.0,
- "step": 3053
- },
- {
- "epoch": 2.3206686930091185,
- "grad_norm": 2.418403387069702,
- "learning_rate": 6.704258799272723e-07,
- "loss": 0.31288546323776245,
- "mean_token_accuracy": 0.8795867562294006,
- "num_tokens": 27355223.0,
- "step": 3054
- },
- {
- "epoch": 2.3214285714285716,
- "grad_norm": 1.866711139678955,
- "learning_rate": 6.689992178086174e-07,
- "loss": 0.2915012240409851,
- "mean_token_accuracy": 0.8901758790016174,
- "num_tokens": 27363363.0,
- "step": 3055
- },
- {
- "epoch": 2.3221884498480243,
- "grad_norm": 2.52559494972229,
- "learning_rate": 6.675738407567941e-07,
- "loss": 0.28706514835357666,
- "mean_token_accuracy": 0.9131950736045837,
- "num_tokens": 27368937.0,
- "step": 3056
- },
- {
- "epoch": 2.3229483282674774,
- "grad_norm": 1.5393383502960205,
- "learning_rate": 6.661497497721872e-07,
- "loss": 0.41627925634384155,
- "mean_token_accuracy": 0.8846169114112854,
- "num_tokens": 27381824.0,
- "step": 3057
- },
- {
- "epoch": 2.32370820668693,
- "grad_norm": 1.2711350917816162,
- "learning_rate": 6.647269458542793e-07,
- "loss": 0.3200211524963379,
- "mean_token_accuracy": 0.8812989592552185,
- "num_tokens": 27399489.0,
- "step": 3058
- },
- {
- "epoch": 2.324468085106383,
- "grad_norm": 2.4790799617767334,
- "learning_rate": 6.633054300016464e-07,
- "loss": 0.21309956908226013,
- "mean_token_accuracy": 0.9245274066925049,
- "num_tokens": 27403825.0,
- "step": 3059
- },
- {
- "epoch": 2.325227963525836,
- "grad_norm": 1.937660813331604,
- "learning_rate": 6.618852032119655e-07,
- "loss": 0.18426720798015594,
- "mean_token_accuracy": 0.9317672252655029,
- "num_tokens": 27410934.0,
- "step": 3060
- },
- {
- "epoch": 2.3259878419452886,
- "grad_norm": 1.4951587915420532,
- "learning_rate": 6.604662664820063e-07,
- "loss": 0.27759790420532227,
- "mean_token_accuracy": 0.9198849201202393,
- "num_tokens": 27421281.0,
- "step": 3061
- },
- {
- "epoch": 2.3267477203647418,
- "grad_norm": 1.6459094285964966,
- "learning_rate": 6.590486208076319e-07,
- "loss": 0.3164416551589966,
- "mean_token_accuracy": 0.8805180788040161,
- "num_tokens": 27431545.0,
- "step": 3062
- },
- {
- "epoch": 2.3275075987841944,
- "grad_norm": 1.6612298488616943,
- "learning_rate": 6.576322671838003e-07,
- "loss": 0.35754746198654175,
- "mean_token_accuracy": 0.8680465817451477,
- "num_tokens": 27441566.0,
- "step": 3063
- },
- {
- "epoch": 2.3282674772036476,
- "grad_norm": 2.4485018253326416,
- "learning_rate": 6.562172066045655e-07,
- "loss": 0.2957935929298401,
- "mean_token_accuracy": 0.886491596698761,
- "num_tokens": 27447186.0,
- "step": 3064
- },
- {
- "epoch": 2.3290273556231003,
- "grad_norm": 1.9771100282669067,
- "learning_rate": 6.548034400630693e-07,
- "loss": 0.3137952387332916,
- "mean_token_accuracy": 0.8874903321266174,
- "num_tokens": 27454347.0,
- "step": 3065
- },
- {
- "epoch": 2.329787234042553,
- "grad_norm": 4.502175331115723,
- "learning_rate": 6.533909685515483e-07,
- "loss": 0.30587732791900635,
- "mean_token_accuracy": 0.8878371715545654,
- "num_tokens": 27457322.0,
- "step": 3066
- },
- {
- "epoch": 2.330547112462006,
- "grad_norm": 1.041748285293579,
- "learning_rate": 6.519797930613289e-07,
- "loss": 0.2936970889568329,
- "mean_token_accuracy": 0.8899037837982178,
- "num_tokens": 27476750.0,
- "step": 3067
- },
- {
- "epoch": 2.331306990881459,
- "grad_norm": 1.57416570186615,
- "learning_rate": 6.505699145828287e-07,
- "loss": 0.2849736511707306,
- "mean_token_accuracy": 0.8906558156013489,
- "num_tokens": 27489326.0,
- "step": 3068
- },
- {
- "epoch": 2.332066869300912,
- "grad_norm": 2.879692792892456,
- "learning_rate": 6.491613341055547e-07,
- "loss": 0.22944235801696777,
- "mean_token_accuracy": 0.9167940616607666,
- "num_tokens": 27493562.0,
- "step": 3069
- },
- {
- "epoch": 2.3328267477203646,
- "grad_norm": 2.3187942504882812,
- "learning_rate": 6.477540526181036e-07,
- "loss": 0.3072662949562073,
- "mean_token_accuracy": 0.8936570882797241,
- "num_tokens": 27499670.0,
- "step": 3070
- },
- {
- "epoch": 2.3335866261398177,
- "grad_norm": 1.3098584413528442,
- "learning_rate": 6.463480711081577e-07,
- "loss": 0.4124477505683899,
- "mean_token_accuracy": 0.8422118425369263,
- "num_tokens": 27518197.0,
- "step": 3071
- },
- {
- "epoch": 2.3343465045592704,
- "grad_norm": 1.874219298362732,
- "learning_rate": 6.449433905624916e-07,
- "loss": 0.34171411395072937,
- "mean_token_accuracy": 0.8761874437332153,
- "num_tokens": 27526512.0,
- "step": 3072
- },
- {
- "epoch": 2.3351063829787235,
- "grad_norm": 3.3637123107910156,
- "learning_rate": 6.435400119669618e-07,
- "loss": 0.23634830117225647,
- "mean_token_accuracy": 0.9309012293815613,
- "num_tokens": 27529921.0,
- "step": 3073
- },
- {
- "epoch": 2.335866261398176,
- "grad_norm": 2.025264263153076,
- "learning_rate": 6.421379363065142e-07,
- "loss": 0.352272629737854,
- "mean_token_accuracy": 0.8678278923034668,
- "num_tokens": 27537122.0,
- "step": 3074
- },
- {
- "epoch": 2.3366261398176293,
- "grad_norm": 1.7762253284454346,
- "learning_rate": 6.407371645651808e-07,
- "loss": 0.3190876841545105,
- "mean_token_accuracy": 0.8870849609375,
- "num_tokens": 27547436.0,
- "step": 3075
- },
- {
- "epoch": 2.337386018237082,
- "grad_norm": 1.4258071184158325,
- "learning_rate": 6.393376977260754e-07,
- "loss": 0.24304701387882233,
- "mean_token_accuracy": 0.9347224235534668,
- "num_tokens": 27559322.0,
- "step": 3076
- },
- {
- "epoch": 2.3381458966565347,
- "grad_norm": 2.015075922012329,
- "learning_rate": 6.379395367713983e-07,
- "loss": 0.37574928998947144,
- "mean_token_accuracy": 0.8884165287017822,
- "num_tokens": 27566564.0,
- "step": 3077
- },
- {
- "epoch": 2.338905775075988,
- "grad_norm": 2.2211477756500244,
- "learning_rate": 6.365426826824328e-07,
- "loss": 0.3210097551345825,
- "mean_token_accuracy": 0.8879522085189819,
- "num_tokens": 27573643.0,
- "step": 3078
- },
- {
- "epoch": 2.339665653495441,
- "grad_norm": 2.102496385574341,
- "learning_rate": 6.351471364395448e-07,
- "loss": 0.4013458490371704,
- "mean_token_accuracy": 0.887574315071106,
- "num_tokens": 27580724.0,
- "step": 3079
- },
- {
- "epoch": 2.3404255319148937,
- "grad_norm": 1.6786696910858154,
- "learning_rate": 6.337528990221822e-07,
- "loss": 0.3980376124382019,
- "mean_token_accuracy": 0.8881500363349915,
- "num_tokens": 27592147.0,
- "step": 3080
- },
- {
- "epoch": 2.3411854103343464,
- "grad_norm": 2.541473388671875,
- "learning_rate": 6.323599714088754e-07,
- "loss": 0.1682094782590866,
- "mean_token_accuracy": 0.9426926374435425,
- "num_tokens": 27596757.0,
- "step": 3081
- },
- {
- "epoch": 2.3419452887537995,
- "grad_norm": 2.0378596782684326,
- "learning_rate": 6.309683545772327e-07,
- "loss": 0.4023628234863281,
- "mean_token_accuracy": 0.8561117649078369,
- "num_tokens": 27604923.0,
- "step": 3082
- },
- {
- "epoch": 2.342705167173252,
- "grad_norm": 1.7666785717010498,
- "learning_rate": 6.29578049503946e-07,
- "loss": 0.37102991342544556,
- "mean_token_accuracy": 0.8807623386383057,
- "num_tokens": 27614106.0,
- "step": 3083
- },
- {
- "epoch": 2.3434650455927053,
- "grad_norm": 1.6605560779571533,
- "learning_rate": 6.281890571647853e-07,
- "loss": 0.4239729642868042,
- "mean_token_accuracy": 0.8428831696510315,
- "num_tokens": 27626568.0,
- "step": 3084
- },
- {
- "epoch": 2.344224924012158,
- "grad_norm": 1.9562166929244995,
- "learning_rate": 6.268013785345969e-07,
- "loss": 0.16737908124923706,
- "mean_token_accuracy": 0.9457347393035889,
- "num_tokens": 27632789.0,
- "step": 3085
- },
- {
- "epoch": 2.344984802431611,
- "grad_norm": 2.274827480316162,
- "learning_rate": 6.254150145873081e-07,
- "loss": 0.3866672217845917,
- "mean_token_accuracy": 0.8498655557632446,
- "num_tokens": 27639692.0,
- "step": 3086
- },
- {
- "epoch": 2.345744680851064,
- "grad_norm": 1.9612165689468384,
- "learning_rate": 6.240299662959237e-07,
- "loss": 0.2607918977737427,
- "mean_token_accuracy": 0.9195128679275513,
- "num_tokens": 27646911.0,
- "step": 3087
- },
- {
- "epoch": 2.3465045592705165,
- "grad_norm": 1.6821730136871338,
- "learning_rate": 6.226462346325221e-07,
- "loss": 0.3244997560977936,
- "mean_token_accuracy": 0.8889811038970947,
- "num_tokens": 27656789.0,
- "step": 3088
- },
- {
- "epoch": 2.3472644376899696,
- "grad_norm": 1.8024263381958008,
- "learning_rate": 6.2126382056826e-07,
- "loss": 0.28899791836738586,
- "mean_token_accuracy": 0.8931136131286621,
- "num_tokens": 27666153.0,
- "step": 3089
- },
- {
- "epoch": 2.3480243161094223,
- "grad_norm": 2.8205342292785645,
- "learning_rate": 6.198827250733694e-07,
- "loss": 0.32387930154800415,
- "mean_token_accuracy": 0.9032641649246216,
- "num_tokens": 27671042.0,
- "step": 3090
- },
- {
- "epoch": 2.3487841945288754,
- "grad_norm": 2.8001155853271484,
- "learning_rate": 6.185029491171554e-07,
- "loss": 0.3122251331806183,
- "mean_token_accuracy": 0.9122956395149231,
- "num_tokens": 27675732.0,
- "step": 3091
- },
- {
- "epoch": 2.349544072948328,
- "grad_norm": 2.6694142818450928,
- "learning_rate": 6.171244936679985e-07,
- "loss": 0.3166629374027252,
- "mean_token_accuracy": 0.875450074672699,
- "num_tokens": 27681448.0,
- "step": 3092
- },
- {
- "epoch": 2.3503039513677813,
- "grad_norm": 1.515966534614563,
- "learning_rate": 6.157473596933517e-07,
- "loss": 0.17373405396938324,
- "mean_token_accuracy": 0.933076798915863,
- "num_tokens": 27690654.0,
- "step": 3093
- },
- {
- "epoch": 2.351063829787234,
- "grad_norm": 2.4486823081970215,
- "learning_rate": 6.143715481597404e-07,
- "loss": 0.18732565641403198,
- "mean_token_accuracy": 0.9323808550834656,
- "num_tokens": 27696111.0,
- "step": 3094
- },
- {
- "epoch": 2.351823708206687,
- "grad_norm": 2.3000645637512207,
- "learning_rate": 6.129970600327623e-07,
- "loss": 0.267723023891449,
- "mean_token_accuracy": 0.9053730964660645,
- "num_tokens": 27702103.0,
- "step": 3095
- },
- {
- "epoch": 2.3525835866261398,
- "grad_norm": 2.533583164215088,
- "learning_rate": 6.116238962770868e-07,
- "loss": 0.40778815746307373,
- "mean_token_accuracy": 0.8500792980194092,
- "num_tokens": 27708868.0,
- "step": 3096
- },
- {
- "epoch": 2.353343465045593,
- "grad_norm": 1.9357147216796875,
- "learning_rate": 6.102520578564508e-07,
- "loss": 0.2880813479423523,
- "mean_token_accuracy": 0.8895434141159058,
- "num_tokens": 27716730.0,
- "step": 3097
- },
- {
- "epoch": 2.3541033434650456,
- "grad_norm": 3.1041259765625,
- "learning_rate": 6.088815457336664e-07,
- "loss": 0.21810382604599,
- "mean_token_accuracy": 0.9217148423194885,
- "num_tokens": 27720792.0,
- "step": 3098
- },
- {
- "epoch": 2.3548632218844983,
- "grad_norm": 2.890695095062256,
- "learning_rate": 6.075123608706093e-07,
- "loss": 0.4002879858016968,
- "mean_token_accuracy": 0.8573901653289795,
- "num_tokens": 27726201.0,
- "step": 3099
- },
- {
- "epoch": 2.3556231003039514,
- "grad_norm": 3.4247958660125732,
- "learning_rate": 6.061445042282271e-07,
- "loss": 0.4269426465034485,
- "mean_token_accuracy": 0.848825216293335,
- "num_tokens": 27730419.0,
- "step": 3100
- },
- {
- "epoch": 2.356382978723404,
- "grad_norm": 1.8903621435165405,
- "learning_rate": 6.047779767665341e-07,
- "loss": 0.460983544588089,
- "mean_token_accuracy": 0.8535886406898499,
- "num_tokens": 27741121.0,
- "step": 3101
- },
- {
- "epoch": 2.357142857142857,
- "grad_norm": 2.6975221633911133,
- "learning_rate": 6.03412779444612e-07,
- "loss": 0.34841713309288025,
- "mean_token_accuracy": 0.8812501430511475,
- "num_tokens": 27746537.0,
- "step": 3102
- },
- {
- "epoch": 2.35790273556231,
- "grad_norm": 1.4414833784103394,
- "learning_rate": 6.02048913220609e-07,
- "loss": 0.34440115094184875,
- "mean_token_accuracy": 0.8725030422210693,
- "num_tokens": 27761085.0,
- "step": 3103
- },
- {
- "epoch": 2.358662613981763,
- "grad_norm": 1.7643623352050781,
- "learning_rate": 6.006863790517392e-07,
- "loss": 0.31087273359298706,
- "mean_token_accuracy": 0.9108829498291016,
- "num_tokens": 27769320.0,
- "step": 3104
- },
- {
- "epoch": 2.3594224924012157,
- "grad_norm": 1.365966558456421,
- "learning_rate": 5.993251778942794e-07,
- "loss": 0.501873254776001,
- "mean_token_accuracy": 0.8246122598648071,
- "num_tokens": 27791567.0,
- "step": 3105
- },
- {
- "epoch": 2.360182370820669,
- "grad_norm": 2.5981390476226807,
- "learning_rate": 5.979653107035754e-07,
- "loss": 0.27364015579223633,
- "mean_token_accuracy": 0.8946651816368103,
- "num_tokens": 27796849.0,
- "step": 3106
- },
- {
- "epoch": 2.3609422492401215,
- "grad_norm": 3.3564229011535645,
- "learning_rate": 5.966067784340346e-07,
- "loss": 0.2456880509853363,
- "mean_token_accuracy": 0.9110729694366455,
- "num_tokens": 27800785.0,
- "step": 3107
- },
- {
- "epoch": 2.3617021276595747,
- "grad_norm": 1.6739033460617065,
- "learning_rate": 5.952495820391244e-07,
- "loss": 0.30737343430519104,
- "mean_token_accuracy": 0.8898587226867676,
- "num_tokens": 27811982.0,
- "step": 3108
- },
- {
- "epoch": 2.3624620060790273,
- "grad_norm": 1.4430924654006958,
- "learning_rate": 5.9389372247138e-07,
- "loss": 0.46142861247062683,
- "mean_token_accuracy": 0.8355259895324707,
- "num_tokens": 27827765.0,
- "step": 3109
- },
- {
- "epoch": 2.36322188449848,
- "grad_norm": 3.7220218181610107,
- "learning_rate": 5.92539200682396e-07,
- "loss": 0.18588921427726746,
- "mean_token_accuracy": 0.9419732093811035,
- "num_tokens": 27830551.0,
- "step": 3110
- },
- {
- "epoch": 2.363981762917933,
- "grad_norm": 2.4770448207855225,
- "learning_rate": 5.911860176228262e-07,
- "loss": 0.3194807767868042,
- "mean_token_accuracy": 0.8959789276123047,
- "num_tokens": 27836529.0,
- "step": 3111
- },
- {
- "epoch": 2.364741641337386,
- "grad_norm": 2.1989665031433105,
- "learning_rate": 5.898341742423866e-07,
- "loss": 0.23653598129749298,
- "mean_token_accuracy": 0.9119038581848145,
- "num_tokens": 27842019.0,
- "step": 3112
- },
- {
- "epoch": 2.365501519756839,
- "grad_norm": 1.9562573432922363,
- "learning_rate": 5.884836714898554e-07,
- "loss": 0.320852130651474,
- "mean_token_accuracy": 0.8902987837791443,
- "num_tokens": 27850663.0,
- "step": 3113
- },
- {
- "epoch": 2.3662613981762917,
- "grad_norm": 1.4759801626205444,
- "learning_rate": 5.871345103130646e-07,
- "loss": 0.2739158570766449,
- "mean_token_accuracy": 0.9033761024475098,
- "num_tokens": 27863451.0,
- "step": 3114
- },
- {
- "epoch": 2.367021276595745,
- "grad_norm": 1.7798938751220703,
- "learning_rate": 5.857866916589089e-07,
- "loss": 0.35400229692459106,
- "mean_token_accuracy": 0.8623180389404297,
- "num_tokens": 27873669.0,
- "step": 3115
- },
- {
- "epoch": 2.3677811550151975,
- "grad_norm": 2.269472360610962,
- "learning_rate": 5.84440216473339e-07,
- "loss": 0.3717876672744751,
- "mean_token_accuracy": 0.8853007555007935,
- "num_tokens": 27880307.0,
- "step": 3116
- },
- {
- "epoch": 2.3685410334346506,
- "grad_norm": 1.5675846338272095,
- "learning_rate": 5.830950857013629e-07,
- "loss": 0.3465133011341095,
- "mean_token_accuracy": 0.876459002494812,
- "num_tokens": 27893889.0,
- "step": 3117
- },
- {
- "epoch": 2.3693009118541033,
- "grad_norm": 2.782482147216797,
- "learning_rate": 5.817513002870451e-07,
- "loss": 0.14173674583435059,
- "mean_token_accuracy": 0.9492213726043701,
- "num_tokens": 27897693.0,
- "step": 3118
- },
- {
- "epoch": 2.3700607902735564,
- "grad_norm": 1.830674171447754,
- "learning_rate": 5.80408861173507e-07,
- "loss": 0.2692085802555084,
- "mean_token_accuracy": 0.9287421107292175,
- "num_tokens": 27905261.0,
- "step": 3119
- },
- {
- "epoch": 2.370820668693009,
- "grad_norm": 2.2477660179138184,
- "learning_rate": 5.790677693029217e-07,
- "loss": 0.32119685411453247,
- "mean_token_accuracy": 0.8751975297927856,
- "num_tokens": 27911581.0,
- "step": 3120
- },
- {
- "epoch": 2.371580547112462,
- "grad_norm": 2.3288302421569824,
- "learning_rate": 5.777280256165218e-07,
- "loss": 0.34133443236351013,
- "mean_token_accuracy": 0.8763091564178467,
- "num_tokens": 27918603.0,
- "step": 3121
- },
- {
- "epoch": 2.372340425531915,
- "grad_norm": 1.595375895500183,
- "learning_rate": 5.763896310545893e-07,
- "loss": 0.30863112211227417,
- "mean_token_accuracy": 0.8858665823936462,
- "num_tokens": 27929892.0,
- "step": 3122
- },
- {
- "epoch": 2.3731003039513676,
- "grad_norm": 2.0553293228149414,
- "learning_rate": 5.750525865564613e-07,
- "loss": 0.28052228689193726,
- "mean_token_accuracy": 0.8970555067062378,
- "num_tokens": 27937532.0,
- "step": 3123
- },
- {
- "epoch": 2.3738601823708207,
- "grad_norm": 1.4700267314910889,
- "learning_rate": 5.737168930605272e-07,
- "loss": 0.27994588017463684,
- "mean_token_accuracy": 0.9026262760162354,
- "num_tokens": 27948679.0,
- "step": 3124
- },
- {
- "epoch": 2.3746200607902734,
- "grad_norm": 3.2083890438079834,
- "learning_rate": 5.723825515042284e-07,
- "loss": 0.1810106784105301,
- "mean_token_accuracy": 0.9297720193862915,
- "num_tokens": 27952090.0,
- "step": 3125
- },
- {
- "epoch": 2.3753799392097266,
- "grad_norm": 1.4345086812973022,
- "learning_rate": 5.710495628240567e-07,
- "loss": 0.2929079830646515,
- "mean_token_accuracy": 0.8950849771499634,
- "num_tokens": 27964959.0,
- "step": 3126
- },
- {
- "epoch": 2.3761398176291793,
- "grad_norm": 2.0222737789154053,
- "learning_rate": 5.697179279555551e-07,
- "loss": 0.41308528184890747,
- "mean_token_accuracy": 0.8616737127304077,
- "num_tokens": 27973803.0,
- "step": 3127
- },
- {
- "epoch": 2.3768996960486324,
- "grad_norm": 1.2820483446121216,
- "learning_rate": 5.683876478333161e-07,
- "loss": 0.4069697856903076,
- "mean_token_accuracy": 0.8547379970550537,
- "num_tokens": 27991576.0,
- "step": 3128
- },
- {
- "epoch": 2.377659574468085,
- "grad_norm": 2.3709049224853516,
- "learning_rate": 5.670587233909819e-07,
- "loss": 0.1923210471868515,
- "mean_token_accuracy": 0.9360835552215576,
- "num_tokens": 27997051.0,
- "step": 3129
- },
- {
- "epoch": 2.378419452887538,
- "grad_norm": 1.874002456665039,
- "learning_rate": 5.657311555612433e-07,
- "loss": 0.431087851524353,
- "mean_token_accuracy": 0.8736472129821777,
- "num_tokens": 28004863.0,
- "step": 3130
- },
- {
- "epoch": 2.379179331306991,
- "grad_norm": 1.0792341232299805,
- "learning_rate": 5.64404945275836e-07,
- "loss": 0.38039785623550415,
- "mean_token_accuracy": 0.8523920178413391,
- "num_tokens": 28027220.0,
- "step": 3131
- },
- {
- "epoch": 2.3799392097264436,
- "grad_norm": 1.7947046756744385,
- "learning_rate": 5.630800934655481e-07,
- "loss": 0.29587826132774353,
- "mean_token_accuracy": 0.8919603824615479,
- "num_tokens": 28035495.0,
- "step": 3132
- },
- {
- "epoch": 2.3806990881458967,
- "grad_norm": 3.4972469806671143,
- "learning_rate": 5.617566010602113e-07,
- "loss": 0.31223949790000916,
- "mean_token_accuracy": 0.895270586013794,
- "num_tokens": 28039135.0,
- "step": 3133
- },
- {
- "epoch": 2.3814589665653494,
- "grad_norm": 2.331387758255005,
- "learning_rate": 5.60434468988702e-07,
- "loss": 0.30856233835220337,
- "mean_token_accuracy": 0.8810996413230896,
- "num_tokens": 28045572.0,
- "step": 3134
- },
- {
- "epoch": 2.3822188449848025,
- "grad_norm": 1.9918609857559204,
- "learning_rate": 5.591136981789439e-07,
- "loss": 0.3031975328922272,
- "mean_token_accuracy": 0.9028782844543457,
- "num_tokens": 28051851.0,
- "step": 3135
- },
- {
- "epoch": 2.382978723404255,
- "grad_norm": 1.6089690923690796,
- "learning_rate": 5.577942895579064e-07,
- "loss": 0.34390494227409363,
- "mean_token_accuracy": 0.8744557499885559,
- "num_tokens": 28062705.0,
- "step": 3136
- },
- {
- "epoch": 2.3837386018237083,
- "grad_norm": 1.4829623699188232,
- "learning_rate": 5.564762440515994e-07,
- "loss": 0.3172723650932312,
- "mean_token_accuracy": 0.9192344546318054,
- "num_tokens": 28073539.0,
- "step": 3137
- },
- {
- "epoch": 2.384498480243161,
- "grad_norm": 1.4833530187606812,
- "learning_rate": 5.551595625850786e-07,
- "loss": 0.3714778423309326,
- "mean_token_accuracy": 0.8697570562362671,
- "num_tokens": 28085949.0,
- "step": 3138
- },
- {
- "epoch": 2.385258358662614,
- "grad_norm": 3.140885829925537,
- "learning_rate": 5.538442460824417e-07,
- "loss": 0.3266214430332184,
- "mean_token_accuracy": 0.9124236702919006,
- "num_tokens": 28090639.0,
- "step": 3139
- },
- {
- "epoch": 2.386018237082067,
- "grad_norm": 1.731658935546875,
- "learning_rate": 5.525302954668285e-07,
- "loss": 0.21903038024902344,
- "mean_token_accuracy": 0.9181338548660278,
- "num_tokens": 28099076.0,
- "step": 3140
- },
- {
- "epoch": 2.38677811550152,
- "grad_norm": 1.2315683364868164,
- "learning_rate": 5.5121771166042e-07,
- "loss": 0.25057584047317505,
- "mean_token_accuracy": 0.9130429029464722,
- "num_tokens": 28113532.0,
- "step": 3141
- },
- {
- "epoch": 2.3875379939209727,
- "grad_norm": 3.888575553894043,
- "learning_rate": 5.499064955844383e-07,
- "loss": 0.173577219247818,
- "mean_token_accuracy": 0.9388964176177979,
- "num_tokens": 28116683.0,
- "step": 3142
- },
- {
- "epoch": 2.3882978723404253,
- "grad_norm": 1.4791816473007202,
- "learning_rate": 5.48596648159145e-07,
- "loss": 0.38739481568336487,
- "mean_token_accuracy": 0.9086727499961853,
- "num_tokens": 28129363.0,
- "step": 3143
- },
- {
- "epoch": 2.3890577507598785,
- "grad_norm": 2.1314213275909424,
- "learning_rate": 5.472881703038418e-07,
- "loss": 0.3724244236946106,
- "mean_token_accuracy": 0.8749525547027588,
- "num_tokens": 28136421.0,
- "step": 3144
- },
- {
- "epoch": 2.389817629179331,
- "grad_norm": 2.4120898246765137,
- "learning_rate": 5.459810629368692e-07,
- "loss": 0.36195144057273865,
- "mean_token_accuracy": 0.869860053062439,
- "num_tokens": 28143903.0,
- "step": 3145
- },
- {
- "epoch": 2.3905775075987843,
- "grad_norm": 1.7327654361724854,
- "learning_rate": 5.446753269756036e-07,
- "loss": 0.3846886157989502,
- "mean_token_accuracy": 0.859398603439331,
- "num_tokens": 28155403.0,
- "step": 3146
- },
- {
- "epoch": 2.391337386018237,
- "grad_norm": 1.2435929775238037,
- "learning_rate": 5.433709633364637e-07,
- "loss": 0.36000579595565796,
- "mean_token_accuracy": 0.8722110986709595,
- "num_tokens": 28171739.0,
- "step": 3147
- },
- {
- "epoch": 2.39209726443769,
- "grad_norm": 1.746272325515747,
- "learning_rate": 5.420679729348993e-07,
- "loss": 0.36778098344802856,
- "mean_token_accuracy": 0.8639050722122192,
- "num_tokens": 28182326.0,
- "step": 3148
- },
- {
- "epoch": 2.392857142857143,
- "grad_norm": 2.0103561878204346,
- "learning_rate": 5.407663566854008e-07,
- "loss": 0.3921544551849365,
- "mean_token_accuracy": 0.8679144382476807,
- "num_tokens": 28191456.0,
- "step": 3149
- },
- {
- "epoch": 2.393617021276596,
- "grad_norm": 1.792054533958435,
- "learning_rate": 5.394661155014921e-07,
- "loss": 0.4300078749656677,
- "mean_token_accuracy": 0.8496290445327759,
- "num_tokens": 28201943.0,
- "step": 3150
- },
- {
- "epoch": 2.3943768996960486,
- "grad_norm": 1.1109238862991333,
- "learning_rate": 5.381672502957324e-07,
- "loss": 0.3262210190296173,
- "mean_token_accuracy": 0.8634629845619202,
- "num_tokens": 28221353.0,
- "step": 3151
- },
- {
- "epoch": 2.3951367781155017,
- "grad_norm": 1.855241060256958,
- "learning_rate": 5.368697619797159e-07,
- "loss": 0.3076592981815338,
- "mean_token_accuracy": 0.9093140959739685,
- "num_tokens": 28229172.0,
- "step": 3152
- },
- {
- "epoch": 2.3958966565349544,
- "grad_norm": 2.416808605194092,
- "learning_rate": 5.355736514640697e-07,
- "loss": 0.27811431884765625,
- "mean_token_accuracy": 0.9024926424026489,
- "num_tokens": 28234877.0,
- "step": 3153
- },
- {
- "epoch": 2.396656534954407,
- "grad_norm": 1.6434770822525024,
- "learning_rate": 5.342789196584527e-07,
- "loss": 0.43254753947257996,
- "mean_token_accuracy": 0.8404601812362671,
- "num_tokens": 28245905.0,
- "step": 3154
- },
- {
- "epoch": 2.3974164133738602,
- "grad_norm": 2.4053826332092285,
- "learning_rate": 5.329855674715592e-07,
- "loss": 0.3984904885292053,
- "mean_token_accuracy": 0.8764510154724121,
- "num_tokens": 28251558.0,
- "step": 3155
- },
- {
- "epoch": 2.398176291793313,
- "grad_norm": 1.60322904586792,
- "learning_rate": 5.316935958111139e-07,
- "loss": 0.34025734663009644,
- "mean_token_accuracy": 0.8753441572189331,
- "num_tokens": 28261596.0,
- "step": 3156
- },
- {
- "epoch": 2.398936170212766,
- "grad_norm": 1.5645020008087158,
- "learning_rate": 5.304030055838704e-07,
- "loss": 0.35805732011795044,
- "mean_token_accuracy": 0.8628225922584534,
- "num_tokens": 28272233.0,
- "step": 3157
- },
- {
- "epoch": 2.3996960486322187,
- "grad_norm": 2.0708835124969482,
- "learning_rate": 5.291137976956148e-07,
- "loss": 0.35056009888648987,
- "mean_token_accuracy": 0.8771238923072815,
- "num_tokens": 28279905.0,
- "step": 3158
- },
- {
- "epoch": 2.400455927051672,
- "grad_norm": 1.9882023334503174,
- "learning_rate": 5.278259730511651e-07,
- "loss": 0.30454230308532715,
- "mean_token_accuracy": 0.883628249168396,
- "num_tokens": 28287183.0,
- "step": 3159
- },
- {
- "epoch": 2.4012158054711246,
- "grad_norm": 2.3435161113739014,
- "learning_rate": 5.26539532554364e-07,
- "loss": 0.262816458940506,
- "mean_token_accuracy": 0.8924182653427124,
- "num_tokens": 28293816.0,
- "step": 3160
- },
- {
- "epoch": 2.4019756838905777,
- "grad_norm": 1.5700311660766602,
- "learning_rate": 5.252544771080853e-07,
- "loss": 0.43474194407463074,
- "mean_token_accuracy": 0.8594561815261841,
- "num_tokens": 28306346.0,
- "step": 3161
- },
- {
- "epoch": 2.4027355623100304,
- "grad_norm": 1.8969467878341675,
- "learning_rate": 5.239708076142311e-07,
- "loss": 0.309972882270813,
- "mean_token_accuracy": 0.8846274614334106,
- "num_tokens": 28314843.0,
- "step": 3162
- },
- {
- "epoch": 2.4034954407294835,
- "grad_norm": 2.2149617671966553,
- "learning_rate": 5.226885249737292e-07,
- "loss": 0.40023672580718994,
- "mean_token_accuracy": 0.8641965389251709,
- "num_tokens": 28322124.0,
- "step": 3163
- },
- {
- "epoch": 2.404255319148936,
- "grad_norm": 1.3280621767044067,
- "learning_rate": 5.214076300865359e-07,
- "loss": 0.31123194098472595,
- "mean_token_accuracy": 0.8883715271949768,
- "num_tokens": 28336490.0,
- "step": 3164
- },
- {
- "epoch": 2.405015197568389,
- "grad_norm": 1.402884602546692,
- "learning_rate": 5.201281238516318e-07,
- "loss": 0.2590488791465759,
- "mean_token_accuracy": 0.9011414051055908,
- "num_tokens": 28349094.0,
- "step": 3165
- },
- {
- "epoch": 2.405775075987842,
- "grad_norm": 1.6564174890518188,
- "learning_rate": 5.188500071670235e-07,
- "loss": 0.23672837018966675,
- "mean_token_accuracy": 0.9133221507072449,
- "num_tokens": 28357665.0,
- "step": 3166
- },
- {
- "epoch": 2.4065349544072947,
- "grad_norm": 1.9133414030075073,
- "learning_rate": 5.175732809297435e-07,
- "loss": 0.40488386154174805,
- "mean_token_accuracy": 0.8534098863601685,
- "num_tokens": 28366519.0,
- "step": 3167
- },
- {
- "epoch": 2.407294832826748,
- "grad_norm": 1.447898268699646,
- "learning_rate": 5.16297946035847e-07,
- "loss": 0.3679184913635254,
- "mean_token_accuracy": 0.8696858882904053,
- "num_tokens": 28379315.0,
- "step": 3168
- },
- {
- "epoch": 2.4080547112462005,
- "grad_norm": 3.454120397567749,
- "learning_rate": 5.150240033804116e-07,
- "loss": 0.23210272192955017,
- "mean_token_accuracy": 0.9179670214653015,
- "num_tokens": 28382844.0,
- "step": 3169
- },
- {
- "epoch": 2.4088145896656536,
- "grad_norm": 1.7603836059570312,
- "learning_rate": 5.137514538575419e-07,
- "loss": 0.4491140842437744,
- "mean_token_accuracy": 0.8472066521644592,
- "num_tokens": 28394064.0,
- "step": 3170
- },
- {
- "epoch": 2.4095744680851063,
- "grad_norm": 1.3338149785995483,
- "learning_rate": 5.124802983603602e-07,
- "loss": 0.3237353563308716,
- "mean_token_accuracy": 0.8897873163223267,
- "num_tokens": 28410190.0,
- "step": 3171
- },
- {
- "epoch": 2.410334346504559,
- "grad_norm": 2.6191205978393555,
- "learning_rate": 5.112105377810128e-07,
- "loss": 0.3119543194770813,
- "mean_token_accuracy": 0.889589250087738,
- "num_tokens": 28414838.0,
- "step": 3172
- },
- {
- "epoch": 2.411094224924012,
- "grad_norm": 2.583130121231079,
- "learning_rate": 5.099421730106669e-07,
- "loss": 0.2616881728172302,
- "mean_token_accuracy": 0.9155426621437073,
- "num_tokens": 28419792.0,
- "step": 3173
- },
- {
- "epoch": 2.4118541033434653,
- "grad_norm": 2.875683307647705,
- "learning_rate": 5.086752049395094e-07,
- "loss": 0.2567689120769501,
- "mean_token_accuracy": 0.9075877666473389,
- "num_tokens": 28424069.0,
- "step": 3174
- },
- {
- "epoch": 2.412613981762918,
- "grad_norm": 1.695042371749878,
- "learning_rate": 5.074096344567475e-07,
- "loss": 0.3164510130882263,
- "mean_token_accuracy": 0.8845095634460449,
- "num_tokens": 28433279.0,
- "step": 3175
- },
- {
- "epoch": 2.4133738601823707,
- "grad_norm": 2.110863447189331,
- "learning_rate": 5.061454624506074e-07,
- "loss": 0.22680208086967468,
- "mean_token_accuracy": 0.9221781492233276,
- "num_tokens": 28439569.0,
- "step": 3176
- },
- {
- "epoch": 2.414133738601824,
- "grad_norm": 2.030958890914917,
- "learning_rate": 5.048826898083331e-07,
- "loss": 0.3482169210910797,
- "mean_token_accuracy": 0.8853809833526611,
- "num_tokens": 28447203.0,
- "step": 3177
- },
- {
- "epoch": 2.4148936170212765,
- "grad_norm": 1.9921014308929443,
- "learning_rate": 5.036213174161877e-07,
- "loss": 0.29343554377555847,
- "mean_token_accuracy": 0.893486499786377,
- "num_tokens": 28454923.0,
- "step": 3178
- },
- {
- "epoch": 2.4156534954407296,
- "grad_norm": 4.079009532928467,
- "learning_rate": 5.023613461594512e-07,
- "loss": 0.2569321095943451,
- "mean_token_accuracy": 0.9205472469329834,
- "num_tokens": 28458173.0,
- "step": 3179
- },
- {
- "epoch": 2.4164133738601823,
- "grad_norm": 3.077458143234253,
- "learning_rate": 5.01102776922418e-07,
- "loss": 0.3203810453414917,
- "mean_token_accuracy": 0.8863208293914795,
- "num_tokens": 28462449.0,
- "step": 3180
- },
- {
- "epoch": 2.4171732522796354,
- "grad_norm": 2.4658167362213135,
- "learning_rate": 4.998456105884025e-07,
- "loss": 0.33045345544815063,
- "mean_token_accuracy": 0.8856333494186401,
- "num_tokens": 28468051.0,
- "step": 3181
- },
- {
- "epoch": 2.417933130699088,
- "grad_norm": 2.053370714187622,
- "learning_rate": 4.985898480397322e-07,
- "loss": 0.2415514886379242,
- "mean_token_accuracy": 0.9296282529830933,
- "num_tokens": 28473839.0,
- "step": 3182
- },
- {
- "epoch": 2.418693009118541,
- "grad_norm": 2.705026149749756,
- "learning_rate": 4.973354901577487e-07,
- "loss": 0.3233085870742798,
- "mean_token_accuracy": 0.8820867538452148,
- "num_tokens": 28479419.0,
- "step": 3183
- },
- {
- "epoch": 2.419452887537994,
- "grad_norm": 2.1648733615875244,
- "learning_rate": 4.960825378228082e-07,
- "loss": 0.25225499272346497,
- "mean_token_accuracy": 0.9170141220092773,
- "num_tokens": 28484968.0,
- "step": 3184
- },
- {
- "epoch": 2.420212765957447,
- "grad_norm": 1.8317075967788696,
- "learning_rate": 4.948309919142832e-07,
- "loss": 0.3143184781074524,
- "mean_token_accuracy": 0.8824752569198608,
- "num_tokens": 28492904.0,
- "step": 3185
- },
- {
- "epoch": 2.4209726443768997,
- "grad_norm": 2.591052770614624,
- "learning_rate": 4.935808533105546e-07,
- "loss": 0.31191521883010864,
- "mean_token_accuracy": 0.9041938185691833,
- "num_tokens": 28498136.0,
- "step": 3186
- },
- {
- "epoch": 2.4217325227963524,
- "grad_norm": 2.200559139251709,
- "learning_rate": 4.923321228890184e-07,
- "loss": 0.23661679029464722,
- "mean_token_accuracy": 0.9179906845092773,
- "num_tokens": 28504246.0,
- "step": 3187
- },
- {
- "epoch": 2.4224924012158056,
- "grad_norm": 1.6311591863632202,
- "learning_rate": 4.910848015260822e-07,
- "loss": 0.35421687364578247,
- "mean_token_accuracy": 0.8728591799736023,
- "num_tokens": 28515481.0,
- "step": 3188
- },
- {
- "epoch": 2.4232522796352582,
- "grad_norm": 2.1564102172851562,
- "learning_rate": 4.898388900971635e-07,
- "loss": 0.30809515714645386,
- "mean_token_accuracy": 0.8960262537002563,
- "num_tokens": 28521294.0,
- "step": 3189
- },
- {
- "epoch": 2.4240121580547114,
- "grad_norm": 2.1413958072662354,
- "learning_rate": 4.885943894766909e-07,
- "loss": 0.217842698097229,
- "mean_token_accuracy": 0.9408326745033264,
- "num_tokens": 28527104.0,
- "step": 3190
- },
- {
- "epoch": 2.424772036474164,
- "grad_norm": 2.373764991760254,
- "learning_rate": 4.873513005381042e-07,
- "loss": 0.33814239501953125,
- "mean_token_accuracy": 0.9007177352905273,
- "num_tokens": 28533654.0,
- "step": 3191
- },
- {
- "epoch": 2.425531914893617,
- "grad_norm": 1.8809123039245605,
- "learning_rate": 4.861096241538483e-07,
- "loss": 0.4467903971672058,
- "mean_token_accuracy": 0.8424190282821655,
- "num_tokens": 28543667.0,
- "step": 3192
- },
- {
- "epoch": 2.42629179331307,
- "grad_norm": 1.4945175647735596,
- "learning_rate": 4.848693611953825e-07,
- "loss": 0.3123834729194641,
- "mean_token_accuracy": 0.9072253704071045,
- "num_tokens": 28554944.0,
- "step": 3193
- },
- {
- "epoch": 2.4270516717325226,
- "grad_norm": 1.8136200904846191,
- "learning_rate": 4.836305125331695e-07,
- "loss": 0.27221372723579407,
- "mean_token_accuracy": 0.9039586782455444,
- "num_tokens": 28563082.0,
- "step": 3194
- },
- {
- "epoch": 2.4278115501519757,
- "grad_norm": 4.269916534423828,
- "learning_rate": 4.823930790366801e-07,
- "loss": 0.2660295069217682,
- "mean_token_accuracy": 0.9072147607803345,
- "num_tokens": 28566246.0,
- "step": 3195
- },
- {
- "epoch": 2.4285714285714284,
- "grad_norm": 2.273453950881958,
- "learning_rate": 4.811570615743952e-07,
- "loss": 0.27304959297180176,
- "mean_token_accuracy": 0.9012454748153687,
- "num_tokens": 28572906.0,
- "step": 3196
- },
- {
- "epoch": 2.4293313069908815,
- "grad_norm": 2.0931496620178223,
- "learning_rate": 4.799224610137975e-07,
- "loss": 0.2358006238937378,
- "mean_token_accuracy": 0.9263845682144165,
- "num_tokens": 28579148.0,
- "step": 3197
- },
- {
- "epoch": 2.430091185410334,
- "grad_norm": 1.885201334953308,
- "learning_rate": 4.786892782213781e-07,
- "loss": 0.346000611782074,
- "mean_token_accuracy": 0.8616824150085449,
- "num_tokens": 28587361.0,
- "step": 3198
- },
- {
- "epoch": 2.4308510638297873,
- "grad_norm": 2.229367971420288,
- "learning_rate": 4.774575140626317e-07,
- "loss": 0.2951638400554657,
- "mean_token_accuracy": 0.8938933610916138,
- "num_tokens": 28593622.0,
- "step": 3199
- },
- {
- "epoch": 2.43161094224924,
- "grad_norm": 2.681004762649536,
- "learning_rate": 4.7622716940205787e-07,
- "loss": 0.2588275671005249,
- "mean_token_accuracy": 0.9101524353027344,
- "num_tokens": 28597890.0,
- "step": 3200
- },
- {
- "epoch": 2.432370820668693,
- "grad_norm": 1.8040683269500732,
- "learning_rate": 4.7499824510316013e-07,
- "loss": 0.3194184899330139,
- "mean_token_accuracy": 0.906498908996582,
- "num_tokens": 28606885.0,
- "step": 3201
- },
- {
- "epoch": 2.433130699088146,
- "grad_norm": 3.4185421466827393,
- "learning_rate": 4.7377074202844514e-07,
- "loss": 0.4457589387893677,
- "mean_token_accuracy": 0.8387380838394165,
- "num_tokens": 28611709.0,
- "step": 3202
- },
- {
- "epoch": 2.433890577507599,
- "grad_norm": 2.6594674587249756,
- "learning_rate": 4.7254466103941995e-07,
- "loss": 0.3260703384876251,
- "mean_token_accuracy": 0.9050778150558472,
- "num_tokens": 28616931.0,
- "step": 3203
- },
- {
- "epoch": 2.4346504559270516,
- "grad_norm": 1.999886393547058,
- "learning_rate": 4.713200029965978e-07,
- "loss": 0.2933492660522461,
- "mean_token_accuracy": 0.900344729423523,
- "num_tokens": 28624034.0,
- "step": 3204
- },
- {
- "epoch": 2.4354103343465043,
- "grad_norm": 1.932508111000061,
- "learning_rate": 4.700967687594901e-07,
- "loss": 0.29114463925361633,
- "mean_token_accuracy": 0.9247289896011353,
- "num_tokens": 28633528.0,
- "step": 3205
- },
- {
- "epoch": 2.4361702127659575,
- "grad_norm": 4.55303430557251,
- "learning_rate": 4.68874959186609e-07,
- "loss": 0.2936939597129822,
- "mean_token_accuracy": 0.9006574153900146,
- "num_tokens": 28636289.0,
- "step": 3206
- },
- {
- "epoch": 2.43693009118541,
- "grad_norm": 2.02156400680542,
- "learning_rate": 4.6765457513546747e-07,
- "loss": 0.3098263740539551,
- "mean_token_accuracy": 0.8965007066726685,
- "num_tokens": 28643491.0,
- "step": 3207
- },
- {
- "epoch": 2.4376899696048633,
- "grad_norm": 2.018125295639038,
- "learning_rate": 4.664356174625795e-07,
- "loss": 0.4749948978424072,
- "mean_token_accuracy": 0.8366118669509888,
- "num_tokens": 28654136.0,
- "step": 3208
- },
- {
- "epoch": 2.438449848024316,
- "grad_norm": 2.0175318717956543,
- "learning_rate": 4.6521808702345516e-07,
- "loss": 0.31277763843536377,
- "mean_token_accuracy": 0.8878506422042847,
- "num_tokens": 28662363.0,
- "step": 3209
- },
- {
- "epoch": 2.439209726443769,
- "grad_norm": 1.37982177734375,
- "learning_rate": 4.640019846726043e-07,
- "loss": 0.3872165083885193,
- "mean_token_accuracy": 0.8586703538894653,
- "num_tokens": 28681637.0,
- "step": 3210
- },
- {
- "epoch": 2.439969604863222,
- "grad_norm": 2.265124559402466,
- "learning_rate": 4.6278731126353447e-07,
- "loss": 0.20262989401817322,
- "mean_token_accuracy": 0.9290857315063477,
- "num_tokens": 28687032.0,
- "step": 3211
- },
- {
- "epoch": 2.440729483282675,
- "grad_norm": 1.730516791343689,
- "learning_rate": 4.615740676487507e-07,
- "loss": 0.21819885075092316,
- "mean_token_accuracy": 0.9351010322570801,
- "num_tokens": 28694692.0,
- "step": 3212
- },
- {
- "epoch": 2.4414893617021276,
- "grad_norm": 2.297240972518921,
- "learning_rate": 4.603622546797534e-07,
- "loss": 0.34703850746154785,
- "mean_token_accuracy": 0.8764227628707886,
- "num_tokens": 28700838.0,
- "step": 3213
- },
- {
- "epoch": 2.4422492401215807,
- "grad_norm": 1.3174461126327515,
- "learning_rate": 4.591518732070402e-07,
- "loss": 0.27869975566864014,
- "mean_token_accuracy": 0.8975766897201538,
- "num_tokens": 28715114.0,
- "step": 3214
- },
- {
- "epoch": 2.4430091185410334,
- "grad_norm": 1.8751143217086792,
- "learning_rate": 4.5794292408010285e-07,
- "loss": 0.4260019361972809,
- "mean_token_accuracy": 0.8564238548278809,
- "num_tokens": 28724176.0,
- "step": 3215
- },
- {
- "epoch": 2.443768996960486,
- "grad_norm": 2.095414161682129,
- "learning_rate": 4.5673540814742875e-07,
- "loss": 0.2791098952293396,
- "mean_token_accuracy": 0.896371603012085,
- "num_tokens": 28730815.0,
- "step": 3216
- },
- {
- "epoch": 2.4445288753799392,
- "grad_norm": 1.470991611480713,
- "learning_rate": 4.555293262564994e-07,
- "loss": 0.3128473162651062,
- "mean_token_accuracy": 0.8857797980308533,
- "num_tokens": 28743271.0,
- "step": 3217
- },
- {
- "epoch": 2.445288753799392,
- "grad_norm": 1.8783953189849854,
- "learning_rate": 4.5432467925378784e-07,
- "loss": 0.24838949739933014,
- "mean_token_accuracy": 0.9119431972503662,
- "num_tokens": 28751291.0,
- "step": 3218
- },
- {
- "epoch": 2.446048632218845,
- "grad_norm": 2.134469747543335,
- "learning_rate": 4.53121467984764e-07,
- "loss": 0.390994668006897,
- "mean_token_accuracy": 0.8823093175888062,
- "num_tokens": 28758262.0,
- "step": 3219
- },
- {
- "epoch": 2.4468085106382977,
- "grad_norm": 1.369758129119873,
- "learning_rate": 4.5191969329388627e-07,
- "loss": 0.33717092871665955,
- "mean_token_accuracy": 0.8909138441085815,
- "num_tokens": 28770330.0,
- "step": 3220
- },
- {
- "epoch": 2.447568389057751,
- "grad_norm": 1.3363337516784668,
- "learning_rate": 4.5071935602460704e-07,
- "loss": 0.41521191596984863,
- "mean_token_accuracy": 0.8482609987258911,
- "num_tokens": 28788148.0,
- "step": 3221
- },
- {
- "epoch": 2.4483282674772036,
- "grad_norm": 1.5309195518493652,
- "learning_rate": 4.495204570193687e-07,
- "loss": 0.23737329244613647,
- "mean_token_accuracy": 0.9094061851501465,
- "num_tokens": 28798150.0,
- "step": 3222
- },
- {
- "epoch": 2.4490881458966567,
- "grad_norm": 1.5956720113754272,
- "learning_rate": 4.483229971196054e-07,
- "loss": 0.24943354725837708,
- "mean_token_accuracy": 0.9051728248596191,
- "num_tokens": 28808116.0,
- "step": 3223
- },
- {
- "epoch": 2.4498480243161094,
- "grad_norm": 1.515918254852295,
- "learning_rate": 4.4712697716573994e-07,
- "loss": 0.3883020281791687,
- "mean_token_accuracy": 0.8599046468734741,
- "num_tokens": 28822835.0,
- "step": 3224
- },
- {
- "epoch": 2.4506079027355625,
- "grad_norm": 1.3584989309310913,
- "learning_rate": 4.4593239799718636e-07,
- "loss": 0.33565959334373474,
- "mean_token_accuracy": 0.8725172281265259,
- "num_tokens": 28841697.0,
- "step": 3225
- },
- {
- "epoch": 2.451367781155015,
- "grad_norm": 2.800762414932251,
- "learning_rate": 4.447392604523443e-07,
- "loss": 0.36243852972984314,
- "mean_token_accuracy": 0.8881685733795166,
- "num_tokens": 28847164.0,
- "step": 3226
- },
- {
- "epoch": 2.452127659574468,
- "grad_norm": 1.3506053686141968,
- "learning_rate": 4.43547565368605e-07,
- "loss": 0.21717754006385803,
- "mean_token_accuracy": 0.9296318292617798,
- "num_tokens": 28858658.0,
- "step": 3227
- },
- {
- "epoch": 2.452887537993921,
- "grad_norm": 2.094951868057251,
- "learning_rate": 4.423573135823464e-07,
- "loss": 0.3554617762565613,
- "mean_token_accuracy": 0.8762428760528564,
- "num_tokens": 28866509.0,
- "step": 3228
- },
- {
- "epoch": 2.4536474164133737,
- "grad_norm": 1.4730854034423828,
- "learning_rate": 4.411685059289314e-07,
- "loss": 0.2805292010307312,
- "mean_token_accuracy": 0.9004697799682617,
- "num_tokens": 28878151.0,
- "step": 3229
- },
- {
- "epoch": 2.454407294832827,
- "grad_norm": 2.1443302631378174,
- "learning_rate": 4.399811432427123e-07,
- "loss": 0.3829796314239502,
- "mean_token_accuracy": 0.866457462310791,
- "num_tokens": 28886050.0,
- "step": 3230
- },
- {
- "epoch": 2.4551671732522795,
- "grad_norm": 3.437201738357544,
- "learning_rate": 4.387952263570261e-07,
- "loss": 0.18470892310142517,
- "mean_token_accuracy": 0.9365379810333252,
- "num_tokens": 28889484.0,
- "step": 3231
- },
- {
- "epoch": 2.4559270516717326,
- "grad_norm": 2.6203434467315674,
- "learning_rate": 4.376107561041937e-07,
- "loss": 0.25328633189201355,
- "mean_token_accuracy": 0.921377956867218,
- "num_tokens": 28893972.0,
- "step": 3232
- },
- {
- "epoch": 2.4566869300911853,
- "grad_norm": 2.4467883110046387,
- "learning_rate": 4.3642773331552203e-07,
- "loss": 0.2748469412326813,
- "mean_token_accuracy": 0.9046314358711243,
- "num_tokens": 28899118.0,
- "step": 3233
- },
- {
- "epoch": 2.4574468085106385,
- "grad_norm": 1.9845495223999023,
- "learning_rate": 4.352461588213036e-07,
- "loss": 0.443121075630188,
- "mean_token_accuracy": 0.8609750866889954,
- "num_tokens": 28909005.0,
- "step": 3234
- },
- {
- "epoch": 2.458206686930091,
- "grad_norm": 2.8748083114624023,
- "learning_rate": 4.340660334508115e-07,
- "loss": 0.22461901605129242,
- "mean_token_accuracy": 0.916649341583252,
- "num_tokens": 28913366.0,
- "step": 3235
- },
- {
- "epoch": 2.4589665653495443,
- "grad_norm": 1.7406567335128784,
- "learning_rate": 4.328873580323034e-07,
- "loss": 0.4147683382034302,
- "mean_token_accuracy": 0.8523626327514648,
- "num_tokens": 28924695.0,
- "step": 3236
- },
- {
- "epoch": 2.459726443768997,
- "grad_norm": 1.767052412033081,
- "learning_rate": 4.3171013339301905e-07,
- "loss": 0.38994747400283813,
- "mean_token_accuracy": 0.863203227519989,
- "num_tokens": 28935163.0,
- "step": 3237
- },
- {
- "epoch": 2.4604863221884496,
- "grad_norm": 1.2257410287857056,
- "learning_rate": 4.305343603591802e-07,
- "loss": 0.23309440910816193,
- "mean_token_accuracy": 0.9016385674476624,
- "num_tokens": 28948374.0,
- "step": 3238
- },
- {
- "epoch": 2.461246200607903,
- "grad_norm": 1.3017674684524536,
- "learning_rate": 4.293600397559897e-07,
- "loss": 0.2825638949871063,
- "mean_token_accuracy": 0.8953868746757507,
- "num_tokens": 28961695.0,
- "step": 3239
- },
- {
- "epoch": 2.4620060790273555,
- "grad_norm": 1.475160837173462,
- "learning_rate": 4.2818717240763115e-07,
- "loss": 0.30598294734954834,
- "mean_token_accuracy": 0.8774391412734985,
- "num_tokens": 28976399.0,
- "step": 3240
- },
- {
- "epoch": 2.4627659574468086,
- "grad_norm": 2.1078310012817383,
- "learning_rate": 4.2701575913726644e-07,
- "loss": 0.4696943759918213,
- "mean_token_accuracy": 0.8566044569015503,
- "num_tokens": 28985515.0,
- "step": 3241
- },
- {
- "epoch": 2.4635258358662613,
- "grad_norm": 2.587887763977051,
- "learning_rate": 4.258458007670413e-07,
- "loss": 0.32537540793418884,
- "mean_token_accuracy": 0.8889709711074829,
- "num_tokens": 28990365.0,
- "step": 3242
- },
- {
- "epoch": 2.4642857142857144,
- "grad_norm": 1.622995138168335,
- "learning_rate": 4.2467729811807497e-07,
- "loss": 0.47171884775161743,
- "mean_token_accuracy": 0.8305673599243164,
- "num_tokens": 29002644.0,
- "step": 3243
- },
- {
- "epoch": 2.465045592705167,
- "grad_norm": 2.0702009201049805,
- "learning_rate": 4.235102520104681e-07,
- "loss": 0.45754289627075195,
- "mean_token_accuracy": 0.8536194562911987,
- "num_tokens": 29011325.0,
- "step": 3244
- },
- {
- "epoch": 2.4658054711246202,
- "grad_norm": 1.4394203424453735,
- "learning_rate": 4.2234466326330023e-07,
- "loss": 0.36623480916023254,
- "mean_token_accuracy": 0.8834698796272278,
- "num_tokens": 29028440.0,
- "step": 3245
- },
- {
- "epoch": 2.466565349544073,
- "grad_norm": 1.6777557134628296,
- "learning_rate": 4.211805326946247e-07,
- "loss": 0.19617480039596558,
- "mean_token_accuracy": 0.920343279838562,
- "num_tokens": 29035936.0,
- "step": 3246
- },
- {
- "epoch": 2.467325227963526,
- "grad_norm": 1.7396641969680786,
- "learning_rate": 4.200178611214736e-07,
- "loss": 0.3978565037250519,
- "mean_token_accuracy": 0.8532278537750244,
- "num_tokens": 29046734.0,
- "step": 3247
- },
- {
- "epoch": 2.4680851063829787,
- "grad_norm": 2.9263904094696045,
- "learning_rate": 4.18856649359855e-07,
- "loss": 0.24883142113685608,
- "mean_token_accuracy": 0.9077831506729126,
- "num_tokens": 29051879.0,
- "step": 3248
- },
- {
- "epoch": 2.4688449848024314,
- "grad_norm": 2.637763500213623,
- "learning_rate": 4.1769689822475147e-07,
- "loss": 0.3370334506034851,
- "mean_token_accuracy": 0.8828175067901611,
- "num_tokens": 29057684.0,
- "step": 3249
- },
- {
- "epoch": 2.4696048632218845,
- "grad_norm": 1.768539309501648,
- "learning_rate": 4.165386085301212e-07,
- "loss": 0.32484760880470276,
- "mean_token_accuracy": 0.8829447031021118,
- "num_tokens": 29066105.0,
- "step": 3250
- },
- {
- "epoch": 2.4703647416413372,
- "grad_norm": 1.4335054159164429,
- "learning_rate": 4.1538178108889717e-07,
- "loss": 0.442533940076828,
- "mean_token_accuracy": 0.846094012260437,
- "num_tokens": 29082385.0,
- "step": 3251
- },
- {
- "epoch": 2.4711246200607904,
- "grad_norm": 2.007174491882324,
- "learning_rate": 4.1422641671298336e-07,
- "loss": 0.2856018841266632,
- "mean_token_accuracy": 0.9205893278121948,
- "num_tokens": 29089022.0,
- "step": 3252
- },
- {
- "epoch": 2.471884498480243,
- "grad_norm": 2.225895404815674,
- "learning_rate": 4.1307251621326124e-07,
- "loss": 0.17259414494037628,
- "mean_token_accuracy": 0.9244140386581421,
- "num_tokens": 29094176.0,
- "step": 3253
- },
- {
- "epoch": 2.472644376899696,
- "grad_norm": 2.6121842861175537,
- "learning_rate": 4.1192008039958236e-07,
- "loss": 0.37352171540260315,
- "mean_token_accuracy": 0.8913992643356323,
- "num_tokens": 29099565.0,
- "step": 3254
- },
- {
- "epoch": 2.473404255319149,
- "grad_norm": 1.5645455121994019,
- "learning_rate": 4.1076911008076895e-07,
- "loss": 0.37237828969955444,
- "mean_token_accuracy": 0.879361629486084,
- "num_tokens": 29112039.0,
- "step": 3255
- },
- {
- "epoch": 2.474164133738602,
- "grad_norm": 3.144536018371582,
- "learning_rate": 4.096196060646168e-07,
- "loss": 0.2038595974445343,
- "mean_token_accuracy": 0.9299201369285583,
- "num_tokens": 29115720.0,
- "step": 3256
- },
- {
- "epoch": 2.4749240121580547,
- "grad_norm": 2.679821014404297,
- "learning_rate": 4.0847156915789385e-07,
- "loss": 0.41715145111083984,
- "mean_token_accuracy": 0.862784743309021,
- "num_tokens": 29120944.0,
- "step": 3257
- },
- {
- "epoch": 2.475683890577508,
- "grad_norm": 2.243694305419922,
- "learning_rate": 4.073250001663345e-07,
- "loss": 0.43414735794067383,
- "mean_token_accuracy": 0.8504310250282288,
- "num_tokens": 29128842.0,
- "step": 3258
- },
- {
- "epoch": 2.4764437689969605,
- "grad_norm": 2.636111259460449,
- "learning_rate": 4.061798998946459e-07,
- "loss": 0.2401021122932434,
- "mean_token_accuracy": 0.910351037979126,
- "num_tokens": 29133769.0,
- "step": 3259
- },
- {
- "epoch": 2.477203647416413,
- "grad_norm": 2.6116414070129395,
- "learning_rate": 4.050362691465032e-07,
- "loss": 0.3290833830833435,
- "mean_token_accuracy": 0.8770763278007507,
- "num_tokens": 29138639.0,
- "step": 3260
- },
- {
- "epoch": 2.4779635258358663,
- "grad_norm": 2.279324531555176,
- "learning_rate": 4.038941087245507e-07,
- "loss": 0.38752615451812744,
- "mean_token_accuracy": 0.8624980449676514,
- "num_tokens": 29145966.0,
- "step": 3261
- },
- {
- "epoch": 2.478723404255319,
- "grad_norm": 2.2280423641204834,
- "learning_rate": 4.0275341943040057e-07,
- "loss": 0.3724668025970459,
- "mean_token_accuracy": 0.8737661838531494,
- "num_tokens": 29152705.0,
- "step": 3262
- },
- {
- "epoch": 2.479483282674772,
- "grad_norm": 2.030075788497925,
- "learning_rate": 4.0161420206463243e-07,
- "loss": 0.32603174448013306,
- "mean_token_accuracy": 0.8819995522499084,
- "num_tokens": 29159853.0,
- "step": 3263
- },
- {
- "epoch": 2.480243161094225,
- "grad_norm": 2.048346519470215,
- "learning_rate": 4.0047645742679275e-07,
- "loss": 0.3046466112136841,
- "mean_token_accuracy": 0.8898575305938721,
- "num_tokens": 29167744.0,
- "step": 3264
- },
- {
- "epoch": 2.481003039513678,
- "grad_norm": 2.8435800075531006,
- "learning_rate": 3.9934018631539506e-07,
- "loss": 0.3660475015640259,
- "mean_token_accuracy": 0.8754674196243286,
- "num_tokens": 29173234.0,
- "step": 3265
- },
- {
- "epoch": 2.4817629179331306,
- "grad_norm": 1.7785491943359375,
- "learning_rate": 3.982053895279173e-07,
- "loss": 0.39483463764190674,
- "mean_token_accuracy": 0.8613039255142212,
- "num_tokens": 29182555.0,
- "step": 3266
- },
- {
- "epoch": 2.4825227963525838,
- "grad_norm": 2.384479522705078,
- "learning_rate": 3.970720678608034e-07,
- "loss": 0.3536769151687622,
- "mean_token_accuracy": 0.8700416088104248,
- "num_tokens": 29189742.0,
- "step": 3267
- },
- {
- "epoch": 2.4832826747720365,
- "grad_norm": 2.368417978286743,
- "learning_rate": 3.9594022210946355e-07,
- "loss": 0.2937469780445099,
- "mean_token_accuracy": 0.8970743417739868,
- "num_tokens": 29194960.0,
- "step": 3268
- },
- {
- "epoch": 2.4840425531914896,
- "grad_norm": 2.3920481204986572,
- "learning_rate": 3.948098530682695e-07,
- "loss": 0.29564806818962097,
- "mean_token_accuracy": 0.913650393486023,
- "num_tokens": 29200704.0,
- "step": 3269
- },
- {
- "epoch": 2.4848024316109423,
- "grad_norm": 1.1830788850784302,
- "learning_rate": 3.9368096153055783e-07,
- "loss": 0.39095211029052734,
- "mean_token_accuracy": 0.8536444902420044,
- "num_tokens": 29224862.0,
- "step": 3270
- },
- {
- "epoch": 2.485562310030395,
- "grad_norm": 1.4365004301071167,
- "learning_rate": 3.925535482886286e-07,
- "loss": 0.27921199798583984,
- "mean_token_accuracy": 0.8939366936683655,
- "num_tokens": 29237375.0,
- "step": 3271
- },
- {
- "epoch": 2.486322188449848,
- "grad_norm": 2.1955132484436035,
- "learning_rate": 3.9142761413374336e-07,
- "loss": 0.41748374700546265,
- "mean_token_accuracy": 0.8621724843978882,
- "num_tokens": 29244655.0,
- "step": 3272
- },
- {
- "epoch": 2.487082066869301,
- "grad_norm": 2.4120712280273438,
- "learning_rate": 3.90303159856126e-07,
- "loss": 0.2881275415420532,
- "mean_token_accuracy": 0.8962163329124451,
- "num_tokens": 29250350.0,
- "step": 3273
- },
- {
- "epoch": 2.487841945288754,
- "grad_norm": 1.315206527709961,
- "learning_rate": 3.891801862449629e-07,
- "loss": 0.33958539366722107,
- "mean_token_accuracy": 0.8800086379051208,
- "num_tokens": 29264563.0,
- "step": 3274
- },
- {
- "epoch": 2.4886018237082066,
- "grad_norm": 1.9663656949996948,
- "learning_rate": 3.880586940883979e-07,
- "loss": 0.35844963788986206,
- "mean_token_accuracy": 0.8683270215988159,
- "num_tokens": 29273782.0,
- "step": 3275
- },
- {
- "epoch": 2.4893617021276597,
- "grad_norm": 1.4438722133636475,
- "learning_rate": 3.869386841735395e-07,
- "loss": 0.39307960867881775,
- "mean_token_accuracy": 0.8902837038040161,
- "num_tokens": 29288914.0,
- "step": 3276
- },
- {
- "epoch": 2.4901215805471124,
- "grad_norm": 2.779317617416382,
- "learning_rate": 3.8582015728645366e-07,
- "loss": 0.237838476896286,
- "mean_token_accuracy": 0.9132705926895142,
- "num_tokens": 29293218.0,
- "step": 3277
- },
- {
- "epoch": 2.4908814589665655,
- "grad_norm": 1.6183768510818481,
- "learning_rate": 3.8470311421216435e-07,
- "loss": 0.24135810136795044,
- "mean_token_accuracy": 0.9351533055305481,
- "num_tokens": 29301928.0,
- "step": 3278
- },
- {
- "epoch": 2.4916413373860182,
- "grad_norm": 1.6468756198883057,
- "learning_rate": 3.835875557346552e-07,
- "loss": 0.34042105078697205,
- "mean_token_accuracy": 0.8920395374298096,
- "num_tokens": 29313740.0,
- "step": 3279
- },
- {
- "epoch": 2.4924012158054714,
- "grad_norm": 1.6257606744766235,
- "learning_rate": 3.8247348263687035e-07,
- "loss": 0.3479476869106293,
- "mean_token_accuracy": 0.8826069831848145,
- "num_tokens": 29323650.0,
- "step": 3280
- },
- {
- "epoch": 2.493161094224924,
- "grad_norm": 1.7144103050231934,
- "learning_rate": 3.81360895700707e-07,
- "loss": 0.3905973434448242,
- "mean_token_accuracy": 0.8974796533584595,
- "num_tokens": 29333192.0,
- "step": 3281
- },
- {
- "epoch": 2.4939209726443767,
- "grad_norm": 1.3381150960922241,
- "learning_rate": 3.802497957070225e-07,
- "loss": 0.31121304631233215,
- "mean_token_accuracy": 0.8921661376953125,
- "num_tokens": 29348219.0,
- "step": 3282
- },
- {
- "epoch": 2.49468085106383,
- "grad_norm": 1.8036452531814575,
- "learning_rate": 3.7914018343562896e-07,
- "loss": 0.4264541268348694,
- "mean_token_accuracy": 0.8469835519790649,
- "num_tokens": 29359632.0,
- "step": 3283
- },
- {
- "epoch": 2.4954407294832825,
- "grad_norm": 1.7335898876190186,
- "learning_rate": 3.780320596652956e-07,
- "loss": 0.2710324823856354,
- "mean_token_accuracy": 0.9050130844116211,
- "num_tokens": 29368771.0,
- "step": 3284
- },
- {
- "epoch": 2.4962006079027357,
- "grad_norm": 1.5694719552993774,
- "learning_rate": 3.7692542517374615e-07,
- "loss": 0.3114343285560608,
- "mean_token_accuracy": 0.8869681358337402,
- "num_tokens": 29379694.0,
- "step": 3285
- },
- {
- "epoch": 2.4969604863221884,
- "grad_norm": 2.042365074157715,
- "learning_rate": 3.75820280737659e-07,
- "loss": 0.23643970489501953,
- "mean_token_accuracy": 0.9191685318946838,
- "num_tokens": 29385914.0,
- "step": 3286
- },
- {
- "epoch": 2.4977203647416415,
- "grad_norm": 2.2526986598968506,
- "learning_rate": 3.7471662713266744e-07,
- "loss": 0.3166671097278595,
- "mean_token_accuracy": 0.901310384273529,
- "num_tokens": 29392128.0,
- "step": 3287
- },
- {
- "epoch": 2.498480243161094,
- "grad_norm": 1.474029541015625,
- "learning_rate": 3.7361446513335816e-07,
- "loss": 0.4021439552307129,
- "mean_token_accuracy": 0.9001395106315613,
- "num_tokens": 29404742.0,
- "step": 3288
- },
- {
- "epoch": 2.499240121580547,
- "grad_norm": 1.3057628870010376,
- "learning_rate": 3.725137955132707e-07,
- "loss": 0.30949655175209045,
- "mean_token_accuracy": 0.8990561962127686,
- "num_tokens": 29421839.0,
- "step": 3289
- },
- {
- "epoch": 2.5,
- "grad_norm": 1.61989164352417,
- "learning_rate": 3.7141461904489665e-07,
- "loss": 0.3134443163871765,
- "mean_token_accuracy": 0.8906387090682983,
- "num_tokens": 29432127.0,
- "step": 3290
- },
- {
- "epoch": 2.500759878419453,
- "grad_norm": 1.5306038856506348,
- "learning_rate": 3.70316936499682e-07,
- "loss": 0.4017624855041504,
- "mean_token_accuracy": 0.845695436000824,
- "num_tokens": 29444397.0,
- "step": 3291
- },
- {
- "epoch": 2.501519756838906,
- "grad_norm": 1.2971603870391846,
- "learning_rate": 3.6922074864802095e-07,
- "loss": 0.4591655135154724,
- "mean_token_accuracy": 0.8666995763778687,
- "num_tokens": 29461121.0,
- "step": 3292
- },
- {
- "epoch": 2.5022796352583585,
- "grad_norm": 1.9822273254394531,
- "learning_rate": 3.681260562592609e-07,
- "loss": 0.3666776716709137,
- "mean_token_accuracy": 0.8733338117599487,
- "num_tokens": 29469211.0,
- "step": 3293
- },
- {
- "epoch": 2.5030395136778116,
- "grad_norm": 2.331378936767578,
- "learning_rate": 3.670328601016995e-07,
- "loss": 0.3511161506175995,
- "mean_token_accuracy": 0.8734879493713379,
- "num_tokens": 29475473.0,
- "step": 3294
- },
- {
- "epoch": 2.5037993920972643,
- "grad_norm": 1.2138792276382446,
- "learning_rate": 3.659411609425834e-07,
- "loss": 0.2819535732269287,
- "mean_token_accuracy": 0.9210860729217529,
- "num_tokens": 29492447.0,
- "step": 3295
- },
- {
- "epoch": 2.5045592705167175,
- "grad_norm": 1.4580892324447632,
- "learning_rate": 3.648509595481095e-07,
- "loss": 0.37376853823661804,
- "mean_token_accuracy": 0.868643045425415,
- "num_tokens": 29506128.0,
- "step": 3296
- },
- {
- "epoch": 2.50531914893617,
- "grad_norm": 2.3763513565063477,
- "learning_rate": 3.6376225668342287e-07,
- "loss": 0.3229329586029053,
- "mean_token_accuracy": 0.8802589178085327,
- "num_tokens": 29512500.0,
- "step": 3297
- },
- {
- "epoch": 2.5060790273556233,
- "grad_norm": 1.7995069026947021,
- "learning_rate": 3.626750531126169e-07,
- "loss": 0.2303360551595688,
- "mean_token_accuracy": 0.9212342500686646,
- "num_tokens": 29518867.0,
- "step": 3298
- },
- {
- "epoch": 2.506838905775076,
- "grad_norm": 2.4798812866210938,
- "learning_rate": 3.615893495987335e-07,
- "loss": 0.15825161337852478,
- "mean_token_accuracy": 0.9465295076370239,
- "num_tokens": 29523418.0,
- "step": 3299
- },
- {
- "epoch": 2.5075987841945286,
- "grad_norm": 2.6747193336486816,
- "learning_rate": 3.6050514690376124e-07,
- "loss": 0.3672150671482086,
- "mean_token_accuracy": 0.8869320154190063,
- "num_tokens": 29534685.0,
- "step": 3300
- },
- {
- "epoch": 2.5083586626139818,
- "grad_norm": 1.47441828250885,
- "learning_rate": 3.594224457886336e-07,
- "loss": 0.3551298975944519,
- "mean_token_accuracy": 0.8751654624938965,
- "num_tokens": 29546692.0,
- "step": 3301
- },
- {
- "epoch": 2.509118541033435,
- "grad_norm": 2.2134389877319336,
- "learning_rate": 3.5834124701323414e-07,
- "loss": 0.39865267276763916,
- "mean_token_accuracy": 0.8581235408782959,
- "num_tokens": 29553889.0,
- "step": 3302
- },
- {
- "epoch": 2.5098784194528876,
- "grad_norm": 1.9763301610946655,
- "learning_rate": 3.5726155133638915e-07,
- "loss": 0.29025325179100037,
- "mean_token_accuracy": 0.8915338516235352,
- "num_tokens": 29562429.0,
- "step": 3303
- },
- {
- "epoch": 2.5106382978723403,
- "grad_norm": 2.347961187362671,
- "learning_rate": 3.561833595158698e-07,
- "loss": 0.33726242184638977,
- "mean_token_accuracy": 0.8788525462150574,
- "num_tokens": 29568696.0,
- "step": 3304
- },
- {
- "epoch": 2.5113981762917934,
- "grad_norm": 1.7410497665405273,
- "learning_rate": 3.5510667230839237e-07,
- "loss": 0.3604505956172943,
- "mean_token_accuracy": 0.8745309114456177,
- "num_tokens": 29579020.0,
- "step": 3305
- },
- {
- "epoch": 2.512158054711246,
- "grad_norm": 2.8427274227142334,
- "learning_rate": 3.540314904696196e-07,
- "loss": 0.16700688004493713,
- "mean_token_accuracy": 0.9461087584495544,
- "num_tokens": 29583216.0,
- "step": 3306
- },
- {
- "epoch": 2.512917933130699,
- "grad_norm": 3.4459211826324463,
- "learning_rate": 3.529578147541532e-07,
- "loss": 0.20073774456977844,
- "mean_token_accuracy": 0.9330953359603882,
- "num_tokens": 29586393.0,
- "step": 3307
- },
- {
- "epoch": 2.513677811550152,
- "grad_norm": 1.2530099153518677,
- "learning_rate": 3.518856459155409e-07,
- "loss": 0.3268885016441345,
- "mean_token_accuracy": 0.8808276653289795,
- "num_tokens": 29602387.0,
- "step": 3308
- },
- {
- "epoch": 2.514437689969605,
- "grad_norm": 2.64876389503479,
- "learning_rate": 3.508149847062725e-07,
- "loss": 0.328682541847229,
- "mean_token_accuracy": 0.8907853364944458,
- "num_tokens": 29608298.0,
- "step": 3309
- },
- {
- "epoch": 2.5151975683890577,
- "grad_norm": 2.3505539894104004,
- "learning_rate": 3.4974583187777853e-07,
- "loss": 0.3768400549888611,
- "mean_token_accuracy": 0.8646256327629089,
- "num_tokens": 29615035.0,
- "step": 3310
- },
- {
- "epoch": 2.5159574468085104,
- "grad_norm": 3.298685073852539,
- "learning_rate": 3.4867818818043217e-07,
- "loss": 0.4103941023349762,
- "mean_token_accuracy": 0.8660793304443359,
- "num_tokens": 29619522.0,
- "step": 3311
- },
- {
- "epoch": 2.5167173252279635,
- "grad_norm": 1.8788949251174927,
- "learning_rate": 3.476120543635469e-07,
- "loss": 0.39368999004364014,
- "mean_token_accuracy": 0.861727237701416,
- "num_tokens": 29628297.0,
- "step": 3312
- },
- {
- "epoch": 2.5174772036474167,
- "grad_norm": 1.3355047702789307,
- "learning_rate": 3.4654743117537525e-07,
- "loss": 0.30587559938430786,
- "mean_token_accuracy": 0.8944345116615295,
- "num_tokens": 29643010.0,
- "step": 3313
- },
- {
- "epoch": 2.5182370820668694,
- "grad_norm": 1.6371463537216187,
- "learning_rate": 3.4548431936311275e-07,
- "loss": 0.35551705956459045,
- "mean_token_accuracy": 0.8975727558135986,
- "num_tokens": 29654169.0,
- "step": 3314
- },
- {
- "epoch": 2.518996960486322,
- "grad_norm": 1.8126708269119263,
- "learning_rate": 3.4442271967289083e-07,
- "loss": 0.40501973032951355,
- "mean_token_accuracy": 0.872620701789856,
- "num_tokens": 29665965.0,
- "step": 3315
- },
- {
- "epoch": 2.519756838905775,
- "grad_norm": 2.9103341102600098,
- "learning_rate": 3.433626328497805e-07,
- "loss": 0.21716530621051788,
- "mean_token_accuracy": 0.9180731773376465,
- "num_tokens": 29670529.0,
- "step": 3316
- },
- {
- "epoch": 2.520516717325228,
- "grad_norm": 1.3893235921859741,
- "learning_rate": 3.4230405963779357e-07,
- "loss": 0.2638336420059204,
- "mean_token_accuracy": 0.9039981365203857,
- "num_tokens": 29681585.0,
- "step": 3317
- },
- {
- "epoch": 2.521276595744681,
- "grad_norm": 2.408050298690796,
- "learning_rate": 3.412470007798757e-07,
- "loss": 0.4774054288864136,
- "mean_token_accuracy": 0.835527777671814,
- "num_tokens": 29688642.0,
- "step": 3318
- },
- {
- "epoch": 2.5220364741641337,
- "grad_norm": 2.923038959503174,
- "learning_rate": 3.4019145701791186e-07,
- "loss": 0.24404606223106384,
- "mean_token_accuracy": 0.9276547431945801,
- "num_tokens": 29692516.0,
- "step": 3319
- },
- {
- "epoch": 2.522796352583587,
- "grad_norm": 3.470700740814209,
- "learning_rate": 3.3913742909272353e-07,
- "loss": 0.26732707023620605,
- "mean_token_accuracy": 0.910873293876648,
- "num_tokens": 29695779.0,
- "step": 3320
- },
- {
- "epoch": 2.5235562310030395,
- "grad_norm": 2.2419376373291016,
- "learning_rate": 3.3808491774406817e-07,
- "loss": 0.16050264239311218,
- "mean_token_accuracy": 0.934256911277771,
- "num_tokens": 29701486.0,
- "step": 3321
- },
- {
- "epoch": 2.524316109422492,
- "grad_norm": 2.3232672214508057,
- "learning_rate": 3.370339237106385e-07,
- "loss": 0.23050843179225922,
- "mean_token_accuracy": 0.9202409982681274,
- "num_tokens": 29706780.0,
- "step": 3322
- },
- {
- "epoch": 2.5250759878419453,
- "grad_norm": 3.012422800064087,
- "learning_rate": 3.359844477300633e-07,
- "loss": 0.22087830305099487,
- "mean_token_accuracy": 0.9293035268783569,
- "num_tokens": 29711164.0,
- "step": 3323
- },
- {
- "epoch": 2.5258358662613984,
- "grad_norm": 3.0274150371551514,
- "learning_rate": 3.3493649053890325e-07,
- "loss": 0.1908535212278366,
- "mean_token_accuracy": 0.9202175140380859,
- "num_tokens": 29714988.0,
- "step": 3324
- },
- {
- "epoch": 2.526595744680851,
- "grad_norm": 1.9113285541534424,
- "learning_rate": 3.3389005287265713e-07,
- "loss": 0.3098488748073578,
- "mean_token_accuracy": 0.8901765942573547,
- "num_tokens": 29722665.0,
- "step": 3325
- },
- {
- "epoch": 2.527355623100304,
- "grad_norm": 2.3841238021850586,
- "learning_rate": 3.32845135465755e-07,
- "loss": 0.25352805852890015,
- "mean_token_accuracy": 0.9079523682594299,
- "num_tokens": 29727646.0,
- "step": 3326
- },
- {
- "epoch": 2.528115501519757,
- "grad_norm": 2.134140968322754,
- "learning_rate": 3.3180173905155906e-07,
- "loss": 0.24720364809036255,
- "mean_token_accuracy": 0.9039219617843628,
- "num_tokens": 29734233.0,
- "step": 3327
- },
- {
- "epoch": 2.5288753799392096,
- "grad_norm": 1.9245797395706177,
- "learning_rate": 3.3075986436236494e-07,
- "loss": 0.2697824537754059,
- "mean_token_accuracy": 0.9077266454696655,
- "num_tokens": 29742107.0,
- "step": 3328
- },
- {
- "epoch": 2.5296352583586628,
- "grad_norm": 2.5044164657592773,
- "learning_rate": 3.297195121294022e-07,
- "loss": 0.3145396411418915,
- "mean_token_accuracy": 0.8834670782089233,
- "num_tokens": 29747755.0,
- "step": 3329
- },
- {
- "epoch": 2.5303951367781155,
- "grad_norm": 3.475567102432251,
- "learning_rate": 3.286806830828285e-07,
- "loss": 0.14926226437091827,
- "mean_token_accuracy": 0.9487104415893555,
- "num_tokens": 29750730.0,
- "step": 3330
- },
- {
- "epoch": 2.5311550151975686,
- "grad_norm": 2.0287671089172363,
- "learning_rate": 3.2764337795173433e-07,
- "loss": 0.3795855641365051,
- "mean_token_accuracy": 0.8685719966888428,
- "num_tokens": 29758328.0,
- "step": 3331
- },
- {
- "epoch": 2.5319148936170213,
- "grad_norm": 1.4884649515151978,
- "learning_rate": 3.2660759746414055e-07,
- "loss": 0.3048096299171448,
- "mean_token_accuracy": 0.8908923268318176,
- "num_tokens": 29770486.0,
- "step": 3332
- },
- {
- "epoch": 2.532674772036474,
- "grad_norm": 2.0645828247070312,
- "learning_rate": 3.255733423469978e-07,
- "loss": 0.3477875590324402,
- "mean_token_accuracy": 0.8803027868270874,
- "num_tokens": 29778363.0,
- "step": 3333
- },
- {
- "epoch": 2.533434650455927,
- "grad_norm": 2.032289981842041,
- "learning_rate": 3.245406133261858e-07,
- "loss": 0.39452236890792847,
- "mean_token_accuracy": 0.8499241471290588,
- "num_tokens": 29786353.0,
- "step": 3334
- },
- {
- "epoch": 2.53419452887538,
- "grad_norm": 2.146658420562744,
- "learning_rate": 3.235094111265141e-07,
- "loss": 0.250872939825058,
- "mean_token_accuracy": 0.9086864590644836,
- "num_tokens": 29793122.0,
- "step": 3335
- },
- {
- "epoch": 2.534954407294833,
- "grad_norm": 1.407880187034607,
- "learning_rate": 3.224797364717197e-07,
- "loss": 0.30364125967025757,
- "mean_token_accuracy": 0.875752329826355,
- "num_tokens": 29806866.0,
- "step": 3336
- },
- {
- "epoch": 2.5357142857142856,
- "grad_norm": 2.6231658458709717,
- "learning_rate": 3.214515900844681e-07,
- "loss": 0.31516194343566895,
- "mean_token_accuracy": 0.8799179792404175,
- "num_tokens": 29813035.0,
- "step": 3337
- },
- {
- "epoch": 2.5364741641337387,
- "grad_norm": 2.3876113891601562,
- "learning_rate": 3.204249726863523e-07,
- "loss": 0.3034508526325226,
- "mean_token_accuracy": 0.8916938304901123,
- "num_tokens": 29818810.0,
- "step": 3338
- },
- {
- "epoch": 2.5372340425531914,
- "grad_norm": 2.16711163520813,
- "learning_rate": 3.1939988499789075e-07,
- "loss": 0.25329700112342834,
- "mean_token_accuracy": 0.9260494112968445,
- "num_tokens": 29825472.0,
- "step": 3339
- },
- {
- "epoch": 2.5379939209726445,
- "grad_norm": 2.5136961936950684,
- "learning_rate": 3.18376327738531e-07,
- "loss": 0.3313722312450409,
- "mean_token_accuracy": 0.8868670463562012,
- "num_tokens": 29831426.0,
- "step": 3340
- },
- {
- "epoch": 2.538753799392097,
- "grad_norm": 1.7886340618133545,
- "learning_rate": 3.1735430162664366e-07,
- "loss": 0.3526390492916107,
- "mean_token_accuracy": 0.8689097762107849,
- "num_tokens": 29840212.0,
- "step": 3341
- },
- {
- "epoch": 2.5395136778115504,
- "grad_norm": 2.2471916675567627,
- "learning_rate": 3.1633380737952663e-07,
- "loss": 0.21594303846359253,
- "mean_token_accuracy": 0.9280022382736206,
- "num_tokens": 29845696.0,
- "step": 3342
- },
- {
- "epoch": 2.540273556231003,
- "grad_norm": 1.1835771799087524,
- "learning_rate": 3.15314845713402e-07,
- "loss": 0.2646978497505188,
- "mean_token_accuracy": 0.8992418050765991,
- "num_tokens": 29861802.0,
- "step": 3343
- },
- {
- "epoch": 2.5410334346504557,
- "grad_norm": 2.2009525299072266,
- "learning_rate": 3.14297417343416e-07,
- "loss": 0.4950712323188782,
- "mean_token_accuracy": 0.8226115703582764,
- "num_tokens": 29869931.0,
- "step": 3344
- },
- {
- "epoch": 2.541793313069909,
- "grad_norm": 1.2517180442810059,
- "learning_rate": 3.1328152298363943e-07,
- "loss": 0.26179224252700806,
- "mean_token_accuracy": 0.9045941829681396,
- "num_tokens": 29883562.0,
- "step": 3345
- },
- {
- "epoch": 2.5425531914893615,
- "grad_norm": 2.1705822944641113,
- "learning_rate": 3.122671633470664e-07,
- "loss": 0.38098567724227905,
- "mean_token_accuracy": 0.8638834357261658,
- "num_tokens": 29891094.0,
- "step": 3346
- },
- {
- "epoch": 2.5433130699088147,
- "grad_norm": 1.5869110822677612,
- "learning_rate": 3.1125433914561185e-07,
- "loss": 0.36774593591690063,
- "mean_token_accuracy": 0.8730655908584595,
- "num_tokens": 29901795.0,
- "step": 3347
- },
- {
- "epoch": 2.5440729483282674,
- "grad_norm": 1.267867922782898,
- "learning_rate": 3.1024305109011664e-07,
- "loss": 0.30716824531555176,
- "mean_token_accuracy": 0.8794038891792297,
- "num_tokens": 29918112.0,
- "step": 3348
- },
- {
- "epoch": 2.5448328267477205,
- "grad_norm": 1.7851269245147705,
- "learning_rate": 3.092332998903416e-07,
- "loss": 0.3374805748462677,
- "mean_token_accuracy": 0.8766556978225708,
- "num_tokens": 29927770.0,
- "step": 3349
- },
- {
- "epoch": 2.545592705167173,
- "grad_norm": 1.7153595685958862,
- "learning_rate": 3.082250862549671e-07,
- "loss": 0.4149400293827057,
- "mean_token_accuracy": 0.853299617767334,
- "num_tokens": 29939361.0,
- "step": 3350
- },
- {
- "epoch": 2.5463525835866263,
- "grad_norm": 2.676774740219116,
- "learning_rate": 3.0721841089159823e-07,
- "loss": 0.2004309445619583,
- "mean_token_accuracy": 0.9245458841323853,
- "num_tokens": 29943717.0,
- "step": 3351
- },
- {
- "epoch": 2.547112462006079,
- "grad_norm": 3.0472381114959717,
- "learning_rate": 3.0621327450675806e-07,
- "loss": 0.31185799837112427,
- "mean_token_accuracy": 0.8936638832092285,
- "num_tokens": 29948613.0,
- "step": 3352
- },
- {
- "epoch": 2.547872340425532,
- "grad_norm": 3.141087055206299,
- "learning_rate": 3.0520967780588966e-07,
- "loss": 0.34619835019111633,
- "mean_token_accuracy": 0.8754764199256897,
- "num_tokens": 29952477.0,
- "step": 3353
- },
- {
- "epoch": 2.548632218844985,
- "grad_norm": 1.277807593345642,
- "learning_rate": 3.0420762149335566e-07,
- "loss": 0.41385579109191895,
- "mean_token_accuracy": 0.8646053075790405,
- "num_tokens": 29972620.0,
- "step": 3354
- },
- {
- "epoch": 2.5493920972644375,
- "grad_norm": 1.8656301498413086,
- "learning_rate": 3.0320710627243815e-07,
- "loss": 0.33177047967910767,
- "mean_token_accuracy": 0.884863018989563,
- "num_tokens": 29980861.0,
- "step": 3355
- },
- {
- "epoch": 2.5501519756838906,
- "grad_norm": 1.5590285062789917,
- "learning_rate": 3.022081328453372e-07,
- "loss": 0.35837340354919434,
- "mean_token_accuracy": 0.8669678568840027,
- "num_tokens": 29992920.0,
- "step": 3356
- },
- {
- "epoch": 2.5509118541033433,
- "grad_norm": 1.3580808639526367,
- "learning_rate": 3.0121070191317075e-07,
- "loss": 0.30251336097717285,
- "mean_token_accuracy": 0.891779363155365,
- "num_tokens": 30006416.0,
- "step": 3357
- },
- {
- "epoch": 2.5516717325227964,
- "grad_norm": 1.2978777885437012,
- "learning_rate": 3.002148141759739e-07,
- "loss": 0.3747216463088989,
- "mean_token_accuracy": 0.8675031661987305,
- "num_tokens": 30026730.0,
- "step": 3358
- },
- {
- "epoch": 2.552431610942249,
- "grad_norm": 2.1855390071868896,
- "learning_rate": 2.992204703326995e-07,
- "loss": 0.25247129797935486,
- "mean_token_accuracy": 0.9170730113983154,
- "num_tokens": 30032920.0,
- "step": 3359
- },
- {
- "epoch": 2.5531914893617023,
- "grad_norm": 1.46858811378479,
- "learning_rate": 2.9822767108121623e-07,
- "loss": 0.45840656757354736,
- "mean_token_accuracy": 0.8472789525985718,
- "num_tokens": 30046347.0,
- "step": 3360
- },
- {
- "epoch": 2.553951367781155,
- "grad_norm": 1.7625445127487183,
- "learning_rate": 2.9723641711830896e-07,
- "loss": 0.34696075320243835,
- "mean_token_accuracy": 0.8730940222740173,
- "num_tokens": 30057264.0,
- "step": 3361
- },
- {
- "epoch": 2.5547112462006076,
- "grad_norm": 2.3647844791412354,
- "learning_rate": 2.96246709139677e-07,
- "loss": 0.3888760209083557,
- "mean_token_accuracy": 0.8829300403594971,
- "num_tokens": 30064199.0,
- "step": 3362
- },
- {
- "epoch": 2.5554711246200608,
- "grad_norm": 1.3508832454681396,
- "learning_rate": 2.9525854783993696e-07,
- "loss": 0.2998582720756531,
- "mean_token_accuracy": 0.8910796642303467,
- "num_tokens": 30078083.0,
- "step": 3363
- },
- {
- "epoch": 2.556231003039514,
- "grad_norm": 1.8688349723815918,
- "learning_rate": 2.942719339126171e-07,
- "loss": 0.23044756054878235,
- "mean_token_accuracy": 0.9150751233100891,
- "num_tokens": 30086010.0,
- "step": 3364
- },
- {
- "epoch": 2.5569908814589666,
- "grad_norm": 2.7221083641052246,
- "learning_rate": 2.932868680501613e-07,
- "loss": 0.30724483728408813,
- "mean_token_accuracy": 0.9012277126312256,
- "num_tokens": 30091524.0,
- "step": 3365
- },
- {
- "epoch": 2.5577507598784193,
- "grad_norm": 2.5149598121643066,
- "learning_rate": 2.92303350943928e-07,
- "loss": 0.37096866965293884,
- "mean_token_accuracy": 0.8573155403137207,
- "num_tokens": 30097860.0,
- "step": 3366
- },
- {
- "epoch": 2.5585106382978724,
- "grad_norm": 2.9985098838806152,
- "learning_rate": 2.913213832841857e-07,
- "loss": 0.3397367596626282,
- "mean_token_accuracy": 0.8724661469459534,
- "num_tokens": 30107543.0,
- "step": 3367
- },
- {
- "epoch": 2.559270516717325,
- "grad_norm": 2.119527816772461,
- "learning_rate": 2.9034096576011805e-07,
- "loss": 0.34516414999961853,
- "mean_token_accuracy": 0.8728296756744385,
- "num_tokens": 30114737.0,
- "step": 3368
- },
- {
- "epoch": 2.560030395136778,
- "grad_norm": 2.6809260845184326,
- "learning_rate": 2.893620990598192e-07,
- "loss": 0.4649572968482971,
- "mean_token_accuracy": 0.8441047668457031,
- "num_tokens": 30120640.0,
- "step": 3369
- },
- {
- "epoch": 2.560790273556231,
- "grad_norm": 1.634458065032959,
- "learning_rate": 2.8838478387029605e-07,
- "loss": 0.3435993194580078,
- "mean_token_accuracy": 0.8726693987846375,
- "num_tokens": 30133091.0,
- "step": 3370
- },
- {
- "epoch": 2.561550151975684,
- "grad_norm": 1.7352157831192017,
- "learning_rate": 2.8740902087746604e-07,
- "loss": 0.3171056807041168,
- "mean_token_accuracy": 0.8962107300758362,
- "num_tokens": 30141735.0,
- "step": 3371
- },
- {
- "epoch": 2.5623100303951367,
- "grad_norm": 2.8209640979766846,
- "learning_rate": 2.8643481076615717e-07,
- "loss": 0.24519780278205872,
- "mean_token_accuracy": 0.9098281860351562,
- "num_tokens": 30146073.0,
- "step": 3372
- },
- {
- "epoch": 2.5630699088145894,
- "grad_norm": 2.1111650466918945,
- "learning_rate": 2.854621542201064e-07,
- "loss": 0.34583622217178345,
- "mean_token_accuracy": 0.8917075395584106,
- "num_tokens": 30153104.0,
- "step": 3373
- },
- {
- "epoch": 2.5638297872340425,
- "grad_norm": 1.5275969505310059,
- "learning_rate": 2.844910519219632e-07,
- "loss": 0.33743610978126526,
- "mean_token_accuracy": 0.8789186477661133,
- "num_tokens": 30166414.0,
- "step": 3374
- },
- {
- "epoch": 2.5645896656534957,
- "grad_norm": 3.6885430812835693,
- "learning_rate": 2.835215045532841e-07,
- "loss": 0.3318662643432617,
- "mean_token_accuracy": 0.880516767501831,
- "num_tokens": 30170397.0,
- "step": 3375
- },
- {
- "epoch": 2.5653495440729484,
- "grad_norm": 3.58422589302063,
- "learning_rate": 2.8255351279453446e-07,
- "loss": 0.24304428696632385,
- "mean_token_accuracy": 0.911949634552002,
- "num_tokens": 30173809.0,
- "step": 3376
- },
- {
- "epoch": 2.566109422492401,
- "grad_norm": 2.180278778076172,
- "learning_rate": 2.815870773250873e-07,
- "loss": 0.2282833755016327,
- "mean_token_accuracy": 0.9192917346954346,
- "num_tokens": 30179431.0,
- "step": 3377
- },
- {
- "epoch": 2.566869300911854,
- "grad_norm": 1.925766110420227,
- "learning_rate": 2.8062219882322636e-07,
- "loss": 0.38162487745285034,
- "mean_token_accuracy": 0.8635650873184204,
- "num_tokens": 30194252.0,
- "step": 3378
- },
- {
- "epoch": 2.567629179331307,
- "grad_norm": 1.9528982639312744,
- "learning_rate": 2.796588779661388e-07,
- "loss": 0.3215118646621704,
- "mean_token_accuracy": 0.8850376605987549,
- "num_tokens": 30202341.0,
- "step": 3379
- },
- {
- "epoch": 2.56838905775076,
- "grad_norm": 1.9466958045959473,
- "learning_rate": 2.786971154299209e-07,
- "loss": 0.3743375539779663,
- "mean_token_accuracy": 0.8669804930686951,
- "num_tokens": 30210657.0,
- "step": 3380
- },
- {
- "epoch": 2.5691489361702127,
- "grad_norm": 1.0222121477127075,
- "learning_rate": 2.777369118895745e-07,
- "loss": 0.28801876306533813,
- "mean_token_accuracy": 0.8983622789382935,
- "num_tokens": 30232182.0,
- "step": 3381
- },
- {
- "epoch": 2.569908814589666,
- "grad_norm": 1.5706082582473755,
- "learning_rate": 2.767782680190073e-07,
- "loss": 0.37556713819503784,
- "mean_token_accuracy": 0.8659577369689941,
- "num_tokens": 30244819.0,
- "step": 3382
- },
- {
- "epoch": 2.5706686930091185,
- "grad_norm": 2.5092997550964355,
- "learning_rate": 2.7582118449103273e-07,
- "loss": 0.4440537691116333,
- "mean_token_accuracy": 0.8627067804336548,
- "num_tokens": 30251856.0,
- "step": 3383
- },
- {
- "epoch": 2.571428571428571,
- "grad_norm": 2.2710351943969727,
- "learning_rate": 2.748656619773687e-07,
- "loss": 0.12478743493556976,
- "mean_token_accuracy": 0.9581196904182434,
- "num_tokens": 30255765.0,
- "step": 3384
- },
- {
- "epoch": 2.5721884498480243,
- "grad_norm": 1.5596920251846313,
- "learning_rate": 2.739117011486378e-07,
- "loss": 0.23946957290172577,
- "mean_token_accuracy": 0.9149091243743896,
- "num_tokens": 30265134.0,
- "step": 3385
- },
- {
- "epoch": 2.5729483282674774,
- "grad_norm": 2.5665597915649414,
- "learning_rate": 2.729593026743668e-07,
- "loss": 0.22638919949531555,
- "mean_token_accuracy": 0.9160120487213135,
- "num_tokens": 30269971.0,
- "step": 3386
- },
- {
- "epoch": 2.57370820668693,
- "grad_norm": 2.1374216079711914,
- "learning_rate": 2.7200846722298503e-07,
- "loss": 0.3681026101112366,
- "mean_token_accuracy": 0.8709797263145447,
- "num_tokens": 30277792.0,
- "step": 3387
- },
- {
- "epoch": 2.574468085106383,
- "grad_norm": 1.5955793857574463,
- "learning_rate": 2.710591954618247e-07,
- "loss": 0.3560969829559326,
- "mean_token_accuracy": 0.8950826525688171,
- "num_tokens": 30289038.0,
- "step": 3388
- },
- {
- "epoch": 2.575227963525836,
- "grad_norm": 1.561316967010498,
- "learning_rate": 2.701114880571232e-07,
- "loss": 0.29359546303749084,
- "mean_token_accuracy": 0.9007925987243652,
- "num_tokens": 30298228.0,
- "step": 3389
- },
- {
- "epoch": 2.5759878419452886,
- "grad_norm": 1.7596205472946167,
- "learning_rate": 2.6916534567401675e-07,
- "loss": 0.29790499806404114,
- "mean_token_accuracy": 0.8907828330993652,
- "num_tokens": 30306908.0,
- "step": 3390
- },
- {
- "epoch": 2.5767477203647418,
- "grad_norm": 2.1243667602539062,
- "learning_rate": 2.6822076897654453e-07,
- "loss": 0.26356661319732666,
- "mean_token_accuracy": 0.9012589454650879,
- "num_tokens": 30312971.0,
- "step": 3391
- },
- {
- "epoch": 2.5775075987841944,
- "grad_norm": 2.35373592376709,
- "learning_rate": 2.6727775862764703e-07,
- "loss": 0.3303247094154358,
- "mean_token_accuracy": 0.876170814037323,
- "num_tokens": 30319303.0,
- "step": 3392
- },
- {
- "epoch": 2.5782674772036476,
- "grad_norm": 2.5983684062957764,
- "learning_rate": 2.663363152891654e-07,
- "loss": 0.3094015121459961,
- "mean_token_accuracy": 0.9034996628761292,
- "num_tokens": 30324454.0,
- "step": 3393
- },
- {
- "epoch": 2.5790273556231003,
- "grad_norm": 2.264035940170288,
- "learning_rate": 2.653964396218406e-07,
- "loss": 0.42449623346328735,
- "mean_token_accuracy": 0.8461374044418335,
- "num_tokens": 30331213.0,
- "step": 3394
- },
- {
- "epoch": 2.579787234042553,
- "grad_norm": 1.591833233833313,
- "learning_rate": 2.64458132285314e-07,
- "loss": 0.3518860340118408,
- "mean_token_accuracy": 0.8751099705696106,
- "num_tokens": 30341328.0,
- "step": 3395
- },
- {
- "epoch": 2.580547112462006,
- "grad_norm": 2.4209396839141846,
- "learning_rate": 2.635213939381248e-07,
- "loss": 0.3116898238658905,
- "mean_token_accuracy": 0.9000394344329834,
- "num_tokens": 30346970.0,
- "step": 3396
- },
- {
- "epoch": 2.581306990881459,
- "grad_norm": 2.373574733734131,
- "learning_rate": 2.625862252377129e-07,
- "loss": 0.2558296322822571,
- "mean_token_accuracy": 0.9050976037979126,
- "num_tokens": 30352535.0,
- "step": 3397
- },
- {
- "epoch": 2.582066869300912,
- "grad_norm": 2.3691492080688477,
- "learning_rate": 2.61652626840416e-07,
- "loss": 0.34974297881126404,
- "mean_token_accuracy": 0.880367636680603,
- "num_tokens": 30359008.0,
- "step": 3398
- },
- {
- "epoch": 2.5828267477203646,
- "grad_norm": 2.6194329261779785,
- "learning_rate": 2.6072059940146775e-07,
- "loss": 0.302560031414032,
- "mean_token_accuracy": 0.9090637564659119,
- "num_tokens": 30364545.0,
- "step": 3399
- },
- {
- "epoch": 2.5835866261398177,
- "grad_norm": 1.8017159700393677,
- "learning_rate": 2.597901435750025e-07,
- "loss": 0.2855827212333679,
- "mean_token_accuracy": 0.8930953741073608,
- "num_tokens": 30372593.0,
- "step": 3400
- },
- {
- "epoch": 2.5843465045592704,
- "grad_norm": 1.736401915550232,
- "learning_rate": 2.5886126001405e-07,
- "loss": 0.38662317395210266,
- "mean_token_accuracy": 0.876146674156189,
- "num_tokens": 30382097.0,
- "step": 3401
- },
- {
- "epoch": 2.5851063829787235,
- "grad_norm": 1.174890398979187,
- "learning_rate": 2.579339493705355e-07,
- "loss": 0.351195752620697,
- "mean_token_accuracy": 0.8636453747749329,
- "num_tokens": 30399208.0,
- "step": 3402
- },
- {
- "epoch": 2.585866261398176,
- "grad_norm": 1.9311470985412598,
- "learning_rate": 2.5700821229528164e-07,
- "loss": 0.3222745656967163,
- "mean_token_accuracy": 0.9071283936500549,
- "num_tokens": 30406648.0,
- "step": 3403
- },
- {
- "epoch": 2.5866261398176293,
- "grad_norm": 1.7329829931259155,
- "learning_rate": 2.5608404943800627e-07,
- "loss": 0.25072571635246277,
- "mean_token_accuracy": 0.9056229591369629,
- "num_tokens": 30414836.0,
- "step": 3404
- },
- {
- "epoch": 2.587386018237082,
- "grad_norm": 2.8609302043914795,
- "learning_rate": 2.5516146144732273e-07,
- "loss": 0.23907656967639923,
- "mean_token_accuracy": 0.9093331694602966,
- "num_tokens": 30419027.0,
- "step": 3405
- },
- {
- "epoch": 2.5881458966565347,
- "grad_norm": 2.0544052124023438,
- "learning_rate": 2.5424044897073895e-07,
- "loss": 0.28297221660614014,
- "mean_token_accuracy": 0.8845421075820923,
- "num_tokens": 30426720.0,
- "step": 3406
- },
- {
- "epoch": 2.588905775075988,
- "grad_norm": 2.0454416275024414,
- "learning_rate": 2.533210126546565e-07,
- "loss": 0.40411946177482605,
- "mean_token_accuracy": 0.8890959024429321,
- "num_tokens": 30434413.0,
- "step": 3407
- },
- {
- "epoch": 2.589665653495441,
- "grad_norm": 2.5405404567718506,
- "learning_rate": 2.52403153144371e-07,
- "loss": 0.2860855460166931,
- "mean_token_accuracy": 0.9015365242958069,
- "num_tokens": 30439934.0,
- "step": 3408
- },
- {
- "epoch": 2.5904255319148937,
- "grad_norm": 1.7092106342315674,
- "learning_rate": 2.514868710840723e-07,
- "loss": 0.36490949988365173,
- "mean_token_accuracy": 0.8814249634742737,
- "num_tokens": 30450150.0,
- "step": 3409
- },
- {
- "epoch": 2.5911854103343464,
- "grad_norm": 2.2119903564453125,
- "learning_rate": 2.505721671168426e-07,
- "loss": 0.3019217848777771,
- "mean_token_accuracy": 0.915499210357666,
- "num_tokens": 30456068.0,
- "step": 3410
- },
- {
- "epoch": 2.5919452887537995,
- "grad_norm": 2.1960413455963135,
- "learning_rate": 2.496590418846545e-07,
- "loss": 0.21500837802886963,
- "mean_token_accuracy": 0.9256033897399902,
- "num_tokens": 30461738.0,
- "step": 3411
- },
- {
- "epoch": 2.592705167173252,
- "grad_norm": 2.9918036460876465,
- "learning_rate": 2.48747496028377e-07,
- "loss": 0.3921341001987457,
- "mean_token_accuracy": 0.8748230934143066,
- "num_tokens": 30466747.0,
- "step": 3412
- },
- {
- "epoch": 2.5934650455927053,
- "grad_norm": 3.091017723083496,
- "learning_rate": 2.478375301877664e-07,
- "loss": 0.19394469261169434,
- "mean_token_accuracy": 0.9396419525146484,
- "num_tokens": 30470176.0,
- "step": 3413
- },
- {
- "epoch": 2.594224924012158,
- "grad_norm": 1.3302737474441528,
- "learning_rate": 2.4692914500147185e-07,
- "loss": 0.43362653255462646,
- "mean_token_accuracy": 0.844821572303772,
- "num_tokens": 30486501.0,
- "step": 3414
- },
- {
- "epoch": 2.594984802431611,
- "grad_norm": 1.7620038986206055,
- "learning_rate": 2.460223411070337e-07,
- "loss": 0.2638559937477112,
- "mean_token_accuracy": 0.8977950215339661,
- "num_tokens": 30495182.0,
- "step": 3415
- },
- {
- "epoch": 2.595744680851064,
- "grad_norm": 2.4946224689483643,
- "learning_rate": 2.451171191408813e-07,
- "loss": 0.1953703612089157,
- "mean_token_accuracy": 0.9253969192504883,
- "num_tokens": 30499923.0,
- "step": 3416
- },
- {
- "epoch": 2.5965045592705165,
- "grad_norm": 1.3302149772644043,
- "learning_rate": 2.4421347973833443e-07,
- "loss": 0.32972219586372375,
- "mean_token_accuracy": 0.9001818299293518,
- "num_tokens": 30515685.0,
- "step": 3417
- },
- {
- "epoch": 2.5972644376899696,
- "grad_norm": 1.4542583227157593,
- "learning_rate": 2.4331142353360206e-07,
- "loss": 0.2967185378074646,
- "mean_token_accuracy": 0.9060331583023071,
- "num_tokens": 30527102.0,
- "step": 3418
- },
- {
- "epoch": 2.5980243161094227,
- "grad_norm": 2.951036214828491,
- "learning_rate": 2.424109511597822e-07,
- "loss": 0.2523000240325928,
- "mean_token_accuracy": 0.9064282774925232,
- "num_tokens": 30531309.0,
- "step": 3419
- },
- {
- "epoch": 2.5987841945288754,
- "grad_norm": 2.1855340003967285,
- "learning_rate": 2.4151206324886047e-07,
- "loss": 0.35201045870780945,
- "mean_token_accuracy": 0.8787988424301147,
- "num_tokens": 30538014.0,
- "step": 3420
- },
- {
- "epoch": 2.599544072948328,
- "grad_norm": 2.579791307449341,
- "learning_rate": 2.406147604317119e-07,
- "loss": 0.24575991928577423,
- "mean_token_accuracy": 0.9309802055358887,
- "num_tokens": 30542437.0,
- "step": 3421
- },
- {
- "epoch": 2.6003039513677813,
- "grad_norm": 2.758512496948242,
- "learning_rate": 2.397190433380964e-07,
- "loss": 0.3121788501739502,
- "mean_token_accuracy": 0.8949024677276611,
- "num_tokens": 30547171.0,
- "step": 3422
- },
- {
- "epoch": 2.601063829787234,
- "grad_norm": 1.7849500179290771,
- "learning_rate": 2.388249125966646e-07,
- "loss": 0.3810131251811981,
- "mean_token_accuracy": 0.8799927830696106,
- "num_tokens": 30556368.0,
- "step": 3423
- },
- {
- "epoch": 2.601823708206687,
- "grad_norm": 2.701768636703491,
- "learning_rate": 2.3793236883495164e-07,
- "loss": 0.2190743237733841,
- "mean_token_accuracy": 0.9288224577903748,
- "num_tokens": 30561367.0,
- "step": 3424
- },
- {
- "epoch": 2.6025835866261398,
- "grad_norm": 2.0361149311065674,
- "learning_rate": 2.3704141267937797e-07,
- "loss": 0.37623006105422974,
- "mean_token_accuracy": 0.8677272796630859,
- "num_tokens": 30569589.0,
- "step": 3425
- },
- {
- "epoch": 2.603343465045593,
- "grad_norm": 1.094288945198059,
- "learning_rate": 2.3615204475525096e-07,
- "loss": 0.3885940909385681,
- "mean_token_accuracy": 0.8518509864807129,
- "num_tokens": 30592538.0,
- "step": 3426
- },
- {
- "epoch": 2.6041033434650456,
- "grad_norm": 3.1634905338287354,
- "learning_rate": 2.3526426568676485e-07,
- "loss": 0.14411768317222595,
- "mean_token_accuracy": 0.9483509063720703,
- "num_tokens": 30595768.0,
- "step": 3427
- },
- {
- "epoch": 2.6048632218844983,
- "grad_norm": 1.642171859741211,
- "learning_rate": 2.3437807609699575e-07,
- "loss": 0.28384336829185486,
- "mean_token_accuracy": 0.8940542936325073,
- "num_tokens": 30605361.0,
- "step": 3428
- },
- {
- "epoch": 2.6056231003039514,
- "grad_norm": 1.567029356956482,
- "learning_rate": 2.3349347660790582e-07,
- "loss": 0.373100221157074,
- "mean_token_accuracy": 0.8695693016052246,
- "num_tokens": 30616182.0,
- "step": 3429
- },
- {
- "epoch": 2.6063829787234045,
- "grad_norm": 1.5392675399780273,
- "learning_rate": 2.3261046784034154e-07,
- "loss": 0.4163264036178589,
- "mean_token_accuracy": 0.8596208095550537,
- "num_tokens": 30628601.0,
- "step": 3430
- },
- {
- "epoch": 2.607142857142857,
- "grad_norm": 1.5044162273406982,
- "learning_rate": 2.3172905041403181e-07,
- "loss": 0.3813124895095825,
- "mean_token_accuracy": 0.8577728271484375,
- "num_tokens": 30641924.0,
- "step": 3431
- },
- {
- "epoch": 2.60790273556231,
- "grad_norm": 1.4375652074813843,
- "learning_rate": 2.3084922494758965e-07,
- "loss": 0.33166638016700745,
- "mean_token_accuracy": 0.8825733661651611,
- "num_tokens": 30653849.0,
- "step": 3432
- },
- {
- "epoch": 2.608662613981763,
- "grad_norm": 2.5562593936920166,
- "learning_rate": 2.299709920585108e-07,
- "loss": 0.3969959616661072,
- "mean_token_accuracy": 0.8612505197525024,
- "num_tokens": 30659471.0,
- "step": 3433
- },
- {
- "epoch": 2.6094224924012157,
- "grad_norm": 3.2285826206207275,
- "learning_rate": 2.2909435236317224e-07,
- "loss": 0.24361640214920044,
- "mean_token_accuracy": 0.9103600978851318,
- "num_tokens": 30664129.0,
- "step": 3434
- },
- {
- "epoch": 2.610182370820669,
- "grad_norm": 2.702500343322754,
- "learning_rate": 2.2821930647683427e-07,
- "loss": 0.28006303310394287,
- "mean_token_accuracy": 0.9067277908325195,
- "num_tokens": 30668453.0,
- "step": 3435
- },
- {
- "epoch": 2.6109422492401215,
- "grad_norm": 1.6491931676864624,
- "learning_rate": 2.2734585501363676e-07,
- "loss": 0.38273465633392334,
- "mean_token_accuracy": 0.8603695631027222,
- "num_tokens": 30680159.0,
- "step": 3436
- },
- {
- "epoch": 2.6117021276595747,
- "grad_norm": 2.628532648086548,
- "learning_rate": 2.2647399858660156e-07,
- "loss": 0.28879645466804504,
- "mean_token_accuracy": 0.9014753103256226,
- "num_tokens": 30685706.0,
- "step": 3437
- },
- {
- "epoch": 2.6124620060790273,
- "grad_norm": 1.515868067741394,
- "learning_rate": 2.2560373780763256e-07,
- "loss": 0.3872387707233429,
- "mean_token_accuracy": 0.8627544641494751,
- "num_tokens": 30696942.0,
- "step": 3438
- },
- {
- "epoch": 2.61322188449848,
- "grad_norm": 2.4761857986450195,
- "learning_rate": 2.2473507328751086e-07,
- "loss": 0.3222554624080658,
- "mean_token_accuracy": 0.8839071989059448,
- "num_tokens": 30703089.0,
- "step": 3439
- },
- {
- "epoch": 2.613981762917933,
- "grad_norm": 1.5424152612686157,
- "learning_rate": 2.238680056358991e-07,
- "loss": 0.24553638696670532,
- "mean_token_accuracy": 0.9186095595359802,
- "num_tokens": 30712643.0,
- "step": 3440
- },
- {
- "epoch": 2.6147416413373863,
- "grad_norm": 2.1723358631134033,
- "learning_rate": 2.2300253546133883e-07,
- "loss": 0.22538061439990997,
- "mean_token_accuracy": 0.914456844329834,
- "num_tokens": 30719326.0,
- "step": 3441
- },
- {
- "epoch": 2.615501519756839,
- "grad_norm": 2.0607242584228516,
- "learning_rate": 2.2213866337125022e-07,
- "loss": 0.40517157316207886,
- "mean_token_accuracy": 0.8558610677719116,
- "num_tokens": 30726700.0,
- "step": 3442
- },
- {
- "epoch": 2.6162613981762917,
- "grad_norm": 1.3590739965438843,
- "learning_rate": 2.2127638997193196e-07,
- "loss": 0.3030068874359131,
- "mean_token_accuracy": 0.904723048210144,
- "num_tokens": 30739090.0,
- "step": 3443
- },
- {
- "epoch": 2.617021276595745,
- "grad_norm": 1.3497486114501953,
- "learning_rate": 2.2041571586856104e-07,
- "loss": 0.33204561471939087,
- "mean_token_accuracy": 0.8720648288726807,
- "num_tokens": 30755883.0,
- "step": 3444
- },
- {
- "epoch": 2.6177811550151975,
- "grad_norm": 4.2434515953063965,
- "learning_rate": 2.1955664166519036e-07,
- "loss": 0.16747456789016724,
- "mean_token_accuracy": 0.9386751651763916,
- "num_tokens": 30758472.0,
- "step": 3445
- },
- {
- "epoch": 2.6185410334346506,
- "grad_norm": 2.629639148712158,
- "learning_rate": 2.1869916796475294e-07,
- "loss": 0.3494086265563965,
- "mean_token_accuracy": 0.8652780055999756,
- "num_tokens": 30765014.0,
- "step": 3446
- },
- {
- "epoch": 2.6193009118541033,
- "grad_norm": 1.56986403465271,
- "learning_rate": 2.1784329536905653e-07,
- "loss": 0.288389652967453,
- "mean_token_accuracy": 0.8911552429199219,
- "num_tokens": 30783893.0,
- "step": 3447
- },
- {
- "epoch": 2.6200607902735564,
- "grad_norm": 2.137489080429077,
- "learning_rate": 2.1698902447878478e-07,
- "loss": 0.32554084062576294,
- "mean_token_accuracy": 0.8827146291732788,
- "num_tokens": 30791222.0,
- "step": 3448
- },
- {
- "epoch": 2.620820668693009,
- "grad_norm": 1.7718229293823242,
- "learning_rate": 2.1613635589349756e-07,
- "loss": 0.433074414730072,
- "mean_token_accuracy": 0.8626075983047485,
- "num_tokens": 30800909.0,
- "step": 3449
- },
- {
- "epoch": 2.621580547112462,
- "grad_norm": 1.8075933456420898,
- "learning_rate": 2.1528529021163203e-07,
- "loss": 0.3695775270462036,
- "mean_token_accuracy": 0.8898511528968811,
- "num_tokens": 30809680.0,
- "step": 3450
- },
- {
- "epoch": 2.622340425531915,
- "grad_norm": 2.61863374710083,
- "learning_rate": 2.1443582803049757e-07,
- "loss": 0.3161890506744385,
- "mean_token_accuracy": 0.9073872566223145,
- "num_tokens": 30814582.0,
- "step": 3451
- },
- {
- "epoch": 2.6231003039513676,
- "grad_norm": 1.9178471565246582,
- "learning_rate": 2.1358796994628005e-07,
- "loss": 0.26871830224990845,
- "mean_token_accuracy": 0.9038676023483276,
- "num_tokens": 30822408.0,
- "step": 3452
- },
- {
- "epoch": 2.6238601823708207,
- "grad_norm": 1.4968323707580566,
- "learning_rate": 2.1274171655403852e-07,
- "loss": 0.30813854932785034,
- "mean_token_accuracy": 0.8859157562255859,
- "num_tokens": 30833627.0,
- "step": 3453
- },
- {
- "epoch": 2.6246200607902734,
- "grad_norm": 2.584803581237793,
- "learning_rate": 2.118970684477062e-07,
- "loss": 0.3794214129447937,
- "mean_token_accuracy": 0.8644092679023743,
- "num_tokens": 30839801.0,
- "step": 3454
- },
- {
- "epoch": 2.6253799392097266,
- "grad_norm": 1.4426536560058594,
- "learning_rate": 2.1105402622008996e-07,
- "loss": 0.3904871344566345,
- "mean_token_accuracy": 0.8601649403572083,
- "num_tokens": 30854293.0,
- "step": 3455
- },
- {
- "epoch": 2.6261398176291793,
- "grad_norm": 2.42291522026062,
- "learning_rate": 2.1021259046286907e-07,
- "loss": 0.2830442786216736,
- "mean_token_accuracy": 0.918805718421936,
- "num_tokens": 30859112.0,
- "step": 3456
- },
- {
- "epoch": 2.6268996960486324,
- "grad_norm": 1.4296268224716187,
- "learning_rate": 2.0937276176659553e-07,
- "loss": 0.3172294497489929,
- "mean_token_accuracy": 0.8801482915878296,
- "num_tokens": 30871653.0,
- "step": 3457
- },
- {
- "epoch": 2.627659574468085,
- "grad_norm": 2.1253740787506104,
- "learning_rate": 2.0853454072069402e-07,
- "loss": 0.39093419909477234,
- "mean_token_accuracy": 0.9099202156066895,
- "num_tokens": 30879156.0,
- "step": 3458
- },
- {
- "epoch": 2.628419452887538,
- "grad_norm": 2.829529047012329,
- "learning_rate": 2.0769792791345945e-07,
- "loss": 0.35299739241600037,
- "mean_token_accuracy": 0.8719742894172668,
- "num_tokens": 30883839.0,
- "step": 3459
- },
- {
- "epoch": 2.629179331306991,
- "grad_norm": 1.4410310983657837,
- "learning_rate": 2.068629239320588e-07,
- "loss": 0.3550579845905304,
- "mean_token_accuracy": 0.8580837249755859,
- "num_tokens": 30897204.0,
- "step": 3460
- },
- {
- "epoch": 2.6299392097264436,
- "grad_norm": 1.8980296850204468,
- "learning_rate": 2.0602952936253112e-07,
- "loss": 0.33656907081604004,
- "mean_token_accuracy": 0.9022824764251709,
- "num_tokens": 30905304.0,
- "step": 3461
- },
- {
- "epoch": 2.6306990881458967,
- "grad_norm": 2.6224915981292725,
- "learning_rate": 2.0519774478978404e-07,
- "loss": 0.383074015378952,
- "mean_token_accuracy": 0.8744953870773315,
- "num_tokens": 30911118.0,
- "step": 3462
- },
- {
- "epoch": 2.6314589665653494,
- "grad_norm": 1.8675706386566162,
- "learning_rate": 2.043675707975959e-07,
- "loss": 0.36784154176712036,
- "mean_token_accuracy": 0.8660717606544495,
- "num_tokens": 30919786.0,
- "step": 3463
- },
- {
- "epoch": 2.6322188449848025,
- "grad_norm": 1.7601722478866577,
- "learning_rate": 2.0353900796861503e-07,
- "loss": 0.4188779294490814,
- "mean_token_accuracy": 0.8655462861061096,
- "num_tokens": 30930882.0,
- "step": 3464
- },
- {
- "epoch": 2.632978723404255,
- "grad_norm": 1.760291337966919,
- "learning_rate": 2.027120568843588e-07,
- "loss": 0.31421059370040894,
- "mean_token_accuracy": 0.8876073360443115,
- "num_tokens": 30940524.0,
- "step": 3465
- },
- {
- "epoch": 2.6337386018237083,
- "grad_norm": 2.0120749473571777,
- "learning_rate": 2.0188671812521293e-07,
- "loss": 0.4053173065185547,
- "mean_token_accuracy": 0.855548620223999,
- "num_tokens": 30949577.0,
- "step": 3466
- },
- {
- "epoch": 2.634498480243161,
- "grad_norm": 1.1353741884231567,
- "learning_rate": 2.0106299227043298e-07,
- "loss": 0.24654456973075867,
- "mean_token_accuracy": 0.905929684638977,
- "num_tokens": 30965797.0,
- "step": 3467
- },
- {
- "epoch": 2.6352583586626137,
- "grad_norm": 2.011974811553955,
- "learning_rate": 2.002408798981395e-07,
- "loss": 0.37494587898254395,
- "mean_token_accuracy": 0.8785897493362427,
- "num_tokens": 30974271.0,
- "step": 3468
- },
- {
- "epoch": 2.636018237082067,
- "grad_norm": 1.3929005861282349,
- "learning_rate": 1.9942038158532407e-07,
- "loss": 0.43479201197624207,
- "mean_token_accuracy": 0.8481380939483643,
- "num_tokens": 30992451.0,
- "step": 3469
- },
- {
- "epoch": 2.63677811550152,
- "grad_norm": 2.2714993953704834,
- "learning_rate": 1.9860149790784432e-07,
- "loss": 0.36299505829811096,
- "mean_token_accuracy": 0.8728935718536377,
- "num_tokens": 30999180.0,
- "step": 3470
- },
- {
- "epoch": 2.6375379939209727,
- "grad_norm": 1.722923755645752,
- "learning_rate": 1.977842294404228e-07,
- "loss": 0.2461910843849182,
- "mean_token_accuracy": 0.9146148562431335,
- "num_tokens": 31008617.0,
- "step": 3471
- },
- {
- "epoch": 2.6382978723404253,
- "grad_norm": 1.4508280754089355,
- "learning_rate": 1.9696857675665122e-07,
- "loss": 0.3511884808540344,
- "mean_token_accuracy": 0.869759202003479,
- "num_tokens": 31022158.0,
- "step": 3472
- },
- {
- "epoch": 2.6390577507598785,
- "grad_norm": 2.5803074836730957,
- "learning_rate": 1.9615454042898635e-07,
- "loss": 0.27785009145736694,
- "mean_token_accuracy": 0.9075050354003906,
- "num_tokens": 31027176.0,
- "step": 3473
- },
- {
- "epoch": 2.639817629179331,
- "grad_norm": 3.2428712844848633,
- "learning_rate": 1.95342121028749e-07,
- "loss": 0.30596673488616943,
- "mean_token_accuracy": 0.8934510946273804,
- "num_tokens": 31031140.0,
- "step": 3474
- },
- {
- "epoch": 2.6405775075987843,
- "grad_norm": 1.5055527687072754,
- "learning_rate": 1.9453131912612694e-07,
- "loss": 0.3586134612560272,
- "mean_token_accuracy": 0.87983238697052,
- "num_tokens": 31041878.0,
- "step": 3475
- },
- {
- "epoch": 2.641337386018237,
- "grad_norm": 2.8457231521606445,
- "learning_rate": 1.9372213529017192e-07,
- "loss": 0.314262330532074,
- "mean_token_accuracy": 0.8857930302619934,
- "num_tokens": 31046670.0,
- "step": 3476
- },
- {
- "epoch": 2.64209726443769,
- "grad_norm": 2.661770820617676,
- "learning_rate": 1.9291457008880077e-07,
- "loss": 0.3096502125263214,
- "mean_token_accuracy": 0.9015626907348633,
- "num_tokens": 31052419.0,
- "step": 3477
- },
- {
- "epoch": 2.642857142857143,
- "grad_norm": 1.9692156314849854,
- "learning_rate": 1.9210862408879373e-07,
- "loss": 0.33081287145614624,
- "mean_token_accuracy": 0.8793413639068604,
- "num_tokens": 31060462.0,
- "step": 3478
- },
- {
- "epoch": 2.6436170212765955,
- "grad_norm": 2.454256772994995,
- "learning_rate": 1.9130429785579441e-07,
- "loss": 0.486195832490921,
- "mean_token_accuracy": 0.8472193479537964,
- "num_tokens": 31066537.0,
- "step": 3479
- },
- {
- "epoch": 2.6443768996960486,
- "grad_norm": 3.121835470199585,
- "learning_rate": 1.9050159195431017e-07,
- "loss": 0.28688520193099976,
- "mean_token_accuracy": 0.8916707038879395,
- "num_tokens": 31071061.0,
- "step": 3480
- },
- {
- "epoch": 2.6451367781155017,
- "grad_norm": 2.0197176933288574,
- "learning_rate": 1.8970050694771064e-07,
- "loss": 0.2587219774723053,
- "mean_token_accuracy": 0.9204096794128418,
- "num_tokens": 31077438.0,
- "step": 3481
- },
- {
- "epoch": 2.6458966565349544,
- "grad_norm": 2.305452585220337,
- "learning_rate": 1.8890104339822913e-07,
- "loss": 0.3234187960624695,
- "mean_token_accuracy": 0.8695623874664307,
- "num_tokens": 31084445.0,
- "step": 3482
- },
- {
- "epoch": 2.646656534954407,
- "grad_norm": 2.671178102493286,
- "learning_rate": 1.881032018669579e-07,
- "loss": 0.31658151745796204,
- "mean_token_accuracy": 0.9211946725845337,
- "num_tokens": 31090229.0,
- "step": 3483
- },
- {
- "epoch": 2.6474164133738602,
- "grad_norm": 1.9448342323303223,
- "learning_rate": 1.8730698291385518e-07,
- "loss": 0.4380547106266022,
- "mean_token_accuracy": 0.8881628513336182,
- "num_tokens": 31098328.0,
- "step": 3484
- },
- {
- "epoch": 2.648176291793313,
- "grad_norm": 2.00927734375,
- "learning_rate": 1.8651238709773646e-07,
- "loss": 0.30627715587615967,
- "mean_token_accuracy": 0.9037996530532837,
- "num_tokens": 31106114.0,
- "step": 3485
- },
- {
- "epoch": 2.648936170212766,
- "grad_norm": 1.800561547279358,
- "learning_rate": 1.8571941497627976e-07,
- "loss": 0.3352568745613098,
- "mean_token_accuracy": 0.8773363828659058,
- "num_tokens": 31114962.0,
- "step": 3486
- },
- {
- "epoch": 2.6496960486322187,
- "grad_norm": 1.2112451791763306,
- "learning_rate": 1.8492806710602495e-07,
- "loss": 0.30349305272102356,
- "mean_token_accuracy": 0.8948603272438049,
- "num_tokens": 31131202.0,
- "step": 3487
- },
- {
- "epoch": 2.650455927051672,
- "grad_norm": 1.241676926612854,
- "learning_rate": 1.8413834404236857e-07,
- "loss": 0.33237409591674805,
- "mean_token_accuracy": 0.8674747943878174,
- "num_tokens": 31146087.0,
- "step": 3488
- },
- {
- "epoch": 2.6512158054711246,
- "grad_norm": 1.7932970523834229,
- "learning_rate": 1.8335024633956977e-07,
- "loss": 0.2946045696735382,
- "mean_token_accuracy": 0.9197652339935303,
- "num_tokens": 31153539.0,
- "step": 3489
- },
- {
- "epoch": 2.6519756838905773,
- "grad_norm": 1.4799917936325073,
- "learning_rate": 1.8256377455074526e-07,
- "loss": 0.41131776571273804,
- "mean_token_accuracy": 0.859546422958374,
- "num_tokens": 31165330.0,
- "step": 3490
- },
- {
- "epoch": 2.6527355623100304,
- "grad_norm": 1.196844458580017,
- "learning_rate": 1.8177892922787154e-07,
- "loss": 0.3251150846481323,
- "mean_token_accuracy": 0.8738864660263062,
- "num_tokens": 31182815.0,
- "step": 3491
- },
- {
- "epoch": 2.6534954407294835,
- "grad_norm": 1.954189419746399,
- "learning_rate": 1.809957109217833e-07,
- "loss": 0.31352269649505615,
- "mean_token_accuracy": 0.8898859024047852,
- "num_tokens": 31190907.0,
- "step": 3492
- },
- {
- "epoch": 2.654255319148936,
- "grad_norm": 2.5248095989227295,
- "learning_rate": 1.802141201821736e-07,
- "loss": 0.29824098944664,
- "mean_token_accuracy": 0.9073196053504944,
- "num_tokens": 31196077.0,
- "step": 3493
- },
- {
- "epoch": 2.655015197568389,
- "grad_norm": 2.163174629211426,
- "learning_rate": 1.7943415755759168e-07,
- "loss": 0.3291153311729431,
- "mean_token_accuracy": 0.8850691318511963,
- "num_tokens": 31202843.0,
- "step": 3494
- },
- {
- "epoch": 2.655775075987842,
- "grad_norm": 1.1075550317764282,
- "learning_rate": 1.7865582359544664e-07,
- "loss": 0.3335857093334198,
- "mean_token_accuracy": 0.877744197845459,
- "num_tokens": 31224407.0,
- "step": 3495
- },
- {
- "epoch": 2.6565349544072947,
- "grad_norm": 3.600712299346924,
- "learning_rate": 1.7787911884200314e-07,
- "loss": 0.24402567744255066,
- "mean_token_accuracy": 0.9030617475509644,
- "num_tokens": 31228150.0,
- "step": 3496
- },
- {
- "epoch": 2.657294832826748,
- "grad_norm": 2.5282156467437744,
- "learning_rate": 1.7710404384238156e-07,
- "loss": 0.3065975606441498,
- "mean_token_accuracy": 0.8894387483596802,
- "num_tokens": 31233676.0,
- "step": 3497
- },
- {
- "epoch": 2.6580547112462005,
- "grad_norm": 5.057322025299072,
- "learning_rate": 1.7633059914055976e-07,
- "loss": 0.3121221661567688,
- "mean_token_accuracy": 0.8697853088378906,
- "num_tokens": 31241436.0,
- "step": 3498
- },
- {
- "epoch": 2.6588145896656536,
- "grad_norm": 2.3506245613098145,
- "learning_rate": 1.7555878527937164e-07,
- "loss": 0.3100275993347168,
- "mean_token_accuracy": 0.8860085010528564,
- "num_tokens": 31249589.0,
- "step": 3499
- },
- {
- "epoch": 2.6595744680851063,
- "grad_norm": 1.352675199508667,
- "learning_rate": 1.7478860280050525e-07,
- "loss": 0.3743774890899658,
- "mean_token_accuracy": 0.8581909537315369,
- "num_tokens": 31264177.0,
- "step": 3500
- },
- {
- "epoch": 2.660334346504559,
- "grad_norm": 1.4283853769302368,
- "learning_rate": 1.740200522445043e-07,
- "loss": 0.3012605905532837,
- "mean_token_accuracy": 0.8875954151153564,
- "num_tokens": 31278104.0,
- "step": 3501
- },
- {
- "epoch": 2.661094224924012,
- "grad_norm": 1.2291043996810913,
- "learning_rate": 1.7325313415076705e-07,
- "loss": 0.28256118297576904,
- "mean_token_accuracy": 0.8932200074195862,
- "num_tokens": 31295863.0,
- "step": 3502
- },
- {
- "epoch": 2.6618541033434653,
- "grad_norm": 1.4281202554702759,
- "learning_rate": 1.7248784905754656e-07,
- "loss": 0.17757278680801392,
- "mean_token_accuracy": 0.9204857349395752,
- "num_tokens": 31304203.0,
- "step": 3503
- },
- {
- "epoch": 2.662613981762918,
- "grad_norm": 1.369604229927063,
- "learning_rate": 1.717241975019493e-07,
- "loss": 0.35701876878738403,
- "mean_token_accuracy": 0.8924071192741394,
- "num_tokens": 31317585.0,
- "step": 3504
- },
- {
- "epoch": 2.6633738601823707,
- "grad_norm": 1.8434638977050781,
- "learning_rate": 1.7096218001993514e-07,
- "loss": 0.2783927619457245,
- "mean_token_accuracy": 0.9073910713195801,
- "num_tokens": 31325380.0,
- "step": 3505
- },
- {
- "epoch": 2.664133738601824,
- "grad_norm": 1.946325421333313,
- "learning_rate": 1.702017971463174e-07,
- "loss": 0.2873200476169586,
- "mean_token_accuracy": 0.8956313133239746,
- "num_tokens": 31333366.0,
- "step": 3506
- },
- {
- "epoch": 2.6648936170212765,
- "grad_norm": 2.468369960784912,
- "learning_rate": 1.6944304941476224e-07,
- "loss": 0.2589072287082672,
- "mean_token_accuracy": 0.9237367510795593,
- "num_tokens": 31337721.0,
- "step": 3507
- },
- {
- "epoch": 2.6656534954407296,
- "grad_norm": 1.1283265352249146,
- "learning_rate": 1.686859373577876e-07,
- "loss": 0.3271624445915222,
- "mean_token_accuracy": 0.8839015960693359,
- "num_tokens": 31355493.0,
- "step": 3508
- },
- {
- "epoch": 2.6664133738601823,
- "grad_norm": 1.9863340854644775,
- "learning_rate": 1.679304615067634e-07,
- "loss": 0.24140994250774384,
- "mean_token_accuracy": 0.9161529541015625,
- "num_tokens": 31362707.0,
- "step": 3509
- },
- {
- "epoch": 2.6671732522796354,
- "grad_norm": 1.8522552251815796,
- "learning_rate": 1.671766223919133e-07,
- "loss": 0.3312528133392334,
- "mean_token_accuracy": 0.8730556964874268,
- "num_tokens": 31371077.0,
- "step": 3510
- },
- {
- "epoch": 2.667933130699088,
- "grad_norm": 2.4215502738952637,
- "learning_rate": 1.6642442054230935e-07,
- "loss": 0.3685656189918518,
- "mean_token_accuracy": 0.8850007653236389,
- "num_tokens": 31378208.0,
- "step": 3511
- },
- {
- "epoch": 2.668693009118541,
- "grad_norm": 2.1833741664886475,
- "learning_rate": 1.6567385648587563e-07,
- "loss": 0.34506508708000183,
- "mean_token_accuracy": 0.8798409104347229,
- "num_tokens": 31384364.0,
- "step": 3512
- },
- {
- "epoch": 2.669452887537994,
- "grad_norm": 1.5749074220657349,
- "learning_rate": 1.6492493074938777e-07,
- "loss": 0.426993191242218,
- "mean_token_accuracy": 0.8461192846298218,
- "num_tokens": 31399653.0,
- "step": 3513
- },
- {
- "epoch": 2.670212765957447,
- "grad_norm": 1.782159686088562,
- "learning_rate": 1.6417764385846996e-07,
- "loss": 0.43299031257629395,
- "mean_token_accuracy": 0.8456183075904846,
- "num_tokens": 31410255.0,
- "step": 3514
- },
- {
- "epoch": 2.6709726443768997,
- "grad_norm": 1.3696199655532837,
- "learning_rate": 1.6343199633759715e-07,
- "loss": 0.24636408686637878,
- "mean_token_accuracy": 0.8885586261749268,
- "num_tokens": 31422388.0,
- "step": 3515
- },
- {
- "epoch": 2.6717325227963524,
- "grad_norm": 1.9061282873153687,
- "learning_rate": 1.6268798871009405e-07,
- "loss": 0.4061458706855774,
- "mean_token_accuracy": 0.8875166177749634,
- "num_tokens": 31431610.0,
- "step": 3516
- },
- {
- "epoch": 2.6724924012158056,
- "grad_norm": 1.906085729598999,
- "learning_rate": 1.6194562149813241e-07,
- "loss": 0.4171827435493469,
- "mean_token_accuracy": 0.848915159702301,
- "num_tokens": 31440612.0,
- "step": 3517
- },
- {
- "epoch": 2.6732522796352582,
- "grad_norm": 1.7384947538375854,
- "learning_rate": 1.6120489522273548e-07,
- "loss": 0.38559412956237793,
- "mean_token_accuracy": 0.860315203666687,
- "num_tokens": 31451002.0,
- "step": 3518
- },
- {
- "epoch": 2.6740121580547114,
- "grad_norm": 3.150087356567383,
- "learning_rate": 1.6046581040377317e-07,
- "loss": 0.17975735664367676,
- "mean_token_accuracy": 0.9390251636505127,
- "num_tokens": 31454609.0,
- "step": 3519
- },
- {
- "epoch": 2.674772036474164,
- "grad_norm": 1.9782978296279907,
- "learning_rate": 1.5972836755996286e-07,
- "loss": 0.4016202688217163,
- "mean_token_accuracy": 0.8536617755889893,
- "num_tokens": 31463351.0,
- "step": 3520
- },
- {
- "epoch": 2.675531914893617,
- "grad_norm": 1.459272861480713,
- "learning_rate": 1.589925672088713e-07,
- "loss": 0.32752668857574463,
- "mean_token_accuracy": 0.8932114839553833,
- "num_tokens": 31475029.0,
- "step": 3521
- },
- {
- "epoch": 2.67629179331307,
- "grad_norm": 1.5019307136535645,
- "learning_rate": 1.5825840986691155e-07,
- "loss": 0.47891637682914734,
- "mean_token_accuracy": 0.8196566700935364,
- "num_tokens": 31489340.0,
- "step": 3522
- },
- {
- "epoch": 2.6770516717325226,
- "grad_norm": 1.9832415580749512,
- "learning_rate": 1.5752589604934255e-07,
- "loss": 0.3787233829498291,
- "mean_token_accuracy": 0.8592989444732666,
- "num_tokens": 31498173.0,
- "step": 3523
- },
- {
- "epoch": 2.6778115501519757,
- "grad_norm": 1.6112871170043945,
- "learning_rate": 1.567950262702714e-07,
- "loss": 0.394833505153656,
- "mean_token_accuracy": 0.8762246370315552,
- "num_tokens": 31509701.0,
- "step": 3524
- },
- {
- "epoch": 2.678571428571429,
- "grad_norm": 2.542189598083496,
- "learning_rate": 1.560658010426505e-07,
- "loss": 0.344679057598114,
- "mean_token_accuracy": 0.8738337159156799,
- "num_tokens": 31516174.0,
- "step": 3525
- },
- {
- "epoch": 2.6793313069908815,
- "grad_norm": 1.6784722805023193,
- "learning_rate": 1.5533822087827805e-07,
- "loss": 0.2981395423412323,
- "mean_token_accuracy": 0.9238042831420898,
- "num_tokens": 31526373.0,
- "step": 3526
- },
- {
- "epoch": 2.680091185410334,
- "grad_norm": 2.1711673736572266,
- "learning_rate": 1.54612286287798e-07,
- "loss": 0.32182997465133667,
- "mean_token_accuracy": 0.8804676532745361,
- "num_tokens": 31532221.0,
- "step": 3527
- },
- {
- "epoch": 2.6808510638297873,
- "grad_norm": 2.920492172241211,
- "learning_rate": 1.5388799778069896e-07,
- "loss": 0.42035239934921265,
- "mean_token_accuracy": 0.8616809844970703,
- "num_tokens": 31537349.0,
- "step": 3528
- },
- {
- "epoch": 2.68161094224924,
- "grad_norm": 1.6369318962097168,
- "learning_rate": 1.5316535586531483e-07,
- "loss": 0.3083080053329468,
- "mean_token_accuracy": 0.8857955932617188,
- "num_tokens": 31548063.0,
- "step": 3529
- },
- {
- "epoch": 2.682370820668693,
- "grad_norm": 1.745784044265747,
- "learning_rate": 1.5244436104882327e-07,
- "loss": 0.3295830190181732,
- "mean_token_accuracy": 0.8790948390960693,
- "num_tokens": 31557297.0,
- "step": 3530
- },
- {
- "epoch": 2.683130699088146,
- "grad_norm": 2.933802843093872,
- "learning_rate": 1.5172501383724668e-07,
- "loss": 0.20540538430213928,
- "mean_token_accuracy": 0.9353891611099243,
- "num_tokens": 31561267.0,
- "step": 3531
- },
- {
- "epoch": 2.683890577507599,
- "grad_norm": 1.1792415380477905,
- "learning_rate": 1.5100731473544932e-07,
- "loss": 0.2857414484024048,
- "mean_token_accuracy": 0.8919717073440552,
- "num_tokens": 31577364.0,
- "step": 3532
- },
- {
- "epoch": 2.6846504559270516,
- "grad_norm": 1.5752356052398682,
- "learning_rate": 1.5029126424714186e-07,
- "loss": 0.42933136224746704,
- "mean_token_accuracy": 0.8738011717796326,
- "num_tokens": 31593255.0,
- "step": 3533
- },
- {
- "epoch": 2.6854103343465043,
- "grad_norm": 1.4097353219985962,
- "learning_rate": 1.495768628748745e-07,
- "loss": 0.41403159499168396,
- "mean_token_accuracy": 0.8538030385971069,
- "num_tokens": 31606689.0,
- "step": 3534
- },
- {
- "epoch": 2.6861702127659575,
- "grad_norm": 1.3788182735443115,
- "learning_rate": 1.4886411112004258e-07,
- "loss": 0.3825019299983978,
- "mean_token_accuracy": 0.870381236076355,
- "num_tokens": 31623528.0,
- "step": 3535
- },
- {
- "epoch": 2.6869300911854106,
- "grad_norm": 2.3032004833221436,
- "learning_rate": 1.481530094828823e-07,
- "loss": 0.28886643052101135,
- "mean_token_accuracy": 0.9053950905799866,
- "num_tokens": 31629949.0,
- "step": 3536
- },
- {
- "epoch": 2.6876899696048633,
- "grad_norm": 1.8950154781341553,
- "learning_rate": 1.4744355846247254e-07,
- "loss": 0.3261764645576477,
- "mean_token_accuracy": 0.8882689476013184,
- "num_tokens": 31639482.0,
- "step": 3537
- },
- {
- "epoch": 2.688449848024316,
- "grad_norm": 2.8152518272399902,
- "learning_rate": 1.4673575855673278e-07,
- "loss": 0.19367718696594238,
- "mean_token_accuracy": 0.948776364326477,
- "num_tokens": 31643354.0,
- "step": 3538
- },
- {
- "epoch": 2.689209726443769,
- "grad_norm": 2.1745874881744385,
- "learning_rate": 1.460296102624248e-07,
- "loss": 0.3250897526741028,
- "mean_token_accuracy": 0.8834096193313599,
- "num_tokens": 31651085.0,
- "step": 3539
- },
- {
- "epoch": 2.689969604863222,
- "grad_norm": 2.5239014625549316,
- "learning_rate": 1.4532511407515022e-07,
- "loss": 0.3069056570529938,
- "mean_token_accuracy": 0.8939725160598755,
- "num_tokens": 31656790.0,
- "step": 3540
- },
- {
- "epoch": 2.690729483282675,
- "grad_norm": 2.19575572013855,
- "learning_rate": 1.4462227048935185e-07,
- "loss": 0.38596993684768677,
- "mean_token_accuracy": 0.8545209169387817,
- "num_tokens": 31664577.0,
- "step": 3541
- },
- {
- "epoch": 2.6914893617021276,
- "grad_norm": 2.4618618488311768,
- "learning_rate": 1.439210799983126e-07,
- "loss": 0.43490833044052124,
- "mean_token_accuracy": 0.8452163338661194,
- "num_tokens": 31670328.0,
- "step": 3542
- },
- {
- "epoch": 2.6922492401215807,
- "grad_norm": 1.6371922492980957,
- "learning_rate": 1.4322154309415387e-07,
- "loss": 0.36862409114837646,
- "mean_token_accuracy": 0.8575112819671631,
- "num_tokens": 31680342.0,
- "step": 3543
- },
- {
- "epoch": 2.6930091185410334,
- "grad_norm": 3.311603546142578,
- "learning_rate": 1.425236602678387e-07,
- "loss": 0.3098670542240143,
- "mean_token_accuracy": 0.8895800113677979,
- "num_tokens": 31686819.0,
- "step": 3544
- },
- {
- "epoch": 2.693768996960486,
- "grad_norm": 2.246453285217285,
- "learning_rate": 1.4182743200916839e-07,
- "loss": 0.2145545780658722,
- "mean_token_accuracy": 0.9456803798675537,
- "num_tokens": 31692024.0,
- "step": 3545
- },
- {
- "epoch": 2.6945288753799392,
- "grad_norm": 2.962627410888672,
- "learning_rate": 1.4113285880678145e-07,
- "loss": 0.22648683190345764,
- "mean_token_accuracy": 0.9368027448654175,
- "num_tokens": 31696292.0,
- "step": 3546
- },
- {
- "epoch": 2.6952887537993924,
- "grad_norm": 2.3828611373901367,
- "learning_rate": 1.4043994114815663e-07,
- "loss": 0.28031831979751587,
- "mean_token_accuracy": 0.8995643854141235,
- "num_tokens": 31701896.0,
- "step": 3547
- },
- {
- "epoch": 2.696048632218845,
- "grad_norm": 2.749218463897705,
- "learning_rate": 1.3974867951961097e-07,
- "loss": 0.31309080123901367,
- "mean_token_accuracy": 0.8827601671218872,
- "num_tokens": 31707434.0,
- "step": 3548
- },
- {
- "epoch": 2.6968085106382977,
- "grad_norm": 1.5682415962219238,
- "learning_rate": 1.3905907440629752e-07,
- "loss": 0.2794681191444397,
- "mean_token_accuracy": 0.9000695943832397,
- "num_tokens": 31718923.0,
- "step": 3549
- },
- {
- "epoch": 2.697568389057751,
- "grad_norm": 2.2193145751953125,
- "learning_rate": 1.38371126292208e-07,
- "loss": 0.31643980741500854,
- "mean_token_accuracy": 0.8916857242584229,
- "num_tokens": 31724566.0,
- "step": 3550
- },
- {
- "epoch": 2.6983282674772036,
- "grad_norm": 2.14003324508667,
- "learning_rate": 1.3768483566017093e-07,
- "loss": 0.3225042521953583,
- "mean_token_accuracy": 0.8810629844665527,
- "num_tokens": 31731363.0,
- "step": 3551
- },
- {
- "epoch": 2.6990881458966567,
- "grad_norm": 2.594632863998413,
- "learning_rate": 1.3700020299185156e-07,
- "loss": 0.28227928280830383,
- "mean_token_accuracy": 0.8986451625823975,
- "num_tokens": 31736574.0,
- "step": 3552
- },
- {
- "epoch": 2.6998480243161094,
- "grad_norm": 1.8695379495620728,
- "learning_rate": 1.3631722876775137e-07,
- "loss": 0.46631208062171936,
- "mean_token_accuracy": 0.8425353765487671,
- "num_tokens": 31746568.0,
- "step": 3553
- },
- {
- "epoch": 2.7006079027355625,
- "grad_norm": 2.1246798038482666,
- "learning_rate": 1.3563591346720806e-07,
- "loss": 0.3978712260723114,
- "mean_token_accuracy": 0.85677170753479,
- "num_tokens": 31755499.0,
- "step": 3554
- },
- {
- "epoch": 2.701367781155015,
- "grad_norm": 1.9348199367523193,
- "learning_rate": 1.3495625756839464e-07,
- "loss": 0.4381856620311737,
- "mean_token_accuracy": 0.8389089107513428,
- "num_tokens": 31765267.0,
- "step": 3555
- },
- {
- "epoch": 2.702127659574468,
- "grad_norm": 3.3802061080932617,
- "learning_rate": 1.342782615483204e-07,
- "loss": 0.2558897137641907,
- "mean_token_accuracy": 0.9038383960723877,
- "num_tokens": 31769169.0,
- "step": 3556
- },
- {
- "epoch": 2.702887537993921,
- "grad_norm": 1.8666874170303345,
- "learning_rate": 1.3360192588282832e-07,
- "loss": 0.3420698642730713,
- "mean_token_accuracy": 0.8731567859649658,
- "num_tokens": 31778500.0,
- "step": 3557
- },
- {
- "epoch": 2.7036474164133737,
- "grad_norm": 2.2502217292785645,
- "learning_rate": 1.3292725104659676e-07,
- "loss": 0.33352571725845337,
- "mean_token_accuracy": 0.889266848564148,
- "num_tokens": 31786245.0,
- "step": 3558
- },
- {
- "epoch": 2.704407294832827,
- "grad_norm": 1.7217984199523926,
- "learning_rate": 1.3225423751313942e-07,
- "loss": 0.3671357035636902,
- "mean_token_accuracy": 0.8806703686714172,
- "num_tokens": 31796242.0,
- "step": 3559
- },
- {
- "epoch": 2.7051671732522795,
- "grad_norm": 2.5113964080810547,
- "learning_rate": 1.315828857548024e-07,
- "loss": 0.24104978144168854,
- "mean_token_accuracy": 0.9279846549034119,
- "num_tokens": 31801005.0,
- "step": 3560
- },
- {
- "epoch": 2.7059270516717326,
- "grad_norm": 2.0345516204833984,
- "learning_rate": 1.309131962427662e-07,
- "loss": 0.3277859687805176,
- "mean_token_accuracy": 0.8744111061096191,
- "num_tokens": 31810184.0,
- "step": 3561
- },
- {
- "epoch": 2.7066869300911853,
- "grad_norm": 1.2103748321533203,
- "learning_rate": 1.3024516944704495e-07,
- "loss": 0.34378400444984436,
- "mean_token_accuracy": 0.8734696507453918,
- "num_tokens": 31830255.0,
- "step": 3562
- },
- {
- "epoch": 2.7074468085106385,
- "grad_norm": 2.3213655948638916,
- "learning_rate": 1.2957880583648525e-07,
- "loss": 0.38547977805137634,
- "mean_token_accuracy": 0.8699804544448853,
- "num_tokens": 31836624.0,
- "step": 3563
- },
- {
- "epoch": 2.708206686930091,
- "grad_norm": 1.3899281024932861,
- "learning_rate": 1.2891410587876714e-07,
- "loss": 0.38521939516067505,
- "mean_token_accuracy": 0.8629069924354553,
- "num_tokens": 31851201.0,
- "step": 3564
- },
- {
- "epoch": 2.7089665653495443,
- "grad_norm": 1.9310930967330933,
- "learning_rate": 1.2825107004040272e-07,
- "loss": 0.26716265082359314,
- "mean_token_accuracy": 0.9009085893630981,
- "num_tokens": 31858683.0,
- "step": 3565
- },
- {
- "epoch": 2.709726443768997,
- "grad_norm": 2.839961290359497,
- "learning_rate": 1.2758969878673504e-07,
- "loss": 0.3741273880004883,
- "mean_token_accuracy": 0.8934653997421265,
- "num_tokens": 31864354.0,
- "step": 3566
- },
- {
- "epoch": 2.7104863221884496,
- "grad_norm": 1.374866247177124,
- "learning_rate": 1.269299925819409e-07,
- "loss": 0.43979907035827637,
- "mean_token_accuracy": 0.8200695514678955,
- "num_tokens": 31879875.0,
- "step": 3567
- },
- {
- "epoch": 2.711246200607903,
- "grad_norm": 1.149755597114563,
- "learning_rate": 1.262719518890279e-07,
- "loss": 0.375344842672348,
- "mean_token_accuracy": 0.8663579225540161,
- "num_tokens": 31902014.0,
- "step": 3568
- },
- {
- "epoch": 2.7120060790273555,
- "grad_norm": 1.5612202882766724,
- "learning_rate": 1.2561557716983308e-07,
- "loss": 0.3224652409553528,
- "mean_token_accuracy": 0.8762812614440918,
- "num_tokens": 31913496.0,
- "step": 3569
- },
- {
- "epoch": 2.7127659574468086,
- "grad_norm": 2.291853666305542,
- "learning_rate": 1.2496086888502595e-07,
- "loss": 0.299552321434021,
- "mean_token_accuracy": 0.8792698383331299,
- "num_tokens": 31919505.0,
- "step": 3570
- },
- {
- "epoch": 2.7135258358662613,
- "grad_norm": 2.799447536468506,
- "learning_rate": 1.2430782749410676e-07,
- "loss": 0.16546699404716492,
- "mean_token_accuracy": 0.943824052810669,
- "num_tokens": 31923154.0,
- "step": 3571
- },
- {
- "epoch": 2.7142857142857144,
- "grad_norm": 1.4593926668167114,
- "learning_rate": 1.2365645345540383e-07,
- "loss": 0.35158461332321167,
- "mean_token_accuracy": 0.8825424909591675,
- "num_tokens": 31936316.0,
- "step": 3572
- },
- {
- "epoch": 2.715045592705167,
- "grad_norm": 1.3870587348937988,
- "learning_rate": 1.2300674722607735e-07,
- "loss": 0.25250178575515747,
- "mean_token_accuracy": 0.900173544883728,
- "num_tokens": 31948979.0,
- "step": 3573
- },
- {
- "epoch": 2.71580547112462,
- "grad_norm": 1.8494576215744019,
- "learning_rate": 1.223587092621162e-07,
- "loss": 0.36176151037216187,
- "mean_token_accuracy": 0.8696292638778687,
- "num_tokens": 31957512.0,
- "step": 3574
- },
- {
- "epoch": 2.716565349544073,
- "grad_norm": 2.2320656776428223,
- "learning_rate": 1.2171234001833788e-07,
- "loss": 0.3317434787750244,
- "mean_token_accuracy": 0.8897237777709961,
- "num_tokens": 31964788.0,
- "step": 3575
- },
- {
- "epoch": 2.717325227963526,
- "grad_norm": 2.424726963043213,
- "learning_rate": 1.2106763994838954e-07,
- "loss": 0.2880811095237732,
- "mean_token_accuracy": 0.8983594179153442,
- "num_tokens": 31970888.0,
- "step": 3576
- },
- {
- "epoch": 2.7180851063829787,
- "grad_norm": 1.7122806310653687,
- "learning_rate": 1.204246095047465e-07,
- "loss": 0.4846091568470001,
- "mean_token_accuracy": 0.8358923196792603,
- "num_tokens": 31981891.0,
- "step": 3577
- },
- {
- "epoch": 2.7188449848024314,
- "grad_norm": 2.3445510864257812,
- "learning_rate": 1.1978324913871214e-07,
- "loss": 0.28702512383461,
- "mean_token_accuracy": 0.8942852020263672,
- "num_tokens": 31987375.0,
- "step": 3578
- },
- {
- "epoch": 2.7196048632218845,
- "grad_norm": 2.418414831161499,
- "learning_rate": 1.1914355930041838e-07,
- "loss": 0.27506208419799805,
- "mean_token_accuracy": 0.9329943656921387,
- "num_tokens": 31992517.0,
- "step": 3579
- },
- {
- "epoch": 2.7203647416413372,
- "grad_norm": 2.363285541534424,
- "learning_rate": 1.1850554043882329e-07,
- "loss": 0.32415682077407837,
- "mean_token_accuracy": 0.9004105925559998,
- "num_tokens": 31998223.0,
- "step": 3580
- },
- {
- "epoch": 2.7211246200607904,
- "grad_norm": 1.5022046566009521,
- "learning_rate": 1.178691930017134e-07,
- "loss": 0.2446850836277008,
- "mean_token_accuracy": 0.9055813550949097,
- "num_tokens": 32008038.0,
- "step": 3581
- },
- {
- "epoch": 2.721884498480243,
- "grad_norm": 1.7016842365264893,
- "learning_rate": 1.172345174357023e-07,
- "loss": 0.356515109539032,
- "mean_token_accuracy": 0.876318097114563,
- "num_tokens": 32018738.0,
- "step": 3582
- },
- {
- "epoch": 2.722644376899696,
- "grad_norm": 2.113873243331909,
- "learning_rate": 1.1660151418622923e-07,
- "loss": 0.24748530983924866,
- "mean_token_accuracy": 0.9214030504226685,
- "num_tokens": 32025225.0,
- "step": 3583
- },
- {
- "epoch": 2.723404255319149,
- "grad_norm": 1.6737921237945557,
- "learning_rate": 1.159701836975602e-07,
- "loss": 0.30180150270462036,
- "mean_token_accuracy": 0.9211363792419434,
- "num_tokens": 32034579.0,
- "step": 3584
- },
- {
- "epoch": 2.7241641337386016,
- "grad_norm": 1.4193580150604248,
- "learning_rate": 1.153405264127877e-07,
- "loss": 0.2939320504665375,
- "mean_token_accuracy": 0.9005526304244995,
- "num_tokens": 32046461.0,
- "step": 3585
- },
- {
- "epoch": 2.7249240121580547,
- "grad_norm": 2.273599863052368,
- "learning_rate": 1.1471254277382882e-07,
- "loss": 0.3552356958389282,
- "mean_token_accuracy": 0.8682018518447876,
- "num_tokens": 32056210.0,
- "step": 3586
- },
- {
- "epoch": 2.725683890577508,
- "grad_norm": 2.242373466491699,
- "learning_rate": 1.1408623322142736e-07,
- "loss": 0.37924283742904663,
- "mean_token_accuracy": 0.8833099603652954,
- "num_tokens": 32063545.0,
- "step": 3587
- },
- {
- "epoch": 2.7264437689969605,
- "grad_norm": 2.039243459701538,
- "learning_rate": 1.134615981951509e-07,
- "loss": 0.29171228408813477,
- "mean_token_accuracy": 0.8961814641952515,
- "num_tokens": 32070870.0,
- "step": 3588
- },
- {
- "epoch": 2.727203647416413,
- "grad_norm": 1.8081161975860596,
- "learning_rate": 1.1283863813339263e-07,
- "loss": 0.34568479657173157,
- "mean_token_accuracy": 0.9093149900436401,
- "num_tokens": 32078829.0,
- "step": 3589
- },
- {
- "epoch": 2.7279635258358663,
- "grad_norm": 2.301534414291382,
- "learning_rate": 1.1221735347336976e-07,
- "loss": 0.32527366280555725,
- "mean_token_accuracy": 0.8894226551055908,
- "num_tokens": 32084533.0,
- "step": 3590
- },
- {
- "epoch": 2.728723404255319,
- "grad_norm": 1.9389806985855103,
- "learning_rate": 1.1159774465112433e-07,
- "loss": 0.39770618081092834,
- "mean_token_accuracy": 0.8613806962966919,
- "num_tokens": 32092713.0,
- "step": 3591
- },
- {
- "epoch": 2.729483282674772,
- "grad_norm": 1.6589549779891968,
- "learning_rate": 1.1097981210152042e-07,
- "loss": 0.3170590400695801,
- "mean_token_accuracy": 0.8901652097702026,
- "num_tokens": 32102904.0,
- "step": 3592
- },
- {
- "epoch": 2.730243161094225,
- "grad_norm": 1.8090909719467163,
- "learning_rate": 1.1036355625824808e-07,
- "loss": 0.274291455745697,
- "mean_token_accuracy": 0.9074428081512451,
- "num_tokens": 32111009.0,
- "step": 3593
- },
- {
- "epoch": 2.731003039513678,
- "grad_norm": 2.431757688522339,
- "learning_rate": 1.0974897755381936e-07,
- "loss": 0.30703026056289673,
- "mean_token_accuracy": 0.9109988808631897,
- "num_tokens": 32116173.0,
- "step": 3594
- },
- {
- "epoch": 2.7317629179331306,
- "grad_norm": 1.8828567266464233,
- "learning_rate": 1.0913607641956842e-07,
- "loss": 0.34009286761283875,
- "mean_token_accuracy": 0.8761146068572998,
- "num_tokens": 32124273.0,
- "step": 3595
- },
- {
- "epoch": 2.7325227963525833,
- "grad_norm": 1.2194745540618896,
- "learning_rate": 1.0852485328565337e-07,
- "loss": 0.2432229220867157,
- "mean_token_accuracy": 0.8984386920928955,
- "num_tokens": 32137430.0,
- "step": 3596
- },
- {
- "epoch": 2.7332826747720365,
- "grad_norm": 2.3038880825042725,
- "learning_rate": 1.0791530858105387e-07,
- "loss": 0.2546696066856384,
- "mean_token_accuracy": 0.9092214107513428,
- "num_tokens": 32145207.0,
- "step": 3597
- },
- {
- "epoch": 2.7340425531914896,
- "grad_norm": 2.807394504547119,
- "learning_rate": 1.0730744273357213e-07,
- "loss": 0.33576664328575134,
- "mean_token_accuracy": 0.8793773651123047,
- "num_tokens": 32150161.0,
- "step": 3598
- },
- {
- "epoch": 2.7348024316109423,
- "grad_norm": 1.8207601308822632,
- "learning_rate": 1.067012561698319e-07,
- "loss": 0.43848833441734314,
- "mean_token_accuracy": 0.8729845285415649,
- "num_tokens": 32160180.0,
- "step": 3599
- },
- {
- "epoch": 2.735562310030395,
- "grad_norm": 1.5954468250274658,
- "learning_rate": 1.0609674931527786e-07,
- "loss": 0.3471013307571411,
- "mean_token_accuracy": 0.889906644821167,
- "num_tokens": 32172442.0,
- "step": 3600
- },
- {
- "epoch": 2.736322188449848,
- "grad_norm": 1.2474297285079956,
- "learning_rate": 1.0549392259417646e-07,
- "loss": 0.2967996299266815,
- "mean_token_accuracy": 0.887985110282898,
- "num_tokens": 32187624.0,
- "step": 3601
- },
- {
- "epoch": 2.737082066869301,
- "grad_norm": 1.4285695552825928,
- "learning_rate": 1.0489277642961481e-07,
- "loss": 0.2793816924095154,
- "mean_token_accuracy": 0.8948850631713867,
- "num_tokens": 32199904.0,
- "step": 3602
- },
- {
- "epoch": 2.737841945288754,
- "grad_norm": 1.4096852540969849,
- "learning_rate": 1.0429331124350045e-07,
- "loss": 0.39516502618789673,
- "mean_token_accuracy": 0.8942514657974243,
- "num_tokens": 32213145.0,
- "step": 3603
- },
- {
- "epoch": 2.7386018237082066,
- "grad_norm": 1.4818166494369507,
- "learning_rate": 1.0369552745656014e-07,
- "loss": 0.3851013779640198,
- "mean_token_accuracy": 0.8604148626327515,
- "num_tokens": 32225576.0,
- "step": 3604
- },
- {
- "epoch": 2.7393617021276597,
- "grad_norm": 2.0186386108398438,
- "learning_rate": 1.0309942548834329e-07,
- "loss": 0.2715086340904236,
- "mean_token_accuracy": 0.9169677495956421,
- "num_tokens": 32232808.0,
- "step": 3605
- },
- {
- "epoch": 2.7401215805471124,
- "grad_norm": 2.3498101234436035,
- "learning_rate": 1.0250500575721578e-07,
- "loss": 0.2616893947124481,
- "mean_token_accuracy": 0.9052878618240356,
- "num_tokens": 32239209.0,
- "step": 3606
- },
- {
- "epoch": 2.740881458966565,
- "grad_norm": 2.3760416507720947,
- "learning_rate": 1.0191226868036419e-07,
- "loss": 0.3654823303222656,
- "mean_token_accuracy": 0.9066962003707886,
- "num_tokens": 32245690.0,
- "step": 3607
- },
- {
- "epoch": 2.7416413373860182,
- "grad_norm": 1.9187121391296387,
- "learning_rate": 1.0132121467379574e-07,
- "loss": 0.2764931321144104,
- "mean_token_accuracy": 0.9288564920425415,
- "num_tokens": 32252804.0,
- "step": 3608
- },
- {
- "epoch": 2.7424012158054714,
- "grad_norm": 2.57564115524292,
- "learning_rate": 1.0073184415233334e-07,
- "loss": 0.2813187837600708,
- "mean_token_accuracy": 0.890303909778595,
- "num_tokens": 32258534.0,
- "step": 3609
- },
- {
- "epoch": 2.743161094224924,
- "grad_norm": 2.0758004188537598,
- "learning_rate": 1.0014415752962081e-07,
- "loss": 0.29847270250320435,
- "mean_token_accuracy": 0.8947038054466248,
- "num_tokens": 32265373.0,
- "step": 3610
- },
- {
- "epoch": 2.7439209726443767,
- "grad_norm": 3.005535840988159,
- "learning_rate": 9.955815521811852e-08,
- "loss": 0.2781291604042053,
- "mean_token_accuracy": 0.899482250213623,
- "num_tokens": 32269487.0,
- "step": 3611
- },
- {
- "epoch": 2.74468085106383,
- "grad_norm": 2.131834030151367,
- "learning_rate": 9.897383762910606e-08,
- "loss": 0.2915271520614624,
- "mean_token_accuracy": 0.8984331488609314,
- "num_tokens": 32276242.0,
- "step": 3612
- },
- {
- "epoch": 2.7454407294832825,
- "grad_norm": 2.048445463180542,
- "learning_rate": 9.839120517267986e-08,
- "loss": 0.38389909267425537,
- "mean_token_accuracy": 0.8720065951347351,
- "num_tokens": 32284956.0,
- "step": 3613
- },
- {
- "epoch": 2.7462006079027357,
- "grad_norm": 3.3529200553894043,
- "learning_rate": 9.781025825775392e-08,
- "loss": 0.29694801568984985,
- "mean_token_accuracy": 0.8991866111755371,
- "num_tokens": 32289109.0,
- "step": 3614
- },
- {
- "epoch": 2.7469604863221884,
- "grad_norm": 2.5099470615386963,
- "learning_rate": 9.72309972920582e-08,
- "loss": 0.2015802264213562,
- "mean_token_accuracy": 0.9364612102508545,
- "num_tokens": 32294163.0,
- "step": 3615
- },
- {
- "epoch": 2.7477203647416415,
- "grad_norm": 1.7144349813461304,
- "learning_rate": 9.665342268214167e-08,
- "loss": 0.42185109853744507,
- "mean_token_accuracy": 0.8469204902648926,
- "num_tokens": 32304034.0,
- "step": 3616
- },
- {
- "epoch": 2.748480243161094,
- "grad_norm": 1.7306944131851196,
- "learning_rate": 9.607753483336812e-08,
- "loss": 0.294491708278656,
- "mean_token_accuracy": 0.8831486701965332,
- "num_tokens": 32314079.0,
- "step": 3617
- },
- {
- "epoch": 2.749240121580547,
- "grad_norm": 1.5339795351028442,
- "learning_rate": 9.55033341499173e-08,
- "loss": 0.4163019359111786,
- "mean_token_accuracy": 0.8496603965759277,
- "num_tokens": 32325707.0,
- "step": 3618
- },
- {
- "epoch": 2.75,
- "grad_norm": 1.878015398979187,
- "learning_rate": 9.493082103478519e-08,
- "loss": 0.2632361650466919,
- "mean_token_accuracy": 0.8944116830825806,
- "num_tokens": 32333710.0,
- "step": 3619
- },
- {
- "epoch": 2.750759878419453,
- "grad_norm": 1.771299958229065,
- "learning_rate": 9.43599958897845e-08,
- "loss": 0.3327634334564209,
- "mean_token_accuracy": 0.8778671026229858,
- "num_tokens": 32343311.0,
- "step": 3620
- },
- {
- "epoch": 2.751519756838906,
- "grad_norm": 1.358282208442688,
- "learning_rate": 9.379085911554148e-08,
- "loss": 0.3822714686393738,
- "mean_token_accuracy": 0.8586339354515076,
- "num_tokens": 32361435.0,
- "step": 3621
- },
- {
- "epoch": 2.7522796352583585,
- "grad_norm": 1.9158512353897095,
- "learning_rate": 9.322341111149852e-08,
- "loss": 0.23024609684944153,
- "mean_token_accuracy": 0.9222040176391602,
- "num_tokens": 32368371.0,
- "step": 3622
- },
- {
- "epoch": 2.7530395136778116,
- "grad_norm": 1.247992753982544,
- "learning_rate": 9.265765227591261e-08,
- "loss": 0.3436150550842285,
- "mean_token_accuracy": 0.8803039789199829,
- "num_tokens": 32388723.0,
- "step": 3623
- },
- {
- "epoch": 2.7537993920972643,
- "grad_norm": 1.913124918937683,
- "learning_rate": 9.209358300585474e-08,
- "loss": 0.35059863328933716,
- "mean_token_accuracy": 0.875072717666626,
- "num_tokens": 32397011.0,
- "step": 3624
- },
- {
- "epoch": 2.7545592705167175,
- "grad_norm": 2.487434148788452,
- "learning_rate": 9.153120369721047e-08,
- "loss": 0.2234063446521759,
- "mean_token_accuracy": 0.904019832611084,
- "num_tokens": 32402316.0,
- "step": 3625
- },
- {
- "epoch": 2.75531914893617,
- "grad_norm": 2.188255548477173,
- "learning_rate": 9.09705147446796e-08,
- "loss": 0.19389624893665314,
- "mean_token_accuracy": 0.9302033185958862,
- "num_tokens": 32408031.0,
- "step": 3626
- },
- {
- "epoch": 2.7560790273556233,
- "grad_norm": 2.892735004425049,
- "learning_rate": 9.041151654177488e-08,
- "loss": 0.24316613376140594,
- "mean_token_accuracy": 0.9222840070724487,
- "num_tokens": 32412498.0,
- "step": 3627
- },
- {
- "epoch": 2.756838905775076,
- "grad_norm": 2.6814024448394775,
- "learning_rate": 8.985420948082329e-08,
- "loss": 0.2725716233253479,
- "mean_token_accuracy": 0.9069510698318481,
- "num_tokens": 32417717.0,
- "step": 3628
- },
- {
- "epoch": 2.7575987841945286,
- "grad_norm": 2.8956947326660156,
- "learning_rate": 8.929859395296365e-08,
- "loss": 0.3466540575027466,
- "mean_token_accuracy": 0.8771743774414062,
- "num_tokens": 32422425.0,
- "step": 3629
- },
- {
- "epoch": 2.7583586626139818,
- "grad_norm": 2.393306016921997,
- "learning_rate": 8.874467034814816e-08,
- "loss": 0.40261518955230713,
- "mean_token_accuracy": 0.8902627229690552,
- "num_tokens": 32428512.0,
- "step": 3630
- },
- {
- "epoch": 2.759118541033435,
- "grad_norm": 2.201388359069824,
- "learning_rate": 8.819243905514308e-08,
- "loss": 0.28923481702804565,
- "mean_token_accuracy": 0.8948091268539429,
- "num_tokens": 32434316.0,
- "step": 3631
- },
- {
- "epoch": 2.7598784194528876,
- "grad_norm": 1.9007173776626587,
- "learning_rate": 8.764190046152421e-08,
- "loss": 0.3775410056114197,
- "mean_token_accuracy": 0.8737541437149048,
- "num_tokens": 32442785.0,
- "step": 3632
- },
- {
- "epoch": 2.7606382978723403,
- "grad_norm": 1.0914241075515747,
- "learning_rate": 8.709305495368137e-08,
- "loss": 0.27528852224349976,
- "mean_token_accuracy": 0.8981513977050781,
- "num_tokens": 32462749.0,
- "step": 3633
- },
- {
- "epoch": 2.7613981762917934,
- "grad_norm": 2.024019718170166,
- "learning_rate": 8.654590291681531e-08,
- "loss": 0.3178071677684784,
- "mean_token_accuracy": 0.8825376033782959,
- "num_tokens": 32470041.0,
- "step": 3634
- },
- {
- "epoch": 2.762158054711246,
- "grad_norm": 1.038554072380066,
- "learning_rate": 8.600044473493856e-08,
- "loss": 0.26435115933418274,
- "mean_token_accuracy": 0.9002813100814819,
- "num_tokens": 32492633.0,
- "step": 3635
- },
- {
- "epoch": 2.762917933130699,
- "grad_norm": 3.143336057662964,
- "learning_rate": 8.545668079087438e-08,
- "loss": 0.356077641248703,
- "mean_token_accuracy": 0.890540361404419,
- "num_tokens": 32497085.0,
- "step": 3636
- },
- {
- "epoch": 2.763677811550152,
- "grad_norm": 1.8176860809326172,
- "learning_rate": 8.491461146625774e-08,
- "loss": 0.42660102248191833,
- "mean_token_accuracy": 0.8467463254928589,
- "num_tokens": 32506375.0,
- "step": 3637
- },
- {
- "epoch": 2.764437689969605,
- "grad_norm": 1.7116483449935913,
- "learning_rate": 8.437423714153292e-08,
- "loss": 0.3794213533401489,
- "mean_token_accuracy": 0.8674054145812988,
- "num_tokens": 32517443.0,
- "step": 3638
- },
- {
- "epoch": 2.7651975683890577,
- "grad_norm": 3.004796266555786,
- "learning_rate": 8.383555819595601e-08,
- "loss": 0.3199142515659332,
- "mean_token_accuracy": 0.8825819492340088,
- "num_tokens": 32527003.0,
- "step": 3639
- },
- {
- "epoch": 2.7659574468085104,
- "grad_norm": 2.6139073371887207,
- "learning_rate": 8.329857500759291e-08,
- "loss": 0.4262070059776306,
- "mean_token_accuracy": 0.8643308281898499,
- "num_tokens": 32533227.0,
- "step": 3640
- },
- {
- "epoch": 2.7667173252279635,
- "grad_norm": 1.4850772619247437,
- "learning_rate": 8.2763287953318e-08,
- "loss": 0.4211199879646301,
- "mean_token_accuracy": 0.8522083759307861,
- "num_tokens": 32546463.0,
- "step": 3641
- },
- {
- "epoch": 2.7674772036474167,
- "grad_norm": 2.1967451572418213,
- "learning_rate": 8.22296974088177e-08,
- "loss": 0.32154369354248047,
- "mean_token_accuracy": 0.9058319926261902,
- "num_tokens": 32554292.0,
- "step": 3642
- },
- {
- "epoch": 2.7682370820668694,
- "grad_norm": 1.4377225637435913,
- "learning_rate": 8.169780374858577e-08,
- "loss": 0.34665489196777344,
- "mean_token_accuracy": 0.8763554096221924,
- "num_tokens": 32567357.0,
- "step": 3643
- },
- {
- "epoch": 2.768996960486322,
- "grad_norm": 1.8216571807861328,
- "learning_rate": 8.116760734592527e-08,
- "loss": 0.39765921235084534,
- "mean_token_accuracy": 0.8595637679100037,
- "num_tokens": 32577681.0,
- "step": 3644
- },
- {
- "epoch": 2.769756838905775,
- "grad_norm": 3.732693672180176,
- "learning_rate": 8.063910857294881e-08,
- "loss": 0.16449159383773804,
- "mean_token_accuracy": 0.9406331777572632,
- "num_tokens": 32580792.0,
- "step": 3645
- },
- {
- "epoch": 2.770516717325228,
- "grad_norm": 1.4248076677322388,
- "learning_rate": 8.011230780057749e-08,
- "loss": 0.43648213148117065,
- "mean_token_accuracy": 0.8409627676010132,
- "num_tokens": 32596950.0,
- "step": 3646
- },
- {
- "epoch": 2.771276595744681,
- "grad_norm": 1.5802161693572998,
- "learning_rate": 7.958720539853971e-08,
- "loss": 0.41201114654541016,
- "mean_token_accuracy": 0.8678973913192749,
- "num_tokens": 32608870.0,
- "step": 3647
- },
- {
- "epoch": 2.7720364741641337,
- "grad_norm": 1.864032506942749,
- "learning_rate": 7.906380173537315e-08,
- "loss": 0.3839274048805237,
- "mean_token_accuracy": 0.863370418548584,
- "num_tokens": 32619357.0,
- "step": 3648
- },
- {
- "epoch": 2.772796352583587,
- "grad_norm": 2.0040485858917236,
- "learning_rate": 7.854209717842231e-08,
- "loss": 0.4682219624519348,
- "mean_token_accuracy": 0.8341292142868042,
- "num_tokens": 32628659.0,
- "step": 3649
- },
- {
- "epoch": 2.7735562310030395,
- "grad_norm": 3.2517287731170654,
- "learning_rate": 7.80220920938396e-08,
- "loss": 0.3697377145290375,
- "mean_token_accuracy": 0.8937886357307434,
- "num_tokens": 32632724.0,
- "step": 3650
- },
- {
- "epoch": 2.774316109422492,
- "grad_norm": 1.437434434890747,
- "learning_rate": 7.750378684658444e-08,
- "loss": 0.21713104844093323,
- "mean_token_accuracy": 0.9223493337631226,
- "num_tokens": 32643085.0,
- "step": 3651
- },
- {
- "epoch": 2.7750759878419453,
- "grad_norm": 1.3312400579452515,
- "learning_rate": 7.698718180042392e-08,
- "loss": 0.3078494668006897,
- "mean_token_accuracy": 0.8865747451782227,
- "num_tokens": 32657205.0,
- "step": 3652
- },
- {
- "epoch": 2.7758358662613984,
- "grad_norm": 1.3009766340255737,
- "learning_rate": 7.647227731793078e-08,
- "loss": 0.33374494314193726,
- "mean_token_accuracy": 0.8755972385406494,
- "num_tokens": 32670785.0,
- "step": 3653
- },
- {
- "epoch": 2.776595744680851,
- "grad_norm": 1.7956385612487793,
- "learning_rate": 7.595907376048512e-08,
- "loss": 0.3185005486011505,
- "mean_token_accuracy": 0.896104097366333,
- "num_tokens": 32679376.0,
- "step": 3654
- },
- {
- "epoch": 2.777355623100304,
- "grad_norm": 1.9820408821105957,
- "learning_rate": 7.544757148827297e-08,
- "loss": 0.34602630138397217,
- "mean_token_accuracy": 0.9006669521331787,
- "num_tokens": 32687327.0,
- "step": 3655
- },
- {
- "epoch": 2.778115501519757,
- "grad_norm": 1.447498083114624,
- "learning_rate": 7.493777086028608e-08,
- "loss": 0.29633957147598267,
- "mean_token_accuracy": 0.8827477693557739,
- "num_tokens": 32698669.0,
- "step": 3656
- },
- {
- "epoch": 2.7788753799392096,
- "grad_norm": 1.195237159729004,
- "learning_rate": 7.442967223432212e-08,
- "loss": 0.25846078991889954,
- "mean_token_accuracy": 0.932551920413971,
- "num_tokens": 32713411.0,
- "step": 3657
- },
- {
- "epoch": 2.7796352583586628,
- "grad_norm": 1.4306368827819824,
- "learning_rate": 7.392327596698474e-08,
- "loss": 0.22794288396835327,
- "mean_token_accuracy": 0.9128783941268921,
- "num_tokens": 32724629.0,
- "step": 3658
- },
- {
- "epoch": 2.7803951367781155,
- "grad_norm": 3.5105903148651123,
- "learning_rate": 7.341858241368182e-08,
- "loss": 0.21695205569267273,
- "mean_token_accuracy": 0.9189575910568237,
- "num_tokens": 32728392.0,
- "step": 3659
- },
- {
- "epoch": 2.7811550151975686,
- "grad_norm": 2.8782589435577393,
- "learning_rate": 7.291559192862701e-08,
- "loss": 0.3374413847923279,
- "mean_token_accuracy": 0.9080451726913452,
- "num_tokens": 32733126.0,
- "step": 3660
- },
- {
- "epoch": 2.7819148936170213,
- "grad_norm": 1.9232850074768066,
- "learning_rate": 7.24143048648382e-08,
- "loss": 0.2707790732383728,
- "mean_token_accuracy": 0.9045628309249878,
- "num_tokens": 32741378.0,
- "step": 3661
- },
- {
- "epoch": 2.782674772036474,
- "grad_norm": 1.1166657209396362,
- "learning_rate": 7.19147215741381e-08,
- "loss": 0.2668237090110779,
- "mean_token_accuracy": 0.8920862674713135,
- "num_tokens": 32760317.0,
- "step": 3662
- },
- {
- "epoch": 2.783434650455927,
- "grad_norm": 3.9177591800689697,
- "learning_rate": 7.141684240715374e-08,
- "loss": 0.18272298574447632,
- "mean_token_accuracy": 0.94575434923172,
- "num_tokens": 32763663.0,
- "step": 3663
- },
- {
- "epoch": 2.78419452887538,
- "grad_norm": 1.9616899490356445,
- "learning_rate": 7.092066771331507e-08,
- "loss": 0.20110884308815002,
- "mean_token_accuracy": 0.9169102907180786,
- "num_tokens": 32770243.0,
- "step": 3664
- },
- {
- "epoch": 2.784954407294833,
- "grad_norm": 3.5950927734375,
- "learning_rate": 7.042619784085741e-08,
- "loss": 0.24979421496391296,
- "mean_token_accuracy": 0.9095007181167603,
- "num_tokens": 32773985.0,
- "step": 3665
- },
- {
- "epoch": 2.7857142857142856,
- "grad_norm": 1.8824433088302612,
- "learning_rate": 6.993343313681872e-08,
- "loss": 0.32540541887283325,
- "mean_token_accuracy": 0.8754172921180725,
- "num_tokens": 32782040.0,
- "step": 3666
- },
- {
- "epoch": 2.7864741641337387,
- "grad_norm": 1.7720941305160522,
- "learning_rate": 6.944237394703985e-08,
- "loss": 0.2930932641029358,
- "mean_token_accuracy": 0.8913610577583313,
- "num_tokens": 32790338.0,
- "step": 3667
- },
- {
- "epoch": 2.7872340425531914,
- "grad_norm": 1.6130414009094238,
- "learning_rate": 6.895302061616483e-08,
- "loss": 0.35470184683799744,
- "mean_token_accuracy": 0.8745495676994324,
- "num_tokens": 32801160.0,
- "step": 3668
- },
- {
- "epoch": 2.7879939209726445,
- "grad_norm": 1.315376877784729,
- "learning_rate": 6.846537348764116e-08,
- "loss": 0.33905792236328125,
- "mean_token_accuracy": 0.8629679679870605,
- "num_tokens": 32816508.0,
- "step": 3669
- },
- {
- "epoch": 2.788753799392097,
- "grad_norm": 1.9508394002914429,
- "learning_rate": 6.797943290371839e-08,
- "loss": 0.27722638845443726,
- "mean_token_accuracy": 0.8903636932373047,
- "num_tokens": 32824029.0,
- "step": 3670
- },
- {
- "epoch": 2.7895136778115504,
- "grad_norm": 0.9335530996322632,
- "learning_rate": 6.74951992054479e-08,
- "loss": 0.3004249632358551,
- "mean_token_accuracy": 0.887278139591217,
- "num_tokens": 32849091.0,
- "step": 3671
- },
- {
- "epoch": 2.790273556231003,
- "grad_norm": 1.8353229761123657,
- "learning_rate": 6.701267273268392e-08,
- "loss": 0.3471749424934387,
- "mean_token_accuracy": 0.8823778629302979,
- "num_tokens": 32858285.0,
- "step": 3672
- },
- {
- "epoch": 2.7910334346504557,
- "grad_norm": 2.469905138015747,
- "learning_rate": 6.653185382408195e-08,
- "loss": 0.27492985129356384,
- "mean_token_accuracy": 0.898033857345581,
- "num_tokens": 32863568.0,
- "step": 3673
- },
- {
- "epoch": 2.791793313069909,
- "grad_norm": 1.861342430114746,
- "learning_rate": 6.605274281709929e-08,
- "loss": 0.4201383590698242,
- "mean_token_accuracy": 0.8511666655540466,
- "num_tokens": 32873794.0,
- "step": 3674
- },
- {
- "epoch": 2.7925531914893615,
- "grad_norm": 1.6716010570526123,
- "learning_rate": 6.557534004799443e-08,
- "loss": 0.31345364451408386,
- "mean_token_accuracy": 0.8953241109848022,
- "num_tokens": 32883515.0,
- "step": 3675
- },
- {
- "epoch": 2.7933130699088147,
- "grad_norm": 1.566288709640503,
- "learning_rate": 6.509964585182688e-08,
- "loss": 0.36333587765693665,
- "mean_token_accuracy": 0.866706132888794,
- "num_tokens": 32895232.0,
- "step": 3676
- },
- {
- "epoch": 2.7940729483282674,
- "grad_norm": 1.5501067638397217,
- "learning_rate": 6.462566056245761e-08,
- "loss": 0.2846035957336426,
- "mean_token_accuracy": 0.9041277766227722,
- "num_tokens": 32903854.0,
- "step": 3677
- },
- {
- "epoch": 2.7948328267477205,
- "grad_norm": 2.15285325050354,
- "learning_rate": 6.415338451254722e-08,
- "loss": 0.35233989357948303,
- "mean_token_accuracy": 0.8840795159339905,
- "num_tokens": 32911633.0,
- "step": 3678
- },
- {
- "epoch": 2.795592705167173,
- "grad_norm": 1.3108829259872437,
- "learning_rate": 6.368281803355692e-08,
- "loss": 0.3379764258861542,
- "mean_token_accuracy": 0.9114458560943604,
- "num_tokens": 32925455.0,
- "step": 3679
- },
- {
- "epoch": 2.7963525835866263,
- "grad_norm": 1.818579912185669,
- "learning_rate": 6.321396145574948e-08,
- "loss": 0.32847997546195984,
- "mean_token_accuracy": 0.8970182538032532,
- "num_tokens": 32935029.0,
- "step": 3680
- },
- {
- "epoch": 2.797112462006079,
- "grad_norm": 3.7173373699188232,
- "learning_rate": 6.274681510818587e-08,
- "loss": 0.18795353174209595,
- "mean_token_accuracy": 0.9429396986961365,
- "num_tokens": 32938652.0,
- "step": 3681
- },
- {
- "epoch": 2.797872340425532,
- "grad_norm": 2.2997212409973145,
- "learning_rate": 6.228137931872713e-08,
- "loss": 0.34515100717544556,
- "mean_token_accuracy": 0.878103494644165,
- "num_tokens": 32945409.0,
- "step": 3682
- },
- {
- "epoch": 2.798632218844985,
- "grad_norm": 2.424675941467285,
- "learning_rate": 6.18176544140342e-08,
- "loss": 0.2552722990512848,
- "mean_token_accuracy": 0.9087961316108704,
- "num_tokens": 32950721.0,
- "step": 3683
- },
- {
- "epoch": 2.7993920972644375,
- "grad_norm": 2.662060022354126,
- "learning_rate": 6.135564071956729e-08,
- "loss": 0.2554262578487396,
- "mean_token_accuracy": 0.9034075736999512,
- "num_tokens": 32955891.0,
- "step": 3684
- },
- {
- "epoch": 2.8001519756838906,
- "grad_norm": 1.1945017576217651,
- "learning_rate": 6.089533855958508e-08,
- "loss": 0.36223694682121277,
- "mean_token_accuracy": 0.8567380905151367,
- "num_tokens": 32971543.0,
- "step": 3685
- },
- {
- "epoch": 2.8009118541033433,
- "grad_norm": 1.2724100351333618,
- "learning_rate": 6.043674825714607e-08,
- "loss": 0.35224610567092896,
- "mean_token_accuracy": 0.8696926832199097,
- "num_tokens": 32986452.0,
- "step": 3686
- },
- {
- "epoch": 2.8016717325227964,
- "grad_norm": 1.3042409420013428,
- "learning_rate": 5.997987013410533e-08,
- "loss": 0.38680803775787354,
- "mean_token_accuracy": 0.8600257635116577,
- "num_tokens": 33005534.0,
- "step": 3687
- },
- {
- "epoch": 2.802431610942249,
- "grad_norm": 2.448430299758911,
- "learning_rate": 5.9524704511118305e-08,
- "loss": 0.13345648348331451,
- "mean_token_accuracy": 0.9592865705490112,
- "num_tokens": 33009403.0,
- "step": 3688
- },
- {
- "epoch": 2.8031914893617023,
- "grad_norm": 1.1455037593841553,
- "learning_rate": 5.9071251707638056e-08,
- "loss": 0.3144465982913971,
- "mean_token_accuracy": 0.8841190338134766,
- "num_tokens": 33028129.0,
- "step": 3689
- },
- {
- "epoch": 2.803951367781155,
- "grad_norm": 2.0947425365448,
- "learning_rate": 5.861951204191446e-08,
- "loss": 0.36041027307510376,
- "mean_token_accuracy": 0.8605015873908997,
- "num_tokens": 33036379.0,
- "step": 3690
- },
- {
- "epoch": 2.8047112462006076,
- "grad_norm": 3.1552155017852783,
- "learning_rate": 5.8169485830996134e-08,
- "loss": 0.32727721333503723,
- "mean_token_accuracy": 0.9110068678855896,
- "num_tokens": 33040276.0,
- "step": 3691
- },
- {
- "epoch": 2.8054711246200608,
- "grad_norm": 2.5555851459503174,
- "learning_rate": 5.772117339072902e-08,
- "loss": 0.23542895913124084,
- "mean_token_accuracy": 0.91229647397995,
- "num_tokens": 33045308.0,
- "step": 3692
- },
- {
- "epoch": 2.806231003039514,
- "grad_norm": 2.4970197677612305,
- "learning_rate": 5.7274575035755896e-08,
- "loss": 0.13501018285751343,
- "mean_token_accuracy": 0.9495668411254883,
- "num_tokens": 33049012.0,
- "step": 3693
- },
- {
- "epoch": 2.8069908814589666,
- "grad_norm": 3.25179123878479,
- "learning_rate": 5.68296910795163e-08,
- "loss": 0.39757871627807617,
- "mean_token_accuracy": 0.8692524433135986,
- "num_tokens": 33053004.0,
- "step": 3694
- },
- {
- "epoch": 2.8077507598784193,
- "grad_norm": 2.4152987003326416,
- "learning_rate": 5.6386521834247696e-08,
- "loss": 0.3562552332878113,
- "mean_token_accuracy": 0.8817118406295776,
- "num_tokens": 33059557.0,
- "step": 3695
- },
- {
- "epoch": 2.8085106382978724,
- "grad_norm": 2.051687002182007,
- "learning_rate": 5.5945067610982395e-08,
- "loss": 0.5281018018722534,
- "mean_token_accuracy": 0.8174080848693848,
- "num_tokens": 33068691.0,
- "step": 3696
- },
- {
- "epoch": 2.809270516717325,
- "grad_norm": 3.8002891540527344,
- "learning_rate": 5.550532871955061e-08,
- "loss": 0.20866292715072632,
- "mean_token_accuracy": 0.9262990951538086,
- "num_tokens": 33072085.0,
- "step": 3697
- },
- {
- "epoch": 2.810030395136778,
- "grad_norm": 2.3774707317352295,
- "learning_rate": 5.506730546857797e-08,
- "loss": 0.2632027566432953,
- "mean_token_accuracy": 0.9251352548599243,
- "num_tokens": 33078720.0,
- "step": 3698
- },
- {
- "epoch": 2.810790273556231,
- "grad_norm": 1.3897415399551392,
- "learning_rate": 5.463099816548578e-08,
- "loss": 0.3936246931552887,
- "mean_token_accuracy": 0.8637404441833496,
- "num_tokens": 33092660.0,
- "step": 3699
- },
- {
- "epoch": 2.811550151975684,
- "grad_norm": 1.5614900588989258,
- "learning_rate": 5.419640711649188e-08,
- "loss": 0.44372743368148804,
- "mean_token_accuracy": 0.8500189185142517,
- "num_tokens": 33104431.0,
- "step": 3700
- },
- {
- "epoch": 2.8123100303951367,
- "grad_norm": 1.466921329498291,
- "learning_rate": 5.376353262660811e-08,
- "loss": 0.3102647066116333,
- "mean_token_accuracy": 0.8741628527641296,
- "num_tokens": 33115290.0,
- "step": 3701
- },
- {
- "epoch": 2.8130699088145894,
- "grad_norm": 1.6993112564086914,
- "learning_rate": 5.333237499964283e-08,
- "loss": 0.4017091989517212,
- "mean_token_accuracy": 0.865143358707428,
- "num_tokens": 33126710.0,
- "step": 3702
- },
- {
- "epoch": 2.8138297872340425,
- "grad_norm": 2.2112064361572266,
- "learning_rate": 5.290293453819956e-08,
- "loss": 0.3109806776046753,
- "mean_token_accuracy": 0.9097060561180115,
- "num_tokens": 33133186.0,
- "step": 3703
- },
- {
- "epoch": 2.8145896656534957,
- "grad_norm": 1.9934327602386475,
- "learning_rate": 5.247521154367552e-08,
- "loss": 0.35044047236442566,
- "mean_token_accuracy": 0.874421238899231,
- "num_tokens": 33140329.0,
- "step": 3704
- },
- {
- "epoch": 2.8153495440729484,
- "grad_norm": 2.815687656402588,
- "learning_rate": 5.2049206316263366e-08,
- "loss": 0.2516332268714905,
- "mean_token_accuracy": 0.9180612564086914,
- "num_tokens": 33144861.0,
- "step": 3705
- },
- {
- "epoch": 2.816109422492401,
- "grad_norm": 1.7479608058929443,
- "learning_rate": 5.162491915495005e-08,
- "loss": 0.16342511773109436,
- "mean_token_accuracy": 0.9410310983657837,
- "num_tokens": 33151936.0,
- "step": 3706
- },
- {
- "epoch": 2.816869300911854,
- "grad_norm": 1.3695951700210571,
- "learning_rate": 5.120235035751653e-08,
- "loss": 0.2908460199832916,
- "mean_token_accuracy": 0.9211517572402954,
- "num_tokens": 33164151.0,
- "step": 3707
- },
- {
- "epoch": 2.817629179331307,
- "grad_norm": 2.370861768722534,
- "learning_rate": 5.0781500220537797e-08,
- "loss": 0.26081186532974243,
- "mean_token_accuracy": 0.9090365171432495,
- "num_tokens": 33169551.0,
- "step": 3708
- },
- {
- "epoch": 2.81838905775076,
- "grad_norm": 1.627031922340393,
- "learning_rate": 5.036236903938285e-08,
- "loss": 0.2977932393550873,
- "mean_token_accuracy": 0.9078235626220703,
- "num_tokens": 33179586.0,
- "step": 3709
- },
- {
- "epoch": 2.8191489361702127,
- "grad_norm": 1.830381155014038,
- "learning_rate": 4.9944957108213896e-08,
- "loss": 0.2239128053188324,
- "mean_token_accuracy": 0.9216980934143066,
- "num_tokens": 33186754.0,
- "step": 3710
- },
- {
- "epoch": 2.819908814589666,
- "grad_norm": 2.419703245162964,
- "learning_rate": 4.952926471998687e-08,
- "loss": 0.3302939832210541,
- "mean_token_accuracy": 0.9000803232192993,
- "num_tokens": 33192512.0,
- "step": 3711
- },
- {
- "epoch": 2.8206686930091185,
- "grad_norm": 2.2166857719421387,
- "learning_rate": 4.911529216645089e-08,
- "loss": 0.2880767285823822,
- "mean_token_accuracy": 0.9058420658111572,
- "num_tokens": 33198274.0,
- "step": 3712
- },
- {
- "epoch": 2.821428571428571,
- "grad_norm": 1.357695460319519,
- "learning_rate": 4.8703039738147165e-08,
- "loss": 0.38549065589904785,
- "mean_token_accuracy": 0.8689560890197754,
- "num_tokens": 33213015.0,
- "step": 3713
- },
- {
- "epoch": 2.8221884498480243,
- "grad_norm": 1.3445006608963013,
- "learning_rate": 4.829250772441091e-08,
- "loss": 0.28673315048217773,
- "mean_token_accuracy": 0.8871713876724243,
- "num_tokens": 33226895.0,
- "step": 3714
- },
- {
- "epoch": 2.8229483282674774,
- "grad_norm": 2.043430805206299,
- "learning_rate": 4.788369641336943e-08,
- "loss": 0.27235424518585205,
- "mean_token_accuracy": 0.9001829624176025,
- "num_tokens": 33233991.0,
- "step": 3715
- },
- {
- "epoch": 2.82370820668693,
- "grad_norm": 3.290034294128418,
- "learning_rate": 4.7476606091941544e-08,
- "loss": 0.3277619481086731,
- "mean_token_accuracy": 0.9064863324165344,
- "num_tokens": 33238393.0,
- "step": 3716
- },
- {
- "epoch": 2.824468085106383,
- "grad_norm": 3.1663918495178223,
- "learning_rate": 4.707123704583927e-08,
- "loss": 0.2841528058052063,
- "mean_token_accuracy": 0.9187209606170654,
- "num_tokens": 33242428.0,
- "step": 3717
- },
- {
- "epoch": 2.825227963525836,
- "grad_norm": 1.2812966108322144,
- "learning_rate": 4.6667589559566405e-08,
- "loss": 0.4020092785358429,
- "mean_token_accuracy": 0.8751412630081177,
- "num_tokens": 33257996.0,
- "step": 3718
- },
- {
- "epoch": 2.8259878419452886,
- "grad_norm": 1.4390029907226562,
- "learning_rate": 4.626566391641774e-08,
- "loss": 0.44845378398895264,
- "mean_token_accuracy": 0.8416492938995361,
- "num_tokens": 33271661.0,
- "step": 3719
- },
- {
- "epoch": 2.8267477203647418,
- "grad_norm": 1.5283807516098022,
- "learning_rate": 4.586546039848094e-08,
- "loss": 0.28856372833251953,
- "mean_token_accuracy": 0.8961426019668579,
- "num_tokens": 33282969.0,
- "step": 3720
- },
- {
- "epoch": 2.8275075987841944,
- "grad_norm": 1.5666929483413696,
- "learning_rate": 4.546697928663357e-08,
- "loss": 0.3489445149898529,
- "mean_token_accuracy": 0.8704522848129272,
- "num_tokens": 33293549.0,
- "step": 3721
- },
- {
- "epoch": 2.8282674772036476,
- "grad_norm": 1.6343169212341309,
- "learning_rate": 4.5070220860545244e-08,
- "loss": 0.3505254089832306,
- "mean_token_accuracy": 0.8735896348953247,
- "num_tokens": 33304821.0,
- "step": 3722
- },
- {
- "epoch": 2.8290273556231003,
- "grad_norm": 1.963257074356079,
- "learning_rate": 4.467518539867655e-08,
- "loss": 0.3180759847164154,
- "mean_token_accuracy": 0.8902066946029663,
- "num_tokens": 33312313.0,
- "step": 3723
- },
- {
- "epoch": 2.829787234042553,
- "grad_norm": 3.3562021255493164,
- "learning_rate": 4.428187317827848e-08,
- "loss": 0.23085635900497437,
- "mean_token_accuracy": 0.9242620468139648,
- "num_tokens": 33315831.0,
- "step": 3724
- },
- {
- "epoch": 2.830547112462006,
- "grad_norm": 1.7402317523956299,
- "learning_rate": 4.3890284475392175e-08,
- "loss": 0.27766430377960205,
- "mean_token_accuracy": 0.8943138122558594,
- "num_tokens": 33324982.0,
- "step": 3725
- },
- {
- "epoch": 2.831306990881459,
- "grad_norm": 1.6835107803344727,
- "learning_rate": 4.350041956485029e-08,
- "loss": 0.35358738899230957,
- "mean_token_accuracy": 0.8683137893676758,
- "num_tokens": 33334979.0,
- "step": 3726
- },
- {
- "epoch": 2.832066869300912,
- "grad_norm": 2.232856035232544,
- "learning_rate": 4.311227872027479e-08,
- "loss": 0.3305876851081848,
- "mean_token_accuracy": 0.885346531867981,
- "num_tokens": 33341951.0,
- "step": 3727
- },
- {
- "epoch": 2.8328267477203646,
- "grad_norm": 1.763230800628662,
- "learning_rate": 4.272586221407776e-08,
- "loss": 0.3677369952201843,
- "mean_token_accuracy": 0.8810771703720093,
- "num_tokens": 33351110.0,
- "step": 3728
- },
- {
- "epoch": 2.8335866261398177,
- "grad_norm": 1.3161970376968384,
- "learning_rate": 4.2341170317461433e-08,
- "loss": 0.4191834628582001,
- "mean_token_accuracy": 0.8625809550285339,
- "num_tokens": 33368231.0,
- "step": 3729
- },
- {
- "epoch": 2.8343465045592704,
- "grad_norm": 2.151383399963379,
- "learning_rate": 4.1958203300417056e-08,
- "loss": 0.30521994829177856,
- "mean_token_accuracy": 0.8904989957809448,
- "num_tokens": 33374755.0,
- "step": 3730
- },
- {
- "epoch": 2.8351063829787235,
- "grad_norm": 1.2751890420913696,
- "learning_rate": 4.1576961431726016e-08,
- "loss": 0.2024286538362503,
- "mean_token_accuracy": 0.9254995584487915,
- "num_tokens": 33385820.0,
- "step": 3731
- },
- {
- "epoch": 2.835866261398176,
- "grad_norm": 2.5229005813598633,
- "learning_rate": 4.119744497895817e-08,
- "loss": 0.2631904184818268,
- "mean_token_accuracy": 0.9213854074478149,
- "num_tokens": 33390577.0,
- "step": 3732
- },
- {
- "epoch": 2.8366261398176293,
- "grad_norm": 1.3829402923583984,
- "learning_rate": 4.0819654208472947e-08,
- "loss": 0.3373589813709259,
- "mean_token_accuracy": 0.8810330629348755,
- "num_tokens": 33404300.0,
- "step": 3733
- },
- {
- "epoch": 2.837386018237082,
- "grad_norm": 1.395129919052124,
- "learning_rate": 4.044358938541853e-08,
- "loss": 0.27040547132492065,
- "mean_token_accuracy": 0.8935626745223999,
- "num_tokens": 33418071.0,
- "step": 3734
- },
- {
- "epoch": 2.8381458966565347,
- "grad_norm": 2.4185354709625244,
- "learning_rate": 4.006925077373158e-08,
- "loss": 0.2641582489013672,
- "mean_token_accuracy": 0.9196245670318604,
- "num_tokens": 33423213.0,
- "step": 3735
- },
- {
- "epoch": 2.838905775075988,
- "grad_norm": 1.9432255029678345,
- "learning_rate": 3.969663863613721e-08,
- "loss": 0.31337353587150574,
- "mean_token_accuracy": 0.886800229549408,
- "num_tokens": 33432442.0,
- "step": 3736
- },
- {
- "epoch": 2.839665653495441,
- "grad_norm": 1.1473867893218994,
- "learning_rate": 3.9325753234149276e-08,
- "loss": 0.3156060576438904,
- "mean_token_accuracy": 0.8809531331062317,
- "num_tokens": 33452184.0,
- "step": 3737
- },
- {
- "epoch": 2.8404255319148937,
- "grad_norm": 2.233121633529663,
- "learning_rate": 3.8956594828069295e-08,
- "loss": 0.31154608726501465,
- "mean_token_accuracy": 0.8883147239685059,
- "num_tokens": 33458643.0,
- "step": 3738
- },
- {
- "epoch": 2.8411854103343464,
- "grad_norm": 2.165466070175171,
- "learning_rate": 3.8589163676986674e-08,
- "loss": 0.38480815291404724,
- "mean_token_accuracy": 0.8609665036201477,
- "num_tokens": 33466465.0,
- "step": 3739
- },
- {
- "epoch": 2.8419452887537995,
- "grad_norm": 3.5072174072265625,
- "learning_rate": 3.822346003877875e-08,
- "loss": 0.45201557874679565,
- "mean_token_accuracy": 0.8519665002822876,
- "num_tokens": 33470826.0,
- "step": 3740
- },
- {
- "epoch": 2.842705167173252,
- "grad_norm": 2.2038586139678955,
- "learning_rate": 3.785948417011076e-08,
- "loss": 0.34780675172805786,
- "mean_token_accuracy": 0.8806177377700806,
- "num_tokens": 33478706.0,
- "step": 3741
- },
- {
- "epoch": 2.8434650455927053,
- "grad_norm": 1.8423243761062622,
- "learning_rate": 3.749723632643476e-08,
- "loss": 0.2681577205657959,
- "mean_token_accuracy": 0.9055651426315308,
- "num_tokens": 33486200.0,
- "step": 3742
- },
- {
- "epoch": 2.844224924012158,
- "grad_norm": 1.3372201919555664,
- "learning_rate": 3.713671676199016e-08,
- "loss": 0.3277212381362915,
- "mean_token_accuracy": 0.8801225423812866,
- "num_tokens": 33499465.0,
- "step": 3743
- },
- {
- "epoch": 2.844984802431611,
- "grad_norm": 2.303901195526123,
- "learning_rate": 3.677792572980371e-08,
- "loss": 0.2349717617034912,
- "mean_token_accuracy": 0.9109916090965271,
- "num_tokens": 33505491.0,
- "step": 3744
- },
- {
- "epoch": 2.845744680851064,
- "grad_norm": 2.1374688148498535,
- "learning_rate": 3.642086348168844e-08,
- "loss": 0.3567136526107788,
- "mean_token_accuracy": 0.8669205904006958,
- "num_tokens": 33512665.0,
- "step": 3745
- },
- {
- "epoch": 2.8465045592705165,
- "grad_norm": 3.476426362991333,
- "learning_rate": 3.6065530268244445e-08,
- "loss": 0.3189643621444702,
- "mean_token_accuracy": 0.882624626159668,
- "num_tokens": 33516449.0,
- "step": 3746
- },
- {
- "epoch": 2.8472644376899696,
- "grad_norm": 1.094572901725769,
- "learning_rate": 3.5711926338858335e-08,
- "loss": 0.25354239344596863,
- "mean_token_accuracy": 0.9008959531784058,
- "num_tokens": 33536298.0,
- "step": 3747
- },
- {
- "epoch": 2.8480243161094227,
- "grad_norm": 1.375033974647522,
- "learning_rate": 3.536005194170328e-08,
- "loss": 0.2859119772911072,
- "mean_token_accuracy": 0.8998885154724121,
- "num_tokens": 33548861.0,
- "step": 3748
- },
- {
- "epoch": 2.8487841945288754,
- "grad_norm": 1.96660578250885,
- "learning_rate": 3.5009907323737826e-08,
- "loss": 0.35728299617767334,
- "mean_token_accuracy": 0.8976923227310181,
- "num_tokens": 33556270.0,
- "step": 3749
- },
- {
- "epoch": 2.849544072948328,
- "grad_norm": 2.8434062004089355,
- "learning_rate": 3.466149273070707e-08,
- "loss": 0.25592705607414246,
- "mean_token_accuracy": 0.9228044748306274,
- "num_tokens": 33560603.0,
- "step": 3750
- },
- {
- "epoch": 2.8503039513677813,
- "grad_norm": 2.7658159732818604,
- "learning_rate": 3.431480840714152e-08,
- "loss": 0.33110958337783813,
- "mean_token_accuracy": 0.8761162161827087,
- "num_tokens": 33565428.0,
- "step": 3751
- },
- {
- "epoch": 2.851063829787234,
- "grad_norm": 3.696040391921997,
- "learning_rate": 3.396985459635821e-08,
- "loss": 0.29301607608795166,
- "mean_token_accuracy": 0.9034254550933838,
- "num_tokens": 33568866.0,
- "step": 3752
- },
- {
- "epoch": 2.851823708206687,
- "grad_norm": 1.8923646211624146,
- "learning_rate": 3.3626631540458754e-08,
- "loss": 0.3817586600780487,
- "mean_token_accuracy": 0.8635997772216797,
- "num_tokens": 33578141.0,
- "step": 3753
- },
- {
- "epoch": 2.8525835866261398,
- "grad_norm": 1.6717027425765991,
- "learning_rate": 3.328513948032991e-08,
- "loss": 0.37302929162979126,
- "mean_token_accuracy": 0.8670454025268555,
- "num_tokens": 33588694.0,
- "step": 3754
- },
- {
- "epoch": 2.853343465045593,
- "grad_norm": 3.2549097537994385,
- "learning_rate": 3.29453786556444e-08,
- "loss": 0.27366238832473755,
- "mean_token_accuracy": 0.9079047441482544,
- "num_tokens": 33592813.0,
- "step": 3755
- },
- {
- "epoch": 2.8541033434650456,
- "grad_norm": 1.3533412218093872,
- "learning_rate": 3.260734930485926e-08,
- "loss": 0.4412471652030945,
- "mean_token_accuracy": 0.839799165725708,
- "num_tokens": 33609765.0,
- "step": 3756
- },
- {
- "epoch": 2.8548632218844983,
- "grad_norm": 2.876262903213501,
- "learning_rate": 3.227105166521638e-08,
- "loss": 0.3382536768913269,
- "mean_token_accuracy": 0.879544734954834,
- "num_tokens": 33614131.0,
- "step": 3757
- },
- {
- "epoch": 2.8556231003039514,
- "grad_norm": 1.9969818592071533,
- "learning_rate": 3.193648597274279e-08,
- "loss": 0.24406743049621582,
- "mean_token_accuracy": 0.9072264432907104,
- "num_tokens": 33621867.0,
- "step": 3758
- },
- {
- "epoch": 2.8563829787234045,
- "grad_norm": 2.934230089187622,
- "learning_rate": 3.1603652462249e-08,
- "loss": 0.0893428698182106,
- "mean_token_accuracy": 0.9702994227409363,
- "num_tokens": 33625133.0,
- "step": 3759
- },
- {
- "epoch": 2.857142857142857,
- "grad_norm": 3.162353038787842,
- "learning_rate": 3.127255136733093e-08,
- "loss": 0.2535284459590912,
- "mean_token_accuracy": 0.8997728824615479,
- "num_tokens": 33629391.0,
- "step": 3760
- },
- {
- "epoch": 2.85790273556231,
- "grad_norm": 1.3975396156311035,
- "learning_rate": 3.094318292036824e-08,
- "loss": 0.37006449699401855,
- "mean_token_accuracy": 0.8666602373123169,
- "num_tokens": 33644831.0,
- "step": 3761
- },
- {
- "epoch": 2.858662613981763,
- "grad_norm": 1.4016542434692383,
- "learning_rate": 3.061554735252325e-08,
- "loss": 0.33619073033332825,
- "mean_token_accuracy": 0.8836570978164673,
- "num_tokens": 33660918.0,
- "step": 3762
- },
- {
- "epoch": 2.8594224924012157,
- "grad_norm": 1.3213437795639038,
- "learning_rate": 3.028964489374453e-08,
- "loss": 0.29083719849586487,
- "mean_token_accuracy": 0.9077234864234924,
- "num_tokens": 33672778.0,
- "step": 3763
- },
- {
- "epoch": 2.860182370820669,
- "grad_norm": 2.157179594039917,
- "learning_rate": 2.9965475772762154e-08,
- "loss": 0.35480785369873047,
- "mean_token_accuracy": 0.8937191367149353,
- "num_tokens": 33680991.0,
- "step": 3764
- },
- {
- "epoch": 2.8609422492401215,
- "grad_norm": 3.584878921508789,
- "learning_rate": 2.96430402170908e-08,
- "loss": 0.34448280930519104,
- "mean_token_accuracy": 0.878994345664978,
- "num_tokens": 33685137.0,
- "step": 3765
- },
- {
- "epoch": 2.8617021276595747,
- "grad_norm": 1.7320963144302368,
- "learning_rate": 2.9322338453028066e-08,
- "loss": 0.40042293071746826,
- "mean_token_accuracy": 0.8563319444656372,
- "num_tokens": 33694591.0,
- "step": 3766
- },
- {
- "epoch": 2.8624620060790273,
- "grad_norm": 1.6684232950210571,
- "learning_rate": 2.900337070565473e-08,
- "loss": 0.4402884542942047,
- "mean_token_accuracy": 0.8600190877914429,
- "num_tokens": 33708467.0,
- "step": 3767
- },
- {
- "epoch": 2.86322188449848,
- "grad_norm": 1.9484777450561523,
- "learning_rate": 2.8686137198834784e-08,
- "loss": 0.2297988086938858,
- "mean_token_accuracy": 0.9216253161430359,
- "num_tokens": 33715825.0,
- "step": 3768
- },
- {
- "epoch": 2.863981762917933,
- "grad_norm": 1.4594624042510986,
- "learning_rate": 2.8370638155215125e-08,
- "loss": 0.2471354901790619,
- "mean_token_accuracy": 0.9343935251235962,
- "num_tokens": 33726774.0,
- "step": 3769
- },
- {
- "epoch": 2.8647416413373863,
- "grad_norm": 1.75857412815094,
- "learning_rate": 2.805687379622446e-08,
- "loss": 0.3599606156349182,
- "mean_token_accuracy": 0.8600481748580933,
- "num_tokens": 33738487.0,
- "step": 3770
- },
- {
- "epoch": 2.865501519756839,
- "grad_norm": 2.5933029651641846,
- "learning_rate": 2.774484434207525e-08,
- "loss": 0.3705040514469147,
- "mean_token_accuracy": 0.8960624933242798,
- "num_tokens": 33743954.0,
- "step": 3771
- },
- {
- "epoch": 2.8662613981762917,
- "grad_norm": 2.339298963546753,
- "learning_rate": 2.7434550011761763e-08,
- "loss": 0.35568612813949585,
- "mean_token_accuracy": 0.8733487725257874,
- "num_tokens": 33750214.0,
- "step": 3772
- },
- {
- "epoch": 2.867021276595745,
- "grad_norm": 2.2959485054016113,
- "learning_rate": 2.712599102306035e-08,
- "loss": 0.2672561705112457,
- "mean_token_accuracy": 0.9030044078826904,
- "num_tokens": 33756736.0,
- "step": 3773
- },
- {
- "epoch": 2.8677811550151975,
- "grad_norm": 1.500349521636963,
- "learning_rate": 2.681916759252917e-08,
- "loss": 0.41401299834251404,
- "mean_token_accuracy": 0.8844438195228577,
- "num_tokens": 33769268.0,
- "step": 3774
- },
- {
- "epoch": 2.8685410334346506,
- "grad_norm": 1.7837727069854736,
- "learning_rate": 2.6514079935509586e-08,
- "loss": 0.2668437957763672,
- "mean_token_accuracy": 0.8956533670425415,
- "num_tokens": 33777122.0,
- "step": 3775
- },
- {
- "epoch": 2.8693009118541033,
- "grad_norm": 1.717192530632019,
- "learning_rate": 2.6210728266123364e-08,
- "loss": 0.25972551107406616,
- "mean_token_accuracy": 0.883383572101593,
- "num_tokens": 33785044.0,
- "step": 3776
- },
- {
- "epoch": 2.8700607902735564,
- "grad_norm": 1.9367283582687378,
- "learning_rate": 2.5909112797274093e-08,
- "loss": 0.44500526785850525,
- "mean_token_accuracy": 0.8556182980537415,
- "num_tokens": 33794610.0,
- "step": 3777
- },
- {
- "epoch": 2.870820668693009,
- "grad_norm": 1.4821012020111084,
- "learning_rate": 2.560923374064772e-08,
- "loss": 0.3385273218154907,
- "mean_token_accuracy": 0.873454749584198,
- "num_tokens": 33807602.0,
- "step": 3778
- },
- {
- "epoch": 2.871580547112462,
- "grad_norm": 2.105130195617676,
- "learning_rate": 2.531109130671061e-08,
- "loss": 0.2996317446231842,
- "mean_token_accuracy": 0.8943172693252563,
- "num_tokens": 33814280.0,
- "step": 3779
- },
- {
- "epoch": 2.872340425531915,
- "grad_norm": 2.1374971866607666,
- "learning_rate": 2.501468570471066e-08,
- "loss": 0.3201690912246704,
- "mean_token_accuracy": 0.8778494596481323,
- "num_tokens": 33821842.0,
- "step": 3780
- },
- {
- "epoch": 2.8731003039513676,
- "grad_norm": 2.2370989322662354,
- "learning_rate": 2.4720017142676745e-08,
- "loss": 0.4030833840370178,
- "mean_token_accuracy": 0.8520782589912415,
- "num_tokens": 33830051.0,
- "step": 3781
- },
- {
- "epoch": 2.8738601823708207,
- "grad_norm": 2.3659868240356445,
- "learning_rate": 2.4427085827418706e-08,
- "loss": 0.2570466697216034,
- "mean_token_accuracy": 0.9111968874931335,
- "num_tokens": 33835753.0,
- "step": 3782
- },
- {
- "epoch": 2.8746200607902734,
- "grad_norm": 2.252115249633789,
- "learning_rate": 2.4135891964526535e-08,
- "loss": 0.373632550239563,
- "mean_token_accuracy": 0.8691182136535645,
- "num_tokens": 33842183.0,
- "step": 3783
- },
- {
- "epoch": 2.8753799392097266,
- "grad_norm": 1.216013789176941,
- "learning_rate": 2.3846435758372034e-08,
- "loss": 0.3572605848312378,
- "mean_token_accuracy": 0.8590090274810791,
- "num_tokens": 33860538.0,
- "step": 3784
- },
- {
- "epoch": 2.8761398176291793,
- "grad_norm": 2.739243268966675,
- "learning_rate": 2.3558717412106025e-08,
- "loss": 0.3257160782814026,
- "mean_token_accuracy": 0.8806333541870117,
- "num_tokens": 33866134.0,
- "step": 3785
- },
- {
- "epoch": 2.8768996960486324,
- "grad_norm": 1.683767557144165,
- "learning_rate": 2.3272737127660595e-08,
- "loss": 0.3267333507537842,
- "mean_token_accuracy": 0.9005235433578491,
- "num_tokens": 33875630.0,
- "step": 3786
- },
- {
- "epoch": 2.877659574468085,
- "grad_norm": 1.4830154180526733,
- "learning_rate": 2.2988495105748245e-08,
- "loss": 0.28507307171821594,
- "mean_token_accuracy": 0.9133665561676025,
- "num_tokens": 33887110.0,
- "step": 3787
- },
- {
- "epoch": 2.878419452887538,
- "grad_norm": 1.7522467374801636,
- "learning_rate": 2.2705991545859953e-08,
- "loss": 0.45354849100112915,
- "mean_token_accuracy": 0.8899869918823242,
- "num_tokens": 33898735.0,
- "step": 3788
- },
- {
- "epoch": 2.879179331306991,
- "grad_norm": 2.4311375617980957,
- "learning_rate": 2.242522664626823e-08,
- "loss": 0.3417064845561981,
- "mean_token_accuracy": 0.8724955320358276,
- "num_tokens": 33906031.0,
- "step": 3789
- },
- {
- "epoch": 2.8799392097264436,
- "grad_norm": 2.44846510887146,
- "learning_rate": 2.2146200604024614e-08,
- "loss": 0.3186315596103668,
- "mean_token_accuracy": 0.8888083696365356,
- "num_tokens": 33911248.0,
- "step": 3790
- },
- {
- "epoch": 2.8806990881458967,
- "grad_norm": 1.4528448581695557,
- "learning_rate": 2.1868913614959963e-08,
- "loss": 0.34161821007728577,
- "mean_token_accuracy": 0.87728351354599,
- "num_tokens": 33923786.0,
- "step": 3791
- },
- {
- "epoch": 2.8814589665653494,
- "grad_norm": 1.8418529033660889,
- "learning_rate": 2.1593365873685544e-08,
- "loss": 0.2751237452030182,
- "mean_token_accuracy": 0.9060730338096619,
- "num_tokens": 33930983.0,
- "step": 3792
- },
- {
- "epoch": 2.8822188449848025,
- "grad_norm": 1.5349152088165283,
- "learning_rate": 2.131955757359111e-08,
- "loss": 0.31487759947776794,
- "mean_token_accuracy": 0.8839719891548157,
- "num_tokens": 33942600.0,
- "step": 3793
- },
- {
- "epoch": 2.882978723404255,
- "grad_norm": 2.317296266555786,
- "learning_rate": 2.1047488906845715e-08,
- "loss": 0.22481049597263336,
- "mean_token_accuracy": 0.9269076585769653,
- "num_tokens": 33947548.0,
- "step": 3794
- },
- {
- "epoch": 2.8837386018237083,
- "grad_norm": 1.9512174129486084,
- "learning_rate": 2.0777160064397727e-08,
- "loss": 0.33469653129577637,
- "mean_token_accuracy": 0.8800324201583862,
- "num_tokens": 33955699.0,
- "step": 3795
- },
- {
- "epoch": 2.884498480243161,
- "grad_norm": 1.3140486478805542,
- "learning_rate": 2.050857123597455e-08,
- "loss": 0.3801634609699249,
- "mean_token_accuracy": 0.8677546977996826,
- "num_tokens": 33972033.0,
- "step": 3796
- },
- {
- "epoch": 2.8852583586626137,
- "grad_norm": 2.4413559436798096,
- "learning_rate": 2.024172261008178e-08,
- "loss": 0.4444601535797119,
- "mean_token_accuracy": 0.8535408973693848,
- "num_tokens": 33978859.0,
- "step": 3797
- },
- {
- "epoch": 2.886018237082067,
- "grad_norm": 1.8970952033996582,
- "learning_rate": 1.997661437400461e-08,
- "loss": 0.29712194204330444,
- "mean_token_accuracy": 0.8958410024642944,
- "num_tokens": 33988416.0,
- "step": 3798
- },
- {
- "epoch": 2.88677811550152,
- "grad_norm": 1.4225033521652222,
- "learning_rate": 1.9713246713805588e-08,
- "loss": 0.2251742035150528,
- "mean_token_accuracy": 0.9059432744979858,
- "num_tokens": 33998579.0,
- "step": 3799
- },
- {
- "epoch": 2.8875379939209727,
- "grad_norm": 3.485994338989258,
- "learning_rate": 1.9451619814326307e-08,
- "loss": 0.2449614405632019,
- "mean_token_accuracy": 0.9136157035827637,
- "num_tokens": 34002108.0,
- "step": 3800
- },
- {
- "epoch": 2.8882978723404253,
- "grad_norm": 1.7383781671524048,
- "learning_rate": 1.91917338591871e-08,
- "loss": 0.3420751690864563,
- "mean_token_accuracy": 0.8810985088348389,
- "num_tokens": 34010102.0,
- "step": 3801
- },
- {
- "epoch": 2.8890577507598785,
- "grad_norm": 2.587632894515991,
- "learning_rate": 1.893358903078568e-08,
- "loss": 0.38646167516708374,
- "mean_token_accuracy": 0.8570578098297119,
- "num_tokens": 34016684.0,
- "step": 3802
- },
- {
- "epoch": 2.889817629179331,
- "grad_norm": 1.2580358982086182,
- "learning_rate": 1.867718551029768e-08,
- "loss": 0.23658394813537598,
- "mean_token_accuracy": 0.9092692136764526,
- "num_tokens": 34029808.0,
- "step": 3803
- },
- {
- "epoch": 2.8905775075987843,
- "grad_norm": 1.808404803276062,
- "learning_rate": 1.842252347767748e-08,
- "loss": 0.2760203778743744,
- "mean_token_accuracy": 0.8876132965087891,
- "num_tokens": 34038138.0,
- "step": 3804
- },
- {
- "epoch": 2.891337386018237,
- "grad_norm": 1.6140836477279663,
- "learning_rate": 1.8169603111656554e-08,
- "loss": 0.3449614346027374,
- "mean_token_accuracy": 0.8777539730072021,
- "num_tokens": 34048093.0,
- "step": 3805
- },
- {
- "epoch": 2.89209726443769,
- "grad_norm": 3.784487724304199,
- "learning_rate": 1.791842458974402e-08,
- "loss": 0.3181925415992737,
- "mean_token_accuracy": 0.8902693390846252,
- "num_tokens": 34051903.0,
- "step": 3806
- },
- {
- "epoch": 2.892857142857143,
- "grad_norm": 1.726521372795105,
- "learning_rate": 1.7668988088226922e-08,
- "loss": 0.3940914273262024,
- "mean_token_accuracy": 0.8877660036087036,
- "num_tokens": 34062433.0,
- "step": 3807
- },
- {
- "epoch": 2.8936170212765955,
- "grad_norm": 2.1862759590148926,
- "learning_rate": 1.7421293782168837e-08,
- "loss": 0.2806234061717987,
- "mean_token_accuracy": 0.9004480838775635,
- "num_tokens": 34068835.0,
- "step": 3808
- },
- {
- "epoch": 2.8943768996960486,
- "grad_norm": 1.8618063926696777,
- "learning_rate": 1.717534184541153e-08,
- "loss": 0.3391259014606476,
- "mean_token_accuracy": 0.8807502388954163,
- "num_tokens": 34078044.0,
- "step": 3809
- },
- {
- "epoch": 2.8951367781155017,
- "grad_norm": 2.19085431098938,
- "learning_rate": 1.6931132450573873e-08,
- "loss": 0.34228384494781494,
- "mean_token_accuracy": 0.8653440475463867,
- "num_tokens": 34084925.0,
- "step": 3810
- },
- {
- "epoch": 2.8958966565349544,
- "grad_norm": 2.0328660011291504,
- "learning_rate": 1.6688665769050704e-08,
- "loss": 0.3773893117904663,
- "mean_token_accuracy": 0.8646367788314819,
- "num_tokens": 34092740.0,
- "step": 3811
- },
- {
- "epoch": 2.896656534954407,
- "grad_norm": 2.489732265472412,
- "learning_rate": 1.644794197101507e-08,
- "loss": 0.2722119688987732,
- "mean_token_accuracy": 0.9241745471954346,
- "num_tokens": 34097475.0,
- "step": 3812
- },
- {
- "epoch": 2.8974164133738602,
- "grad_norm": 2.709529161453247,
- "learning_rate": 1.620896122541571e-08,
- "loss": 0.2608666718006134,
- "mean_token_accuracy": 0.9132722020149231,
- "num_tokens": 34101961.0,
- "step": 3813
- },
- {
- "epoch": 2.898176291793313,
- "grad_norm": 2.089813709259033,
- "learning_rate": 1.5971723699979015e-08,
- "loss": 0.3234292268753052,
- "mean_token_accuracy": 0.9032332897186279,
- "num_tokens": 34109427.0,
- "step": 3814
- },
- {
- "epoch": 2.898936170212766,
- "grad_norm": 1.3891119956970215,
- "learning_rate": 1.5736229561207072e-08,
- "loss": 0.2506135404109955,
- "mean_token_accuracy": 0.8997396230697632,
- "num_tokens": 34121770.0,
- "step": 3815
- },
- {
- "epoch": 2.8996960486322187,
- "grad_norm": 1.9386579990386963,
- "learning_rate": 1.5502478974378788e-08,
- "loss": 0.29841434955596924,
- "mean_token_accuracy": 0.8915755748748779,
- "num_tokens": 34130111.0,
- "step": 3816
- },
- {
- "epoch": 2.900455927051672,
- "grad_norm": 1.601960301399231,
- "learning_rate": 1.5270472103549317e-08,
- "loss": 0.34736987948417664,
- "mean_token_accuracy": 0.876467227935791,
- "num_tokens": 34142053.0,
- "step": 3817
- },
- {
- "epoch": 2.9012158054711246,
- "grad_norm": 2.42319393157959,
- "learning_rate": 1.5040209111550075e-08,
- "loss": 0.24774286150932312,
- "mean_token_accuracy": 0.9127346873283386,
- "num_tokens": 34146627.0,
- "step": 3818
- },
- {
- "epoch": 2.9019756838905773,
- "grad_norm": 2.711033582687378,
- "learning_rate": 1.4811690159988456e-08,
- "loss": 0.30365103483200073,
- "mean_token_accuracy": 0.8981214165687561,
- "num_tokens": 34151735.0,
- "step": 3819
- },
- {
- "epoch": 2.9027355623100304,
- "grad_norm": 3.105949640274048,
- "learning_rate": 1.4584915409248113e-08,
- "loss": 0.38369080424308777,
- "mean_token_accuracy": 0.8762385845184326,
- "num_tokens": 34156484.0,
- "step": 3820
- },
- {
- "epoch": 2.9034954407294835,
- "grad_norm": 1.9705839157104492,
- "learning_rate": 1.435988501848784e-08,
- "loss": 0.33529043197631836,
- "mean_token_accuracy": 0.8921652436256409,
- "num_tokens": 34164241.0,
- "step": 3821
- },
- {
- "epoch": 2.904255319148936,
- "grad_norm": 2.084878921508789,
- "learning_rate": 1.413659914564297e-08,
- "loss": 0.24922935664653778,
- "mean_token_accuracy": 0.9262560606002808,
- "num_tokens": 34169898.0,
- "step": 3822
- },
- {
- "epoch": 2.905015197568389,
- "grad_norm": 1.593758225440979,
- "learning_rate": 1.3915057947423705e-08,
- "loss": 0.3691917657852173,
- "mean_token_accuracy": 0.8785613775253296,
- "num_tokens": 34181419.0,
- "step": 3823
- },
- {
- "epoch": 2.905775075987842,
- "grad_norm": 1.772596001625061,
- "learning_rate": 1.3695261579316776e-08,
- "loss": 0.358150839805603,
- "mean_token_accuracy": 0.8747056722640991,
- "num_tokens": 34190872.0,
- "step": 3824
- },
- {
- "epoch": 2.9065349544072947,
- "grad_norm": 2.1670494079589844,
- "learning_rate": 1.3477210195583234e-08,
- "loss": 0.30586451292037964,
- "mean_token_accuracy": 0.8851495981216431,
- "num_tokens": 34197353.0,
- "step": 3825
- },
- {
- "epoch": 2.907294832826748,
- "grad_norm": 2.7168121337890625,
- "learning_rate": 1.3260903949260107e-08,
- "loss": 0.2924152612686157,
- "mean_token_accuracy": 0.8947597146034241,
- "num_tokens": 34201889.0,
- "step": 3826
- },
- {
- "epoch": 2.9080547112462005,
- "grad_norm": 1.576528787612915,
- "learning_rate": 1.3046342992159567e-08,
- "loss": 0.33903738856315613,
- "mean_token_accuracy": 0.8710857033729553,
- "num_tokens": 34212640.0,
- "step": 3827
- },
- {
- "epoch": 2.9088145896656536,
- "grad_norm": 1.3831605911254883,
- "learning_rate": 1.2833527474868657e-08,
- "loss": 0.2891062796115875,
- "mean_token_accuracy": 0.8909540176391602,
- "num_tokens": 34223917.0,
- "step": 3828
- },
- {
- "epoch": 2.9095744680851063,
- "grad_norm": 2.075225353240967,
- "learning_rate": 1.2622457546749567e-08,
- "loss": 0.14886733889579773,
- "mean_token_accuracy": 0.9509548544883728,
- "num_tokens": 34228609.0,
- "step": 3829
- },
- {
- "epoch": 2.910334346504559,
- "grad_norm": 2.658463478088379,
- "learning_rate": 1.2413133355939356e-08,
- "loss": 0.13472142815589905,
- "mean_token_accuracy": 0.957228422164917,
- "num_tokens": 34232011.0,
- "step": 3830
- },
- {
- "epoch": 2.911094224924012,
- "grad_norm": 1.8684933185577393,
- "learning_rate": 1.2205555049349394e-08,
- "loss": 0.13954663276672363,
- "mean_token_accuracy": 0.953221321105957,
- "num_tokens": 34237643.0,
- "step": 3831
- },
- {
- "epoch": 2.9118541033434653,
- "grad_norm": 1.799784779548645,
- "learning_rate": 1.1999722772666478e-08,
- "loss": 0.28668212890625,
- "mean_token_accuracy": 0.9035641551017761,
- "num_tokens": 34246593.0,
- "step": 3832
- },
- {
- "epoch": 2.912613981762918,
- "grad_norm": 1.3970232009887695,
- "learning_rate": 1.1795636670351718e-08,
- "loss": 0.2589891254901886,
- "mean_token_accuracy": 0.9162927865982056,
- "num_tokens": 34257535.0,
- "step": 3833
- },
- {
- "epoch": 2.9133738601823707,
- "grad_norm": 2.5260443687438965,
- "learning_rate": 1.1593296885640259e-08,
- "loss": 0.452729195356369,
- "mean_token_accuracy": 0.8569157123565674,
- "num_tokens": 34263834.0,
- "step": 3834
- },
- {
- "epoch": 2.914133738601824,
- "grad_norm": 1.879526972770691,
- "learning_rate": 1.1392703560542118e-08,
- "loss": 0.3608126640319824,
- "mean_token_accuracy": 0.8750635385513306,
- "num_tokens": 34272156.0,
- "step": 3835
- },
- {
- "epoch": 2.9148936170212765,
- "grad_norm": 1.9857182502746582,
- "learning_rate": 1.1193856835841344e-08,
- "loss": 0.28058698773384094,
- "mean_token_accuracy": 0.8984638452529907,
- "num_tokens": 34280438.0,
- "step": 3836
- },
- {
- "epoch": 2.9156534954407296,
- "grad_norm": 1.9187198877334595,
- "learning_rate": 1.0996756851096579e-08,
- "loss": 0.3203415870666504,
- "mean_token_accuracy": 0.8920673131942749,
- "num_tokens": 34288330.0,
- "step": 3837
- },
- {
- "epoch": 2.9164133738601823,
- "grad_norm": 1.6627569198608398,
- "learning_rate": 1.0801403744639672e-08,
- "loss": 0.30393654108047485,
- "mean_token_accuracy": 0.8877602815628052,
- "num_tokens": 34297701.0,
- "step": 3838
- },
- {
- "epoch": 2.9171732522796354,
- "grad_norm": 1.4527947902679443,
- "learning_rate": 1.0607797653577333e-08,
- "loss": 0.33950865268707275,
- "mean_token_accuracy": 0.8850067853927612,
- "num_tokens": 34311995.0,
- "step": 3839
- },
- {
- "epoch": 2.917933130699088,
- "grad_norm": 1.694217324256897,
- "learning_rate": 1.0415938713789487e-08,
- "loss": 0.33595266938209534,
- "mean_token_accuracy": 0.878333568572998,
- "num_tokens": 34322095.0,
- "step": 3840
- },
- {
- "epoch": 2.918693009118541,
- "grad_norm": 2.3357045650482178,
- "learning_rate": 1.0225827059930082e-08,
- "loss": 0.2966959476470947,
- "mean_token_accuracy": 0.893630862236023,
- "num_tokens": 34328400.0,
- "step": 3841
- },
- {
- "epoch": 2.919452887537994,
- "grad_norm": 1.9848041534423828,
- "learning_rate": 1.0037462825427113e-08,
- "loss": 0.4187622368335724,
- "mean_token_accuracy": 0.8641717433929443,
- "num_tokens": 34337203.0,
- "step": 3842
- },
- {
- "epoch": 2.920212765957447,
- "grad_norm": 1.7696800231933594,
- "learning_rate": 9.850846142481773e-09,
- "loss": 0.34298282861709595,
- "mean_token_accuracy": 0.8812298774719238,
- "num_tokens": 34346584.0,
- "step": 3843
- },
- {
- "epoch": 2.9209726443768997,
- "grad_norm": 2.6058225631713867,
- "learning_rate": 9.665977142068738e-09,
- "loss": 0.2776247262954712,
- "mean_token_accuracy": 0.908215343952179,
- "num_tokens": 34351472.0,
- "step": 3844
- },
- {
- "epoch": 2.9217325227963524,
- "grad_norm": 2.4990663528442383,
- "learning_rate": 9.482855953936443e-09,
- "loss": 0.2577187418937683,
- "mean_token_accuracy": 0.9113357663154602,
- "num_tokens": 34357101.0,
- "step": 3845
- },
- {
- "epoch": 2.9224924012158056,
- "grad_norm": 3.2842514514923096,
- "learning_rate": 9.30148270660569e-09,
- "loss": 0.23392081260681152,
- "mean_token_accuracy": 0.9370708465576172,
- "num_tokens": 34360674.0,
- "step": 3846
- },
- {
- "epoch": 2.9232522796352582,
- "grad_norm": 2.2124083042144775,
- "learning_rate": 9.121857527372157e-09,
- "loss": 0.3026091456413269,
- "mean_token_accuracy": 0.886944055557251,
- "num_tokens": 34367471.0,
- "step": 3847
- },
- {
- "epoch": 2.9240121580547114,
- "grad_norm": 1.6130470037460327,
- "learning_rate": 8.943980542302777e-09,
- "loss": 0.33204811811447144,
- "mean_token_accuracy": 0.8805426359176636,
- "num_tokens": 34377461.0,
- "step": 3848
- },
- {
- "epoch": 2.924772036474164,
- "grad_norm": 1.6536617279052734,
- "learning_rate": 8.767851876239075e-09,
- "loss": 0.33671748638153076,
- "mean_token_accuracy": 0.8811848163604736,
- "num_tokens": 34386732.0,
- "step": 3849
- },
- {
- "epoch": 2.925531914893617,
- "grad_norm": 1.9558135271072388,
- "learning_rate": 8.59347165279495e-09,
- "loss": 0.3325084447860718,
- "mean_token_accuracy": 0.8823798894882202,
- "num_tokens": 34395705.0,
- "step": 3850
- },
- {
- "epoch": 2.92629179331307,
- "grad_norm": 2.2350621223449707,
- "learning_rate": 8.420839994356666e-09,
- "loss": 0.28383463621139526,
- "mean_token_accuracy": 0.8957310914993286,
- "num_tokens": 34402470.0,
- "step": 3851
- },
- {
- "epoch": 2.9270516717325226,
- "grad_norm": 1.9859482049942017,
- "learning_rate": 8.249957022084254e-09,
- "loss": 0.2720850110054016,
- "mean_token_accuracy": 0.9078607559204102,
- "num_tokens": 34410536.0,
- "step": 3852
- },
- {
- "epoch": 2.9278115501519757,
- "grad_norm": 1.3174400329589844,
- "learning_rate": 8.080822855909832e-09,
- "loss": 0.330660879611969,
- "mean_token_accuracy": 0.8777779936790466,
- "num_tokens": 34425639.0,
- "step": 3853
- },
- {
- "epoch": 2.928571428571429,
- "grad_norm": 1.0108131170272827,
- "learning_rate": 7.913437614538166e-09,
- "loss": 0.3833892345428467,
- "mean_token_accuracy": 0.8571817874908447,
- "num_tokens": 34451572.0,
- "step": 3854
- },
- {
- "epoch": 2.9293313069908815,
- "grad_norm": 1.347409725189209,
- "learning_rate": 7.747801415446677e-09,
- "loss": 0.3100135028362274,
- "mean_token_accuracy": 0.9087803363800049,
- "num_tokens": 34465396.0,
- "step": 3855
- },
- {
- "epoch": 2.930091185410334,
- "grad_norm": 1.4636729955673218,
- "learning_rate": 7.583914374885426e-09,
- "loss": 0.32699912786483765,
- "mean_token_accuracy": 0.87745201587677,
- "num_tokens": 34477359.0,
- "step": 3856
- },
- {
- "epoch": 2.9308510638297873,
- "grad_norm": 2.9707272052764893,
- "learning_rate": 7.4217766078760185e-09,
- "loss": 0.18189990520477295,
- "mean_token_accuracy": 0.9301153421401978,
- "num_tokens": 34481356.0,
- "step": 3857
- },
- {
- "epoch": 2.93161094224924,
- "grad_norm": 2.3689684867858887,
- "learning_rate": 7.261388228213817e-09,
- "loss": 0.3382490873336792,
- "mean_token_accuracy": 0.9132488965988159,
- "num_tokens": 34487525.0,
- "step": 3858
- },
- {
- "epoch": 2.932370820668693,
- "grad_norm": 2.3896703720092773,
- "learning_rate": 7.102749348465166e-09,
- "loss": 0.3891000747680664,
- "mean_token_accuracy": 0.8888499736785889,
- "num_tokens": 34493053.0,
- "step": 3859
- },
- {
- "epoch": 2.933130699088146,
- "grad_norm": 3.2713520526885986,
- "learning_rate": 6.945860079969058e-09,
- "loss": 0.26146358251571655,
- "mean_token_accuracy": 0.9090266227722168,
- "num_tokens": 34496938.0,
- "step": 3860
- },
- {
- "epoch": 2.933890577507599,
- "grad_norm": 2.9600296020507812,
- "learning_rate": 6.790720532836026e-09,
- "loss": 0.3506978750228882,
- "mean_token_accuracy": 0.8768079280853271,
- "num_tokens": 34501615.0,
- "step": 3861
- },
- {
- "epoch": 2.9346504559270516,
- "grad_norm": 2.640066146850586,
- "learning_rate": 6.6373308159495275e-09,
- "loss": 0.39720577001571655,
- "mean_token_accuracy": 0.8565619587898254,
- "num_tokens": 34507438.0,
- "step": 3862
- },
- {
- "epoch": 2.9354103343465043,
- "grad_norm": 1.9988795518875122,
- "learning_rate": 6.485691036964003e-09,
- "loss": 0.18736782670021057,
- "mean_token_accuracy": 0.92908775806427,
- "num_tokens": 34513939.0,
- "step": 3863
- },
- {
- "epoch": 2.9361702127659575,
- "grad_norm": 1.0155757665634155,
- "learning_rate": 6.3358013023062656e-09,
- "loss": 0.2876095771789551,
- "mean_token_accuracy": 0.8892107009887695,
- "num_tokens": 34537761.0,
- "step": 3864
- },
- {
- "epoch": 2.9369300911854106,
- "grad_norm": 1.1695115566253662,
- "learning_rate": 6.1876617171743865e-09,
- "loss": 0.2740359306335449,
- "mean_token_accuracy": 0.9104942083358765,
- "num_tokens": 34557302.0,
- "step": 3865
- },
- {
- "epoch": 2.9376899696048633,
- "grad_norm": 2.36651349067688,
- "learning_rate": 6.04127238553881e-09,
- "loss": 0.2734505534172058,
- "mean_token_accuracy": 0.897555947303772,
- "num_tokens": 34563225.0,
- "step": 3866
- },
- {
- "epoch": 2.938449848024316,
- "grad_norm": 3.6499621868133545,
- "learning_rate": 5.896633410141239e-09,
- "loss": 0.30723029375076294,
- "mean_token_accuracy": 0.9058237075805664,
- "num_tokens": 34567037.0,
- "step": 3867
- },
- {
- "epoch": 2.939209726443769,
- "grad_norm": 2.1518232822418213,
- "learning_rate": 5.753744892494639e-09,
- "loss": 0.46499863266944885,
- "mean_token_accuracy": 0.8726839423179626,
- "num_tokens": 34576470.0,
- "step": 3868
- },
- {
- "epoch": 2.939969604863222,
- "grad_norm": 2.8443753719329834,
- "learning_rate": 5.612606932883513e-09,
- "loss": 0.33730241656303406,
- "mean_token_accuracy": 0.8874512910842896,
- "num_tokens": 34582531.0,
- "step": 3869
- },
- {
- "epoch": 2.940729483282675,
- "grad_norm": 1.732109546661377,
- "learning_rate": 5.473219630364457e-09,
- "loss": 0.28559115529060364,
- "mean_token_accuracy": 0.894468367099762,
- "num_tokens": 34591376.0,
- "step": 3870
- },
- {
- "epoch": 2.9414893617021276,
- "grad_norm": 2.533249855041504,
- "learning_rate": 5.335583082764495e-09,
- "loss": 0.23319819569587708,
- "mean_token_accuracy": 0.9174623489379883,
- "num_tokens": 34596426.0,
- "step": 3871
- },
- {
- "epoch": 2.9422492401215807,
- "grad_norm": 2.3505852222442627,
- "learning_rate": 5.19969738668219e-09,
- "loss": 0.3506584167480469,
- "mean_token_accuracy": 0.8692278861999512,
- "num_tokens": 34603058.0,
- "step": 3872
- },
- {
- "epoch": 2.9430091185410334,
- "grad_norm": 1.9322142601013184,
- "learning_rate": 5.065562637487365e-09,
- "loss": 0.2503264546394348,
- "mean_token_accuracy": 0.9093045592308044,
- "num_tokens": 34610330.0,
- "step": 3873
- },
- {
- "epoch": 2.943768996960486,
- "grad_norm": 2.398416519165039,
- "learning_rate": 4.933178929321103e-09,
- "loss": 0.3825327157974243,
- "mean_token_accuracy": 0.8750842809677124,
- "num_tokens": 34617198.0,
- "step": 3874
- },
- {
- "epoch": 2.9445288753799392,
- "grad_norm": 2.3529703617095947,
- "learning_rate": 4.802546355095472e-09,
- "loss": 0.3877553343772888,
- "mean_token_accuracy": 0.8654624223709106,
- "num_tokens": 34624459.0,
- "step": 3875
- },
- {
- "epoch": 2.9452887537993924,
- "grad_norm": 1.4786031246185303,
- "learning_rate": 4.673665006492967e-09,
- "loss": 0.42244911193847656,
- "mean_token_accuracy": 0.8566311597824097,
- "num_tokens": 34639359.0,
- "step": 3876
- },
- {
- "epoch": 2.946048632218845,
- "grad_norm": 2.091810703277588,
- "learning_rate": 4.546534973968175e-09,
- "loss": 0.27733317017555237,
- "mean_token_accuracy": 0.9033011794090271,
- "num_tokens": 34646046.0,
- "step": 3877
- },
- {
- "epoch": 2.9468085106382977,
- "grad_norm": 1.900180459022522,
- "learning_rate": 4.4211563467452814e-09,
- "loss": 0.44815146923065186,
- "mean_token_accuracy": 0.8379489183425903,
- "num_tokens": 34656329.0,
- "step": 3878
- },
- {
- "epoch": 2.947568389057751,
- "grad_norm": 1.3837320804595947,
- "learning_rate": 4.297529212820006e-09,
- "loss": 0.33357739448547363,
- "mean_token_accuracy": 0.8788042068481445,
- "num_tokens": 34671303.0,
- "step": 3879
- },
- {
- "epoch": 2.9483282674772036,
- "grad_norm": 1.3475737571716309,
- "learning_rate": 4.175653658958501e-09,
- "loss": 0.30217933654785156,
- "mean_token_accuracy": 0.8932538032531738,
- "num_tokens": 34685080.0,
- "step": 3880
- },
- {
- "epoch": 2.9490881458966567,
- "grad_norm": 1.6425048112869263,
- "learning_rate": 4.055529770698175e-09,
- "loss": 0.4368054270744324,
- "mean_token_accuracy": 0.8392083644866943,
- "num_tokens": 34695104.0,
- "step": 3881
- },
- {
- "epoch": 2.9498480243161094,
- "grad_norm": 1.729368805885315,
- "learning_rate": 3.937157632346311e-09,
- "loss": 0.28259193897247314,
- "mean_token_accuracy": 0.9338148236274719,
- "num_tokens": 34706664.0,
- "step": 3882
- },
- {
- "epoch": 2.9506079027355625,
- "grad_norm": 1.8707934617996216,
- "learning_rate": 3.820537326980622e-09,
- "loss": 0.40049535036087036,
- "mean_token_accuracy": 0.8617393374443054,
- "num_tokens": 34715401.0,
- "step": 3883
- },
- {
- "epoch": 2.951367781155015,
- "grad_norm": 1.7935676574707031,
- "learning_rate": 3.7056689364503574e-09,
- "loss": 0.3386167585849762,
- "mean_token_accuracy": 0.8947521448135376,
- "num_tokens": 34724093.0,
- "step": 3884
- },
- {
- "epoch": 2.952127659574468,
- "grad_norm": 2.6346704959869385,
- "learning_rate": 3.592552541374361e-09,
- "loss": 0.1505163311958313,
- "mean_token_accuracy": 0.9515544176101685,
- "num_tokens": 34727908.0,
- "step": 3885
- },
- {
- "epoch": 2.952887537993921,
- "grad_norm": 2.2813618183135986,
- "learning_rate": 3.481188221142184e-09,
- "loss": 0.3014339506626129,
- "mean_token_accuracy": 0.8985507488250732,
- "num_tokens": 34734037.0,
- "step": 3886
- },
- {
- "epoch": 2.9536474164133737,
- "grad_norm": 2.482675313949585,
- "learning_rate": 3.37157605391325e-09,
- "loss": 0.3489428758621216,
- "mean_token_accuracy": 0.8771353960037231,
- "num_tokens": 34739874.0,
- "step": 3887
- },
- {
- "epoch": 2.954407294832827,
- "grad_norm": 2.3721418380737305,
- "learning_rate": 3.2637161166179654e-09,
- "loss": 0.3582353889942169,
- "mean_token_accuracy": 0.861088752746582,
- "num_tokens": 34747007.0,
- "step": 3888
- },
- {
- "epoch": 2.9551671732522795,
- "grad_norm": 2.1871862411499023,
- "learning_rate": 3.1576084849563315e-09,
- "loss": 0.30689212679862976,
- "mean_token_accuracy": 0.8910759687423706,
- "num_tokens": 34753361.0,
- "step": 3889
- },
- {
- "epoch": 2.9559270516717326,
- "grad_norm": 2.1797537803649902,
- "learning_rate": 3.0532532333987785e-09,
- "loss": 0.3343493938446045,
- "mean_token_accuracy": 0.880067765712738,
- "num_tokens": 34760824.0,
- "step": 3890
- },
- {
- "epoch": 2.9566869300911853,
- "grad_norm": 2.6021335124969482,
- "learning_rate": 2.9506504351861644e-09,
- "loss": 0.34991219639778137,
- "mean_token_accuracy": 0.8728436231613159,
- "num_tokens": 34766783.0,
- "step": 3891
- },
- {
- "epoch": 2.9574468085106385,
- "grad_norm": 2.202974319458008,
- "learning_rate": 2.849800162328664e-09,
- "loss": 0.3138400912284851,
- "mean_token_accuracy": 0.8995538949966431,
- "num_tokens": 34773174.0,
- "step": 3892
- },
- {
- "epoch": 2.958206686930091,
- "grad_norm": 1.687474250793457,
- "learning_rate": 2.7507024856071595e-09,
- "loss": 0.40479594469070435,
- "mean_token_accuracy": 0.8831138610839844,
- "num_tokens": 34785142.0,
- "step": 3893
- },
- {
- "epoch": 2.9589665653495443,
- "grad_norm": 1.960195779800415,
- "learning_rate": 2.6533574745718493e-09,
- "loss": 0.3259456157684326,
- "mean_token_accuracy": 0.8871631622314453,
- "num_tokens": 34793001.0,
- "step": 3894
- },
- {
- "epoch": 2.959726443768997,
- "grad_norm": 2.89237904548645,
- "learning_rate": 2.557765197543638e-09,
- "loss": 0.32338041067123413,
- "mean_token_accuracy": 0.9038220047950745,
- "num_tokens": 34797424.0,
- "step": 3895
- },
- {
- "epoch": 2.9604863221884496,
- "grad_norm": 2.655599594116211,
- "learning_rate": 2.4639257216127476e-09,
- "loss": 0.2710941731929779,
- "mean_token_accuracy": 0.8990030884742737,
- "num_tokens": 34802629.0,
- "step": 3896
- },
- {
- "epoch": 2.961246200607903,
- "grad_norm": 2.8153562545776367,
- "learning_rate": 2.3718391126392735e-09,
- "loss": 0.4303235411643982,
- "mean_token_accuracy": 0.8491297960281372,
- "num_tokens": 34807870.0,
- "step": 3897
- },
- {
- "epoch": 2.9620060790273555,
- "grad_norm": 1.4196341037750244,
- "learning_rate": 2.2815054352531842e-09,
- "loss": 0.38827845454216003,
- "mean_token_accuracy": 0.8595222234725952,
- "num_tokens": 34823597.0,
- "step": 3898
- },
- {
- "epoch": 2.9627659574468086,
- "grad_norm": 2.9653196334838867,
- "learning_rate": 2.192924752854042e-09,
- "loss": 0.2555926442146301,
- "mean_token_accuracy": 0.9074755907058716,
- "num_tokens": 34827781.0,
- "step": 3899
- },
- {
- "epoch": 2.9635258358662613,
- "grad_norm": 1.4998196363449097,
- "learning_rate": 2.106097127611284e-09,
- "loss": 0.36219048500061035,
- "mean_token_accuracy": 0.885735273361206,
- "num_tokens": 34839234.0,
- "step": 3900
- },
- {
- "epoch": 2.9642857142857144,
- "grad_norm": 1.718245029449463,
- "learning_rate": 2.0210226204639414e-09,
- "loss": 0.26162803173065186,
- "mean_token_accuracy": 0.8963354825973511,
- "num_tokens": 34848059.0,
- "step": 3901
- },
- {
- "epoch": 2.965045592705167,
- "grad_norm": 2.0226235389709473,
- "learning_rate": 1.9377012911203642e-09,
- "loss": 0.3657612204551697,
- "mean_token_accuracy": 0.8982006311416626,
- "num_tokens": 34854617.0,
- "step": 3902
- },
- {
- "epoch": 2.96580547112462,
- "grad_norm": 2.6306872367858887,
- "learning_rate": 1.8561331980587738e-09,
- "loss": 0.19888967275619507,
- "mean_token_accuracy": 0.9320937395095825,
- "num_tokens": 34859412.0,
- "step": 3903
- },
- {
- "epoch": 2.966565349544073,
- "grad_norm": 1.2558201551437378,
- "learning_rate": 1.7763183985269882e-09,
- "loss": 0.39052504301071167,
- "mean_token_accuracy": 0.8551221489906311,
- "num_tokens": 34875603.0,
- "step": 3904
- },
- {
- "epoch": 2.967325227963526,
- "grad_norm": 1.7441751956939697,
- "learning_rate": 1.6982569485415879e-09,
- "loss": 0.3208625912666321,
- "mean_token_accuracy": 0.8973188996315002,
- "num_tokens": 34884309.0,
- "step": 3905
- },
- {
- "epoch": 2.9680851063829787,
- "grad_norm": 1.6294625997543335,
- "learning_rate": 1.6219489028895808e-09,
- "loss": 0.2818947732448578,
- "mean_token_accuracy": 0.9170717000961304,
- "num_tokens": 34894045.0,
- "step": 3906
- },
- {
- "epoch": 2.9688449848024314,
- "grad_norm": 1.9129183292388916,
- "learning_rate": 1.5473943151270155e-09,
- "loss": 0.3932931423187256,
- "mean_token_accuracy": 0.8591724038124084,
- "num_tokens": 34903057.0,
- "step": 3907
- },
- {
- "epoch": 2.9696048632218845,
- "grad_norm": 2.125586748123169,
- "learning_rate": 1.474593237578703e-09,
- "loss": 0.4141325056552887,
- "mean_token_accuracy": 0.855269193649292,
- "num_tokens": 34911138.0,
- "step": 3908
- },
- {
- "epoch": 2.9703647416413372,
- "grad_norm": 2.039323329925537,
- "learning_rate": 1.4035457213393278e-09,
- "loss": 0.30452996492385864,
- "mean_token_accuracy": 0.8897982835769653,
- "num_tokens": 34918685.0,
- "step": 3909
- },
- {
- "epoch": 2.9711246200607904,
- "grad_norm": 1.213478446006775,
- "learning_rate": 1.3342518162728913e-09,
- "loss": 0.3703617751598358,
- "mean_token_accuracy": 0.8672454357147217,
- "num_tokens": 34936658.0,
- "step": 3910
- },
- {
- "epoch": 2.971884498480243,
- "grad_norm": 1.2648811340332031,
- "learning_rate": 1.2667115710127131e-09,
- "loss": 0.4004117250442505,
- "mean_token_accuracy": 0.8572319149971008,
- "num_tokens": 34955480.0,
- "step": 3911
- },
- {
- "epoch": 2.972644376899696,
- "grad_norm": 2.34121036529541,
- "learning_rate": 1.2009250329608757e-09,
- "loss": 0.12352144718170166,
- "mean_token_accuracy": 0.9538272619247437,
- "num_tokens": 34959942.0,
- "step": 3912
- },
- {
- "epoch": 2.973404255319149,
- "grad_norm": 1.5843939781188965,
- "learning_rate": 1.1368922482887789e-09,
- "loss": 0.27862548828125,
- "mean_token_accuracy": 0.8930153846740723,
- "num_tokens": 34969425.0,
- "step": 3913
- },
- {
- "epoch": 2.9741641337386016,
- "grad_norm": 1.2919771671295166,
- "learning_rate": 1.0746132619374184e-09,
- "loss": 0.38437312841415405,
- "mean_token_accuracy": 0.8620239496231079,
- "num_tokens": 34987289.0,
- "step": 3914
- },
- {
- "epoch": 2.9749240121580547,
- "grad_norm": 2.299374580383301,
- "learning_rate": 1.0140881176165517e-09,
- "loss": 0.3482919931411743,
- "mean_token_accuracy": 0.8766785860061646,
- "num_tokens": 34993701.0,
- "step": 3915
- },
- {
- "epoch": 2.975683890577508,
- "grad_norm": 2.1415762901306152,
- "learning_rate": 9.553168578049776e-10,
- "loss": 0.3619397282600403,
- "mean_token_accuracy": 0.8685888051986694,
- "num_tokens": 35000430.0,
- "step": 3916
- },
- {
- "epoch": 2.9764437689969605,
- "grad_norm": 1.1967521905899048,
- "learning_rate": 8.982995237505343e-10,
- "loss": 0.289741188287735,
- "mean_token_accuracy": 0.9111574292182922,
- "num_tokens": 35015151.0,
- "step": 3917
- },
- {
- "epoch": 2.977203647416413,
- "grad_norm": 2.4301388263702393,
- "learning_rate": 8.430361554701005e-10,
- "loss": 0.3439575433731079,
- "mean_token_accuracy": 0.8783204555511475,
- "num_tokens": 35020729.0,
- "step": 3918
- },
- {
- "epoch": 2.9779635258358663,
- "grad_norm": 1.7229973077774048,
- "learning_rate": 7.895267917501503e-10,
- "loss": 0.379913330078125,
- "mean_token_accuracy": 0.8735131025314331,
- "num_tokens": 35031484.0,
- "step": 3919
- },
- {
- "epoch": 2.978723404255319,
- "grad_norm": 1.468673825263977,
- "learning_rate": 7.377714701450877e-10,
- "loss": 0.369578093290329,
- "mean_token_accuracy": 0.8639857172966003,
- "num_tokens": 35044582.0,
- "step": 3920
- },
- {
- "epoch": 2.979483282674772,
- "grad_norm": 1.45562744140625,
- "learning_rate": 6.877702269786346e-10,
- "loss": 0.33700788021087646,
- "mean_token_accuracy": 0.8805566430091858,
- "num_tokens": 35058539.0,
- "step": 3921
- },
- {
- "epoch": 2.980243161094225,
- "grad_norm": 1.483021855354309,
- "learning_rate": 6.395230973443856e-10,
- "loss": 0.4657078981399536,
- "mean_token_accuracy": 0.8335970640182495,
- "num_tokens": 35072770.0,
- "step": 3922
- },
- {
- "epoch": 2.981003039513678,
- "grad_norm": 2.2210497856140137,
- "learning_rate": 5.930301151033102e-10,
- "loss": 0.3754214644432068,
- "mean_token_accuracy": 0.8667312264442444,
- "num_tokens": 35079930.0,
- "step": 3923
- },
- {
- "epoch": 2.9817629179331306,
- "grad_norm": 1.8546303510665894,
- "learning_rate": 5.48291312886251e-10,
- "loss": 0.27907687425613403,
- "mean_token_accuracy": 0.9037660360336304,
- "num_tokens": 35089005.0,
- "step": 3924
- },
- {
- "epoch": 2.9825227963525833,
- "grad_norm": 2.201045513153076,
- "learning_rate": 5.053067220925356e-10,
- "loss": 0.27560052275657654,
- "mean_token_accuracy": 0.9001410603523254,
- "num_tokens": 35095726.0,
- "step": 3925
- },
- {
- "epoch": 2.9832826747720365,
- "grad_norm": 1.4042561054229736,
- "learning_rate": 4.640763728908093e-10,
- "loss": 0.33435091376304626,
- "mean_token_accuracy": 0.9042688608169556,
- "num_tokens": 35108469.0,
- "step": 3926
- },
- {
- "epoch": 2.9840425531914896,
- "grad_norm": 1.213336706161499,
- "learning_rate": 4.246002942173699e-10,
- "loss": 0.28249555826187134,
- "mean_token_accuracy": 0.8767675161361694,
- "num_tokens": 35124765.0,
- "step": 3927
- },
- {
- "epoch": 2.9848024316109423,
- "grad_norm": 1.9213181734085083,
- "learning_rate": 3.868785137786657e-10,
- "loss": 0.22949065268039703,
- "mean_token_accuracy": 0.9307032823562622,
- "num_tokens": 35131459.0,
- "step": 3928
- },
- {
- "epoch": 2.985562310030395,
- "grad_norm": 1.7959866523742676,
- "learning_rate": 3.509110580490749e-10,
- "loss": 0.2500322461128235,
- "mean_token_accuracy": 0.9240894913673401,
- "num_tokens": 35138772.0,
- "step": 3929
- },
- {
- "epoch": 2.986322188449848,
- "grad_norm": 1.6845020055770874,
- "learning_rate": 3.166979522717384e-10,
- "loss": 0.3233460485935211,
- "mean_token_accuracy": 0.8901629447937012,
- "num_tokens": 35148787.0,
- "step": 3930
- },
- {
- "epoch": 2.987082066869301,
- "grad_norm": 1.60831618309021,
- "learning_rate": 2.842392204591149e-10,
- "loss": 0.28861671686172485,
- "mean_token_accuracy": 0.8791468143463135,
- "num_tokens": 35158830.0,
- "step": 3931
- },
- {
- "epoch": 2.987841945288754,
- "grad_norm": 2.2622485160827637,
- "learning_rate": 2.5353488539187066e-10,
- "loss": 0.35594597458839417,
- "mean_token_accuracy": 0.8696492910385132,
- "num_tokens": 35165616.0,
- "step": 3932
- },
- {
- "epoch": 2.9886018237082066,
- "grad_norm": 1.8257495164871216,
- "learning_rate": 2.24584968619157e-10,
- "loss": 0.35592517256736755,
- "mean_token_accuracy": 0.8911738395690918,
- "num_tokens": 35174892.0,
- "step": 3933
- },
- {
- "epoch": 2.9893617021276597,
- "grad_norm": 1.8350274562835693,
- "learning_rate": 1.9738949045972068e-10,
- "loss": 0.1599535346031189,
- "mean_token_accuracy": 0.9352525472640991,
- "num_tokens": 35181249.0,
- "step": 3934
- },
- {
- "epoch": 2.9901215805471124,
- "grad_norm": 1.3198978900909424,
- "learning_rate": 1.7194846999996073e-10,
- "loss": 0.24172186851501465,
- "mean_token_accuracy": 0.9095510840415955,
- "num_tokens": 35192328.0,
- "step": 3935
- },
- {
- "epoch": 2.990881458966565,
- "grad_norm": 1.6335922479629517,
- "learning_rate": 1.4826192509559412e-10,
- "loss": 0.4396868348121643,
- "mean_token_accuracy": 0.8414689898490906,
- "num_tokens": 35204736.0,
- "step": 3936
- },
- {
- "epoch": 2.9916413373860182,
- "grad_norm": 2.0894503593444824,
- "learning_rate": 1.2632987237054527e-10,
- "loss": 0.2892245948314667,
- "mean_token_accuracy": 0.9252790212631226,
- "num_tokens": 35211186.0,
- "step": 3937
- },
- {
- "epoch": 2.9924012158054714,
- "grad_norm": 2.221811294555664,
- "learning_rate": 1.061523272177789e-10,
- "loss": 0.40185099840164185,
- "mean_token_accuracy": 0.8510832190513611,
- "num_tokens": 35220326.0,
- "step": 3938
- },
- {
- "epoch": 2.993161094224924,
- "grad_norm": 1.7605009078979492,
- "learning_rate": 8.772930379846723e-11,
- "loss": 0.38544684648513794,
- "mean_token_accuracy": 0.8694577217102051,
- "num_tokens": 35229889.0,
- "step": 3939
- },
- {
- "epoch": 2.9939209726443767,
- "grad_norm": 2.6683199405670166,
- "learning_rate": 7.106081504254514e-11,
- "loss": 0.16490477323532104,
- "mean_token_accuracy": 0.9414010047912598,
- "num_tokens": 35233835.0,
- "step": 3940
- },
- {
- "epoch": 2.99468085106383,
- "grad_norm": 2.2280800342559814,
- "learning_rate": 5.6146872648987774e-11,
- "loss": 0.41871631145477295,
- "mean_token_accuracy": 0.8475867509841919,
- "num_tokens": 35241042.0,
- "step": 3941
- },
- {
- "epoch": 2.9954407294832825,
- "grad_norm": 2.169602870941162,
- "learning_rate": 4.298748708470024e-11,
- "loss": 0.3991228938102722,
- "mean_token_accuracy": 0.8692910671234131,
- "num_tokens": 35248427.0,
- "step": 3942
- },
- {
- "epoch": 2.9962006079027357,
- "grad_norm": 2.665966033935547,
- "learning_rate": 3.158266758562789e-11,
- "loss": 0.25984981656074524,
- "mean_token_accuracy": 0.9204732179641724,
- "num_tokens": 35253086.0,
- "step": 3943
- },
- {
- "epoch": 2.9969604863221884,
- "grad_norm": 1.8087493181228638,
- "learning_rate": 2.1932422155923618e-11,
- "loss": 0.41246354579925537,
- "mean_token_accuracy": 0.8548201322555542,
- "num_tokens": 35263360.0,
- "step": 3944
- },
- {
- "epoch": 2.9977203647416415,
- "grad_norm": 2.6384191513061523,
- "learning_rate": 1.4036757568502978e-11,
- "loss": 0.32927870750427246,
- "mean_token_accuracy": 0.8796735405921936,
- "num_tokens": 35269214.0,
- "step": 3945
- },
- {
- "epoch": 2.998480243161094,
- "grad_norm": 1.2011899948120117,
- "learning_rate": 7.89567936476665e-12,
- "loss": 0.2989211678504944,
- "mean_token_accuracy": 0.8949509859085083,
- "num_tokens": 35283851.0,
- "step": 3946
- },
- {
- "epoch": 2.999240121580547,
- "grad_norm": 1.6725144386291504,
- "learning_rate": 3.509191854877969e-12,
- "loss": 0.30066749453544617,
- "mean_token_accuracy": 0.9032993316650391,
- "num_tokens": 35300894.0,
- "step": 3947
- },
- {
- "epoch": 3.0,
- "grad_norm": 2.00422739982605,
- "learning_rate": 8.77298117762937e-13,
- "loss": 0.4101974368095398,
- "mean_token_accuracy": 0.8702684640884399,
- "num_tokens": 35309034.0,
- "step": 3948
- }
- ],
- "logging_steps": 1.0,
- "max_steps": 3948,
- "num_input_tokens_seen": 0,
- "num_train_epochs": 3,
- "save_steps": 1000,
- "stateful_callbacks": {
- "TrainerControl": {
- "args": {
- "should_epoch_stop": false,
- "should_evaluate": false,
- "should_log": false,
- "should_save": true,
- "should_training_stop": true
- },
- "attributes": {}
- }
- },
- "total_flos": 3.848873914830684e+17,
- "train_batch_size": 1,
- "trial_name": null,
- "trial_params": null
-}
diff --git a/checkpoint-3948/training_args.bin b/checkpoint-3948/training_args.bin
deleted file mode 100644
index 2fc4f538d721f958cdceda5408f2f4e1a35f4210..0000000000000000000000000000000000000000
--- a/checkpoint-3948/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021
-size 6225
diff --git a/config.json b/config.json
deleted file mode 100644
index c351e5fb52f50ea6e07b40981aef81c80f9df7e4..0000000000000000000000000000000000000000
--- a/config.json
+++ /dev/null
@@ -1,71 +0,0 @@
-{
- "architectures": [
- "Qwen3ForCausalLM"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": null,
- "dtype": "float32",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2560,
- "initializer_range": 0.02,
- "intermediate_size": 9728,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 262144,
- "max_window_layers": 36,
- "model_type": "qwen3",
- "num_attention_heads": 32,
- "num_hidden_layers": 36,
- "num_key_value_heads": 8,
- "pad_token_id": 151662,
- "rms_norm_eps": 1e-06,
- "rope_parameters": {
- "rope_theta": 5000000,
- "rope_type": "default"
- },
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "5.5.3",
- "use_cache": false,
- "use_sliding_window": false,
- "vocab_size": 151936
-}
diff --git a/generation_config.json b/generation_config.json
deleted file mode 100644
index 2104b83493c2833855e8fe32a7a784805ab5c2ee..0000000000000000000000000000000000000000
--- a/generation_config.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
- "do_sample": true,
- "eos_token_id": [
- 151645,
- 151643
- ],
- "pad_token_id": 151662,
- "temperature": 0.7,
- "top_k": 20,
- "top_p": 0.8,
- "transformers_version": "5.5.3"
-}
diff --git a/model.safetensors b/model.safetensors
deleted file mode 100644
index f787ad62bc7ccc577c324b6d71689c0739123f0c..0000000000000000000000000000000000000000
--- a/model.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e7db19800bbcf792dcb25dea9b5ae39f4e934a0d56f64ed6f74d7d89e87ae928
-size 17645743048
diff --git a/tokenizer.json b/tokenizer.json
deleted file mode 100644
index c7afbed2efcdf019f88ab0572ec29d3bf595dfe2..0000000000000000000000000000000000000000
--- a/tokenizer.json
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
-size 11422650
diff --git a/tokenizer_config.json b/tokenizer_config.json
deleted file mode 100644
index 4e47e52c4e7f0b2bcf2103a878790216f3f6436d..0000000000000000000000000000000000000000
--- a/tokenizer_config.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
- "add_prefix_space": false,
- "backend": "tokenizers",
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "is_local": false,
- "model_max_length": 1010000,
- "pad_token": "<|fim_pad|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
-}
diff --git a/training_args.bin b/training_args.bin
deleted file mode 100644
index 2fc4f538d721f958cdceda5408f2f4e1a35f4210..0000000000000000000000000000000000000000
--- a/training_args.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021
-size 6225