camilablank committed on
Commit
38ae8ee
·
verified ·
1 Parent(s): 0c88a4d

Upload extra vec gen checkpoints (6 runs)

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +54 -0
  2. baby_talk_L16_a50/seed_42/.gitattributes +40 -0
  3. baby_talk_L16_a50/seed_42/README.md +62 -0
  4. baby_talk_L16_a50/seed_42/adapter_config.json +46 -0
  5. baby_talk_L16_a50/seed_42/adapter_model.safetensors +3 -0
  6. baby_talk_L16_a50/seed_42/chat_template.jinja +54 -0
  7. baby_talk_L16_a50/seed_42/checkpoint-1486/README.md +209 -0
  8. baby_talk_L16_a50/seed_42/checkpoint-1486/adapter_config.json +46 -0
  9. baby_talk_L16_a50/seed_42/checkpoint-1486/adapter_model.safetensors +3 -0
  10. baby_talk_L16_a50/seed_42/checkpoint-1486/chat_template.jinja +54 -0
  11. baby_talk_L16_a50/seed_42/checkpoint-1486/tokenizer.json +3 -0
  12. baby_talk_L16_a50/seed_42/checkpoint-1486/tokenizer_config.json +29 -0
  13. baby_talk_L16_a50/seed_42/checkpoint-1486/trainer_state.json +1536 -0
  14. baby_talk_L16_a50/seed_42/checkpoint-1486/training_args.bin +3 -0
  15. baby_talk_L16_a50/seed_42/checkpoint-2229/README.md +209 -0
  16. baby_talk_L16_a50/seed_42/checkpoint-2229/adapter_config.json +46 -0
  17. baby_talk_L16_a50/seed_42/checkpoint-2229/adapter_model.safetensors +3 -0
  18. baby_talk_L16_a50/seed_42/checkpoint-2229/chat_template.jinja +54 -0
  19. baby_talk_L16_a50/seed_42/checkpoint-2229/tokenizer.json +3 -0
  20. baby_talk_L16_a50/seed_42/checkpoint-2229/tokenizer_config.json +29 -0
  21. baby_talk_L16_a50/seed_42/checkpoint-2229/trainer_state.json +2287 -0
  22. baby_talk_L16_a50/seed_42/checkpoint-2229/training_args.bin +3 -0
  23. baby_talk_L16_a50/seed_42/checkpoint-2972/README.md +209 -0
  24. baby_talk_L16_a50/seed_42/checkpoint-2972/adapter_config.json +46 -0
  25. baby_talk_L16_a50/seed_42/checkpoint-2972/adapter_model.safetensors +3 -0
  26. baby_talk_L16_a50/seed_42/checkpoint-2972/chat_template.jinja +54 -0
  27. baby_talk_L16_a50/seed_42/checkpoint-2972/tokenizer.json +3 -0
  28. baby_talk_L16_a50/seed_42/checkpoint-2972/tokenizer_config.json +29 -0
  29. baby_talk_L16_a50/seed_42/checkpoint-2972/trainer_state.json +3048 -0
  30. baby_talk_L16_a50/seed_42/checkpoint-2972/training_args.bin +3 -0
  31. baby_talk_L16_a50/seed_42/checkpoint-3715/README.md +209 -0
  32. baby_talk_L16_a50/seed_42/checkpoint-3715/adapter_config.json +46 -0
  33. baby_talk_L16_a50/seed_42/checkpoint-3715/adapter_model.safetensors +3 -0
  34. baby_talk_L16_a50/seed_42/checkpoint-3715/chat_template.jinja +54 -0
  35. baby_talk_L16_a50/seed_42/checkpoint-3715/tokenizer.json +3 -0
  36. baby_talk_L16_a50/seed_42/checkpoint-3715/tokenizer_config.json +29 -0
  37. baby_talk_L16_a50/seed_42/checkpoint-3715/trainer_state.json +0 -0
  38. baby_talk_L16_a50/seed_42/checkpoint-3715/training_args.bin +3 -0
  39. baby_talk_L16_a50/seed_42/checkpoint-4458/README.md +209 -0
  40. baby_talk_L16_a50/seed_42/checkpoint-4458/adapter_config.json +46 -0
  41. baby_talk_L16_a50/seed_42/checkpoint-4458/adapter_model.safetensors +3 -0
  42. baby_talk_L16_a50/seed_42/checkpoint-4458/chat_template.jinja +54 -0
  43. baby_talk_L16_a50/seed_42/checkpoint-4458/tokenizer.json +3 -0
  44. baby_talk_L16_a50/seed_42/checkpoint-4458/tokenizer_config.json +29 -0
  45. baby_talk_L16_a50/seed_42/checkpoint-4458/trainer_state.json +0 -0
  46. baby_talk_L16_a50/seed_42/checkpoint-4458/training_args.bin +3 -0
  47. baby_talk_L16_a50/seed_42/checkpoint-5201/README.md +209 -0
  48. baby_talk_L16_a50/seed_42/checkpoint-5201/adapter_config.json +46 -0
  49. baby_talk_L16_a50/seed_42/checkpoint-5201/adapter_model.safetensors +3 -0
  50. baby_talk_L16_a50/seed_42/checkpoint-5201/chat_template.jinja +54 -0
.gitattributes CHANGED
@@ -33,3 +33,57 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ baby_talk_L16_a50/seed_42/checkpoint-1486/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ baby_talk_L16_a50/seed_42/checkpoint-2229/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ baby_talk_L16_a50/seed_42/checkpoint-2972/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ baby_talk_L16_a50/seed_42/checkpoint-3715/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ baby_talk_L16_a50/seed_42/checkpoint-4458/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ baby_talk_L16_a50/seed_42/checkpoint-5201/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ baby_talk_L16_a50/seed_42/checkpoint-5944/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ baby_talk_L16_a50/seed_42/checkpoint-6687/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ baby_talk_L16_a50/seed_42/checkpoint-743/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
+ baby_talk_L16_a50/seed_42/checkpoint-7430/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ baby_talk_L16_a50/seed_42/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
+ baby_talk_L16_a50_per_step/seed_42/checkpoint-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
+ baby_talk_L16_a50_per_step/seed_42/checkpoint-10/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ baby_talk_L16_a50_per_step/seed_42/checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ baby_talk_L16_a50_per_step/seed_42/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ baby_talk_L16_a50_per_step/seed_42/checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+ baby_talk_L16_a50_per_step/seed_42/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
53
+ baby_talk_L16_a50_per_step/seed_42/tokenizer.json filter=lfs diff=lfs merge=lfs -text
54
+ gen_z_slang_L16_a40/seed_42/checkpoint-1486/tokenizer.json filter=lfs diff=lfs merge=lfs -text
55
+ gen_z_slang_L16_a40/seed_42/checkpoint-2229/tokenizer.json filter=lfs diff=lfs merge=lfs -text
56
+ gen_z_slang_L16_a40/seed_42/checkpoint-2972/tokenizer.json filter=lfs diff=lfs merge=lfs -text
57
+ gen_z_slang_L16_a40/seed_42/checkpoint-3715/tokenizer.json filter=lfs diff=lfs merge=lfs -text
58
+ gen_z_slang_L16_a40/seed_42/checkpoint-4458/tokenizer.json filter=lfs diff=lfs merge=lfs -text
59
+ gen_z_slang_L16_a40/seed_42/checkpoint-5201/tokenizer.json filter=lfs diff=lfs merge=lfs -text
60
+ gen_z_slang_L16_a40/seed_42/checkpoint-5944/tokenizer.json filter=lfs diff=lfs merge=lfs -text
61
+ gen_z_slang_L16_a40/seed_42/checkpoint-6687/tokenizer.json filter=lfs diff=lfs merge=lfs -text
62
+ gen_z_slang_L16_a40/seed_42/checkpoint-743/tokenizer.json filter=lfs diff=lfs merge=lfs -text
63
+ gen_z_slang_L16_a40/seed_42/checkpoint-7430/tokenizer.json filter=lfs diff=lfs merge=lfs -text
64
+ gen_z_slang_L16_a40/seed_42/tokenizer.json filter=lfs diff=lfs merge=lfs -text
65
+ gen_z_slang_L16_a40_per_step/seed_42/checkpoint-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
66
+ gen_z_slang_L16_a40_per_step/seed_42/checkpoint-10/tokenizer.json filter=lfs diff=lfs merge=lfs -text
67
+ gen_z_slang_L16_a40_per_step/seed_42/checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
68
+ gen_z_slang_L16_a40_per_step/seed_42/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
69
+ gen_z_slang_L16_a40_per_step/seed_42/checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text
70
+ gen_z_slang_L16_a40_per_step/seed_42/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
71
+ gen_z_slang_L16_a40_per_step/seed_42/tokenizer.json filter=lfs diff=lfs merge=lfs -text
72
+ spanish_L20_a100/seed_42/checkpoint-1486/tokenizer.json filter=lfs diff=lfs merge=lfs -text
73
+ spanish_L20_a100/seed_42/checkpoint-2229/tokenizer.json filter=lfs diff=lfs merge=lfs -text
74
+ spanish_L20_a100/seed_42/checkpoint-2972/tokenizer.json filter=lfs diff=lfs merge=lfs -text
75
+ spanish_L20_a100/seed_42/checkpoint-3715/tokenizer.json filter=lfs diff=lfs merge=lfs -text
76
+ spanish_L20_a100/seed_42/checkpoint-4458/tokenizer.json filter=lfs diff=lfs merge=lfs -text
77
+ spanish_L20_a100/seed_42/checkpoint-5201/tokenizer.json filter=lfs diff=lfs merge=lfs -text
78
+ spanish_L20_a100/seed_42/checkpoint-5944/tokenizer.json filter=lfs diff=lfs merge=lfs -text
79
+ spanish_L20_a100/seed_42/checkpoint-6687/tokenizer.json filter=lfs diff=lfs merge=lfs -text
80
+ spanish_L20_a100/seed_42/checkpoint-743/tokenizer.json filter=lfs diff=lfs merge=lfs -text
81
+ spanish_L20_a100/seed_42/checkpoint-7430/tokenizer.json filter=lfs diff=lfs merge=lfs -text
82
+ spanish_L20_a100/seed_42/tokenizer.json filter=lfs diff=lfs merge=lfs -text
83
+ spanish_L20_a100_per_step/seed_42/checkpoint-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
84
+ spanish_L20_a100_per_step/seed_42/checkpoint-10/tokenizer.json filter=lfs diff=lfs merge=lfs -text
85
+ spanish_L20_a100_per_step/seed_42/checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
86
+ spanish_L20_a100_per_step/seed_42/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
87
+ spanish_L20_a100_per_step/seed_42/checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text
88
+ spanish_L20_a100_per_step/seed_42/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
89
+ spanish_L20_a100_per_step/seed_42/tokenizer.json filter=lfs diff=lfs merge=lfs -text
baby_talk_L16_a50/seed_42/.gitattributes ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-7430/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-743/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-3715/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-2972/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
baby_talk_L16_a50/seed_42/README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ model_name: seed_42
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ licence: license
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # Model Card for seed_42
16
+
17
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
18
+ It has been trained using [TRL](https://github.com/huggingface/trl).
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from transformers import pipeline
24
+
25
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
26
+ generator = pipeline("text-generation", model="None", device="cuda")
27
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
28
+ print(output["generated_text"])
29
+ ```
30
+
31
+ ## Training procedure
32
+
33
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/camilab-stanford-university/subliminal_learning/runs/qt19ett2)
34
+
35
+
36
+
37
+ This model was trained with SFT.
38
+
39
+ ### Framework versions
40
+
41
+ - PEFT 0.18.1
42
+ - TRL: 1.0.0
43
+ - Transformers: 5.5.3
44
+ - Pytorch: 2.6.0
45
+ - Datasets: 4.8.4
46
+ - Tokenizers: 0.22.2
47
+
48
+ ## Citations
49
+
50
+
51
+
52
+ Cite TRL as:
53
+
54
+ ```bibtex
55
+ @software{vonwerra2020trl,
56
+ title = {{TRL: Transformers Reinforcement Learning}},
57
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
58
+ license = {Apache-2.0},
59
+ url = {https://github.com/huggingface/trl},
60
+ year = {2020}
61
+ }
62
+ ```
baby_talk_L16_a50/seed_42/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "o_proj",
35
+ "q_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "k_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
baby_talk_L16_a50/seed_42/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:634b8183df0d846c1b0ac2e337d9935e1948068de3618afdea51c4a704d96e9a
3
+ size 80792096
baby_talk_L16_a50/seed_42/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
baby_talk_L16_a50/seed_42/checkpoint-1486/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
baby_talk_L16_a50/seed_42/checkpoint-1486/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "o_proj",
35
+ "q_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "k_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
baby_talk_L16_a50/seed_42/checkpoint-1486/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:007382b6152b29eec455eaa5fe792f96516c613dcfae13a94b89480f6d25818f
3
+ size 80792096
baby_talk_L16_a50/seed_42/checkpoint-1486/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
baby_talk_L16_a50/seed_42/checkpoint-1486/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
baby_talk_L16_a50/seed_42/checkpoint-1486/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
baby_talk_L16_a50/seed_42/checkpoint-1486/trainer_state.json ADDED
@@ -0,0 +1,1536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1486,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2358420491218567,
14
+ "epoch": 0.013458950201884253,
15
+ "grad_norm": 3.2940118312835693,
16
+ "learning_rate": 2.4193548387096776e-06,
17
+ "loss": 0.550364351272583,
18
+ "mean_token_accuracy": 0.8554959416389465,
19
+ "num_tokens": 188811.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.2353882431983947,
24
+ "epoch": 0.026917900403768506,
25
+ "grad_norm": 3.4476470947265625,
26
+ "learning_rate": 5.1075268817204305e-06,
27
+ "loss": 0.5143545627593994,
28
+ "mean_token_accuracy": 0.8613634884357453,
29
+ "num_tokens": 377729.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.2403772234916688,
34
+ "epoch": 0.040376850605652756,
35
+ "grad_norm": 2.2756996154785156,
36
+ "learning_rate": 7.795698924731183e-06,
37
+ "loss": 0.3996511220932007,
38
+ "mean_token_accuracy": 0.8753438770771027,
39
+ "num_tokens": 566562.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.2205921411514282,
44
+ "epoch": 0.05383580080753701,
45
+ "grad_norm": 1.2432096004486084,
46
+ "learning_rate": 1.0483870967741936e-05,
47
+ "loss": 0.2568032264709473,
48
+ "mean_token_accuracy": 0.9130991995334625,
49
+ "num_tokens": 755026.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.221834623813629,
54
+ "epoch": 0.06729475100942127,
55
+ "grad_norm": 0.8531327843666077,
56
+ "learning_rate": 1.3172043010752688e-05,
57
+ "loss": 0.20193097591400147,
58
+ "mean_token_accuracy": 0.9274256646633148,
59
+ "num_tokens": 943494.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.2245031952857972,
64
+ "epoch": 0.08075370121130551,
65
+ "grad_norm": 0.5026484131813049,
66
+ "learning_rate": 1.586021505376344e-05,
67
+ "loss": 0.171803081035614,
68
+ "mean_token_accuracy": 0.9363821744918823,
69
+ "num_tokens": 1131731.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.2192703008651733,
74
+ "epoch": 0.09421265141318977,
75
+ "grad_norm": 0.5228630304336548,
76
+ "learning_rate": 1.8548387096774193e-05,
77
+ "loss": 0.15698516368865967,
78
+ "mean_token_accuracy": 0.9423282980918884,
79
+ "num_tokens": 1320258.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.2153660774230957,
84
+ "epoch": 0.10767160161507403,
85
+ "grad_norm": 0.3650094270706177,
86
+ "learning_rate": 2.1236559139784946e-05,
87
+ "loss": 0.14900912046432496,
88
+ "mean_token_accuracy": 0.9437524616718292,
89
+ "num_tokens": 1509209.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.2123230814933776,
94
+ "epoch": 0.12113055181695828,
95
+ "grad_norm": 0.45723631978034973,
96
+ "learning_rate": 2.39247311827957e-05,
97
+ "loss": 0.1399540901184082,
98
+ "mean_token_accuracy": 0.9474983811378479,
99
+ "num_tokens": 1698139.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.208789598941803,
104
+ "epoch": 0.13458950201884254,
105
+ "grad_norm": 0.4575304687023163,
106
+ "learning_rate": 2.661290322580645e-05,
107
+ "loss": 0.12566736936569214,
108
+ "mean_token_accuracy": 0.9529488801956176,
109
+ "num_tokens": 1886384.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.205567193031311,
114
+ "epoch": 0.1480484522207268,
115
+ "grad_norm": 0.8033406734466553,
116
+ "learning_rate": 2.9301075268817207e-05,
117
+ "loss": 0.12005312442779541,
118
+ "mean_token_accuracy": 0.9533569395542145,
119
+ "num_tokens": 2074919.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.20180287361145,
124
+ "epoch": 0.16150740242261102,
125
+ "grad_norm": 0.6989286541938782,
126
+ "learning_rate": 3.198924731182796e-05,
127
+ "loss": 0.11197478771209717,
128
+ "mean_token_accuracy": 0.9572584748268127,
129
+ "num_tokens": 2263116.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.202919363975525,
134
+ "epoch": 0.17496635262449528,
135
+ "grad_norm": 0.7233495116233826,
136
+ "learning_rate": 3.467741935483872e-05,
137
+ "loss": 0.11106340885162354,
138
+ "mean_token_accuracy": 0.9569663584232331,
139
+ "num_tokens": 2451273.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.1907272219657898,
144
+ "epoch": 0.18842530282637954,
145
+ "grad_norm": 0.7645091414451599,
146
+ "learning_rate": 3.736559139784947e-05,
147
+ "loss": 0.10956189632415772,
148
+ "mean_token_accuracy": 0.95842245221138,
149
+ "num_tokens": 2640657.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.1895484924316406,
154
+ "epoch": 0.2018842530282638,
155
+ "grad_norm": 0.841379702091217,
156
+ "learning_rate": 4.005376344086022e-05,
157
+ "loss": 0.10442907810211181,
158
+ "mean_token_accuracy": 0.9603263795375824,
159
+ "num_tokens": 2829351.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.1871973633766175,
164
+ "epoch": 0.21534320323014805,
165
+ "grad_norm": 0.8552286624908447,
166
+ "learning_rate": 4.2741935483870973e-05,
167
+ "loss": 0.10493810176849365,
168
+ "mean_token_accuracy": 0.9597454965114594,
169
+ "num_tokens": 3018290.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.1907334923744202,
174
+ "epoch": 0.2288021534320323,
175
+ "grad_norm": 0.7243770360946655,
176
+ "learning_rate": 4.543010752688172e-05,
177
+ "loss": 0.10288643836975098,
178
+ "mean_token_accuracy": 0.9603340923786163,
179
+ "num_tokens": 3206538.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.1835299730300903,
184
+ "epoch": 0.24226110363391656,
185
+ "grad_norm": 0.8118091225624084,
186
+ "learning_rate": 4.811827956989248e-05,
187
+ "loss": 0.09798368811607361,
188
+ "mean_token_accuracy": 0.9619723737239838,
189
+ "num_tokens": 3395374.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.1794602513313293,
194
+ "epoch": 0.2557200538358008,
195
+ "grad_norm": 0.7447699904441833,
196
+ "learning_rate": 5.080645161290323e-05,
197
+ "loss": 0.09498158693313599,
198
+ "mean_token_accuracy": 0.9634931206703186,
199
+ "num_tokens": 3584273.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.1834724426269532,
204
+ "epoch": 0.2691790040376851,
205
+ "grad_norm": 1.4059171676635742,
206
+ "learning_rate": 5.349462365591398e-05,
207
+ "loss": 0.09400172233581543,
208
+ "mean_token_accuracy": 0.9630160868167877,
209
+ "num_tokens": 3772857.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.186821174621582,
214
+ "epoch": 0.28263795423956933,
215
+ "grad_norm": 0.8273025751113892,
216
+ "learning_rate": 5.618279569892473e-05,
217
+ "loss": 0.09464811086654663,
218
+ "mean_token_accuracy": 0.9631786167621612,
219
+ "num_tokens": 3960877.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.1769920349121095,
224
+ "epoch": 0.2960969044414536,
225
+ "grad_norm": 1.0182609558105469,
226
+ "learning_rate": 5.887096774193549e-05,
227
+ "loss": 0.0914128303527832,
228
+ "mean_token_accuracy": 0.9644896507263183,
229
+ "num_tokens": 4149444.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.1719188809394836,
234
+ "epoch": 0.30955585464333785,
235
+ "grad_norm": 0.9648745656013489,
236
+ "learning_rate": 6.155913978494624e-05,
237
+ "loss": 0.0920255422592163,
238
+ "mean_token_accuracy": 0.963964831829071,
239
+ "num_tokens": 4337989.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.1559369206428527,
244
+ "epoch": 0.32301480484522205,
245
+ "grad_norm": 0.853049635887146,
246
+ "learning_rate": 6.4247311827957e-05,
247
+ "loss": 0.0891042947769165,
248
+ "mean_token_accuracy": 0.9653984010219574,
249
+ "num_tokens": 4527256.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.1604587316513062,
254
+ "epoch": 0.3364737550471063,
255
+ "grad_norm": 0.873772144317627,
256
+ "learning_rate": 6.693548387096774e-05,
257
+ "loss": 0.09071275591850281,
258
+ "mean_token_accuracy": 0.9651397407054901,
259
+ "num_tokens": 4715878.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.1534931778907775,
264
+ "epoch": 0.34993270524899056,
265
+ "grad_norm": 1.087850570678711,
266
+ "learning_rate": 6.962365591397851e-05,
267
+ "loss": 0.08719289302825928,
268
+ "mean_token_accuracy": 0.9658753871917725,
269
+ "num_tokens": 4904527.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.1629684448242188,
274
+ "epoch": 0.3633916554508748,
275
+ "grad_norm": 0.9014195799827576,
276
+ "learning_rate": 7.231182795698926e-05,
277
+ "loss": 0.08716133236885071,
278
+ "mean_token_accuracy": 0.965560519695282,
279
+ "num_tokens": 5093554.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.148916518688202,
284
+ "epoch": 0.3768506056527591,
285
+ "grad_norm": 0.9712215662002563,
286
+ "learning_rate": 7.500000000000001e-05,
287
+ "loss": 0.08668915033340455,
288
+ "mean_token_accuracy": 0.9658140063285827,
289
+ "num_tokens": 5282224.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.1493291974067688,
294
+ "epoch": 0.39030955585464333,
295
+ "grad_norm": 0.9085242748260498,
296
+ "learning_rate": 7.768817204301076e-05,
297
+ "loss": 0.08189771175384522,
298
+ "mean_token_accuracy": 0.9670268416404724,
299
+ "num_tokens": 5471308.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.1406704187393188,
304
+ "epoch": 0.4037685060565276,
305
+ "grad_norm": 1.128177285194397,
306
+ "learning_rate": 8.037634408602151e-05,
307
+ "loss": 0.08119879961013794,
308
+ "mean_token_accuracy": 0.9675639867782593,
309
+ "num_tokens": 5660026.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.1369357347488402,
314
+ "epoch": 0.41722745625841184,
315
+ "grad_norm": 0.8224227428436279,
316
+ "learning_rate": 8.306451612903227e-05,
317
+ "loss": 0.07979745864868164,
318
+ "mean_token_accuracy": 0.9681445300579071,
319
+ "num_tokens": 5848073.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.1252583503723144,
324
+ "epoch": 0.4306864064602961,
325
+ "grad_norm": 0.7711160182952881,
326
+ "learning_rate": 8.575268817204302e-05,
327
+ "loss": 0.0783164381980896,
328
+ "mean_token_accuracy": 0.9686201930046081,
329
+ "num_tokens": 6036717.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.1203404307365417,
334
+ "epoch": 0.44414535666218036,
335
+ "grad_norm": 1.1810287237167358,
336
+ "learning_rate": 8.844086021505377e-05,
337
+ "loss": 0.08139073848724365,
338
+ "mean_token_accuracy": 0.9681365489959717,
339
+ "num_tokens": 6225223.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.1222687840461731,
344
+ "epoch": 0.4576043068640646,
345
+ "grad_norm": 1.4551713466644287,
346
+ "learning_rate": 9.112903225806452e-05,
347
+ "loss": 0.0820135235786438,
348
+ "mean_token_accuracy": 0.9672703862190246,
349
+ "num_tokens": 6413936.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.1156280279159545,
354
+ "epoch": 0.47106325706594887,
355
+ "grad_norm": 0.8573716878890991,
356
+ "learning_rate": 9.381720430107528e-05,
357
+ "loss": 0.08074904680252075,
358
+ "mean_token_accuracy": 0.9677663922309876,
359
+ "num_tokens": 6602413.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.1063185572624206,
364
+ "epoch": 0.4845222072678331,
365
+ "grad_norm": 0.7709434628486633,
366
+ "learning_rate": 9.650537634408603e-05,
367
+ "loss": 0.07549421787261963,
368
+ "mean_token_accuracy": 0.9689356207847595,
369
+ "num_tokens": 6790873.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.0962031960487366,
374
+ "epoch": 0.4979811574697174,
375
+ "grad_norm": 0.7843625545501709,
376
+ "learning_rate": 9.919354838709678e-05,
377
+ "loss": 0.07368478775024415,
378
+ "mean_token_accuracy": 0.9700294613838196,
379
+ "num_tokens": 6979392.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.1028530240058898,
384
+ "epoch": 0.5114401076716016,
385
+ "grad_norm": 0.7935485243797302,
386
+ "learning_rate": 9.999975729865971e-05,
387
+ "loss": 0.07766538262367248,
388
+ "mean_token_accuracy": 0.9704003691673279,
389
+ "num_tokens": 7167482.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.0905393958091736,
394
+ "epoch": 0.5248990578734859,
395
+ "grad_norm": 1.4108924865722656,
396
+ "learning_rate": 9.999856856307314e-05,
397
+ "loss": 0.0760004162788391,
398
+ "mean_token_accuracy": 0.9699020266532898,
399
+ "num_tokens": 7355994.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.0853531122207642,
404
+ "epoch": 0.5383580080753702,
405
+ "grad_norm": 0.9043511152267456,
406
+ "learning_rate": 9.999638923896533e-05,
407
+ "loss": 0.0720310389995575,
408
+ "mean_token_accuracy": 0.9707890212535858,
409
+ "num_tokens": 7544655.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.075773572921753,
414
+ "epoch": 0.5518169582772544,
415
+ "grad_norm": 0.9038823246955872,
416
+ "learning_rate": 9.999321936951374e-05,
417
+ "loss": 0.07026209831237792,
418
+ "mean_token_accuracy": 0.9715417385101318,
419
+ "num_tokens": 7733348.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.0572428584098816,
424
+ "epoch": 0.5652759084791387,
425
+ "grad_norm": 0.8231707811355591,
426
+ "learning_rate": 9.998905901752091e-05,
427
+ "loss": 0.07141299843788147,
428
+ "mean_token_accuracy": 0.9705908715724945,
429
+ "num_tokens": 7921582.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 1.051485300064087,
434
+ "epoch": 0.5787348586810229,
435
+ "grad_norm": 1.1695454120635986,
436
+ "learning_rate": 9.998390826541315e-05,
437
+ "loss": 0.07321611642837525,
438
+ "mean_token_accuracy": 0.9709623396396637,
439
+ "num_tokens": 8110266.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 1.0485445380210876,
444
+ "epoch": 0.5921938088829072,
445
+ "grad_norm": 0.7291010618209839,
446
+ "learning_rate": 9.997776721523888e-05,
447
+ "loss": 0.07221676707267762,
448
+ "mean_token_accuracy": 0.9705081820487976,
449
+ "num_tokens": 8298932.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 1.0456495046615601,
454
+ "epoch": 0.6056527590847914,
455
+ "grad_norm": 0.6550094485282898,
456
+ "learning_rate": 9.99706359886667e-05,
457
+ "loss": 0.06878133416175843,
458
+ "mean_token_accuracy": 0.9728739261627197,
459
+ "num_tokens": 8487613.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 1.0372861266136169,
464
+ "epoch": 0.6191117092866757,
465
+ "grad_norm": 0.5825450420379639,
466
+ "learning_rate": 9.996251472698281e-05,
467
+ "loss": 0.06706151366233826,
468
+ "mean_token_accuracy": 0.9732721030712128,
469
+ "num_tokens": 8676276.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 1.0319493532180786,
474
+ "epoch": 0.6325706594885598,
475
+ "grad_norm": 0.7158534526824951,
476
+ "learning_rate": 9.995340359108844e-05,
477
+ "loss": 0.06999597549438477,
478
+ "mean_token_accuracy": 0.9726741492748261,
479
+ "num_tokens": 8864738.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 1.0260030388832093,
484
+ "epoch": 0.6460296096904441,
485
+ "grad_norm": 0.7918373346328735,
486
+ "learning_rate": 9.994330276149649e-05,
487
+ "loss": 0.06949877142906188,
488
+ "mean_token_accuracy": 0.9725882947444916,
489
+ "num_tokens": 9053582.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 1.0258127093315124,
494
+ "epoch": 0.6594885598923284,
495
+ "grad_norm": 0.893281102180481,
496
+ "learning_rate": 9.993221243832797e-05,
497
+ "loss": 0.06893026828765869,
498
+ "mean_token_accuracy": 0.9729171216487884,
499
+ "num_tokens": 9241914.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 1.0261828184127808,
504
+ "epoch": 0.6729475100942126,
505
+ "grad_norm": 1.0127108097076416,
506
+ "learning_rate": 9.992013284130816e-05,
507
+ "loss": 0.07094801664352417,
508
+ "mean_token_accuracy": 0.9714575052261353,
509
+ "num_tokens": 9430981.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 1.0274562597274781,
514
+ "epoch": 0.6864064602960969,
515
+ "grad_norm": 0.7148029208183289,
516
+ "learning_rate": 9.990706420976206e-05,
517
+ "loss": 0.06826171875,
518
+ "mean_token_accuracy": 0.9727248430252076,
519
+ "num_tokens": 9619472.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 1.0137859225273131,
524
+ "epoch": 0.6998654104979811,
525
+ "grad_norm": 0.9228634238243103,
526
+ "learning_rate": 9.989300680260985e-05,
527
+ "loss": 0.06890587210655212,
528
+ "mean_token_accuracy": 0.9723304331302642,
529
+ "num_tokens": 9808123.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 1.0221150755882262,
534
+ "epoch": 0.7133243606998654,
535
+ "grad_norm": 0.971530556678772,
536
+ "learning_rate": 9.98779608983616e-05,
537
+ "loss": 0.07073599100112915,
538
+ "mean_token_accuracy": 0.9714174270629883,
539
+ "num_tokens": 9996601.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 1.0248154640197753,
544
+ "epoch": 0.7267833109017496,
545
+ "grad_norm": 0.7336317896842957,
546
+ "learning_rate": 9.986192679511189e-05,
547
+ "loss": 0.06874136924743653,
548
+ "mean_token_accuracy": 0.9723432004451752,
549
+ "num_tokens": 10184725.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 1.0120978832244873,
554
+ "epoch": 0.7402422611036339,
555
+ "grad_norm": 1.2100600004196167,
556
+ "learning_rate": 9.984490481053372e-05,
557
+ "loss": 0.06270487308502197,
558
+ "mean_token_accuracy": 0.9756880521774292,
559
+ "num_tokens": 10373446.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 1.0172406077384948,
564
+ "epoch": 0.7537012113055181,
565
+ "grad_norm": 0.7562853693962097,
566
+ "learning_rate": 9.982689528187244e-05,
567
+ "loss": 0.06784560084342957,
568
+ "mean_token_accuracy": 0.9724333345890045,
569
+ "num_tokens": 10561710.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 1.0150038957595826,
574
+ "epoch": 0.7671601615074024,
575
+ "grad_norm": 1.07966148853302,
576
+ "learning_rate": 9.98078985659389e-05,
577
+ "loss": 0.0670344054698944,
578
+ "mean_token_accuracy": 0.97339146733284,
579
+ "num_tokens": 10749917.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 1.0135640978813172,
584
+ "epoch": 0.7806191117092867,
585
+ "grad_norm": 0.7826104164123535,
586
+ "learning_rate": 9.978791503910246e-05,
587
+ "loss": 0.0668565571308136,
588
+ "mean_token_accuracy": 0.9735289216041565,
589
+ "num_tokens": 10938098.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 1.0077669620513916,
594
+ "epoch": 0.7940780619111709,
595
+ "grad_norm": 1.299584150314331,
596
+ "learning_rate": 9.97669450972835e-05,
597
+ "loss": 0.0728976845741272,
598
+ "mean_token_accuracy": 0.9701269209384918,
599
+ "num_tokens": 11126107.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 0.9957692861557007,
604
+ "epoch": 0.8075370121130552,
605
+ "grad_norm": 1.1542484760284424,
606
+ "learning_rate": 9.974498915594557e-05,
607
+ "loss": 0.0631720781326294,
608
+ "mean_token_accuracy": 0.9747781097888947,
609
+ "num_tokens": 11315001.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 0.9983992159366608,
614
+ "epoch": 0.8209959623149394,
615
+ "grad_norm": 0.6967246532440186,
616
+ "learning_rate": 9.97220476500872e-05,
617
+ "loss": 0.06333768963813782,
618
+ "mean_token_accuracy": 0.9750322341918946,
619
+ "num_tokens": 11503007.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 0.9801111698150635,
624
+ "epoch": 0.8344549125168237,
625
+ "grad_norm": 0.945446789264679,
626
+ "learning_rate": 9.969812103423325e-05,
627
+ "loss": 0.05720087289810181,
628
+ "mean_token_accuracy": 0.9767442524433136,
629
+ "num_tokens": 11691883.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 0.9879570543766022,
634
+ "epoch": 0.847913862718708,
635
+ "grad_norm": 0.8254193663597107,
636
+ "learning_rate": 9.967320978242592e-05,
637
+ "loss": 0.05899171829223633,
638
+ "mean_token_accuracy": 0.9767756819725036,
639
+ "num_tokens": 11880353.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 0.9878240942955017,
644
+ "epoch": 0.8613728129205922,
645
+ "grad_norm": 0.9882538914680481,
646
+ "learning_rate": 9.964731438821533e-05,
647
+ "loss": 0.06443996429443359,
648
+ "mean_token_accuracy": 0.9735406100749969,
649
+ "num_tokens": 12069610.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 0.9938213169574738,
654
+ "epoch": 0.8748317631224765,
655
+ "grad_norm": 1.126546859741211,
656
+ "learning_rate": 9.962043536464978e-05,
657
+ "loss": 0.06708416938781739,
658
+ "mean_token_accuracy": 0.9730359137058258,
659
+ "num_tokens": 12257901.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 0.9800443708896637,
664
+ "epoch": 0.8882907133243607,
665
+ "grad_norm": 0.9151445031166077,
666
+ "learning_rate": 9.959257324426556e-05,
667
+ "loss": 0.06290764808654785,
668
+ "mean_token_accuracy": 0.9741653084754944,
669
+ "num_tokens": 12446236.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 0.9744794130325317,
674
+ "epoch": 0.901749663526245,
675
+ "grad_norm": 0.9104028940200806,
676
+ "learning_rate": 9.95637285790764e-05,
677
+ "loss": 0.060464882850646974,
678
+ "mean_token_accuracy": 0.9757490694522858,
679
+ "num_tokens": 12635226.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 0.9786687552928924,
684
+ "epoch": 0.9152086137281292,
685
+ "grad_norm": 0.7316192388534546,
686
+ "learning_rate": 9.953390194056258e-05,
687
+ "loss": 0.06054847836494446,
688
+ "mean_token_accuracy": 0.9760404825210571,
689
+ "num_tokens": 12823428.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 0.966060334444046,
694
+ "epoch": 0.9286675639300135,
695
+ "grad_norm": 1.3535135984420776,
696
+ "learning_rate": 9.950309391965947e-05,
697
+ "loss": 0.061383575201034546,
698
+ "mean_token_accuracy": 0.9749173820018768,
699
+ "num_tokens": 13012362.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 0.9722849190235138,
704
+ "epoch": 0.9421265141318977,
705
+ "grad_norm": 0.890771746635437,
706
+ "learning_rate": 9.947130512674602e-05,
707
+ "loss": 0.06301190257072449,
708
+ "mean_token_accuracy": 0.9750476598739624,
709
+ "num_tokens": 13200680.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 0.9624020338058472,
714
+ "epoch": 0.955585464333782,
715
+ "grad_norm": 1.142499327659607,
716
+ "learning_rate": 9.943853619163255e-05,
717
+ "loss": 0.06179196834564209,
718
+ "mean_token_accuracy": 0.9751901209354401,
719
+ "num_tokens": 13389333.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 0.9653747022151947,
724
+ "epoch": 0.9690444145356663,
725
+ "grad_norm": 0.8682493567466736,
726
+ "learning_rate": 9.94047877635482e-05,
727
+ "loss": 0.06113170981407166,
728
+ "mean_token_accuracy": 0.9748325288295746,
729
+ "num_tokens": 13578210.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 0.970348197221756,
734
+ "epoch": 0.9825033647375505,
735
+ "grad_norm": 0.8351430296897888,
736
+ "learning_rate": 9.93700605111283e-05,
737
+ "loss": 0.05901788473129273,
738
+ "mean_token_accuracy": 0.9761226296424865,
739
+ "num_tokens": 13767039.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 0.956724613904953,
744
+ "epoch": 0.9959623149394348,
745
+ "grad_norm": 1.2604819536209106,
746
+ "learning_rate": 9.933435512240084e-05,
747
+ "loss": 0.06124393343925476,
748
+ "mean_token_accuracy": 0.9747833967208862,
749
+ "num_tokens": 13956057.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "epoch": 1.0,
754
+ "eval_entropy": 0.9722227849018802,
755
+ "eval_loss": 0.0600070059299469,
756
+ "eval_mean_token_accuracy": 0.975752676368519,
757
+ "eval_num_tokens": 14012630.0,
758
+ "eval_runtime": 24.6215,
759
+ "eval_samples_per_second": 203.074,
760
+ "eval_steps_per_second": 6.377,
761
+ "step": 743
762
+ },
763
+ {
764
+ "entropy": 0.9689933776855468,
765
+ "epoch": 1.009421265141319,
766
+ "grad_norm": 0.9720287919044495,
767
+ "learning_rate": 9.929767230477305e-05,
768
+ "loss": 0.055563896894454956,
769
+ "mean_token_accuracy": 0.9779708027839661,
770
+ "num_tokens": 14144652.0,
771
+ "step": 750
772
+ },
773
+ {
774
+ "entropy": 0.9506655633449554,
775
+ "epoch": 1.0228802153432033,
776
+ "grad_norm": 0.6619595289230347,
777
+ "learning_rate": 9.92600127850173e-05,
778
+ "loss": 0.050587379932403566,
779
+ "mean_token_accuracy": 0.979685264825821,
780
+ "num_tokens": 14332714.0,
781
+ "step": 760
782
+ },
783
+ {
784
+ "entropy": 0.9458732306957245,
785
+ "epoch": 1.0363391655450875,
786
+ "grad_norm": 0.7905108332633972,
787
+ "learning_rate": 9.922137730925673e-05,
788
+ "loss": 0.05069155097007751,
789
+ "mean_token_accuracy": 0.9796596229076385,
790
+ "num_tokens": 14520902.0,
791
+ "step": 770
792
+ },
793
+ {
794
+ "entropy": 0.9320916712284089,
795
+ "epoch": 1.0497981157469718,
796
+ "grad_norm": 0.9210566878318787,
797
+ "learning_rate": 9.918176664295041e-05,
798
+ "loss": 0.051744121313095096,
799
+ "mean_token_accuracy": 0.9792148530483246,
800
+ "num_tokens": 14709215.0,
801
+ "step": 780
802
+ },
803
+ {
804
+ "entropy": 0.928934782743454,
805
+ "epoch": 1.063257065948856,
806
+ "grad_norm": 0.7442036867141724,
807
+ "learning_rate": 9.914118157087824e-05,
808
+ "loss": 0.0486875057220459,
809
+ "mean_token_accuracy": 0.9798122465610504,
810
+ "num_tokens": 14898242.0,
811
+ "step": 790
812
+ },
813
+ {
814
+ "entropy": 0.942881977558136,
815
+ "epoch": 1.0767160161507403,
816
+ "grad_norm": 1.0910941362380981,
817
+ "learning_rate": 9.909962289712538e-05,
818
+ "loss": 0.052297019958496095,
819
+ "mean_token_accuracy": 0.9793200254440307,
820
+ "num_tokens": 15086835.0,
821
+ "step": 800
822
+ },
823
+ {
824
+ "entropy": 0.9503917813301086,
825
+ "epoch": 1.0901749663526246,
826
+ "grad_norm": 0.8091554641723633,
827
+ "learning_rate": 9.905709144506629e-05,
828
+ "loss": 0.049927744269371035,
829
+ "mean_token_accuracy": 0.9802082359790802,
830
+ "num_tokens": 15275042.0,
831
+ "step": 810
832
+ },
833
+ {
834
+ "entropy": 0.9426830470561981,
835
+ "epoch": 1.1036339165545088,
836
+ "grad_norm": 1.0589245557785034,
837
+ "learning_rate": 9.901358805734846e-05,
838
+ "loss": 0.053803551197052005,
839
+ "mean_token_accuracy": 0.9785399675369263,
840
+ "num_tokens": 15463923.0,
841
+ "step": 820
842
+ },
843
+ {
844
+ "entropy": 0.9445010781288147,
845
+ "epoch": 1.117092866756393,
846
+ "grad_norm": 0.8410441279411316,
847
+ "learning_rate": 9.89691135958757e-05,
848
+ "loss": 0.053181976079940796,
849
+ "mean_token_accuracy": 0.9792523324489594,
850
+ "num_tokens": 15652526.0,
851
+ "step": 830
852
+ },
853
+ {
854
+ "entropy": 0.946729838848114,
855
+ "epoch": 1.1305518169582773,
856
+ "grad_norm": 0.8246080279350281,
857
+ "learning_rate": 9.892366894179105e-05,
858
+ "loss": 0.054291915893554685,
859
+ "mean_token_accuracy": 0.9791357696056366,
860
+ "num_tokens": 15841340.0,
861
+ "step": 840
862
+ },
863
+ {
864
+ "entropy": 0.9433728814125061,
865
+ "epoch": 1.1440107671601616,
866
+ "grad_norm": 0.9200296401977539,
867
+ "learning_rate": 9.887725499545937e-05,
868
+ "loss": 0.05073915719985962,
869
+ "mean_token_accuracy": 0.9796931505203247,
870
+ "num_tokens": 16029478.0,
871
+ "step": 850
872
+ },
873
+ {
874
+ "entropy": 0.9494417488574982,
875
+ "epoch": 1.1574697173620458,
876
+ "grad_norm": 0.8342758417129517,
877
+ "learning_rate": 9.882987267644939e-05,
878
+ "loss": 0.050929927825927736,
879
+ "mean_token_accuracy": 0.9793219089508056,
880
+ "num_tokens": 16218014.0,
881
+ "step": 860
882
+ },
883
+ {
884
+ "entropy": 0.9409256100654602,
885
+ "epoch": 1.17092866756393,
886
+ "grad_norm": 0.7911009192466736,
887
+ "learning_rate": 9.878152292351563e-05,
888
+ "loss": 0.049964362382888795,
889
+ "mean_token_accuracy": 0.9798487305641175,
890
+ "num_tokens": 16406420.0,
891
+ "step": 870
892
+ },
893
+ {
894
+ "entropy": 0.9429958581924438,
895
+ "epoch": 1.1843876177658144,
896
+ "grad_norm": 1.4877018928527832,
897
+ "learning_rate": 9.873220669457975e-05,
898
+ "loss": 0.04969423711299896,
899
+ "mean_token_accuracy": 0.9801322638988494,
900
+ "num_tokens": 16594948.0,
901
+ "step": 880
902
+ },
903
+ {
904
+ "entropy": 0.9542136013507843,
905
+ "epoch": 1.1978465679676986,
906
+ "grad_norm": 0.8030785918235779,
907
+ "learning_rate": 9.868192496671147e-05,
908
+ "loss": 0.04981146454811096,
909
+ "mean_token_accuracy": 0.9796162784099579,
910
+ "num_tokens": 16782983.0,
911
+ "step": 890
912
+ },
913
+ {
914
+ "entropy": 0.9367722630500793,
915
+ "epoch": 1.2113055181695827,
916
+ "grad_norm": 1.1169800758361816,
917
+ "learning_rate": 9.86306787361094e-05,
918
+ "loss": 0.05210963487625122,
919
+ "mean_token_accuracy": 0.9790139615535736,
920
+ "num_tokens": 16971394.0,
921
+ "step": 900
922
+ },
923
+ {
924
+ "entropy": 0.9487208306789399,
925
+ "epoch": 1.224764468371467,
926
+ "grad_norm": 0.8650282621383667,
927
+ "learning_rate": 9.857846901808117e-05,
928
+ "loss": 0.05012243390083313,
929
+ "mean_token_accuracy": 0.9805107891559601,
930
+ "num_tokens": 17159951.0,
931
+ "step": 910
932
+ },
933
+ {
934
+ "entropy": 0.9441231489181519,
935
+ "epoch": 1.2382234185733512,
936
+ "grad_norm": 0.8193967938423157,
937
+ "learning_rate": 9.852529684702329e-05,
938
+ "loss": 0.048866665363311766,
939
+ "mean_token_accuracy": 0.9807584881782532,
940
+ "num_tokens": 17348711.0,
941
+ "step": 920
942
+ },
943
+ {
944
+ "entropy": 0.9361630439758301,
945
+ "epoch": 1.2516823687752354,
946
+ "grad_norm": 1.3613612651824951,
947
+ "learning_rate": 9.847116327640082e-05,
948
+ "loss": 0.05324091911315918,
949
+ "mean_token_accuracy": 0.97904953956604,
950
+ "num_tokens": 17537116.0,
951
+ "step": 930
952
+ },
953
+ {
954
+ "entropy": 0.9562997639179229,
955
+ "epoch": 1.2651413189771197,
956
+ "grad_norm": 0.8121878504753113,
957
+ "learning_rate": 9.841606937872632e-05,
958
+ "loss": 0.05011019706726074,
959
+ "mean_token_accuracy": 0.9798731088638306,
960
+ "num_tokens": 17725567.0,
961
+ "step": 940
962
+ },
963
+ {
964
+ "entropy": 0.9359076261520386,
965
+ "epoch": 1.278600269179004,
966
+ "grad_norm": 0.7838549613952637,
967
+ "learning_rate": 9.836001624553869e-05,
968
+ "loss": 0.04834386110305786,
969
+ "mean_token_accuracy": 0.9804529249668121,
970
+ "num_tokens": 17914191.0,
971
+ "step": 950
972
+ },
973
+ {
974
+ "entropy": 0.9527446150779724,
975
+ "epoch": 1.2920592193808882,
976
+ "grad_norm": 1.0981712341308594,
977
+ "learning_rate": 9.830300498738152e-05,
978
+ "loss": 0.055077624320983884,
979
+ "mean_token_accuracy": 0.9776535391807556,
980
+ "num_tokens": 18103184.0,
981
+ "step": 960
982
+ },
983
+ {
984
+ "entropy": 0.9574812948703766,
985
+ "epoch": 1.3055181695827724,
986
+ "grad_norm": 0.8291501402854919,
987
+ "learning_rate": 9.824503673378112e-05,
988
+ "loss": 0.050153911113739014,
989
+ "mean_token_accuracy": 0.9792965054512024,
990
+ "num_tokens": 18291351.0,
991
+ "step": 970
992
+ },
993
+ {
994
+ "entropy": 0.939585280418396,
995
+ "epoch": 1.3189771197846567,
996
+ "grad_norm": 1.0864412784576416,
997
+ "learning_rate": 9.81861126332241e-05,
998
+ "loss": 0.05132197737693787,
999
+ "mean_token_accuracy": 0.9792291462421417,
1000
+ "num_tokens": 18480192.0,
1001
+ "step": 980
1002
+ },
1003
+ {
1004
+ "entropy": 0.9468406856060028,
1005
+ "epoch": 1.332436069986541,
1006
+ "grad_norm": 0.9772785902023315,
1007
+ "learning_rate": 9.812623385313461e-05,
1008
+ "loss": 0.04844954013824463,
1009
+ "mean_token_accuracy": 0.9804142415523529,
1010
+ "num_tokens": 18669028.0,
1011
+ "step": 990
1012
+ },
1013
+ {
1014
+ "entropy": 0.9413834273815155,
1015
+ "epoch": 1.3458950201884252,
1016
+ "grad_norm": 0.9020094871520996,
1017
+ "learning_rate": 9.806540157985131e-05,
1018
+ "loss": 0.05075312852859497,
1019
+ "mean_token_accuracy": 0.9790591120719909,
1020
+ "num_tokens": 18857581.0,
1021
+ "step": 1000
1022
+ },
1023
+ {
1024
+ "entropy": 0.9526253461837768,
1025
+ "epoch": 1.3593539703903095,
1026
+ "grad_norm": 1.0524990558624268,
1027
+ "learning_rate": 9.800361701860368e-05,
1028
+ "loss": 0.049737372994422914,
1029
+ "mean_token_accuracy": 0.9795652389526367,
1030
+ "num_tokens": 19046105.0,
1031
+ "step": 1010
1032
+ },
1033
+ {
1034
+ "entropy": 0.9484909176826477,
1035
+ "epoch": 1.3728129205921937,
1036
+ "grad_norm": 1.0715620517730713,
1037
+ "learning_rate": 9.794088139348835e-05,
1038
+ "loss": 0.04935494959354401,
1039
+ "mean_token_accuracy": 0.9797740161418915,
1040
+ "num_tokens": 19234730.0,
1041
+ "step": 1020
1042
+ },
1043
+ {
1044
+ "entropy": 0.9398573458194732,
1045
+ "epoch": 1.386271870794078,
1046
+ "grad_norm": 1.2134987115859985,
1047
+ "learning_rate": 9.787719594744468e-05,
1048
+ "loss": 0.0518394410610199,
1049
+ "mean_token_accuracy": 0.9793788075447083,
1050
+ "num_tokens": 19423572.0,
1051
+ "step": 1030
1052
+ },
1053
+ {
1054
+ "entropy": 0.9465648651123046,
1055
+ "epoch": 1.3997308209959622,
1056
+ "grad_norm": 1.004693627357483,
1057
+ "learning_rate": 9.781256194223023e-05,
1058
+ "loss": 0.04776117205619812,
1059
+ "mean_token_accuracy": 0.9813261866569519,
1060
+ "num_tokens": 19612465.0,
1061
+ "step": 1040
1062
+ },
1063
+ {
1064
+ "entropy": 0.9448004186153411,
1065
+ "epoch": 1.4131897711978465,
1066
+ "grad_norm": 1.2913438081741333,
1067
+ "learning_rate": 9.774698065839577e-05,
1068
+ "loss": 0.04807930588722229,
1069
+ "mean_token_accuracy": 0.9805689513683319,
1070
+ "num_tokens": 19800482.0,
1071
+ "step": 1050
1072
+ },
1073
+ {
1074
+ "entropy": 0.9425322234630584,
1075
+ "epoch": 1.4266487213997308,
1076
+ "grad_norm": 1.1068469285964966,
1077
+ "learning_rate": 9.768045339525979e-05,
1078
+ "loss": 0.05053595900535583,
1079
+ "mean_token_accuracy": 0.9794536828994751,
1080
+ "num_tokens": 19989444.0,
1081
+ "step": 1060
1082
+ },
1083
+ {
1084
+ "entropy": 0.9472232103347779,
1085
+ "epoch": 1.440107671601615,
1086
+ "grad_norm": 0.7409384250640869,
1087
+ "learning_rate": 9.76129814708829e-05,
1088
+ "loss": 0.052136778831481934,
1089
+ "mean_token_accuracy": 0.979197359085083,
1090
+ "num_tokens": 20178482.0,
1091
+ "step": 1070
1092
+ },
1093
+ {
1094
+ "entropy": 0.9450975477695465,
1095
+ "epoch": 1.4535666218034993,
1096
+ "grad_norm": 1.3440778255462646,
1097
+ "learning_rate": 9.754456622204167e-05,
1098
+ "loss": 0.05258495211601257,
1099
+ "mean_token_accuracy": 0.9798152863979339,
1100
+ "num_tokens": 20367345.0,
1101
+ "step": 1080
1102
+ },
1103
+ {
1104
+ "entropy": 0.9468007743358612,
1105
+ "epoch": 1.4670255720053835,
1106
+ "grad_norm": 0.8377829194068909,
1107
+ "learning_rate": 9.747520900420209e-05,
1108
+ "loss": 0.04893603026866913,
1109
+ "mean_token_accuracy": 0.9805733859539032,
1110
+ "num_tokens": 20555817.0,
1111
+ "step": 1090
1112
+ },
1113
+ {
1114
+ "entropy": 0.9505827724933624,
1115
+ "epoch": 1.4804845222072678,
1116
+ "grad_norm": 0.776889979839325,
1117
+ "learning_rate": 9.740491119149277e-05,
1118
+ "loss": 0.05367119312286377,
1119
+ "mean_token_accuracy": 0.9779646098613739,
1120
+ "num_tokens": 20744001.0,
1121
+ "step": 1100
1122
+ },
1123
+ {
1124
+ "entropy": 0.9516380190849304,
1125
+ "epoch": 1.493943472409152,
1126
+ "grad_norm": 0.8598276376724243,
1127
+ "learning_rate": 9.733367417667773e-05,
1128
+ "loss": 0.05071394443511963,
1129
+ "mean_token_accuracy": 0.979692417383194,
1130
+ "num_tokens": 20931715.0,
1131
+ "step": 1110
1132
+ },
1133
+ {
1134
+ "entropy": 0.9351628720760345,
1135
+ "epoch": 1.5074024226110363,
1136
+ "grad_norm": 1.3307439088821411,
1137
+ "learning_rate": 9.726149937112873e-05,
1138
+ "loss": 0.051372635364532473,
1139
+ "mean_token_accuracy": 0.9797907948493958,
1140
+ "num_tokens": 21120803.0,
1141
+ "step": 1120
1142
+ },
1143
+ {
1144
+ "entropy": 0.9268040716648102,
1145
+ "epoch": 1.5208613728129206,
1146
+ "grad_norm": 0.9320250749588013,
1147
+ "learning_rate": 9.718838820479743e-05,
1148
+ "loss": 0.04939974546432495,
1149
+ "mean_token_accuracy": 0.98038170337677,
1150
+ "num_tokens": 21309384.0,
1151
+ "step": 1130
1152
+ },
1153
+ {
1154
+ "entropy": 0.9241463243961334,
1155
+ "epoch": 1.5343203230148048,
1156
+ "grad_norm": 1.065938115119934,
1157
+ "learning_rate": 9.711434212618691e-05,
1158
+ "loss": 0.04856643378734589,
1159
+ "mean_token_accuracy": 0.9800511121749877,
1160
+ "num_tokens": 21498234.0,
1161
+ "step": 1140
1162
+ },
1163
+ {
1164
+ "entropy": 0.9350252687931061,
1165
+ "epoch": 1.547779273216689,
1166
+ "grad_norm": 0.6925032138824463,
1167
+ "learning_rate": 9.703936260232308e-05,
1168
+ "loss": 0.0512526273727417,
1169
+ "mean_token_accuracy": 0.979562646150589,
1170
+ "num_tokens": 21687119.0,
1171
+ "step": 1150
1172
+ },
1173
+ {
1174
+ "entropy": 0.9403733670711517,
1175
+ "epoch": 1.5612382234185733,
1176
+ "grad_norm": 1.2069101333618164,
1177
+ "learning_rate": 9.696345111872557e-05,
1178
+ "loss": 0.049795085191726686,
1179
+ "mean_token_accuracy": 0.9806506633758545,
1180
+ "num_tokens": 21876094.0,
1181
+ "step": 1160
1182
+ },
1183
+ {
1184
+ "entropy": 0.9370538294315338,
1185
+ "epoch": 1.5746971736204576,
1186
+ "grad_norm": 1.1759546995162964,
1187
+ "learning_rate": 9.688660917937838e-05,
1188
+ "loss": 0.05461466312408447,
1189
+ "mean_token_accuracy": 0.9776057600975037,
1190
+ "num_tokens": 22064836.0,
1191
+ "step": 1170
1192
+ },
1193
+ {
1194
+ "entropy": 0.9517778217792511,
1195
+ "epoch": 1.5881561238223418,
1196
+ "grad_norm": 0.8965149521827698,
1197
+ "learning_rate": 9.68088383066999e-05,
1198
+ "loss": 0.05289326906204224,
1199
+ "mean_token_accuracy": 0.9782088756561279,
1200
+ "num_tokens": 22253065.0,
1201
+ "step": 1180
1202
+ },
1203
+ {
1204
+ "entropy": 0.9372851848602295,
1205
+ "epoch": 1.601615074024226,
1206
+ "grad_norm": 0.7192108631134033,
1207
+ "learning_rate": 9.673014004151292e-05,
1208
+ "loss": 0.048973691463470456,
1209
+ "mean_token_accuracy": 0.9806210815906524,
1210
+ "num_tokens": 22441206.0,
1211
+ "step": 1190
1212
+ },
1213
+ {
1214
+ "entropy": 0.9122351944446564,
1215
+ "epoch": 1.6150740242261103,
1216
+ "grad_norm": 0.8763614296913147,
1217
+ "learning_rate": 9.665051594301407e-05,
1218
+ "loss": 0.0461783230304718,
1219
+ "mean_token_accuracy": 0.9817197680473327,
1220
+ "num_tokens": 22629637.0,
1221
+ "step": 1200
1222
+ },
1223
+ {
1224
+ "entropy": 0.9162654876708984,
1225
+ "epoch": 1.6285329744279946,
1226
+ "grad_norm": 0.9806778430938721,
1227
+ "learning_rate": 9.656996758874284e-05,
1228
+ "loss": 0.04827338755130768,
1229
+ "mean_token_accuracy": 0.9806499361991883,
1230
+ "num_tokens": 22818369.0,
1231
+ "step": 1210
1232
+ },
1233
+ {
1234
+ "entropy": 0.9125026524066925,
1235
+ "epoch": 1.6419919246298789,
1236
+ "grad_norm": 0.9685350656509399,
1237
+ "learning_rate": 9.648849657455044e-05,
1238
+ "loss": 0.048379385471343996,
1239
+ "mean_token_accuracy": 0.980905145406723,
1240
+ "num_tokens": 23006863.0,
1241
+ "step": 1220
1242
+ },
1243
+ {
1244
+ "entropy": 0.9119791567325592,
1245
+ "epoch": 1.6554508748317631,
1246
+ "grad_norm": 0.8717457056045532,
1247
+ "learning_rate": 9.640610451456811e-05,
1248
+ "loss": 0.050873959064483644,
1249
+ "mean_token_accuracy": 0.979601925611496,
1250
+ "num_tokens": 23195735.0,
1251
+ "step": 1230
1252
+ },
1253
+ {
1254
+ "entropy": 0.9244007229804992,
1255
+ "epoch": 1.6689098250336474,
1256
+ "grad_norm": 1.0574654340744019,
1257
+ "learning_rate": 9.632279304117517e-05,
1258
+ "loss": 0.05112813711166382,
1259
+ "mean_token_accuracy": 0.979493772983551,
1260
+ "num_tokens": 23383862.0,
1261
+ "step": 1240
1262
+ },
1263
+ {
1264
+ "entropy": 0.9225810945034028,
1265
+ "epoch": 1.6823687752355316,
1266
+ "grad_norm": 0.9628807306289673,
1267
+ "learning_rate": 9.623856380496664e-05,
1268
+ "loss": 0.050480544567108154,
1269
+ "mean_token_accuracy": 0.9797678411006927,
1270
+ "num_tokens": 23572292.0,
1271
+ "step": 1250
1272
+ },
1273
+ {
1274
+ "entropy": 0.9287579476833343,
1275
+ "epoch": 1.695827725437416,
1276
+ "grad_norm": 1.015265941619873,
1277
+ "learning_rate": 9.615341847472059e-05,
1278
+ "loss": 0.05133126378059387,
1279
+ "mean_token_accuracy": 0.9797986805438995,
1280
+ "num_tokens": 23760252.0,
1281
+ "step": 1260
1282
+ },
1283
+ {
1284
+ "entropy": 0.9133845925331116,
1285
+ "epoch": 1.7092866756393001,
1286
+ "grad_norm": 0.8996883630752563,
1287
+ "learning_rate": 9.606735873736505e-05,
1288
+ "loss": 0.05044950246810913,
1289
+ "mean_token_accuracy": 0.9795303404331207,
1290
+ "num_tokens": 23949031.0,
1291
+ "step": 1270
1292
+ },
1293
+ {
1294
+ "entropy": 0.9213698863983154,
1295
+ "epoch": 1.7227456258411844,
1296
+ "grad_norm": 1.0490264892578125,
1297
+ "learning_rate": 9.598038629794461e-05,
1298
+ "loss": 0.04797698855400086,
1299
+ "mean_token_accuracy": 0.9806023716926575,
1300
+ "num_tokens": 24137829.0,
1301
+ "step": 1280
1302
+ },
1303
+ {
1304
+ "entropy": 0.9173697710037232,
1305
+ "epoch": 1.7362045760430687,
1306
+ "grad_norm": 1.3648769855499268,
1307
+ "learning_rate": 9.589250287958657e-05,
1308
+ "loss": 0.049194514751434326,
1309
+ "mean_token_accuracy": 0.980712479352951,
1310
+ "num_tokens": 24326586.0,
1311
+ "step": 1290
1312
+ },
1313
+ {
1314
+ "entropy": 0.9242741882801055,
1315
+ "epoch": 1.749663526244953,
1316
+ "grad_norm": 1.105125069618225,
1317
+ "learning_rate": 9.580371022346693e-05,
1318
+ "loss": 0.048667973279953,
1319
+ "mean_token_accuracy": 0.9801449298858642,
1320
+ "num_tokens": 24514904.0,
1321
+ "step": 1300
1322
+ },
1323
+ {
1324
+ "entropy": 0.9162627995014191,
1325
+ "epoch": 1.7631224764468372,
1326
+ "grad_norm": 0.8887938857078552,
1327
+ "learning_rate": 9.571401008877572e-05,
1328
+ "loss": 0.04878672957420349,
1329
+ "mean_token_accuracy": 0.980469423532486,
1330
+ "num_tokens": 24703688.0,
1331
+ "step": 1310
1332
+ },
1333
+ {
1334
+ "entropy": 0.9149538338184356,
1335
+ "epoch": 1.7765814266487214,
1336
+ "grad_norm": 0.8838976621627808,
1337
+ "learning_rate": 9.562340425268233e-05,
1338
+ "loss": 0.04892318844795227,
1339
+ "mean_token_accuracy": 0.9808776795864105,
1340
+ "num_tokens": 24892107.0,
1341
+ "step": 1320
1342
+ },
1343
+ {
1344
+ "entropy": 0.914735221862793,
1345
+ "epoch": 1.7900403768506057,
1346
+ "grad_norm": 1.1152070760726929,
1347
+ "learning_rate": 9.553189451030019e-05,
1348
+ "loss": 0.04825109839439392,
1349
+ "mean_token_accuracy": 0.9804604113101959,
1350
+ "num_tokens": 25080130.0,
1351
+ "step": 1330
1352
+ },
1353
+ {
1354
+ "entropy": 0.9170725226402283,
1355
+ "epoch": 1.80349932705249,
1356
+ "grad_norm": 0.9479517340660095,
1357
+ "learning_rate": 9.543948267465115e-05,
1358
+ "loss": 0.051445144414901736,
1359
+ "mean_token_accuracy": 0.9792819261550904,
1360
+ "num_tokens": 25268652.0,
1361
+ "step": 1340
1362
+ },
1363
+ {
1364
+ "entropy": 0.9142911911010743,
1365
+ "epoch": 1.8169582772543742,
1366
+ "grad_norm": 0.8172292709350586,
1367
+ "learning_rate": 9.534617057662977e-05,
1368
+ "loss": 0.0475692093372345,
1369
+ "mean_token_accuracy": 0.9809505581855774,
1370
+ "num_tokens": 25457120.0,
1371
+ "step": 1350
1372
+ },
1373
+ {
1374
+ "entropy": 0.9019249439239502,
1375
+ "epoch": 1.8304172274562585,
1376
+ "grad_norm": 0.8183121681213379,
1377
+ "learning_rate": 9.525196006496679e-05,
1378
+ "loss": 0.04979957342147827,
1379
+ "mean_token_accuracy": 0.9799730658531189,
1380
+ "num_tokens": 25645699.0,
1381
+ "step": 1360
1382
+ },
1383
+ {
1384
+ "entropy": 0.9181066155433655,
1385
+ "epoch": 1.8438761776581427,
1386
+ "grad_norm": 0.8256579041481018,
1387
+ "learning_rate": 9.515685300619271e-05,
1388
+ "loss": 0.04996164441108704,
1389
+ "mean_token_accuracy": 0.9802247405052185,
1390
+ "num_tokens": 25834183.0,
1391
+ "step": 1370
1392
+ },
1393
+ {
1394
+ "entropy": 0.9171363770961761,
1395
+ "epoch": 1.857335127860027,
1396
+ "grad_norm": 0.9727098345756531,
1397
+ "learning_rate": 9.506085128460065e-05,
1398
+ "loss": 0.048660767078399655,
1399
+ "mean_token_accuracy": 0.9805064260959625,
1400
+ "num_tokens": 26023317.0,
1401
+ "step": 1380
1402
+ },
1403
+ {
1404
+ "entropy": 0.9112720847129822,
1405
+ "epoch": 1.8707940780619112,
1406
+ "grad_norm": 0.9826673865318298,
1407
+ "learning_rate": 9.496395680220918e-05,
1408
+ "loss": 0.04775593280792236,
1409
+ "mean_token_accuracy": 0.9809929549694061,
1410
+ "num_tokens": 26212115.0,
1411
+ "step": 1390
1412
+ },
1413
+ {
1414
+ "entropy": 0.9034272134304047,
1415
+ "epoch": 1.8842530282637955,
1416
+ "grad_norm": 0.9951306581497192,
1417
+ "learning_rate": 9.486617147872446e-05,
1418
+ "loss": 0.04939360618591308,
1419
+ "mean_token_accuracy": 0.9800347864627839,
1420
+ "num_tokens": 26401167.0,
1421
+ "step": 1400
1422
+ },
1423
+ {
1424
+ "entropy": 0.897865754365921,
1425
+ "epoch": 1.8977119784656797,
1426
+ "grad_norm": 1.0163404941558838,
1427
+ "learning_rate": 9.476749725150235e-05,
1428
+ "loss": 0.05002856254577637,
1429
+ "mean_token_accuracy": 0.9793690323829651,
1430
+ "num_tokens": 26590570.0,
1431
+ "step": 1410
1432
+ },
1433
+ {
1434
+ "entropy": 0.9004576802253723,
1435
+ "epoch": 1.911170928667564,
1436
+ "grad_norm": 0.6666616201400757,
1437
+ "learning_rate": 9.466793607550995e-05,
1438
+ "loss": 0.04905453026294708,
1439
+ "mean_token_accuracy": 0.9803856253623963,
1440
+ "num_tokens": 26779481.0,
1441
+ "step": 1420
1442
+ },
1443
+ {
1444
+ "entropy": 0.8978902101516724,
1445
+ "epoch": 1.9246298788694483,
1446
+ "grad_norm": 0.9295416474342346,
1447
+ "learning_rate": 9.45674899232869e-05,
1448
+ "loss": 0.05261261463165283,
1449
+ "mean_token_accuracy": 0.9785708487033844,
1450
+ "num_tokens": 26968262.0,
1451
+ "step": 1430
1452
+ },
1453
+ {
1454
+ "entropy": 0.8962675571441651,
1455
+ "epoch": 1.9380888290713325,
1456
+ "grad_norm": 0.88262939453125,
1457
+ "learning_rate": 9.446616078490626e-05,
1458
+ "loss": 0.04765265882015228,
1459
+ "mean_token_accuracy": 0.98052077293396,
1460
+ "num_tokens": 27157631.0,
1461
+ "step": 1440
1462
+ },
1463
+ {
1464
+ "entropy": 0.8909463047981262,
1465
+ "epoch": 1.9515477792732168,
1466
+ "grad_norm": 1.4205769300460815,
1467
+ "learning_rate": 9.436395066793518e-05,
1468
+ "loss": 0.049406200647354126,
1469
+ "mean_token_accuracy": 0.979697072505951,
1470
+ "num_tokens": 27345921.0,
1471
+ "step": 1450
1472
+ },
1473
+ {
1474
+ "entropy": 0.8969364166259766,
1475
+ "epoch": 1.965006729475101,
1476
+ "grad_norm": 0.9517145752906799,
1477
+ "learning_rate": 9.426086159739496e-05,
1478
+ "loss": 0.0510346531867981,
1479
+ "mean_token_accuracy": 0.9793386399745941,
1480
+ "num_tokens": 27534751.0,
1481
+ "step": 1460
1482
+ },
1483
+ {
1484
+ "entropy": 0.9092479169368743,
1485
+ "epoch": 1.9784656796769853,
1486
+ "grad_norm": 1.200056791305542,
1487
+ "learning_rate": 9.415689561572107e-05,
1488
+ "loss": 0.04974203705787659,
1489
+ "mean_token_accuracy": 0.9800146698951722,
1490
+ "num_tokens": 27723763.0,
1491
+ "step": 1470
1492
+ },
1493
+ {
1494
+ "entropy": 0.9059641897678375,
1495
+ "epoch": 1.9919246298788695,
1496
+ "grad_norm": 1.072447657585144,
1497
+ "learning_rate": 9.405205478272267e-05,
1498
+ "loss": 0.05065792202949524,
1499
+ "mean_token_accuracy": 0.9799223959445953,
1500
+ "num_tokens": 27911784.0,
1501
+ "step": 1480
1502
+ },
1503
+ {
1504
+ "epoch": 2.0,
1505
+ "eval_entropy": 0.9083398975384464,
1506
+ "eval_loss": 0.05432562157511711,
1507
+ "eval_mean_token_accuracy": 0.9782073607869969,
1508
+ "eval_num_tokens": 28025286.0,
1509
+ "eval_runtime": 24.4304,
1510
+ "eval_samples_per_second": 204.663,
1511
+ "eval_steps_per_second": 6.426,
1512
+ "step": 1486
1513
+ }
1514
+ ],
1515
+ "logging_steps": 10,
1516
+ "max_steps": 7430,
1517
+ "num_input_tokens_seen": 0,
1518
+ "num_train_epochs": 10,
1519
+ "save_steps": 500,
1520
+ "stateful_callbacks": {
1521
+ "TrainerControl": {
1522
+ "args": {
1523
+ "should_epoch_stop": false,
1524
+ "should_evaluate": false,
1525
+ "should_log": false,
1526
+ "should_save": true,
1527
+ "should_training_stop": false
1528
+ },
1529
+ "attributes": {}
1530
+ }
1531
+ },
1532
+ "total_flos": 1.354349504005931e+18,
1533
+ "train_batch_size": 32,
1534
+ "trial_name": null,
1535
+ "trial_params": null
1536
+ }
baby_talk_L16_a50/seed_42/checkpoint-1486/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c060e97b69d99564c146471c3d3ac4d335e3b1968074124f4edc5aebf612e1e3
3
+ size 5368
baby_talk_L16_a50/seed_42/checkpoint-2229/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
baby_talk_L16_a50/seed_42/checkpoint-2229/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "o_proj",
35
+ "q_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "k_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
baby_talk_L16_a50/seed_42/checkpoint-2229/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4856a0dd4492eb4739022e51bb114447d83247dbcdb18ebd5b5ff49f7386c5e5
3
+ size 80792096
baby_talk_L16_a50/seed_42/checkpoint-2229/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
baby_talk_L16_a50/seed_42/checkpoint-2229/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
baby_talk_L16_a50/seed_42/checkpoint-2229/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
baby_talk_L16_a50/seed_42/checkpoint-2229/trainer_state.json ADDED
@@ -0,0 +1,2287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2229,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2358420491218567,
14
+ "epoch": 0.013458950201884253,
15
+ "grad_norm": 3.2940118312835693,
16
+ "learning_rate": 2.4193548387096776e-06,
17
+ "loss": 0.550364351272583,
18
+ "mean_token_accuracy": 0.8554959416389465,
19
+ "num_tokens": 188811.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.2353882431983947,
24
+ "epoch": 0.026917900403768506,
25
+ "grad_norm": 3.4476470947265625,
26
+ "learning_rate": 5.1075268817204305e-06,
27
+ "loss": 0.5143545627593994,
28
+ "mean_token_accuracy": 0.8613634884357453,
29
+ "num_tokens": 377729.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.2403772234916688,
34
+ "epoch": 0.040376850605652756,
35
+ "grad_norm": 2.2756996154785156,
36
+ "learning_rate": 7.795698924731183e-06,
37
+ "loss": 0.3996511220932007,
38
+ "mean_token_accuracy": 0.8753438770771027,
39
+ "num_tokens": 566562.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.2205921411514282,
44
+ "epoch": 0.05383580080753701,
45
+ "grad_norm": 1.2432096004486084,
46
+ "learning_rate": 1.0483870967741936e-05,
47
+ "loss": 0.2568032264709473,
48
+ "mean_token_accuracy": 0.9130991995334625,
49
+ "num_tokens": 755026.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.221834623813629,
54
+ "epoch": 0.06729475100942127,
55
+ "grad_norm": 0.8531327843666077,
56
+ "learning_rate": 1.3172043010752688e-05,
57
+ "loss": 0.20193097591400147,
58
+ "mean_token_accuracy": 0.9274256646633148,
59
+ "num_tokens": 943494.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.2245031952857972,
64
+ "epoch": 0.08075370121130551,
65
+ "grad_norm": 0.5026484131813049,
66
+ "learning_rate": 1.586021505376344e-05,
67
+ "loss": 0.171803081035614,
68
+ "mean_token_accuracy": 0.9363821744918823,
69
+ "num_tokens": 1131731.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.2192703008651733,
74
+ "epoch": 0.09421265141318977,
75
+ "grad_norm": 0.5228630304336548,
76
+ "learning_rate": 1.8548387096774193e-05,
77
+ "loss": 0.15698516368865967,
78
+ "mean_token_accuracy": 0.9423282980918884,
79
+ "num_tokens": 1320258.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.2153660774230957,
84
+ "epoch": 0.10767160161507403,
85
+ "grad_norm": 0.3650094270706177,
86
+ "learning_rate": 2.1236559139784946e-05,
87
+ "loss": 0.14900912046432496,
88
+ "mean_token_accuracy": 0.9437524616718292,
89
+ "num_tokens": 1509209.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.2123230814933776,
94
+ "epoch": 0.12113055181695828,
95
+ "grad_norm": 0.45723631978034973,
96
+ "learning_rate": 2.39247311827957e-05,
97
+ "loss": 0.1399540901184082,
98
+ "mean_token_accuracy": 0.9474983811378479,
99
+ "num_tokens": 1698139.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.208789598941803,
104
+ "epoch": 0.13458950201884254,
105
+ "grad_norm": 0.4575304687023163,
106
+ "learning_rate": 2.661290322580645e-05,
107
+ "loss": 0.12566736936569214,
108
+ "mean_token_accuracy": 0.9529488801956176,
109
+ "num_tokens": 1886384.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.205567193031311,
114
+ "epoch": 0.1480484522207268,
115
+ "grad_norm": 0.8033406734466553,
116
+ "learning_rate": 2.9301075268817207e-05,
117
+ "loss": 0.12005312442779541,
118
+ "mean_token_accuracy": 0.9533569395542145,
119
+ "num_tokens": 2074919.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.20180287361145,
124
+ "epoch": 0.16150740242261102,
125
+ "grad_norm": 0.6989286541938782,
126
+ "learning_rate": 3.198924731182796e-05,
127
+ "loss": 0.11197478771209717,
128
+ "mean_token_accuracy": 0.9572584748268127,
129
+ "num_tokens": 2263116.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.202919363975525,
134
+ "epoch": 0.17496635262449528,
135
+ "grad_norm": 0.7233495116233826,
136
+ "learning_rate": 3.467741935483872e-05,
137
+ "loss": 0.11106340885162354,
138
+ "mean_token_accuracy": 0.9569663584232331,
139
+ "num_tokens": 2451273.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.1907272219657898,
144
+ "epoch": 0.18842530282637954,
145
+ "grad_norm": 0.7645091414451599,
146
+ "learning_rate": 3.736559139784947e-05,
147
+ "loss": 0.10956189632415772,
148
+ "mean_token_accuracy": 0.95842245221138,
149
+ "num_tokens": 2640657.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.1895484924316406,
154
+ "epoch": 0.2018842530282638,
155
+ "grad_norm": 0.841379702091217,
156
+ "learning_rate": 4.005376344086022e-05,
157
+ "loss": 0.10442907810211181,
158
+ "mean_token_accuracy": 0.9603263795375824,
159
+ "num_tokens": 2829351.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.1871973633766175,
164
+ "epoch": 0.21534320323014805,
165
+ "grad_norm": 0.8552286624908447,
166
+ "learning_rate": 4.2741935483870973e-05,
167
+ "loss": 0.10493810176849365,
168
+ "mean_token_accuracy": 0.9597454965114594,
169
+ "num_tokens": 3018290.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.1907334923744202,
174
+ "epoch": 0.2288021534320323,
175
+ "grad_norm": 0.7243770360946655,
176
+ "learning_rate": 4.543010752688172e-05,
177
+ "loss": 0.10288643836975098,
178
+ "mean_token_accuracy": 0.9603340923786163,
179
+ "num_tokens": 3206538.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.1835299730300903,
184
+ "epoch": 0.24226110363391656,
185
+ "grad_norm": 0.8118091225624084,
186
+ "learning_rate": 4.811827956989248e-05,
187
+ "loss": 0.09798368811607361,
188
+ "mean_token_accuracy": 0.9619723737239838,
189
+ "num_tokens": 3395374.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.1794602513313293,
194
+ "epoch": 0.2557200538358008,
195
+ "grad_norm": 0.7447699904441833,
196
+ "learning_rate": 5.080645161290323e-05,
197
+ "loss": 0.09498158693313599,
198
+ "mean_token_accuracy": 0.9634931206703186,
199
+ "num_tokens": 3584273.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.1834724426269532,
204
+ "epoch": 0.2691790040376851,
205
+ "grad_norm": 1.4059171676635742,
206
+ "learning_rate": 5.349462365591398e-05,
207
+ "loss": 0.09400172233581543,
208
+ "mean_token_accuracy": 0.9630160868167877,
209
+ "num_tokens": 3772857.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.186821174621582,
214
+ "epoch": 0.28263795423956933,
215
+ "grad_norm": 0.8273025751113892,
216
+ "learning_rate": 5.618279569892473e-05,
217
+ "loss": 0.09464811086654663,
218
+ "mean_token_accuracy": 0.9631786167621612,
219
+ "num_tokens": 3960877.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.1769920349121095,
224
+ "epoch": 0.2960969044414536,
225
+ "grad_norm": 1.0182609558105469,
226
+ "learning_rate": 5.887096774193549e-05,
227
+ "loss": 0.0914128303527832,
228
+ "mean_token_accuracy": 0.9644896507263183,
229
+ "num_tokens": 4149444.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.1719188809394836,
234
+ "epoch": 0.30955585464333785,
235
+ "grad_norm": 0.9648745656013489,
236
+ "learning_rate": 6.155913978494624e-05,
237
+ "loss": 0.0920255422592163,
238
+ "mean_token_accuracy": 0.963964831829071,
239
+ "num_tokens": 4337989.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.1559369206428527,
244
+ "epoch": 0.32301480484522205,
245
+ "grad_norm": 0.853049635887146,
246
+ "learning_rate": 6.4247311827957e-05,
247
+ "loss": 0.0891042947769165,
248
+ "mean_token_accuracy": 0.9653984010219574,
249
+ "num_tokens": 4527256.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.1604587316513062,
254
+ "epoch": 0.3364737550471063,
255
+ "grad_norm": 0.873772144317627,
256
+ "learning_rate": 6.693548387096774e-05,
257
+ "loss": 0.09071275591850281,
258
+ "mean_token_accuracy": 0.9651397407054901,
259
+ "num_tokens": 4715878.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.1534931778907775,
264
+ "epoch": 0.34993270524899056,
265
+ "grad_norm": 1.087850570678711,
266
+ "learning_rate": 6.962365591397851e-05,
267
+ "loss": 0.08719289302825928,
268
+ "mean_token_accuracy": 0.9658753871917725,
269
+ "num_tokens": 4904527.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.1629684448242188,
274
+ "epoch": 0.3633916554508748,
275
+ "grad_norm": 0.9014195799827576,
276
+ "learning_rate": 7.231182795698926e-05,
277
+ "loss": 0.08716133236885071,
278
+ "mean_token_accuracy": 0.965560519695282,
279
+ "num_tokens": 5093554.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.148916518688202,
284
+ "epoch": 0.3768506056527591,
285
+ "grad_norm": 0.9712215662002563,
286
+ "learning_rate": 7.500000000000001e-05,
287
+ "loss": 0.08668915033340455,
288
+ "mean_token_accuracy": 0.9658140063285827,
289
+ "num_tokens": 5282224.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.1493291974067688,
294
+ "epoch": 0.39030955585464333,
295
+ "grad_norm": 0.9085242748260498,
296
+ "learning_rate": 7.768817204301076e-05,
297
+ "loss": 0.08189771175384522,
298
+ "mean_token_accuracy": 0.9670268416404724,
299
+ "num_tokens": 5471308.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.1406704187393188,
304
+ "epoch": 0.4037685060565276,
305
+ "grad_norm": 1.128177285194397,
306
+ "learning_rate": 8.037634408602151e-05,
307
+ "loss": 0.08119879961013794,
308
+ "mean_token_accuracy": 0.9675639867782593,
309
+ "num_tokens": 5660026.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.1369357347488402,
314
+ "epoch": 0.41722745625841184,
315
+ "grad_norm": 0.8224227428436279,
316
+ "learning_rate": 8.306451612903227e-05,
317
+ "loss": 0.07979745864868164,
318
+ "mean_token_accuracy": 0.9681445300579071,
319
+ "num_tokens": 5848073.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.1252583503723144,
324
+ "epoch": 0.4306864064602961,
325
+ "grad_norm": 0.7711160182952881,
326
+ "learning_rate": 8.575268817204302e-05,
327
+ "loss": 0.0783164381980896,
328
+ "mean_token_accuracy": 0.9686201930046081,
329
+ "num_tokens": 6036717.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.1203404307365417,
334
+ "epoch": 0.44414535666218036,
335
+ "grad_norm": 1.1810287237167358,
336
+ "learning_rate": 8.844086021505377e-05,
337
+ "loss": 0.08139073848724365,
338
+ "mean_token_accuracy": 0.9681365489959717,
339
+ "num_tokens": 6225223.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.1222687840461731,
344
+ "epoch": 0.4576043068640646,
345
+ "grad_norm": 1.4551713466644287,
346
+ "learning_rate": 9.112903225806452e-05,
347
+ "loss": 0.0820135235786438,
348
+ "mean_token_accuracy": 0.9672703862190246,
349
+ "num_tokens": 6413936.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.1156280279159545,
354
+ "epoch": 0.47106325706594887,
355
+ "grad_norm": 0.8573716878890991,
356
+ "learning_rate": 9.381720430107528e-05,
357
+ "loss": 0.08074904680252075,
358
+ "mean_token_accuracy": 0.9677663922309876,
359
+ "num_tokens": 6602413.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.1063185572624206,
364
+ "epoch": 0.4845222072678331,
365
+ "grad_norm": 0.7709434628486633,
366
+ "learning_rate": 9.650537634408603e-05,
367
+ "loss": 0.07549421787261963,
368
+ "mean_token_accuracy": 0.9689356207847595,
369
+ "num_tokens": 6790873.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.0962031960487366,
374
+ "epoch": 0.4979811574697174,
375
+ "grad_norm": 0.7843625545501709,
376
+ "learning_rate": 9.919354838709678e-05,
377
+ "loss": 0.07368478775024415,
378
+ "mean_token_accuracy": 0.9700294613838196,
379
+ "num_tokens": 6979392.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.1028530240058898,
384
+ "epoch": 0.5114401076716016,
385
+ "grad_norm": 0.7935485243797302,
386
+ "learning_rate": 9.999975729865971e-05,
387
+ "loss": 0.07766538262367248,
388
+ "mean_token_accuracy": 0.9704003691673279,
389
+ "num_tokens": 7167482.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.0905393958091736,
394
+ "epoch": 0.5248990578734859,
395
+ "grad_norm": 1.4108924865722656,
396
+ "learning_rate": 9.999856856307314e-05,
397
+ "loss": 0.0760004162788391,
398
+ "mean_token_accuracy": 0.9699020266532898,
399
+ "num_tokens": 7355994.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.0853531122207642,
404
+ "epoch": 0.5383580080753702,
405
+ "grad_norm": 0.9043511152267456,
406
+ "learning_rate": 9.999638923896533e-05,
407
+ "loss": 0.0720310389995575,
408
+ "mean_token_accuracy": 0.9707890212535858,
409
+ "num_tokens": 7544655.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.075773572921753,
414
+ "epoch": 0.5518169582772544,
415
+ "grad_norm": 0.9038823246955872,
416
+ "learning_rate": 9.999321936951374e-05,
417
+ "loss": 0.07026209831237792,
418
+ "mean_token_accuracy": 0.9715417385101318,
419
+ "num_tokens": 7733348.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.0572428584098816,
424
+ "epoch": 0.5652759084791387,
425
+ "grad_norm": 0.8231707811355591,
426
+ "learning_rate": 9.998905901752091e-05,
427
+ "loss": 0.07141299843788147,
428
+ "mean_token_accuracy": 0.9705908715724945,
429
+ "num_tokens": 7921582.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 1.051485300064087,
434
+ "epoch": 0.5787348586810229,
435
+ "grad_norm": 1.1695454120635986,
436
+ "learning_rate": 9.998390826541315e-05,
437
+ "loss": 0.07321611642837525,
438
+ "mean_token_accuracy": 0.9709623396396637,
439
+ "num_tokens": 8110266.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 1.0485445380210876,
444
+ "epoch": 0.5921938088829072,
445
+ "grad_norm": 0.7291010618209839,
446
+ "learning_rate": 9.997776721523888e-05,
447
+ "loss": 0.07221676707267762,
448
+ "mean_token_accuracy": 0.9705081820487976,
449
+ "num_tokens": 8298932.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 1.0456495046615601,
454
+ "epoch": 0.6056527590847914,
455
+ "grad_norm": 0.6550094485282898,
456
+ "learning_rate": 9.99706359886667e-05,
457
+ "loss": 0.06878133416175843,
458
+ "mean_token_accuracy": 0.9728739261627197,
459
+ "num_tokens": 8487613.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 1.0372861266136169,
464
+ "epoch": 0.6191117092866757,
465
+ "grad_norm": 0.5825450420379639,
466
+ "learning_rate": 9.996251472698281e-05,
467
+ "loss": 0.06706151366233826,
468
+ "mean_token_accuracy": 0.9732721030712128,
469
+ "num_tokens": 8676276.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 1.0319493532180786,
474
+ "epoch": 0.6325706594885598,
475
+ "grad_norm": 0.7158534526824951,
476
+ "learning_rate": 9.995340359108844e-05,
477
+ "loss": 0.06999597549438477,
478
+ "mean_token_accuracy": 0.9726741492748261,
479
+ "num_tokens": 8864738.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 1.0260030388832093,
484
+ "epoch": 0.6460296096904441,
485
+ "grad_norm": 0.7918373346328735,
486
+ "learning_rate": 9.994330276149649e-05,
487
+ "loss": 0.06949877142906188,
488
+ "mean_token_accuracy": 0.9725882947444916,
489
+ "num_tokens": 9053582.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 1.0258127093315124,
494
+ "epoch": 0.6594885598923284,
495
+ "grad_norm": 0.893281102180481,
496
+ "learning_rate": 9.993221243832797e-05,
497
+ "loss": 0.06893026828765869,
498
+ "mean_token_accuracy": 0.9729171216487884,
499
+ "num_tokens": 9241914.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 1.0261828184127808,
504
+ "epoch": 0.6729475100942126,
505
+ "grad_norm": 1.0127108097076416,
506
+ "learning_rate": 9.992013284130816e-05,
507
+ "loss": 0.07094801664352417,
508
+ "mean_token_accuracy": 0.9714575052261353,
509
+ "num_tokens": 9430981.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 1.0274562597274781,
514
+ "epoch": 0.6864064602960969,
515
+ "grad_norm": 0.7148029208183289,
516
+ "learning_rate": 9.990706420976206e-05,
517
+ "loss": 0.06826171875,
518
+ "mean_token_accuracy": 0.9727248430252076,
519
+ "num_tokens": 9619472.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 1.0137859225273131,
524
+ "epoch": 0.6998654104979811,
525
+ "grad_norm": 0.9228634238243103,
526
+ "learning_rate": 9.989300680260985e-05,
527
+ "loss": 0.06890587210655212,
528
+ "mean_token_accuracy": 0.9723304331302642,
529
+ "num_tokens": 9808123.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 1.0221150755882262,
534
+ "epoch": 0.7133243606998654,
535
+ "grad_norm": 0.971530556678772,
536
+ "learning_rate": 9.98779608983616e-05,
537
+ "loss": 0.07073599100112915,
538
+ "mean_token_accuracy": 0.9714174270629883,
539
+ "num_tokens": 9996601.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 1.0248154640197753,
544
+ "epoch": 0.7267833109017496,
545
+ "grad_norm": 0.7336317896842957,
546
+ "learning_rate": 9.986192679511189e-05,
547
+ "loss": 0.06874136924743653,
548
+ "mean_token_accuracy": 0.9723432004451752,
549
+ "num_tokens": 10184725.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 1.0120978832244873,
554
+ "epoch": 0.7402422611036339,
555
+ "grad_norm": 1.2100600004196167,
556
+ "learning_rate": 9.984490481053372e-05,
557
+ "loss": 0.06270487308502197,
558
+ "mean_token_accuracy": 0.9756880521774292,
559
+ "num_tokens": 10373446.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 1.0172406077384948,
564
+ "epoch": 0.7537012113055181,
565
+ "grad_norm": 0.7562853693962097,
566
+ "learning_rate": 9.982689528187244e-05,
567
+ "loss": 0.06784560084342957,
568
+ "mean_token_accuracy": 0.9724333345890045,
569
+ "num_tokens": 10561710.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 1.0150038957595826,
574
+ "epoch": 0.7671601615074024,
575
+ "grad_norm": 1.07966148853302,
576
+ "learning_rate": 9.98078985659389e-05,
577
+ "loss": 0.0670344054698944,
578
+ "mean_token_accuracy": 0.97339146733284,
579
+ "num_tokens": 10749917.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 1.0135640978813172,
584
+ "epoch": 0.7806191117092867,
585
+ "grad_norm": 0.7826104164123535,
586
+ "learning_rate": 9.978791503910246e-05,
587
+ "loss": 0.0668565571308136,
588
+ "mean_token_accuracy": 0.9735289216041565,
589
+ "num_tokens": 10938098.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 1.0077669620513916,
594
+ "epoch": 0.7940780619111709,
595
+ "grad_norm": 1.299584150314331,
596
+ "learning_rate": 9.97669450972835e-05,
597
+ "loss": 0.0728976845741272,
598
+ "mean_token_accuracy": 0.9701269209384918,
599
+ "num_tokens": 11126107.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 0.9957692861557007,
604
+ "epoch": 0.8075370121130552,
605
+ "grad_norm": 1.1542484760284424,
606
+ "learning_rate": 9.974498915594557e-05,
607
+ "loss": 0.0631720781326294,
608
+ "mean_token_accuracy": 0.9747781097888947,
609
+ "num_tokens": 11315001.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 0.9983992159366608,
614
+ "epoch": 0.8209959623149394,
615
+ "grad_norm": 0.6967246532440186,
616
+ "learning_rate": 9.97220476500872e-05,
617
+ "loss": 0.06333768963813782,
618
+ "mean_token_accuracy": 0.9750322341918946,
619
+ "num_tokens": 11503007.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 0.9801111698150635,
624
+ "epoch": 0.8344549125168237,
625
+ "grad_norm": 0.945446789264679,
626
+ "learning_rate": 9.969812103423325e-05,
627
+ "loss": 0.05720087289810181,
628
+ "mean_token_accuracy": 0.9767442524433136,
629
+ "num_tokens": 11691883.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 0.9879570543766022,
634
+ "epoch": 0.847913862718708,
635
+ "grad_norm": 0.8254193663597107,
636
+ "learning_rate": 9.967320978242592e-05,
637
+ "loss": 0.05899171829223633,
638
+ "mean_token_accuracy": 0.9767756819725036,
639
+ "num_tokens": 11880353.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 0.9878240942955017,
644
+ "epoch": 0.8613728129205922,
645
+ "grad_norm": 0.9882538914680481,
646
+ "learning_rate": 9.964731438821533e-05,
647
+ "loss": 0.06443996429443359,
648
+ "mean_token_accuracy": 0.9735406100749969,
649
+ "num_tokens": 12069610.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 0.9938213169574738,
654
+ "epoch": 0.8748317631224765,
655
+ "grad_norm": 1.126546859741211,
656
+ "learning_rate": 9.962043536464978e-05,
657
+ "loss": 0.06708416938781739,
658
+ "mean_token_accuracy": 0.9730359137058258,
659
+ "num_tokens": 12257901.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 0.9800443708896637,
664
+ "epoch": 0.8882907133243607,
665
+ "grad_norm": 0.9151445031166077,
666
+ "learning_rate": 9.959257324426556e-05,
667
+ "loss": 0.06290764808654785,
668
+ "mean_token_accuracy": 0.9741653084754944,
669
+ "num_tokens": 12446236.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 0.9744794130325317,
674
+ "epoch": 0.901749663526245,
675
+ "grad_norm": 0.9104028940200806,
676
+ "learning_rate": 9.95637285790764e-05,
677
+ "loss": 0.060464882850646974,
678
+ "mean_token_accuracy": 0.9757490694522858,
679
+ "num_tokens": 12635226.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 0.9786687552928924,
684
+ "epoch": 0.9152086137281292,
685
+ "grad_norm": 0.7316192388534546,
686
+ "learning_rate": 9.953390194056258e-05,
687
+ "loss": 0.06054847836494446,
688
+ "mean_token_accuracy": 0.9760404825210571,
689
+ "num_tokens": 12823428.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 0.966060334444046,
694
+ "epoch": 0.9286675639300135,
695
+ "grad_norm": 1.3535135984420776,
696
+ "learning_rate": 9.950309391965947e-05,
697
+ "loss": 0.061383575201034546,
698
+ "mean_token_accuracy": 0.9749173820018768,
699
+ "num_tokens": 13012362.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 0.9722849190235138,
704
+ "epoch": 0.9421265141318977,
705
+ "grad_norm": 0.890771746635437,
706
+ "learning_rate": 9.947130512674602e-05,
707
+ "loss": 0.06301190257072449,
708
+ "mean_token_accuracy": 0.9750476598739624,
709
+ "num_tokens": 13200680.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 0.9624020338058472,
714
+ "epoch": 0.955585464333782,
715
+ "grad_norm": 1.142499327659607,
716
+ "learning_rate": 9.943853619163255e-05,
717
+ "loss": 0.06179196834564209,
718
+ "mean_token_accuracy": 0.9751901209354401,
719
+ "num_tokens": 13389333.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 0.9653747022151947,
724
+ "epoch": 0.9690444145356663,
725
+ "grad_norm": 0.8682493567466736,
726
+ "learning_rate": 9.94047877635482e-05,
727
+ "loss": 0.06113170981407166,
728
+ "mean_token_accuracy": 0.9748325288295746,
729
+ "num_tokens": 13578210.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 0.970348197221756,
734
+ "epoch": 0.9825033647375505,
735
+ "grad_norm": 0.8351430296897888,
736
+ "learning_rate": 9.93700605111283e-05,
737
+ "loss": 0.05901788473129273,
738
+ "mean_token_accuracy": 0.9761226296424865,
739
+ "num_tokens": 13767039.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 0.956724613904953,
744
+ "epoch": 0.9959623149394348,
745
+ "grad_norm": 1.2604819536209106,
746
+ "learning_rate": 9.933435512240084e-05,
747
+ "loss": 0.06124393343925476,
748
+ "mean_token_accuracy": 0.9747833967208862,
749
+ "num_tokens": 13956057.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "epoch": 1.0,
754
+ "eval_entropy": 0.9722227849018802,
755
+ "eval_loss": 0.0600070059299469,
756
+ "eval_mean_token_accuracy": 0.975752676368519,
757
+ "eval_num_tokens": 14012630.0,
758
+ "eval_runtime": 24.6215,
759
+ "eval_samples_per_second": 203.074,
760
+ "eval_steps_per_second": 6.377,
761
+ "step": 743
762
+ },
763
+ {
764
+ "entropy": 0.9689933776855468,
765
+ "epoch": 1.009421265141319,
766
+ "grad_norm": 0.9720287919044495,
767
+ "learning_rate": 9.929767230477305e-05,
768
+ "loss": 0.055563896894454956,
769
+ "mean_token_accuracy": 0.9779708027839661,
770
+ "num_tokens": 14144652.0,
771
+ "step": 750
772
+ },
773
+ {
774
+ "entropy": 0.9506655633449554,
775
+ "epoch": 1.0228802153432033,
776
+ "grad_norm": 0.6619595289230347,
777
+ "learning_rate": 9.92600127850173e-05,
778
+ "loss": 0.050587379932403566,
779
+ "mean_token_accuracy": 0.979685264825821,
780
+ "num_tokens": 14332714.0,
781
+ "step": 760
782
+ },
783
+ {
784
+ "entropy": 0.9458732306957245,
785
+ "epoch": 1.0363391655450875,
786
+ "grad_norm": 0.7905108332633972,
787
+ "learning_rate": 9.922137730925673e-05,
788
+ "loss": 0.05069155097007751,
789
+ "mean_token_accuracy": 0.9796596229076385,
790
+ "num_tokens": 14520902.0,
791
+ "step": 770
792
+ },
793
+ {
794
+ "entropy": 0.9320916712284089,
795
+ "epoch": 1.0497981157469718,
796
+ "grad_norm": 0.9210566878318787,
797
+ "learning_rate": 9.918176664295041e-05,
798
+ "loss": 0.051744121313095096,
799
+ "mean_token_accuracy": 0.9792148530483246,
800
+ "num_tokens": 14709215.0,
801
+ "step": 780
802
+ },
803
+ {
804
+ "entropy": 0.928934782743454,
805
+ "epoch": 1.063257065948856,
806
+ "grad_norm": 0.7442036867141724,
807
+ "learning_rate": 9.914118157087824e-05,
808
+ "loss": 0.0486875057220459,
809
+ "mean_token_accuracy": 0.9798122465610504,
810
+ "num_tokens": 14898242.0,
811
+ "step": 790
812
+ },
813
+ {
814
+ "entropy": 0.942881977558136,
815
+ "epoch": 1.0767160161507403,
816
+ "grad_norm": 1.0910941362380981,
817
+ "learning_rate": 9.909962289712538e-05,
818
+ "loss": 0.052297019958496095,
819
+ "mean_token_accuracy": 0.9793200254440307,
820
+ "num_tokens": 15086835.0,
821
+ "step": 800
822
+ },
823
+ {
824
+ "entropy": 0.9503917813301086,
825
+ "epoch": 1.0901749663526246,
826
+ "grad_norm": 0.8091554641723633,
827
+ "learning_rate": 9.905709144506629e-05,
828
+ "loss": 0.049927744269371035,
829
+ "mean_token_accuracy": 0.9802082359790802,
830
+ "num_tokens": 15275042.0,
831
+ "step": 810
832
+ },
833
+ {
834
+ "entropy": 0.9426830470561981,
835
+ "epoch": 1.1036339165545088,
836
+ "grad_norm": 1.0589245557785034,
837
+ "learning_rate": 9.901358805734846e-05,
838
+ "loss": 0.053803551197052005,
839
+ "mean_token_accuracy": 0.9785399675369263,
840
+ "num_tokens": 15463923.0,
841
+ "step": 820
842
+ },
843
+ {
844
+ "entropy": 0.9445010781288147,
845
+ "epoch": 1.117092866756393,
846
+ "grad_norm": 0.8410441279411316,
847
+ "learning_rate": 9.89691135958757e-05,
848
+ "loss": 0.053181976079940796,
849
+ "mean_token_accuracy": 0.9792523324489594,
850
+ "num_tokens": 15652526.0,
851
+ "step": 830
852
+ },
853
+ {
854
+ "entropy": 0.946729838848114,
855
+ "epoch": 1.1305518169582773,
856
+ "grad_norm": 0.8246080279350281,
857
+ "learning_rate": 9.892366894179105e-05,
858
+ "loss": 0.054291915893554685,
859
+ "mean_token_accuracy": 0.9791357696056366,
860
+ "num_tokens": 15841340.0,
861
+ "step": 840
862
+ },
863
+ {
864
+ "entropy": 0.9433728814125061,
865
+ "epoch": 1.1440107671601616,
866
+ "grad_norm": 0.9200296401977539,
867
+ "learning_rate": 9.887725499545937e-05,
868
+ "loss": 0.05073915719985962,
869
+ "mean_token_accuracy": 0.9796931505203247,
870
+ "num_tokens": 16029478.0,
871
+ "step": 850
872
+ },
873
+ {
874
+ "entropy": 0.9494417488574982,
875
+ "epoch": 1.1574697173620458,
876
+ "grad_norm": 0.8342758417129517,
877
+ "learning_rate": 9.882987267644939e-05,
878
+ "loss": 0.050929927825927736,
879
+ "mean_token_accuracy": 0.9793219089508056,
880
+ "num_tokens": 16218014.0,
881
+ "step": 860
882
+ },
883
+ {
884
+ "entropy": 0.9409256100654602,
885
+ "epoch": 1.17092866756393,
886
+ "grad_norm": 0.7911009192466736,
887
+ "learning_rate": 9.878152292351563e-05,
888
+ "loss": 0.049964362382888795,
889
+ "mean_token_accuracy": 0.9798487305641175,
890
+ "num_tokens": 16406420.0,
891
+ "step": 870
892
+ },
893
+ {
894
+ "entropy": 0.9429958581924438,
895
+ "epoch": 1.1843876177658144,
896
+ "grad_norm": 1.4877018928527832,
897
+ "learning_rate": 9.873220669457975e-05,
898
+ "loss": 0.04969423711299896,
899
+ "mean_token_accuracy": 0.9801322638988494,
900
+ "num_tokens": 16594948.0,
901
+ "step": 880
902
+ },
903
+ {
904
+ "entropy": 0.9542136013507843,
905
+ "epoch": 1.1978465679676986,
906
+ "grad_norm": 0.8030785918235779,
907
+ "learning_rate": 9.868192496671147e-05,
908
+ "loss": 0.04981146454811096,
909
+ "mean_token_accuracy": 0.9796162784099579,
910
+ "num_tokens": 16782983.0,
911
+ "step": 890
912
+ },
913
+ {
914
+ "entropy": 0.9367722630500793,
915
+ "epoch": 1.2113055181695827,
916
+ "grad_norm": 1.1169800758361816,
917
+ "learning_rate": 9.86306787361094e-05,
918
+ "loss": 0.05210963487625122,
919
+ "mean_token_accuracy": 0.9790139615535736,
920
+ "num_tokens": 16971394.0,
921
+ "step": 900
922
+ },
923
+ {
924
+ "entropy": 0.9487208306789399,
925
+ "epoch": 1.224764468371467,
926
+ "grad_norm": 0.8650282621383667,
927
+ "learning_rate": 9.857846901808117e-05,
928
+ "loss": 0.05012243390083313,
929
+ "mean_token_accuracy": 0.9805107891559601,
930
+ "num_tokens": 17159951.0,
931
+ "step": 910
932
+ },
933
+ {
934
+ "entropy": 0.9441231489181519,
935
+ "epoch": 1.2382234185733512,
936
+ "grad_norm": 0.8193967938423157,
937
+ "learning_rate": 9.852529684702329e-05,
938
+ "loss": 0.048866665363311766,
939
+ "mean_token_accuracy": 0.9807584881782532,
940
+ "num_tokens": 17348711.0,
941
+ "step": 920
942
+ },
943
+ {
944
+ "entropy": 0.9361630439758301,
945
+ "epoch": 1.2516823687752354,
946
+ "grad_norm": 1.3613612651824951,
947
+ "learning_rate": 9.847116327640082e-05,
948
+ "loss": 0.05324091911315918,
949
+ "mean_token_accuracy": 0.97904953956604,
950
+ "num_tokens": 17537116.0,
951
+ "step": 930
952
+ },
953
+ {
954
+ "entropy": 0.9562997639179229,
955
+ "epoch": 1.2651413189771197,
956
+ "grad_norm": 0.8121878504753113,
957
+ "learning_rate": 9.841606937872632e-05,
958
+ "loss": 0.05011019706726074,
959
+ "mean_token_accuracy": 0.9798731088638306,
960
+ "num_tokens": 17725567.0,
961
+ "step": 940
962
+ },
963
+ {
964
+ "entropy": 0.9359076261520386,
965
+ "epoch": 1.278600269179004,
966
+ "grad_norm": 0.7838549613952637,
967
+ "learning_rate": 9.836001624553869e-05,
968
+ "loss": 0.04834386110305786,
969
+ "mean_token_accuracy": 0.9804529249668121,
970
+ "num_tokens": 17914191.0,
971
+ "step": 950
972
+ },
973
+ {
974
+ "entropy": 0.9527446150779724,
975
+ "epoch": 1.2920592193808882,
976
+ "grad_norm": 1.0981712341308594,
977
+ "learning_rate": 9.830300498738152e-05,
978
+ "loss": 0.055077624320983884,
979
+ "mean_token_accuracy": 0.9776535391807556,
980
+ "num_tokens": 18103184.0,
981
+ "step": 960
982
+ },
983
+ {
984
+ "entropy": 0.9574812948703766,
985
+ "epoch": 1.3055181695827724,
986
+ "grad_norm": 0.8291501402854919,
987
+ "learning_rate": 9.824503673378112e-05,
988
+ "loss": 0.050153911113739014,
989
+ "mean_token_accuracy": 0.9792965054512024,
990
+ "num_tokens": 18291351.0,
991
+ "step": 970
992
+ },
993
+ {
994
+ "entropy": 0.939585280418396,
995
+ "epoch": 1.3189771197846567,
996
+ "grad_norm": 1.0864412784576416,
997
+ "learning_rate": 9.81861126332241e-05,
998
+ "loss": 0.05132197737693787,
999
+ "mean_token_accuracy": 0.9792291462421417,
1000
+ "num_tokens": 18480192.0,
1001
+ "step": 980
1002
+ },
1003
+ {
1004
+ "entropy": 0.9468406856060028,
1005
+ "epoch": 1.332436069986541,
1006
+ "grad_norm": 0.9772785902023315,
1007
+ "learning_rate": 9.812623385313461e-05,
1008
+ "loss": 0.04844954013824463,
1009
+ "mean_token_accuracy": 0.9804142415523529,
1010
+ "num_tokens": 18669028.0,
1011
+ "step": 990
1012
+ },
1013
+ {
1014
+ "entropy": 0.9413834273815155,
1015
+ "epoch": 1.3458950201884252,
1016
+ "grad_norm": 0.9020094871520996,
1017
+ "learning_rate": 9.806540157985131e-05,
1018
+ "loss": 0.05075312852859497,
1019
+ "mean_token_accuracy": 0.9790591120719909,
1020
+ "num_tokens": 18857581.0,
1021
+ "step": 1000
1022
+ },
1023
+ {
1024
+ "entropy": 0.9526253461837768,
1025
+ "epoch": 1.3593539703903095,
1026
+ "grad_norm": 1.0524990558624268,
1027
+ "learning_rate": 9.800361701860368e-05,
1028
+ "loss": 0.049737372994422914,
1029
+ "mean_token_accuracy": 0.9795652389526367,
1030
+ "num_tokens": 19046105.0,
1031
+ "step": 1010
1032
+ },
1033
+ {
1034
+ "entropy": 0.9484909176826477,
1035
+ "epoch": 1.3728129205921937,
1036
+ "grad_norm": 1.0715620517730713,
1037
+ "learning_rate": 9.794088139348835e-05,
1038
+ "loss": 0.04935494959354401,
1039
+ "mean_token_accuracy": 0.9797740161418915,
1040
+ "num_tokens": 19234730.0,
1041
+ "step": 1020
1042
+ },
1043
+ {
1044
+ "entropy": 0.9398573458194732,
1045
+ "epoch": 1.386271870794078,
1046
+ "grad_norm": 1.2134987115859985,
1047
+ "learning_rate": 9.787719594744468e-05,
1048
+ "loss": 0.0518394410610199,
1049
+ "mean_token_accuracy": 0.9793788075447083,
1050
+ "num_tokens": 19423572.0,
1051
+ "step": 1030
1052
+ },
1053
+ {
1054
+ "entropy": 0.9465648651123046,
1055
+ "epoch": 1.3997308209959622,
1056
+ "grad_norm": 1.004693627357483,
1057
+ "learning_rate": 9.781256194223023e-05,
1058
+ "loss": 0.04776117205619812,
1059
+ "mean_token_accuracy": 0.9813261866569519,
1060
+ "num_tokens": 19612465.0,
1061
+ "step": 1040
1062
+ },
1063
+ {
1064
+ "entropy": 0.9448004186153411,
1065
+ "epoch": 1.4131897711978465,
1066
+ "grad_norm": 1.2913438081741333,
1067
+ "learning_rate": 9.774698065839577e-05,
1068
+ "loss": 0.04807930588722229,
1069
+ "mean_token_accuracy": 0.9805689513683319,
1070
+ "num_tokens": 19800482.0,
1071
+ "step": 1050
1072
+ },
1073
+ {
1074
+ "entropy": 0.9425322234630584,
1075
+ "epoch": 1.4266487213997308,
1076
+ "grad_norm": 1.1068469285964966,
1077
+ "learning_rate": 9.768045339525979e-05,
1078
+ "loss": 0.05053595900535583,
1079
+ "mean_token_accuracy": 0.9794536828994751,
1080
+ "num_tokens": 19989444.0,
1081
+ "step": 1060
1082
+ },
1083
+ {
1084
+ "entropy": 0.9472232103347779,
1085
+ "epoch": 1.440107671601615,
1086
+ "grad_norm": 0.7409384250640869,
1087
+ "learning_rate": 9.76129814708829e-05,
1088
+ "loss": 0.052136778831481934,
1089
+ "mean_token_accuracy": 0.979197359085083,
1090
+ "num_tokens": 20178482.0,
1091
+ "step": 1070
1092
+ },
1093
+ {
1094
+ "entropy": 0.9450975477695465,
1095
+ "epoch": 1.4535666218034993,
1096
+ "grad_norm": 1.3440778255462646,
1097
+ "learning_rate": 9.754456622204167e-05,
1098
+ "loss": 0.05258495211601257,
1099
+ "mean_token_accuracy": 0.9798152863979339,
1100
+ "num_tokens": 20367345.0,
1101
+ "step": 1080
1102
+ },
1103
+ {
1104
+ "entropy": 0.9468007743358612,
1105
+ "epoch": 1.4670255720053835,
1106
+ "grad_norm": 0.8377829194068909,
1107
+ "learning_rate": 9.747520900420209e-05,
1108
+ "loss": 0.04893603026866913,
1109
+ "mean_token_accuracy": 0.9805733859539032,
1110
+ "num_tokens": 20555817.0,
1111
+ "step": 1090
1112
+ },
1113
+ {
1114
+ "entropy": 0.9505827724933624,
1115
+ "epoch": 1.4804845222072678,
1116
+ "grad_norm": 0.776889979839325,
1117
+ "learning_rate": 9.740491119149277e-05,
1118
+ "loss": 0.05367119312286377,
1119
+ "mean_token_accuracy": 0.9779646098613739,
1120
+ "num_tokens": 20744001.0,
1121
+ "step": 1100
1122
+ },
1123
+ {
1124
+ "entropy": 0.9516380190849304,
1125
+ "epoch": 1.493943472409152,
1126
+ "grad_norm": 0.8598276376724243,
1127
+ "learning_rate": 9.733367417667773e-05,
1128
+ "loss": 0.05071394443511963,
1129
+ "mean_token_accuracy": 0.979692417383194,
1130
+ "num_tokens": 20931715.0,
1131
+ "step": 1110
1132
+ },
1133
+ {
1134
+ "entropy": 0.9351628720760345,
1135
+ "epoch": 1.5074024226110363,
1136
+ "grad_norm": 1.3307439088821411,
1137
+ "learning_rate": 9.726149937112873e-05,
1138
+ "loss": 0.051372635364532473,
1139
+ "mean_token_accuracy": 0.9797907948493958,
1140
+ "num_tokens": 21120803.0,
1141
+ "step": 1120
1142
+ },
1143
+ {
1144
+ "entropy": 0.9268040716648102,
1145
+ "epoch": 1.5208613728129206,
1146
+ "grad_norm": 0.9320250749588013,
1147
+ "learning_rate": 9.718838820479743e-05,
1148
+ "loss": 0.04939974546432495,
1149
+ "mean_token_accuracy": 0.98038170337677,
1150
+ "num_tokens": 21309384.0,
1151
+ "step": 1130
1152
+ },
1153
+ {
1154
+ "entropy": 0.9241463243961334,
1155
+ "epoch": 1.5343203230148048,
1156
+ "grad_norm": 1.065938115119934,
1157
+ "learning_rate": 9.711434212618691e-05,
1158
+ "loss": 0.04856643378734589,
1159
+ "mean_token_accuracy": 0.9800511121749877,
1160
+ "num_tokens": 21498234.0,
1161
+ "step": 1140
1162
+ },
1163
+ {
1164
+ "entropy": 0.9350252687931061,
1165
+ "epoch": 1.547779273216689,
1166
+ "grad_norm": 0.6925032138824463,
1167
+ "learning_rate": 9.703936260232308e-05,
1168
+ "loss": 0.0512526273727417,
1169
+ "mean_token_accuracy": 0.979562646150589,
1170
+ "num_tokens": 21687119.0,
1171
+ "step": 1150
1172
+ },
1173
+ {
1174
+ "entropy": 0.9403733670711517,
1175
+ "epoch": 1.5612382234185733,
1176
+ "grad_norm": 1.2069101333618164,
1177
+ "learning_rate": 9.696345111872557e-05,
1178
+ "loss": 0.049795085191726686,
1179
+ "mean_token_accuracy": 0.9806506633758545,
1180
+ "num_tokens": 21876094.0,
1181
+ "step": 1160
1182
+ },
1183
+ {
1184
+ "entropy": 0.9370538294315338,
1185
+ "epoch": 1.5746971736204576,
1186
+ "grad_norm": 1.1759546995162964,
1187
+ "learning_rate": 9.688660917937838e-05,
1188
+ "loss": 0.05461466312408447,
1189
+ "mean_token_accuracy": 0.9776057600975037,
1190
+ "num_tokens": 22064836.0,
1191
+ "step": 1170
1192
+ },
1193
+ {
1194
+ "entropy": 0.9517778217792511,
1195
+ "epoch": 1.5881561238223418,
1196
+ "grad_norm": 0.8965149521827698,
1197
+ "learning_rate": 9.68088383066999e-05,
1198
+ "loss": 0.05289326906204224,
1199
+ "mean_token_accuracy": 0.9782088756561279,
1200
+ "num_tokens": 22253065.0,
1201
+ "step": 1180
1202
+ },
1203
+ {
1204
+ "entropy": 0.9372851848602295,
1205
+ "epoch": 1.601615074024226,
1206
+ "grad_norm": 0.7192108631134033,
1207
+ "learning_rate": 9.673014004151292e-05,
1208
+ "loss": 0.048973691463470456,
1209
+ "mean_token_accuracy": 0.9806210815906524,
1210
+ "num_tokens": 22441206.0,
1211
+ "step": 1190
1212
+ },
1213
+ {
1214
+ "entropy": 0.9122351944446564,
1215
+ "epoch": 1.6150740242261103,
1216
+ "grad_norm": 0.8763614296913147,
1217
+ "learning_rate": 9.665051594301407e-05,
1218
+ "loss": 0.0461783230304718,
1219
+ "mean_token_accuracy": 0.9817197680473327,
1220
+ "num_tokens": 22629637.0,
1221
+ "step": 1200
1222
+ },
1223
+ {
1224
+ "entropy": 0.9162654876708984,
1225
+ "epoch": 1.6285329744279946,
1226
+ "grad_norm": 0.9806778430938721,
1227
+ "learning_rate": 9.656996758874284e-05,
1228
+ "loss": 0.04827338755130768,
1229
+ "mean_token_accuracy": 0.9806499361991883,
1230
+ "num_tokens": 22818369.0,
1231
+ "step": 1210
1232
+ },
1233
+ {
1234
+ "entropy": 0.9125026524066925,
1235
+ "epoch": 1.6419919246298789,
1236
+ "grad_norm": 0.9685350656509399,
1237
+ "learning_rate": 9.648849657455044e-05,
1238
+ "loss": 0.048379385471343996,
1239
+ "mean_token_accuracy": 0.980905145406723,
1240
+ "num_tokens": 23006863.0,
1241
+ "step": 1220
1242
+ },
1243
+ {
1244
+ "entropy": 0.9119791567325592,
1245
+ "epoch": 1.6554508748317631,
1246
+ "grad_norm": 0.8717457056045532,
1247
+ "learning_rate": 9.640610451456811e-05,
1248
+ "loss": 0.050873959064483644,
1249
+ "mean_token_accuracy": 0.979601925611496,
1250
+ "num_tokens": 23195735.0,
1251
+ "step": 1230
1252
+ },
1253
+ {
1254
+ "entropy": 0.9244007229804992,
1255
+ "epoch": 1.6689098250336474,
1256
+ "grad_norm": 1.0574654340744019,
1257
+ "learning_rate": 9.632279304117517e-05,
1258
+ "loss": 0.05112813711166382,
1259
+ "mean_token_accuracy": 0.979493772983551,
1260
+ "num_tokens": 23383862.0,
1261
+ "step": 1240
1262
+ },
1263
+ {
1264
+ "entropy": 0.9225810945034028,
1265
+ "epoch": 1.6823687752355316,
1266
+ "grad_norm": 0.9628807306289673,
1267
+ "learning_rate": 9.623856380496664e-05,
1268
+ "loss": 0.050480544567108154,
1269
+ "mean_token_accuracy": 0.9797678411006927,
1270
+ "num_tokens": 23572292.0,
1271
+ "step": 1250
1272
+ },
1273
+ {
1274
+ "entropy": 0.9287579476833343,
1275
+ "epoch": 1.695827725437416,
1276
+ "grad_norm": 1.015265941619873,
1277
+ "learning_rate": 9.615341847472059e-05,
1278
+ "loss": 0.05133126378059387,
1279
+ "mean_token_accuracy": 0.9797986805438995,
1280
+ "num_tokens": 23760252.0,
1281
+ "step": 1260
1282
+ },
1283
+ {
1284
+ "entropy": 0.9133845925331116,
1285
+ "epoch": 1.7092866756393001,
1286
+ "grad_norm": 0.8996883630752563,
1287
+ "learning_rate": 9.606735873736505e-05,
1288
+ "loss": 0.05044950246810913,
1289
+ "mean_token_accuracy": 0.9795303404331207,
1290
+ "num_tokens": 23949031.0,
1291
+ "step": 1270
1292
+ },
1293
+ {
1294
+ "entropy": 0.9213698863983154,
1295
+ "epoch": 1.7227456258411844,
1296
+ "grad_norm": 1.0490264892578125,
1297
+ "learning_rate": 9.598038629794461e-05,
1298
+ "loss": 0.04797698855400086,
1299
+ "mean_token_accuracy": 0.9806023716926575,
1300
+ "num_tokens": 24137829.0,
1301
+ "step": 1280
1302
+ },
1303
+ {
1304
+ "entropy": 0.9173697710037232,
1305
+ "epoch": 1.7362045760430687,
1306
+ "grad_norm": 1.3648769855499268,
1307
+ "learning_rate": 9.589250287958657e-05,
1308
+ "loss": 0.049194514751434326,
1309
+ "mean_token_accuracy": 0.980712479352951,
1310
+ "num_tokens": 24326586.0,
1311
+ "step": 1290
1312
+ },
1313
+ {
1314
+ "entropy": 0.9242741882801055,
1315
+ "epoch": 1.749663526244953,
1316
+ "grad_norm": 1.105125069618225,
1317
+ "learning_rate": 9.580371022346693e-05,
1318
+ "loss": 0.048667973279953,
1319
+ "mean_token_accuracy": 0.9801449298858642,
1320
+ "num_tokens": 24514904.0,
1321
+ "step": 1300
1322
+ },
1323
+ {
1324
+ "entropy": 0.9162627995014191,
1325
+ "epoch": 1.7631224764468372,
1326
+ "grad_norm": 0.8887938857078552,
1327
+ "learning_rate": 9.571401008877572e-05,
1328
+ "loss": 0.04878672957420349,
1329
+ "mean_token_accuracy": 0.980469423532486,
1330
+ "num_tokens": 24703688.0,
1331
+ "step": 1310
1332
+ },
1333
+ {
1334
+ "entropy": 0.9149538338184356,
1335
+ "epoch": 1.7765814266487214,
1336
+ "grad_norm": 0.8838976621627808,
1337
+ "learning_rate": 9.562340425268233e-05,
1338
+ "loss": 0.04892318844795227,
1339
+ "mean_token_accuracy": 0.9808776795864105,
1340
+ "num_tokens": 24892107.0,
1341
+ "step": 1320
1342
+ },
1343
+ {
1344
+ "entropy": 0.914735221862793,
1345
+ "epoch": 1.7900403768506057,
1346
+ "grad_norm": 1.1152070760726929,
1347
+ "learning_rate": 9.553189451030019e-05,
1348
+ "loss": 0.04825109839439392,
1349
+ "mean_token_accuracy": 0.9804604113101959,
1350
+ "num_tokens": 25080130.0,
1351
+ "step": 1330
1352
+ },
1353
+ {
1354
+ "entropy": 0.9170725226402283,
1355
+ "epoch": 1.80349932705249,
1356
+ "grad_norm": 0.9479517340660095,
1357
+ "learning_rate": 9.543948267465115e-05,
1358
+ "loss": 0.051445144414901736,
1359
+ "mean_token_accuracy": 0.9792819261550904,
1360
+ "num_tokens": 25268652.0,
1361
+ "step": 1340
1362
+ },
1363
+ {
1364
+ "entropy": 0.9142911911010743,
1365
+ "epoch": 1.8169582772543742,
1366
+ "grad_norm": 0.8172292709350586,
1367
+ "learning_rate": 9.534617057662977e-05,
1368
+ "loss": 0.0475692093372345,
1369
+ "mean_token_accuracy": 0.9809505581855774,
1370
+ "num_tokens": 25457120.0,
1371
+ "step": 1350
1372
+ },
1373
+ {
1374
+ "entropy": 0.9019249439239502,
1375
+ "epoch": 1.8304172274562585,
1376
+ "grad_norm": 0.8183121681213379,
1377
+ "learning_rate": 9.525196006496679e-05,
1378
+ "loss": 0.04979957342147827,
1379
+ "mean_token_accuracy": 0.9799730658531189,
1380
+ "num_tokens": 25645699.0,
1381
+ "step": 1360
1382
+ },
1383
+ {
1384
+ "entropy": 0.9181066155433655,
1385
+ "epoch": 1.8438761776581427,
1386
+ "grad_norm": 0.8256579041481018,
1387
+ "learning_rate": 9.515685300619271e-05,
1388
+ "loss": 0.04996164441108704,
1389
+ "mean_token_accuracy": 0.9802247405052185,
1390
+ "num_tokens": 25834183.0,
1391
+ "step": 1370
1392
+ },
1393
+ {
1394
+ "entropy": 0.9171363770961761,
1395
+ "epoch": 1.857335127860027,
1396
+ "grad_norm": 0.9727098345756531,
1397
+ "learning_rate": 9.506085128460065e-05,
1398
+ "loss": 0.048660767078399655,
1399
+ "mean_token_accuracy": 0.9805064260959625,
1400
+ "num_tokens": 26023317.0,
1401
+ "step": 1380
1402
+ },
1403
+ {
1404
+ "entropy": 0.9112720847129822,
1405
+ "epoch": 1.8707940780619112,
1406
+ "grad_norm": 0.9826673865318298,
1407
+ "learning_rate": 9.496395680220918e-05,
1408
+ "loss": 0.04775593280792236,
1409
+ "mean_token_accuracy": 0.9809929549694061,
1410
+ "num_tokens": 26212115.0,
1411
+ "step": 1390
1412
+ },
1413
+ {
1414
+ "entropy": 0.9034272134304047,
1415
+ "epoch": 1.8842530282637955,
1416
+ "grad_norm": 0.9951306581497192,
1417
+ "learning_rate": 9.486617147872446e-05,
1418
+ "loss": 0.04939360618591308,
1419
+ "mean_token_accuracy": 0.9800347864627839,
1420
+ "num_tokens": 26401167.0,
1421
+ "step": 1400
1422
+ },
1423
+ {
1424
+ "entropy": 0.897865754365921,
1425
+ "epoch": 1.8977119784656797,
1426
+ "grad_norm": 1.0163404941558838,
1427
+ "learning_rate": 9.476749725150235e-05,
1428
+ "loss": 0.05002856254577637,
1429
+ "mean_token_accuracy": 0.9793690323829651,
1430
+ "num_tokens": 26590570.0,
1431
+ "step": 1410
1432
+ },
1433
+ {
1434
+ "entropy": 0.9004576802253723,
1435
+ "epoch": 1.911170928667564,
1436
+ "grad_norm": 0.6666616201400757,
1437
+ "learning_rate": 9.466793607550995e-05,
1438
+ "loss": 0.04905453026294708,
1439
+ "mean_token_accuracy": 0.9803856253623963,
1440
+ "num_tokens": 26779481.0,
1441
+ "step": 1420
1442
+ },
1443
+ {
1444
+ "entropy": 0.8978902101516724,
1445
+ "epoch": 1.9246298788694483,
1446
+ "grad_norm": 0.9295416474342346,
1447
+ "learning_rate": 9.45674899232869e-05,
1448
+ "loss": 0.05261261463165283,
1449
+ "mean_token_accuracy": 0.9785708487033844,
1450
+ "num_tokens": 26968262.0,
1451
+ "step": 1430
1452
+ },
1453
+ {
1454
+ "entropy": 0.8962675571441651,
1455
+ "epoch": 1.9380888290713325,
1456
+ "grad_norm": 0.88262939453125,
1457
+ "learning_rate": 9.446616078490626e-05,
1458
+ "loss": 0.04765265882015228,
1459
+ "mean_token_accuracy": 0.98052077293396,
1460
+ "num_tokens": 27157631.0,
1461
+ "step": 1440
1462
+ },
1463
+ {
1464
+ "entropy": 0.8909463047981262,
1465
+ "epoch": 1.9515477792732168,
1466
+ "grad_norm": 1.4205769300460815,
1467
+ "learning_rate": 9.436395066793518e-05,
1468
+ "loss": 0.049406200647354126,
1469
+ "mean_token_accuracy": 0.979697072505951,
1470
+ "num_tokens": 27345921.0,
1471
+ "step": 1450
1472
+ },
1473
+ {
1474
+ "entropy": 0.8969364166259766,
1475
+ "epoch": 1.965006729475101,
1476
+ "grad_norm": 0.9517145752906799,
1477
+ "learning_rate": 9.426086159739496e-05,
1478
+ "loss": 0.0510346531867981,
1479
+ "mean_token_accuracy": 0.9793386399745941,
1480
+ "num_tokens": 27534751.0,
1481
+ "step": 1460
1482
+ },
1483
+ {
1484
+ "entropy": 0.9092479169368743,
1485
+ "epoch": 1.9784656796769853,
1486
+ "grad_norm": 1.200056791305542,
1487
+ "learning_rate": 9.415689561572107e-05,
1488
+ "loss": 0.04974203705787659,
1489
+ "mean_token_accuracy": 0.9800146698951722,
1490
+ "num_tokens": 27723763.0,
1491
+ "step": 1470
1492
+ },
1493
+ {
1494
+ "entropy": 0.9059641897678375,
1495
+ "epoch": 1.9919246298788695,
1496
+ "grad_norm": 1.072447657585144,
1497
+ "learning_rate": 9.405205478272267e-05,
1498
+ "loss": 0.05065792202949524,
1499
+ "mean_token_accuracy": 0.9799223959445953,
1500
+ "num_tokens": 27911784.0,
1501
+ "step": 1480
1502
+ },
1503
+ {
1504
+ "epoch": 2.0,
1505
+ "eval_entropy": 0.9083398975384464,
1506
+ "eval_loss": 0.05432562157511711,
1507
+ "eval_mean_token_accuracy": 0.9782073607869969,
1508
+ "eval_num_tokens": 28025286.0,
1509
+ "eval_runtime": 24.4304,
1510
+ "eval_samples_per_second": 204.663,
1511
+ "eval_steps_per_second": 6.426,
1512
+ "step": 1486
1513
+ },
1514
+ {
1515
+ "entropy": 0.9113237977027893,
1516
+ "epoch": 2.005383580080754,
1517
+ "grad_norm": 1.0167063474655151,
1518
+ "learning_rate": 9.394634117554173e-05,
1519
+ "loss": 0.044942992925643924,
1520
+ "mean_token_accuracy": 0.9818171083927154,
1521
+ "num_tokens": 28100906.0,
1522
+ "step": 1490
1523
+ },
1524
+ {
1525
+ "entropy": 0.8780008792877197,
1526
+ "epoch": 2.018842530282638,
1527
+ "grad_norm": 1.9552241563796997,
1528
+ "learning_rate": 9.38397568886119e-05,
1529
+ "loss": 0.036248764395713805,
1530
+ "mean_token_accuracy": 0.9852245450019836,
1531
+ "num_tokens": 28289445.0,
1532
+ "step": 1500
1533
+ },
1534
+ {
1535
+ "entropy": 0.8721176147460937,
1536
+ "epoch": 2.0323014804845223,
1537
+ "grad_norm": 0.8951388001441956,
1538
+ "learning_rate": 9.373230403361712e-05,
1539
+ "loss": 0.037987279891967776,
1540
+ "mean_token_accuracy": 0.9850614190101623,
1541
+ "num_tokens": 28477972.0,
1542
+ "step": 1510
1543
+ },
1544
+ {
1545
+ "entropy": 0.8922657191753387,
1546
+ "epoch": 2.0457604306864066,
1547
+ "grad_norm": 0.9028105735778809,
1548
+ "learning_rate": 9.362398473944958e-05,
1549
+ "loss": 0.03588914275169373,
1550
+ "mean_token_accuracy": 0.9863102614879609,
1551
+ "num_tokens": 28666272.0,
1552
+ "step": 1520
1553
+ },
1554
+ {
1555
+ "entropy": 0.877799642086029,
1556
+ "epoch": 2.059219380888291,
1557
+ "grad_norm": 1.153886318206787,
1558
+ "learning_rate": 9.35148011521677e-05,
1559
+ "loss": 0.03577309250831604,
1560
+ "mean_token_accuracy": 0.9863148391246795,
1561
+ "num_tokens": 28854780.0,
1562
+ "step": 1530
1563
+ },
1564
+ {
1565
+ "entropy": 0.8708971381187439,
1566
+ "epoch": 2.072678331090175,
1567
+ "grad_norm": 1.042955994606018,
1568
+ "learning_rate": 9.340475543495364e-05,
1569
+ "loss": 0.038266432285308835,
1570
+ "mean_token_accuracy": 0.9848419308662415,
1571
+ "num_tokens": 29043741.0,
1572
+ "step": 1540
1573
+ },
1574
+ {
1575
+ "entropy": 0.8800721943378449,
1576
+ "epoch": 2.0861372812920593,
1577
+ "grad_norm": 0.6789060235023499,
1578
+ "learning_rate": 9.329384976807023e-05,
1579
+ "loss": 0.032948991656303404,
1580
+ "mean_token_accuracy": 0.9871293902397156,
1581
+ "num_tokens": 29232719.0,
1582
+ "step": 1550
1583
+ },
1584
+ {
1585
+ "entropy": 0.8743580460548401,
1586
+ "epoch": 2.0995962314939436,
1587
+ "grad_norm": 1.1780879497528076,
1588
+ "learning_rate": 9.318208634881802e-05,
1589
+ "loss": 0.036874374747276305,
1590
+ "mean_token_accuracy": 0.9859182178974152,
1591
+ "num_tokens": 29421817.0,
1592
+ "step": 1560
1593
+ },
1594
+ {
1595
+ "entropy": 0.8851303637027741,
1596
+ "epoch": 2.113055181695828,
1597
+ "grad_norm": 0.747734785079956,
1598
+ "learning_rate": 9.306946739149161e-05,
1599
+ "loss": 0.0364631175994873,
1600
+ "mean_token_accuracy": 0.9862085223197937,
1601
+ "num_tokens": 29610344.0,
1602
+ "step": 1570
1603
+ },
1604
+ {
1605
+ "entropy": 0.8798732101917267,
1606
+ "epoch": 2.126514131897712,
1607
+ "grad_norm": 1.5001860857009888,
1608
+ "learning_rate": 9.29559951273358e-05,
1609
+ "loss": 0.03813003897666931,
1610
+ "mean_token_accuracy": 0.9852604746818543,
1611
+ "num_tokens": 29798997.0,
1612
+ "step": 1580
1613
+ },
1614
+ {
1615
+ "entropy": 0.8797551989555359,
1616
+ "epoch": 2.1399730820995964,
1617
+ "grad_norm": 0.9593478441238403,
1618
+ "learning_rate": 9.284167180450141e-05,
1619
+ "loss": 0.0394927829504013,
1620
+ "mean_token_accuracy": 0.984604275226593,
1621
+ "num_tokens": 29987809.0,
1622
+ "step": 1590
1623
+ },
1624
+ {
1625
+ "entropy": 0.8860546290874481,
1626
+ "epoch": 2.1534320323014806,
1627
+ "grad_norm": 0.8347703218460083,
1628
+ "learning_rate": 9.272649968800069e-05,
1629
+ "loss": 0.036699697375297546,
1630
+ "mean_token_accuracy": 0.985380882024765,
1631
+ "num_tokens": 30176234.0,
1632
+ "step": 1600
1633
+ },
1634
+ {
1635
+ "entropy": 0.8783667802810669,
1636
+ "epoch": 2.166890982503365,
1637
+ "grad_norm": 1.1154481172561646,
1638
+ "learning_rate": 9.26104810596625e-05,
1639
+ "loss": 0.03756999969482422,
1640
+ "mean_token_accuracy": 0.9853451430797577,
1641
+ "num_tokens": 30364657.0,
1642
+ "step": 1610
1643
+ },
1644
+ {
1645
+ "entropy": 0.8684478521347045,
1646
+ "epoch": 2.180349932705249,
1647
+ "grad_norm": 0.7515475153923035,
1648
+ "learning_rate": 9.249361821808708e-05,
1649
+ "loss": 0.0382376104593277,
1650
+ "mean_token_accuracy": 0.9854029655456543,
1651
+ "num_tokens": 30552904.0,
1652
+ "step": 1620
1653
+ },
1654
+ {
1655
+ "entropy": 0.8636964917182922,
1656
+ "epoch": 2.1938088829071334,
1657
+ "grad_norm": 0.7711939215660095,
1658
+ "learning_rate": 9.237591347860052e-05,
1659
+ "loss": 0.036220991611480714,
1660
+ "mean_token_accuracy": 0.9860713243484497,
1661
+ "num_tokens": 30741259.0,
1662
+ "step": 1630
1663
+ },
1664
+ {
1665
+ "entropy": 0.8592016279697419,
1666
+ "epoch": 2.2072678331090176,
1667
+ "grad_norm": 1.1143887042999268,
1668
+ "learning_rate": 9.225736917320886e-05,
1669
+ "loss": 0.036316031217575075,
1670
+ "mean_token_accuracy": 0.985757052898407,
1671
+ "num_tokens": 30930144.0,
1672
+ "step": 1640
1673
+ },
1674
+ {
1675
+ "entropy": 0.8671079576015472,
1676
+ "epoch": 2.220726783310902,
1677
+ "grad_norm": 0.8980015516281128,
1678
+ "learning_rate": 9.213798765055187e-05,
1679
+ "loss": 0.03822658061981201,
1680
+ "mean_token_accuracy": 0.9847764372825623,
1681
+ "num_tokens": 31118624.0,
1682
+ "step": 1650
1683
+ },
1684
+ {
1685
+ "entropy": 0.8785548269748688,
1686
+ "epoch": 2.234185733512786,
1687
+ "grad_norm": 1.094323992729187,
1688
+ "learning_rate": 9.20177712758566e-05,
1689
+ "loss": 0.03736622333526611,
1690
+ "mean_token_accuracy": 0.9850081980228425,
1691
+ "num_tokens": 31306833.0,
1692
+ "step": 1660
1693
+ },
1694
+ {
1695
+ "entropy": 0.866977310180664,
1696
+ "epoch": 2.2476446837146704,
1697
+ "grad_norm": 0.8372092843055725,
1698
+ "learning_rate": 9.189672243089046e-05,
1699
+ "loss": 0.0401554524898529,
1700
+ "mean_token_accuracy": 0.9840337932109833,
1701
+ "num_tokens": 31495503.0,
1702
+ "step": 1670
1703
+ },
1704
+ {
1705
+ "entropy": 0.8787918210029602,
1706
+ "epoch": 2.2611036339165547,
1707
+ "grad_norm": 1.5408164262771606,
1708
+ "learning_rate": 9.177484351391402e-05,
1709
+ "loss": 0.0368030846118927,
1710
+ "mean_token_accuracy": 0.9847234487533569,
1711
+ "num_tokens": 31683865.0,
1712
+ "step": 1680
1713
+ },
1714
+ {
1715
+ "entropy": 0.872721153497696,
1716
+ "epoch": 2.274562584118439,
1717
+ "grad_norm": 1.1115421056747437,
1718
+ "learning_rate": 9.165213693963355e-05,
1719
+ "loss": 0.037859299778938295,
1720
+ "mean_token_accuracy": 0.9851646661758423,
1721
+ "num_tokens": 31871903.0,
1722
+ "step": 1690
1723
+ },
1724
+ {
1725
+ "entropy": 0.8774094223976135,
1726
+ "epoch": 2.288021534320323,
1727
+ "grad_norm": 1.0301331281661987,
1728
+ "learning_rate": 9.152860513915314e-05,
1729
+ "loss": 0.038671016693115234,
1730
+ "mean_token_accuracy": 0.9840058028697968,
1731
+ "num_tokens": 32060500.0,
1732
+ "step": 1700
1733
+ },
1734
+ {
1735
+ "entropy": 0.889606237411499,
1736
+ "epoch": 2.3014804845222074,
1737
+ "grad_norm": 0.8601903915405273,
1738
+ "learning_rate": 9.140425055992648e-05,
1739
+ "loss": 0.039603835344314574,
1740
+ "mean_token_accuracy": 0.9840235590934754,
1741
+ "num_tokens": 32248744.0,
1742
+ "step": 1710
1743
+ },
1744
+ {
1745
+ "entropy": 0.886734277009964,
1746
+ "epoch": 2.3149394347240917,
1747
+ "grad_norm": 1.1029839515686035,
1748
+ "learning_rate": 9.127907566570853e-05,
1749
+ "loss": 0.039513933658599856,
1750
+ "mean_token_accuracy": 0.9844573020935059,
1751
+ "num_tokens": 32437640.0,
1752
+ "step": 1720
1753
+ },
1754
+ {
1755
+ "entropy": 0.8843017637729644,
1756
+ "epoch": 2.328398384925976,
1757
+ "grad_norm": 1.2545154094696045,
1758
+ "learning_rate": 9.115308293650653e-05,
1759
+ "loss": 0.036970189213752745,
1760
+ "mean_token_accuracy": 0.985239815711975,
1761
+ "num_tokens": 32625986.0,
1762
+ "step": 1730
1763
+ },
1764
+ {
1765
+ "entropy": 0.8913422048091888,
1766
+ "epoch": 2.34185733512786,
1767
+ "grad_norm": 0.9613803625106812,
1768
+ "learning_rate": 9.102627486853099e-05,
1769
+ "loss": 0.03811657428741455,
1770
+ "mean_token_accuracy": 0.9852804243564606,
1771
+ "num_tokens": 32814531.0,
1772
+ "step": 1740
1773
+ },
1774
+ {
1775
+ "entropy": 0.8907467782497406,
1776
+ "epoch": 2.3553162853297445,
1777
+ "grad_norm": 1.1811398267745972,
1778
+ "learning_rate": 9.089865397414614e-05,
1779
+ "loss": 0.03903660774230957,
1780
+ "mean_token_accuracy": 0.9842524945735931,
1781
+ "num_tokens": 33002960.0,
1782
+ "step": 1750
1783
+ },
1784
+ {
1785
+ "entropy": 0.8833664715290069,
1786
+ "epoch": 2.3687752355316287,
1787
+ "grad_norm": 0.8338477611541748,
1788
+ "learning_rate": 9.077022278182024e-05,
1789
+ "loss": 0.03982087969779968,
1790
+ "mean_token_accuracy": 0.9841201484203339,
1791
+ "num_tokens": 33191183.0,
1792
+ "step": 1760
1793
+ },
1794
+ {
1795
+ "entropy": 0.8783826470375061,
1796
+ "epoch": 2.382234185733513,
1797
+ "grad_norm": 0.6904510259628296,
1798
+ "learning_rate": 9.064098383607545e-05,
1799
+ "loss": 0.03699290752410889,
1800
+ "mean_token_accuracy": 0.9854120731353759,
1801
+ "num_tokens": 33379798.0,
1802
+ "step": 1770
1803
+ },
1804
+ {
1805
+ "entropy": 0.8871048510074615,
1806
+ "epoch": 2.3956931359353972,
1807
+ "grad_norm": 1.009539246559143,
1808
+ "learning_rate": 9.051093969743738e-05,
1809
+ "loss": 0.03926805555820465,
1810
+ "mean_token_accuracy": 0.9843246698379516,
1811
+ "num_tokens": 33568531.0,
1812
+ "step": 1780
1813
+ },
1814
+ {
1815
+ "entropy": 0.8973462700843811,
1816
+ "epoch": 2.409152086137281,
1817
+ "grad_norm": 0.8916401267051697,
1818
+ "learning_rate": 9.03800929423844e-05,
1819
+ "loss": 0.039596831798553465,
1820
+ "mean_token_accuracy": 0.9839228212833404,
1821
+ "num_tokens": 33756333.0,
1822
+ "step": 1790
1823
+ },
1824
+ {
1825
+ "entropy": 0.8954619467258453,
1826
+ "epoch": 2.4226110363391653,
1827
+ "grad_norm": 1.056504487991333,
1828
+ "learning_rate": 9.024844616329662e-05,
1829
+ "loss": 0.0396859347820282,
1830
+ "mean_token_accuracy": 0.9837161123752594,
1831
+ "num_tokens": 33944904.0,
1832
+ "step": 1800
1833
+ },
1834
+ {
1835
+ "entropy": 0.8952643811702728,
1836
+ "epoch": 2.4360699865410496,
1837
+ "grad_norm": 1.0442084074020386,
1838
+ "learning_rate": 9.011600196840447e-05,
1839
+ "loss": 0.037324142456054685,
1840
+ "mean_token_accuracy": 0.9852897703647614,
1841
+ "num_tokens": 34133530.0,
1842
+ "step": 1810
1843
+ },
1844
+ {
1845
+ "entropy": 0.8874911010265351,
1846
+ "epoch": 2.449528936742934,
1847
+ "grad_norm": 0.9926224946975708,
1848
+ "learning_rate": 8.998276298173707e-05,
1849
+ "loss": 0.03637495338916778,
1850
+ "mean_token_accuracy": 0.9853426337242126,
1851
+ "num_tokens": 34322042.0,
1852
+ "step": 1820
1853
+ },
1854
+ {
1855
+ "entropy": 0.8900584518909455,
1856
+ "epoch": 2.462987886944818,
1857
+ "grad_norm": 1.001734733581543,
1858
+ "learning_rate": 8.984873184307017e-05,
1859
+ "loss": 0.04029585719108582,
1860
+ "mean_token_accuracy": 0.9842372059822082,
1861
+ "num_tokens": 34510259.0,
1862
+ "step": 1830
1863
+ },
1864
+ {
1865
+ "entropy": 0.8986345052719116,
1866
+ "epoch": 2.4764468371467023,
1867
+ "grad_norm": 0.772941529750824,
1868
+ "learning_rate": 8.971391120787397e-05,
1869
+ "loss": 0.0401084691286087,
1870
+ "mean_token_accuracy": 0.9841427087783814,
1871
+ "num_tokens": 34698669.0,
1872
+ "step": 1840
1873
+ },
1874
+ {
1875
+ "entropy": 0.8911370873451233,
1876
+ "epoch": 2.4899057873485866,
1877
+ "grad_norm": 0.895031213760376,
1878
+ "learning_rate": 8.957830374726042e-05,
1879
+ "loss": 0.03941032290458679,
1880
+ "mean_token_accuracy": 0.9841966688632965,
1881
+ "num_tokens": 34887160.0,
1882
+ "step": 1850
1883
+ },
1884
+ {
1885
+ "entropy": 0.8901963472366333,
1886
+ "epoch": 2.503364737550471,
1887
+ "grad_norm": 1.0677918195724487,
1888
+ "learning_rate": 8.944191214793028e-05,
1889
+ "loss": 0.03503885865211487,
1890
+ "mean_token_accuracy": 0.9856241464614868,
1891
+ "num_tokens": 35075891.0,
1892
+ "step": 1860
1893
+ },
1894
+ {
1895
+ "entropy": 0.8817630231380462,
1896
+ "epoch": 2.516823687752355,
1897
+ "grad_norm": 1.1200164556503296,
1898
+ "learning_rate": 8.930473911212e-05,
1899
+ "loss": 0.03878425657749176,
1900
+ "mean_token_accuracy": 0.9846492648124695,
1901
+ "num_tokens": 35264455.0,
1902
+ "step": 1870
1903
+ },
1904
+ {
1905
+ "entropy": 0.8897477686405182,
1906
+ "epoch": 2.5302826379542394,
1907
+ "grad_norm": 0.8058190941810608,
1908
+ "learning_rate": 8.916678735754809e-05,
1909
+ "loss": 0.041558006405830385,
1910
+ "mean_token_accuracy": 0.9830883860588073,
1911
+ "num_tokens": 35453251.0,
1912
+ "step": 1880
1913
+ },
1914
+ {
1915
+ "entropy": 0.8873181045055389,
1916
+ "epoch": 2.5437415881561236,
1917
+ "grad_norm": 0.7765992283821106,
1918
+ "learning_rate": 8.902805961736123e-05,
1919
+ "loss": 0.03555050492286682,
1920
+ "mean_token_accuracy": 0.9858225345611572,
1921
+ "num_tokens": 35641959.0,
1922
+ "step": 1890
1923
+ },
1924
+ {
1925
+ "entropy": 0.8782606959342957,
1926
+ "epoch": 2.557200538358008,
1927
+ "grad_norm": 0.8488866686820984,
1928
+ "learning_rate": 8.88885586400803e-05,
1929
+ "loss": 0.03606923818588257,
1930
+ "mean_token_accuracy": 0.9858632445335388,
1931
+ "num_tokens": 35829636.0,
1932
+ "step": 1900
1933
+ },
1934
+ {
1935
+ "entropy": 0.8869808673858642,
1936
+ "epoch": 2.570659488559892,
1937
+ "grad_norm": 1.1734739542007446,
1938
+ "learning_rate": 8.874828718954576e-05,
1939
+ "loss": 0.040956351161003116,
1940
+ "mean_token_accuracy": 0.9835374176502227,
1941
+ "num_tokens": 36017818.0,
1942
+ "step": 1910
1943
+ },
1944
+ {
1945
+ "entropy": 0.8850924432277679,
1946
+ "epoch": 2.5841184387617764,
1947
+ "grad_norm": 1.3584387302398682,
1948
+ "learning_rate": 8.86072480448629e-05,
1949
+ "loss": 0.042437011003494264,
1950
+ "mean_token_accuracy": 0.9826479077339172,
1951
+ "num_tokens": 36206505.0,
1952
+ "step": 1920
1953
+ },
1954
+ {
1955
+ "entropy": 0.8801687896251679,
1956
+ "epoch": 2.5975773889636606,
1957
+ "grad_norm": 1.2862240076065063,
1958
+ "learning_rate": 8.84654440003469e-05,
1959
+ "loss": 0.03880060315132141,
1960
+ "mean_token_accuracy": 0.9843404710292816,
1961
+ "num_tokens": 36395802.0,
1962
+ "step": 1930
1963
+ },
1964
+ {
1965
+ "entropy": 0.8752781748771667,
1966
+ "epoch": 2.611036339165545,
1967
+ "grad_norm": 0.7341249585151672,
1968
+ "learning_rate": 8.83228778654674e-05,
1969
+ "loss": 0.03776344656944275,
1970
+ "mean_token_accuracy": 0.9852877616882324,
1971
+ "num_tokens": 36584370.0,
1972
+ "step": 1940
1973
+ },
1974
+ {
1975
+ "entropy": 0.8759494423866272,
1976
+ "epoch": 2.624495289367429,
1977
+ "grad_norm": 0.9807276129722595,
1978
+ "learning_rate": 8.817955246479276e-05,
1979
+ "loss": 0.03780297338962555,
1980
+ "mean_token_accuracy": 0.9853227376937866,
1981
+ "num_tokens": 36772865.0,
1982
+ "step": 1950
1983
+ },
1984
+ {
1985
+ "entropy": 0.8753129065036773,
1986
+ "epoch": 2.6379542395693134,
1987
+ "grad_norm": 1.1391628980636597,
1988
+ "learning_rate": 8.803547063793422e-05,
1989
+ "loss": 0.03953765034675598,
1990
+ "mean_token_accuracy": 0.9839197635650635,
1991
+ "num_tokens": 36961462.0,
1992
+ "step": 1960
1993
+ },
1994
+ {
1995
+ "entropy": 0.8771899223327637,
1996
+ "epoch": 2.6514131897711977,
1997
+ "grad_norm": 1.0865391492843628,
1998
+ "learning_rate": 8.789063523948958e-05,
1999
+ "loss": 0.03856399655342102,
2000
+ "mean_token_accuracy": 0.9848688066005706,
2001
+ "num_tokens": 37150486.0,
2002
+ "step": 1970
2003
+ },
2004
+ {
2005
+ "entropy": 0.8774881184101104,
2006
+ "epoch": 2.664872139973082,
2007
+ "grad_norm": 0.8561595678329468,
2008
+ "learning_rate": 8.774504913898663e-05,
2009
+ "loss": 0.03795175850391388,
2010
+ "mean_token_accuracy": 0.9849309325218201,
2011
+ "num_tokens": 37338890.0,
2012
+ "step": 1980
2013
+ },
2014
+ {
2015
+ "entropy": 0.8841517448425293,
2016
+ "epoch": 2.678331090174966,
2017
+ "grad_norm": 0.9458399415016174,
2018
+ "learning_rate": 8.75987152208264e-05,
2019
+ "loss": 0.03860213160514832,
2020
+ "mean_token_accuracy": 0.9842230796813964,
2021
+ "num_tokens": 37526886.0,
2022
+ "step": 1990
2023
+ },
2024
+ {
2025
+ "entropy": 0.8809043228626251,
2026
+ "epoch": 2.6917900403768504,
2027
+ "grad_norm": 0.9360871315002441,
2028
+ "learning_rate": 8.745163638422583e-05,
2029
+ "loss": 0.03709094822406769,
2030
+ "mean_token_accuracy": 0.9855118036270142,
2031
+ "num_tokens": 37715408.0,
2032
+ "step": 2000
2033
+ },
2034
+ {
2035
+ "entropy": 0.8805519282817841,
2036
+ "epoch": 2.7052489905787347,
2037
+ "grad_norm": 0.8852124214172363,
2038
+ "learning_rate": 8.730381554316051e-05,
2039
+ "loss": 0.03869341611862183,
2040
+ "mean_token_accuracy": 0.9846724212169647,
2041
+ "num_tokens": 37904328.0,
2042
+ "step": 2010
2043
+ },
2044
+ {
2045
+ "entropy": 0.8796228408813477,
2046
+ "epoch": 2.718707940780619,
2047
+ "grad_norm": 1.189297080039978,
2048
+ "learning_rate": 8.715525562630687e-05,
2049
+ "loss": 0.03675769567489624,
2050
+ "mean_token_accuracy": 0.9854080140590668,
2051
+ "num_tokens": 38092776.0,
2052
+ "step": 2020
2053
+ },
2054
+ {
2055
+ "entropy": 0.8722020089626312,
2056
+ "epoch": 2.732166890982503,
2057
+ "grad_norm": 0.7027947306632996,
2058
+ "learning_rate": 8.700595957698411e-05,
2059
+ "loss": 0.03889244794845581,
2060
+ "mean_token_accuracy": 0.984558242559433,
2061
+ "num_tokens": 38282221.0,
2062
+ "step": 2030
2063
+ },
2064
+ {
2065
+ "entropy": 0.8746632814407349,
2066
+ "epoch": 2.7456258411843875,
2067
+ "grad_norm": 0.9799376726150513,
2068
+ "learning_rate": 8.685593035309598e-05,
2069
+ "loss": 0.03815680146217346,
2070
+ "mean_token_accuracy": 0.9845012128353119,
2071
+ "num_tokens": 38470633.0,
2072
+ "step": 2040
2073
+ },
2074
+ {
2075
+ "entropy": 0.8766567528247833,
2076
+ "epoch": 2.7590847913862717,
2077
+ "grad_norm": 0.850459098815918,
2078
+ "learning_rate": 8.670517092707213e-05,
2079
+ "loss": 0.039029371738433835,
2080
+ "mean_token_accuracy": 0.9845906794071198,
2081
+ "num_tokens": 38659048.0,
2082
+ "step": 2050
2083
+ },
2084
+ {
2085
+ "entropy": 0.8778688013553619,
2086
+ "epoch": 2.772543741588156,
2087
+ "grad_norm": 0.8540134429931641,
2088
+ "learning_rate": 8.655368428580919e-05,
2089
+ "loss": 0.039645448327064514,
2090
+ "mean_token_accuracy": 0.9840566098690033,
2091
+ "num_tokens": 38847370.0,
2092
+ "step": 2060
2093
+ },
2094
+ {
2095
+ "entropy": 0.8758600771427154,
2096
+ "epoch": 2.7860026917900402,
2097
+ "grad_norm": 0.8036021590232849,
2098
+ "learning_rate": 8.640147343061165e-05,
2099
+ "loss": 0.038226932287216187,
2100
+ "mean_token_accuracy": 0.9845647215843201,
2101
+ "num_tokens": 39036040.0,
2102
+ "step": 2070
2103
+ },
2104
+ {
2105
+ "entropy": 0.8652268946170807,
2106
+ "epoch": 2.7994616419919245,
2107
+ "grad_norm": 0.8526738286018372,
2108
+ "learning_rate": 8.624854137713234e-05,
2109
+ "loss": 0.03798363208770752,
2110
+ "mean_token_accuracy": 0.9845202267169952,
2111
+ "num_tokens": 39224695.0,
2112
+ "step": 2080
2113
+ },
2114
+ {
2115
+ "entropy": 0.8707200348377228,
2116
+ "epoch": 2.8129205921938087,
2117
+ "grad_norm": 0.9852389097213745,
2118
+ "learning_rate": 8.609489115531278e-05,
2119
+ "loss": 0.037227436900138855,
2120
+ "mean_token_accuracy": 0.9852050960063934,
2121
+ "num_tokens": 39413360.0,
2122
+ "step": 2090
2123
+ },
2124
+ {
2125
+ "entropy": 0.8711447060108185,
2126
+ "epoch": 2.826379542395693,
2127
+ "grad_norm": 1.228277325630188,
2128
+ "learning_rate": 8.594052580932301e-05,
2129
+ "loss": 0.03698050379753113,
2130
+ "mean_token_accuracy": 0.9852443158626556,
2131
+ "num_tokens": 39602544.0,
2132
+ "step": 2100
2133
+ },
2134
+ {
2135
+ "entropy": 0.8742912471294403,
2136
+ "epoch": 2.8398384925975773,
2137
+ "grad_norm": 0.8440544605255127,
2138
+ "learning_rate": 8.578544839750141e-05,
2139
+ "loss": 0.03757670521736145,
2140
+ "mean_token_accuracy": 0.9848618268966675,
2141
+ "num_tokens": 39791340.0,
2142
+ "step": 2110
2143
+ },
2144
+ {
2145
+ "entropy": 0.8801726162433624,
2146
+ "epoch": 2.8532974427994615,
2147
+ "grad_norm": 1.0955252647399902,
2148
+ "learning_rate": 8.562966199229399e-05,
2149
+ "loss": 0.040004149079322815,
2150
+ "mean_token_accuracy": 0.9841345012187958,
2151
+ "num_tokens": 39980212.0,
2152
+ "step": 2120
2153
+ },
2154
+ {
2155
+ "entropy": 0.8931492984294891,
2156
+ "epoch": 2.8667563930013458,
2157
+ "grad_norm": 0.5722652673721313,
2158
+ "learning_rate": 8.547316968019363e-05,
2159
+ "loss": 0.038096648454666135,
2160
+ "mean_token_accuracy": 0.9849136590957641,
2161
+ "num_tokens": 40169122.0,
2162
+ "step": 2130
2163
+ },
2164
+ {
2165
+ "entropy": 0.8767493844032288,
2166
+ "epoch": 2.88021534320323,
2167
+ "grad_norm": 0.7663730382919312,
2168
+ "learning_rate": 8.531597456167885e-05,
2169
+ "loss": 0.03729096055030823,
2170
+ "mean_token_accuracy": 0.985169780254364,
2171
+ "num_tokens": 40357678.0,
2172
+ "step": 2140
2173
+ },
2174
+ {
2175
+ "entropy": 0.8770561516284943,
2176
+ "epoch": 2.8936742934051143,
2177
+ "grad_norm": 1.1505672931671143,
2178
+ "learning_rate": 8.515807975115239e-05,
2179
+ "loss": 0.03984796404838562,
2180
+ "mean_token_accuracy": 0.9838845670223236,
2181
+ "num_tokens": 40546892.0,
2182
+ "step": 2150
2183
+ },
2184
+ {
2185
+ "entropy": 0.8789129912853241,
2186
+ "epoch": 2.9071332436069985,
2187
+ "grad_norm": 0.8355761766433716,
2188
+ "learning_rate": 8.499948837687959e-05,
2189
+ "loss": 0.03769223690032959,
2190
+ "mean_token_accuracy": 0.9847559213638306,
2191
+ "num_tokens": 40736109.0,
2192
+ "step": 2160
2193
+ },
2194
+ {
2195
+ "entropy": 0.8781779289245606,
2196
+ "epoch": 2.920592193808883,
2197
+ "grad_norm": 0.9103918075561523,
2198
+ "learning_rate": 8.484020358092625e-05,
2199
+ "loss": 0.038262826204299924,
2200
+ "mean_token_accuracy": 0.984741848707199,
2201
+ "num_tokens": 40924341.0,
2202
+ "step": 2170
2203
+ },
2204
+ {
2205
+ "entropy": 0.8552807629108429,
2206
+ "epoch": 2.934051144010767,
2207
+ "grad_norm": 0.8338510990142822,
2208
+ "learning_rate": 8.468022851909657e-05,
2209
+ "loss": 0.0355743408203125,
2210
+ "mean_token_accuracy": 0.9855036079883576,
2211
+ "num_tokens": 41113849.0,
2212
+ "step": 2180
2213
+ },
2214
+ {
2215
+ "entropy": 0.8705591142177582,
2216
+ "epoch": 2.9475100942126513,
2217
+ "grad_norm": 0.8334057927131653,
2218
+ "learning_rate": 8.451956636087046e-05,
2219
+ "loss": 0.037476515769958495,
2220
+ "mean_token_accuracy": 0.9854551255702972,
2221
+ "num_tokens": 41303130.0,
2222
+ "step": 2190
2223
+ },
2224
+ {
2225
+ "entropy": 0.8809274792671203,
2226
+ "epoch": 2.9609690444145356,
2227
+ "grad_norm": 0.9424103498458862,
2228
+ "learning_rate": 8.435822028934087e-05,
2229
+ "loss": 0.03624279499053955,
2230
+ "mean_token_accuracy": 0.9856135487556458,
2231
+ "num_tokens": 41491750.0,
2232
+ "step": 2200
2233
+ },
2234
+ {
2235
+ "entropy": 0.8818133771419525,
2236
+ "epoch": 2.97442799461642,
2237
+ "grad_norm": 1.04243004322052,
2238
+ "learning_rate": 8.41961935011506e-05,
2239
+ "loss": 0.039132320880889894,
2240
+ "mean_token_accuracy": 0.984746390581131,
2241
+ "num_tokens": 41680493.0,
2242
+ "step": 2210
2243
+ },
2244
+ {
2245
+ "entropy": 0.8838395297527313,
2246
+ "epoch": 2.987886944818304,
2247
+ "grad_norm": 0.8650784492492676,
2248
+ "learning_rate": 8.403348920642911e-05,
2249
+ "loss": 0.03822052478790283,
2250
+ "mean_token_accuracy": 0.9852108955383301,
2251
+ "num_tokens": 41868378.0,
2252
+ "step": 2220
2253
+ },
2254
+ {
2255
+ "epoch": 3.0,
2256
+ "eval_entropy": 0.8800126549544608,
2257
+ "eval_loss": 0.05468416586518288,
2258
+ "eval_mean_token_accuracy": 0.9781015617832257,
2259
+ "eval_num_tokens": 42038037.0,
2260
+ "eval_runtime": 24.4825,
2261
+ "eval_samples_per_second": 204.228,
2262
+ "eval_steps_per_second": 6.413,
2263
+ "step": 2229
2264
+ }
2265
+ ],
2266
+ "logging_steps": 10,
2267
+ "max_steps": 7430,
2268
+ "num_input_tokens_seen": 0,
2269
+ "num_train_epochs": 10,
2270
+ "save_steps": 500,
2271
+ "stateful_callbacks": {
2272
+ "TrainerControl": {
2273
+ "args": {
2274
+ "should_epoch_stop": false,
2275
+ "should_evaluate": false,
2276
+ "should_log": false,
2277
+ "should_save": true,
2278
+ "should_training_stop": false
2279
+ },
2280
+ "attributes": {}
2281
+ }
2282
+ },
2283
+ "total_flos": 2.0311131063058432e+18,
2284
+ "train_batch_size": 32,
2285
+ "trial_name": null,
2286
+ "trial_params": null
2287
+ }
baby_talk_L16_a50/seed_42/checkpoint-2229/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c060e97b69d99564c146471c3d3ac4d335e3b1968074124f4edc5aebf612e1e3
3
+ size 5368
baby_talk_L16_a50/seed_42/checkpoint-2972/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
baby_talk_L16_a50/seed_42/checkpoint-2972/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "o_proj",
35
+ "q_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "k_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
baby_talk_L16_a50/seed_42/checkpoint-2972/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2515d21f30afa1abce215a6e8019acc54800da53f5bc3c4775850dda77992ce
3
+ size 80792096
baby_talk_L16_a50/seed_42/checkpoint-2972/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
baby_talk_L16_a50/seed_42/checkpoint-2972/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
baby_talk_L16_a50/seed_42/checkpoint-2972/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
baby_talk_L16_a50/seed_42/checkpoint-2972/trainer_state.json ADDED
@@ -0,0 +1,3048 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2972,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.2358420491218567,
14
+ "epoch": 0.013458950201884253,
15
+ "grad_norm": 3.2940118312835693,
16
+ "learning_rate": 2.4193548387096776e-06,
17
+ "loss": 0.550364351272583,
18
+ "mean_token_accuracy": 0.8554959416389465,
19
+ "num_tokens": 188811.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.2353882431983947,
24
+ "epoch": 0.026917900403768506,
25
+ "grad_norm": 3.4476470947265625,
26
+ "learning_rate": 5.1075268817204305e-06,
27
+ "loss": 0.5143545627593994,
28
+ "mean_token_accuracy": 0.8613634884357453,
29
+ "num_tokens": 377729.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.2403772234916688,
34
+ "epoch": 0.040376850605652756,
35
+ "grad_norm": 2.2756996154785156,
36
+ "learning_rate": 7.795698924731183e-06,
37
+ "loss": 0.3996511220932007,
38
+ "mean_token_accuracy": 0.8753438770771027,
39
+ "num_tokens": 566562.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.2205921411514282,
44
+ "epoch": 0.05383580080753701,
45
+ "grad_norm": 1.2432096004486084,
46
+ "learning_rate": 1.0483870967741936e-05,
47
+ "loss": 0.2568032264709473,
48
+ "mean_token_accuracy": 0.9130991995334625,
49
+ "num_tokens": 755026.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.221834623813629,
54
+ "epoch": 0.06729475100942127,
55
+ "grad_norm": 0.8531327843666077,
56
+ "learning_rate": 1.3172043010752688e-05,
57
+ "loss": 0.20193097591400147,
58
+ "mean_token_accuracy": 0.9274256646633148,
59
+ "num_tokens": 943494.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.2245031952857972,
64
+ "epoch": 0.08075370121130551,
65
+ "grad_norm": 0.5026484131813049,
66
+ "learning_rate": 1.586021505376344e-05,
67
+ "loss": 0.171803081035614,
68
+ "mean_token_accuracy": 0.9363821744918823,
69
+ "num_tokens": 1131731.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 1.2192703008651733,
74
+ "epoch": 0.09421265141318977,
75
+ "grad_norm": 0.5228630304336548,
76
+ "learning_rate": 1.8548387096774193e-05,
77
+ "loss": 0.15698516368865967,
78
+ "mean_token_accuracy": 0.9423282980918884,
79
+ "num_tokens": 1320258.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 1.2153660774230957,
84
+ "epoch": 0.10767160161507403,
85
+ "grad_norm": 0.3650094270706177,
86
+ "learning_rate": 2.1236559139784946e-05,
87
+ "loss": 0.14900912046432496,
88
+ "mean_token_accuracy": 0.9437524616718292,
89
+ "num_tokens": 1509209.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 1.2123230814933776,
94
+ "epoch": 0.12113055181695828,
95
+ "grad_norm": 0.45723631978034973,
96
+ "learning_rate": 2.39247311827957e-05,
97
+ "loss": 0.1399540901184082,
98
+ "mean_token_accuracy": 0.9474983811378479,
99
+ "num_tokens": 1698139.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 1.208789598941803,
104
+ "epoch": 0.13458950201884254,
105
+ "grad_norm": 0.4575304687023163,
106
+ "learning_rate": 2.661290322580645e-05,
107
+ "loss": 0.12566736936569214,
108
+ "mean_token_accuracy": 0.9529488801956176,
109
+ "num_tokens": 1886384.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 1.205567193031311,
114
+ "epoch": 0.1480484522207268,
115
+ "grad_norm": 0.8033406734466553,
116
+ "learning_rate": 2.9301075268817207e-05,
117
+ "loss": 0.12005312442779541,
118
+ "mean_token_accuracy": 0.9533569395542145,
119
+ "num_tokens": 2074919.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 1.20180287361145,
124
+ "epoch": 0.16150740242261102,
125
+ "grad_norm": 0.6989286541938782,
126
+ "learning_rate": 3.198924731182796e-05,
127
+ "loss": 0.11197478771209717,
128
+ "mean_token_accuracy": 0.9572584748268127,
129
+ "num_tokens": 2263116.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 1.202919363975525,
134
+ "epoch": 0.17496635262449528,
135
+ "grad_norm": 0.7233495116233826,
136
+ "learning_rate": 3.467741935483872e-05,
137
+ "loss": 0.11106340885162354,
138
+ "mean_token_accuracy": 0.9569663584232331,
139
+ "num_tokens": 2451273.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 1.1907272219657898,
144
+ "epoch": 0.18842530282637954,
145
+ "grad_norm": 0.7645091414451599,
146
+ "learning_rate": 3.736559139784947e-05,
147
+ "loss": 0.10956189632415772,
148
+ "mean_token_accuracy": 0.95842245221138,
149
+ "num_tokens": 2640657.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 1.1895484924316406,
154
+ "epoch": 0.2018842530282638,
155
+ "grad_norm": 0.841379702091217,
156
+ "learning_rate": 4.005376344086022e-05,
157
+ "loss": 0.10442907810211181,
158
+ "mean_token_accuracy": 0.9603263795375824,
159
+ "num_tokens": 2829351.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 1.1871973633766175,
164
+ "epoch": 0.21534320323014805,
165
+ "grad_norm": 0.8552286624908447,
166
+ "learning_rate": 4.2741935483870973e-05,
167
+ "loss": 0.10493810176849365,
168
+ "mean_token_accuracy": 0.9597454965114594,
169
+ "num_tokens": 3018290.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 1.1907334923744202,
174
+ "epoch": 0.2288021534320323,
175
+ "grad_norm": 0.7243770360946655,
176
+ "learning_rate": 4.543010752688172e-05,
177
+ "loss": 0.10288643836975098,
178
+ "mean_token_accuracy": 0.9603340923786163,
179
+ "num_tokens": 3206538.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 1.1835299730300903,
184
+ "epoch": 0.24226110363391656,
185
+ "grad_norm": 0.8118091225624084,
186
+ "learning_rate": 4.811827956989248e-05,
187
+ "loss": 0.09798368811607361,
188
+ "mean_token_accuracy": 0.9619723737239838,
189
+ "num_tokens": 3395374.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 1.1794602513313293,
194
+ "epoch": 0.2557200538358008,
195
+ "grad_norm": 0.7447699904441833,
196
+ "learning_rate": 5.080645161290323e-05,
197
+ "loss": 0.09498158693313599,
198
+ "mean_token_accuracy": 0.9634931206703186,
199
+ "num_tokens": 3584273.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 1.1834724426269532,
204
+ "epoch": 0.2691790040376851,
205
+ "grad_norm": 1.4059171676635742,
206
+ "learning_rate": 5.349462365591398e-05,
207
+ "loss": 0.09400172233581543,
208
+ "mean_token_accuracy": 0.9630160868167877,
209
+ "num_tokens": 3772857.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.186821174621582,
214
+ "epoch": 0.28263795423956933,
215
+ "grad_norm": 0.8273025751113892,
216
+ "learning_rate": 5.618279569892473e-05,
217
+ "loss": 0.09464811086654663,
218
+ "mean_token_accuracy": 0.9631786167621612,
219
+ "num_tokens": 3960877.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 1.1769920349121095,
224
+ "epoch": 0.2960969044414536,
225
+ "grad_norm": 1.0182609558105469,
226
+ "learning_rate": 5.887096774193549e-05,
227
+ "loss": 0.0914128303527832,
228
+ "mean_token_accuracy": 0.9644896507263183,
229
+ "num_tokens": 4149444.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 1.1719188809394836,
234
+ "epoch": 0.30955585464333785,
235
+ "grad_norm": 0.9648745656013489,
236
+ "learning_rate": 6.155913978494624e-05,
237
+ "loss": 0.0920255422592163,
238
+ "mean_token_accuracy": 0.963964831829071,
239
+ "num_tokens": 4337989.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 1.1559369206428527,
244
+ "epoch": 0.32301480484522205,
245
+ "grad_norm": 0.853049635887146,
246
+ "learning_rate": 6.4247311827957e-05,
247
+ "loss": 0.0891042947769165,
248
+ "mean_token_accuracy": 0.9653984010219574,
249
+ "num_tokens": 4527256.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 1.1604587316513062,
254
+ "epoch": 0.3364737550471063,
255
+ "grad_norm": 0.873772144317627,
256
+ "learning_rate": 6.693548387096774e-05,
257
+ "loss": 0.09071275591850281,
258
+ "mean_token_accuracy": 0.9651397407054901,
259
+ "num_tokens": 4715878.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 1.1534931778907775,
264
+ "epoch": 0.34993270524899056,
265
+ "grad_norm": 1.087850570678711,
266
+ "learning_rate": 6.962365591397851e-05,
267
+ "loss": 0.08719289302825928,
268
+ "mean_token_accuracy": 0.9658753871917725,
269
+ "num_tokens": 4904527.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 1.1629684448242188,
274
+ "epoch": 0.3633916554508748,
275
+ "grad_norm": 0.9014195799827576,
276
+ "learning_rate": 7.231182795698926e-05,
277
+ "loss": 0.08716133236885071,
278
+ "mean_token_accuracy": 0.965560519695282,
279
+ "num_tokens": 5093554.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 1.148916518688202,
284
+ "epoch": 0.3768506056527591,
285
+ "grad_norm": 0.9712215662002563,
286
+ "learning_rate": 7.500000000000001e-05,
287
+ "loss": 0.08668915033340455,
288
+ "mean_token_accuracy": 0.9658140063285827,
289
+ "num_tokens": 5282224.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 1.1493291974067688,
294
+ "epoch": 0.39030955585464333,
295
+ "grad_norm": 0.9085242748260498,
296
+ "learning_rate": 7.768817204301076e-05,
297
+ "loss": 0.08189771175384522,
298
+ "mean_token_accuracy": 0.9670268416404724,
299
+ "num_tokens": 5471308.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 1.1406704187393188,
304
+ "epoch": 0.4037685060565276,
305
+ "grad_norm": 1.128177285194397,
306
+ "learning_rate": 8.037634408602151e-05,
307
+ "loss": 0.08119879961013794,
308
+ "mean_token_accuracy": 0.9675639867782593,
309
+ "num_tokens": 5660026.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 1.1369357347488402,
314
+ "epoch": 0.41722745625841184,
315
+ "grad_norm": 0.8224227428436279,
316
+ "learning_rate": 8.306451612903227e-05,
317
+ "loss": 0.07979745864868164,
318
+ "mean_token_accuracy": 0.9681445300579071,
319
+ "num_tokens": 5848073.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 1.1252583503723144,
324
+ "epoch": 0.4306864064602961,
325
+ "grad_norm": 0.7711160182952881,
326
+ "learning_rate": 8.575268817204302e-05,
327
+ "loss": 0.0783164381980896,
328
+ "mean_token_accuracy": 0.9686201930046081,
329
+ "num_tokens": 6036717.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 1.1203404307365417,
334
+ "epoch": 0.44414535666218036,
335
+ "grad_norm": 1.1810287237167358,
336
+ "learning_rate": 8.844086021505377e-05,
337
+ "loss": 0.08139073848724365,
338
+ "mean_token_accuracy": 0.9681365489959717,
339
+ "num_tokens": 6225223.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 1.1222687840461731,
344
+ "epoch": 0.4576043068640646,
345
+ "grad_norm": 1.4551713466644287,
346
+ "learning_rate": 9.112903225806452e-05,
347
+ "loss": 0.0820135235786438,
348
+ "mean_token_accuracy": 0.9672703862190246,
349
+ "num_tokens": 6413936.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 1.1156280279159545,
354
+ "epoch": 0.47106325706594887,
355
+ "grad_norm": 0.8573716878890991,
356
+ "learning_rate": 9.381720430107528e-05,
357
+ "loss": 0.08074904680252075,
358
+ "mean_token_accuracy": 0.9677663922309876,
359
+ "num_tokens": 6602413.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 1.1063185572624206,
364
+ "epoch": 0.4845222072678331,
365
+ "grad_norm": 0.7709434628486633,
366
+ "learning_rate": 9.650537634408603e-05,
367
+ "loss": 0.07549421787261963,
368
+ "mean_token_accuracy": 0.9689356207847595,
369
+ "num_tokens": 6790873.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 1.0962031960487366,
374
+ "epoch": 0.4979811574697174,
375
+ "grad_norm": 0.7843625545501709,
376
+ "learning_rate": 9.919354838709678e-05,
377
+ "loss": 0.07368478775024415,
378
+ "mean_token_accuracy": 0.9700294613838196,
379
+ "num_tokens": 6979392.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 1.1028530240058898,
384
+ "epoch": 0.5114401076716016,
385
+ "grad_norm": 0.7935485243797302,
386
+ "learning_rate": 9.999975729865971e-05,
387
+ "loss": 0.07766538262367248,
388
+ "mean_token_accuracy": 0.9704003691673279,
389
+ "num_tokens": 7167482.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 1.0905393958091736,
394
+ "epoch": 0.5248990578734859,
395
+ "grad_norm": 1.4108924865722656,
396
+ "learning_rate": 9.999856856307314e-05,
397
+ "loss": 0.0760004162788391,
398
+ "mean_token_accuracy": 0.9699020266532898,
399
+ "num_tokens": 7355994.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 1.0853531122207642,
404
+ "epoch": 0.5383580080753702,
405
+ "grad_norm": 0.9043511152267456,
406
+ "learning_rate": 9.999638923896533e-05,
407
+ "loss": 0.0720310389995575,
408
+ "mean_token_accuracy": 0.9707890212535858,
409
+ "num_tokens": 7544655.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 1.075773572921753,
414
+ "epoch": 0.5518169582772544,
415
+ "grad_norm": 0.9038823246955872,
416
+ "learning_rate": 9.999321936951374e-05,
417
+ "loss": 0.07026209831237792,
418
+ "mean_token_accuracy": 0.9715417385101318,
419
+ "num_tokens": 7733348.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 1.0572428584098816,
424
+ "epoch": 0.5652759084791387,
425
+ "grad_norm": 0.8231707811355591,
426
+ "learning_rate": 9.998905901752091e-05,
427
+ "loss": 0.07141299843788147,
428
+ "mean_token_accuracy": 0.9705908715724945,
429
+ "num_tokens": 7921582.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 1.051485300064087,
434
+ "epoch": 0.5787348586810229,
435
+ "grad_norm": 1.1695454120635986,
436
+ "learning_rate": 9.998390826541315e-05,
437
+ "loss": 0.07321611642837525,
438
+ "mean_token_accuracy": 0.9709623396396637,
439
+ "num_tokens": 8110266.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 1.0485445380210876,
444
+ "epoch": 0.5921938088829072,
445
+ "grad_norm": 0.7291010618209839,
446
+ "learning_rate": 9.997776721523888e-05,
447
+ "loss": 0.07221676707267762,
448
+ "mean_token_accuracy": 0.9705081820487976,
449
+ "num_tokens": 8298932.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 1.0456495046615601,
454
+ "epoch": 0.6056527590847914,
455
+ "grad_norm": 0.6550094485282898,
456
+ "learning_rate": 9.99706359886667e-05,
457
+ "loss": 0.06878133416175843,
458
+ "mean_token_accuracy": 0.9728739261627197,
459
+ "num_tokens": 8487613.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 1.0372861266136169,
464
+ "epoch": 0.6191117092866757,
465
+ "grad_norm": 0.5825450420379639,
466
+ "learning_rate": 9.996251472698281e-05,
467
+ "loss": 0.06706151366233826,
468
+ "mean_token_accuracy": 0.9732721030712128,
469
+ "num_tokens": 8676276.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 1.0319493532180786,
474
+ "epoch": 0.6325706594885598,
475
+ "grad_norm": 0.7158534526824951,
476
+ "learning_rate": 9.995340359108844e-05,
477
+ "loss": 0.06999597549438477,
478
+ "mean_token_accuracy": 0.9726741492748261,
479
+ "num_tokens": 8864738.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 1.0260030388832093,
484
+ "epoch": 0.6460296096904441,
485
+ "grad_norm": 0.7918373346328735,
486
+ "learning_rate": 9.994330276149649e-05,
487
+ "loss": 0.06949877142906188,
488
+ "mean_token_accuracy": 0.9725882947444916,
489
+ "num_tokens": 9053582.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 1.0258127093315124,
494
+ "epoch": 0.6594885598923284,
495
+ "grad_norm": 0.893281102180481,
496
+ "learning_rate": 9.993221243832797e-05,
497
+ "loss": 0.06893026828765869,
498
+ "mean_token_accuracy": 0.9729171216487884,
499
+ "num_tokens": 9241914.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 1.0261828184127808,
504
+ "epoch": 0.6729475100942126,
505
+ "grad_norm": 1.0127108097076416,
506
+ "learning_rate": 9.992013284130816e-05,
507
+ "loss": 0.07094801664352417,
508
+ "mean_token_accuracy": 0.9714575052261353,
509
+ "num_tokens": 9430981.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 1.0274562597274781,
514
+ "epoch": 0.6864064602960969,
515
+ "grad_norm": 0.7148029208183289,
516
+ "learning_rate": 9.990706420976206e-05,
517
+ "loss": 0.06826171875,
518
+ "mean_token_accuracy": 0.9727248430252076,
519
+ "num_tokens": 9619472.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 1.0137859225273131,
524
+ "epoch": 0.6998654104979811,
525
+ "grad_norm": 0.9228634238243103,
526
+ "learning_rate": 9.989300680260985e-05,
527
+ "loss": 0.06890587210655212,
528
+ "mean_token_accuracy": 0.9723304331302642,
529
+ "num_tokens": 9808123.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 1.0221150755882262,
534
+ "epoch": 0.7133243606998654,
535
+ "grad_norm": 0.971530556678772,
536
+ "learning_rate": 9.98779608983616e-05,
537
+ "loss": 0.07073599100112915,
538
+ "mean_token_accuracy": 0.9714174270629883,
539
+ "num_tokens": 9996601.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 1.0248154640197753,
544
+ "epoch": 0.7267833109017496,
545
+ "grad_norm": 0.7336317896842957,
546
+ "learning_rate": 9.986192679511189e-05,
547
+ "loss": 0.06874136924743653,
548
+ "mean_token_accuracy": 0.9723432004451752,
549
+ "num_tokens": 10184725.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 1.0120978832244873,
554
+ "epoch": 0.7402422611036339,
555
+ "grad_norm": 1.2100600004196167,
556
+ "learning_rate": 9.984490481053372e-05,
557
+ "loss": 0.06270487308502197,
558
+ "mean_token_accuracy": 0.9756880521774292,
559
+ "num_tokens": 10373446.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 1.0172406077384948,
564
+ "epoch": 0.7537012113055181,
565
+ "grad_norm": 0.7562853693962097,
566
+ "learning_rate": 9.982689528187244e-05,
567
+ "loss": 0.06784560084342957,
568
+ "mean_token_accuracy": 0.9724333345890045,
569
+ "num_tokens": 10561710.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 1.0150038957595826,
574
+ "epoch": 0.7671601615074024,
575
+ "grad_norm": 1.07966148853302,
576
+ "learning_rate": 9.98078985659389e-05,
577
+ "loss": 0.0670344054698944,
578
+ "mean_token_accuracy": 0.97339146733284,
579
+ "num_tokens": 10749917.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 1.0135640978813172,
584
+ "epoch": 0.7806191117092867,
585
+ "grad_norm": 0.7826104164123535,
586
+ "learning_rate": 9.978791503910246e-05,
587
+ "loss": 0.0668565571308136,
588
+ "mean_token_accuracy": 0.9735289216041565,
589
+ "num_tokens": 10938098.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 1.0077669620513916,
594
+ "epoch": 0.7940780619111709,
595
+ "grad_norm": 1.299584150314331,
596
+ "learning_rate": 9.97669450972835e-05,
597
+ "loss": 0.0728976845741272,
598
+ "mean_token_accuracy": 0.9701269209384918,
599
+ "num_tokens": 11126107.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 0.9957692861557007,
604
+ "epoch": 0.8075370121130552,
605
+ "grad_norm": 1.1542484760284424,
606
+ "learning_rate": 9.974498915594557e-05,
607
+ "loss": 0.0631720781326294,
608
+ "mean_token_accuracy": 0.9747781097888947,
609
+ "num_tokens": 11315001.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 0.9983992159366608,
614
+ "epoch": 0.8209959623149394,
615
+ "grad_norm": 0.6967246532440186,
616
+ "learning_rate": 9.97220476500872e-05,
617
+ "loss": 0.06333768963813782,
618
+ "mean_token_accuracy": 0.9750322341918946,
619
+ "num_tokens": 11503007.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 0.9801111698150635,
624
+ "epoch": 0.8344549125168237,
625
+ "grad_norm": 0.945446789264679,
626
+ "learning_rate": 9.969812103423325e-05,
627
+ "loss": 0.05720087289810181,
628
+ "mean_token_accuracy": 0.9767442524433136,
629
+ "num_tokens": 11691883.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 0.9879570543766022,
634
+ "epoch": 0.847913862718708,
635
+ "grad_norm": 0.8254193663597107,
636
+ "learning_rate": 9.967320978242592e-05,
637
+ "loss": 0.05899171829223633,
638
+ "mean_token_accuracy": 0.9767756819725036,
639
+ "num_tokens": 11880353.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 0.9878240942955017,
644
+ "epoch": 0.8613728129205922,
645
+ "grad_norm": 0.9882538914680481,
646
+ "learning_rate": 9.964731438821533e-05,
647
+ "loss": 0.06443996429443359,
648
+ "mean_token_accuracy": 0.9735406100749969,
649
+ "num_tokens": 12069610.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 0.9938213169574738,
654
+ "epoch": 0.8748317631224765,
655
+ "grad_norm": 1.126546859741211,
656
+ "learning_rate": 9.962043536464978e-05,
657
+ "loss": 0.06708416938781739,
658
+ "mean_token_accuracy": 0.9730359137058258,
659
+ "num_tokens": 12257901.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 0.9800443708896637,
664
+ "epoch": 0.8882907133243607,
665
+ "grad_norm": 0.9151445031166077,
666
+ "learning_rate": 9.959257324426556e-05,
667
+ "loss": 0.06290764808654785,
668
+ "mean_token_accuracy": 0.9741653084754944,
669
+ "num_tokens": 12446236.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 0.9744794130325317,
674
+ "epoch": 0.901749663526245,
675
+ "grad_norm": 0.9104028940200806,
676
+ "learning_rate": 9.95637285790764e-05,
677
+ "loss": 0.060464882850646974,
678
+ "mean_token_accuracy": 0.9757490694522858,
679
+ "num_tokens": 12635226.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 0.9786687552928924,
684
+ "epoch": 0.9152086137281292,
685
+ "grad_norm": 0.7316192388534546,
686
+ "learning_rate": 9.953390194056258e-05,
687
+ "loss": 0.06054847836494446,
688
+ "mean_token_accuracy": 0.9760404825210571,
689
+ "num_tokens": 12823428.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 0.966060334444046,
694
+ "epoch": 0.9286675639300135,
695
+ "grad_norm": 1.3535135984420776,
696
+ "learning_rate": 9.950309391965947e-05,
697
+ "loss": 0.061383575201034546,
698
+ "mean_token_accuracy": 0.9749173820018768,
699
+ "num_tokens": 13012362.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 0.9722849190235138,
704
+ "epoch": 0.9421265141318977,
705
+ "grad_norm": 0.890771746635437,
706
+ "learning_rate": 9.947130512674602e-05,
707
+ "loss": 0.06301190257072449,
708
+ "mean_token_accuracy": 0.9750476598739624,
709
+ "num_tokens": 13200680.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 0.9624020338058472,
714
+ "epoch": 0.955585464333782,
715
+ "grad_norm": 1.142499327659607,
716
+ "learning_rate": 9.943853619163255e-05,
717
+ "loss": 0.06179196834564209,
718
+ "mean_token_accuracy": 0.9751901209354401,
719
+ "num_tokens": 13389333.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 0.9653747022151947,
724
+ "epoch": 0.9690444145356663,
725
+ "grad_norm": 0.8682493567466736,
726
+ "learning_rate": 9.94047877635482e-05,
727
+ "loss": 0.06113170981407166,
728
+ "mean_token_accuracy": 0.9748325288295746,
729
+ "num_tokens": 13578210.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 0.970348197221756,
734
+ "epoch": 0.9825033647375505,
735
+ "grad_norm": 0.8351430296897888,
736
+ "learning_rate": 9.93700605111283e-05,
737
+ "loss": 0.05901788473129273,
738
+ "mean_token_accuracy": 0.9761226296424865,
739
+ "num_tokens": 13767039.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 0.956724613904953,
744
+ "epoch": 0.9959623149394348,
745
+ "grad_norm": 1.2604819536209106,
746
+ "learning_rate": 9.933435512240084e-05,
747
+ "loss": 0.06124393343925476,
748
+ "mean_token_accuracy": 0.9747833967208862,
749
+ "num_tokens": 13956057.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "epoch": 1.0,
754
+ "eval_entropy": 0.9722227849018802,
755
+ "eval_loss": 0.0600070059299469,
756
+ "eval_mean_token_accuracy": 0.975752676368519,
757
+ "eval_num_tokens": 14012630.0,
758
+ "eval_runtime": 24.6215,
759
+ "eval_samples_per_second": 203.074,
760
+ "eval_steps_per_second": 6.377,
761
+ "step": 743
762
+ },
763
+ {
764
+ "entropy": 0.9689933776855468,
765
+ "epoch": 1.009421265141319,
766
+ "grad_norm": 0.9720287919044495,
767
+ "learning_rate": 9.929767230477305e-05,
768
+ "loss": 0.055563896894454956,
769
+ "mean_token_accuracy": 0.9779708027839661,
770
+ "num_tokens": 14144652.0,
771
+ "step": 750
772
+ },
773
+ {
774
+ "entropy": 0.9506655633449554,
775
+ "epoch": 1.0228802153432033,
776
+ "grad_norm": 0.6619595289230347,
777
+ "learning_rate": 9.92600127850173e-05,
778
+ "loss": 0.050587379932403566,
779
+ "mean_token_accuracy": 0.979685264825821,
780
+ "num_tokens": 14332714.0,
781
+ "step": 760
782
+ },
783
+ {
784
+ "entropy": 0.9458732306957245,
785
+ "epoch": 1.0363391655450875,
786
+ "grad_norm": 0.7905108332633972,
787
+ "learning_rate": 9.922137730925673e-05,
788
+ "loss": 0.05069155097007751,
789
+ "mean_token_accuracy": 0.9796596229076385,
790
+ "num_tokens": 14520902.0,
791
+ "step": 770
792
+ },
793
+ {
794
+ "entropy": 0.9320916712284089,
795
+ "epoch": 1.0497981157469718,
796
+ "grad_norm": 0.9210566878318787,
797
+ "learning_rate": 9.918176664295041e-05,
798
+ "loss": 0.051744121313095096,
799
+ "mean_token_accuracy": 0.9792148530483246,
800
+ "num_tokens": 14709215.0,
801
+ "step": 780
802
+ },
803
+ {
804
+ "entropy": 0.928934782743454,
805
+ "epoch": 1.063257065948856,
806
+ "grad_norm": 0.7442036867141724,
807
+ "learning_rate": 9.914118157087824e-05,
808
+ "loss": 0.0486875057220459,
809
+ "mean_token_accuracy": 0.9798122465610504,
810
+ "num_tokens": 14898242.0,
811
+ "step": 790
812
+ },
813
+ {
814
+ "entropy": 0.942881977558136,
815
+ "epoch": 1.0767160161507403,
816
+ "grad_norm": 1.0910941362380981,
817
+ "learning_rate": 9.909962289712538e-05,
818
+ "loss": 0.052297019958496095,
819
+ "mean_token_accuracy": 0.9793200254440307,
820
+ "num_tokens": 15086835.0,
821
+ "step": 800
822
+ },
823
+ {
824
+ "entropy": 0.9503917813301086,
825
+ "epoch": 1.0901749663526246,
826
+ "grad_norm": 0.8091554641723633,
827
+ "learning_rate": 9.905709144506629e-05,
828
+ "loss": 0.049927744269371035,
829
+ "mean_token_accuracy": 0.9802082359790802,
830
+ "num_tokens": 15275042.0,
831
+ "step": 810
832
+ },
833
+ {
834
+ "entropy": 0.9426830470561981,
835
+ "epoch": 1.1036339165545088,
836
+ "grad_norm": 1.0589245557785034,
837
+ "learning_rate": 9.901358805734846e-05,
838
+ "loss": 0.053803551197052005,
839
+ "mean_token_accuracy": 0.9785399675369263,
840
+ "num_tokens": 15463923.0,
841
+ "step": 820
842
+ },
843
+ {
844
+ "entropy": 0.9445010781288147,
845
+ "epoch": 1.117092866756393,
846
+ "grad_norm": 0.8410441279411316,
847
+ "learning_rate": 9.89691135958757e-05,
848
+ "loss": 0.053181976079940796,
849
+ "mean_token_accuracy": 0.9792523324489594,
850
+ "num_tokens": 15652526.0,
851
+ "step": 830
852
+ },
853
+ {
854
+ "entropy": 0.946729838848114,
855
+ "epoch": 1.1305518169582773,
856
+ "grad_norm": 0.8246080279350281,
857
+ "learning_rate": 9.892366894179105e-05,
858
+ "loss": 0.054291915893554685,
859
+ "mean_token_accuracy": 0.9791357696056366,
860
+ "num_tokens": 15841340.0,
861
+ "step": 840
862
+ },
863
+ {
864
+ "entropy": 0.9433728814125061,
865
+ "epoch": 1.1440107671601616,
866
+ "grad_norm": 0.9200296401977539,
867
+ "learning_rate": 9.887725499545937e-05,
868
+ "loss": 0.05073915719985962,
869
+ "mean_token_accuracy": 0.9796931505203247,
870
+ "num_tokens": 16029478.0,
871
+ "step": 850
872
+ },
873
+ {
874
+ "entropy": 0.9494417488574982,
875
+ "epoch": 1.1574697173620458,
876
+ "grad_norm": 0.8342758417129517,
877
+ "learning_rate": 9.882987267644939e-05,
878
+ "loss": 0.050929927825927736,
879
+ "mean_token_accuracy": 0.9793219089508056,
880
+ "num_tokens": 16218014.0,
881
+ "step": 860
882
+ },
883
+ {
884
+ "entropy": 0.9409256100654602,
885
+ "epoch": 1.17092866756393,
886
+ "grad_norm": 0.7911009192466736,
887
+ "learning_rate": 9.878152292351563e-05,
888
+ "loss": 0.049964362382888795,
889
+ "mean_token_accuracy": 0.9798487305641175,
890
+ "num_tokens": 16406420.0,
891
+ "step": 870
892
+ },
893
+ {
894
+ "entropy": 0.9429958581924438,
895
+ "epoch": 1.1843876177658144,
896
+ "grad_norm": 1.4877018928527832,
897
+ "learning_rate": 9.873220669457975e-05,
898
+ "loss": 0.04969423711299896,
899
+ "mean_token_accuracy": 0.9801322638988494,
900
+ "num_tokens": 16594948.0,
901
+ "step": 880
902
+ },
903
+ {
904
+ "entropy": 0.9542136013507843,
905
+ "epoch": 1.1978465679676986,
906
+ "grad_norm": 0.8030785918235779,
907
+ "learning_rate": 9.868192496671147e-05,
908
+ "loss": 0.04981146454811096,
909
+ "mean_token_accuracy": 0.9796162784099579,
910
+ "num_tokens": 16782983.0,
911
+ "step": 890
912
+ },
913
+ {
914
+ "entropy": 0.9367722630500793,
915
+ "epoch": 1.2113055181695827,
916
+ "grad_norm": 1.1169800758361816,
917
+ "learning_rate": 9.86306787361094e-05,
918
+ "loss": 0.05210963487625122,
919
+ "mean_token_accuracy": 0.9790139615535736,
920
+ "num_tokens": 16971394.0,
921
+ "step": 900
922
+ },
923
+ {
924
+ "entropy": 0.9487208306789399,
925
+ "epoch": 1.224764468371467,
926
+ "grad_norm": 0.8650282621383667,
927
+ "learning_rate": 9.857846901808117e-05,
928
+ "loss": 0.05012243390083313,
929
+ "mean_token_accuracy": 0.9805107891559601,
930
+ "num_tokens": 17159951.0,
931
+ "step": 910
932
+ },
933
+ {
934
+ "entropy": 0.9441231489181519,
935
+ "epoch": 1.2382234185733512,
936
+ "grad_norm": 0.8193967938423157,
937
+ "learning_rate": 9.852529684702329e-05,
938
+ "loss": 0.048866665363311766,
939
+ "mean_token_accuracy": 0.9807584881782532,
940
+ "num_tokens": 17348711.0,
941
+ "step": 920
942
+ },
943
+ {
944
+ "entropy": 0.9361630439758301,
945
+ "epoch": 1.2516823687752354,
946
+ "grad_norm": 1.3613612651824951,
947
+ "learning_rate": 9.847116327640082e-05,
948
+ "loss": 0.05324091911315918,
949
+ "mean_token_accuracy": 0.97904953956604,
950
+ "num_tokens": 17537116.0,
951
+ "step": 930
952
+ },
953
+ {
954
+ "entropy": 0.9562997639179229,
955
+ "epoch": 1.2651413189771197,
956
+ "grad_norm": 0.8121878504753113,
957
+ "learning_rate": 9.841606937872632e-05,
958
+ "loss": 0.05011019706726074,
959
+ "mean_token_accuracy": 0.9798731088638306,
960
+ "num_tokens": 17725567.0,
961
+ "step": 940
962
+ },
963
+ {
964
+ "entropy": 0.9359076261520386,
965
+ "epoch": 1.278600269179004,
966
+ "grad_norm": 0.7838549613952637,
967
+ "learning_rate": 9.836001624553869e-05,
968
+ "loss": 0.04834386110305786,
969
+ "mean_token_accuracy": 0.9804529249668121,
970
+ "num_tokens": 17914191.0,
971
+ "step": 950
972
+ },
973
+ {
974
+ "entropy": 0.9527446150779724,
975
+ "epoch": 1.2920592193808882,
976
+ "grad_norm": 1.0981712341308594,
977
+ "learning_rate": 9.830300498738152e-05,
978
+ "loss": 0.055077624320983884,
979
+ "mean_token_accuracy": 0.9776535391807556,
980
+ "num_tokens": 18103184.0,
981
+ "step": 960
982
+ },
983
+ {
984
+ "entropy": 0.9574812948703766,
985
+ "epoch": 1.3055181695827724,
986
+ "grad_norm": 0.8291501402854919,
987
+ "learning_rate": 9.824503673378112e-05,
988
+ "loss": 0.050153911113739014,
989
+ "mean_token_accuracy": 0.9792965054512024,
990
+ "num_tokens": 18291351.0,
991
+ "step": 970
992
+ },
993
+ {
994
+ "entropy": 0.939585280418396,
995
+ "epoch": 1.3189771197846567,
996
+ "grad_norm": 1.0864412784576416,
997
+ "learning_rate": 9.81861126332241e-05,
998
+ "loss": 0.05132197737693787,
999
+ "mean_token_accuracy": 0.9792291462421417,
1000
+ "num_tokens": 18480192.0,
1001
+ "step": 980
1002
+ },
1003
+ {
1004
+ "entropy": 0.9468406856060028,
1005
+ "epoch": 1.332436069986541,
1006
+ "grad_norm": 0.9772785902023315,
1007
+ "learning_rate": 9.812623385313461e-05,
1008
+ "loss": 0.04844954013824463,
1009
+ "mean_token_accuracy": 0.9804142415523529,
1010
+ "num_tokens": 18669028.0,
1011
+ "step": 990
1012
+ },
1013
+ {
1014
+ "entropy": 0.9413834273815155,
1015
+ "epoch": 1.3458950201884252,
1016
+ "grad_norm": 0.9020094871520996,
1017
+ "learning_rate": 9.806540157985131e-05,
1018
+ "loss": 0.05075312852859497,
1019
+ "mean_token_accuracy": 0.9790591120719909,
1020
+ "num_tokens": 18857581.0,
1021
+ "step": 1000
1022
+ },
1023
+ {
1024
+ "entropy": 0.9526253461837768,
1025
+ "epoch": 1.3593539703903095,
1026
+ "grad_norm": 1.0524990558624268,
1027
+ "learning_rate": 9.800361701860368e-05,
1028
+ "loss": 0.049737372994422914,
1029
+ "mean_token_accuracy": 0.9795652389526367,
1030
+ "num_tokens": 19046105.0,
1031
+ "step": 1010
1032
+ },
1033
+ {
1034
+ "entropy": 0.9484909176826477,
1035
+ "epoch": 1.3728129205921937,
1036
+ "grad_norm": 1.0715620517730713,
1037
+ "learning_rate": 9.794088139348835e-05,
1038
+ "loss": 0.04935494959354401,
1039
+ "mean_token_accuracy": 0.9797740161418915,
1040
+ "num_tokens": 19234730.0,
1041
+ "step": 1020
1042
+ },
1043
+ {
1044
+ "entropy": 0.9398573458194732,
1045
+ "epoch": 1.386271870794078,
1046
+ "grad_norm": 1.2134987115859985,
1047
+ "learning_rate": 9.787719594744468e-05,
1048
+ "loss": 0.0518394410610199,
1049
+ "mean_token_accuracy": 0.9793788075447083,
1050
+ "num_tokens": 19423572.0,
1051
+ "step": 1030
1052
+ },
1053
+ {
1054
+ "entropy": 0.9465648651123046,
1055
+ "epoch": 1.3997308209959622,
1056
+ "grad_norm": 1.004693627357483,
1057
+ "learning_rate": 9.781256194223023e-05,
1058
+ "loss": 0.04776117205619812,
1059
+ "mean_token_accuracy": 0.9813261866569519,
1060
+ "num_tokens": 19612465.0,
1061
+ "step": 1040
1062
+ },
1063
+ {
1064
+ "entropy": 0.9448004186153411,
1065
+ "epoch": 1.4131897711978465,
1066
+ "grad_norm": 1.2913438081741333,
1067
+ "learning_rate": 9.774698065839577e-05,
1068
+ "loss": 0.04807930588722229,
1069
+ "mean_token_accuracy": 0.9805689513683319,
1070
+ "num_tokens": 19800482.0,
1071
+ "step": 1050
1072
+ },
1073
+ {
1074
+ "entropy": 0.9425322234630584,
1075
+ "epoch": 1.4266487213997308,
1076
+ "grad_norm": 1.1068469285964966,
1077
+ "learning_rate": 9.768045339525979e-05,
1078
+ "loss": 0.05053595900535583,
1079
+ "mean_token_accuracy": 0.9794536828994751,
1080
+ "num_tokens": 19989444.0,
1081
+ "step": 1060
1082
+ },
1083
+ {
1084
+ "entropy": 0.9472232103347779,
1085
+ "epoch": 1.440107671601615,
1086
+ "grad_norm": 0.7409384250640869,
1087
+ "learning_rate": 9.76129814708829e-05,
1088
+ "loss": 0.052136778831481934,
1089
+ "mean_token_accuracy": 0.979197359085083,
1090
+ "num_tokens": 20178482.0,
1091
+ "step": 1070
1092
+ },
1093
+ {
1094
+ "entropy": 0.9450975477695465,
1095
+ "epoch": 1.4535666218034993,
1096
+ "grad_norm": 1.3440778255462646,
1097
+ "learning_rate": 9.754456622204167e-05,
1098
+ "loss": 0.05258495211601257,
1099
+ "mean_token_accuracy": 0.9798152863979339,
1100
+ "num_tokens": 20367345.0,
1101
+ "step": 1080
1102
+ },
1103
+ {
1104
+ "entropy": 0.9468007743358612,
1105
+ "epoch": 1.4670255720053835,
1106
+ "grad_norm": 0.8377829194068909,
1107
+ "learning_rate": 9.747520900420209e-05,
1108
+ "loss": 0.04893603026866913,
1109
+ "mean_token_accuracy": 0.9805733859539032,
1110
+ "num_tokens": 20555817.0,
1111
+ "step": 1090
1112
+ },
1113
+ {
1114
+ "entropy": 0.9505827724933624,
1115
+ "epoch": 1.4804845222072678,
1116
+ "grad_norm": 0.776889979839325,
1117
+ "learning_rate": 9.740491119149277e-05,
1118
+ "loss": 0.05367119312286377,
1119
+ "mean_token_accuracy": 0.9779646098613739,
1120
+ "num_tokens": 20744001.0,
1121
+ "step": 1100
1122
+ },
1123
+ {
1124
+ "entropy": 0.9516380190849304,
1125
+ "epoch": 1.493943472409152,
1126
+ "grad_norm": 0.8598276376724243,
1127
+ "learning_rate": 9.733367417667773e-05,
1128
+ "loss": 0.05071394443511963,
1129
+ "mean_token_accuracy": 0.979692417383194,
1130
+ "num_tokens": 20931715.0,
1131
+ "step": 1110
1132
+ },
1133
+ {
1134
+ "entropy": 0.9351628720760345,
1135
+ "epoch": 1.5074024226110363,
1136
+ "grad_norm": 1.3307439088821411,
1137
+ "learning_rate": 9.726149937112873e-05,
1138
+ "loss": 0.051372635364532473,
1139
+ "mean_token_accuracy": 0.9797907948493958,
1140
+ "num_tokens": 21120803.0,
1141
+ "step": 1120
1142
+ },
1143
+ {
1144
+ "entropy": 0.9268040716648102,
1145
+ "epoch": 1.5208613728129206,
1146
+ "grad_norm": 0.9320250749588013,
1147
+ "learning_rate": 9.718838820479743e-05,
1148
+ "loss": 0.04939974546432495,
1149
+ "mean_token_accuracy": 0.98038170337677,
1150
+ "num_tokens": 21309384.0,
1151
+ "step": 1130
1152
+ },
1153
+ {
1154
+ "entropy": 0.9241463243961334,
1155
+ "epoch": 1.5343203230148048,
1156
+ "grad_norm": 1.065938115119934,
1157
+ "learning_rate": 9.711434212618691e-05,
1158
+ "loss": 0.04856643378734589,
1159
+ "mean_token_accuracy": 0.9800511121749877,
1160
+ "num_tokens": 21498234.0,
1161
+ "step": 1140
1162
+ },
1163
+ {
1164
+ "entropy": 0.9350252687931061,
1165
+ "epoch": 1.547779273216689,
1166
+ "grad_norm": 0.6925032138824463,
1167
+ "learning_rate": 9.703936260232308e-05,
1168
+ "loss": 0.0512526273727417,
1169
+ "mean_token_accuracy": 0.979562646150589,
1170
+ "num_tokens": 21687119.0,
1171
+ "step": 1150
1172
+ },
1173
+ {
1174
+ "entropy": 0.9403733670711517,
1175
+ "epoch": 1.5612382234185733,
1176
+ "grad_norm": 1.2069101333618164,
1177
+ "learning_rate": 9.696345111872557e-05,
1178
+ "loss": 0.049795085191726686,
1179
+ "mean_token_accuracy": 0.9806506633758545,
1180
+ "num_tokens": 21876094.0,
1181
+ "step": 1160
1182
+ },
1183
+ {
1184
+ "entropy": 0.9370538294315338,
1185
+ "epoch": 1.5746971736204576,
1186
+ "grad_norm": 1.1759546995162964,
1187
+ "learning_rate": 9.688660917937838e-05,
1188
+ "loss": 0.05461466312408447,
1189
+ "mean_token_accuracy": 0.9776057600975037,
1190
+ "num_tokens": 22064836.0,
1191
+ "step": 1170
1192
+ },
1193
+ {
1194
+ "entropy": 0.9517778217792511,
1195
+ "epoch": 1.5881561238223418,
1196
+ "grad_norm": 0.8965149521827698,
1197
+ "learning_rate": 9.68088383066999e-05,
1198
+ "loss": 0.05289326906204224,
1199
+ "mean_token_accuracy": 0.9782088756561279,
1200
+ "num_tokens": 22253065.0,
1201
+ "step": 1180
1202
+ },
1203
+ {
1204
+ "entropy": 0.9372851848602295,
1205
+ "epoch": 1.601615074024226,
1206
+ "grad_norm": 0.7192108631134033,
1207
+ "learning_rate": 9.673014004151292e-05,
1208
+ "loss": 0.048973691463470456,
1209
+ "mean_token_accuracy": 0.9806210815906524,
1210
+ "num_tokens": 22441206.0,
1211
+ "step": 1190
1212
+ },
1213
+ {
1214
+ "entropy": 0.9122351944446564,
1215
+ "epoch": 1.6150740242261103,
1216
+ "grad_norm": 0.8763614296913147,
1217
+ "learning_rate": 9.665051594301407e-05,
1218
+ "loss": 0.0461783230304718,
1219
+ "mean_token_accuracy": 0.9817197680473327,
1220
+ "num_tokens": 22629637.0,
1221
+ "step": 1200
1222
+ },
1223
+ {
1224
+ "entropy": 0.9162654876708984,
1225
+ "epoch": 1.6285329744279946,
1226
+ "grad_norm": 0.9806778430938721,
1227
+ "learning_rate": 9.656996758874284e-05,
1228
+ "loss": 0.04827338755130768,
1229
+ "mean_token_accuracy": 0.9806499361991883,
1230
+ "num_tokens": 22818369.0,
1231
+ "step": 1210
1232
+ },
1233
+ {
1234
+ "entropy": 0.9125026524066925,
1235
+ "epoch": 1.6419919246298789,
1236
+ "grad_norm": 0.9685350656509399,
1237
+ "learning_rate": 9.648849657455044e-05,
1238
+ "loss": 0.048379385471343996,
1239
+ "mean_token_accuracy": 0.980905145406723,
1240
+ "num_tokens": 23006863.0,
1241
+ "step": 1220
1242
+ },
1243
+ {
1244
+ "entropy": 0.9119791567325592,
1245
+ "epoch": 1.6554508748317631,
1246
+ "grad_norm": 0.8717457056045532,
1247
+ "learning_rate": 9.640610451456811e-05,
1248
+ "loss": 0.050873959064483644,
1249
+ "mean_token_accuracy": 0.979601925611496,
1250
+ "num_tokens": 23195735.0,
1251
+ "step": 1230
1252
+ },
1253
+ {
1254
+ "entropy": 0.9244007229804992,
1255
+ "epoch": 1.6689098250336474,
1256
+ "grad_norm": 1.0574654340744019,
1257
+ "learning_rate": 9.632279304117517e-05,
1258
+ "loss": 0.05112813711166382,
1259
+ "mean_token_accuracy": 0.979493772983551,
1260
+ "num_tokens": 23383862.0,
1261
+ "step": 1240
1262
+ },
1263
+ {
1264
+ "entropy": 0.9225810945034028,
1265
+ "epoch": 1.6823687752355316,
1266
+ "grad_norm": 0.9628807306289673,
1267
+ "learning_rate": 9.623856380496664e-05,
1268
+ "loss": 0.050480544567108154,
1269
+ "mean_token_accuracy": 0.9797678411006927,
1270
+ "num_tokens": 23572292.0,
1271
+ "step": 1250
1272
+ },
1273
+ {
1274
+ "entropy": 0.9287579476833343,
1275
+ "epoch": 1.695827725437416,
1276
+ "grad_norm": 1.015265941619873,
1277
+ "learning_rate": 9.615341847472059e-05,
1278
+ "loss": 0.05133126378059387,
1279
+ "mean_token_accuracy": 0.9797986805438995,
1280
+ "num_tokens": 23760252.0,
1281
+ "step": 1260
1282
+ },
1283
+ {
1284
+ "entropy": 0.9133845925331116,
1285
+ "epoch": 1.7092866756393001,
1286
+ "grad_norm": 0.8996883630752563,
1287
+ "learning_rate": 9.606735873736505e-05,
1288
+ "loss": 0.05044950246810913,
1289
+ "mean_token_accuracy": 0.9795303404331207,
1290
+ "num_tokens": 23949031.0,
1291
+ "step": 1270
1292
+ },
1293
+ {
1294
+ "entropy": 0.9213698863983154,
1295
+ "epoch": 1.7227456258411844,
1296
+ "grad_norm": 1.0490264892578125,
1297
+ "learning_rate": 9.598038629794461e-05,
1298
+ "loss": 0.04797698855400086,
1299
+ "mean_token_accuracy": 0.9806023716926575,
1300
+ "num_tokens": 24137829.0,
1301
+ "step": 1280
1302
+ },
1303
+ {
1304
+ "entropy": 0.9173697710037232,
1305
+ "epoch": 1.7362045760430687,
1306
+ "grad_norm": 1.3648769855499268,
1307
+ "learning_rate": 9.589250287958657e-05,
1308
+ "loss": 0.049194514751434326,
1309
+ "mean_token_accuracy": 0.980712479352951,
1310
+ "num_tokens": 24326586.0,
1311
+ "step": 1290
1312
+ },
1313
+ {
1314
+ "entropy": 0.9242741882801055,
1315
+ "epoch": 1.749663526244953,
1316
+ "grad_norm": 1.105125069618225,
1317
+ "learning_rate": 9.580371022346693e-05,
1318
+ "loss": 0.048667973279953,
1319
+ "mean_token_accuracy": 0.9801449298858642,
1320
+ "num_tokens": 24514904.0,
1321
+ "step": 1300
1322
+ },
1323
+ {
1324
+ "entropy": 0.9162627995014191,
1325
+ "epoch": 1.7631224764468372,
1326
+ "grad_norm": 0.8887938857078552,
1327
+ "learning_rate": 9.571401008877572e-05,
1328
+ "loss": 0.04878672957420349,
1329
+ "mean_token_accuracy": 0.980469423532486,
1330
+ "num_tokens": 24703688.0,
1331
+ "step": 1310
1332
+ },
1333
+ {
1334
+ "entropy": 0.9149538338184356,
1335
+ "epoch": 1.7765814266487214,
1336
+ "grad_norm": 0.8838976621627808,
1337
+ "learning_rate": 9.562340425268233e-05,
1338
+ "loss": 0.04892318844795227,
1339
+ "mean_token_accuracy": 0.9808776795864105,
1340
+ "num_tokens": 24892107.0,
1341
+ "step": 1320
1342
+ },
1343
+ {
1344
+ "entropy": 0.914735221862793,
1345
+ "epoch": 1.7900403768506057,
1346
+ "grad_norm": 1.1152070760726929,
1347
+ "learning_rate": 9.553189451030019e-05,
1348
+ "loss": 0.04825109839439392,
1349
+ "mean_token_accuracy": 0.9804604113101959,
1350
+ "num_tokens": 25080130.0,
1351
+ "step": 1330
1352
+ },
1353
+ {
1354
+ "entropy": 0.9170725226402283,
1355
+ "epoch": 1.80349932705249,
1356
+ "grad_norm": 0.9479517340660095,
1357
+ "learning_rate": 9.543948267465115e-05,
1358
+ "loss": 0.051445144414901736,
1359
+ "mean_token_accuracy": 0.9792819261550904,
1360
+ "num_tokens": 25268652.0,
1361
+ "step": 1340
1362
+ },
1363
+ {
1364
+ "entropy": 0.9142911911010743,
1365
+ "epoch": 1.8169582772543742,
1366
+ "grad_norm": 0.8172292709350586,
1367
+ "learning_rate": 9.534617057662977e-05,
1368
+ "loss": 0.0475692093372345,
1369
+ "mean_token_accuracy": 0.9809505581855774,
1370
+ "num_tokens": 25457120.0,
1371
+ "step": 1350
1372
+ },
1373
+ {
1374
+ "entropy": 0.9019249439239502,
1375
+ "epoch": 1.8304172274562585,
1376
+ "grad_norm": 0.8183121681213379,
1377
+ "learning_rate": 9.525196006496679e-05,
1378
+ "loss": 0.04979957342147827,
1379
+ "mean_token_accuracy": 0.9799730658531189,
1380
+ "num_tokens": 25645699.0,
1381
+ "step": 1360
1382
+ },
1383
+ {
1384
+ "entropy": 0.9181066155433655,
1385
+ "epoch": 1.8438761776581427,
1386
+ "grad_norm": 0.8256579041481018,
1387
+ "learning_rate": 9.515685300619271e-05,
1388
+ "loss": 0.04996164441108704,
1389
+ "mean_token_accuracy": 0.9802247405052185,
1390
+ "num_tokens": 25834183.0,
1391
+ "step": 1370
1392
+ },
1393
+ {
1394
+ "entropy": 0.9171363770961761,
1395
+ "epoch": 1.857335127860027,
1396
+ "grad_norm": 0.9727098345756531,
1397
+ "learning_rate": 9.506085128460065e-05,
1398
+ "loss": 0.048660767078399655,
1399
+ "mean_token_accuracy": 0.9805064260959625,
1400
+ "num_tokens": 26023317.0,
1401
+ "step": 1380
1402
+ },
1403
+ {
1404
+ "entropy": 0.9112720847129822,
1405
+ "epoch": 1.8707940780619112,
1406
+ "grad_norm": 0.9826673865318298,
1407
+ "learning_rate": 9.496395680220918e-05,
1408
+ "loss": 0.04775593280792236,
1409
+ "mean_token_accuracy": 0.9809929549694061,
1410
+ "num_tokens": 26212115.0,
1411
+ "step": 1390
1412
+ },
1413
+ {
1414
+ "entropy": 0.9034272134304047,
1415
+ "epoch": 1.8842530282637955,
1416
+ "grad_norm": 0.9951306581497192,
1417
+ "learning_rate": 9.486617147872446e-05,
1418
+ "loss": 0.04939360618591308,
1419
+ "mean_token_accuracy": 0.9800347864627839,
1420
+ "num_tokens": 26401167.0,
1421
+ "step": 1400
1422
+ },
1423
+ {
1424
+ "entropy": 0.897865754365921,
1425
+ "epoch": 1.8977119784656797,
1426
+ "grad_norm": 1.0163404941558838,
1427
+ "learning_rate": 9.476749725150235e-05,
1428
+ "loss": 0.05002856254577637,
1429
+ "mean_token_accuracy": 0.9793690323829651,
1430
+ "num_tokens": 26590570.0,
1431
+ "step": 1410
1432
+ },
1433
+ {
1434
+ "entropy": 0.9004576802253723,
1435
+ "epoch": 1.911170928667564,
1436
+ "grad_norm": 0.6666616201400757,
1437
+ "learning_rate": 9.466793607550995e-05,
1438
+ "loss": 0.04905453026294708,
1439
+ "mean_token_accuracy": 0.9803856253623963,
1440
+ "num_tokens": 26779481.0,
1441
+ "step": 1420
1442
+ },
1443
+ {
1444
+ "entropy": 0.8978902101516724,
1445
+ "epoch": 1.9246298788694483,
1446
+ "grad_norm": 0.9295416474342346,
1447
+ "learning_rate": 9.45674899232869e-05,
1448
+ "loss": 0.05261261463165283,
1449
+ "mean_token_accuracy": 0.9785708487033844,
1450
+ "num_tokens": 26968262.0,
1451
+ "step": 1430
1452
+ },
1453
+ {
1454
+ "entropy": 0.8962675571441651,
1455
+ "epoch": 1.9380888290713325,
1456
+ "grad_norm": 0.88262939453125,
1457
+ "learning_rate": 9.446616078490626e-05,
1458
+ "loss": 0.04765265882015228,
1459
+ "mean_token_accuracy": 0.98052077293396,
1460
+ "num_tokens": 27157631.0,
1461
+ "step": 1440
1462
+ },
1463
+ {
1464
+ "entropy": 0.8909463047981262,
1465
+ "epoch": 1.9515477792732168,
1466
+ "grad_norm": 1.4205769300460815,
1467
+ "learning_rate": 9.436395066793518e-05,
1468
+ "loss": 0.049406200647354126,
1469
+ "mean_token_accuracy": 0.979697072505951,
1470
+ "num_tokens": 27345921.0,
1471
+ "step": 1450
1472
+ },
1473
+ {
1474
+ "entropy": 0.8969364166259766,
1475
+ "epoch": 1.965006729475101,
1476
+ "grad_norm": 0.9517145752906799,
1477
+ "learning_rate": 9.426086159739496e-05,
1478
+ "loss": 0.0510346531867981,
1479
+ "mean_token_accuracy": 0.9793386399745941,
1480
+ "num_tokens": 27534751.0,
1481
+ "step": 1460
1482
+ },
1483
+ {
1484
+ "entropy": 0.9092479169368743,
1485
+ "epoch": 1.9784656796769853,
1486
+ "grad_norm": 1.200056791305542,
1487
+ "learning_rate": 9.415689561572107e-05,
1488
+ "loss": 0.04974203705787659,
1489
+ "mean_token_accuracy": 0.9800146698951722,
1490
+ "num_tokens": 27723763.0,
1491
+ "step": 1470
1492
+ },
1493
+ {
1494
+ "entropy": 0.9059641897678375,
1495
+ "epoch": 1.9919246298788695,
1496
+ "grad_norm": 1.072447657585144,
1497
+ "learning_rate": 9.405205478272267e-05,
1498
+ "loss": 0.05065792202949524,
1499
+ "mean_token_accuracy": 0.9799223959445953,
1500
+ "num_tokens": 27911784.0,
1501
+ "step": 1480
1502
+ },
1503
+ {
1504
+ "epoch": 2.0,
1505
+ "eval_entropy": 0.9083398975384464,
1506
+ "eval_loss": 0.05432562157511711,
1507
+ "eval_mean_token_accuracy": 0.9782073607869969,
1508
+ "eval_num_tokens": 28025286.0,
1509
+ "eval_runtime": 24.4304,
1510
+ "eval_samples_per_second": 204.663,
1511
+ "eval_steps_per_second": 6.426,
1512
+ "step": 1486
1513
+ },
1514
+ {
1515
+ "entropy": 0.9113237977027893,
1516
+ "epoch": 2.005383580080754,
1517
+ "grad_norm": 1.0167063474655151,
1518
+ "learning_rate": 9.394634117554173e-05,
1519
+ "loss": 0.044942992925643924,
1520
+ "mean_token_accuracy": 0.9818171083927154,
1521
+ "num_tokens": 28100906.0,
1522
+ "step": 1490
1523
+ },
1524
+ {
1525
+ "entropy": 0.8780008792877197,
1526
+ "epoch": 2.018842530282638,
1527
+ "grad_norm": 1.9552241563796997,
1528
+ "learning_rate": 9.38397568886119e-05,
1529
+ "loss": 0.036248764395713805,
1530
+ "mean_token_accuracy": 0.9852245450019836,
1531
+ "num_tokens": 28289445.0,
1532
+ "step": 1500
1533
+ },
1534
+ {
1535
+ "entropy": 0.8721176147460937,
1536
+ "epoch": 2.0323014804845223,
1537
+ "grad_norm": 0.8951388001441956,
1538
+ "learning_rate": 9.373230403361712e-05,
1539
+ "loss": 0.037987279891967776,
1540
+ "mean_token_accuracy": 0.9850614190101623,
1541
+ "num_tokens": 28477972.0,
1542
+ "step": 1510
1543
+ },
1544
+ {
1545
+ "entropy": 0.8922657191753387,
1546
+ "epoch": 2.0457604306864066,
1547
+ "grad_norm": 0.9028105735778809,
1548
+ "learning_rate": 9.362398473944958e-05,
1549
+ "loss": 0.03588914275169373,
1550
+ "mean_token_accuracy": 0.9863102614879609,
1551
+ "num_tokens": 28666272.0,
1552
+ "step": 1520
1553
+ },
1554
+ {
1555
+ "entropy": 0.877799642086029,
1556
+ "epoch": 2.059219380888291,
1557
+ "grad_norm": 1.153886318206787,
1558
+ "learning_rate": 9.35148011521677e-05,
1559
+ "loss": 0.03577309250831604,
1560
+ "mean_token_accuracy": 0.9863148391246795,
1561
+ "num_tokens": 28854780.0,
1562
+ "step": 1530
1563
+ },
1564
+ {
1565
+ "entropy": 0.8708971381187439,
1566
+ "epoch": 2.072678331090175,
1567
+ "grad_norm": 1.042955994606018,
1568
+ "learning_rate": 9.340475543495364e-05,
1569
+ "loss": 0.038266432285308835,
1570
+ "mean_token_accuracy": 0.9848419308662415,
1571
+ "num_tokens": 29043741.0,
1572
+ "step": 1540
1573
+ },
1574
+ {
1575
+ "entropy": 0.8800721943378449,
1576
+ "epoch": 2.0861372812920593,
1577
+ "grad_norm": 0.6789060235023499,
1578
+ "learning_rate": 9.329384976807023e-05,
1579
+ "loss": 0.032948991656303404,
1580
+ "mean_token_accuracy": 0.9871293902397156,
1581
+ "num_tokens": 29232719.0,
1582
+ "step": 1550
1583
+ },
1584
+ {
1585
+ "entropy": 0.8743580460548401,
1586
+ "epoch": 2.0995962314939436,
1587
+ "grad_norm": 1.1780879497528076,
1588
+ "learning_rate": 9.318208634881802e-05,
1589
+ "loss": 0.036874374747276305,
1590
+ "mean_token_accuracy": 0.9859182178974152,
1591
+ "num_tokens": 29421817.0,
1592
+ "step": 1560
1593
+ },
1594
+ {
1595
+ "entropy": 0.8851303637027741,
1596
+ "epoch": 2.113055181695828,
1597
+ "grad_norm": 0.747734785079956,
1598
+ "learning_rate": 9.306946739149161e-05,
1599
+ "loss": 0.0364631175994873,
1600
+ "mean_token_accuracy": 0.9862085223197937,
1601
+ "num_tokens": 29610344.0,
1602
+ "step": 1570
1603
+ },
1604
+ {
1605
+ "entropy": 0.8798732101917267,
1606
+ "epoch": 2.126514131897712,
1607
+ "grad_norm": 1.5001860857009888,
1608
+ "learning_rate": 9.29559951273358e-05,
1609
+ "loss": 0.03813003897666931,
1610
+ "mean_token_accuracy": 0.9852604746818543,
1611
+ "num_tokens": 29798997.0,
1612
+ "step": 1580
1613
+ },
1614
+ {
1615
+ "entropy": 0.8797551989555359,
1616
+ "epoch": 2.1399730820995964,
1617
+ "grad_norm": 0.9593478441238403,
1618
+ "learning_rate": 9.284167180450141e-05,
1619
+ "loss": 0.0394927829504013,
1620
+ "mean_token_accuracy": 0.984604275226593,
1621
+ "num_tokens": 29987809.0,
1622
+ "step": 1590
1623
+ },
1624
+ {
1625
+ "entropy": 0.8860546290874481,
1626
+ "epoch": 2.1534320323014806,
1627
+ "grad_norm": 0.8347703218460083,
1628
+ "learning_rate": 9.272649968800069e-05,
1629
+ "loss": 0.036699697375297546,
1630
+ "mean_token_accuracy": 0.985380882024765,
1631
+ "num_tokens": 30176234.0,
1632
+ "step": 1600
1633
+ },
1634
+ {
1635
+ "entropy": 0.8783667802810669,
1636
+ "epoch": 2.166890982503365,
1637
+ "grad_norm": 1.1154481172561646,
1638
+ "learning_rate": 9.26104810596625e-05,
1639
+ "loss": 0.03756999969482422,
1640
+ "mean_token_accuracy": 0.9853451430797577,
1641
+ "num_tokens": 30364657.0,
1642
+ "step": 1610
1643
+ },
1644
+ {
1645
+ "entropy": 0.8684478521347045,
1646
+ "epoch": 2.180349932705249,
1647
+ "grad_norm": 0.7515475153923035,
1648
+ "learning_rate": 9.249361821808708e-05,
1649
+ "loss": 0.0382376104593277,
1650
+ "mean_token_accuracy": 0.9854029655456543,
1651
+ "num_tokens": 30552904.0,
1652
+ "step": 1620
1653
+ },
1654
+ {
1655
+ "entropy": 0.8636964917182922,
1656
+ "epoch": 2.1938088829071334,
1657
+ "grad_norm": 0.7711939215660095,
1658
+ "learning_rate": 9.237591347860052e-05,
1659
+ "loss": 0.036220991611480714,
1660
+ "mean_token_accuracy": 0.9860713243484497,
1661
+ "num_tokens": 30741259.0,
1662
+ "step": 1630
1663
+ },
1664
+ {
1665
+ "entropy": 0.8592016279697419,
1666
+ "epoch": 2.2072678331090176,
1667
+ "grad_norm": 1.1143887042999268,
1668
+ "learning_rate": 9.225736917320886e-05,
1669
+ "loss": 0.036316031217575075,
1670
+ "mean_token_accuracy": 0.985757052898407,
1671
+ "num_tokens": 30930144.0,
1672
+ "step": 1640
1673
+ },
1674
+ {
1675
+ "entropy": 0.8671079576015472,
1676
+ "epoch": 2.220726783310902,
1677
+ "grad_norm": 0.8980015516281128,
1678
+ "learning_rate": 9.213798765055187e-05,
1679
+ "loss": 0.03822658061981201,
1680
+ "mean_token_accuracy": 0.9847764372825623,
1681
+ "num_tokens": 31118624.0,
1682
+ "step": 1650
1683
+ },
1684
+ {
1685
+ "entropy": 0.8785548269748688,
1686
+ "epoch": 2.234185733512786,
1687
+ "grad_norm": 1.094323992729187,
1688
+ "learning_rate": 9.20177712758566e-05,
1689
+ "loss": 0.03736622333526611,
1690
+ "mean_token_accuracy": 0.9850081980228425,
1691
+ "num_tokens": 31306833.0,
1692
+ "step": 1660
1693
+ },
1694
+ {
1695
+ "entropy": 0.866977310180664,
1696
+ "epoch": 2.2476446837146704,
1697
+ "grad_norm": 0.8372092843055725,
1698
+ "learning_rate": 9.189672243089046e-05,
1699
+ "loss": 0.0401554524898529,
1700
+ "mean_token_accuracy": 0.9840337932109833,
1701
+ "num_tokens": 31495503.0,
1702
+ "step": 1670
1703
+ },
1704
+ {
1705
+ "entropy": 0.8787918210029602,
1706
+ "epoch": 2.2611036339165547,
1707
+ "grad_norm": 1.5408164262771606,
1708
+ "learning_rate": 9.177484351391402e-05,
1709
+ "loss": 0.0368030846118927,
1710
+ "mean_token_accuracy": 0.9847234487533569,
1711
+ "num_tokens": 31683865.0,
1712
+ "step": 1680
1713
+ },
1714
+ {
1715
+ "entropy": 0.872721153497696,
1716
+ "epoch": 2.274562584118439,
1717
+ "grad_norm": 1.1115421056747437,
1718
+ "learning_rate": 9.165213693963355e-05,
1719
+ "loss": 0.037859299778938295,
1720
+ "mean_token_accuracy": 0.9851646661758423,
1721
+ "num_tokens": 31871903.0,
1722
+ "step": 1690
1723
+ },
1724
+ {
1725
+ "entropy": 0.8774094223976135,
1726
+ "epoch": 2.288021534320323,
1727
+ "grad_norm": 1.0301331281661987,
1728
+ "learning_rate": 9.152860513915314e-05,
1729
+ "loss": 0.038671016693115234,
1730
+ "mean_token_accuracy": 0.9840058028697968,
1731
+ "num_tokens": 32060500.0,
1732
+ "step": 1700
1733
+ },
1734
+ {
1735
+ "entropy": 0.889606237411499,
1736
+ "epoch": 2.3014804845222074,
1737
+ "grad_norm": 0.8601903915405273,
1738
+ "learning_rate": 9.140425055992648e-05,
1739
+ "loss": 0.039603835344314574,
1740
+ "mean_token_accuracy": 0.9840235590934754,
1741
+ "num_tokens": 32248744.0,
1742
+ "step": 1710
1743
+ },
1744
+ {
1745
+ "entropy": 0.886734277009964,
1746
+ "epoch": 2.3149394347240917,
1747
+ "grad_norm": 1.1029839515686035,
1748
+ "learning_rate": 9.127907566570853e-05,
1749
+ "loss": 0.039513933658599856,
1750
+ "mean_token_accuracy": 0.9844573020935059,
1751
+ "num_tokens": 32437640.0,
1752
+ "step": 1720
1753
+ },
1754
+ {
1755
+ "entropy": 0.8843017637729644,
1756
+ "epoch": 2.328398384925976,
1757
+ "grad_norm": 1.2545154094696045,
1758
+ "learning_rate": 9.115308293650653e-05,
1759
+ "loss": 0.036970189213752745,
1760
+ "mean_token_accuracy": 0.985239815711975,
1761
+ "num_tokens": 32625986.0,
1762
+ "step": 1730
1763
+ },
1764
+ {
1765
+ "entropy": 0.8913422048091888,
1766
+ "epoch": 2.34185733512786,
1767
+ "grad_norm": 0.9613803625106812,
1768
+ "learning_rate": 9.102627486853099e-05,
1769
+ "loss": 0.03811657428741455,
1770
+ "mean_token_accuracy": 0.9852804243564606,
1771
+ "num_tokens": 32814531.0,
1772
+ "step": 1740
1773
+ },
1774
+ {
1775
+ "entropy": 0.8907467782497406,
1776
+ "epoch": 2.3553162853297445,
1777
+ "grad_norm": 1.1811398267745972,
1778
+ "learning_rate": 9.089865397414614e-05,
1779
+ "loss": 0.03903660774230957,
1780
+ "mean_token_accuracy": 0.9842524945735931,
1781
+ "num_tokens": 33002960.0,
1782
+ "step": 1750
1783
+ },
1784
+ {
1785
+ "entropy": 0.8833664715290069,
1786
+ "epoch": 2.3687752355316287,
1787
+ "grad_norm": 0.8338477611541748,
1788
+ "learning_rate": 9.077022278182024e-05,
1789
+ "loss": 0.03982087969779968,
1790
+ "mean_token_accuracy": 0.9841201484203339,
1791
+ "num_tokens": 33191183.0,
1792
+ "step": 1760
1793
+ },
1794
+ {
1795
+ "entropy": 0.8783826470375061,
1796
+ "epoch": 2.382234185733513,
1797
+ "grad_norm": 0.6904510259628296,
1798
+ "learning_rate": 9.064098383607545e-05,
1799
+ "loss": 0.03699290752410889,
1800
+ "mean_token_accuracy": 0.9854120731353759,
1801
+ "num_tokens": 33379798.0,
1802
+ "step": 1770
1803
+ },
1804
+ {
1805
+ "entropy": 0.8871048510074615,
1806
+ "epoch": 2.3956931359353972,
1807
+ "grad_norm": 1.009539246559143,
1808
+ "learning_rate": 9.051093969743738e-05,
1809
+ "loss": 0.03926805555820465,
1810
+ "mean_token_accuracy": 0.9843246698379516,
1811
+ "num_tokens": 33568531.0,
1812
+ "step": 1780
1813
+ },
1814
+ {
1815
+ "entropy": 0.8973462700843811,
1816
+ "epoch": 2.409152086137281,
1817
+ "grad_norm": 0.8916401267051697,
1818
+ "learning_rate": 9.03800929423844e-05,
1819
+ "loss": 0.039596831798553465,
1820
+ "mean_token_accuracy": 0.9839228212833404,
1821
+ "num_tokens": 33756333.0,
1822
+ "step": 1790
1823
+ },
1824
+ {
1825
+ "entropy": 0.8954619467258453,
1826
+ "epoch": 2.4226110363391653,
1827
+ "grad_norm": 1.056504487991333,
1828
+ "learning_rate": 9.024844616329662e-05,
1829
+ "loss": 0.0396859347820282,
1830
+ "mean_token_accuracy": 0.9837161123752594,
1831
+ "num_tokens": 33944904.0,
1832
+ "step": 1800
1833
+ },
1834
+ {
1835
+ "entropy": 0.8952643811702728,
1836
+ "epoch": 2.4360699865410496,
1837
+ "grad_norm": 1.0442084074020386,
1838
+ "learning_rate": 9.011600196840447e-05,
1839
+ "loss": 0.037324142456054685,
1840
+ "mean_token_accuracy": 0.9852897703647614,
1841
+ "num_tokens": 34133530.0,
1842
+ "step": 1810
1843
+ },
1844
+ {
1845
+ "entropy": 0.8874911010265351,
1846
+ "epoch": 2.449528936742934,
1847
+ "grad_norm": 0.9926224946975708,
1848
+ "learning_rate": 8.998276298173707e-05,
1849
+ "loss": 0.03637495338916778,
1850
+ "mean_token_accuracy": 0.9853426337242126,
1851
+ "num_tokens": 34322042.0,
1852
+ "step": 1820
1853
+ },
1854
+ {
1855
+ "entropy": 0.8900584518909455,
1856
+ "epoch": 2.462987886944818,
1857
+ "grad_norm": 1.001734733581543,
1858
+ "learning_rate": 8.984873184307017e-05,
1859
+ "loss": 0.04029585719108582,
1860
+ "mean_token_accuracy": 0.9842372059822082,
1861
+ "num_tokens": 34510259.0,
1862
+ "step": 1830
1863
+ },
1864
+ {
1865
+ "entropy": 0.8986345052719116,
1866
+ "epoch": 2.4764468371467023,
1867
+ "grad_norm": 0.772941529750824,
1868
+ "learning_rate": 8.971391120787397e-05,
1869
+ "loss": 0.0401084691286087,
1870
+ "mean_token_accuracy": 0.9841427087783814,
1871
+ "num_tokens": 34698669.0,
1872
+ "step": 1840
1873
+ },
1874
+ {
1875
+ "entropy": 0.8911370873451233,
1876
+ "epoch": 2.4899057873485866,
1877
+ "grad_norm": 0.895031213760376,
1878
+ "learning_rate": 8.957830374726042e-05,
1879
+ "loss": 0.03941032290458679,
1880
+ "mean_token_accuracy": 0.9841966688632965,
1881
+ "num_tokens": 34887160.0,
1882
+ "step": 1850
1883
+ },
1884
+ {
1885
+ "entropy": 0.8901963472366333,
1886
+ "epoch": 2.503364737550471,
1887
+ "grad_norm": 1.0677918195724487,
1888
+ "learning_rate": 8.944191214793028e-05,
1889
+ "loss": 0.03503885865211487,
1890
+ "mean_token_accuracy": 0.9856241464614868,
1891
+ "num_tokens": 35075891.0,
1892
+ "step": 1860
1893
+ },
1894
+ {
1895
+ "entropy": 0.8817630231380462,
1896
+ "epoch": 2.516823687752355,
1897
+ "grad_norm": 1.1200164556503296,
1898
+ "learning_rate": 8.930473911212e-05,
1899
+ "loss": 0.03878425657749176,
1900
+ "mean_token_accuracy": 0.9846492648124695,
1901
+ "num_tokens": 35264455.0,
1902
+ "step": 1870
1903
+ },
1904
+ {
1905
+ "entropy": 0.8897477686405182,
1906
+ "epoch": 2.5302826379542394,
1907
+ "grad_norm": 0.8058190941810608,
1908
+ "learning_rate": 8.916678735754809e-05,
1909
+ "loss": 0.041558006405830385,
1910
+ "mean_token_accuracy": 0.9830883860588073,
1911
+ "num_tokens": 35453251.0,
1912
+ "step": 1880
1913
+ },
1914
+ {
1915
+ "entropy": 0.8873181045055389,
1916
+ "epoch": 2.5437415881561236,
1917
+ "grad_norm": 0.7765992283821106,
1918
+ "learning_rate": 8.902805961736123e-05,
1919
+ "loss": 0.03555050492286682,
1920
+ "mean_token_accuracy": 0.9858225345611572,
1921
+ "num_tokens": 35641959.0,
1922
+ "step": 1890
1923
+ },
1924
+ {
1925
+ "entropy": 0.8782606959342957,
1926
+ "epoch": 2.557200538358008,
1927
+ "grad_norm": 0.8488866686820984,
1928
+ "learning_rate": 8.88885586400803e-05,
1929
+ "loss": 0.03606923818588257,
1930
+ "mean_token_accuracy": 0.9858632445335388,
1931
+ "num_tokens": 35829636.0,
1932
+ "step": 1900
1933
+ },
1934
+ {
1935
+ "entropy": 0.8869808673858642,
1936
+ "epoch": 2.570659488559892,
1937
+ "grad_norm": 1.1734739542007446,
1938
+ "learning_rate": 8.874828718954576e-05,
1939
+ "loss": 0.040956351161003116,
1940
+ "mean_token_accuracy": 0.9835374176502227,
1941
+ "num_tokens": 36017818.0,
1942
+ "step": 1910
1943
+ },
1944
+ {
1945
+ "entropy": 0.8850924432277679,
1946
+ "epoch": 2.5841184387617764,
1947
+ "grad_norm": 1.3584387302398682,
1948
+ "learning_rate": 8.86072480448629e-05,
1949
+ "loss": 0.042437011003494264,
1950
+ "mean_token_accuracy": 0.9826479077339172,
1951
+ "num_tokens": 36206505.0,
1952
+ "step": 1920
1953
+ },
1954
+ {
1955
+ "entropy": 0.8801687896251679,
1956
+ "epoch": 2.5975773889636606,
1957
+ "grad_norm": 1.2862240076065063,
1958
+ "learning_rate": 8.84654440003469e-05,
1959
+ "loss": 0.03880060315132141,
1960
+ "mean_token_accuracy": 0.9843404710292816,
1961
+ "num_tokens": 36395802.0,
1962
+ "step": 1930
1963
+ },
1964
+ {
1965
+ "entropy": 0.8752781748771667,
1966
+ "epoch": 2.611036339165545,
1967
+ "grad_norm": 0.7341249585151672,
1968
+ "learning_rate": 8.83228778654674e-05,
1969
+ "loss": 0.03776344656944275,
1970
+ "mean_token_accuracy": 0.9852877616882324,
1971
+ "num_tokens": 36584370.0,
1972
+ "step": 1940
1973
+ },
1974
+ {
1975
+ "entropy": 0.8759494423866272,
1976
+ "epoch": 2.624495289367429,
1977
+ "grad_norm": 0.9807276129722595,
1978
+ "learning_rate": 8.817955246479276e-05,
1979
+ "loss": 0.03780297338962555,
1980
+ "mean_token_accuracy": 0.9853227376937866,
1981
+ "num_tokens": 36772865.0,
1982
+ "step": 1950
1983
+ },
1984
+ {
1985
+ "entropy": 0.8753129065036773,
1986
+ "epoch": 2.6379542395693134,
1987
+ "grad_norm": 1.1391628980636597,
1988
+ "learning_rate": 8.803547063793422e-05,
1989
+ "loss": 0.03953765034675598,
1990
+ "mean_token_accuracy": 0.9839197635650635,
1991
+ "num_tokens": 36961462.0,
1992
+ "step": 1960
1993
+ },
1994
+ {
1995
+ "entropy": 0.8771899223327637,
1996
+ "epoch": 2.6514131897711977,
1997
+ "grad_norm": 1.0865391492843628,
1998
+ "learning_rate": 8.789063523948958e-05,
1999
+ "loss": 0.03856399655342102,
2000
+ "mean_token_accuracy": 0.9848688066005706,
2001
+ "num_tokens": 37150486.0,
2002
+ "step": 1970
2003
+ },
2004
+ {
2005
+ "entropy": 0.8774881184101104,
2006
+ "epoch": 2.664872139973082,
2007
+ "grad_norm": 0.8561595678329468,
2008
+ "learning_rate": 8.774504913898663e-05,
2009
+ "loss": 0.03795175850391388,
2010
+ "mean_token_accuracy": 0.9849309325218201,
2011
+ "num_tokens": 37338890.0,
2012
+ "step": 1980
2013
+ },
2014
+ {
2015
+ "entropy": 0.8841517448425293,
2016
+ "epoch": 2.678331090174966,
2017
+ "grad_norm": 0.9458399415016174,
2018
+ "learning_rate": 8.75987152208264e-05,
2019
+ "loss": 0.03860213160514832,
2020
+ "mean_token_accuracy": 0.9842230796813964,
2021
+ "num_tokens": 37526886.0,
2022
+ "step": 1990
2023
+ },
2024
+ {
2025
+ "entropy": 0.8809043228626251,
2026
+ "epoch": 2.6917900403768504,
2027
+ "grad_norm": 0.9360871315002441,
2028
+ "learning_rate": 8.745163638422583e-05,
2029
+ "loss": 0.03709094822406769,
2030
+ "mean_token_accuracy": 0.9855118036270142,
2031
+ "num_tokens": 37715408.0,
2032
+ "step": 2000
2033
+ },
2034
+ {
2035
+ "entropy": 0.8805519282817841,
2036
+ "epoch": 2.7052489905787347,
2037
+ "grad_norm": 0.8852124214172363,
2038
+ "learning_rate": 8.730381554316051e-05,
2039
+ "loss": 0.03869341611862183,
2040
+ "mean_token_accuracy": 0.9846724212169647,
2041
+ "num_tokens": 37904328.0,
2042
+ "step": 2010
2043
+ },
2044
+ {
2045
+ "entropy": 0.8796228408813477,
2046
+ "epoch": 2.718707940780619,
2047
+ "grad_norm": 1.189297080039978,
2048
+ "learning_rate": 8.715525562630687e-05,
2049
+ "loss": 0.03675769567489624,
2050
+ "mean_token_accuracy": 0.9854080140590668,
2051
+ "num_tokens": 38092776.0,
2052
+ "step": 2020
2053
+ },
2054
+ {
2055
+ "entropy": 0.8722020089626312,
2056
+ "epoch": 2.732166890982503,
2057
+ "grad_norm": 0.7027947306632996,
2058
+ "learning_rate": 8.700595957698411e-05,
2059
+ "loss": 0.03889244794845581,
2060
+ "mean_token_accuracy": 0.984558242559433,
2061
+ "num_tokens": 38282221.0,
2062
+ "step": 2030
2063
+ },
2064
+ {
2065
+ "entropy": 0.8746632814407349,
2066
+ "epoch": 2.7456258411843875,
2067
+ "grad_norm": 0.9799376726150513,
2068
+ "learning_rate": 8.685593035309598e-05,
2069
+ "loss": 0.03815680146217346,
2070
+ "mean_token_accuracy": 0.9845012128353119,
2071
+ "num_tokens": 38470633.0,
2072
+ "step": 2040
2073
+ },
2074
+ {
2075
+ "entropy": 0.8766567528247833,
2076
+ "epoch": 2.7590847913862717,
2077
+ "grad_norm": 0.850459098815918,
2078
+ "learning_rate": 8.670517092707213e-05,
2079
+ "loss": 0.039029371738433835,
2080
+ "mean_token_accuracy": 0.9845906794071198,
2081
+ "num_tokens": 38659048.0,
2082
+ "step": 2050
2083
+ },
2084
+ {
2085
+ "entropy": 0.8778688013553619,
2086
+ "epoch": 2.772543741588156,
2087
+ "grad_norm": 0.8540134429931641,
2088
+ "learning_rate": 8.655368428580919e-05,
2089
+ "loss": 0.039645448327064514,
2090
+ "mean_token_accuracy": 0.9840566098690033,
2091
+ "num_tokens": 38847370.0,
2092
+ "step": 2060
2093
+ },
2094
+ {
2095
+ "entropy": 0.8758600771427154,
2096
+ "epoch": 2.7860026917900402,
2097
+ "grad_norm": 0.8036021590232849,
2098
+ "learning_rate": 8.640147343061165e-05,
2099
+ "loss": 0.038226932287216187,
2100
+ "mean_token_accuracy": 0.9845647215843201,
2101
+ "num_tokens": 39036040.0,
2102
+ "step": 2070
2103
+ },
2104
+ {
2105
+ "entropy": 0.8652268946170807,
2106
+ "epoch": 2.7994616419919245,
2107
+ "grad_norm": 0.8526738286018372,
2108
+ "learning_rate": 8.624854137713234e-05,
2109
+ "loss": 0.03798363208770752,
2110
+ "mean_token_accuracy": 0.9845202267169952,
2111
+ "num_tokens": 39224695.0,
2112
+ "step": 2080
2113
+ },
2114
+ {
2115
+ "entropy": 0.8707200348377228,
2116
+ "epoch": 2.8129205921938087,
2117
+ "grad_norm": 0.9852389097213745,
2118
+ "learning_rate": 8.609489115531278e-05,
2119
+ "loss": 0.037227436900138855,
2120
+ "mean_token_accuracy": 0.9852050960063934,
2121
+ "num_tokens": 39413360.0,
2122
+ "step": 2090
2123
+ },
2124
+ {
2125
+ "entropy": 0.8711447060108185,
2126
+ "epoch": 2.826379542395693,
2127
+ "grad_norm": 1.228277325630188,
2128
+ "learning_rate": 8.594052580932301e-05,
2129
+ "loss": 0.03698050379753113,
2130
+ "mean_token_accuracy": 0.9852443158626556,
2131
+ "num_tokens": 39602544.0,
2132
+ "step": 2100
2133
+ },
2134
+ {
2135
+ "entropy": 0.8742912471294403,
2136
+ "epoch": 2.8398384925975773,
2137
+ "grad_norm": 0.8440544605255127,
2138
+ "learning_rate": 8.578544839750141e-05,
2139
+ "loss": 0.03757670521736145,
2140
+ "mean_token_accuracy": 0.9848618268966675,
2141
+ "num_tokens": 39791340.0,
2142
+ "step": 2110
2143
+ },
2144
+ {
2145
+ "entropy": 0.8801726162433624,
2146
+ "epoch": 2.8532974427994615,
2147
+ "grad_norm": 1.0955252647399902,
2148
+ "learning_rate": 8.562966199229399e-05,
2149
+ "loss": 0.040004149079322815,
2150
+ "mean_token_accuracy": 0.9841345012187958,
2151
+ "num_tokens": 39980212.0,
2152
+ "step": 2120
2153
+ },
2154
+ {
2155
+ "entropy": 0.8931492984294891,
2156
+ "epoch": 2.8667563930013458,
2157
+ "grad_norm": 0.5722652673721313,
2158
+ "learning_rate": 8.547316968019363e-05,
2159
+ "loss": 0.038096648454666135,
2160
+ "mean_token_accuracy": 0.9849136590957641,
2161
+ "num_tokens": 40169122.0,
2162
+ "step": 2130
2163
+ },
2164
+ {
2165
+ "entropy": 0.8767493844032288,
2166
+ "epoch": 2.88021534320323,
2167
+ "grad_norm": 0.7663730382919312,
2168
+ "learning_rate": 8.531597456167885e-05,
2169
+ "loss": 0.03729096055030823,
2170
+ "mean_token_accuracy": 0.985169780254364,
2171
+ "num_tokens": 40357678.0,
2172
+ "step": 2140
2173
+ },
2174
+ {
2175
+ "entropy": 0.8770561516284943,
2176
+ "epoch": 2.8936742934051143,
2177
+ "grad_norm": 1.1505672931671143,
2178
+ "learning_rate": 8.515807975115239e-05,
2179
+ "loss": 0.03984796404838562,
2180
+ "mean_token_accuracy": 0.9838845670223236,
2181
+ "num_tokens": 40546892.0,
2182
+ "step": 2150
2183
+ },
2184
+ {
2185
+ "entropy": 0.8789129912853241,
2186
+ "epoch": 2.9071332436069985,
2187
+ "grad_norm": 0.8355761766433716,
2188
+ "learning_rate": 8.499948837687959e-05,
2189
+ "loss": 0.03769223690032959,
2190
+ "mean_token_accuracy": 0.9847559213638306,
2191
+ "num_tokens": 40736109.0,
2192
+ "step": 2160
2193
+ },
2194
+ {
2195
+ "entropy": 0.8781779289245606,
2196
+ "epoch": 2.920592193808883,
2197
+ "grad_norm": 0.9103918075561523,
2198
+ "learning_rate": 8.484020358092625e-05,
2199
+ "loss": 0.038262826204299924,
2200
+ "mean_token_accuracy": 0.984741848707199,
2201
+ "num_tokens": 40924341.0,
2202
+ "step": 2170
2203
+ },
2204
+ {
2205
+ "entropy": 0.8552807629108429,
2206
+ "epoch": 2.934051144010767,
2207
+ "grad_norm": 0.8338510990142822,
2208
+ "learning_rate": 8.468022851909657e-05,
2209
+ "loss": 0.0355743408203125,
2210
+ "mean_token_accuracy": 0.9855036079883576,
2211
+ "num_tokens": 41113849.0,
2212
+ "step": 2180
2213
+ },
2214
+ {
2215
+ "entropy": 0.8705591142177582,
2216
+ "epoch": 2.9475100942126513,
2217
+ "grad_norm": 0.8334057927131653,
2218
+ "learning_rate": 8.451956636087046e-05,
2219
+ "loss": 0.037476515769958495,
2220
+ "mean_token_accuracy": 0.9854551255702972,
2221
+ "num_tokens": 41303130.0,
2222
+ "step": 2190
2223
+ },
2224
+ {
2225
+ "entropy": 0.8809274792671203,
2226
+ "epoch": 2.9609690444145356,
2227
+ "grad_norm": 0.9424103498458862,
2228
+ "learning_rate": 8.435822028934087e-05,
2229
+ "loss": 0.03624279499053955,
2230
+ "mean_token_accuracy": 0.9856135487556458,
2231
+ "num_tokens": 41491750.0,
2232
+ "step": 2200
2233
+ },
2234
+ {
2235
+ "entropy": 0.8818133771419525,
2236
+ "epoch": 2.97442799461642,
2237
+ "grad_norm": 1.04243004322052,
2238
+ "learning_rate": 8.41961935011506e-05,
2239
+ "loss": 0.039132320880889894,
2240
+ "mean_token_accuracy": 0.984746390581131,
2241
+ "num_tokens": 41680493.0,
2242
+ "step": 2210
2243
+ },
2244
+ {
2245
+ "entropy": 0.8838395297527313,
2246
+ "epoch": 2.987886944818304,
2247
+ "grad_norm": 0.8650784492492676,
2248
+ "learning_rate": 8.403348920642911e-05,
2249
+ "loss": 0.03822052478790283,
2250
+ "mean_token_accuracy": 0.9852108955383301,
2251
+ "num_tokens": 41868378.0,
2252
+ "step": 2220
2253
+ },
2254
+ {
2255
+ "epoch": 3.0,
2256
+ "eval_entropy": 0.8800126549544608,
2257
+ "eval_loss": 0.05468416586518288,
2258
+ "eval_mean_token_accuracy": 0.9781015617832257,
2259
+ "eval_num_tokens": 42038037.0,
2260
+ "eval_runtime": 24.4825,
2261
+ "eval_samples_per_second": 204.228,
2262
+ "eval_steps_per_second": 6.413,
2263
+ "step": 2229
2264
+ },
2265
+ {
2266
+ "entropy": 0.8773060619831086,
2267
+ "epoch": 3.0013458950201883,
2268
+ "grad_norm": 0.7845072150230408,
2269
+ "learning_rate": 8.387011062872883e-05,
2270
+ "loss": 0.0365300714969635,
2271
+ "mean_token_accuracy": 0.985853761434555,
2272
+ "num_tokens": 42056717.0,
2273
+ "step": 2230
2274
+ },
2275
+ {
2276
+ "entropy": 0.8720550239086151,
2277
+ "epoch": 3.0148048452220726,
2278
+ "grad_norm": 0.9917275905609131,
2279
+ "learning_rate": 8.370606100496128e-05,
2280
+ "loss": 0.030071893334388734,
2281
+ "mean_token_accuracy": 0.9890478909015655,
2282
+ "num_tokens": 42245058.0,
2283
+ "step": 2240
2284
+ },
2285
+ {
2286
+ "entropy": 0.8489451467990875,
2287
+ "epoch": 3.028263795423957,
2288
+ "grad_norm": 0.9414294362068176,
2289
+ "learning_rate": 8.354134358533301e-05,
2290
+ "loss": 0.027054613828659056,
2291
+ "mean_token_accuracy": 0.9894806027412415,
2292
+ "num_tokens": 42433468.0,
2293
+ "step": 2250
2294
+ },
2295
+ {
2296
+ "entropy": 0.848755830526352,
2297
+ "epoch": 3.041722745625841,
2298
+ "grad_norm": 0.7276235818862915,
2299
+ "learning_rate": 8.337596163328114e-05,
2300
+ "loss": 0.026897600293159483,
2301
+ "mean_token_accuracy": 0.9893308103084564,
2302
+ "num_tokens": 42621657.0,
2303
+ "step": 2260
2304
+ },
2305
+ {
2306
+ "entropy": 0.8402929604053497,
2307
+ "epoch": 3.0551816958277254,
2308
+ "grad_norm": 0.9455838799476624,
2309
+ "learning_rate": 8.320991842540875e-05,
2310
+ "loss": 0.027918827533721925,
2311
+ "mean_token_accuracy": 0.9888385117053986,
2312
+ "num_tokens": 42810533.0,
2313
+ "step": 2270
2314
+ },
2315
+ {
2316
+ "entropy": 0.8434606611728668,
2317
+ "epoch": 3.0686406460296096,
2318
+ "grad_norm": 0.7503429651260376,
2319
+ "learning_rate": 8.304321725141995e-05,
2320
+ "loss": 0.025769710540771484,
2321
+ "mean_token_accuracy": 0.989881956577301,
2322
+ "num_tokens": 42999085.0,
2323
+ "step": 2280
2324
+ },
2325
+ {
2326
+ "entropy": 0.8459890782833099,
2327
+ "epoch": 3.082099596231494,
2328
+ "grad_norm": 0.736440896987915,
2329
+ "learning_rate": 8.287586141405464e-05,
2330
+ "loss": 0.028781753778457642,
2331
+ "mean_token_accuracy": 0.9882656812667847,
2332
+ "num_tokens": 43188222.0,
2333
+ "step": 2290
2334
+ },
2335
+ {
2336
+ "entropy": 0.8493306159973144,
2337
+ "epoch": 3.095558546433378,
2338
+ "grad_norm": 0.9577816128730774,
2339
+ "learning_rate": 8.27078542290232e-05,
2340
+ "loss": 0.02689719796180725,
2341
+ "mean_token_accuracy": 0.9889744818210602,
2342
+ "num_tokens": 43376648.0,
2343
+ "step": 2300
2344
+ },
2345
+ {
2346
+ "entropy": 0.8512665331363678,
2347
+ "epoch": 3.1090174966352624,
2348
+ "grad_norm": 0.9631588459014893,
2349
+ "learning_rate": 8.253919902494071e-05,
2350
+ "loss": 0.031439167261123654,
2351
+ "mean_token_accuracy": 0.9874942421913147,
2352
+ "num_tokens": 43564769.0,
2353
+ "step": 2310
2354
+ },
2355
+ {
2356
+ "entropy": 0.8519513130187988,
2357
+ "epoch": 3.1224764468371466,
2358
+ "grad_norm": 1.1011922359466553,
2359
+ "learning_rate": 8.236989914326101e-05,
2360
+ "loss": 0.02696958780288696,
2361
+ "mean_token_accuracy": 0.9900951981544495,
2362
+ "num_tokens": 43753362.0,
2363
+ "step": 2320
2364
+ },
2365
+ {
2366
+ "entropy": 0.8507618844509125,
2367
+ "epoch": 3.135935397039031,
2368
+ "grad_norm": 0.9351623058319092,
2369
+ "learning_rate": 8.21999579382105e-05,
2370
+ "loss": 0.026315575838088988,
2371
+ "mean_token_accuracy": 0.989511913061142,
2372
+ "num_tokens": 43942063.0,
2373
+ "step": 2330
2374
+ },
2375
+ {
2376
+ "entropy": 0.8506478369235992,
2377
+ "epoch": 3.149394347240915,
2378
+ "grad_norm": 1.096564769744873,
2379
+ "learning_rate": 8.202937877672175e-05,
2380
+ "loss": 0.029049152135849,
2381
+ "mean_token_accuracy": 0.9886228442192078,
2382
+ "num_tokens": 44130381.0,
2383
+ "step": 2340
2384
+ },
2385
+ {
2386
+ "entropy": 0.8624941289424897,
2387
+ "epoch": 3.1628532974427994,
2388
+ "grad_norm": 0.7859402298927307,
2389
+ "learning_rate": 8.185816503836665e-05,
2390
+ "loss": 0.027744990587234498,
2391
+ "mean_token_accuracy": 0.9890527307987214,
2392
+ "num_tokens": 44319243.0,
2393
+ "step": 2350
2394
+ },
2395
+ {
2396
+ "entropy": 0.8627866744995117,
2397
+ "epoch": 3.1763122476446837,
2398
+ "grad_norm": 0.7764605283737183,
2399
+ "learning_rate": 8.168632011528961e-05,
2400
+ "loss": 0.02571650445461273,
2401
+ "mean_token_accuracy": 0.9903442323207855,
2402
+ "num_tokens": 44507698.0,
2403
+ "step": 2360
2404
+ },
2405
+ {
2406
+ "entropy": 0.8566473722457886,
2407
+ "epoch": 3.189771197846568,
2408
+ "grad_norm": 0.8182396292686462,
2409
+ "learning_rate": 8.15138474121403e-05,
2410
+ "loss": 0.025625643134117127,
2411
+ "mean_token_accuracy": 0.9897150516510009,
2412
+ "num_tokens": 44695954.0,
2413
+ "step": 2370
2414
+ },
2415
+ {
2416
+ "entropy": 0.8555088877677918,
2417
+ "epoch": 3.203230148048452,
2418
+ "grad_norm": 1.0277098417282104,
2419
+ "learning_rate": 8.134075034600609e-05,
2420
+ "loss": 0.028235083818435668,
2421
+ "mean_token_accuracy": 0.9894341349601745,
2422
+ "num_tokens": 44884558.0,
2423
+ "step": 2380
2424
+ },
2425
+ {
2426
+ "entropy": 0.8562702238559723,
2427
+ "epoch": 3.2166890982503364,
2428
+ "grad_norm": 1.0676493644714355,
2429
+ "learning_rate": 8.116703234634453e-05,
2430
+ "loss": 0.027713948488235475,
2431
+ "mean_token_accuracy": 0.9891053259372711,
2432
+ "num_tokens": 45073381.0,
2433
+ "step": 2390
2434
+ },
2435
+ {
2436
+ "entropy": 0.8520576059818268,
2437
+ "epoch": 3.2301480484522207,
2438
+ "grad_norm": 0.7926117181777954,
2439
+ "learning_rate": 8.099269685491528e-05,
2440
+ "loss": 0.028936928510665892,
2441
+ "mean_token_accuracy": 0.988807988166809,
2442
+ "num_tokens": 45262350.0,
2443
+ "step": 2400
2444
+ },
2445
+ {
2446
+ "entropy": 0.860496312379837,
2447
+ "epoch": 3.243606998654105,
2448
+ "grad_norm": 1.1108510494232178,
2449
+ "learning_rate": 8.081774732571196e-05,
2450
+ "loss": 0.030533093214035033,
2451
+ "mean_token_accuracy": 0.9882748067378998,
2452
+ "num_tokens": 45450782.0,
2453
+ "step": 2410
2454
+ },
2455
+ {
2456
+ "entropy": 0.872324675321579,
2457
+ "epoch": 3.257065948855989,
2458
+ "grad_norm": 1.3057194948196411,
2459
+ "learning_rate": 8.06421872248937e-05,
2460
+ "loss": 0.032421305775642395,
2461
+ "mean_token_accuracy": 0.9879753351211548,
2462
+ "num_tokens": 45639122.0,
2463
+ "step": 2420
2464
+ },
2465
+ {
2466
+ "entropy": 0.8778823137283325,
2467
+ "epoch": 3.2705248990578735,
2468
+ "grad_norm": 1.289680004119873,
2469
+ "learning_rate": 8.046602003071648e-05,
2470
+ "loss": 0.03096393346786499,
2471
+ "mean_token_accuracy": 0.9883370697498322,
2472
+ "num_tokens": 45827730.0,
2473
+ "step": 2430
2474
+ },
2475
+ {
2476
+ "entropy": 0.8627406179904937,
2477
+ "epoch": 3.2839838492597577,
2478
+ "grad_norm": 0.8361398577690125,
2479
+ "learning_rate": 8.028924923346426e-05,
2480
+ "loss": 0.02957139313220978,
2481
+ "mean_token_accuracy": 0.9884674072265625,
2482
+ "num_tokens": 46016274.0,
2483
+ "step": 2440
2484
+ },
2485
+ {
2486
+ "entropy": 0.8584554970264435,
2487
+ "epoch": 3.297442799461642,
2488
+ "grad_norm": 0.9330723881721497,
2489
+ "learning_rate": 8.011187833537972e-05,
2490
+ "loss": 0.029568272829055785,
2491
+ "mean_token_accuracy": 0.9882686555385589,
2492
+ "num_tokens": 46204679.0,
2493
+ "step": 2450
2494
+ },
2495
+ {
2496
+ "entropy": 0.8515178024768829,
2497
+ "epoch": 3.3109017496635262,
2498
+ "grad_norm": 1.0296216011047363,
2499
+ "learning_rate": 7.993391085059502e-05,
2500
+ "loss": 0.027788180112838744,
2501
+ "mean_token_accuracy": 0.9888734877109527,
2502
+ "num_tokens": 46393989.0,
2503
+ "step": 2460
2504
+ },
2505
+ {
2506
+ "entropy": 0.8488850116729736,
2507
+ "epoch": 3.3243606998654105,
2508
+ "grad_norm": 1.1329699754714966,
2509
+ "learning_rate": 7.975535030506203e-05,
2510
+ "loss": 0.028057461977005003,
2511
+ "mean_token_accuracy": 0.9895590543746948,
2512
+ "num_tokens": 46582790.0,
2513
+ "step": 2470
2514
+ },
2515
+ {
2516
+ "entropy": 0.849625188112259,
2517
+ "epoch": 3.3378196500672948,
2518
+ "grad_norm": 0.9267219305038452,
2519
+ "learning_rate": 7.957620023648256e-05,
2520
+ "loss": 0.02802973985671997,
2521
+ "mean_token_accuracy": 0.988851535320282,
2522
+ "num_tokens": 46771138.0,
2523
+ "step": 2480
2524
+ },
2525
+ {
2526
+ "entropy": 0.854446280002594,
2527
+ "epoch": 3.351278600269179,
2528
+ "grad_norm": 0.8713932037353516,
2529
+ "learning_rate": 7.939646419423826e-05,
2530
+ "loss": 0.030220019817352294,
2531
+ "mean_token_accuracy": 0.9882843673229218,
2532
+ "num_tokens": 46960054.0,
2533
+ "step": 2490
2534
+ },
2535
+ {
2536
+ "entropy": 0.8599806547164917,
2537
+ "epoch": 3.3647375504710633,
2538
+ "grad_norm": 0.675019383430481,
2539
+ "learning_rate": 7.92161457393203e-05,
2540
+ "loss": 0.02958865463733673,
2541
+ "mean_token_accuracy": 0.9880296885967255,
2542
+ "num_tokens": 47149567.0,
2543
+ "step": 2500
2544
+ },
2545
+ {
2546
+ "entropy": 0.8549504458904267,
2547
+ "epoch": 3.3781965006729475,
2548
+ "grad_norm": 1.2176353931427002,
2549
+ "learning_rate": 7.903524844425878e-05,
2550
+ "loss": 0.027722162008285523,
2551
+ "mean_token_accuracy": 0.9890090465545655,
2552
+ "num_tokens": 47338060.0,
2553
+ "step": 2510
2554
+ },
2555
+ {
2556
+ "entropy": 0.8552233815193176,
2557
+ "epoch": 3.391655450874832,
2558
+ "grad_norm": 0.9571717977523804,
2559
+ "learning_rate": 7.885377589305197e-05,
2560
+ "loss": 0.028924965858459474,
2561
+ "mean_token_accuracy": 0.9884298503398895,
2562
+ "num_tokens": 47525895.0,
2563
+ "step": 2520
2564
+ },
2565
+ {
2566
+ "entropy": 0.8521630108356476,
2567
+ "epoch": 3.405114401076716,
2568
+ "grad_norm": 0.965713620185852,
2569
+ "learning_rate": 7.867173168109534e-05,
2570
+ "loss": 0.030511701107025148,
2571
+ "mean_token_accuracy": 0.9881398677825928,
2572
+ "num_tokens": 47714524.0,
2573
+ "step": 2530
2574
+ },
2575
+ {
2576
+ "entropy": 0.8537400126457214,
2577
+ "epoch": 3.4185733512786003,
2578
+ "grad_norm": 0.9005307555198669,
2579
+ "learning_rate": 7.84891194151103e-05,
2580
+ "loss": 0.030432754755020143,
2581
+ "mean_token_accuracy": 0.9876708447933197,
2582
+ "num_tokens": 47903188.0,
2583
+ "step": 2540
2584
+ },
2585
+ {
2586
+ "entropy": 0.8596058666706086,
2587
+ "epoch": 3.4320323014804845,
2588
+ "grad_norm": 0.8159075379371643,
2589
+ "learning_rate": 7.830594271307267e-05,
2590
+ "loss": 0.027927806973457335,
2591
+ "mean_token_accuracy": 0.9889210939407349,
2592
+ "num_tokens": 48091334.0,
2593
+ "step": 2550
2594
+ },
2595
+ {
2596
+ "entropy": 0.8482735753059387,
2597
+ "epoch": 3.445491251682369,
2598
+ "grad_norm": 1.0411661863327026,
2599
+ "learning_rate": 7.812220520414115e-05,
2600
+ "loss": 0.03003154993057251,
2601
+ "mean_token_accuracy": 0.9882730603218078,
2602
+ "num_tokens": 48279622.0,
2603
+ "step": 2560
2604
+ },
2605
+ {
2606
+ "entropy": 0.8466426789760589,
2607
+ "epoch": 3.458950201884253,
2608
+ "grad_norm": 1.189595341682434,
2609
+ "learning_rate": 7.793791052858528e-05,
2610
+ "loss": 0.03237656056880951,
2611
+ "mean_token_accuracy": 0.9869072616100312,
2612
+ "num_tokens": 48467527.0,
2613
+ "step": 2570
2614
+ },
2615
+ {
2616
+ "entropy": 0.8545588493347168,
2617
+ "epoch": 3.4724091520861373,
2618
+ "grad_norm": 1.0700056552886963,
2619
+ "learning_rate": 7.775306233771343e-05,
2620
+ "loss": 0.0306060791015625,
2621
+ "mean_token_accuracy": 0.9880664825439454,
2622
+ "num_tokens": 48656151.0,
2623
+ "step": 2580
2624
+ },
2625
+ {
2626
+ "entropy": 0.8503433585166931,
2627
+ "epoch": 3.4858681022880216,
2628
+ "grad_norm": 0.7099478244781494,
2629
+ "learning_rate": 7.756766429380033e-05,
2630
+ "loss": 0.02984013557434082,
2631
+ "mean_token_accuracy": 0.9879442811012268,
2632
+ "num_tokens": 48844796.0,
2633
+ "step": 2590
2634
+ },
2635
+ {
2636
+ "entropy": 0.8464376091957092,
2637
+ "epoch": 3.499327052489906,
2638
+ "grad_norm": 0.9877128005027771,
2639
+ "learning_rate": 7.738172007001465e-05,
2640
+ "loss": 0.0306446373462677,
2641
+ "mean_token_accuracy": 0.9883591532707214,
2642
+ "num_tokens": 49033216.0,
2643
+ "step": 2600
2644
+ },
2645
+ {
2646
+ "entropy": 0.8477609276771545,
2647
+ "epoch": 3.51278600269179,
2648
+ "grad_norm": 1.3324531316757202,
2649
+ "learning_rate": 7.719523335034612e-05,
2650
+ "loss": 0.030148410797119142,
2651
+ "mean_token_accuracy": 0.9879989802837372,
2652
+ "num_tokens": 49221425.0,
2653
+ "step": 2610
2654
+ },
2655
+ {
2656
+ "entropy": 0.8461863219738006,
2657
+ "epoch": 3.5262449528936743,
2658
+ "grad_norm": 0.8938570022583008,
2659
+ "learning_rate": 7.70082078295326e-05,
2660
+ "loss": 0.02682304084300995,
2661
+ "mean_token_accuracy": 0.9897881150245667,
2662
+ "num_tokens": 49409785.0,
2663
+ "step": 2620
2664
+ },
2665
+ {
2666
+ "entropy": 0.8362689018249512,
2667
+ "epoch": 3.5397039030955586,
2668
+ "grad_norm": 0.7423873543739319,
2669
+ "learning_rate": 7.682064721298683e-05,
2670
+ "loss": 0.02689332664012909,
2671
+ "mean_token_accuracy": 0.9899903059005737,
2672
+ "num_tokens": 49598524.0,
2673
+ "step": 2630
2674
+ },
2675
+ {
2676
+ "entropy": 0.8395139217376709,
2677
+ "epoch": 3.553162853297443,
2678
+ "grad_norm": 0.921755850315094,
2679
+ "learning_rate": 7.663255521672308e-05,
2680
+ "loss": 0.030062124133110046,
2681
+ "mean_token_accuracy": 0.9880835592746735,
2682
+ "num_tokens": 49787241.0,
2683
+ "step": 2640
2684
+ },
2685
+ {
2686
+ "entropy": 0.8467226803302765,
2687
+ "epoch": 3.566621803499327,
2688
+ "grad_norm": 1.0296459197998047,
2689
+ "learning_rate": 7.64439355672835e-05,
2690
+ "loss": 0.030666223168373107,
2691
+ "mean_token_accuracy": 0.9877098739147187,
2692
+ "num_tokens": 49975844.0,
2693
+ "step": 2650
2694
+ },
2695
+ {
2696
+ "entropy": 0.833392071723938,
2697
+ "epoch": 3.5800807537012114,
2698
+ "grad_norm": 1.2861948013305664,
2699
+ "learning_rate": 7.625479200166425e-05,
2700
+ "loss": 0.028174835443496703,
2701
+ "mean_token_accuracy": 0.989056545495987,
2702
+ "num_tokens": 50164857.0,
2703
+ "step": 2660
2704
+ },
2705
+ {
2706
+ "entropy": 0.8340730309486389,
2707
+ "epoch": 3.5935397039030956,
2708
+ "grad_norm": 0.8116776347160339,
2709
+ "learning_rate": 7.606512826724155e-05,
2710
+ "loss": 0.028390336036682128,
2711
+ "mean_token_accuracy": 0.9886897742748261,
2712
+ "num_tokens": 50353452.0,
2713
+ "step": 2670
2714
+ },
2715
+ {
2716
+ "entropy": 0.8357116639614105,
2717
+ "epoch": 3.60699865410498,
2718
+ "grad_norm": 0.8362084031105042,
2719
+ "learning_rate": 7.587494812169728e-05,
2720
+ "loss": 0.029806244373321533,
2721
+ "mean_token_accuracy": 0.988309508562088,
2722
+ "num_tokens": 50542461.0,
2723
+ "step": 2680
2724
+ },
2725
+ {
2726
+ "entropy": 0.8363270878791809,
2727
+ "epoch": 3.620457604306864,
2728
+ "grad_norm": 0.8438499569892883,
2729
+ "learning_rate": 7.568425533294476e-05,
2730
+ "loss": 0.028577303886413573,
2731
+ "mean_token_accuracy": 0.9886945366859436,
2732
+ "num_tokens": 50730872.0,
2733
+ "step": 2690
2734
+ },
2735
+ {
2736
+ "entropy": 0.8310261785984039,
2737
+ "epoch": 3.6339165545087484,
2738
+ "grad_norm": 1.1920002698898315,
2739
+ "learning_rate": 7.549305367905385e-05,
2740
+ "loss": 0.030321845412254335,
2741
+ "mean_token_accuracy": 0.9878708899021149,
2742
+ "num_tokens": 50919889.0,
2743
+ "step": 2700
2744
+ },
2745
+ {
2746
+ "entropy": 0.8380252480506897,
2747
+ "epoch": 3.6473755047106327,
2748
+ "grad_norm": 0.9409399032592773,
2749
+ "learning_rate": 7.53013469481763e-05,
2750
+ "loss": 0.030501589179039,
2751
+ "mean_token_accuracy": 0.9872083842754364,
2752
+ "num_tokens": 51108079.0,
2753
+ "step": 2710
2754
+ },
2755
+ {
2756
+ "entropy": 0.8422612130641938,
2757
+ "epoch": 3.660834454912517,
2758
+ "grad_norm": 0.8370440006256104,
2759
+ "learning_rate": 7.510913893847058e-05,
2760
+ "loss": 0.02974823713302612,
2761
+ "mean_token_accuracy": 0.9885554671287536,
2762
+ "num_tokens": 51296913.0,
2763
+ "step": 2720
2764
+ },
2765
+ {
2766
+ "entropy": 0.8387595832347869,
2767
+ "epoch": 3.674293405114401,
2768
+ "grad_norm": 1.1360642910003662,
2769
+ "learning_rate": 7.491643345802667e-05,
2770
+ "loss": 0.026970452070236205,
2771
+ "mean_token_accuracy": 0.9888331353664398,
2772
+ "num_tokens": 51485678.0,
2773
+ "step": 2730
2774
+ },
2775
+ {
2776
+ "entropy": 0.8403031289577484,
2777
+ "epoch": 3.6877523553162854,
2778
+ "grad_norm": 1.1195677518844604,
2779
+ "learning_rate": 7.472323432479062e-05,
2780
+ "loss": 0.029451662302017213,
2781
+ "mean_token_accuracy": 0.988847154378891,
2782
+ "num_tokens": 51674765.0,
2783
+ "step": 2740
2784
+ },
2785
+ {
2786
+ "entropy": 0.8499106168746948,
2787
+ "epoch": 3.7012113055181697,
2788
+ "grad_norm": 0.9573526978492737,
2789
+ "learning_rate": 7.452954536648888e-05,
2790
+ "loss": 0.030826866626739502,
2791
+ "mean_token_accuracy": 0.9875578820705414,
2792
+ "num_tokens": 51863924.0,
2793
+ "step": 2750
2794
+ },
2795
+ {
2796
+ "entropy": 0.8576709866523743,
2797
+ "epoch": 3.714670255720054,
2798
+ "grad_norm": 0.9386286735534668,
2799
+ "learning_rate": 7.433537042055248e-05,
2800
+ "loss": 0.02946900427341461,
2801
+ "mean_token_accuracy": 0.9885939955711365,
2802
+ "num_tokens": 52052011.0,
2803
+ "step": 2760
2804
+ },
2805
+ {
2806
+ "entropy": 0.8404816091060638,
2807
+ "epoch": 3.728129205921938,
2808
+ "grad_norm": 1.095536708831787,
2809
+ "learning_rate": 7.414071333404104e-05,
2810
+ "loss": 0.030070826411247253,
2811
+ "mean_token_accuracy": 0.9881400763988495,
2812
+ "num_tokens": 52240955.0,
2813
+ "step": 2770
2814
+ },
2815
+ {
2816
+ "entropy": 0.8346689164638519,
2817
+ "epoch": 3.7415881561238225,
2818
+ "grad_norm": 0.9964426159858704,
2819
+ "learning_rate": 7.394557796356644e-05,
2820
+ "loss": 0.03005717396736145,
2821
+ "mean_token_accuracy": 0.9883057296276092,
2822
+ "num_tokens": 52429544.0,
2823
+ "step": 2780
2824
+ },
2825
+ {
2826
+ "entropy": 0.8377436757087707,
2827
+ "epoch": 3.7550471063257067,
2828
+ "grad_norm": 0.8499778509140015,
2829
+ "learning_rate": 7.374996817521653e-05,
2830
+ "loss": 0.029283815622329713,
2831
+ "mean_token_accuracy": 0.9882387280464172,
2832
+ "num_tokens": 52618405.0,
2833
+ "step": 2790
2834
+ },
2835
+ {
2836
+ "entropy": 0.8328788638114929,
2837
+ "epoch": 3.768506056527591,
2838
+ "grad_norm": 0.9594390988349915,
2839
+ "learning_rate": 7.35538878444785e-05,
2840
+ "loss": 0.02666555345058441,
2841
+ "mean_token_accuracy": 0.9889323055744171,
2842
+ "num_tokens": 52806689.0,
2843
+ "step": 2800
2844
+ },
2845
+ {
2846
+ "entropy": 0.829415249824524,
2847
+ "epoch": 3.781965006729475,
2848
+ "grad_norm": 1.3587099313735962,
2849
+ "learning_rate": 7.335734085616206e-05,
2850
+ "loss": 0.028349629044532774,
2851
+ "mean_token_accuracy": 0.9890907645225525,
2852
+ "num_tokens": 52995248.0,
2853
+ "step": 2810
2854
+ },
2855
+ {
2856
+ "entropy": 0.8335795283317566,
2857
+ "epoch": 3.7954239569313595,
2858
+ "grad_norm": 1.077789306640625,
2859
+ "learning_rate": 7.316033110432249e-05,
2860
+ "loss": 0.03001731038093567,
2861
+ "mean_token_accuracy": 0.9878358781337738,
2862
+ "num_tokens": 53183607.0,
2863
+ "step": 2820
2864
+ },
2865
+ {
2866
+ "entropy": 0.8316770553588867,
2867
+ "epoch": 3.8088829071332437,
2868
+ "grad_norm": 0.7265951037406921,
2869
+ "learning_rate": 7.296286249218352e-05,
2870
+ "loss": 0.030136841535568237,
2871
+ "mean_token_accuracy": 0.9880174934864044,
2872
+ "num_tokens": 53372735.0,
2873
+ "step": 2830
2874
+ },
2875
+ {
2876
+ "entropy": 0.8310399711132049,
2877
+ "epoch": 3.822341857335128,
2878
+ "grad_norm": 0.6605978012084961,
2879
+ "learning_rate": 7.276493893205995e-05,
2880
+ "loss": 0.026665833592414857,
2881
+ "mean_token_accuracy": 0.9892704486846924,
2882
+ "num_tokens": 53561776.0,
2883
+ "step": 2840
2884
+ },
2885
+ {
2886
+ "entropy": 0.8258654296398162,
2887
+ "epoch": 3.8358008075370122,
2888
+ "grad_norm": 0.6560797691345215,
2889
+ "learning_rate": 7.256656434528018e-05,
2890
+ "loss": 0.026718440651893615,
2891
+ "mean_token_accuracy": 0.989847868680954,
2892
+ "num_tokens": 53750019.0,
2893
+ "step": 2850
2894
+ },
2895
+ {
2896
+ "entropy": 0.8221413195133209,
2897
+ "epoch": 3.8492597577388965,
2898
+ "grad_norm": 0.7495356798171997,
2899
+ "learning_rate": 7.236774266210852e-05,
2900
+ "loss": 0.027431756258010864,
2901
+ "mean_token_accuracy": 0.9892430484294892,
2902
+ "num_tokens": 53938914.0,
2903
+ "step": 2860
2904
+ },
2905
+ {
2906
+ "entropy": 0.8341928541660308,
2907
+ "epoch": 3.8627187079407808,
2908
+ "grad_norm": 1.0676971673965454,
2909
+ "learning_rate": 7.216847782166727e-05,
2910
+ "loss": 0.028432759642601012,
2911
+ "mean_token_accuracy": 0.9889024317264556,
2912
+ "num_tokens": 54127763.0,
2913
+ "step": 2870
2914
+ },
2915
+ {
2916
+ "entropy": 0.8359149813652038,
2917
+ "epoch": 3.876177658142665,
2918
+ "grad_norm": 1.380476713180542,
2919
+ "learning_rate": 7.196877377185872e-05,
2920
+ "loss": 0.028149163722991942,
2921
+ "mean_token_accuracy": 0.9884953200817108,
2922
+ "num_tokens": 54315930.0,
2923
+ "step": 2880
2924
+ },
2925
+ {
2926
+ "entropy": 0.8294597804546356,
2927
+ "epoch": 3.8896366083445493,
2928
+ "grad_norm": 1.0728843212127686,
2929
+ "learning_rate": 7.176863446928694e-05,
2930
+ "loss": 0.028733691573143004,
2931
+ "mean_token_accuracy": 0.9884523034095765,
2932
+ "num_tokens": 54504632.0,
2933
+ "step": 2890
2934
+ },
2935
+ {
2936
+ "entropy": 0.8245523810386658,
2937
+ "epoch": 3.9030955585464335,
2938
+ "grad_norm": 0.9723307490348816,
2939
+ "learning_rate": 7.156806387917937e-05,
2940
+ "loss": 0.02793261408805847,
2941
+ "mean_token_accuracy": 0.9891038477420807,
2942
+ "num_tokens": 54693752.0,
2943
+ "step": 2900
2944
+ },
2945
+ {
2946
+ "entropy": 0.8279620766639709,
2947
+ "epoch": 3.916554508748318,
2948
+ "grad_norm": 1.003624439239502,
2949
+ "learning_rate": 7.136706597530825e-05,
2950
+ "loss": 0.028367668390274048,
2951
+ "mean_token_accuracy": 0.9893603324890137,
2952
+ "num_tokens": 54882030.0,
2953
+ "step": 2910
2954
+ },
2955
+ {
2956
+ "entropy": 0.8306584179401397,
2957
+ "epoch": 3.930013458950202,
2958
+ "grad_norm": 0.8481361865997314,
2959
+ "learning_rate": 7.116564473991192e-05,
2960
+ "loss": 0.029952478408813477,
2961
+ "mean_token_accuracy": 0.9881332099437714,
2962
+ "num_tokens": 55070549.0,
2963
+ "step": 2920
2964
+ },
2965
+ {
2966
+ "entropy": 0.8316469848155975,
2967
+ "epoch": 3.9434724091520863,
2968
+ "grad_norm": 0.7631049156188965,
2969
+ "learning_rate": 7.096380416361588e-05,
2970
+ "loss": 0.029351180791854857,
2971
+ "mean_token_accuracy": 0.9888184070587158,
2972
+ "num_tokens": 55258999.0,
2973
+ "step": 2930
2974
+ },
2975
+ {
2976
+ "entropy": 0.8221287310123444,
2977
+ "epoch": 3.9569313593539706,
2978
+ "grad_norm": 0.7234840393066406,
2979
+ "learning_rate": 7.076154824535381e-05,
2980
+ "loss": 0.027709537744522096,
2981
+ "mean_token_accuracy": 0.9890308439731598,
2982
+ "num_tokens": 55448047.0,
2983
+ "step": 2940
2984
+ },
2985
+ {
2986
+ "entropy": 0.8239192545413971,
2987
+ "epoch": 3.970390309555855,
2988
+ "grad_norm": 0.8497020602226257,
2989
+ "learning_rate": 7.055888099228825e-05,
2990
+ "loss": 0.029604345560073853,
2991
+ "mean_token_accuracy": 0.9881375730037689,
2992
+ "num_tokens": 55636566.0,
2993
+ "step": 2950
2994
+ },
2995
+ {
2996
+ "entropy": 0.8319927275180816,
2997
+ "epoch": 3.983849259757739,
2998
+ "grad_norm": 1.1599552631378174,
2999
+ "learning_rate": 7.035580641973119e-05,
3000
+ "loss": 0.02763860821723938,
3001
+ "mean_token_accuracy": 0.9891406774520874,
3002
+ "num_tokens": 55825049.0,
3003
+ "step": 2960
3004
+ },
3005
+ {
3006
+ "entropy": 0.8404937744140625,
3007
+ "epoch": 3.9973082099596233,
3008
+ "grad_norm": 0.906755805015564,
3009
+ "learning_rate": 7.015232855106468e-05,
3010
+ "loss": 0.029578033089637756,
3011
+ "mean_token_accuracy": 0.9878996431827545,
3012
+ "num_tokens": 56012965.0,
3013
+ "step": 2970
3014
+ },
3015
+ {
3016
+ "epoch": 4.0,
3017
+ "eval_entropy": 0.8419649528849656,
3018
+ "eval_loss": 0.0570383295416832,
3019
+ "eval_mean_token_accuracy": 0.9787763862093543,
3020
+ "eval_num_tokens": 56050518.0,
3021
+ "eval_runtime": 24.4889,
3022
+ "eval_samples_per_second": 204.174,
3023
+ "eval_steps_per_second": 6.411,
3024
+ "step": 2972
3025
+ }
3026
+ ],
3027
+ "logging_steps": 10,
3028
+ "max_steps": 7430,
3029
+ "num_input_tokens_seen": 0,
3030
+ "num_train_epochs": 10,
3031
+ "save_steps": 500,
3032
+ "stateful_callbacks": {
3033
+ "TrainerControl": {
3034
+ "args": {
3035
+ "should_epoch_stop": false,
3036
+ "should_evaluate": false,
3037
+ "should_log": false,
3038
+ "should_save": true,
3039
+ "should_training_stop": false
3040
+ },
3041
+ "attributes": {}
3042
+ }
3043
+ },
3044
+ "total_flos": 2.7083082838982e+18,
3045
+ "train_batch_size": 32,
3046
+ "trial_name": null,
3047
+ "trial_params": null
3048
+ }
baby_talk_L16_a50/seed_42/checkpoint-2972/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c060e97b69d99564c146471c3d3ac4d335e3b1968074124f4edc5aebf612e1e3
3
+ size 5368
baby_talk_L16_a50/seed_42/checkpoint-3715/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
baby_talk_L16_a50/seed_42/checkpoint-3715/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "o_proj",
35
+ "q_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "k_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
baby_talk_L16_a50/seed_42/checkpoint-3715/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76034427b07f0dba7df52bb96914473f77c4a53c8e7d021b0affc3de7384c88d
3
+ size 80792096
baby_talk_L16_a50/seed_42/checkpoint-3715/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
baby_talk_L16_a50/seed_42/checkpoint-3715/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
baby_talk_L16_a50/seed_42/checkpoint-3715/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
baby_talk_L16_a50/seed_42/checkpoint-3715/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
baby_talk_L16_a50/seed_42/checkpoint-3715/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c060e97b69d99564c146471c3d3ac4d335e3b1968074124f4edc5aebf612e1e3
3
+ size 5368
baby_talk_L16_a50/seed_42/checkpoint-4458/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
baby_talk_L16_a50/seed_42/checkpoint-4458/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "o_proj",
35
+ "q_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "k_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
baby_talk_L16_a50/seed_42/checkpoint-4458/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a8b6fd518dab5e24f6b28262e81c749857962f0080cb5998e7b9aa57c5eb01a
3
+ size 80792096
baby_talk_L16_a50/seed_42/checkpoint-4458/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
baby_talk_L16_a50/seed_42/checkpoint-4458/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
baby_talk_L16_a50/seed_42/checkpoint-4458/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 131072,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
baby_talk_L16_a50/seed_42/checkpoint-4458/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
baby_talk_L16_a50/seed_42/checkpoint-4458/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c060e97b69d99564c146471c3d3ac4d335e3b1968074124f4edc5aebf612e1e3
3
+ size 5368
baby_talk_L16_a50/seed_42/checkpoint-5201/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
baby_talk_L16_a50/seed_42/checkpoint-5201/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "o_proj",
35
+ "q_proj",
36
+ "up_proj",
37
+ "down_proj",
38
+ "k_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
baby_talk_L16_a50/seed_42/checkpoint-5201/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88d555221f6e593a18115c5e50a575aea4bc5817d56745baecbe8d07df0e6cea
3
+ size 80792096
baby_talk_L16_a50/seed_42/checkpoint-5201/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}