codingmonster1234 commited on
Commit
dc23183
·
verified ·
1 Parent(s): b7e443d

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- for message in messages %}
18
+ {%- if message.content is string %}
19
+ {%- set content = message.content %}
20
+ {%- else %}
21
+ {%- set content = '' %}
22
+ {%- endif %}
23
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
24
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
25
+ {%- elif message.role == "assistant" %}
26
+ {{- '<|im_start|>' + message.role + '\n' + content }}
27
+ {%- if message.tool_calls %}
28
+ {%- for tool_call in message.tool_calls %}
29
+ {%- if (loop.first and content) or (not loop.first) %}
30
+ {{- '\n' }}
31
+ {%- endif %}
32
+ {%- if tool_call.function %}
33
+ {%- set tool_call = tool_call.function %}
34
+ {%- endif %}
35
+ {{- '<tool_call>\n{"name": "' }}
36
+ {{- tool_call.name }}
37
+ {{- '", "arguments": ' }}
38
+ {%- if tool_call.arguments is string %}
39
+ {{- tool_call.arguments }}
40
+ {%- else %}
41
+ {{- tool_call.arguments | tojson }}
42
+ {%- endif %}
43
+ {{- '}\n</tool_call>' }}
44
+ {%- endfor %}
45
+ {%- endif %}
46
+ {{- '<|im_end|>\n' }}
47
+ {%- elif message.role == "tool" %}
48
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
49
+ {{- '<|im_start|>user' }}
50
+ {%- endif %}
51
+ {{- '\n<tool_response>\n' }}
52
+ {{- content }}
53
+ {{- '\n</tool_response>' }}
54
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
55
+ {{- '<|im_end|>\n' }}
56
+ {%- endif %}
57
+ {%- endif %}
58
+ {%- endfor %}
59
+ {%- if add_generation_prompt %}
60
+ {{- '<|im_start|>assistant\n' }}
61
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2560,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 9728,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention"
52
+ ],
53
+ "max_position_embeddings": 262144,
54
+ "max_window_layers": 36,
55
+ "model_type": "qwen3",
56
+ "num_attention_heads": 32,
57
+ "num_hidden_layers": 36,
58
+ "num_key_value_heads": 8,
59
+ "pad_token_id": 151643,
60
+ "rms_norm_eps": 1e-06,
61
+ "rope_parameters": {
62
+ "rope_theta": 5000000,
63
+ "rope_type": "default"
64
+ },
65
+ "sliding_window": null,
66
+ "tie_word_embeddings": true,
67
+ "transformers_version": "5.4.0",
68
+ "use_cache": false,
69
+ "use_sliding_window": false,
70
+ "vocab_size": 151936
71
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "temperature": 0.7,
9
+ "top_k": 20,
10
+ "top_p": 0.8,
11
+ "transformers_version": "5.4.0"
12
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db62c16fc7e72b520780f950405f0befffc05eb21dea3b0429c54aaa0fac5f63
3
+ size 8044982080
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:188874149b517f101f308cac71f71874db6f1c5e40d45d31f15d3a3ad0c7a240
3
+ size 16090225449
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:718a0f3db00824213036a2c0441849791319b7d9cf189065873bb26a7020738e
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc24cc580d6b93c8a95bc57b42299118656280a2fcbfb6854e41b57414837d71
3
+ size 1465
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "<tool_call>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "151658": {
125
+ "content": "</tool_call>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "151659": {
133
+ "content": "<|fim_prefix|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "151660": {
141
+ "content": "<|fim_middle|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "151661": {
149
+ "content": "<|fim_suffix|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "151662": {
157
+ "content": "<|fim_pad|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "151663": {
165
+ "content": "<|repo_name|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": false
171
+ },
172
+ "151664": {
173
+ "content": "<|file_sep|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": false
179
+ },
180
+ "151665": {
181
+ "content": "<tool_response>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": false
187
+ },
188
+ "151666": {
189
+ "content": "</tool_response>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": false
195
+ },
196
+ "151667": {
197
+ "content": "<think>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": false
203
+ },
204
+ "151668": {
205
+ "content": "</think>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": false
211
+ }
212
+ },
213
+ "additional_special_tokens": [
214
+ "<|im_start|>",
215
+ "<|im_end|>",
216
+ "<|object_ref_start|>",
217
+ "<|object_ref_end|>",
218
+ "<|box_start|>",
219
+ "<|box_end|>",
220
+ "<|quad_start|>",
221
+ "<|quad_end|>",
222
+ "<|vision_start|>",
223
+ "<|vision_end|>",
224
+ "<|vision_pad|>",
225
+ "<|image_pad|>",
226
+ "<|video_pad|>"
227
+ ],
228
+ "bos_token": null,
229
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}",
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "model_max_length": 1010000,
234
+ "pad_token": "<|endoftext|>",
235
+ "split_special_tokens": false,
236
+ "tokenizer_class": "Qwen2Tokenizer",
237
+ "unk_token": null,
238
+ "add_bos_token": false
239
+ }
trainer_state.json ADDED
@@ -0,0 +1,1738 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 168,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.0061692222952843,
14
+ "epoch": 0.011904761904761904,
15
+ "grad_norm": 3.390625,
16
+ "learning_rate": 2e-05,
17
+ "loss": 2.293125867843628,
18
+ "mean_token_accuracy": 0.5738132819533348,
19
+ "num_tokens": 29832.0,
20
+ "step": 1
21
+ },
22
+ {
23
+ "entropy": 1.385195016860962,
24
+ "epoch": 0.023809523809523808,
25
+ "grad_norm": 0.78125,
26
+ "learning_rate": 1.999922292480975e-05,
27
+ "loss": 1.5697591304779053,
28
+ "mean_token_accuracy": 0.6427712365984917,
29
+ "num_tokens": 58835.0,
30
+ "step": 2
31
+ },
32
+ {
33
+ "entropy": 1.5784537345170975,
34
+ "epoch": 0.03571428571428571,
35
+ "grad_norm": 0.62890625,
36
+ "learning_rate": 1.9996891820008165e-05,
37
+ "loss": 1.5061622858047485,
38
+ "mean_token_accuracy": 0.654805600643158,
39
+ "num_tokens": 88089.0,
40
+ "step": 3
41
+ },
42
+ {
43
+ "entropy": 1.5019408017396927,
44
+ "epoch": 0.047619047619047616,
45
+ "grad_norm": 0.43359375,
46
+ "learning_rate": 1.9993007047883988e-05,
47
+ "loss": 1.3531173467636108,
48
+ "mean_token_accuracy": 0.6810621172189713,
49
+ "num_tokens": 116996.0,
50
+ "step": 4
51
+ },
52
+ {
53
+ "entropy": 1.442432388663292,
54
+ "epoch": 0.05952380952380952,
55
+ "grad_norm": 0.369140625,
56
+ "learning_rate": 1.9987569212189224e-05,
57
+ "loss": 1.2870382070541382,
58
+ "mean_token_accuracy": 0.6946646422147751,
59
+ "num_tokens": 146502.0,
60
+ "step": 5
61
+ },
62
+ {
63
+ "entropy": 1.383298322558403,
64
+ "epoch": 0.07142857142857142,
65
+ "grad_norm": 0.318359375,
66
+ "learning_rate": 1.9980579158045322e-05,
67
+ "loss": 1.2606914043426514,
68
+ "mean_token_accuracy": 0.6914810612797737,
69
+ "num_tokens": 175000.0,
70
+ "step": 6
71
+ },
72
+ {
73
+ "entropy": 1.3554321229457855,
74
+ "epoch": 0.08333333333333333,
75
+ "grad_norm": 0.359375,
76
+ "learning_rate": 1.9972037971811802e-05,
77
+ "loss": 1.2325180768966675,
78
+ "mean_token_accuracy": 0.6992553323507309,
79
+ "num_tokens": 203581.0,
80
+ "step": 7
81
+ },
82
+ {
83
+ "entropy": 1.301919937133789,
84
+ "epoch": 0.09523809523809523,
85
+ "grad_norm": 0.3046875,
86
+ "learning_rate": 1.9961946980917457e-05,
87
+ "loss": 1.1691060066223145,
88
+ "mean_token_accuracy": 0.714451938867569,
89
+ "num_tokens": 233225.0,
90
+ "step": 8
91
+ },
92
+ {
93
+ "entropy": 1.3274528235197067,
94
+ "epoch": 0.10714285714285714,
95
+ "grad_norm": 0.296875,
96
+ "learning_rate": 1.9950307753654016e-05,
97
+ "loss": 1.22238290309906,
98
+ "mean_token_accuracy": 0.6991388499736786,
99
+ "num_tokens": 261557.0,
100
+ "step": 9
101
+ },
102
+ {
103
+ "entropy": 1.3020492941141129,
104
+ "epoch": 0.11904761904761904,
105
+ "grad_norm": 0.279296875,
106
+ "learning_rate": 1.9937122098932428e-05,
107
+ "loss": 1.1407413482666016,
108
+ "mean_token_accuracy": 0.7115657702088356,
109
+ "num_tokens": 290843.0,
110
+ "step": 10
111
+ },
112
+ {
113
+ "entropy": 1.2911820262670517,
114
+ "epoch": 0.13095238095238096,
115
+ "grad_norm": 0.263671875,
116
+ "learning_rate": 1.9922392066001724e-05,
117
+ "loss": 1.1007871627807617,
118
+ "mean_token_accuracy": 0.7251745313405991,
119
+ "num_tokens": 320963.0,
120
+ "step": 11
121
+ },
122
+ {
123
+ "entropy": 1.305821493268013,
124
+ "epoch": 0.14285714285714285,
125
+ "grad_norm": 0.2890625,
126
+ "learning_rate": 1.9906119944130527e-05,
127
+ "loss": 1.0885382890701294,
128
+ "mean_token_accuracy": 0.7273061871528625,
129
+ "num_tokens": 350648.0,
130
+ "step": 12
131
+ },
132
+ {
133
+ "entropy": 1.3162220120429993,
134
+ "epoch": 0.15476190476190477,
135
+ "grad_norm": 0.265625,
136
+ "learning_rate": 1.9888308262251286e-05,
137
+ "loss": 1.0963213443756104,
138
+ "mean_token_accuracy": 0.7211973443627357,
139
+ "num_tokens": 380096.0,
140
+ "step": 13
141
+ },
142
+ {
143
+ "entropy": 1.3141592741012573,
144
+ "epoch": 0.16666666666666666,
145
+ "grad_norm": 0.2734375,
146
+ "learning_rate": 1.9868959788567213e-05,
147
+ "loss": 1.0897754430770874,
148
+ "mean_token_accuracy": 0.7258400693535805,
149
+ "num_tokens": 407435.0,
150
+ "step": 14
151
+ },
152
+ {
153
+ "entropy": 1.3073242455720901,
154
+ "epoch": 0.17857142857142858,
155
+ "grad_norm": 0.2578125,
156
+ "learning_rate": 1.9848077530122083e-05,
157
+ "loss": 1.0494160652160645,
158
+ "mean_token_accuracy": 0.7293207123875618,
159
+ "num_tokens": 435734.0,
160
+ "step": 15
161
+ },
162
+ {
163
+ "entropy": 1.3367096036672592,
164
+ "epoch": 0.19047619047619047,
165
+ "grad_norm": 0.255859375,
166
+ "learning_rate": 1.9825664732332886e-05,
167
+ "loss": 1.1211317777633667,
168
+ "mean_token_accuracy": 0.7143202275037766,
169
+ "num_tokens": 464973.0,
170
+ "step": 16
171
+ },
172
+ {
173
+ "entropy": 1.3097643703222275,
174
+ "epoch": 0.20238095238095238,
175
+ "grad_norm": 0.25,
176
+ "learning_rate": 1.9801724878485438e-05,
177
+ "loss": 1.0753662586212158,
178
+ "mean_token_accuracy": 0.7259641215205193,
179
+ "num_tokens": 493135.0,
180
+ "step": 17
181
+ },
182
+ {
183
+ "entropy": 1.2622702419757843,
184
+ "epoch": 0.21428571428571427,
185
+ "grad_norm": 0.232421875,
186
+ "learning_rate": 1.977626168919305e-05,
187
+ "loss": 1.007223129272461,
188
+ "mean_token_accuracy": 0.744126707315445,
189
+ "num_tokens": 522656.0,
190
+ "step": 18
191
+ },
192
+ {
193
+ "entropy": 1.2859665155410767,
194
+ "epoch": 0.2261904761904762,
195
+ "grad_norm": 0.23046875,
196
+ "learning_rate": 1.9749279121818235e-05,
197
+ "loss": 1.0457340478897095,
198
+ "mean_token_accuracy": 0.7328037023544312,
199
+ "num_tokens": 551875.0,
200
+ "step": 19
201
+ },
202
+ {
203
+ "entropy": 1.275212675333023,
204
+ "epoch": 0.23809523809523808,
205
+ "grad_norm": 0.251953125,
206
+ "learning_rate": 1.9720781369857747e-05,
207
+ "loss": 1.0395888090133667,
208
+ "mean_token_accuracy": 0.7307759299874306,
209
+ "num_tokens": 580523.0,
210
+ "step": 20
211
+ },
212
+ {
213
+ "entropy": 1.3000101447105408,
214
+ "epoch": 0.25,
215
+ "grad_norm": 0.2275390625,
216
+ "learning_rate": 1.969077286229078e-05,
217
+ "loss": 1.0626932382583618,
218
+ "mean_token_accuracy": 0.7271415144205093,
219
+ "num_tokens": 609771.0,
220
+ "step": 21
221
+ },
222
+ {
223
+ "entropy": 1.242678239941597,
224
+ "epoch": 0.2619047619047619,
225
+ "grad_norm": 0.2470703125,
226
+ "learning_rate": 1.9659258262890683e-05,
227
+ "loss": 0.9827122092247009,
228
+ "mean_token_accuracy": 0.7448626458644867,
229
+ "num_tokens": 639104.0,
230
+ "step": 22
231
+ },
232
+ {
233
+ "entropy": 1.2583424746990204,
234
+ "epoch": 0.27380952380952384,
235
+ "grad_norm": 0.228515625,
236
+ "learning_rate": 1.962624246950012e-05,
237
+ "loss": 1.0062870979309082,
238
+ "mean_token_accuracy": 0.7375933676958084,
239
+ "num_tokens": 667792.0,
240
+ "step": 23
241
+ },
242
+ {
243
+ "entropy": 1.2531014680862427,
244
+ "epoch": 0.2857142857142857,
245
+ "grad_norm": 0.2294921875,
246
+ "learning_rate": 1.9591730613269878e-05,
247
+ "loss": 1.0229589939117432,
248
+ "mean_token_accuracy": 0.7366377785801888,
249
+ "num_tokens": 696742.0,
250
+ "step": 24
251
+ },
252
+ {
253
+ "entropy": 1.2342166602611542,
254
+ "epoch": 0.2976190476190476,
255
+ "grad_norm": 0.2333984375,
256
+ "learning_rate": 1.955572805786141e-05,
257
+ "loss": 0.9788997769355774,
258
+ "mean_token_accuracy": 0.7421486154198647,
259
+ "num_tokens": 725968.0,
260
+ "step": 25
261
+ },
262
+ {
263
+ "entropy": 1.2210585623979568,
264
+ "epoch": 0.30952380952380953,
265
+ "grad_norm": 0.2578125,
266
+ "learning_rate": 1.9518240398613226e-05,
267
+ "loss": 0.987277090549469,
268
+ "mean_token_accuracy": 0.7420973554253578,
269
+ "num_tokens": 755689.0,
270
+ "step": 26
271
+ },
272
+ {
273
+ "entropy": 1.24309404194355,
274
+ "epoch": 0.32142857142857145,
275
+ "grad_norm": 0.2421875,
276
+ "learning_rate": 1.947927346167132e-05,
277
+ "loss": 1.0301053524017334,
278
+ "mean_token_accuracy": 0.7300752699375153,
279
+ "num_tokens": 784977.0,
280
+ "step": 27
281
+ },
282
+ {
283
+ "entropy": 1.2028213143348694,
284
+ "epoch": 0.3333333333333333,
285
+ "grad_norm": 0.2177734375,
286
+ "learning_rate": 1.9438833303083677e-05,
287
+ "loss": 0.9393562078475952,
288
+ "mean_token_accuracy": 0.7491495907306671,
289
+ "num_tokens": 814048.0,
290
+ "step": 28
291
+ },
292
+ {
293
+ "entropy": 1.2287103980779648,
294
+ "epoch": 0.34523809523809523,
295
+ "grad_norm": 0.228515625,
296
+ "learning_rate": 1.9396926207859085e-05,
297
+ "loss": 1.0168366432189941,
298
+ "mean_token_accuracy": 0.7329602986574173,
299
+ "num_tokens": 843602.0,
300
+ "step": 29
301
+ },
302
+ {
303
+ "entropy": 1.2081626951694489,
304
+ "epoch": 0.35714285714285715,
305
+ "grad_norm": 0.2275390625,
306
+ "learning_rate": 1.935355868899034e-05,
307
+ "loss": 0.958310604095459,
308
+ "mean_token_accuracy": 0.7456908002495766,
309
+ "num_tokens": 871915.0,
310
+ "step": 30
311
+ },
312
+ {
313
+ "entropy": 1.2221457809209824,
314
+ "epoch": 0.36904761904761907,
315
+ "grad_norm": 0.2294921875,
316
+ "learning_rate": 1.9308737486442045e-05,
317
+ "loss": 0.9946644902229309,
318
+ "mean_token_accuracy": 0.7383344992995262,
319
+ "num_tokens": 900851.0,
320
+ "step": 31
321
+ },
322
+ {
323
+ "entropy": 1.1801428943872452,
324
+ "epoch": 0.38095238095238093,
325
+ "grad_norm": 0.21484375,
326
+ "learning_rate": 1.926246956610309e-05,
327
+ "loss": 0.9103766083717346,
328
+ "mean_token_accuracy": 0.7624464929103851,
329
+ "num_tokens": 929498.0,
330
+ "step": 32
331
+ },
332
+ {
333
+ "entropy": 1.2152698189020157,
334
+ "epoch": 0.39285714285714285,
335
+ "grad_norm": 0.2333984375,
336
+ "learning_rate": 1.921476211870408e-05,
337
+ "loss": 0.9737407565116882,
338
+ "mean_token_accuracy": 0.7427262291312218,
339
+ "num_tokens": 958933.0,
340
+ "step": 33
341
+ },
342
+ {
343
+ "entropy": 1.2030568569898605,
344
+ "epoch": 0.40476190476190477,
345
+ "grad_norm": 0.22265625,
346
+ "learning_rate": 1.9165622558699763e-05,
347
+ "loss": 0.9593278169631958,
348
+ "mean_token_accuracy": 0.7506603300571442,
349
+ "num_tokens": 987731.0,
350
+ "step": 34
351
+ },
352
+ {
353
+ "entropy": 1.1957021951675415,
354
+ "epoch": 0.4166666666666667,
355
+ "grad_norm": 0.2158203125,
356
+ "learning_rate": 1.9115058523116734e-05,
357
+ "loss": 0.9239043593406677,
358
+ "mean_token_accuracy": 0.7555749863386154,
359
+ "num_tokens": 1017002.0,
360
+ "step": 35
361
+ },
362
+ {
363
+ "entropy": 1.2133885324001312,
364
+ "epoch": 0.42857142857142855,
365
+ "grad_norm": 0.216796875,
366
+ "learning_rate": 1.9063077870366504e-05,
367
+ "loss": 0.9809866547584534,
368
+ "mean_token_accuracy": 0.7437998279929161,
369
+ "num_tokens": 1046678.0,
370
+ "step": 36
371
+ },
372
+ {
373
+ "entropy": 1.2098581492900848,
374
+ "epoch": 0.44047619047619047,
375
+ "grad_norm": 0.2236328125,
376
+ "learning_rate": 1.900968867902419e-05,
377
+ "loss": 0.938984215259552,
378
+ "mean_token_accuracy": 0.7494841367006302,
379
+ "num_tokens": 1074445.0,
380
+ "step": 37
381
+ },
382
+ {
383
+ "entropy": 1.1815967112779617,
384
+ "epoch": 0.4523809523809524,
385
+ "grad_norm": 0.236328125,
386
+ "learning_rate": 1.895489924657301e-05,
387
+ "loss": 0.8934326767921448,
388
+ "mean_token_accuracy": 0.7595476359128952,
389
+ "num_tokens": 1103620.0,
390
+ "step": 38
391
+ },
392
+ {
393
+ "entropy": 1.2028009444475174,
394
+ "epoch": 0.4642857142857143,
395
+ "grad_norm": 0.2265625,
396
+ "learning_rate": 1.8898718088114688e-05,
397
+ "loss": 0.922984778881073,
398
+ "mean_token_accuracy": 0.7540801167488098,
399
+ "num_tokens": 1132637.0,
400
+ "step": 39
401
+ },
402
+ {
403
+ "entropy": 1.2034422308206558,
404
+ "epoch": 0.47619047619047616,
405
+ "grad_norm": 1.171875,
406
+ "learning_rate": 1.8841153935046098e-05,
407
+ "loss": 0.9033240675926208,
408
+ "mean_token_accuracy": 0.7560576424002647,
409
+ "num_tokens": 1161527.0,
410
+ "step": 40
411
+ },
412
+ {
413
+ "entropy": 1.1716476827859879,
414
+ "epoch": 0.4880952380952381,
415
+ "grad_norm": 0.2138671875,
416
+ "learning_rate": 1.8782215733702286e-05,
417
+ "loss": 0.8880018591880798,
418
+ "mean_token_accuracy": 0.7613470479846001,
419
+ "num_tokens": 1190701.0,
420
+ "step": 41
421
+ },
422
+ {
423
+ "entropy": 1.2157341986894608,
424
+ "epoch": 0.5,
425
+ "grad_norm": 0.2314453125,
426
+ "learning_rate": 1.8721912643966055e-05,
427
+ "loss": 0.9609653949737549,
428
+ "mean_token_accuracy": 0.7453824803233147,
429
+ "num_tokens": 1218835.0,
430
+ "step": 42
431
+ },
432
+ {
433
+ "entropy": 1.197568565607071,
434
+ "epoch": 0.5119047619047619,
435
+ "grad_norm": 0.216796875,
436
+ "learning_rate": 1.866025403784439e-05,
437
+ "loss": 0.9219189882278442,
438
+ "mean_token_accuracy": 0.7547592371702194,
439
+ "num_tokens": 1248679.0,
440
+ "step": 43
441
+ },
442
+ {
443
+ "entropy": 1.1708803623914719,
444
+ "epoch": 0.5238095238095238,
445
+ "grad_norm": 0.373046875,
446
+ "learning_rate": 1.8597249498011906e-05,
447
+ "loss": 0.8802202343940735,
448
+ "mean_token_accuracy": 0.7667126134037971,
449
+ "num_tokens": 1277106.0,
450
+ "step": 44
451
+ },
452
+ {
453
+ "entropy": 1.191767856478691,
454
+ "epoch": 0.5357142857142857,
455
+ "grad_norm": 0.2197265625,
456
+ "learning_rate": 1.8532908816321557e-05,
457
+ "loss": 0.9313769936561584,
458
+ "mean_token_accuracy": 0.7529165670275688,
459
+ "num_tokens": 1305983.0,
460
+ "step": 45
461
+ },
462
+ {
463
+ "entropy": 1.2066084146499634,
464
+ "epoch": 0.5476190476190477,
465
+ "grad_norm": 0.2255859375,
466
+ "learning_rate": 1.8467241992282842e-05,
467
+ "loss": 0.9347527027130127,
468
+ "mean_token_accuracy": 0.7446473762392998,
469
+ "num_tokens": 1334578.0,
470
+ "step": 46
471
+ },
472
+ {
473
+ "entropy": 1.177584484219551,
474
+ "epoch": 0.5595238095238095,
475
+ "grad_norm": 0.25,
476
+ "learning_rate": 1.8400259231507716e-05,
477
+ "loss": 0.8884726166725159,
478
+ "mean_token_accuracy": 0.7611024901270866,
479
+ "num_tokens": 1362873.0,
480
+ "step": 47
481
+ },
482
+ {
483
+ "entropy": 1.1629594564437866,
484
+ "epoch": 0.5714285714285714,
485
+ "grad_norm": 0.2265625,
486
+ "learning_rate": 1.833197094412449e-05,
487
+ "loss": 0.8861435651779175,
488
+ "mean_token_accuracy": 0.76307063549757,
489
+ "num_tokens": 1391315.0,
490
+ "step": 48
491
+ },
492
+ {
493
+ "entropy": 1.168922871351242,
494
+ "epoch": 0.5833333333333334,
495
+ "grad_norm": 0.23046875,
496
+ "learning_rate": 1.826238774315995e-05,
497
+ "loss": 0.8765286207199097,
498
+ "mean_token_accuracy": 0.76119015365839,
499
+ "num_tokens": 1419829.0,
500
+ "step": 49
501
+ },
502
+ {
503
+ "entropy": 1.1843004375696182,
504
+ "epoch": 0.5952380952380952,
505
+ "grad_norm": 0.234375,
506
+ "learning_rate": 1.819152044288992e-05,
507
+ "loss": 0.9242440462112427,
508
+ "mean_token_accuracy": 0.7494527697563171,
509
+ "num_tokens": 1447790.0,
510
+ "step": 50
511
+ },
512
+ {
513
+ "entropy": 1.1673331260681152,
514
+ "epoch": 0.6071428571428571,
515
+ "grad_norm": 0.2490234375,
516
+ "learning_rate": 1.811938005715857e-05,
517
+ "loss": 0.8822228312492371,
518
+ "mean_token_accuracy": 0.7585421577095985,
519
+ "num_tokens": 1476278.0,
520
+ "step": 51
521
+ },
522
+ {
523
+ "entropy": 1.2116869688034058,
524
+ "epoch": 0.6190476190476191,
525
+ "grad_norm": 0.2421875,
526
+ "learning_rate": 1.8045977797666685e-05,
527
+ "loss": 0.9784308671951294,
528
+ "mean_token_accuracy": 0.7404012456536293,
529
+ "num_tokens": 1503947.0,
530
+ "step": 52
531
+ },
532
+ {
533
+ "entropy": 1.162365809082985,
534
+ "epoch": 0.6309523809523809,
535
+ "grad_norm": 0.2265625,
536
+ "learning_rate": 1.7971325072229227e-05,
537
+ "loss": 0.9283543825149536,
538
+ "mean_token_accuracy": 0.7499738857150078,
539
+ "num_tokens": 1533531.0,
540
+ "step": 53
541
+ },
542
+ {
543
+ "entropy": 1.1863622218370438,
544
+ "epoch": 0.6428571428571429,
545
+ "grad_norm": 0.2421875,
546
+ "learning_rate": 1.7895433483002356e-05,
547
+ "loss": 0.9471738934516907,
548
+ "mean_token_accuracy": 0.7532860413193703,
549
+ "num_tokens": 1561412.0,
550
+ "step": 54
551
+ },
552
+ {
553
+ "entropy": 1.1698070168495178,
554
+ "epoch": 0.6547619047619048,
555
+ "grad_norm": 0.2255859375,
556
+ "learning_rate": 1.78183148246803e-05,
557
+ "loss": 0.9019606709480286,
558
+ "mean_token_accuracy": 0.7543124184012413,
559
+ "num_tokens": 1590336.0,
560
+ "step": 55
561
+ },
562
+ {
563
+ "entropy": 1.1683688312768936,
564
+ "epoch": 0.6666666666666666,
565
+ "grad_norm": 0.208984375,
566
+ "learning_rate": 1.7739981082662275e-05,
567
+ "loss": 0.9020405411720276,
568
+ "mean_token_accuracy": 0.7580606490373611,
569
+ "num_tokens": 1620442.0,
570
+ "step": 56
571
+ },
572
+ {
573
+ "entropy": 1.1867523938417435,
574
+ "epoch": 0.6785714285714286,
575
+ "grad_norm": 0.216796875,
576
+ "learning_rate": 1.766044443118978e-05,
577
+ "loss": 0.917300283908844,
578
+ "mean_token_accuracy": 0.7553394213318825,
579
+ "num_tokens": 1648762.0,
580
+ "step": 57
581
+ },
582
+ {
583
+ "entropy": 1.1505564451217651,
584
+ "epoch": 0.6904761904761905,
585
+ "grad_norm": 0.2216796875,
586
+ "learning_rate": 1.757971723145453e-05,
587
+ "loss": 0.8627029061317444,
588
+ "mean_token_accuracy": 0.7657916098833084,
589
+ "num_tokens": 1677464.0,
590
+ "step": 58
591
+ },
592
+ {
593
+ "entropy": 1.1766629666090012,
594
+ "epoch": 0.7023809523809523,
595
+ "grad_norm": 0.2236328125,
596
+ "learning_rate": 1.7497812029677344e-05,
597
+ "loss": 0.8795939087867737,
598
+ "mean_token_accuracy": 0.7613174989819527,
599
+ "num_tokens": 1704994.0,
600
+ "step": 59
601
+ },
602
+ {
603
+ "entropy": 1.1731744706630707,
604
+ "epoch": 0.7142857142857143,
605
+ "grad_norm": 0.2158203125,
606
+ "learning_rate": 1.741474155515827e-05,
607
+ "loss": 0.8988810777664185,
608
+ "mean_token_accuracy": 0.7579676881432533,
609
+ "num_tokens": 1734202.0,
610
+ "step": 60
611
+ },
612
+ {
613
+ "entropy": 1.1697156727313995,
614
+ "epoch": 0.7261904761904762,
615
+ "grad_norm": 0.2255859375,
616
+ "learning_rate": 1.7330518718298263e-05,
617
+ "loss": 0.9070097804069519,
618
+ "mean_token_accuracy": 0.7564781159162521,
619
+ "num_tokens": 1763541.0,
620
+ "step": 61
621
+ },
622
+ {
623
+ "entropy": 1.1686383485794067,
624
+ "epoch": 0.7380952380952381,
625
+ "grad_norm": 0.2177734375,
626
+ "learning_rate": 1.7245156608592727e-05,
627
+ "loss": 0.8804867267608643,
628
+ "mean_token_accuracy": 0.7639917582273483,
629
+ "num_tokens": 1793196.0,
630
+ "step": 62
631
+ },
632
+ {
633
+ "entropy": 1.195967510342598,
634
+ "epoch": 0.75,
635
+ "grad_norm": 0.25390625,
636
+ "learning_rate": 1.7158668492597186e-05,
637
+ "loss": 0.9389015436172485,
638
+ "mean_token_accuracy": 0.747251845896244,
639
+ "num_tokens": 1821023.0,
640
+ "step": 63
641
+ },
642
+ {
643
+ "entropy": 1.1664810329675674,
644
+ "epoch": 0.7619047619047619,
645
+ "grad_norm": 0.21875,
646
+ "learning_rate": 1.7071067811865477e-05,
647
+ "loss": 0.9056146740913391,
648
+ "mean_token_accuracy": 0.7550350353121758,
649
+ "num_tokens": 1849586.0,
650
+ "step": 64
651
+ },
652
+ {
653
+ "entropy": 1.171183928847313,
654
+ "epoch": 0.7738095238095238,
655
+ "grad_norm": 0.2177734375,
656
+ "learning_rate": 1.698236818086073e-05,
657
+ "loss": 0.929341197013855,
658
+ "mean_token_accuracy": 0.7491638883948326,
659
+ "num_tokens": 1878622.0,
660
+ "step": 65
661
+ },
662
+ {
663
+ "entropy": 1.1465008854866028,
664
+ "epoch": 0.7857142857142857,
665
+ "grad_norm": 0.2177734375,
666
+ "learning_rate": 1.689258338483947e-05,
667
+ "loss": 0.8692110776901245,
668
+ "mean_token_accuracy": 0.765314869582653,
669
+ "num_tokens": 1907725.0,
670
+ "step": 66
671
+ },
672
+ {
673
+ "entropy": 1.1706128865480423,
674
+ "epoch": 0.7976190476190477,
675
+ "grad_norm": 0.216796875,
676
+ "learning_rate": 1.6801727377709195e-05,
677
+ "loss": 0.886278510093689,
678
+ "mean_token_accuracy": 0.7576193287968636,
679
+ "num_tokens": 1936209.0,
680
+ "step": 67
681
+ },
682
+ {
683
+ "entropy": 1.1479064524173737,
684
+ "epoch": 0.8095238095238095,
685
+ "grad_norm": 0.2216796875,
686
+ "learning_rate": 1.67098142798597e-05,
687
+ "loss": 0.8587610125541687,
688
+ "mean_token_accuracy": 0.7682890966534615,
689
+ "num_tokens": 1964915.0,
690
+ "step": 68
691
+ },
692
+ {
693
+ "entropy": 1.1495172083377838,
694
+ "epoch": 0.8214285714285714,
695
+ "grad_norm": 0.2294921875,
696
+ "learning_rate": 1.6616858375968596e-05,
697
+ "loss": 0.8885282874107361,
698
+ "mean_token_accuracy": 0.7598370909690857,
699
+ "num_tokens": 1993606.0,
700
+ "step": 69
701
+ },
702
+ {
703
+ "entropy": 1.1534761786460876,
704
+ "epoch": 0.8333333333333334,
705
+ "grad_norm": 0.2138671875,
706
+ "learning_rate": 1.6522874112781213e-05,
707
+ "loss": 0.8863916993141174,
708
+ "mean_token_accuracy": 0.7640347108244896,
709
+ "num_tokens": 2022472.0,
710
+ "step": 70
711
+ },
712
+ {
713
+ "entropy": 1.14171202480793,
714
+ "epoch": 0.8452380952380952,
715
+ "grad_norm": 0.220703125,
716
+ "learning_rate": 1.6427876096865394e-05,
717
+ "loss": 0.8785849809646606,
718
+ "mean_token_accuracy": 0.7604316994547844,
719
+ "num_tokens": 2052746.0,
720
+ "step": 71
721
+ },
722
+ {
723
+ "entropy": 1.1478676050901413,
724
+ "epoch": 0.8571428571428571,
725
+ "grad_norm": 0.212890625,
726
+ "learning_rate": 1.6331879092341402e-05,
727
+ "loss": 0.8796285390853882,
728
+ "mean_token_accuracy": 0.7586944848299026,
729
+ "num_tokens": 2081889.0,
730
+ "step": 72
731
+ },
732
+ {
733
+ "entropy": 1.1222540885210037,
734
+ "epoch": 0.8690476190476191,
735
+ "grad_norm": 0.193359375,
736
+ "learning_rate": 1.6234898018587336e-05,
737
+ "loss": 0.8146858811378479,
738
+ "mean_token_accuracy": 0.7756616845726967,
739
+ "num_tokens": 2111616.0,
740
+ "step": 73
741
+ },
742
+ {
743
+ "entropy": 1.153001144528389,
744
+ "epoch": 0.8809523809523809,
745
+ "grad_norm": 0.224609375,
746
+ "learning_rate": 1.6136947947920477e-05,
747
+ "loss": 0.8884707689285278,
748
+ "mean_token_accuracy": 0.7565625682473183,
749
+ "num_tokens": 2140433.0,
750
+ "step": 74
751
+ },
752
+ {
753
+ "entropy": 1.1275182217359543,
754
+ "epoch": 0.8928571428571429,
755
+ "grad_norm": 0.2099609375,
756
+ "learning_rate": 1.6038044103254775e-05,
757
+ "loss": 0.8272450566291809,
758
+ "mean_token_accuracy": 0.7704622000455856,
759
+ "num_tokens": 2170414.0,
760
+ "step": 75
761
+ },
762
+ {
763
+ "entropy": 1.1576026529073715,
764
+ "epoch": 0.9047619047619048,
765
+ "grad_norm": 0.22265625,
766
+ "learning_rate": 1.5938201855735017e-05,
767
+ "loss": 0.9035623669624329,
768
+ "mean_token_accuracy": 0.7542874589562416,
769
+ "num_tokens": 2198868.0,
770
+ "step": 76
771
+ },
772
+ {
773
+ "entropy": 1.1199318170547485,
774
+ "epoch": 0.9166666666666666,
775
+ "grad_norm": 0.20703125,
776
+ "learning_rate": 1.5837436722347902e-05,
777
+ "loss": 0.8039325475692749,
778
+ "mean_token_accuracy": 0.783287987112999,
779
+ "num_tokens": 2228134.0,
780
+ "step": 77
781
+ },
782
+ {
783
+ "entropy": 1.1484037339687347,
784
+ "epoch": 0.9285714285714286,
785
+ "grad_norm": 0.2138671875,
786
+ "learning_rate": 1.573576436351046e-05,
787
+ "loss": 0.8699290752410889,
788
+ "mean_token_accuracy": 0.7641323357820511,
789
+ "num_tokens": 2257447.0,
790
+ "step": 78
791
+ },
792
+ {
793
+ "entropy": 1.1295416802167892,
794
+ "epoch": 0.9404761904761905,
795
+ "grad_norm": 0.205078125,
796
+ "learning_rate": 1.563320058063622e-05,
797
+ "loss": 0.8303874731063843,
798
+ "mean_token_accuracy": 0.7720286920666695,
799
+ "num_tokens": 2286749.0,
800
+ "step": 79
801
+ },
802
+ {
803
+ "entropy": 1.1563286185264587,
804
+ "epoch": 0.9523809523809523,
805
+ "grad_norm": 0.21875,
806
+ "learning_rate": 1.5529761313679396e-05,
807
+ "loss": 0.8524646759033203,
808
+ "mean_token_accuracy": 0.7633371129631996,
809
+ "num_tokens": 2315039.0,
810
+ "step": 80
811
+ },
812
+ {
813
+ "entropy": 1.1543449014425278,
814
+ "epoch": 0.9642857142857143,
815
+ "grad_norm": 0.2158203125,
816
+ "learning_rate": 1.5425462638657597e-05,
817
+ "loss": 0.9120794534683228,
818
+ "mean_token_accuracy": 0.756316527724266,
819
+ "num_tokens": 2344737.0,
820
+ "step": 81
821
+ },
822
+ {
823
+ "entropy": 1.13828843832016,
824
+ "epoch": 0.9761904761904762,
825
+ "grad_norm": 0.2265625,
826
+ "learning_rate": 1.5320320765153367e-05,
827
+ "loss": 0.824118971824646,
828
+ "mean_token_accuracy": 0.7736462280154228,
829
+ "num_tokens": 2373710.0,
830
+ "step": 82
831
+ },
832
+ {
833
+ "entropy": 1.145560473203659,
834
+ "epoch": 0.9880952380952381,
835
+ "grad_norm": 0.220703125,
836
+ "learning_rate": 1.5214352033794981e-05,
837
+ "loss": 0.8729808926582336,
838
+ "mean_token_accuracy": 0.7629412487149239,
839
+ "num_tokens": 2402610.0,
840
+ "step": 83
841
+ },
842
+ {
843
+ "entropy": 1.1476428806781769,
844
+ "epoch": 1.0,
845
+ "grad_norm": 0.22265625,
846
+ "learning_rate": 1.5107572913716859e-05,
847
+ "loss": 0.8972144722938538,
848
+ "mean_token_accuracy": 0.757901057600975,
849
+ "num_tokens": 2430019.0,
850
+ "step": 84
851
+ },
852
+ {
853
+ "epoch": 1.0,
854
+ "eval_entropy": 1.1429666471481323,
855
+ "eval_loss": 0.8658801317214966,
856
+ "eval_mean_token_accuracy": 0.7630383356412251,
857
+ "eval_model_preparation_time": 0.0051,
858
+ "eval_num_tokens": 2430019.0,
859
+ "eval_runtime": 19.169,
860
+ "eval_samples_per_second": 7.825,
861
+ "eval_steps_per_second": 7.825,
862
+ "step": 84
863
+ },
864
+ {
865
+ "entropy": 1.1193113178014755,
866
+ "epoch": 1.0119047619047619,
867
+ "grad_norm": 0.205078125,
868
+ "learning_rate": 1.5000000000000002e-05,
869
+ "loss": 0.8069751858711243,
870
+ "mean_token_accuracy": 0.7752386555075645,
871
+ "num_tokens": 2459653.0,
872
+ "step": 85
873
+ },
874
+ {
875
+ "entropy": 1.14054836332798,
876
+ "epoch": 1.0238095238095237,
877
+ "grad_norm": 0.2109375,
878
+ "learning_rate": 1.4891650011092896e-05,
879
+ "loss": 0.8288445472717285,
880
+ "mean_token_accuracy": 0.7729767188429832,
881
+ "num_tokens": 2488217.0,
882
+ "step": 86
883
+ },
884
+ {
885
+ "entropy": 1.1414664089679718,
886
+ "epoch": 1.0357142857142858,
887
+ "grad_norm": 0.2158203125,
888
+ "learning_rate": 1.4782539786213184e-05,
889
+ "loss": 0.8254880905151367,
890
+ "mean_token_accuracy": 0.7727913111448288,
891
+ "num_tokens": 2517578.0,
892
+ "step": 87
893
+ },
894
+ {
895
+ "entropy": 1.1179616451263428,
896
+ "epoch": 1.0476190476190477,
897
+ "grad_norm": 0.205078125,
898
+ "learning_rate": 1.4672686282730622e-05,
899
+ "loss": 0.8098872303962708,
900
+ "mean_token_accuracy": 0.7769448384642601,
901
+ "num_tokens": 2546116.0,
902
+ "step": 88
903
+ },
904
+ {
905
+ "entropy": 1.1239117681980133,
906
+ "epoch": 1.0595238095238095,
907
+ "grad_norm": 0.318359375,
908
+ "learning_rate": 1.4562106573531632e-05,
909
+ "loss": 0.8263017535209656,
910
+ "mean_token_accuracy": 0.7758133932948112,
911
+ "num_tokens": 2574681.0,
912
+ "step": 89
913
+ },
914
+ {
915
+ "entropy": 1.1026111543178558,
916
+ "epoch": 1.0714285714285714,
917
+ "grad_norm": 0.2080078125,
918
+ "learning_rate": 1.4450817844365924e-05,
919
+ "loss": 0.8099116086959839,
920
+ "mean_token_accuracy": 0.7731629684567451,
921
+ "num_tokens": 2603807.0,
922
+ "step": 90
923
+ },
924
+ {
925
+ "entropy": 1.1024491339921951,
926
+ "epoch": 1.0833333333333333,
927
+ "grad_norm": 0.2158203125,
928
+ "learning_rate": 1.4338837391175582e-05,
929
+ "loss": 0.8093633055686951,
930
+ "mean_token_accuracy": 0.7739714533090591,
931
+ "num_tokens": 2632614.0,
932
+ "step": 91
933
+ },
934
+ {
935
+ "entropy": 1.1085499972105026,
936
+ "epoch": 1.0952380952380953,
937
+ "grad_norm": 0.216796875,
938
+ "learning_rate": 1.4226182617406996e-05,
939
+ "loss": 0.8473532199859619,
940
+ "mean_token_accuracy": 0.7683232203125954,
941
+ "num_tokens": 2661538.0,
942
+ "step": 92
943
+ },
944
+ {
945
+ "entropy": 1.0892803370952606,
946
+ "epoch": 1.1071428571428572,
947
+ "grad_norm": 0.220703125,
948
+ "learning_rate": 1.4112871031306118e-05,
949
+ "loss": 0.8294469118118286,
950
+ "mean_token_accuracy": 0.7713945508003235,
951
+ "num_tokens": 2690777.0,
952
+ "step": 93
953
+ },
954
+ {
955
+ "entropy": 1.1031535863876343,
956
+ "epoch": 1.119047619047619,
957
+ "grad_norm": 0.224609375,
958
+ "learning_rate": 1.3998920243197408e-05,
959
+ "loss": 0.8391809463500977,
960
+ "mean_token_accuracy": 0.7676805257797241,
961
+ "num_tokens": 2719730.0,
962
+ "step": 94
963
+ },
964
+ {
965
+ "entropy": 1.0815589874982834,
966
+ "epoch": 1.130952380952381,
967
+ "grad_norm": 0.21484375,
968
+ "learning_rate": 1.3884347962746949e-05,
969
+ "loss": 0.7862935066223145,
970
+ "mean_token_accuracy": 0.7806214541196823,
971
+ "num_tokens": 2749156.0,
972
+ "step": 95
973
+ },
974
+ {
975
+ "entropy": 1.084671527147293,
976
+ "epoch": 1.1428571428571428,
977
+ "grad_norm": 0.2197265625,
978
+ "learning_rate": 1.3769171996210053e-05,
979
+ "loss": 0.840523898601532,
980
+ "mean_token_accuracy": 0.7695459797978401,
981
+ "num_tokens": 2778531.0,
982
+ "step": 96
983
+ },
984
+ {
985
+ "entropy": 1.0894652903079987,
986
+ "epoch": 1.1547619047619047,
987
+ "grad_norm": 0.216796875,
988
+ "learning_rate": 1.3653410243663953e-05,
989
+ "loss": 0.7974240779876709,
990
+ "mean_token_accuracy": 0.7744667157530785,
991
+ "num_tokens": 2806462.0,
992
+ "step": 97
993
+ },
994
+ {
995
+ "entropy": 1.0971969813108444,
996
+ "epoch": 1.1666666666666667,
997
+ "grad_norm": 0.22265625,
998
+ "learning_rate": 1.3537080696225815e-05,
999
+ "loss": 0.8246796131134033,
1000
+ "mean_token_accuracy": 0.7684177905321121,
1001
+ "num_tokens": 2835497.0,
1002
+ "step": 98
1003
+ },
1004
+ {
1005
+ "entropy": 1.1123791635036469,
1006
+ "epoch": 1.1785714285714286,
1007
+ "grad_norm": 0.2373046875,
1008
+ "learning_rate": 1.342020143325669e-05,
1009
+ "loss": 0.8859103322029114,
1010
+ "mean_token_accuracy": 0.7534352988004684,
1011
+ "num_tokens": 2865231.0,
1012
+ "step": 99
1013
+ },
1014
+ {
1015
+ "entropy": 1.075607344508171,
1016
+ "epoch": 1.1904761904761905,
1017
+ "grad_norm": 0.2265625,
1018
+ "learning_rate": 1.3302790619551673e-05,
1019
+ "loss": 0.7980949878692627,
1020
+ "mean_token_accuracy": 0.7762870118021965,
1021
+ "num_tokens": 2894329.0,
1022
+ "step": 100
1023
+ },
1024
+ {
1025
+ "entropy": 1.1072215735912323,
1026
+ "epoch": 1.2023809523809523,
1027
+ "grad_norm": 0.2353515625,
1028
+ "learning_rate": 1.3184866502516846e-05,
1029
+ "loss": 0.8650733232498169,
1030
+ "mean_token_accuracy": 0.764843761920929,
1031
+ "num_tokens": 2923660.0,
1032
+ "step": 101
1033
+ },
1034
+ {
1035
+ "entropy": 1.0887151509523392,
1036
+ "epoch": 1.2142857142857142,
1037
+ "grad_norm": 0.2255859375,
1038
+ "learning_rate": 1.3066447409333345e-05,
1039
+ "loss": 0.790311336517334,
1040
+ "mean_token_accuracy": 0.7792445793747902,
1041
+ "num_tokens": 2952054.0,
1042
+ "step": 102
1043
+ },
1044
+ {
1045
+ "entropy": 1.1025346666574478,
1046
+ "epoch": 1.2261904761904763,
1047
+ "grad_norm": 0.2392578125,
1048
+ "learning_rate": 1.2947551744109044e-05,
1049
+ "loss": 0.8180376887321472,
1050
+ "mean_token_accuracy": 0.7729773372411728,
1051
+ "num_tokens": 2981426.0,
1052
+ "step": 103
1053
+ },
1054
+ {
1055
+ "entropy": 1.0916212499141693,
1056
+ "epoch": 1.2380952380952381,
1057
+ "grad_norm": 0.2373046875,
1058
+ "learning_rate": 1.2828197985018276e-05,
1059
+ "loss": 0.7971659898757935,
1060
+ "mean_token_accuracy": 0.7799450904130936,
1061
+ "num_tokens": 3009579.0,
1062
+ "step": 104
1063
+ },
1064
+ {
1065
+ "entropy": 1.1104163080453873,
1066
+ "epoch": 1.25,
1067
+ "grad_norm": 0.2333984375,
1068
+ "learning_rate": 1.2708404681430054e-05,
1069
+ "loss": 0.8455361127853394,
1070
+ "mean_token_accuracy": 0.7681760489940643,
1071
+ "num_tokens": 3038292.0,
1072
+ "step": 105
1073
+ },
1074
+ {
1075
+ "entropy": 1.1180581152439117,
1076
+ "epoch": 1.2619047619047619,
1077
+ "grad_norm": 0.2431640625,
1078
+ "learning_rate": 1.2588190451025209e-05,
1079
+ "loss": 0.8946309685707092,
1080
+ "mean_token_accuracy": 0.755538322031498,
1081
+ "num_tokens": 3068231.0,
1082
+ "step": 106
1083
+ },
1084
+ {
1085
+ "entropy": 1.0994994044303894,
1086
+ "epoch": 1.2738095238095237,
1087
+ "grad_norm": 0.265625,
1088
+ "learning_rate": 1.2467573976902936e-05,
1089
+ "loss": 0.7855837345123291,
1090
+ "mean_token_accuracy": 0.7798345908522606,
1091
+ "num_tokens": 3096640.0,
1092
+ "step": 107
1093
+ },
1094
+ {
1095
+ "entropy": 1.0958448350429535,
1096
+ "epoch": 1.2857142857142856,
1097
+ "grad_norm": 0.22265625,
1098
+ "learning_rate": 1.2346574004677154e-05,
1099
+ "loss": 0.8080664277076721,
1100
+ "mean_token_accuracy": 0.775592751801014,
1101
+ "num_tokens": 3125619.0,
1102
+ "step": 108
1103
+ },
1104
+ {
1105
+ "entropy": 1.1057351678609848,
1106
+ "epoch": 1.2976190476190477,
1107
+ "grad_norm": 0.375,
1108
+ "learning_rate": 1.2225209339563144e-05,
1109
+ "loss": 0.8222600817680359,
1110
+ "mean_token_accuracy": 0.7683183401823044,
1111
+ "num_tokens": 3155256.0,
1112
+ "step": 109
1113
+ },
1114
+ {
1115
+ "entropy": 1.1132191121578217,
1116
+ "epoch": 1.3095238095238095,
1117
+ "grad_norm": 0.2197265625,
1118
+ "learning_rate": 1.210349884345496e-05,
1119
+ "loss": 0.8248376250267029,
1120
+ "mean_token_accuracy": 0.7687205746769905,
1121
+ "num_tokens": 3183948.0,
1122
+ "step": 110
1123
+ },
1124
+ {
1125
+ "entropy": 1.0987165123224258,
1126
+ "epoch": 1.3214285714285714,
1127
+ "grad_norm": 0.31640625,
1128
+ "learning_rate": 1.1981461431993978e-05,
1129
+ "loss": 0.8191619515419006,
1130
+ "mean_token_accuracy": 0.772399052977562,
1131
+ "num_tokens": 3212463.0,
1132
+ "step": 111
1133
+ },
1134
+ {
1135
+ "entropy": 1.1073571592569351,
1136
+ "epoch": 1.3333333333333333,
1137
+ "grad_norm": 0.232421875,
1138
+ "learning_rate": 1.1859116071629148e-05,
1139
+ "loss": 0.8318334221839905,
1140
+ "mean_token_accuracy": 0.7649757117033005,
1141
+ "num_tokens": 3241487.0,
1142
+ "step": 112
1143
+ },
1144
+ {
1145
+ "entropy": 1.102282091975212,
1146
+ "epoch": 1.3452380952380953,
1147
+ "grad_norm": 0.2197265625,
1148
+ "learning_rate": 1.1736481776669307e-05,
1149
+ "loss": 0.8375995755195618,
1150
+ "mean_token_accuracy": 0.7682436108589172,
1151
+ "num_tokens": 3270436.0,
1152
+ "step": 113
1153
+ },
1154
+ {
1155
+ "entropy": 1.0837299078702927,
1156
+ "epoch": 1.3571428571428572,
1157
+ "grad_norm": 0.2158203125,
1158
+ "learning_rate": 1.1613577606328068e-05,
1159
+ "loss": 0.7833430767059326,
1160
+ "mean_token_accuracy": 0.7823601812124252,
1161
+ "num_tokens": 3299814.0,
1162
+ "step": 114
1163
+ },
1164
+ {
1165
+ "entropy": 1.0879952907562256,
1166
+ "epoch": 1.369047619047619,
1167
+ "grad_norm": 0.2119140625,
1168
+ "learning_rate": 1.1490422661761744e-05,
1169
+ "loss": 0.7993915677070618,
1170
+ "mean_token_accuracy": 0.7771986275911331,
1171
+ "num_tokens": 3328509.0,
1172
+ "step": 115
1173
+ },
1174
+ {
1175
+ "entropy": 1.112231805920601,
1176
+ "epoch": 1.380952380952381,
1177
+ "grad_norm": 0.224609375,
1178
+ "learning_rate": 1.1367036083100735e-05,
1179
+ "loss": 0.8307598233222961,
1180
+ "mean_token_accuracy": 0.7695401236414909,
1181
+ "num_tokens": 3356953.0,
1182
+ "step": 116
1183
+ },
1184
+ {
1185
+ "entropy": 1.0973141938447952,
1186
+ "epoch": 1.3928571428571428,
1187
+ "grad_norm": 0.2314453125,
1188
+ "learning_rate": 1.1243437046474854e-05,
1189
+ "loss": 0.8001049757003784,
1190
+ "mean_token_accuracy": 0.7750882878899574,
1191
+ "num_tokens": 3385659.0,
1192
+ "step": 117
1193
+ },
1194
+ {
1195
+ "entropy": 1.1106764674186707,
1196
+ "epoch": 1.4047619047619047,
1197
+ "grad_norm": 0.228515625,
1198
+ "learning_rate": 1.1119644761033079e-05,
1199
+ "loss": 0.820791482925415,
1200
+ "mean_token_accuracy": 0.7748995646834373,
1201
+ "num_tokens": 3414046.0,
1202
+ "step": 118
1203
+ },
1204
+ {
1205
+ "entropy": 1.0989094227552414,
1206
+ "epoch": 1.4166666666666667,
1207
+ "grad_norm": 0.26171875,
1208
+ "learning_rate": 1.0995678465958168e-05,
1209
+ "loss": 0.8132579326629639,
1210
+ "mean_token_accuracy": 0.7685609012842178,
1211
+ "num_tokens": 3442153.0,
1212
+ "step": 119
1213
+ },
1214
+ {
1215
+ "entropy": 1.123728185892105,
1216
+ "epoch": 1.4285714285714286,
1217
+ "grad_norm": 0.2265625,
1218
+ "learning_rate": 1.0871557427476585e-05,
1219
+ "loss": 0.8655298948287964,
1220
+ "mean_token_accuracy": 0.7622044086456299,
1221
+ "num_tokens": 3471630.0,
1222
+ "step": 120
1223
+ },
1224
+ {
1225
+ "entropy": 1.1011102497577667,
1226
+ "epoch": 1.4404761904761905,
1227
+ "grad_norm": 0.21875,
1228
+ "learning_rate": 1.0747300935864245e-05,
1229
+ "loss": 0.8160438537597656,
1230
+ "mean_token_accuracy": 0.7715617045760155,
1231
+ "num_tokens": 3500341.0,
1232
+ "step": 121
1233
+ },
1234
+ {
1235
+ "entropy": 1.0802496522665024,
1236
+ "epoch": 1.4523809523809523,
1237
+ "grad_norm": 0.22265625,
1238
+ "learning_rate": 1.0622928302448523e-05,
1239
+ "loss": 0.795846700668335,
1240
+ "mean_token_accuracy": 0.7745838463306427,
1241
+ "num_tokens": 3530737.0,
1242
+ "step": 122
1243
+ },
1244
+ {
1245
+ "entropy": 1.111521065235138,
1246
+ "epoch": 1.4642857142857144,
1247
+ "grad_norm": 0.2275390625,
1248
+ "learning_rate": 1.0498458856606972e-05,
1249
+ "loss": 0.8259180188179016,
1250
+ "mean_token_accuracy": 0.7704022750258446,
1251
+ "num_tokens": 3559280.0,
1252
+ "step": 123
1253
+ },
1254
+ {
1255
+ "entropy": 1.1052347421646118,
1256
+ "epoch": 1.4761904761904763,
1257
+ "grad_norm": 0.240234375,
1258
+ "learning_rate": 1.037391194276326e-05,
1259
+ "loss": 0.8490574359893799,
1260
+ "mean_token_accuracy": 0.7630502879619598,
1261
+ "num_tokens": 3588223.0,
1262
+ "step": 124
1263
+ },
1264
+ {
1265
+ "entropy": 1.1114005744457245,
1266
+ "epoch": 1.4880952380952381,
1267
+ "grad_norm": 0.2255859375,
1268
+ "learning_rate": 1.0249306917380731e-05,
1269
+ "loss": 0.8460506796836853,
1270
+ "mean_token_accuracy": 0.766345664858818,
1271
+ "num_tokens": 3617347.0,
1272
+ "step": 125
1273
+ },
1274
+ {
1275
+ "entropy": 1.115243524312973,
1276
+ "epoch": 1.5,
1277
+ "grad_norm": 0.21484375,
1278
+ "learning_rate": 1.0124663145954152e-05,
1279
+ "loss": 0.8421509265899658,
1280
+ "mean_token_accuracy": 0.7646084725856781,
1281
+ "num_tokens": 3646452.0,
1282
+ "step": 126
1283
+ },
1284
+ {
1285
+ "entropy": 1.1102195531129837,
1286
+ "epoch": 1.5119047619047619,
1287
+ "grad_norm": 0.68359375,
1288
+ "learning_rate": 1e-05,
1289
+ "loss": 0.8465963006019592,
1290
+ "mean_token_accuracy": 0.7651297971606255,
1291
+ "num_tokens": 3674684.0,
1292
+ "step": 127
1293
+ },
1294
+ {
1295
+ "entropy": 1.1138557642698288,
1296
+ "epoch": 1.5238095238095237,
1297
+ "grad_norm": 0.2392578125,
1298
+ "learning_rate": 9.87533685404585e-06,
1299
+ "loss": 0.8462578058242798,
1300
+ "mean_token_accuracy": 0.7656397670507431,
1301
+ "num_tokens": 3701972.0,
1302
+ "step": 128
1303
+ },
1304
+ {
1305
+ "entropy": 1.0684314519166946,
1306
+ "epoch": 1.5357142857142856,
1307
+ "grad_norm": 0.2294921875,
1308
+ "learning_rate": 9.750693082619274e-06,
1309
+ "loss": 0.7849152684211731,
1310
+ "mean_token_accuracy": 0.7857328802347183,
1311
+ "num_tokens": 3731223.0,
1312
+ "step": 129
1313
+ },
1314
+ {
1315
+ "entropy": 1.0986532717943192,
1316
+ "epoch": 1.5476190476190477,
1317
+ "grad_norm": 0.234375,
1318
+ "learning_rate": 9.626088057236745e-06,
1319
+ "loss": 0.8162216544151306,
1320
+ "mean_token_accuracy": 0.7728657871484756,
1321
+ "num_tokens": 3759466.0,
1322
+ "step": 130
1323
+ },
1324
+ {
1325
+ "entropy": 1.1011187136173248,
1326
+ "epoch": 1.5595238095238095,
1327
+ "grad_norm": 0.220703125,
1328
+ "learning_rate": 9.501541143393028e-06,
1329
+ "loss": 0.8209044933319092,
1330
+ "mean_token_accuracy": 0.7711444199085236,
1331
+ "num_tokens": 3788276.0,
1332
+ "step": 131
1333
+ },
1334
+ {
1335
+ "entropy": 1.0799630433321,
1336
+ "epoch": 1.5714285714285714,
1337
+ "grad_norm": 0.21875,
1338
+ "learning_rate": 9.377071697551479e-06,
1339
+ "loss": 0.7802744507789612,
1340
+ "mean_token_accuracy": 0.7825465202331543,
1341
+ "num_tokens": 3817834.0,
1342
+ "step": 132
1343
+ },
1344
+ {
1345
+ "entropy": 1.1000354290008545,
1346
+ "epoch": 1.5833333333333335,
1347
+ "grad_norm": 0.21875,
1348
+ "learning_rate": 9.252699064135759e-06,
1349
+ "loss": 0.8035217523574829,
1350
+ "mean_token_accuracy": 0.7786530405282974,
1351
+ "num_tokens": 3846803.0,
1352
+ "step": 133
1353
+ },
1354
+ {
1355
+ "entropy": 1.106198564171791,
1356
+ "epoch": 1.5952380952380953,
1357
+ "grad_norm": 0.2119140625,
1358
+ "learning_rate": 9.128442572523418e-06,
1359
+ "loss": 0.8161381483078003,
1360
+ "mean_token_accuracy": 0.7741018161177635,
1361
+ "num_tokens": 3875363.0,
1362
+ "step": 134
1363
+ },
1364
+ {
1365
+ "entropy": 1.094715103507042,
1366
+ "epoch": 1.6071428571428572,
1367
+ "grad_norm": 0.2236328125,
1368
+ "learning_rate": 9.004321534041836e-06,
1369
+ "loss": 0.797153115272522,
1370
+ "mean_token_accuracy": 0.7743495553731918,
1371
+ "num_tokens": 3904020.0,
1372
+ "step": 135
1373
+ },
1374
+ {
1375
+ "entropy": 1.0934260189533234,
1376
+ "epoch": 1.619047619047619,
1377
+ "grad_norm": 0.2236328125,
1378
+ "learning_rate": 8.880355238966923e-06,
1379
+ "loss": 0.7957767248153687,
1380
+ "mean_token_accuracy": 0.7722097188234329,
1381
+ "num_tokens": 3932554.0,
1382
+ "step": 136
1383
+ },
1384
+ {
1385
+ "entropy": 1.0932885110378265,
1386
+ "epoch": 1.630952380952381,
1387
+ "grad_norm": 0.2216796875,
1388
+ "learning_rate": 8.756562953525151e-06,
1389
+ "loss": 0.8285123109817505,
1390
+ "mean_token_accuracy": 0.7748213410377502,
1391
+ "num_tokens": 3963124.0,
1392
+ "step": 137
1393
+ },
1394
+ {
1395
+ "entropy": 1.0973111540079117,
1396
+ "epoch": 1.6428571428571428,
1397
+ "grad_norm": 0.220703125,
1398
+ "learning_rate": 8.632963916899268e-06,
1399
+ "loss": 0.8037251234054565,
1400
+ "mean_token_accuracy": 0.7732022255659103,
1401
+ "num_tokens": 3991485.0,
1402
+ "step": 138
1403
+ },
1404
+ {
1405
+ "entropy": 1.1007558554410934,
1406
+ "epoch": 1.6547619047619047,
1407
+ "grad_norm": 0.22265625,
1408
+ "learning_rate": 8.509577338238255e-06,
1409
+ "loss": 0.8211590051651001,
1410
+ "mean_token_accuracy": 0.770406000316143,
1411
+ "num_tokens": 4020583.0,
1412
+ "step": 139
1413
+ },
1414
+ {
1415
+ "entropy": 1.0826598927378654,
1416
+ "epoch": 1.6666666666666665,
1417
+ "grad_norm": 0.2177734375,
1418
+ "learning_rate": 8.386422393671934e-06,
1419
+ "loss": 0.7706205248832703,
1420
+ "mean_token_accuracy": 0.7830442562699318,
1421
+ "num_tokens": 4049853.0,
1422
+ "step": 140
1423
+ },
1424
+ {
1425
+ "entropy": 1.1024836301803589,
1426
+ "epoch": 1.6785714285714286,
1427
+ "grad_norm": 0.2099609375,
1428
+ "learning_rate": 8.263518223330698e-06,
1429
+ "loss": 0.8168199062347412,
1430
+ "mean_token_accuracy": 0.7706331759691238,
1431
+ "num_tokens": 4079030.0,
1432
+ "step": 141
1433
+ },
1434
+ {
1435
+ "entropy": 1.1091957688331604,
1436
+ "epoch": 1.6904761904761905,
1437
+ "grad_norm": 0.2236328125,
1438
+ "learning_rate": 8.140883928370855e-06,
1439
+ "loss": 0.8526521325111389,
1440
+ "mean_token_accuracy": 0.7632784247398376,
1441
+ "num_tokens": 4108702.0,
1442
+ "step": 142
1443
+ },
1444
+ {
1445
+ "entropy": 1.0926142483949661,
1446
+ "epoch": 1.7023809523809523,
1447
+ "grad_norm": 0.251953125,
1448
+ "learning_rate": 8.018538568006027e-06,
1449
+ "loss": 0.800937294960022,
1450
+ "mean_token_accuracy": 0.7739113718271255,
1451
+ "num_tokens": 4138456.0,
1452
+ "step": 143
1453
+ },
1454
+ {
1455
+ "entropy": 1.085595116019249,
1456
+ "epoch": 1.7142857142857144,
1457
+ "grad_norm": 0.21875,
1458
+ "learning_rate": 7.896501156545044e-06,
1459
+ "loss": 0.7860180735588074,
1460
+ "mean_token_accuracy": 0.7786939144134521,
1461
+ "num_tokens": 4168706.0,
1462
+ "step": 144
1463
+ },
1464
+ {
1465
+ "entropy": 1.072287455201149,
1466
+ "epoch": 1.7261904761904763,
1467
+ "grad_norm": 0.2197265625,
1468
+ "learning_rate": 7.774790660436857e-06,
1469
+ "loss": 0.7674008011817932,
1470
+ "mean_token_accuracy": 0.7878812775015831,
1471
+ "num_tokens": 4197229.0,
1472
+ "step": 145
1473
+ },
1474
+ {
1475
+ "entropy": 1.1147316098213196,
1476
+ "epoch": 1.7380952380952381,
1477
+ "grad_norm": 0.21484375,
1478
+ "learning_rate": 7.653425995322852e-06,
1479
+ "loss": 0.8494656682014465,
1480
+ "mean_token_accuracy": 0.7613426074385643,
1481
+ "num_tokens": 4226241.0,
1482
+ "step": 146
1483
+ },
1484
+ {
1485
+ "entropy": 1.0884745866060257,
1486
+ "epoch": 1.75,
1487
+ "grad_norm": 0.2353515625,
1488
+ "learning_rate": 7.532426023097063e-06,
1489
+ "loss": 0.7670794129371643,
1490
+ "mean_token_accuracy": 0.7854177579283714,
1491
+ "num_tokens": 4254275.0,
1492
+ "step": 147
1493
+ },
1494
+ {
1495
+ "entropy": 1.0930557996034622,
1496
+ "epoch": 1.7619047619047619,
1497
+ "grad_norm": 0.208984375,
1498
+ "learning_rate": 7.411809548974792e-06,
1499
+ "loss": 0.8160566091537476,
1500
+ "mean_token_accuracy": 0.7741377875208855,
1501
+ "num_tokens": 4283150.0,
1502
+ "step": 148
1503
+ },
1504
+ {
1505
+ "entropy": 1.0953784435987473,
1506
+ "epoch": 1.7738095238095237,
1507
+ "grad_norm": 0.2236328125,
1508
+ "learning_rate": 7.291595318569951e-06,
1509
+ "loss": 0.8185824751853943,
1510
+ "mean_token_accuracy": 0.7722392901778221,
1511
+ "num_tokens": 4312410.0,
1512
+ "step": 149
1513
+ },
1514
+ {
1515
+ "entropy": 1.0579064786434174,
1516
+ "epoch": 1.7857142857142856,
1517
+ "grad_norm": 0.2099609375,
1518
+ "learning_rate": 7.171802014981726e-06,
1519
+ "loss": 0.748650848865509,
1520
+ "mean_token_accuracy": 0.79205472022295,
1521
+ "num_tokens": 4341254.0,
1522
+ "step": 150
1523
+ },
1524
+ {
1525
+ "entropy": 1.0844353437423706,
1526
+ "epoch": 1.7976190476190477,
1527
+ "grad_norm": 0.2197265625,
1528
+ "learning_rate": 7.052448255890958e-06,
1529
+ "loss": 0.7923084497451782,
1530
+ "mean_token_accuracy": 0.7777365446090698,
1531
+ "num_tokens": 4369573.0,
1532
+ "step": 151
1533
+ },
1534
+ {
1535
+ "entropy": 1.098150685429573,
1536
+ "epoch": 1.8095238095238095,
1537
+ "grad_norm": 0.23046875,
1538
+ "learning_rate": 6.933552590666659e-06,
1539
+ "loss": 0.8330479860305786,
1540
+ "mean_token_accuracy": 0.7675687223672867,
1541
+ "num_tokens": 4397753.0,
1542
+ "step": 152
1543
+ },
1544
+ {
1545
+ "entropy": 1.0822398364543915,
1546
+ "epoch": 1.8214285714285714,
1547
+ "grad_norm": 0.2119140625,
1548
+ "learning_rate": 6.815133497483157e-06,
1549
+ "loss": 0.7889379262924194,
1550
+ "mean_token_accuracy": 0.7777184247970581,
1551
+ "num_tokens": 4427257.0,
1552
+ "step": 153
1553
+ },
1554
+ {
1555
+ "entropy": 1.0754519402980804,
1556
+ "epoch": 1.8333333333333335,
1557
+ "grad_norm": 0.2236328125,
1558
+ "learning_rate": 6.697209380448333e-06,
1559
+ "loss": 0.784767746925354,
1560
+ "mean_token_accuracy": 0.7786002233624458,
1561
+ "num_tokens": 4456709.0,
1562
+ "step": 154
1563
+ },
1564
+ {
1565
+ "entropy": 1.084225744009018,
1566
+ "epoch": 1.8452380952380953,
1567
+ "grad_norm": 0.2177734375,
1568
+ "learning_rate": 6.579798566743314e-06,
1569
+ "loss": 0.8074082732200623,
1570
+ "mean_token_accuracy": 0.7725819870829582,
1571
+ "num_tokens": 4485653.0,
1572
+ "step": 155
1573
+ },
1574
+ {
1575
+ "entropy": 1.0734328627586365,
1576
+ "epoch": 1.8571428571428572,
1577
+ "grad_norm": 0.203125,
1578
+ "learning_rate": 6.462919303774186e-06,
1579
+ "loss": 0.7693166136741638,
1580
+ "mean_token_accuracy": 0.7844012156128883,
1581
+ "num_tokens": 4515131.0,
1582
+ "step": 156
1583
+ },
1584
+ {
1585
+ "entropy": 1.0944669842720032,
1586
+ "epoch": 1.869047619047619,
1587
+ "grad_norm": 0.224609375,
1588
+ "learning_rate": 6.34658975633605e-06,
1589
+ "loss": 0.8283172249794006,
1590
+ "mean_token_accuracy": 0.7699304968118668,
1591
+ "num_tokens": 4544554.0,
1592
+ "step": 157
1593
+ },
1594
+ {
1595
+ "entropy": 1.0710095912218094,
1596
+ "epoch": 1.880952380952381,
1597
+ "grad_norm": 0.20703125,
1598
+ "learning_rate": 6.230828003789949e-06,
1599
+ "loss": 0.7723422050476074,
1600
+ "mean_token_accuracy": 0.785270169377327,
1601
+ "num_tokens": 4574526.0,
1602
+ "step": 158
1603
+ },
1604
+ {
1605
+ "entropy": 1.1019357591867447,
1606
+ "epoch": 1.8928571428571428,
1607
+ "grad_norm": 0.2236328125,
1608
+ "learning_rate": 6.115652037253054e-06,
1609
+ "loss": 0.842171847820282,
1610
+ "mean_token_accuracy": 0.7660646587610245,
1611
+ "num_tokens": 4603221.0,
1612
+ "step": 159
1613
+ },
1614
+ {
1615
+ "entropy": 1.0798636227846146,
1616
+ "epoch": 1.9047619047619047,
1617
+ "grad_norm": 0.2138671875,
1618
+ "learning_rate": 6.001079756802592e-06,
1619
+ "loss": 0.7799994945526123,
1620
+ "mean_token_accuracy": 0.7830873727798462,
1621
+ "num_tokens": 4632086.0,
1622
+ "step": 160
1623
+ },
1624
+ {
1625
+ "entropy": 1.0717933773994446,
1626
+ "epoch": 1.9166666666666665,
1627
+ "grad_norm": 0.21875,
1628
+ "learning_rate": 5.887128968693887e-06,
1629
+ "loss": 0.7654195427894592,
1630
+ "mean_token_accuracy": 0.7824560701847076,
1631
+ "num_tokens": 4660316.0,
1632
+ "step": 161
1633
+ },
1634
+ {
1635
+ "entropy": 1.069181576371193,
1636
+ "epoch": 1.9285714285714286,
1637
+ "grad_norm": 0.2119140625,
1638
+ "learning_rate": 5.773817382593008e-06,
1639
+ "loss": 0.7898048162460327,
1640
+ "mean_token_accuracy": 0.7769228145480156,
1641
+ "num_tokens": 4689587.0,
1642
+ "step": 162
1643
+ },
1644
+ {
1645
+ "entropy": 1.0788903683423996,
1646
+ "epoch": 1.9404761904761905,
1647
+ "grad_norm": 0.212890625,
1648
+ "learning_rate": 5.66116260882442e-06,
1649
+ "loss": 0.7974780797958374,
1650
+ "mean_token_accuracy": 0.7752274572849274,
1651
+ "num_tokens": 4719335.0,
1652
+ "step": 163
1653
+ },
1654
+ {
1655
+ "entropy": 1.1007077991962433,
1656
+ "epoch": 1.9523809523809523,
1657
+ "grad_norm": 0.23046875,
1658
+ "learning_rate": 5.549182155634076e-06,
1659
+ "loss": 0.7892836332321167,
1660
+ "mean_token_accuracy": 0.7779370620846748,
1661
+ "num_tokens": 4746463.0,
1662
+ "step": 164
1663
+ },
1664
+ {
1665
+ "entropy": 1.0850374549627304,
1666
+ "epoch": 1.9642857142857144,
1667
+ "grad_norm": 0.2109375,
1668
+ "learning_rate": 5.43789342646837e-06,
1669
+ "loss": 0.7919931411743164,
1670
+ "mean_token_accuracy": 0.7770635932683945,
1671
+ "num_tokens": 4775141.0,
1672
+ "step": 165
1673
+ },
1674
+ {
1675
+ "entropy": 1.081614837050438,
1676
+ "epoch": 1.9761904761904763,
1677
+ "grad_norm": 0.21875,
1678
+ "learning_rate": 5.32731371726938e-06,
1679
+ "loss": 0.7762281894683838,
1680
+ "mean_token_accuracy": 0.7814824879169464,
1681
+ "num_tokens": 4803229.0,
1682
+ "step": 166
1683
+ },
1684
+ {
1685
+ "entropy": 1.086833968758583,
1686
+ "epoch": 1.9880952380952381,
1687
+ "grad_norm": 0.216796875,
1688
+ "learning_rate": 5.217460213786822e-06,
1689
+ "loss": 0.8244621157646179,
1690
+ "mean_token_accuracy": 0.7730955481529236,
1691
+ "num_tokens": 4832506.0,
1692
+ "step": 167
1693
+ },
1694
+ {
1695
+ "entropy": 1.0700944513082504,
1696
+ "epoch": 2.0,
1697
+ "grad_norm": 0.2216796875,
1698
+ "learning_rate": 5.108349988907111e-06,
1699
+ "loss": 0.7783507704734802,
1700
+ "mean_token_accuracy": 0.7822717130184174,
1701
+ "num_tokens": 4860038.0,
1702
+ "step": 168
1703
+ },
1704
+ {
1705
+ "epoch": 2.0,
1706
+ "eval_entropy": 1.0883367625872293,
1707
+ "eval_loss": 0.8387430906295776,
1708
+ "eval_mean_token_accuracy": 0.7681301248073578,
1709
+ "eval_model_preparation_time": 0.0051,
1710
+ "eval_num_tokens": 4860038.0,
1711
+ "eval_runtime": 19.5881,
1712
+ "eval_samples_per_second": 7.658,
1713
+ "eval_steps_per_second": 7.658,
1714
+ "step": 168
1715
+ }
1716
+ ],
1717
+ "logging_steps": 1,
1718
+ "max_steps": 252,
1719
+ "num_input_tokens_seen": 0,
1720
+ "num_train_epochs": 3,
1721
+ "save_steps": 500,
1722
+ "stateful_callbacks": {
1723
+ "TrainerControl": {
1724
+ "args": {
1725
+ "should_epoch_stop": false,
1726
+ "should_evaluate": false,
1727
+ "should_log": false,
1728
+ "should_save": true,
1729
+ "should_training_stop": false
1730
+ },
1731
+ "attributes": {}
1732
+ }
1733
+ },
1734
+ "total_flos": 1.1131690390237286e+17,
1735
+ "train_batch_size": 2,
1736
+ "trial_name": null,
1737
+ "trial_params": null
1738
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df04f2387ceaaf0af4f50c3c27439b4b3b5bb4a366490e82fbcb5ddc98d615ef
3
+ size 5649