cutelemonlili committed on
Commit
ece0b12
·
verified ·
1 Parent(s): d7096fc

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: Qwen/Qwen2.5-14B-Instruct
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: MATH_training_response_Qwen2.5_72B
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # MATH_training_response_Qwen2.5_72B
18
+
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) on the MATH_training_response_Qwen2.5_72B dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.0579
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 0.0001
41
+ - train_batch_size: 1
42
+ - eval_batch_size: 1
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 4
46
+ - total_train_batch_size: 4
47
+ - total_eval_batch_size: 4
48
+ - optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
49
+ - lr_scheduler_type: cosine
50
+ - lr_scheduler_warmup_ratio: 0.1
51
+ - num_epochs: 2.0
52
+
53
+ ### Training results
54
+
55
+ | Training Loss | Epoch | Step | Validation Loss |
56
+ |:-------------:|:------:|:----:|:---------------:|
57
+ | 0.046 | 1.3889 | 200 | 0.0584 |
58
+
59
+
60
+ ### Framework versions
61
+
62
+ - PEFT 0.12.0
63
+ - Transformers 4.46.1
64
+ - Pytorch 2.5.1+cu124
65
+ - Datasets 3.1.0
66
+ - Tokenizers 0.20.3
adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-14B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "down_proj",
25
+ "o_proj",
26
+ "q_proj",
27
+ "up_proj",
28
+ "gate_proj",
29
+ "v_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fea4b9066052e7677343c9fe97cb80f2f66e84669ffc4178961eb4d211cf4801
3
+ size 68902296
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
all_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_loss": 0.05785840377211571,
4
+ "eval_runtime": 2.9043,
5
+ "eval_samples_per_second": 2.066,
6
+ "eval_steps_per_second": 0.689,
7
+ "total_flos": 547031730880512.0,
8
+ "train_loss": 0.056736280779457755,
9
+ "train_runtime": 1071.8886,
10
+ "train_samples_per_second": 1.069,
11
+ "train_steps_per_second": 0.269
12
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_loss": 0.05785840377211571,
4
+ "eval_runtime": 2.9043,
5
+ "eval_samples_per_second": 2.066,
6
+ "eval_steps_per_second": 0.689
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "right",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "total_flos": 547031730880512.0,
4
+ "train_loss": 0.056736280779457755,
5
+ "train_runtime": 1071.8886,
6
+ "train_samples_per_second": 1.069,
7
+ "train_steps_per_second": 0.269
8
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 1, "total_steps": 288, "loss": 0.1218, "lr": 3.448275862068966e-06, "epoch": 0.006944444444444444, "percentage": 0.35, "elapsed_time": "0:00:06", "remaining_time": "0:32:10"}
2
+ {"current_steps": 2, "total_steps": 288, "loss": 0.1441, "lr": 6.896551724137932e-06, "epoch": 0.013888888888888888, "percentage": 0.69, "elapsed_time": "0:00:16", "remaining_time": "0:39:48"}
3
+ {"current_steps": 3, "total_steps": 288, "loss": 0.1517, "lr": 1.0344827586206897e-05, "epoch": 0.020833333333333332, "percentage": 1.04, "elapsed_time": "0:00:20", "remaining_time": "0:32:12"}
4
+ {"current_steps": 4, "total_steps": 288, "loss": 0.184, "lr": 1.3793103448275863e-05, "epoch": 0.027777777777777776, "percentage": 1.39, "elapsed_time": "0:00:23", "remaining_time": "0:28:19"}
5
+ {"current_steps": 5, "total_steps": 288, "loss": 0.0937, "lr": 1.7241379310344828e-05, "epoch": 0.034722222222222224, "percentage": 1.74, "elapsed_time": "0:00:27", "remaining_time": "0:25:57"}
6
+ {"current_steps": 6, "total_steps": 288, "loss": 0.1932, "lr": 2.0689655172413793e-05, "epoch": 0.041666666666666664, "percentage": 2.08, "elapsed_time": "0:00:31", "remaining_time": "0:24:20"}
7
+ {"current_steps": 7, "total_steps": 288, "loss": 0.1163, "lr": 2.413793103448276e-05, "epoch": 0.04861111111111111, "percentage": 2.43, "elapsed_time": "0:00:34", "remaining_time": "0:23:13"}
8
+ {"current_steps": 8, "total_steps": 288, "loss": 0.0997, "lr": 2.7586206896551727e-05, "epoch": 0.05555555555555555, "percentage": 2.78, "elapsed_time": "0:00:38", "remaining_time": "0:22:21"}
9
+ {"current_steps": 9, "total_steps": 288, "loss": 0.0691, "lr": 3.103448275862069e-05, "epoch": 0.0625, "percentage": 3.12, "elapsed_time": "0:00:41", "remaining_time": "0:21:39"}
10
+ {"current_steps": 10, "total_steps": 288, "loss": 0.1096, "lr": 3.4482758620689657e-05, "epoch": 0.06944444444444445, "percentage": 3.47, "elapsed_time": "0:00:45", "remaining_time": "0:21:05"}
11
+ {"current_steps": 11, "total_steps": 288, "loss": 0.0915, "lr": 3.793103448275862e-05, "epoch": 0.0763888888888889, "percentage": 3.82, "elapsed_time": "0:00:49", "remaining_time": "0:20:37"}
12
+ {"current_steps": 12, "total_steps": 288, "loss": 0.0758, "lr": 4.1379310344827587e-05, "epoch": 0.08333333333333333, "percentage": 4.17, "elapsed_time": "0:00:52", "remaining_time": "0:20:13"}
13
+ {"current_steps": 13, "total_steps": 288, "loss": 0.085, "lr": 4.482758620689655e-05, "epoch": 0.09027777777777778, "percentage": 4.51, "elapsed_time": "0:00:56", "remaining_time": "0:19:52"}
14
+ {"current_steps": 14, "total_steps": 288, "loss": 0.1039, "lr": 4.827586206896552e-05, "epoch": 0.09722222222222222, "percentage": 4.86, "elapsed_time": "0:00:59", "remaining_time": "0:19:32"}
15
+ {"current_steps": 15, "total_steps": 288, "loss": 0.1153, "lr": 5.172413793103449e-05, "epoch": 0.10416666666666667, "percentage": 5.21, "elapsed_time": "0:01:03", "remaining_time": "0:19:17"}
16
+ {"current_steps": 16, "total_steps": 288, "loss": 0.096, "lr": 5.517241379310345e-05, "epoch": 0.1111111111111111, "percentage": 5.56, "elapsed_time": "0:01:07", "remaining_time": "0:19:02"}
17
+ {"current_steps": 17, "total_steps": 288, "loss": 0.1187, "lr": 5.862068965517241e-05, "epoch": 0.11805555555555555, "percentage": 5.9, "elapsed_time": "0:01:10", "remaining_time": "0:18:48"}
18
+ {"current_steps": 18, "total_steps": 288, "loss": 0.0613, "lr": 6.206896551724138e-05, "epoch": 0.125, "percentage": 6.25, "elapsed_time": "0:01:14", "remaining_time": "0:18:35"}
19
+ {"current_steps": 19, "total_steps": 288, "loss": 0.112, "lr": 6.551724137931034e-05, "epoch": 0.13194444444444445, "percentage": 6.6, "elapsed_time": "0:01:18", "remaining_time": "0:18:24"}
20
+ {"current_steps": 20, "total_steps": 288, "loss": 0.1131, "lr": 6.896551724137931e-05, "epoch": 0.1388888888888889, "percentage": 6.94, "elapsed_time": "0:01:21", "remaining_time": "0:18:14"}
21
+ {"current_steps": 21, "total_steps": 288, "loss": 0.0754, "lr": 7.241379310344828e-05, "epoch": 0.14583333333333334, "percentage": 7.29, "elapsed_time": "0:01:25", "remaining_time": "0:18:04"}
22
+ {"current_steps": 22, "total_steps": 288, "loss": 0.0476, "lr": 7.586206896551724e-05, "epoch": 0.1527777777777778, "percentage": 7.64, "elapsed_time": "0:01:28", "remaining_time": "0:17:54"}
23
+ {"current_steps": 23, "total_steps": 288, "loss": 0.0778, "lr": 7.931034482758621e-05, "epoch": 0.1597222222222222, "percentage": 7.99, "elapsed_time": "0:01:32", "remaining_time": "0:17:46"}
24
+ {"current_steps": 24, "total_steps": 288, "loss": 0.0749, "lr": 8.275862068965517e-05, "epoch": 0.16666666666666666, "percentage": 8.33, "elapsed_time": "0:01:36", "remaining_time": "0:17:37"}
25
+ {"current_steps": 25, "total_steps": 288, "loss": 0.0692, "lr": 8.620689655172413e-05, "epoch": 0.1736111111111111, "percentage": 8.68, "elapsed_time": "0:01:39", "remaining_time": "0:17:29"}
26
+ {"current_steps": 26, "total_steps": 288, "loss": 0.1005, "lr": 8.96551724137931e-05, "epoch": 0.18055555555555555, "percentage": 9.03, "elapsed_time": "0:01:43", "remaining_time": "0:17:21"}
27
+ {"current_steps": 27, "total_steps": 288, "loss": 0.0796, "lr": 9.310344827586207e-05, "epoch": 0.1875, "percentage": 9.38, "elapsed_time": "0:01:46", "remaining_time": "0:17:14"}
28
+ {"current_steps": 28, "total_steps": 288, "loss": 0.0808, "lr": 9.655172413793105e-05, "epoch": 0.19444444444444445, "percentage": 9.72, "elapsed_time": "0:01:50", "remaining_time": "0:17:07"}
29
+ {"current_steps": 29, "total_steps": 288, "loss": 0.0589, "lr": 0.0001, "epoch": 0.2013888888888889, "percentage": 10.07, "elapsed_time": "0:01:54", "remaining_time": "0:16:59"}
30
+ {"current_steps": 30, "total_steps": 288, "loss": 0.0885, "lr": 9.999632180371776e-05, "epoch": 0.20833333333333334, "percentage": 10.42, "elapsed_time": "0:01:57", "remaining_time": "0:16:52"}
31
+ {"current_steps": 31, "total_steps": 288, "loss": 0.0859, "lr": 9.998528775603611e-05, "epoch": 0.2152777777777778, "percentage": 10.76, "elapsed_time": "0:02:01", "remaining_time": "0:16:46"}
32
+ {"current_steps": 32, "total_steps": 288, "loss": 0.0482, "lr": 9.99668994803708e-05, "epoch": 0.2222222222222222, "percentage": 11.11, "elapsed_time": "0:02:05", "remaining_time": "0:16:40"}
33
+ {"current_steps": 33, "total_steps": 288, "loss": 0.0575, "lr": 9.994115968214932e-05, "epoch": 0.22916666666666666, "percentage": 11.46, "elapsed_time": "0:02:08", "remaining_time": "0:16:34"}
34
+ {"current_steps": 34, "total_steps": 288, "loss": 0.0661, "lr": 9.990807214841287e-05, "epoch": 0.2361111111111111, "percentage": 11.81, "elapsed_time": "0:02:12", "remaining_time": "0:16:28"}
35
+ {"current_steps": 35, "total_steps": 288, "loss": 0.0556, "lr": 9.986764174725919e-05, "epoch": 0.24305555555555555, "percentage": 12.15, "elapsed_time": "0:02:15", "remaining_time": "0:16:22"}
36
+ {"current_steps": 36, "total_steps": 288, "loss": 0.0546, "lr": 9.981987442712633e-05, "epoch": 0.25, "percentage": 12.5, "elapsed_time": "0:02:19", "remaining_time": "0:16:16"}
37
+ {"current_steps": 37, "total_steps": 288, "loss": 0.0704, "lr": 9.976477721591745e-05, "epoch": 0.2569444444444444, "percentage": 12.85, "elapsed_time": "0:02:23", "remaining_time": "0:16:10"}
38
+ {"current_steps": 38, "total_steps": 288, "loss": 0.0627, "lr": 9.97023582199669e-05, "epoch": 0.2638888888888889, "percentage": 13.19, "elapsed_time": "0:02:26", "remaining_time": "0:16:05"}
39
+ {"current_steps": 39, "total_steps": 288, "loss": 0.0353, "lr": 9.963262662284736e-05, "epoch": 0.2708333333333333, "percentage": 13.54, "elapsed_time": "0:02:30", "remaining_time": "0:16:00"}
40
+ {"current_steps": 40, "total_steps": 288, "loss": 0.0562, "lr": 9.955559268401893e-05, "epoch": 0.2777777777777778, "percentage": 13.89, "elapsed_time": "0:02:34", "remaining_time": "0:15:54"}
41
+ {"current_steps": 41, "total_steps": 288, "loss": 0.0774, "lr": 9.947126773731948e-05, "epoch": 0.2847222222222222, "percentage": 14.24, "elapsed_time": "0:02:37", "remaining_time": "0:15:49"}
42
+ {"current_steps": 42, "total_steps": 288, "loss": 0.0688, "lr": 9.937966418929726e-05, "epoch": 0.2916666666666667, "percentage": 14.58, "elapsed_time": "0:02:41", "remaining_time": "0:15:43"}
43
+ {"current_steps": 43, "total_steps": 288, "loss": 0.0631, "lr": 9.928079551738543e-05, "epoch": 0.2986111111111111, "percentage": 14.93, "elapsed_time": "0:02:44", "remaining_time": "0:15:38"}
44
+ {"current_steps": 44, "total_steps": 288, "loss": 0.0602, "lr": 9.917467626791925e-05, "epoch": 0.3055555555555556, "percentage": 15.28, "elapsed_time": "0:02:48", "remaining_time": "0:15:33"}
45
+ {"current_steps": 45, "total_steps": 288, "loss": 0.0603, "lr": 9.90613220539959e-05, "epoch": 0.3125, "percentage": 15.62, "elapsed_time": "0:02:52", "remaining_time": "0:15:28"}
46
+ {"current_steps": 46, "total_steps": 288, "loss": 0.05, "lr": 9.89407495531773e-05, "epoch": 0.3194444444444444, "percentage": 15.97, "elapsed_time": "0:02:55", "remaining_time": "0:15:23"}
47
+ {"current_steps": 47, "total_steps": 288, "loss": 0.0666, "lr": 9.881297650503641e-05, "epoch": 0.3263888888888889, "percentage": 16.32, "elapsed_time": "0:02:59", "remaining_time": "0:15:18"}
48
+ {"current_steps": 48, "total_steps": 288, "loss": 0.0576, "lr": 9.867802170854724e-05, "epoch": 0.3333333333333333, "percentage": 16.67, "elapsed_time": "0:03:02", "remaining_time": "0:15:14"}
49
+ {"current_steps": 49, "total_steps": 288, "loss": 0.0714, "lr": 9.853590501931904e-05, "epoch": 0.3402777777777778, "percentage": 17.01, "elapsed_time": "0:03:06", "remaining_time": "0:15:09"}
50
+ {"current_steps": 50, "total_steps": 288, "loss": 0.0557, "lr": 9.838664734667495e-05, "epoch": 0.3472222222222222, "percentage": 17.36, "elapsed_time": "0:03:09", "remaining_time": "0:15:04"}
51
+ {"current_steps": 51, "total_steps": 288, "loss": 0.0608, "lr": 9.82302706505756e-05, "epoch": 0.3541666666666667, "percentage": 17.71, "elapsed_time": "0:03:13", "remaining_time": "0:15:01"}
52
+ {"current_steps": 52, "total_steps": 288, "loss": 0.0457, "lr": 9.806679793838829e-05, "epoch": 0.3611111111111111, "percentage": 18.06, "elapsed_time": "0:03:17", "remaining_time": "0:14:57"}
53
+ {"current_steps": 53, "total_steps": 288, "loss": 0.0538, "lr": 9.78962532615019e-05, "epoch": 0.3680555555555556, "percentage": 18.4, "elapsed_time": "0:03:21", "remaining_time": "0:14:52"}
54
+ {"current_steps": 54, "total_steps": 288, "loss": 0.0535, "lr": 9.771866171178831e-05, "epoch": 0.375, "percentage": 18.75, "elapsed_time": "0:03:24", "remaining_time": "0:14:48"}
55
+ {"current_steps": 55, "total_steps": 288, "loss": 0.0533, "lr": 9.753404941791062e-05, "epoch": 0.3819444444444444, "percentage": 19.1, "elapsed_time": "0:03:28", "remaining_time": "0:14:43"}
56
+ {"current_steps": 56, "total_steps": 288, "loss": 0.0558, "lr": 9.734244354147895e-05, "epoch": 0.3888888888888889, "percentage": 19.44, "elapsed_time": "0:03:32", "remaining_time": "0:14:39"}
57
+ {"current_steps": 57, "total_steps": 288, "loss": 0.059, "lr": 9.714387227305422e-05, "epoch": 0.3958333333333333, "percentage": 19.79, "elapsed_time": "0:03:35", "remaining_time": "0:14:34"}
58
+ {"current_steps": 58, "total_steps": 288, "loss": 0.0666, "lr": 9.693836482800044e-05, "epoch": 0.4027777777777778, "percentage": 20.14, "elapsed_time": "0:03:39", "remaining_time": "0:14:30"}
59
+ {"current_steps": 59, "total_steps": 288, "loss": 0.0553, "lr": 9.672595144218646e-05, "epoch": 0.4097222222222222, "percentage": 20.49, "elapsed_time": "0:03:43", "remaining_time": "0:14:26"}
60
+ {"current_steps": 60, "total_steps": 288, "loss": 0.0736, "lr": 9.650666336753728e-05, "epoch": 0.4166666666666667, "percentage": 20.83, "elapsed_time": "0:03:46", "remaining_time": "0:14:22"}
61
+ {"current_steps": 61, "total_steps": 288, "loss": 0.0658, "lr": 9.628053286743619e-05, "epoch": 0.4236111111111111, "percentage": 21.18, "elapsed_time": "0:03:50", "remaining_time": "0:14:17"}
62
+ {"current_steps": 62, "total_steps": 288, "loss": 0.0597, "lr": 9.604759321197773e-05, "epoch": 0.4305555555555556, "percentage": 21.53, "elapsed_time": "0:03:54", "remaining_time": "0:14:13"}
63
+ {"current_steps": 63, "total_steps": 288, "loss": 0.0661, "lr": 9.580787867307293e-05, "epoch": 0.4375, "percentage": 21.88, "elapsed_time": "0:03:57", "remaining_time": "0:14:09"}
64
+ {"current_steps": 64, "total_steps": 288, "loss": 0.0684, "lr": 9.55614245194068e-05, "epoch": 0.4444444444444444, "percentage": 22.22, "elapsed_time": "0:04:01", "remaining_time": "0:14:04"}
65
+ {"current_steps": 65, "total_steps": 288, "loss": 0.0547, "lr": 9.530826701124939e-05, "epoch": 0.4513888888888889, "percentage": 22.57, "elapsed_time": "0:04:04", "remaining_time": "0:14:00"}
66
+ {"current_steps": 66, "total_steps": 288, "loss": 0.0596, "lr": 9.504844339512095e-05, "epoch": 0.4583333333333333, "percentage": 22.92, "elapsed_time": "0:04:08", "remaining_time": "0:13:55"}
67
+ {"current_steps": 67, "total_steps": 288, "loss": 0.0411, "lr": 9.478199189831183e-05, "epoch": 0.4652777777777778, "percentage": 23.26, "elapsed_time": "0:04:12", "remaining_time": "0:13:51"}
68
+ {"current_steps": 68, "total_steps": 288, "loss": 0.0548, "lr": 9.450895172325822e-05, "epoch": 0.4722222222222222, "percentage": 23.61, "elapsed_time": "0:04:15", "remaining_time": "0:13:47"}
69
+ {"current_steps": 69, "total_steps": 288, "loss": 0.0531, "lr": 9.422936304177439e-05, "epoch": 0.4791666666666667, "percentage": 23.96, "elapsed_time": "0:04:19", "remaining_time": "0:13:43"}
70
+ {"current_steps": 70, "total_steps": 288, "loss": 0.0658, "lr": 9.39432669891423e-05, "epoch": 0.4861111111111111, "percentage": 24.31, "elapsed_time": "0:04:22", "remaining_time": "0:13:38"}
71
+ {"current_steps": 71, "total_steps": 288, "loss": 0.0511, "lr": 9.365070565805941e-05, "epoch": 0.4930555555555556, "percentage": 24.65, "elapsed_time": "0:04:26", "remaining_time": "0:13:34"}
72
+ {"current_steps": 72, "total_steps": 288, "loss": 0.0547, "lr": 9.335172209244575e-05, "epoch": 0.5, "percentage": 25.0, "elapsed_time": "0:04:30", "remaining_time": "0:13:30"}
73
+ {"current_steps": 73, "total_steps": 288, "loss": 0.0674, "lr": 9.304636028111094e-05, "epoch": 0.5069444444444444, "percentage": 25.35, "elapsed_time": "0:04:33", "remaining_time": "0:13:26"}
74
+ {"current_steps": 74, "total_steps": 288, "loss": 0.067, "lr": 9.273466515128209e-05, "epoch": 0.5138888888888888, "percentage": 25.69, "elapsed_time": "0:04:37", "remaining_time": "0:13:22"}
75
+ {"current_steps": 75, "total_steps": 288, "loss": 0.0501, "lr": 9.241668256199392e-05, "epoch": 0.5208333333333334, "percentage": 26.04, "elapsed_time": "0:04:41", "remaining_time": "0:13:18"}
76
+ {"current_steps": 76, "total_steps": 288, "loss": 0.0509, "lr": 9.209245929734156e-05, "epoch": 0.5277777777777778, "percentage": 26.39, "elapsed_time": "0:04:44", "remaining_time": "0:13:13"}
77
+ {"current_steps": 77, "total_steps": 288, "loss": 0.0465, "lr": 9.176204305959726e-05, "epoch": 0.5347222222222222, "percentage": 26.74, "elapsed_time": "0:04:48", "remaining_time": "0:13:09"}
78
+ {"current_steps": 78, "total_steps": 288, "loss": 0.0454, "lr": 9.142548246219212e-05, "epoch": 0.5416666666666666, "percentage": 27.08, "elapsed_time": "0:04:51", "remaining_time": "0:13:05"}
79
+ {"current_steps": 79, "total_steps": 288, "loss": 0.0598, "lr": 9.108282702256365e-05, "epoch": 0.5486111111111112, "percentage": 27.43, "elapsed_time": "0:04:55", "remaining_time": "0:13:01"}
80
+ {"current_steps": 80, "total_steps": 288, "loss": 0.0632, "lr": 9.073412715487044e-05, "epoch": 0.5555555555555556, "percentage": 27.78, "elapsed_time": "0:04:59", "remaining_time": "0:12:57"}
81
+ {"current_steps": 81, "total_steps": 288, "loss": 0.059, "lr": 9.037943416257474e-05, "epoch": 0.5625, "percentage": 28.12, "elapsed_time": "0:05:02", "remaining_time": "0:12:53"}
82
+ {"current_steps": 82, "total_steps": 288, "loss": 0.0578, "lr": 9.001880023089441e-05, "epoch": 0.5694444444444444, "percentage": 28.47, "elapsed_time": "0:05:06", "remaining_time": "0:12:49"}
83
+ {"current_steps": 83, "total_steps": 288, "loss": 0.0435, "lr": 8.965227841912489e-05, "epoch": 0.5763888888888888, "percentage": 28.82, "elapsed_time": "0:05:09", "remaining_time": "0:12:45"}
84
+ {"current_steps": 84, "total_steps": 288, "loss": 0.058, "lr": 8.927992265283282e-05, "epoch": 0.5833333333333334, "percentage": 29.17, "elapsed_time": "0:05:13", "remaining_time": "0:12:41"}
85
+ {"current_steps": 85, "total_steps": 288, "loss": 0.045, "lr": 8.890178771592199e-05, "epoch": 0.5902777777777778, "percentage": 29.51, "elapsed_time": "0:05:17", "remaining_time": "0:12:37"}
86
+ {"current_steps": 86, "total_steps": 288, "loss": 0.0709, "lr": 8.851792924257317e-05, "epoch": 0.5972222222222222, "percentage": 29.86, "elapsed_time": "0:05:20", "remaining_time": "0:12:33"}
87
+ {"current_steps": 87, "total_steps": 288, "loss": 0.0525, "lr": 8.812840370905873e-05, "epoch": 0.6041666666666666, "percentage": 30.21, "elapsed_time": "0:05:24", "remaining_time": "0:12:29"}
88
+ {"current_steps": 88, "total_steps": 288, "loss": 0.0464, "lr": 8.773326842543347e-05, "epoch": 0.6111111111111112, "percentage": 30.56, "elapsed_time": "0:05:27", "remaining_time": "0:12:25"}
89
+ {"current_steps": 89, "total_steps": 288, "loss": 0.0649, "lr": 8.733258152710262e-05, "epoch": 0.6180555555555556, "percentage": 30.9, "elapsed_time": "0:05:31", "remaining_time": "0:12:21"}
90
+ {"current_steps": 90, "total_steps": 288, "loss": 0.0575, "lr": 8.692640196626858e-05, "epoch": 0.625, "percentage": 31.25, "elapsed_time": "0:05:35", "remaining_time": "0:12:17"}
91
+ {"current_steps": 91, "total_steps": 288, "loss": 0.0694, "lr": 8.651478950325737e-05, "epoch": 0.6319444444444444, "percentage": 31.6, "elapsed_time": "0:05:38", "remaining_time": "0:12:13"}
92
+ {"current_steps": 92, "total_steps": 288, "loss": 0.0581, "lr": 8.609780469772623e-05, "epoch": 0.6388888888888888, "percentage": 31.94, "elapsed_time": "0:05:42", "remaining_time": "0:12:09"}
93
+ {"current_steps": 93, "total_steps": 288, "loss": 0.0489, "lr": 8.567550889975362e-05, "epoch": 0.6458333333333334, "percentage": 32.29, "elapsed_time": "0:05:45", "remaining_time": "0:12:05"}
94
+ {"current_steps": 94, "total_steps": 288, "loss": 0.0597, "lr": 8.524796424081292e-05, "epoch": 0.6527777777777778, "percentage": 32.64, "elapsed_time": "0:05:49", "remaining_time": "0:12:01"}
95
+ {"current_steps": 95, "total_steps": 288, "loss": 0.0609, "lr": 8.481523362463111e-05, "epoch": 0.6597222222222222, "percentage": 32.99, "elapsed_time": "0:05:53", "remaining_time": "0:11:57"}
96
+ {"current_steps": 96, "total_steps": 288, "loss": 0.0461, "lr": 8.437738071793394e-05, "epoch": 0.6666666666666666, "percentage": 33.33, "elapsed_time": "0:05:56", "remaining_time": "0:11:53"}
97
+ {"current_steps": 97, "total_steps": 288, "loss": 0.0675, "lr": 8.393446994107877e-05, "epoch": 0.6736111111111112, "percentage": 33.68, "elapsed_time": "0:06:00", "remaining_time": "0:11:49"}
98
+ {"current_steps": 98, "total_steps": 288, "loss": 0.0501, "lr": 8.348656645857649e-05, "epoch": 0.6805555555555556, "percentage": 34.03, "elapsed_time": "0:06:03", "remaining_time": "0:11:45"}
99
+ {"current_steps": 99, "total_steps": 288, "loss": 0.0465, "lr": 8.303373616950408e-05, "epoch": 0.6875, "percentage": 34.38, "elapsed_time": "0:06:07", "remaining_time": "0:11:41"}
100
+ {"current_steps": 100, "total_steps": 288, "loss": 0.0538, "lr": 8.257604569780897e-05, "epoch": 0.6944444444444444, "percentage": 34.72, "elapsed_time": "0:06:11", "remaining_time": "0:11:37"}
101
+ {"current_steps": 101, "total_steps": 288, "loss": 0.0532, "lr": 8.21135623825068e-05, "epoch": 0.7013888888888888, "percentage": 35.07, "elapsed_time": "0:06:14", "remaining_time": "0:11:33"}
102
+ {"current_steps": 102, "total_steps": 288, "loss": 0.0584, "lr": 8.164635426777404e-05, "epoch": 0.7083333333333334, "percentage": 35.42, "elapsed_time": "0:06:18", "remaining_time": "0:11:29"}
103
+ {"current_steps": 103, "total_steps": 288, "loss": 0.0486, "lr": 8.117449009293668e-05, "epoch": 0.7152777777777778, "percentage": 35.76, "elapsed_time": "0:06:22", "remaining_time": "0:11:26"}
104
+ {"current_steps": 104, "total_steps": 288, "loss": 0.0512, "lr": 8.069803928235689e-05, "epoch": 0.7222222222222222, "percentage": 36.11, "elapsed_time": "0:06:25", "remaining_time": "0:11:22"}
105
+ {"current_steps": 105, "total_steps": 288, "loss": 0.0466, "lr": 8.021707193521865e-05, "epoch": 0.7291666666666666, "percentage": 36.46, "elapsed_time": "0:06:29", "remaining_time": "0:11:18"}
106
+ {"current_steps": 106, "total_steps": 288, "loss": 0.0614, "lr": 7.973165881521434e-05, "epoch": 0.7361111111111112, "percentage": 36.81, "elapsed_time": "0:06:32", "remaining_time": "0:11:14"}
107
+ {"current_steps": 107, "total_steps": 288, "loss": 0.0468, "lr": 7.924187134013323e-05, "epoch": 0.7430555555555556, "percentage": 37.15, "elapsed_time": "0:06:36", "remaining_time": "0:11:10"}
108
+ {"current_steps": 108, "total_steps": 288, "loss": 0.0522, "lr": 7.874778157135415e-05, "epoch": 0.75, "percentage": 37.5, "elapsed_time": "0:06:40", "remaining_time": "0:11:06"}
109
+ {"current_steps": 109, "total_steps": 288, "loss": 0.0639, "lr": 7.824946220324312e-05, "epoch": 0.7569444444444444, "percentage": 37.85, "elapsed_time": "0:06:43", "remaining_time": "0:11:02"}
110
+ {"current_steps": 110, "total_steps": 288, "loss": 0.054, "lr": 7.774698655245802e-05, "epoch": 0.7638888888888888, "percentage": 38.19, "elapsed_time": "0:06:47", "remaining_time": "0:10:58"}
111
+ {"current_steps": 111, "total_steps": 288, "loss": 0.0492, "lr": 7.724042854716169e-05, "epoch": 0.7708333333333334, "percentage": 38.54, "elapsed_time": "0:06:50", "remaining_time": "0:10:55"}
112
+ {"current_steps": 112, "total_steps": 288, "loss": 0.0562, "lr": 7.6729862716145e-05, "epoch": 0.7777777777777778, "percentage": 38.89, "elapsed_time": "0:06:54", "remaining_time": "0:10:51"}
113
+ {"current_steps": 113, "total_steps": 288, "loss": 0.069, "lr": 7.621536417786159e-05, "epoch": 0.7847222222222222, "percentage": 39.24, "elapsed_time": "0:06:58", "remaining_time": "0:10:47"}
114
+ {"current_steps": 114, "total_steps": 288, "loss": 0.057, "lr": 7.56970086293759e-05, "epoch": 0.7916666666666666, "percentage": 39.58, "elapsed_time": "0:07:01", "remaining_time": "0:10:43"}
115
+ {"current_steps": 115, "total_steps": 288, "loss": 0.0545, "lr": 7.5174872335226e-05, "epoch": 0.7986111111111112, "percentage": 39.93, "elapsed_time": "0:07:05", "remaining_time": "0:10:39"}
116
+ {"current_steps": 116, "total_steps": 288, "loss": 0.0533, "lr": 7.464903211620291e-05, "epoch": 0.8055555555555556, "percentage": 40.28, "elapsed_time": "0:07:08", "remaining_time": "0:10:35"}
117
+ {"current_steps": 117, "total_steps": 288, "loss": 0.0567, "lr": 7.411956533804818e-05, "epoch": 0.8125, "percentage": 40.62, "elapsed_time": "0:07:12", "remaining_time": "0:10:32"}
118
+ {"current_steps": 118, "total_steps": 288, "loss": 0.0634, "lr": 7.358654990007122e-05, "epoch": 0.8194444444444444, "percentage": 40.97, "elapsed_time": "0:07:16", "remaining_time": "0:10:28"}
119
+ {"current_steps": 119, "total_steps": 288, "loss": 0.0396, "lr": 7.305006422368811e-05, "epoch": 0.8263888888888888, "percentage": 41.32, "elapsed_time": "0:07:19", "remaining_time": "0:10:24"}
120
+ {"current_steps": 120, "total_steps": 288, "loss": 0.0598, "lr": 7.251018724088367e-05, "epoch": 0.8333333333333334, "percentage": 41.67, "elapsed_time": "0:07:23", "remaining_time": "0:10:20"}
121
+ {"current_steps": 121, "total_steps": 288, "loss": 0.0542, "lr": 7.196699838259834e-05, "epoch": 0.8402777777777778, "percentage": 42.01, "elapsed_time": "0:07:26", "remaining_time": "0:10:16"}
122
+ {"current_steps": 122, "total_steps": 288, "loss": 0.064, "lr": 7.142057756704168e-05, "epoch": 0.8472222222222222, "percentage": 42.36, "elapsed_time": "0:07:30", "remaining_time": "0:10:13"}
123
+ {"current_steps": 123, "total_steps": 288, "loss": 0.0552, "lr": 7.087100518793421e-05, "epoch": 0.8541666666666666, "percentage": 42.71, "elapsed_time": "0:07:34", "remaining_time": "0:10:09"}
124
+ {"current_steps": 124, "total_steps": 288, "loss": 0.0519, "lr": 7.031836210267915e-05, "epoch": 0.8611111111111112, "percentage": 43.06, "elapsed_time": "0:07:37", "remaining_time": "0:10:05"}
125
+ {"current_steps": 125, "total_steps": 288, "loss": 0.0544, "lr": 6.976272962046619e-05, "epoch": 0.8680555555555556, "percentage": 43.4, "elapsed_time": "0:07:41", "remaining_time": "0:10:01"}
126
+ {"current_steps": 126, "total_steps": 288, "loss": 0.0684, "lr": 6.920418949030856e-05, "epoch": 0.875, "percentage": 43.75, "elapsed_time": "0:07:44", "remaining_time": "0:09:57"}
127
+ {"current_steps": 127, "total_steps": 288, "loss": 0.0737, "lr": 6.864282388901544e-05, "epoch": 0.8819444444444444, "percentage": 44.1, "elapsed_time": "0:07:48", "remaining_time": "0:09:53"}
128
+ {"current_steps": 128, "total_steps": 288, "loss": 0.0515, "lr": 6.807871540910154e-05, "epoch": 0.8888888888888888, "percentage": 44.44, "elapsed_time": "0:07:52", "remaining_time": "0:09:50"}
129
+ {"current_steps": 129, "total_steps": 288, "loss": 0.0574, "lr": 6.751194704663543e-05, "epoch": 0.8958333333333334, "percentage": 44.79, "elapsed_time": "0:07:55", "remaining_time": "0:09:46"}
130
+ {"current_steps": 130, "total_steps": 288, "loss": 0.0445, "lr": 6.694260218902844e-05, "epoch": 0.9027777777777778, "percentage": 45.14, "elapsed_time": "0:07:59", "remaining_time": "0:09:42"}
131
+ {"current_steps": 131, "total_steps": 288, "loss": 0.0569, "lr": 6.637076460276613e-05, "epoch": 0.9097222222222222, "percentage": 45.49, "elapsed_time": "0:08:02", "remaining_time": "0:09:38"}
132
+ {"current_steps": 132, "total_steps": 288, "loss": 0.0551, "lr": 6.57965184210838e-05, "epoch": 0.9166666666666666, "percentage": 45.83, "elapsed_time": "0:08:06", "remaining_time": "0:09:35"}
133
+ {"current_steps": 133, "total_steps": 288, "loss": 0.0557, "lr": 6.521994813158834e-05, "epoch": 0.9236111111111112, "percentage": 46.18, "elapsed_time": "0:08:10", "remaining_time": "0:09:31"}
134
+ {"current_steps": 134, "total_steps": 288, "loss": 0.041, "lr": 6.464113856382752e-05, "epoch": 0.9305555555555556, "percentage": 46.53, "elapsed_time": "0:08:13", "remaining_time": "0:09:27"}
135
+ {"current_steps": 135, "total_steps": 288, "loss": 0.0522, "lr": 6.406017487680937e-05, "epoch": 0.9375, "percentage": 46.88, "elapsed_time": "0:08:17", "remaining_time": "0:09:23"}
136
+ {"current_steps": 136, "total_steps": 288, "loss": 0.0486, "lr": 6.347714254647284e-05, "epoch": 0.9444444444444444, "percentage": 47.22, "elapsed_time": "0:08:21", "remaining_time": "0:09:19"}
137
+ {"current_steps": 137, "total_steps": 288, "loss": 0.0593, "lr": 6.28921273531119e-05, "epoch": 0.9513888888888888, "percentage": 47.57, "elapsed_time": "0:08:24", "remaining_time": "0:09:16"}
138
+ {"current_steps": 138, "total_steps": 288, "loss": 0.0603, "lr": 6.230521536875494e-05, "epoch": 0.9583333333333334, "percentage": 47.92, "elapsed_time": "0:08:28", "remaining_time": "0:09:12"}
139
+ {"current_steps": 139, "total_steps": 288, "loss": 0.0422, "lr": 6.171649294450113e-05, "epoch": 0.9652777777777778, "percentage": 48.26, "elapsed_time": "0:08:31", "remaining_time": "0:09:08"}
140
+ {"current_steps": 140, "total_steps": 288, "loss": 0.0628, "lr": 6.112604669781572e-05, "epoch": 0.9722222222222222, "percentage": 48.61, "elapsed_time": "0:08:35", "remaining_time": "0:09:04"}
141
+ {"current_steps": 141, "total_steps": 288, "loss": 0.0416, "lr": 6.0533963499786314e-05, "epoch": 0.9791666666666666, "percentage": 48.96, "elapsed_time": "0:08:39", "remaining_time": "0:09:01"}
142
+ {"current_steps": 142, "total_steps": 288, "loss": 0.0663, "lr": 5.994033046234162e-05, "epoch": 0.9861111111111112, "percentage": 49.31, "elapsed_time": "0:08:42", "remaining_time": "0:08:57"}
143
+ {"current_steps": 143, "total_steps": 288, "loss": 0.0598, "lr": 5.934523492543489e-05, "epoch": 0.9930555555555556, "percentage": 49.65, "elapsed_time": "0:08:46", "remaining_time": "0:08:53"}
144
+ {"current_steps": 144, "total_steps": 288, "loss": 0.0553, "lr": 5.874876444419377e-05, "epoch": 1.0, "percentage": 50.0, "elapsed_time": "0:08:49", "remaining_time": "0:08:49"}
145
+ {"current_steps": 145, "total_steps": 288, "loss": 0.0487, "lr": 5.8151006776038544e-05, "epoch": 1.0069444444444444, "percentage": 50.35, "elapsed_time": "0:08:53", "remaining_time": "0:08:46"}
146
+ {"current_steps": 146, "total_steps": 288, "loss": 0.0496, "lr": 5.75520498677705e-05, "epoch": 1.0138888888888888, "percentage": 50.69, "elapsed_time": "0:08:57", "remaining_time": "0:08:42"}
147
+ {"current_steps": 147, "total_steps": 288, "loss": 0.0447, "lr": 5.6951981842632585e-05, "epoch": 1.0208333333333333, "percentage": 51.04, "elapsed_time": "0:09:00", "remaining_time": "0:08:38"}
148
+ {"current_steps": 148, "total_steps": 288, "loss": 0.0574, "lr": 5.6350890987343944e-05, "epoch": 1.0277777777777777, "percentage": 51.39, "elapsed_time": "0:09:04", "remaining_time": "0:08:34"}
149
+ {"current_steps": 149, "total_steps": 288, "loss": 0.0484, "lr": 5.574886573911056e-05, "epoch": 1.0347222222222223, "percentage": 51.74, "elapsed_time": "0:09:07", "remaining_time": "0:08:31"}
150
+ {"current_steps": 150, "total_steps": 288, "loss": 0.0424, "lr": 5.5145994672613624e-05, "epoch": 1.0416666666666667, "percentage": 52.08, "elapsed_time": "0:09:11", "remaining_time": "0:08:27"}
151
+ {"current_steps": 151, "total_steps": 288, "loss": 0.0506, "lr": 5.4542366486977756e-05, "epoch": 1.0486111111111112, "percentage": 52.43, "elapsed_time": "0:09:15", "remaining_time": "0:08:23"}
152
+ {"current_steps": 152, "total_steps": 288, "loss": 0.0527, "lr": 5.39380699927209e-05, "epoch": 1.0555555555555556, "percentage": 52.78, "elapsed_time": "0:09:18", "remaining_time": "0:08:19"}
153
+ {"current_steps": 153, "total_steps": 288, "loss": 0.0511, "lr": 5.3333194098687764e-05, "epoch": 1.0625, "percentage": 53.12, "elapsed_time": "0:09:22", "remaining_time": "0:08:16"}
154
+ {"current_steps": 154, "total_steps": 288, "loss": 0.0543, "lr": 5.272782779896898e-05, "epoch": 1.0694444444444444, "percentage": 53.47, "elapsed_time": "0:09:25", "remaining_time": "0:08:12"}
155
+ {"current_steps": 155, "total_steps": 288, "loss": 0.0595, "lr": 5.212206015980742e-05, "epoch": 1.0763888888888888, "percentage": 53.82, "elapsed_time": "0:09:29", "remaining_time": "0:08:08"}
156
+ {"current_steps": 156, "total_steps": 288, "loss": 0.0396, "lr": 5.151598030649425e-05, "epoch": 1.0833333333333333, "percentage": 54.17, "elapsed_time": "0:09:33", "remaining_time": "0:08:05"}
157
+ {"current_steps": 157, "total_steps": 288, "loss": 0.0436, "lr": 5.0909677410255985e-05, "epoch": 1.0902777777777777, "percentage": 54.51, "elapsed_time": "0:09:36", "remaining_time": "0:08:01"}
158
+ {"current_steps": 158, "total_steps": 288, "loss": 0.0514, "lr": 5.030324067513499e-05, "epoch": 1.0972222222222223, "percentage": 54.86, "elapsed_time": "0:09:40", "remaining_time": "0:07:57"}
159
+ {"current_steps": 159, "total_steps": 288, "loss": 0.0472, "lr": 4.969675932486503e-05, "epoch": 1.1041666666666667, "percentage": 55.21, "elapsed_time": "0:09:44", "remaining_time": "0:07:53"}
160
+ {"current_steps": 160, "total_steps": 288, "loss": 0.0436, "lr": 4.9090322589744027e-05, "epoch": 1.1111111111111112, "percentage": 55.56, "elapsed_time": "0:09:47", "remaining_time": "0:07:50"}
161
+ {"current_steps": 161, "total_steps": 288, "loss": 0.0547, "lr": 4.848401969350577e-05, "epoch": 1.1180555555555556, "percentage": 55.9, "elapsed_time": "0:09:51", "remaining_time": "0:07:46"}
162
+ {"current_steps": 162, "total_steps": 288, "loss": 0.0454, "lr": 4.78779398401926e-05, "epoch": 1.125, "percentage": 56.25, "elapsed_time": "0:09:54", "remaining_time": "0:07:42"}
163
+ {"current_steps": 163, "total_steps": 288, "loss": 0.0481, "lr": 4.7272172201031054e-05, "epoch": 1.1319444444444444, "percentage": 56.6, "elapsed_time": "0:09:58", "remaining_time": "0:07:38"}
164
+ {"current_steps": 164, "total_steps": 288, "loss": 0.0463, "lr": 4.666680590131225e-05, "epoch": 1.1388888888888888, "percentage": 56.94, "elapsed_time": "0:10:02", "remaining_time": "0:07:35"}
165
+ {"current_steps": 165, "total_steps": 288, "loss": 0.0506, "lr": 4.606193000727913e-05, "epoch": 1.1458333333333333, "percentage": 57.29, "elapsed_time": "0:10:05", "remaining_time": "0:07:31"}
166
+ {"current_steps": 166, "total_steps": 288, "loss": 0.0474, "lr": 4.545763351302224e-05, "epoch": 1.1527777777777777, "percentage": 57.64, "elapsed_time": "0:10:09", "remaining_time": "0:07:27"}
167
+ {"current_steps": 167, "total_steps": 288, "loss": 0.0454, "lr": 4.485400532738638e-05, "epoch": 1.1597222222222223, "percentage": 57.99, "elapsed_time": "0:10:12", "remaining_time": "0:07:24"}
168
+ {"current_steps": 168, "total_steps": 288, "loss": 0.0517, "lr": 4.425113426088945e-05, "epoch": 1.1666666666666667, "percentage": 58.33, "elapsed_time": "0:10:16", "remaining_time": "0:07:20"}
169
+ {"current_steps": 169, "total_steps": 288, "loss": 0.0538, "lr": 4.364910901265606e-05, "epoch": 1.1736111111111112, "percentage": 58.68, "elapsed_time": "0:10:20", "remaining_time": "0:07:16"}
170
+ {"current_steps": 170, "total_steps": 288, "loss": 0.0413, "lr": 4.3048018157367433e-05, "epoch": 1.1805555555555556, "percentage": 59.03, "elapsed_time": "0:10:23", "remaining_time": "0:07:12"}
171
+ {"current_steps": 171, "total_steps": 288, "loss": 0.0571, "lr": 4.244795013222951e-05, "epoch": 1.1875, "percentage": 59.38, "elapsed_time": "0:10:27", "remaining_time": "0:07:09"}
172
+ {"current_steps": 172, "total_steps": 288, "loss": 0.0406, "lr": 4.184899322396147e-05, "epoch": 1.1944444444444444, "percentage": 59.72, "elapsed_time": "0:10:30", "remaining_time": "0:07:05"}
173
+ {"current_steps": 173, "total_steps": 288, "loss": 0.0492, "lr": 4.125123555580624e-05, "epoch": 1.2013888888888888, "percentage": 60.07, "elapsed_time": "0:10:34", "remaining_time": "0:07:01"}
174
+ {"current_steps": 174, "total_steps": 288, "loss": 0.057, "lr": 4.0654765074565124e-05, "epoch": 1.2083333333333333, "percentage": 60.42, "elapsed_time": "0:10:38", "remaining_time": "0:06:58"}
175
+ {"current_steps": 175, "total_steps": 288, "loss": 0.0479, "lr": 4.005966953765839e-05, "epoch": 1.2152777777777777, "percentage": 60.76, "elapsed_time": "0:10:41", "remaining_time": "0:06:54"}
176
+ {"current_steps": 176, "total_steps": 288, "loss": 0.0395, "lr": 3.94660365002137e-05, "epoch": 1.2222222222222223, "percentage": 61.11, "elapsed_time": "0:10:45", "remaining_time": "0:06:50"}
177
+ {"current_steps": 177, "total_steps": 288, "loss": 0.0391, "lr": 3.887395330218429e-05, "epoch": 1.2291666666666667, "percentage": 61.46, "elapsed_time": "0:10:48", "remaining_time": "0:06:46"}
178
+ {"current_steps": 178, "total_steps": 288, "loss": 0.043, "lr": 3.8283507055498886e-05, "epoch": 1.2361111111111112, "percentage": 61.81, "elapsed_time": "0:10:52", "remaining_time": "0:06:43"}
179
+ {"current_steps": 179, "total_steps": 288, "loss": 0.0449, "lr": 3.769478463124507e-05, "epoch": 1.2430555555555556, "percentage": 62.15, "elapsed_time": "0:10:56", "remaining_time": "0:06:39"}
180
+ {"current_steps": 180, "total_steps": 288, "loss": 0.0563, "lr": 3.7107872646888116e-05, "epoch": 1.25, "percentage": 62.5, "elapsed_time": "0:10:59", "remaining_time": "0:06:35"}
181
+ {"current_steps": 181, "total_steps": 288, "loss": 0.0652, "lr": 3.652285745352717e-05, "epoch": 1.2569444444444444, "percentage": 62.85, "elapsed_time": "0:11:03", "remaining_time": "0:06:32"}
182
+ {"current_steps": 182, "total_steps": 288, "loss": 0.0439, "lr": 3.5939825123190635e-05, "epoch": 1.2638888888888888, "percentage": 63.19, "elapsed_time": "0:11:06", "remaining_time": "0:06:28"}
183
+ {"current_steps": 183, "total_steps": 288, "loss": 0.0396, "lr": 3.5358861436172485e-05, "epoch": 1.2708333333333333, "percentage": 63.54, "elapsed_time": "0:11:10", "remaining_time": "0:06:24"}
184
+ {"current_steps": 184, "total_steps": 288, "loss": 0.0349, "lr": 3.4780051868411675e-05, "epoch": 1.2777777777777777, "percentage": 63.89, "elapsed_time": "0:11:14", "remaining_time": "0:06:21"}
185
+ {"current_steps": 185, "total_steps": 288, "loss": 0.045, "lr": 3.4203481578916194e-05, "epoch": 1.2847222222222223, "percentage": 64.24, "elapsed_time": "0:11:17", "remaining_time": "0:06:17"}
186
+ {"current_steps": 186, "total_steps": 288, "loss": 0.054, "lr": 3.362923539723389e-05, "epoch": 1.2916666666666667, "percentage": 64.58, "elapsed_time": "0:11:21", "remaining_time": "0:06:13"}
187
+ {"current_steps": 187, "total_steps": 288, "loss": 0.0462, "lr": 3.305739781097157e-05, "epoch": 1.2986111111111112, "percentage": 64.93, "elapsed_time": "0:11:25", "remaining_time": "0:06:09"}
188
+ {"current_steps": 188, "total_steps": 288, "loss": 0.049, "lr": 3.248805295336458e-05, "epoch": 1.3055555555555556, "percentage": 65.28, "elapsed_time": "0:11:28", "remaining_time": "0:06:06"}
189
+ {"current_steps": 189, "total_steps": 288, "loss": 0.0398, "lr": 3.1921284590898456e-05, "epoch": 1.3125, "percentage": 65.62, "elapsed_time": "0:11:32", "remaining_time": "0:06:02"}
190
+ {"current_steps": 190, "total_steps": 288, "loss": 0.0622, "lr": 3.135717611098458e-05, "epoch": 1.3194444444444444, "percentage": 65.97, "elapsed_time": "0:11:35", "remaining_time": "0:05:58"}
191
+ {"current_steps": 191, "total_steps": 288, "loss": 0.0628, "lr": 3.079581050969146e-05, "epoch": 1.3263888888888888, "percentage": 66.32, "elapsed_time": "0:11:39", "remaining_time": "0:05:55"}
192
+ {"current_steps": 192, "total_steps": 288, "loss": 0.0538, "lr": 3.023727037953382e-05, "epoch": 1.3333333333333333, "percentage": 66.67, "elapsed_time": "0:11:43", "remaining_time": "0:05:51"}
193
+ {"current_steps": 193, "total_steps": 288, "loss": 0.0494, "lr": 2.9681637897320868e-05, "epoch": 1.3402777777777777, "percentage": 67.01, "elapsed_time": "0:11:46", "remaining_time": "0:05:47"}
194
+ {"current_steps": 194, "total_steps": 288, "loss": 0.0494, "lr": 2.912899481206582e-05, "epoch": 1.3472222222222223, "percentage": 67.36, "elapsed_time": "0:11:50", "remaining_time": "0:05:44"}
195
+ {"current_steps": 195, "total_steps": 288, "loss": 0.0474, "lr": 2.8579422432958312e-05, "epoch": 1.3541666666666667, "percentage": 67.71, "elapsed_time": "0:11:54", "remaining_time": "0:05:40"}
196
+ {"current_steps": 196, "total_steps": 288, "loss": 0.0522, "lr": 2.803300161740166e-05, "epoch": 1.3611111111111112, "percentage": 68.06, "elapsed_time": "0:11:58", "remaining_time": "0:05:37"}
197
+ {"current_steps": 197, "total_steps": 288, "loss": 0.0393, "lr": 2.748981275911633e-05, "epoch": 1.3680555555555556, "percentage": 68.4, "elapsed_time": "0:12:01", "remaining_time": "0:05:33"}
198
+ {"current_steps": 198, "total_steps": 288, "loss": 0.0395, "lr": 2.6949935776311896e-05, "epoch": 1.375, "percentage": 68.75, "elapsed_time": "0:12:05", "remaining_time": "0:05:29"}
199
+ {"current_steps": 199, "total_steps": 288, "loss": 0.0445, "lr": 2.6413450099928783e-05, "epoch": 1.3819444444444444, "percentage": 69.1, "elapsed_time": "0:12:08", "remaining_time": "0:05:25"}
200
+ {"current_steps": 200, "total_steps": 288, "loss": 0.046, "lr": 2.5880434661951823e-05, "epoch": 1.3888888888888888, "percentage": 69.44, "elapsed_time": "0:12:12", "remaining_time": "0:05:22"}
201
+ {"current_steps": 200, "total_steps": 288, "eval_loss": 0.05843019485473633, "epoch": 1.3888888888888888, "percentage": 69.44, "elapsed_time": "0:12:15", "remaining_time": "0:05:23"}
202
+ {"current_steps": 201, "total_steps": 288, "loss": 0.0592, "lr": 2.5350967883797096e-05, "epoch": 1.3958333333333333, "percentage": 69.79, "elapsed_time": "0:12:18", "remaining_time": "0:05:19"}
203
+ {"current_steps": 202, "total_steps": 288, "loss": 0.0492, "lr": 2.4825127664774006e-05, "epoch": 1.4027777777777777, "percentage": 70.14, "elapsed_time": "0:12:22", "remaining_time": "0:05:16"}
204
+ {"current_steps": 203, "total_steps": 288, "loss": 0.0599, "lr": 2.4302991370624107e-05, "epoch": 1.4097222222222223, "percentage": 70.49, "elapsed_time": "0:12:26", "remaining_time": "0:05:12"}
205
+ {"current_steps": 204, "total_steps": 288, "loss": 0.0405, "lr": 2.3784635822138424e-05, "epoch": 1.4166666666666667, "percentage": 70.83, "elapsed_time": "0:12:29", "remaining_time": "0:05:08"}
206
+ {"current_steps": 205, "total_steps": 288, "loss": 0.0522, "lr": 2.327013728385502e-05, "epoch": 1.4236111111111112, "percentage": 71.18, "elapsed_time": "0:12:33", "remaining_time": "0:05:05"}
207
+ {"current_steps": 206, "total_steps": 288, "loss": 0.0498, "lr": 2.2759571452838324e-05, "epoch": 1.4305555555555556, "percentage": 71.53, "elapsed_time": "0:12:37", "remaining_time": "0:05:01"}
208
+ {"current_steps": 207, "total_steps": 288, "loss": 0.0516, "lr": 2.225301344754199e-05, "epoch": 1.4375, "percentage": 71.88, "elapsed_time": "0:12:40", "remaining_time": "0:04:57"}
209
+ {"current_steps": 208, "total_steps": 288, "loss": 0.0356, "lr": 2.17505377967569e-05, "epoch": 1.4444444444444444, "percentage": 72.22, "elapsed_time": "0:12:44", "remaining_time": "0:04:53"}
210
+ {"current_steps": 209, "total_steps": 288, "loss": 0.0553, "lr": 2.1252218428645846e-05, "epoch": 1.4513888888888888, "percentage": 72.57, "elapsed_time": "0:12:47", "remaining_time": "0:04:50"}
211
+ {"current_steps": 210, "total_steps": 288, "loss": 0.0436, "lr": 2.075812865986677e-05, "epoch": 1.4583333333333333, "percentage": 72.92, "elapsed_time": "0:12:51", "remaining_time": "0:04:46"}
212
+ {"current_steps": 211, "total_steps": 288, "loss": 0.0531, "lr": 2.026834118478567e-05, "epoch": 1.4652777777777777, "percentage": 73.26, "elapsed_time": "0:12:55", "remaining_time": "0:04:42"}
213
+ {"current_steps": 212, "total_steps": 288, "loss": 0.0529, "lr": 1.978292806478134e-05, "epoch": 1.4722222222222223, "percentage": 73.61, "elapsed_time": "0:12:58", "remaining_time": "0:04:39"}
214
+ {"current_steps": 213, "total_steps": 288, "loss": 0.0408, "lr": 1.9301960717643118e-05, "epoch": 1.4791666666666667, "percentage": 73.96, "elapsed_time": "0:13:02", "remaining_time": "0:04:35"}
215
+ {"current_steps": 214, "total_steps": 288, "loss": 0.0532, "lr": 1.8825509907063327e-05, "epoch": 1.4861111111111112, "percentage": 74.31, "elapsed_time": "0:13:05", "remaining_time": "0:04:31"}
216
+ {"current_steps": 215, "total_steps": 288, "loss": 0.0357, "lr": 1.8353645732225976e-05, "epoch": 1.4930555555555556, "percentage": 74.65, "elapsed_time": "0:13:09", "remaining_time": "0:04:28"}
217
+ {"current_steps": 216, "total_steps": 288, "loss": 0.0458, "lr": 1.7886437617493205e-05, "epoch": 1.5, "percentage": 75.0, "elapsed_time": "0:13:13", "remaining_time": "0:04:24"}
218
+ {"current_steps": 217, "total_steps": 288, "loss": 0.065, "lr": 1.7423954302191047e-05, "epoch": 1.5069444444444444, "percentage": 75.35, "elapsed_time": "0:13:16", "remaining_time": "0:04:20"}
219
+ {"current_steps": 218, "total_steps": 288, "loss": 0.0363, "lr": 1.6966263830495936e-05, "epoch": 1.5138888888888888, "percentage": 75.69, "elapsed_time": "0:13:20", "remaining_time": "0:04:17"}
220
+ {"current_steps": 219, "total_steps": 288, "loss": 0.0504, "lr": 1.6513433541423528e-05, "epoch": 1.5208333333333335, "percentage": 76.04, "elapsed_time": "0:13:24", "remaining_time": "0:04:13"}
221
+ {"current_steps": 220, "total_steps": 288, "loss": 0.0393, "lr": 1.606553005892125e-05, "epoch": 1.5277777777777777, "percentage": 76.39, "elapsed_time": "0:13:27", "remaining_time": "0:04:09"}
222
+ {"current_steps": 221, "total_steps": 288, "loss": 0.0482, "lr": 1.5622619282066082e-05, "epoch": 1.5347222222222223, "percentage": 76.74, "elapsed_time": "0:13:31", "remaining_time": "0:04:05"}
223
+ {"current_steps": 222, "total_steps": 288, "loss": 0.037, "lr": 1.5184766375368915e-05, "epoch": 1.5416666666666665, "percentage": 77.08, "elapsed_time": "0:13:34", "remaining_time": "0:04:02"}
224
+ {"current_steps": 223, "total_steps": 288, "loss": 0.0536, "lr": 1.4752035759187106e-05, "epoch": 1.5486111111111112, "percentage": 77.43, "elapsed_time": "0:13:38", "remaining_time": "0:03:58"}
225
+ {"current_steps": 224, "total_steps": 288, "loss": 0.0461, "lr": 1.4324491100246385e-05, "epoch": 1.5555555555555556, "percentage": 77.78, "elapsed_time": "0:13:42", "remaining_time": "0:03:54"}
226
+ {"current_steps": 225, "total_steps": 288, "loss": 0.0461, "lr": 1.3902195302273779e-05, "epoch": 1.5625, "percentage": 78.12, "elapsed_time": "0:13:45", "remaining_time": "0:03:51"}
227
+ {"current_steps": 226, "total_steps": 288, "loss": 0.0448, "lr": 1.348521049674264e-05, "epoch": 1.5694444444444444, "percentage": 78.47, "elapsed_time": "0:13:49", "remaining_time": "0:03:47"}
228
+ {"current_steps": 227, "total_steps": 288, "loss": 0.0475, "lr": 1.3073598033731426e-05, "epoch": 1.5763888888888888, "percentage": 78.82, "elapsed_time": "0:13:52", "remaining_time": "0:03:43"}
229
+ {"current_steps": 228, "total_steps": 288, "loss": 0.0432, "lr": 1.2667418472897386e-05, "epoch": 1.5833333333333335, "percentage": 79.17, "elapsed_time": "0:13:56", "remaining_time": "0:03:40"}
230
+ {"current_steps": 229, "total_steps": 288, "loss": 0.0446, "lr": 1.2266731574566536e-05, "epoch": 1.5902777777777777, "percentage": 79.51, "elapsed_time": "0:14:00", "remaining_time": "0:03:36"}
231
+ {"current_steps": 230, "total_steps": 288, "loss": 0.0403, "lr": 1.1871596290941278e-05, "epoch": 1.5972222222222223, "percentage": 79.86, "elapsed_time": "0:14:03", "remaining_time": "0:03:32"}
232
+ {"current_steps": 231, "total_steps": 288, "loss": 0.0424, "lr": 1.1482070757426856e-05, "epoch": 1.6041666666666665, "percentage": 80.21, "elapsed_time": "0:14:07", "remaining_time": "0:03:29"}
233
+ {"current_steps": 232, "total_steps": 288, "loss": 0.0486, "lr": 1.1098212284078036e-05, "epoch": 1.6111111111111112, "percentage": 80.56, "elapsed_time": "0:14:10", "remaining_time": "0:03:25"}
234
+ {"current_steps": 233, "total_steps": 288, "loss": 0.0452, "lr": 1.0720077347167202e-05, "epoch": 1.6180555555555556, "percentage": 80.9, "elapsed_time": "0:14:14", "remaining_time": "0:03:21"}
235
+ {"current_steps": 234, "total_steps": 288, "loss": 0.0418, "lr": 1.0347721580875126e-05, "epoch": 1.625, "percentage": 81.25, "elapsed_time": "0:14:18", "remaining_time": "0:03:18"}
236
+ {"current_steps": 235, "total_steps": 288, "loss": 0.0488, "lr": 9.981199769105604e-06, "epoch": 1.6319444444444444, "percentage": 81.6, "elapsed_time": "0:14:21", "remaining_time": "0:03:14"}
237
+ {"current_steps": 236, "total_steps": 288, "loss": 0.0544, "lr": 9.620565837425271e-06, "epoch": 1.6388888888888888, "percentage": 81.94, "elapsed_time": "0:14:25", "remaining_time": "0:03:10"}
238
+ {"current_steps": 237, "total_steps": 288, "loss": 0.0418, "lr": 9.26587284512957e-06, "epoch": 1.6458333333333335, "percentage": 82.29, "elapsed_time": "0:14:28", "remaining_time": "0:03:06"}
239
+ {"current_steps": 238, "total_steps": 288, "loss": 0.0509, "lr": 8.917172977436356e-06, "epoch": 1.6527777777777777, "percentage": 82.64, "elapsed_time": "0:14:32", "remaining_time": "0:03:03"}
240
+ {"current_steps": 239, "total_steps": 288, "loss": 0.0493, "lr": 8.574517537807897e-06, "epoch": 1.6597222222222223, "percentage": 82.99, "elapsed_time": "0:14:36", "remaining_time": "0:02:59"}
241
+ {"current_steps": 240, "total_steps": 288, "loss": 0.0538, "lr": 8.237956940402757e-06, "epoch": 1.6666666666666665, "percentage": 83.33, "elapsed_time": "0:14:39", "remaining_time": "0:02:55"}
242
+ {"current_steps": 241, "total_steps": 288, "loss": 0.0417, "lr": 7.907540702658456e-06, "epoch": 1.6736111111111112, "percentage": 83.68, "elapsed_time": "0:14:43", "remaining_time": "0:02:52"}
243
+ {"current_steps": 242, "total_steps": 288, "loss": 0.0648, "lr": 7.583317438006093e-06, "epoch": 1.6805555555555556, "percentage": 84.03, "elapsed_time": "0:14:47", "remaining_time": "0:02:48"}
244
+ {"current_steps": 243, "total_steps": 288, "loss": 0.0409, "lr": 7.265334848717931e-06, "epoch": 1.6875, "percentage": 84.38, "elapsed_time": "0:14:50", "remaining_time": "0:02:44"}
245
+ {"current_steps": 244, "total_steps": 288, "loss": 0.0425, "lr": 6.953639718889076e-06, "epoch": 1.6944444444444444, "percentage": 84.72, "elapsed_time": "0:14:54", "remaining_time": "0:02:41"}
246
+ {"current_steps": 245, "total_steps": 288, "loss": 0.0483, "lr": 6.648277907554235e-06, "epoch": 1.7013888888888888, "percentage": 85.07, "elapsed_time": "0:14:57", "remaining_time": "0:02:37"}
247
+ {"current_steps": 246, "total_steps": 288, "loss": 0.0461, "lr": 6.349294341940593e-06, "epoch": 1.7083333333333335, "percentage": 85.42, "elapsed_time": "0:15:01", "remaining_time": "0:02:33"}
248
+ {"current_steps": 247, "total_steps": 288, "loss": 0.0538, "lr": 6.056733010857712e-06, "epoch": 1.7152777777777777, "percentage": 85.76, "elapsed_time": "0:15:05", "remaining_time": "0:02:30"}
249
+ {"current_steps": 248, "total_steps": 288, "loss": 0.0455, "lr": 5.770636958225617e-06, "epoch": 1.7222222222222223, "percentage": 86.11, "elapsed_time": "0:15:08", "remaining_time": "0:02:26"}
250
+ {"current_steps": 249, "total_steps": 288, "loss": 0.0444, "lr": 5.491048276741784e-06, "epoch": 1.7291666666666665, "percentage": 86.46, "elapsed_time": "0:15:12", "remaining_time": "0:02:22"}
251
+ {"current_steps": 250, "total_steps": 288, "loss": 0.0607, "lr": 5.218008101688171e-06, "epoch": 1.7361111111111112, "percentage": 86.81, "elapsed_time": "0:15:15", "remaining_time": "0:02:19"}
252
+ {"current_steps": 251, "total_steps": 288, "loss": 0.0445, "lr": 4.951556604879048e-06, "epoch": 1.7430555555555556, "percentage": 87.15, "elapsed_time": "0:15:19", "remaining_time": "0:02:15"}
253
+ {"current_steps": 252, "total_steps": 288, "loss": 0.0388, "lr": 4.691732988750614e-06, "epoch": 1.75, "percentage": 87.5, "elapsed_time": "0:15:23", "remaining_time": "0:02:11"}
254
+ {"current_steps": 253, "total_steps": 288, "loss": 0.0391, "lr": 4.43857548059321e-06, "epoch": 1.7569444444444444, "percentage": 87.85, "elapsed_time": "0:15:26", "remaining_time": "0:02:08"}
255
+ {"current_steps": 254, "total_steps": 288, "loss": 0.0432, "lr": 4.192121326927073e-06, "epoch": 1.7638888888888888, "percentage": 88.19, "elapsed_time": "0:15:30", "remaining_time": "0:02:04"}
256
+ {"current_steps": 255, "total_steps": 288, "loss": 0.044, "lr": 3.952406788022267e-06, "epoch": 1.7708333333333335, "percentage": 88.54, "elapsed_time": "0:15:33", "remaining_time": "0:02:00"}
257
+ {"current_steps": 256, "total_steps": 288, "loss": 0.042, "lr": 3.7194671325638198e-06, "epoch": 1.7777777777777777, "percentage": 88.89, "elapsed_time": "0:15:37", "remaining_time": "0:01:57"}
258
+ {"current_steps": 257, "total_steps": 288, "loss": 0.0446, "lr": 3.493336632462718e-06, "epoch": 1.7847222222222223, "percentage": 89.24, "elapsed_time": "0:15:41", "remaining_time": "0:01:53"}
259
+ {"current_steps": 258, "total_steps": 288, "loss": 0.0375, "lr": 3.274048557813553e-06, "epoch": 1.7916666666666665, "percentage": 89.58, "elapsed_time": "0:15:44", "remaining_time": "0:01:49"}
260
+ {"current_steps": 259, "total_steps": 288, "loss": 0.0458, "lr": 3.061635171999566e-06, "epoch": 1.7986111111111112, "percentage": 89.93, "elapsed_time": "0:15:48", "remaining_time": "0:01:46"}
261
+ {"current_steps": 260, "total_steps": 288, "loss": 0.043, "lr": 2.85612772694579e-06, "epoch": 1.8055555555555556, "percentage": 90.28, "elapsed_time": "0:15:51", "remaining_time": "0:01:42"}
262
+ {"current_steps": 261, "total_steps": 288, "loss": 0.0422, "lr": 2.657556458521049e-06, "epoch": 1.8125, "percentage": 90.62, "elapsed_time": "0:15:55", "remaining_time": "0:01:38"}
263
+ {"current_steps": 262, "total_steps": 288, "loss": 0.049, "lr": 2.4659505820893826e-06, "epoch": 1.8194444444444444, "percentage": 90.97, "elapsed_time": "0:15:59", "remaining_time": "0:01:35"}
264
+ {"current_steps": 263, "total_steps": 288, "loss": 0.0437, "lr": 2.2813382882116986e-06, "epoch": 1.8263888888888888, "percentage": 91.32, "elapsed_time": "0:16:02", "remaining_time": "0:01:31"}
265
+ {"current_steps": 264, "total_steps": 288, "loss": 0.0426, "lr": 2.1037467384981026e-06, "epoch": 1.8333333333333335, "percentage": 91.67, "elapsed_time": "0:16:06", "remaining_time": "0:01:27"}
266
+ {"current_steps": 265, "total_steps": 288, "loss": 0.0458, "lr": 1.933202061611722e-06, "epoch": 1.8402777777777777, "percentage": 92.01, "elapsed_time": "0:16:09", "remaining_time": "0:01:24"}
267
+ {"current_steps": 266, "total_steps": 288, "loss": 0.0531, "lr": 1.769729349424415e-06, "epoch": 1.8472222222222223, "percentage": 92.36, "elapsed_time": "0:16:13", "remaining_time": "0:01:20"}
268
+ {"current_steps": 267, "total_steps": 288, "loss": 0.0425, "lr": 1.6133526533250565e-06, "epoch": 1.8541666666666665, "percentage": 92.71, "elapsed_time": "0:16:17", "remaining_time": "0:01:16"}
269
+ {"current_steps": 268, "total_steps": 288, "loss": 0.0492, "lr": 1.4640949806809523e-06, "epoch": 1.8611111111111112, "percentage": 93.06, "elapsed_time": "0:16:20", "remaining_time": "0:01:13"}
270
+ {"current_steps": 269, "total_steps": 288, "loss": 0.0452, "lr": 1.3219782914527634e-06, "epoch": 1.8680555555555556, "percentage": 93.4, "elapsed_time": "0:16:24", "remaining_time": "0:01:09"}
271
+ {"current_steps": 270, "total_steps": 288, "loss": 0.0549, "lr": 1.1870234949636073e-06, "epoch": 1.875, "percentage": 93.75, "elapsed_time": "0:16:27", "remaining_time": "0:01:05"}
272
+ {"current_steps": 271, "total_steps": 288, "loss": 0.0463, "lr": 1.0592504468227127e-06, "epoch": 1.8819444444444444, "percentage": 94.1, "elapsed_time": "0:16:31", "remaining_time": "0:01:02"}
273
+ {"current_steps": 272, "total_steps": 288, "loss": 0.0522, "lr": 9.386779460041017e-07, "epoch": 1.8888888888888888, "percentage": 94.44, "elapsed_time": "0:16:35", "remaining_time": "0:00:58"}
274
+ {"current_steps": 273, "total_steps": 288, "loss": 0.0475, "lr": 8.253237320807461e-07, "epoch": 1.8958333333333335, "percentage": 94.79, "elapsed_time": "0:16:38", "remaining_time": "0:00:54"}
275
+ {"current_steps": 274, "total_steps": 288, "loss": 0.0498, "lr": 7.192044826145771e-07, "epoch": 1.9027777777777777, "percentage": 95.14, "elapsed_time": "0:16:42", "remaining_time": "0:00:51"}
276
+ {"current_steps": 275, "total_steps": 288, "loss": 0.0422, "lr": 6.20335810702749e-07, "epoch": 1.9097222222222223, "percentage": 95.49, "elapsed_time": "0:16:46", "remaining_time": "0:00:47"}
277
+ {"current_steps": 276, "total_steps": 288, "loss": 0.0433, "lr": 5.287322626805202e-07, "epoch": 1.9166666666666665, "percentage": 95.83, "elapsed_time": "0:16:49", "remaining_time": "0:00:43"}
278
+ {"current_steps": 277, "total_steps": 288, "loss": 0.0499, "lr": 4.4440731598107686e-07, "epoch": 1.9236111111111112, "percentage": 96.18, "elapsed_time": "0:16:53", "remaining_time": "0:00:40"}
279
+ {"current_steps": 278, "total_steps": 288, "loss": 0.0664, "lr": 3.673733771526466e-07, "epoch": 1.9305555555555556, "percentage": 96.53, "elapsed_time": "0:16:56", "remaining_time": "0:00:36"}
280
+ {"current_steps": 279, "total_steps": 288, "loss": 0.0395, "lr": 2.976417800331144e-07, "epoch": 1.9375, "percentage": 96.88, "elapsed_time": "0:17:00", "remaining_time": "0:00:32"}
281
+ {"current_steps": 280, "total_steps": 288, "loss": 0.0456, "lr": 2.352227840825394e-07, "epoch": 1.9444444444444444, "percentage": 97.22, "elapsed_time": "0:17:04", "remaining_time": "0:00:29"}
282
+ {"current_steps": 281, "total_steps": 288, "loss": 0.0449, "lr": 1.8012557287367392e-07, "epoch": 1.9513888888888888, "percentage": 97.57, "elapsed_time": "0:17:07", "remaining_time": "0:00:25"}
283
+ {"current_steps": 282, "total_steps": 288, "loss": 0.0433, "lr": 1.3235825274081625e-07, "epoch": 1.9583333333333335, "percentage": 97.92, "elapsed_time": "0:17:11", "remaining_time": "0:00:21"}
284
+ {"current_steps": 283, "total_steps": 288, "loss": 0.0516, "lr": 9.19278515871369e-08, "epoch": 1.9652777777777777, "percentage": 98.26, "elapsed_time": "0:17:14", "remaining_time": "0:00:18"}
285
+ {"current_steps": 284, "total_steps": 288, "loss": 0.0392, "lr": 5.8840317850683555e-08, "epoch": 1.9722222222222223, "percentage": 98.61, "elapsed_time": "0:17:18", "remaining_time": "0:00:14"}
286
+ {"current_steps": 285, "total_steps": 288, "loss": 0.0471, "lr": 3.310051962920335e-08, "epoch": 1.9791666666666665, "percentage": 98.96, "elapsed_time": "0:17:22", "remaining_time": "0:00:10"}
287
+ {"current_steps": 286, "total_steps": 288, "loss": 0.0542, "lr": 1.471224396389359e-08, "epoch": 1.9861111111111112, "percentage": 99.31, "elapsed_time": "0:17:25", "remaining_time": "0:00:07"}
288
+ {"current_steps": 287, "total_steps": 288, "loss": 0.0474, "lr": 3.6781962822529657e-09, "epoch": 1.9930555555555556, "percentage": 99.65, "elapsed_time": "0:17:29", "remaining_time": "0:00:03"}
289
+ {"current_steps": 288, "total_steps": 288, "loss": 0.0448, "lr": 0.0, "epoch": 2.0, "percentage": 100.0, "elapsed_time": "0:17:32", "remaining_time": "0:00:00"}
290
+ {"current_steps": 288, "total_steps": 288, "epoch": 2.0, "percentage": 100.0, "elapsed_time": "0:17:51", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,2066 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 200,
6
+ "global_step": 288,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.006944444444444444,
13
+ "grad_norm": 0.0634646986292456,
14
+ "learning_rate": 3.448275862068966e-06,
15
+ "loss": 0.1218,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.013888888888888888,
20
+ "grad_norm": 0.0859646592063227,
21
+ "learning_rate": 6.896551724137932e-06,
22
+ "loss": 0.1441,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.020833333333333332,
27
+ "grad_norm": 0.08311493152111031,
28
+ "learning_rate": 1.0344827586206897e-05,
29
+ "loss": 0.1517,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.027777777777777776,
34
+ "grad_norm": 0.14507101606184478,
35
+ "learning_rate": 1.3793103448275863e-05,
36
+ "loss": 0.184,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.034722222222222224,
41
+ "grad_norm": 0.049880571226207794,
42
+ "learning_rate": 1.7241379310344828e-05,
43
+ "loss": 0.0937,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.041666666666666664,
48
+ "grad_norm": 0.13472688960866697,
49
+ "learning_rate": 2.0689655172413793e-05,
50
+ "loss": 0.1932,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.04861111111111111,
55
+ "grad_norm": 0.063128720217521,
56
+ "learning_rate": 2.413793103448276e-05,
57
+ "loss": 0.1163,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.05555555555555555,
62
+ "grad_norm": 0.05791338456495497,
63
+ "learning_rate": 2.7586206896551727e-05,
64
+ "loss": 0.0997,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.0625,
69
+ "grad_norm": 0.03814512155396182,
70
+ "learning_rate": 3.103448275862069e-05,
71
+ "loss": 0.0691,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.06944444444444445,
76
+ "grad_norm": 0.07278628165772846,
77
+ "learning_rate": 3.4482758620689657e-05,
78
+ "loss": 0.1096,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.0763888888888889,
83
+ "grad_norm": 0.06072973165332569,
84
+ "learning_rate": 3.793103448275862e-05,
85
+ "loss": 0.0915,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.08333333333333333,
90
+ "grad_norm": 0.05086161703424197,
91
+ "learning_rate": 4.1379310344827587e-05,
92
+ "loss": 0.0758,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.09027777777777778,
97
+ "grad_norm": 0.06108424147364289,
98
+ "learning_rate": 4.482758620689655e-05,
99
+ "loss": 0.085,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.09722222222222222,
104
+ "grad_norm": 0.08171893276043111,
105
+ "learning_rate": 4.827586206896552e-05,
106
+ "loss": 0.1039,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.10416666666666667,
111
+ "grad_norm": 0.10806188024280473,
112
+ "learning_rate": 5.172413793103449e-05,
113
+ "loss": 0.1153,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.1111111111111111,
118
+ "grad_norm": 0.08264842267025586,
119
+ "learning_rate": 5.517241379310345e-05,
120
+ "loss": 0.096,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.11805555555555555,
125
+ "grad_norm": 0.11707563126376766,
126
+ "learning_rate": 5.862068965517241e-05,
127
+ "loss": 0.1187,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.125,
132
+ "grad_norm": 0.04624996354545763,
133
+ "learning_rate": 6.206896551724138e-05,
134
+ "loss": 0.0613,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.13194444444444445,
139
+ "grad_norm": 0.12096307356119897,
140
+ "learning_rate": 6.551724137931034e-05,
141
+ "loss": 0.112,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.1388888888888889,
146
+ "grad_norm": 0.11329767467249009,
147
+ "learning_rate": 6.896551724137931e-05,
148
+ "loss": 0.1131,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.14583333333333334,
153
+ "grad_norm": 0.07076834653741017,
154
+ "learning_rate": 7.241379310344828e-05,
155
+ "loss": 0.0754,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.1527777777777778,
160
+ "grad_norm": 0.07631833003946596,
161
+ "learning_rate": 7.586206896551724e-05,
162
+ "loss": 0.0476,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.1597222222222222,
167
+ "grad_norm": 0.09111429978286256,
168
+ "learning_rate": 7.931034482758621e-05,
169
+ "loss": 0.0778,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.16666666666666666,
174
+ "grad_norm": 0.09638107817903875,
175
+ "learning_rate": 8.275862068965517e-05,
176
+ "loss": 0.0749,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.1736111111111111,
181
+ "grad_norm": 0.08873808756623787,
182
+ "learning_rate": 8.620689655172413e-05,
183
+ "loss": 0.0692,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.18055555555555555,
188
+ "grad_norm": 0.1193395529072999,
189
+ "learning_rate": 8.96551724137931e-05,
190
+ "loss": 0.1005,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.1875,
195
+ "grad_norm": 0.07825819960868853,
196
+ "learning_rate": 9.310344827586207e-05,
197
+ "loss": 0.0796,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.19444444444444445,
202
+ "grad_norm": 0.09882721733405724,
203
+ "learning_rate": 9.655172413793105e-05,
204
+ "loss": 0.0808,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.2013888888888889,
209
+ "grad_norm": 0.07496690540223322,
210
+ "learning_rate": 0.0001,
211
+ "loss": 0.0589,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.20833333333333334,
216
+ "grad_norm": 0.06753626384871714,
217
+ "learning_rate": 9.999632180371776e-05,
218
+ "loss": 0.0885,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.2152777777777778,
223
+ "grad_norm": 0.11227962647329344,
224
+ "learning_rate": 9.998528775603611e-05,
225
+ "loss": 0.0859,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.2222222222222222,
230
+ "grad_norm": 0.04228956053702444,
231
+ "learning_rate": 9.99668994803708e-05,
232
+ "loss": 0.0482,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.22916666666666666,
237
+ "grad_norm": 0.049938706422113184,
238
+ "learning_rate": 9.994115968214932e-05,
239
+ "loss": 0.0575,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.2361111111111111,
244
+ "grad_norm": 0.054468426952476535,
245
+ "learning_rate": 9.990807214841287e-05,
246
+ "loss": 0.0661,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.24305555555555555,
251
+ "grad_norm": 0.048987738819091665,
252
+ "learning_rate": 9.986764174725919e-05,
253
+ "loss": 0.0556,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.25,
258
+ "grad_norm": 0.05029249306240408,
259
+ "learning_rate": 9.981987442712633e-05,
260
+ "loss": 0.0546,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.2569444444444444,
265
+ "grad_norm": 0.07920414870467878,
266
+ "learning_rate": 9.976477721591745e-05,
267
+ "loss": 0.0704,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.2638888888888889,
272
+ "grad_norm": 0.04856426595651698,
273
+ "learning_rate": 9.97023582199669e-05,
274
+ "loss": 0.0627,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.2708333333333333,
279
+ "grad_norm": 0.03338189886854429,
280
+ "learning_rate": 9.963262662284736e-05,
281
+ "loss": 0.0353,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.2777777777777778,
286
+ "grad_norm": 0.05814625123252708,
287
+ "learning_rate": 9.955559268401893e-05,
288
+ "loss": 0.0562,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.2847222222222222,
293
+ "grad_norm": 0.09810680657571984,
294
+ "learning_rate": 9.947126773731948e-05,
295
+ "loss": 0.0774,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.2916666666666667,
300
+ "grad_norm": 0.08137711658725671,
301
+ "learning_rate": 9.937966418929726e-05,
302
+ "loss": 0.0688,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.2986111111111111,
307
+ "grad_norm": 0.04984221097640415,
308
+ "learning_rate": 9.928079551738543e-05,
309
+ "loss": 0.0631,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.3055555555555556,
314
+ "grad_norm": 0.045586138765645726,
315
+ "learning_rate": 9.917467626791925e-05,
316
+ "loss": 0.0602,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.3125,
321
+ "grad_norm": 0.04555416039001976,
322
+ "learning_rate": 9.90613220539959e-05,
323
+ "loss": 0.0603,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.3194444444444444,
328
+ "grad_norm": 0.06929704690081513,
329
+ "learning_rate": 9.89407495531773e-05,
330
+ "loss": 0.05,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.3263888888888889,
335
+ "grad_norm": 0.05071419561253368,
336
+ "learning_rate": 9.881297650503641e-05,
337
+ "loss": 0.0666,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.3333333333333333,
342
+ "grad_norm": 0.05861571605506496,
343
+ "learning_rate": 9.867802170854724e-05,
344
+ "loss": 0.0576,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.3402777777777778,
349
+ "grad_norm": 0.05602477728042172,
350
+ "learning_rate": 9.853590501931904e-05,
351
+ "loss": 0.0714,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.3472222222222222,
356
+ "grad_norm": 0.0600399573347184,
357
+ "learning_rate": 9.838664734667495e-05,
358
+ "loss": 0.0557,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.3541666666666667,
363
+ "grad_norm": 0.04310502477291791,
364
+ "learning_rate": 9.82302706505756e-05,
365
+ "loss": 0.0608,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.3611111111111111,
370
+ "grad_norm": 0.04208369800984692,
371
+ "learning_rate": 9.806679793838829e-05,
372
+ "loss": 0.0457,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 0.3680555555555556,
377
+ "grad_norm": 0.04390617615736525,
378
+ "learning_rate": 9.78962532615019e-05,
379
+ "loss": 0.0538,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 0.375,
384
+ "grad_norm": 0.040829606477323165,
385
+ "learning_rate": 9.771866171178831e-05,
386
+ "loss": 0.0535,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 0.3819444444444444,
391
+ "grad_norm": 0.04189168396894191,
392
+ "learning_rate": 9.753404941791062e-05,
393
+ "loss": 0.0533,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 0.3888888888888889,
398
+ "grad_norm": 0.048795880713950655,
399
+ "learning_rate": 9.734244354147895e-05,
400
+ "loss": 0.0558,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 0.3958333333333333,
405
+ "grad_norm": 0.04594331706981098,
406
+ "learning_rate": 9.714387227305422e-05,
407
+ "loss": 0.059,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 0.4027777777777778,
412
+ "grad_norm": 0.059333907325760896,
413
+ "learning_rate": 9.693836482800044e-05,
414
+ "loss": 0.0666,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 0.4097222222222222,
419
+ "grad_norm": 0.04504422858266263,
420
+ "learning_rate": 9.672595144218646e-05,
421
+ "loss": 0.0553,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 0.4166666666666667,
426
+ "grad_norm": 0.06138639267675264,
427
+ "learning_rate": 9.650666336753728e-05,
428
+ "loss": 0.0736,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 0.4236111111111111,
433
+ "grad_norm": 0.054120807804378494,
434
+ "learning_rate": 9.628053286743619e-05,
435
+ "loss": 0.0658,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 0.4305555555555556,
440
+ "grad_norm": 0.05215983401668381,
441
+ "learning_rate": 9.604759321197773e-05,
442
+ "loss": 0.0597,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 0.4375,
447
+ "grad_norm": 0.05923618865174348,
448
+ "learning_rate": 9.580787867307293e-05,
449
+ "loss": 0.0661,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 0.4444444444444444,
454
+ "grad_norm": 0.06697666067838456,
455
+ "learning_rate": 9.55614245194068e-05,
456
+ "loss": 0.0684,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 0.4513888888888889,
461
+ "grad_norm": 0.04541593572547064,
462
+ "learning_rate": 9.530826701124939e-05,
463
+ "loss": 0.0547,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 0.4583333333333333,
468
+ "grad_norm": 0.05700134510715612,
469
+ "learning_rate": 9.504844339512095e-05,
470
+ "loss": 0.0596,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 0.4652777777777778,
475
+ "grad_norm": 0.041366318222701697,
476
+ "learning_rate": 9.478199189831183e-05,
477
+ "loss": 0.0411,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 0.4722222222222222,
482
+ "grad_norm": 0.043353520617435734,
483
+ "learning_rate": 9.450895172325822e-05,
484
+ "loss": 0.0548,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 0.4791666666666667,
489
+ "grad_norm": 0.05496280723006238,
490
+ "learning_rate": 9.422936304177439e-05,
491
+ "loss": 0.0531,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 0.4861111111111111,
496
+ "grad_norm": 0.057292146852494466,
497
+ "learning_rate": 9.39432669891423e-05,
498
+ "loss": 0.0658,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 0.4930555555555556,
503
+ "grad_norm": 0.044098969405188045,
504
+ "learning_rate": 9.365070565805941e-05,
505
+ "loss": 0.0511,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 0.5,
510
+ "grad_norm": 0.10907567767546891,
511
+ "learning_rate": 9.335172209244575e-05,
512
+ "loss": 0.0547,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 0.5069444444444444,
517
+ "grad_norm": 0.05193284317496665,
518
+ "learning_rate": 9.304636028111094e-05,
519
+ "loss": 0.0674,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 0.5138888888888888,
524
+ "grad_norm": 0.06799161021928798,
525
+ "learning_rate": 9.273466515128209e-05,
526
+ "loss": 0.067,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 0.5208333333333334,
531
+ "grad_norm": 0.05626247552701963,
532
+ "learning_rate": 9.241668256199392e-05,
533
+ "loss": 0.0501,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 0.5277777777777778,
538
+ "grad_norm": 0.04578630890465305,
539
+ "learning_rate": 9.209245929734156e-05,
540
+ "loss": 0.0509,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 0.5347222222222222,
545
+ "grad_norm": 0.0451951125591856,
546
+ "learning_rate": 9.176204305959726e-05,
547
+ "loss": 0.0465,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 0.5416666666666666,
552
+ "grad_norm": 0.04052385478349237,
553
+ "learning_rate": 9.142548246219212e-05,
554
+ "loss": 0.0454,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 0.5486111111111112,
559
+ "grad_norm": 0.04695475481560532,
560
+ "learning_rate": 9.108282702256365e-05,
561
+ "loss": 0.0598,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 0.5555555555555556,
566
+ "grad_norm": 0.05974716728603661,
567
+ "learning_rate": 9.073412715487044e-05,
568
+ "loss": 0.0632,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 0.5625,
573
+ "grad_norm": 0.04594564477538813,
574
+ "learning_rate": 9.037943416257474e-05,
575
+ "loss": 0.059,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 0.5694444444444444,
580
+ "grad_norm": 0.04581771185786344,
581
+ "learning_rate": 9.001880023089441e-05,
582
+ "loss": 0.0578,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 0.5763888888888888,
587
+ "grad_norm": 0.03808060898920002,
588
+ "learning_rate": 8.965227841912489e-05,
589
+ "loss": 0.0435,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 0.5833333333333334,
594
+ "grad_norm": 0.06582783981663642,
595
+ "learning_rate": 8.927992265283282e-05,
596
+ "loss": 0.058,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 0.5902777777777778,
601
+ "grad_norm": 0.0409049374684989,
602
+ "learning_rate": 8.890178771592199e-05,
603
+ "loss": 0.045,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 0.5972222222222222,
608
+ "grad_norm": 0.056258831948584206,
609
+ "learning_rate": 8.851792924257317e-05,
610
+ "loss": 0.0709,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 0.6041666666666666,
615
+ "grad_norm": 0.04580764978743741,
616
+ "learning_rate": 8.812840370905873e-05,
617
+ "loss": 0.0525,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.6111111111111112,
622
+ "grad_norm": 0.043263482428543644,
623
+ "learning_rate": 8.773326842543347e-05,
624
+ "loss": 0.0464,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.6180555555555556,
629
+ "grad_norm": 0.059777623657141676,
630
+ "learning_rate": 8.733258152710262e-05,
631
+ "loss": 0.0649,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.625,
636
+ "grad_norm": 0.049405025547198826,
637
+ "learning_rate": 8.692640196626858e-05,
638
+ "loss": 0.0575,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.6319444444444444,
643
+ "grad_norm": 0.054678965104638884,
644
+ "learning_rate": 8.651478950325737e-05,
645
+ "loss": 0.0694,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.6388888888888888,
650
+ "grad_norm": 0.05031387124367363,
651
+ "learning_rate": 8.609780469772623e-05,
652
+ "loss": 0.0581,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.6458333333333334,
657
+ "grad_norm": 0.043219919789406584,
658
+ "learning_rate": 8.567550889975362e-05,
659
+ "loss": 0.0489,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.6527777777777778,
664
+ "grad_norm": 0.04843201217718916,
665
+ "learning_rate": 8.524796424081292e-05,
666
+ "loss": 0.0597,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.6597222222222222,
671
+ "grad_norm": 0.0522833112433358,
672
+ "learning_rate": 8.481523362463111e-05,
673
+ "loss": 0.0609,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.6666666666666666,
678
+ "grad_norm": 0.04658288241167051,
679
+ "learning_rate": 8.437738071793394e-05,
680
+ "loss": 0.0461,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.6736111111111112,
685
+ "grad_norm": 0.05192918762269953,
686
+ "learning_rate": 8.393446994107877e-05,
687
+ "loss": 0.0675,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.6805555555555556,
692
+ "grad_norm": 0.04625340723515702,
693
+ "learning_rate": 8.348656645857649e-05,
694
+ "loss": 0.0501,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.6875,
699
+ "grad_norm": 0.04545506022482319,
700
+ "learning_rate": 8.303373616950408e-05,
701
+ "loss": 0.0465,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.6944444444444444,
706
+ "grad_norm": 0.048499611503797835,
707
+ "learning_rate": 8.257604569780897e-05,
708
+ "loss": 0.0538,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.7013888888888888,
713
+ "grad_norm": 0.04849174145205094,
714
+ "learning_rate": 8.21135623825068e-05,
715
+ "loss": 0.0532,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.7083333333333334,
720
+ "grad_norm": 0.046809029835309476,
721
+ "learning_rate": 8.164635426777404e-05,
722
+ "loss": 0.0584,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.7152777777777778,
727
+ "grad_norm": 0.04338551366678789,
728
+ "learning_rate": 8.117449009293668e-05,
729
+ "loss": 0.0486,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.7222222222222222,
734
+ "grad_norm": 0.04288423757366809,
735
+ "learning_rate": 8.069803928235689e-05,
736
+ "loss": 0.0512,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.7291666666666666,
741
+ "grad_norm": 0.04297925782588076,
742
+ "learning_rate": 8.021707193521865e-05,
743
+ "loss": 0.0466,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.7361111111111112,
748
+ "grad_norm": 0.04840548556064357,
749
+ "learning_rate": 7.973165881521434e-05,
750
+ "loss": 0.0614,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.7430555555555556,
755
+ "grad_norm": 0.05089661493349612,
756
+ "learning_rate": 7.924187134013323e-05,
757
+ "loss": 0.0468,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.75,
762
+ "grad_norm": 0.043594180551687345,
763
+ "learning_rate": 7.874778157135415e-05,
764
+ "loss": 0.0522,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.7569444444444444,
769
+ "grad_norm": 0.07921359503782698,
770
+ "learning_rate": 7.824946220324312e-05,
771
+ "loss": 0.0639,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.7638888888888888,
776
+ "grad_norm": 0.046303003192425585,
777
+ "learning_rate": 7.774698655245802e-05,
778
+ "loss": 0.054,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.7708333333333334,
783
+ "grad_norm": 0.0443323814040479,
784
+ "learning_rate": 7.724042854716169e-05,
785
+ "loss": 0.0492,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.7777777777777778,
790
+ "grad_norm": 0.04970508540718419,
791
+ "learning_rate": 7.6729862716145e-05,
792
+ "loss": 0.0562,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.7847222222222222,
797
+ "grad_norm": 0.09180702014526151,
798
+ "learning_rate": 7.621536417786159e-05,
799
+ "loss": 0.069,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.7916666666666666,
804
+ "grad_norm": 0.047409146676109855,
805
+ "learning_rate": 7.56970086293759e-05,
806
+ "loss": 0.057,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.7986111111111112,
811
+ "grad_norm": 0.05564719359814639,
812
+ "learning_rate": 7.5174872335226e-05,
813
+ "loss": 0.0545,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.8055555555555556,
818
+ "grad_norm": 0.044385293233721056,
819
+ "learning_rate": 7.464903211620291e-05,
820
+ "loss": 0.0533,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 0.8125,
825
+ "grad_norm": 0.048373968528083365,
826
+ "learning_rate": 7.411956533804818e-05,
827
+ "loss": 0.0567,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 0.8194444444444444,
832
+ "grad_norm": 0.06631096530721073,
833
+ "learning_rate": 7.358654990007122e-05,
834
+ "loss": 0.0634,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 0.8263888888888888,
839
+ "grad_norm": 0.03919679279316613,
840
+ "learning_rate": 7.305006422368811e-05,
841
+ "loss": 0.0396,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 0.8333333333333334,
846
+ "grad_norm": 0.0612342230223103,
847
+ "learning_rate": 7.251018724088367e-05,
848
+ "loss": 0.0598,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 0.8402777777777778,
853
+ "grad_norm": 0.04761240051837147,
854
+ "learning_rate": 7.196699838259834e-05,
855
+ "loss": 0.0542,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 0.8472222222222222,
860
+ "grad_norm": 0.055239940867233474,
861
+ "learning_rate": 7.142057756704168e-05,
862
+ "loss": 0.064,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 0.8541666666666666,
867
+ "grad_norm": 0.05625030436845944,
868
+ "learning_rate": 7.087100518793421e-05,
869
+ "loss": 0.0552,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 0.8611111111111112,
874
+ "grad_norm": 0.04917209401012176,
875
+ "learning_rate": 7.031836210267915e-05,
876
+ "loss": 0.0519,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.8680555555555556,
881
+ "grad_norm": 0.045923821351934915,
882
+ "learning_rate": 6.976272962046619e-05,
883
+ "loss": 0.0544,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.875,
888
+ "grad_norm": 0.055983725032457854,
889
+ "learning_rate": 6.920418949030856e-05,
890
+ "loss": 0.0684,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 0.8819444444444444,
895
+ "grad_norm": 0.09704535662409224,
896
+ "learning_rate": 6.864282388901544e-05,
897
+ "loss": 0.0737,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 0.8888888888888888,
902
+ "grad_norm": 0.044657916384289814,
903
+ "learning_rate": 6.807871540910154e-05,
904
+ "loss": 0.0515,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 0.8958333333333334,
909
+ "grad_norm": 0.07059390531279443,
910
+ "learning_rate": 6.751194704663543e-05,
911
+ "loss": 0.0574,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 0.9027777777777778,
916
+ "grad_norm": 0.04502070349330495,
917
+ "learning_rate": 6.694260218902844e-05,
918
+ "loss": 0.0445,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 0.9097222222222222,
923
+ "grad_norm": 0.06605902723986466,
924
+ "learning_rate": 6.637076460276613e-05,
925
+ "loss": 0.0569,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 0.9166666666666666,
930
+ "grad_norm": 0.051118257758994964,
931
+ "learning_rate": 6.57965184210838e-05,
932
+ "loss": 0.0551,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 0.9236111111111112,
937
+ "grad_norm": 0.04518774034822205,
938
+ "learning_rate": 6.521994813158834e-05,
939
+ "loss": 0.0557,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 0.9305555555555556,
944
+ "grad_norm": 0.04284945779426791,
945
+ "learning_rate": 6.464113856382752e-05,
946
+ "loss": 0.041,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 0.9375,
951
+ "grad_norm": 0.04754862748835347,
952
+ "learning_rate": 6.406017487680937e-05,
953
+ "loss": 0.0522,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 0.9444444444444444,
958
+ "grad_norm": 0.04459511286211968,
959
+ "learning_rate": 6.347714254647284e-05,
960
+ "loss": 0.0486,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 0.9513888888888888,
965
+ "grad_norm": 0.06113290499228477,
966
+ "learning_rate": 6.28921273531119e-05,
967
+ "loss": 0.0593,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 0.9583333333333334,
972
+ "grad_norm": 0.056574006950422875,
973
+ "learning_rate": 6.230521536875494e-05,
974
+ "loss": 0.0603,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 0.9652777777777778,
979
+ "grad_norm": 0.04419993037171285,
980
+ "learning_rate": 6.171649294450113e-05,
981
+ "loss": 0.0422,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 0.9722222222222222,
986
+ "grad_norm": 0.058933736730721914,
987
+ "learning_rate": 6.112604669781572e-05,
988
+ "loss": 0.0628,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 0.9791666666666666,
993
+ "grad_norm": 0.043997579067719075,
994
+ "learning_rate": 6.0533963499786314e-05,
995
+ "loss": 0.0416,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 0.9861111111111112,
1000
+ "grad_norm": 0.0787901330317747,
1001
+ "learning_rate": 5.994033046234162e-05,
1002
+ "loss": 0.0663,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 0.9930555555555556,
1007
+ "grad_norm": 0.056572258685848587,
1008
+ "learning_rate": 5.934523492543489e-05,
1009
+ "loss": 0.0598,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 1.0,
1014
+ "grad_norm": 0.06836555724673425,
1015
+ "learning_rate": 5.874876444419377e-05,
1016
+ "loss": 0.0553,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 1.0069444444444444,
1021
+ "grad_norm": 0.05528468814497019,
1022
+ "learning_rate": 5.8151006776038544e-05,
1023
+ "loss": 0.0487,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 1.0138888888888888,
1028
+ "grad_norm": 0.04442209097541711,
1029
+ "learning_rate": 5.75520498677705e-05,
1030
+ "loss": 0.0496,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 1.0208333333333333,
1035
+ "grad_norm": 0.04265736671883575,
1036
+ "learning_rate": 5.6951981842632585e-05,
1037
+ "loss": 0.0447,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 1.0277777777777777,
1042
+ "grad_norm": 0.06716393769395008,
1043
+ "learning_rate": 5.6350890987343944e-05,
1044
+ "loss": 0.0574,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 1.0347222222222223,
1049
+ "grad_norm": 0.04619919754230532,
1050
+ "learning_rate": 5.574886573911056e-05,
1051
+ "loss": 0.0484,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 1.0416666666666667,
1056
+ "grad_norm": 0.04088308722298511,
1057
+ "learning_rate": 5.5145994672613624e-05,
1058
+ "loss": 0.0424,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 1.0486111111111112,
1063
+ "grad_norm": 0.07106013370406222,
1064
+ "learning_rate": 5.4542366486977756e-05,
1065
+ "loss": 0.0506,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 1.0555555555555556,
1070
+ "grad_norm": 0.06544088043365288,
1071
+ "learning_rate": 5.39380699927209e-05,
1072
+ "loss": 0.0527,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 1.0625,
1077
+ "grad_norm": 0.055005258429162714,
1078
+ "learning_rate": 5.3333194098687764e-05,
1079
+ "loss": 0.0511,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 1.0694444444444444,
1084
+ "grad_norm": 0.05657332977154827,
1085
+ "learning_rate": 5.272782779896898e-05,
1086
+ "loss": 0.0543,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 1.0763888888888888,
1091
+ "grad_norm": 0.08633216171192509,
1092
+ "learning_rate": 5.212206015980742e-05,
1093
+ "loss": 0.0595,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 1.0833333333333333,
1098
+ "grad_norm": 0.04344217768745669,
1099
+ "learning_rate": 5.151598030649425e-05,
1100
+ "loss": 0.0396,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 1.0902777777777777,
1105
+ "grad_norm": 0.04446718541450083,
1106
+ "learning_rate": 5.0909677410255985e-05,
1107
+ "loss": 0.0436,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 1.0972222222222223,
1112
+ "grad_norm": 0.05936952558342409,
1113
+ "learning_rate": 5.030324067513499e-05,
1114
+ "loss": 0.0514,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 1.1041666666666667,
1119
+ "grad_norm": 0.06506431319350331,
1120
+ "learning_rate": 4.969675932486503e-05,
1121
+ "loss": 0.0472,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 1.1111111111111112,
1126
+ "grad_norm": 0.04763084580847737,
1127
+ "learning_rate": 4.9090322589744027e-05,
1128
+ "loss": 0.0436,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 1.1180555555555556,
1133
+ "grad_norm": 0.06531630358906274,
1134
+ "learning_rate": 4.848401969350577e-05,
1135
+ "loss": 0.0547,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 1.125,
1140
+ "grad_norm": 0.06486635891777948,
1141
+ "learning_rate": 4.78779398401926e-05,
1142
+ "loss": 0.0454,
1143
+ "step": 162
1144
+ },
1145
+ {
1146
+ "epoch": 1.1319444444444444,
1147
+ "grad_norm": 0.06325850216814397,
1148
+ "learning_rate": 4.7272172201031054e-05,
1149
+ "loss": 0.0481,
1150
+ "step": 163
1151
+ },
1152
+ {
1153
+ "epoch": 1.1388888888888888,
1154
+ "grad_norm": 0.05964254119159708,
1155
+ "learning_rate": 4.666680590131225e-05,
1156
+ "loss": 0.0463,
1157
+ "step": 164
1158
+ },
1159
+ {
1160
+ "epoch": 1.1458333333333333,
1161
+ "grad_norm": 0.05342845760927094,
1162
+ "learning_rate": 4.606193000727913e-05,
1163
+ "loss": 0.0506,
1164
+ "step": 165
1165
+ },
1166
+ {
1167
+ "epoch": 1.1527777777777777,
1168
+ "grad_norm": 0.05377115554688926,
1169
+ "learning_rate": 4.545763351302224e-05,
1170
+ "loss": 0.0474,
1171
+ "step": 166
1172
+ },
1173
+ {
1174
+ "epoch": 1.1597222222222223,
1175
+ "grad_norm": 0.05055891815765679,
1176
+ "learning_rate": 4.485400532738638e-05,
1177
+ "loss": 0.0454,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 1.1666666666666667,
1182
+ "grad_norm": 0.05497839217223033,
1183
+ "learning_rate": 4.425113426088945e-05,
1184
+ "loss": 0.0517,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 1.1736111111111112,
1189
+ "grad_norm": 0.07316550289620528,
1190
+ "learning_rate": 4.364910901265606e-05,
1191
+ "loss": 0.0538,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 1.1805555555555556,
1196
+ "grad_norm": 0.047990097455734924,
1197
+ "learning_rate": 4.3048018157367433e-05,
1198
+ "loss": 0.0413,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 1.1875,
1203
+ "grad_norm": 0.060217032523026584,
1204
+ "learning_rate": 4.244795013222951e-05,
1205
+ "loss": 0.0571,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 1.1944444444444444,
1210
+ "grad_norm": 0.04698866045082108,
1211
+ "learning_rate": 4.184899322396147e-05,
1212
+ "loss": 0.0406,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 1.2013888888888888,
1217
+ "grad_norm": 0.048369004985907946,
1218
+ "learning_rate": 4.125123555580624e-05,
1219
+ "loss": 0.0492,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 1.2083333333333333,
1224
+ "grad_norm": 0.05659619196658486,
1225
+ "learning_rate": 4.0654765074565124e-05,
1226
+ "loss": 0.057,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 1.2152777777777777,
1231
+ "grad_norm": 0.057459855223361965,
1232
+ "learning_rate": 4.005966953765839e-05,
1233
+ "loss": 0.0479,
1234
+ "step": 175
1235
+ },
1236
+ {
1237
+ "epoch": 1.2222222222222223,
1238
+ "grad_norm": 0.04572437211261691,
1239
+ "learning_rate": 3.94660365002137e-05,
1240
+ "loss": 0.0395,
1241
+ "step": 176
1242
+ },
1243
+ {
1244
+ "epoch": 1.2291666666666667,
1245
+ "grad_norm": 0.048219072573456936,
1246
+ "learning_rate": 3.887395330218429e-05,
1247
+ "loss": 0.0391,
1248
+ "step": 177
1249
+ },
1250
+ {
1251
+ "epoch": 1.2361111111111112,
1252
+ "grad_norm": 0.06136131829899375,
1253
+ "learning_rate": 3.8283507055498886e-05,
1254
+ "loss": 0.043,
1255
+ "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 1.2430555555555556,
1259
+ "grad_norm": 0.059005391153620405,
1260
+ "learning_rate": 3.769478463124507e-05,
1261
+ "loss": 0.0449,
1262
+ "step": 179
1263
+ },
1264
+ {
1265
+ "epoch": 1.25,
1266
+ "grad_norm": 0.0746524056732719,
1267
+ "learning_rate": 3.7107872646888116e-05,
1268
+ "loss": 0.0563,
1269
+ "step": 180
1270
+ },
1271
+ {
1272
+ "epoch": 1.2569444444444444,
1273
+ "grad_norm": 0.06512798583168254,
1274
+ "learning_rate": 3.652285745352717e-05,
1275
+ "loss": 0.0652,
1276
+ "step": 181
1277
+ },
1278
+ {
1279
+ "epoch": 1.2638888888888888,
1280
+ "grad_norm": 0.05118809031893748,
1281
+ "learning_rate": 3.5939825123190635e-05,
1282
+ "loss": 0.0439,
1283
+ "step": 182
1284
+ },
1285
+ {
1286
+ "epoch": 1.2708333333333333,
1287
+ "grad_norm": 0.0491373328570806,
1288
+ "learning_rate": 3.5358861436172485e-05,
1289
+ "loss": 0.0396,
1290
+ "step": 183
1291
+ },
1292
+ {
1293
+ "epoch": 1.2777777777777777,
1294
+ "grad_norm": 0.045715529417524076,
1295
+ "learning_rate": 3.4780051868411675e-05,
1296
+ "loss": 0.0349,
1297
+ "step": 184
1298
+ },
1299
+ {
1300
+ "epoch": 1.2847222222222223,
1301
+ "grad_norm": 0.05499812650132431,
1302
+ "learning_rate": 3.4203481578916194e-05,
1303
+ "loss": 0.045,
1304
+ "step": 185
1305
+ },
1306
+ {
1307
+ "epoch": 1.2916666666666667,
1308
+ "grad_norm": 0.05444819130310788,
1309
+ "learning_rate": 3.362923539723389e-05,
1310
+ "loss": 0.054,
1311
+ "step": 186
1312
+ },
1313
+ {
1314
+ "epoch": 1.2986111111111112,
1315
+ "grad_norm": 0.05755054434891386,
1316
+ "learning_rate": 3.305739781097157e-05,
1317
+ "loss": 0.0462,
1318
+ "step": 187
1319
+ },
1320
+ {
1321
+ "epoch": 1.3055555555555556,
1322
+ "grad_norm": 0.056782562230561044,
1323
+ "learning_rate": 3.248805295336458e-05,
1324
+ "loss": 0.049,
1325
+ "step": 188
1326
+ },
1327
+ {
1328
+ "epoch": 1.3125,
1329
+ "grad_norm": 0.05059433419507378,
1330
+ "learning_rate": 3.1921284590898456e-05,
1331
+ "loss": 0.0398,
1332
+ "step": 189
1333
+ },
1334
+ {
1335
+ "epoch": 1.3194444444444444,
1336
+ "grad_norm": 0.06312033575652291,
1337
+ "learning_rate": 3.135717611098458e-05,
1338
+ "loss": 0.0622,
1339
+ "step": 190
1340
+ },
1341
+ {
1342
+ "epoch": 1.3263888888888888,
1343
+ "grad_norm": 0.06684980111755691,
1344
+ "learning_rate": 3.079581050969146e-05,
1345
+ "loss": 0.0628,
1346
+ "step": 191
1347
+ },
1348
+ {
1349
+ "epoch": 1.3333333333333333,
1350
+ "grad_norm": 0.09404520793015296,
1351
+ "learning_rate": 3.023727037953382e-05,
1352
+ "loss": 0.0538,
1353
+ "step": 192
1354
+ },
1355
+ {
1356
+ "epoch": 1.3402777777777777,
1357
+ "grad_norm": 0.055119231303708986,
1358
+ "learning_rate": 2.9681637897320868e-05,
1359
+ "loss": 0.0494,
1360
+ "step": 193
1361
+ },
1362
+ {
1363
+ "epoch": 1.3472222222222223,
1364
+ "grad_norm": 0.06806024002262281,
1365
+ "learning_rate": 2.912899481206582e-05,
1366
+ "loss": 0.0494,
1367
+ "step": 194
1368
+ },
1369
+ {
1370
+ "epoch": 1.3541666666666667,
1371
+ "grad_norm": 0.05634926402516711,
1372
+ "learning_rate": 2.8579422432958312e-05,
1373
+ "loss": 0.0474,
1374
+ "step": 195
1375
+ },
1376
+ {
1377
+ "epoch": 1.3611111111111112,
1378
+ "grad_norm": 0.062090271560473254,
1379
+ "learning_rate": 2.803300161740166e-05,
1380
+ "loss": 0.0522,
1381
+ "step": 196
1382
+ },
1383
+ {
1384
+ "epoch": 1.3680555555555556,
1385
+ "grad_norm": 0.05050749410112152,
1386
+ "learning_rate": 2.748981275911633e-05,
1387
+ "loss": 0.0393,
1388
+ "step": 197
1389
+ },
1390
+ {
1391
+ "epoch": 1.375,
1392
+ "grad_norm": 0.05546645285466708,
1393
+ "learning_rate": 2.6949935776311896e-05,
1394
+ "loss": 0.0395,
1395
+ "step": 198
1396
+ },
1397
+ {
1398
+ "epoch": 1.3819444444444444,
1399
+ "grad_norm": 0.05531341419035151,
1400
+ "learning_rate": 2.6413450099928783e-05,
1401
+ "loss": 0.0445,
1402
+ "step": 199
1403
+ },
1404
+ {
1405
+ "epoch": 1.3888888888888888,
1406
+ "grad_norm": 0.057587022878495084,
1407
+ "learning_rate": 2.5880434661951823e-05,
1408
+ "loss": 0.046,
1409
+ "step": 200
1410
+ },
1411
+ {
1412
+ "epoch": 1.3888888888888888,
1413
+ "eval_loss": 0.05843019485473633,
1414
+ "eval_runtime": 2.8965,
1415
+ "eval_samples_per_second": 2.071,
1416
+ "eval_steps_per_second": 0.69,
1417
+ "step": 200
1418
+ },
1419
+ {
1420
+ "epoch": 1.3958333333333333,
1421
+ "grad_norm": 0.06587462746896457,
1422
+ "learning_rate": 2.5350967883797096e-05,
1423
+ "loss": 0.0592,
1424
+ "step": 201
1425
+ },
1426
+ {
1427
+ "epoch": 1.4027777777777777,
1428
+ "grad_norm": 0.05623019516285661,
1429
+ "learning_rate": 2.4825127664774006e-05,
1430
+ "loss": 0.0492,
1431
+ "step": 202
1432
+ },
1433
+ {
1434
+ "epoch": 1.4097222222222223,
1435
+ "grad_norm": 0.06970774002767344,
1436
+ "learning_rate": 2.4302991370624107e-05,
1437
+ "loss": 0.0599,
1438
+ "step": 203
1439
+ },
1440
+ {
1441
+ "epoch": 1.4166666666666667,
1442
+ "grad_norm": 0.050591438460209116,
1443
+ "learning_rate": 2.3784635822138424e-05,
1444
+ "loss": 0.0405,
1445
+ "step": 204
1446
+ },
1447
+ {
1448
+ "epoch": 1.4236111111111112,
1449
+ "grad_norm": 0.06087979461292365,
1450
+ "learning_rate": 2.327013728385502e-05,
1451
+ "loss": 0.0522,
1452
+ "step": 205
1453
+ },
1454
+ {
1455
+ "epoch": 1.4305555555555556,
1456
+ "grad_norm": 0.05750045979367977,
1457
+ "learning_rate": 2.2759571452838324e-05,
1458
+ "loss": 0.0498,
1459
+ "step": 206
1460
+ },
1461
+ {
1462
+ "epoch": 1.4375,
1463
+ "grad_norm": 0.06261633186118712,
1464
+ "learning_rate": 2.225301344754199e-05,
1465
+ "loss": 0.0516,
1466
+ "step": 207
1467
+ },
1468
+ {
1469
+ "epoch": 1.4444444444444444,
1470
+ "grad_norm": 0.04778739915939838,
1471
+ "learning_rate": 2.17505377967569e-05,
1472
+ "loss": 0.0356,
1473
+ "step": 208
1474
+ },
1475
+ {
1476
+ "epoch": 1.4513888888888888,
1477
+ "grad_norm": 0.06838727020740598,
1478
+ "learning_rate": 2.1252218428645846e-05,
1479
+ "loss": 0.0553,
1480
+ "step": 209
1481
+ },
1482
+ {
1483
+ "epoch": 1.4583333333333333,
1484
+ "grad_norm": 0.05699327263249455,
1485
+ "learning_rate": 2.075812865986677e-05,
1486
+ "loss": 0.0436,
1487
+ "step": 210
1488
+ },
1489
+ {
1490
+ "epoch": 1.4652777777777777,
1491
+ "grad_norm": 0.05755232280782536,
1492
+ "learning_rate": 2.026834118478567e-05,
1493
+ "loss": 0.0531,
1494
+ "step": 211
1495
+ },
1496
+ {
1497
+ "epoch": 1.4722222222222223,
1498
+ "grad_norm": 0.06672702654987289,
1499
+ "learning_rate": 1.978292806478134e-05,
1500
+ "loss": 0.0529,
1501
+ "step": 212
1502
+ },
1503
+ {
1504
+ "epoch": 1.4791666666666667,
1505
+ "grad_norm": 0.05318360817731425,
1506
+ "learning_rate": 1.9301960717643118e-05,
1507
+ "loss": 0.0408,
1508
+ "step": 213
1509
+ },
1510
+ {
1511
+ "epoch": 1.4861111111111112,
1512
+ "grad_norm": 0.05852145674739832,
1513
+ "learning_rate": 1.8825509907063327e-05,
1514
+ "loss": 0.0532,
1515
+ "step": 214
1516
+ },
1517
+ {
1518
+ "epoch": 1.4930555555555556,
1519
+ "grad_norm": 0.04695844857372584,
1520
+ "learning_rate": 1.8353645732225976e-05,
1521
+ "loss": 0.0357,
1522
+ "step": 215
1523
+ },
1524
+ {
1525
+ "epoch": 1.5,
1526
+ "grad_norm": 0.06015704770880845,
1527
+ "learning_rate": 1.7886437617493205e-05,
1528
+ "loss": 0.0458,
1529
+ "step": 216
1530
+ },
1531
+ {
1532
+ "epoch": 1.5069444444444444,
1533
+ "grad_norm": 0.06607871507117989,
1534
+ "learning_rate": 1.7423954302191047e-05,
1535
+ "loss": 0.065,
1536
+ "step": 217
1537
+ },
1538
+ {
1539
+ "epoch": 1.5138888888888888,
1540
+ "grad_norm": 0.07091336376792805,
1541
+ "learning_rate": 1.6966263830495936e-05,
1542
+ "loss": 0.0363,
1543
+ "step": 218
1544
+ },
1545
+ {
1546
+ "epoch": 1.5208333333333335,
1547
+ "grad_norm": 0.05916059675448067,
1548
+ "learning_rate": 1.6513433541423528e-05,
1549
+ "loss": 0.0504,
1550
+ "step": 219
1551
+ },
1552
+ {
1553
+ "epoch": 1.5277777777777777,
1554
+ "grad_norm": 0.06409972590419019,
1555
+ "learning_rate": 1.606553005892125e-05,
1556
+ "loss": 0.0393,
1557
+ "step": 220
1558
+ },
1559
+ {
1560
+ "epoch": 1.5347222222222223,
1561
+ "grad_norm": 0.06555802647292505,
1562
+ "learning_rate": 1.5622619282066082e-05,
1563
+ "loss": 0.0482,
1564
+ "step": 221
1565
+ },
1566
+ {
1567
+ "epoch": 1.5416666666666665,
1568
+ "grad_norm": 0.05144622081409977,
1569
+ "learning_rate": 1.5184766375368915e-05,
1570
+ "loss": 0.037,
1571
+ "step": 222
1572
+ },
1573
+ {
1574
+ "epoch": 1.5486111111111112,
1575
+ "grad_norm": 0.06265878609345442,
1576
+ "learning_rate": 1.4752035759187106e-05,
1577
+ "loss": 0.0536,
1578
+ "step": 223
1579
+ },
1580
+ {
1581
+ "epoch": 1.5555555555555556,
1582
+ "grad_norm": 0.05834542354430463,
1583
+ "learning_rate": 1.4324491100246385e-05,
1584
+ "loss": 0.0461,
1585
+ "step": 224
1586
+ },
1587
+ {
1588
+ "epoch": 1.5625,
1589
+ "grad_norm": 0.059108211105270696,
1590
+ "learning_rate": 1.3902195302273779e-05,
1591
+ "loss": 0.0461,
1592
+ "step": 225
1593
+ },
1594
+ {
1595
+ "epoch": 1.5694444444444444,
1596
+ "grad_norm": 0.06072177930930135,
1597
+ "learning_rate": 1.348521049674264e-05,
1598
+ "loss": 0.0448,
1599
+ "step": 226
1600
+ },
1601
+ {
1602
+ "epoch": 1.5763888888888888,
1603
+ "grad_norm": 0.06147140484331682,
1604
+ "learning_rate": 1.3073598033731426e-05,
1605
+ "loss": 0.0475,
1606
+ "step": 227
1607
+ },
1608
+ {
1609
+ "epoch": 1.5833333333333335,
1610
+ "grad_norm": 0.05617171004497398,
1611
+ "learning_rate": 1.2667418472897386e-05,
1612
+ "loss": 0.0432,
1613
+ "step": 228
1614
+ },
1615
+ {
1616
+ "epoch": 1.5902777777777777,
1617
+ "grad_norm": 0.06266225996920342,
1618
+ "learning_rate": 1.2266731574566536e-05,
1619
+ "loss": 0.0446,
1620
+ "step": 229
1621
+ },
1622
+ {
1623
+ "epoch": 1.5972222222222223,
1624
+ "grad_norm": 0.054440007451986294,
1625
+ "learning_rate": 1.1871596290941278e-05,
1626
+ "loss": 0.0403,
1627
+ "step": 230
1628
+ },
1629
+ {
1630
+ "epoch": 1.6041666666666665,
1631
+ "grad_norm": 0.05771513928537823,
1632
+ "learning_rate": 1.1482070757426856e-05,
1633
+ "loss": 0.0424,
1634
+ "step": 231
1635
+ },
1636
+ {
1637
+ "epoch": 1.6111111111111112,
1638
+ "grad_norm": 0.07317554509540891,
1639
+ "learning_rate": 1.1098212284078036e-05,
1640
+ "loss": 0.0486,
1641
+ "step": 232
1642
+ },
1643
+ {
1644
+ "epoch": 1.6180555555555556,
1645
+ "grad_norm": 0.0592998028315768,
1646
+ "learning_rate": 1.0720077347167202e-05,
1647
+ "loss": 0.0452,
1648
+ "step": 233
1649
+ },
1650
+ {
1651
+ "epoch": 1.625,
1652
+ "grad_norm": 0.05734391978777635,
1653
+ "learning_rate": 1.0347721580875126e-05,
1654
+ "loss": 0.0418,
1655
+ "step": 234
1656
+ },
1657
+ {
1658
+ "epoch": 1.6319444444444444,
1659
+ "grad_norm": 0.05781410001692482,
1660
+ "learning_rate": 9.981199769105604e-06,
1661
+ "loss": 0.0488,
1662
+ "step": 235
1663
+ },
1664
+ {
1665
+ "epoch": 1.6388888888888888,
1666
+ "grad_norm": 0.06084611217062411,
1667
+ "learning_rate": 9.620565837425271e-06,
1668
+ "loss": 0.0544,
1669
+ "step": 236
1670
+ },
1671
+ {
1672
+ "epoch": 1.6458333333333335,
1673
+ "grad_norm": 0.05484483522551538,
1674
+ "learning_rate": 9.26587284512957e-06,
1675
+ "loss": 0.0418,
1676
+ "step": 237
1677
+ },
1678
+ {
1679
+ "epoch": 1.6527777777777777,
1680
+ "grad_norm": 0.061075107734742555,
1681
+ "learning_rate": 8.917172977436356e-06,
1682
+ "loss": 0.0509,
1683
+ "step": 238
1684
+ },
1685
+ {
1686
+ "epoch": 1.6597222222222223,
1687
+ "grad_norm": 0.056522690917951715,
1688
+ "learning_rate": 8.574517537807897e-06,
1689
+ "loss": 0.0493,
1690
+ "step": 239
1691
+ },
1692
+ {
1693
+ "epoch": 1.6666666666666665,
1694
+ "grad_norm": 0.06145420768545982,
1695
+ "learning_rate": 8.237956940402757e-06,
1696
+ "loss": 0.0538,
1697
+ "step": 240
1698
+ },
1699
+ {
1700
+ "epoch": 1.6736111111111112,
1701
+ "grad_norm": 0.07712832225863908,
1702
+ "learning_rate": 7.907540702658456e-06,
1703
+ "loss": 0.0417,
1704
+ "step": 241
1705
+ },
1706
+ {
1707
+ "epoch": 1.6805555555555556,
1708
+ "grad_norm": 0.1210354286083926,
1709
+ "learning_rate": 7.583317438006093e-06,
1710
+ "loss": 0.0648,
1711
+ "step": 242
1712
+ },
1713
+ {
1714
+ "epoch": 1.6875,
1715
+ "grad_norm": 0.056155382622722974,
1716
+ "learning_rate": 7.265334848717931e-06,
1717
+ "loss": 0.0409,
1718
+ "step": 243
1719
+ },
1720
+ {
1721
+ "epoch": 1.6944444444444444,
1722
+ "grad_norm": 0.06052338165886097,
1723
+ "learning_rate": 6.953639718889076e-06,
1724
+ "loss": 0.0425,
1725
+ "step": 244
1726
+ },
1727
+ {
1728
+ "epoch": 1.7013888888888888,
1729
+ "grad_norm": 0.06942550868803853,
1730
+ "learning_rate": 6.648277907554235e-06,
1731
+ "loss": 0.0483,
1732
+ "step": 245
1733
+ },
1734
+ {
1735
+ "epoch": 1.7083333333333335,
1736
+ "grad_norm": 0.06102534205447137,
1737
+ "learning_rate": 6.349294341940593e-06,
1738
+ "loss": 0.0461,
1739
+ "step": 246
1740
+ },
1741
+ {
1742
+ "epoch": 1.7152777777777777,
1743
+ "grad_norm": 0.06664766533657307,
1744
+ "learning_rate": 6.056733010857712e-06,
1745
+ "loss": 0.0538,
1746
+ "step": 247
1747
+ },
1748
+ {
1749
+ "epoch": 1.7222222222222223,
1750
+ "grad_norm": 0.055246592628772316,
1751
+ "learning_rate": 5.770636958225617e-06,
1752
+ "loss": 0.0455,
1753
+ "step": 248
1754
+ },
1755
+ {
1756
+ "epoch": 1.7291666666666665,
1757
+ "grad_norm": 0.061169981220161505,
1758
+ "learning_rate": 5.491048276741784e-06,
1759
+ "loss": 0.0444,
1760
+ "step": 249
1761
+ },
1762
+ {
1763
+ "epoch": 1.7361111111111112,
1764
+ "grad_norm": 0.07327155244232343,
1765
+ "learning_rate": 5.218008101688171e-06,
1766
+ "loss": 0.0607,
1767
+ "step": 250
1768
+ },
1769
+ {
1770
+ "epoch": 1.7430555555555556,
1771
+ "grad_norm": 0.06145818807633177,
1772
+ "learning_rate": 4.951556604879048e-06,
1773
+ "loss": 0.0445,
1774
+ "step": 251
1775
+ },
1776
+ {
1777
+ "epoch": 1.75,
1778
+ "grad_norm": 0.05297976039230036,
1779
+ "learning_rate": 4.691732988750614e-06,
1780
+ "loss": 0.0388,
1781
+ "step": 252
1782
+ },
1783
+ {
1784
+ "epoch": 1.7569444444444444,
1785
+ "grad_norm": 0.054054619462972,
1786
+ "learning_rate": 4.43857548059321e-06,
1787
+ "loss": 0.0391,
1788
+ "step": 253
1789
+ },
1790
+ {
1791
+ "epoch": 1.7638888888888888,
1792
+ "grad_norm": 0.05798999532961248,
1793
+ "learning_rate": 4.192121326927073e-06,
1794
+ "loss": 0.0432,
1795
+ "step": 254
1796
+ },
1797
+ {
1798
+ "epoch": 1.7708333333333335,
1799
+ "grad_norm": 0.05728064735233047,
1800
+ "learning_rate": 3.952406788022267e-06,
1801
+ "loss": 0.044,
1802
+ "step": 255
1803
+ },
1804
+ {
1805
+ "epoch": 1.7777777777777777,
1806
+ "grad_norm": 0.05548812892517234,
1807
+ "learning_rate": 3.7194671325638198e-06,
1808
+ "loss": 0.042,
1809
+ "step": 256
1810
+ },
1811
+ {
1812
+ "epoch": 1.7847222222222223,
1813
+ "grad_norm": 0.05776655459801612,
1814
+ "learning_rate": 3.493336632462718e-06,
1815
+ "loss": 0.0446,
1816
+ "step": 257
1817
+ },
1818
+ {
1819
+ "epoch": 1.7916666666666665,
1820
+ "grad_norm": 0.06297495738200812,
1821
+ "learning_rate": 3.274048557813553e-06,
1822
+ "loss": 0.0375,
1823
+ "step": 258
1824
+ },
1825
+ {
1826
+ "epoch": 1.7986111111111112,
1827
+ "grad_norm": 0.0571294673264027,
1828
+ "learning_rate": 3.061635171999566e-06,
1829
+ "loss": 0.0458,
1830
+ "step": 259
1831
+ },
1832
+ {
1833
+ "epoch": 1.8055555555555556,
1834
+ "grad_norm": 0.05711653877850546,
1835
+ "learning_rate": 2.85612772694579e-06,
1836
+ "loss": 0.043,
1837
+ "step": 260
1838
+ },
1839
+ {
1840
+ "epoch": 1.8125,
1841
+ "grad_norm": 0.056535662892683275,
1842
+ "learning_rate": 2.657556458521049e-06,
1843
+ "loss": 0.0422,
1844
+ "step": 261
1845
+ },
1846
+ {
1847
+ "epoch": 1.8194444444444444,
1848
+ "grad_norm": 0.05861262152488929,
1849
+ "learning_rate": 2.4659505820893826e-06,
1850
+ "loss": 0.049,
1851
+ "step": 262
1852
+ },
1853
+ {
1854
+ "epoch": 1.8263888888888888,
1855
+ "grad_norm": 0.057433042780234714,
1856
+ "learning_rate": 2.2813382882116986e-06,
1857
+ "loss": 0.0437,
1858
+ "step": 263
1859
+ },
1860
+ {
1861
+ "epoch": 1.8333333333333335,
1862
+ "grad_norm": 0.05389278487965654,
1863
+ "learning_rate": 2.1037467384981026e-06,
1864
+ "loss": 0.0426,
1865
+ "step": 264
1866
+ },
1867
+ {
1868
+ "epoch": 1.8402777777777777,
1869
+ "grad_norm": 0.05557228450230491,
1870
+ "learning_rate": 1.933202061611722e-06,
1871
+ "loss": 0.0458,
1872
+ "step": 265
1873
+ },
1874
+ {
1875
+ "epoch": 1.8472222222222223,
1876
+ "grad_norm": 0.05917014990003017,
1877
+ "learning_rate": 1.769729349424415e-06,
1878
+ "loss": 0.0531,
1879
+ "step": 266
1880
+ },
1881
+ {
1882
+ "epoch": 1.8541666666666665,
1883
+ "grad_norm": 0.057944432503720486,
1884
+ "learning_rate": 1.6133526533250565e-06,
1885
+ "loss": 0.0425,
1886
+ "step": 267
1887
+ },
1888
+ {
1889
+ "epoch": 1.8611111111111112,
1890
+ "grad_norm": 0.06092440732902568,
1891
+ "learning_rate": 1.4640949806809523e-06,
1892
+ "loss": 0.0492,
1893
+ "step": 268
1894
+ },
1895
+ {
1896
+ "epoch": 1.8680555555555556,
1897
+ "grad_norm": 0.05900717359821016,
1898
+ "learning_rate": 1.3219782914527634e-06,
1899
+ "loss": 0.0452,
1900
+ "step": 269
1901
+ },
1902
+ {
1903
+ "epoch": 1.875,
1904
+ "grad_norm": 0.06389890797796334,
1905
+ "learning_rate": 1.1870234949636073e-06,
1906
+ "loss": 0.0549,
1907
+ "step": 270
1908
+ },
1909
+ {
1910
+ "epoch": 1.8819444444444444,
1911
+ "grad_norm": 0.05962740065521039,
1912
+ "learning_rate": 1.0592504468227127e-06,
1913
+ "loss": 0.0463,
1914
+ "step": 271
1915
+ },
1916
+ {
1917
+ "epoch": 1.8888888888888888,
1918
+ "grad_norm": 0.07174635089821045,
1919
+ "learning_rate": 9.386779460041017e-07,
1920
+ "loss": 0.0522,
1921
+ "step": 272
1922
+ },
1923
+ {
1924
+ "epoch": 1.8958333333333335,
1925
+ "grad_norm": 0.06082782248412747,
1926
+ "learning_rate": 8.253237320807461e-07,
1927
+ "loss": 0.0475,
1928
+ "step": 273
1929
+ },
1930
+ {
1931
+ "epoch": 1.9027777777777777,
1932
+ "grad_norm": 0.0601520271544199,
1933
+ "learning_rate": 7.192044826145771e-07,
1934
+ "loss": 0.0498,
1935
+ "step": 274
1936
+ },
1937
+ {
1938
+ "epoch": 1.9097222222222223,
1939
+ "grad_norm": 0.05825976235647901,
1940
+ "learning_rate": 6.20335810702749e-07,
1941
+ "loss": 0.0422,
1942
+ "step": 275
1943
+ },
1944
+ {
1945
+ "epoch": 1.9166666666666665,
1946
+ "grad_norm": 0.05517949333399056,
1947
+ "learning_rate": 5.287322626805202e-07,
1948
+ "loss": 0.0433,
1949
+ "step": 276
1950
+ },
1951
+ {
1952
+ "epoch": 1.9236111111111112,
1953
+ "grad_norm": 0.06763262497028635,
1954
+ "learning_rate": 4.4440731598107686e-07,
1955
+ "loss": 0.0499,
1956
+ "step": 277
1957
+ },
1958
+ {
1959
+ "epoch": 1.9305555555555556,
1960
+ "grad_norm": 0.16139138859406038,
1961
+ "learning_rate": 3.673733771526466e-07,
1962
+ "loss": 0.0664,
1963
+ "step": 278
1964
+ },
1965
+ {
1966
+ "epoch": 1.9375,
1967
+ "grad_norm": 0.056764953267334266,
1968
+ "learning_rate": 2.976417800331144e-07,
1969
+ "loss": 0.0395,
1970
+ "step": 279
1971
+ },
1972
+ {
1973
+ "epoch": 1.9444444444444444,
1974
+ "grad_norm": 0.06087509888868891,
1975
+ "learning_rate": 2.352227840825394e-07,
1976
+ "loss": 0.0456,
1977
+ "step": 280
1978
+ },
1979
+ {
1980
+ "epoch": 1.9513888888888888,
1981
+ "grad_norm": 0.07013826621193169,
1982
+ "learning_rate": 1.8012557287367392e-07,
1983
+ "loss": 0.0449,
1984
+ "step": 281
1985
+ },
1986
+ {
1987
+ "epoch": 1.9583333333333335,
1988
+ "grad_norm": 0.06122401225786666,
1989
+ "learning_rate": 1.3235825274081625e-07,
1990
+ "loss": 0.0433,
1991
+ "step": 282
1992
+ },
1993
+ {
1994
+ "epoch": 1.9652777777777777,
1995
+ "grad_norm": 0.062035625059352666,
1996
+ "learning_rate": 9.19278515871369e-08,
1997
+ "loss": 0.0516,
1998
+ "step": 283
1999
+ },
2000
+ {
2001
+ "epoch": 1.9722222222222223,
2002
+ "grad_norm": 0.05458425287261523,
2003
+ "learning_rate": 5.8840317850683555e-08,
2004
+ "loss": 0.0392,
2005
+ "step": 284
2006
+ },
2007
+ {
2008
+ "epoch": 1.9791666666666665,
2009
+ "grad_norm": 0.0574212811670188,
2010
+ "learning_rate": 3.310051962920335e-08,
2011
+ "loss": 0.0471,
2012
+ "step": 285
2013
+ },
2014
+ {
2015
+ "epoch": 1.9861111111111112,
2016
+ "grad_norm": 0.05924979972086086,
2017
+ "learning_rate": 1.471224396389359e-08,
2018
+ "loss": 0.0542,
2019
+ "step": 286
2020
+ },
2021
+ {
2022
+ "epoch": 1.9930555555555556,
2023
+ "grad_norm": 0.05752784210946029,
2024
+ "learning_rate": 3.6781962822529657e-09,
2025
+ "loss": 0.0474,
2026
+ "step": 287
2027
+ },
2028
+ {
2029
+ "epoch": 2.0,
2030
+ "grad_norm": 0.0627018913384721,
2031
+ "learning_rate": 0.0,
2032
+ "loss": 0.0448,
2033
+ "step": 288
2034
+ },
2035
+ {
2036
+ "epoch": 2.0,
2037
+ "step": 288,
2038
+ "total_flos": 547031730880512.0,
2039
+ "train_loss": 0.056736280779457755,
2040
+ "train_runtime": 1071.8886,
2041
+ "train_samples_per_second": 1.069,
2042
+ "train_steps_per_second": 0.269
2043
+ }
2044
+ ],
2045
+ "logging_steps": 1,
2046
+ "max_steps": 288,
2047
+ "num_input_tokens_seen": 0,
2048
+ "num_train_epochs": 2,
2049
+ "save_steps": 300,
2050
+ "stateful_callbacks": {
2051
+ "TrainerControl": {
2052
+ "args": {
2053
+ "should_epoch_stop": false,
2054
+ "should_evaluate": false,
2055
+ "should_log": false,
2056
+ "should_save": true,
2057
+ "should_training_stop": true
2058
+ },
2059
+ "attributes": {}
2060
+ }
2061
+ },
2062
+ "total_flos": 547031730880512.0,
2063
+ "train_batch_size": 1,
2064
+ "trial_name": null,
2065
+ "trial_params": null
2066
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1d59aab1b457e8b9fd7de0c37304956819c20c31aa1763bdf88eee33e8deac7
3
+ size 7224
training_eval_loss.png ADDED
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff