Royal-lobster committed on
Commit ee0ba32
1 Parent(s): 557c7c9

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,83 @@
+ ---
+ license: apache-2.0
+ base_model: teknium/OpenHermes-2.5-Mistral-7B
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: out
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+ # out
+
+ This model is a fine-tuned version of [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) on the None dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.1923
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-06
+ - train_batch_size: 2
+ - eval_batch_size: 2
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 2
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 16
+ - total_eval_batch_size: 4
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 10
+ - num_epochs: 2
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 1.1498 | 0.0 | 1 | 1.1953 |
+ | 0.321 | 0.1 | 31 | 0.3176 |
+ | 0.2693 | 0.2 | 62 | 0.2712 |
+ | 0.2701 | 0.31 | 93 | 0.2523 |
+ | 0.27 | 0.41 | 124 | 0.2362 |
+ | 0.2244 | 0.51 | 155 | 0.2284 |
+ | 0.2227 | 0.61 | 186 | 0.2260 |
+ | 0.2167 | 0.71 | 217 | 0.2171 |
+ | 0.2098 | 0.81 | 248 | 0.2082 |
+ | 0.1842 | 0.92 | 279 | 0.2047 |
+ | 0.1917 | 1.02 | 310 | 0.2013 |
+ | 0.1639 | 1.12 | 341 | 0.1982 |
+ | 0.1835 | 1.22 | 372 | 0.1968 |
+ | 0.1666 | 1.32 | 403 | 0.1953 |
+ | 0.1694 | 1.43 | 434 | 0.1932 |
+ | 0.1461 | 1.53 | 465 | 0.1929 |
+ | 0.1535 | 1.63 | 496 | 0.1927 |
+ | 0.1419 | 1.73 | 527 | 0.1925 |
+ | 0.1612 | 1.83 | 558 | 0.1923 |
+ | 0.1857 | 1.93 | 589 | 0.1923 |
+
+
+ ### Framework versions
+
+ - Transformers 4.34.1
+ - Pytorch 2.0.1+cu118
+ - Datasets 2.14.6
+ - Tokenizers 0.14.1
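
For context (not part of the uploaded card): a minimal inference sketch for this checkpoint. The repo id below is hypothetical — substitute the actual one — and the ChatML prompt format is assumed from the OpenHermes-2.5 base model and the `<|im_start|>`/`<|im_end|>` tokens added in this commit.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "Royal-lobster/out"  # hypothetical id for this upload; substitute the real one

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.float16,  # half precision to fit inference memory
    device_map="auto",
)

# ChatML prompt, delimited by the special tokens registered in added_tokens.json
prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\nSummarize what a cosine LR schedule does.<|im_end|>\n"
    "<|im_start|>assistant\n"
)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```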
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "<|im_end|>": 32000,
+   "<|im_start|>": 32001
+ }
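
These two entries are the ChatML turn delimiters, appended on top of Mistral's base 32,000-token vocabulary — which is why config.json below reports `vocab_size: 32002`. A quick sanity check (same hypothetical repo id as in the sketch above):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Royal-lobster/out")  # hypothetical repo id
assert tokenizer.convert_tokens_to_ids("<|im_end|>") == 32000
assert tokenizer.convert_tokens_to_ids("<|im_start|>") == 32001
assert len(tokenizer) == 32002  # base vocab plus the two added tokens
```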
checkpoint-304/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "teknium/OpenHermes-2.5-Mistral-7B",
+   "architectures": [
+     "MistralForCausalLM"
+   ],
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "max_position_embeddings": 32768,
+   "model_type": "mistral",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-05,
+   "rope_theta": 10000.0,
+   "sliding_window": 4096,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.34.1",
+   "use_cache": false,
+   "vocab_size": 32002
+ }
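
One detail worth calling out: `num_key_value_heads` (8) is a quarter of `num_attention_heads` (32), i.e. Mistral-7B's grouped-query attention. A small sketch of the shapes these numbers imply:

```python
# Values copied from the config above
hidden_size = 4096
num_attention_heads = 32
num_key_value_heads = 8

head_dim = hidden_size // num_attention_heads                # 128
q_proj_out = num_attention_heads * head_dim                  # 4096 (full width)
kv_proj_out = num_key_value_heads * head_dim                 # 1024: k/v projections are 4x narrower
queries_per_kv = num_attention_heads // num_key_value_heads  # 4 query heads share each KV head
```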
checkpoint-304/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 32000,
+   "transformers_version": "4.34.1"
+ }
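
Note that `eos_token_id` here is 32000 (`<|im_end|>`) rather than the base Mistral `</s>` (id 2), so by default `generate()` stops at the end of an assistant turn. Continuing the sketch above, passing it explicitly is equivalent:

```python
output = model.generate(**inputs, max_new_tokens=256, eos_token_id=32000)
```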
checkpoint-304/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07e6187564e0cb423c502ad7c4c471b40e3a3948b33291e75fe96e5eee8fa136
+ size 14512135879
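
This and the other binary files below are Git LFS pointer files rather than the blobs themselves: three lines giving the pointer-spec version, the SHA-256 of the real object, and its size in bytes. A small sketch (assuming the blobs have been fetched, e.g. with `git lfs pull`) to verify a download against the pointer's `oid`:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream a file and return its hex SHA-256, the value recorded as the LFS oid."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

# Should print the oid from the pointer above once the real blob is present.
print(sha256_of("checkpoint-304/optimizer.pt"))
```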
checkpoint-304/pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0460f2ef47ab8ffe104b355e94d643362c5a2d30e7c88ff25309ea8d3dbf6c5a
+ size 9886765428
checkpoint-304/pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:769dc4ff85335c5680fca0fc804c81420600c1b9d287a32bb36611c418168385
+ size 5121688491
checkpoint-304/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,298 @@
+ {
+   "metadata": {
+     "total_size": 15008350208
+   },
+   "weight_map": {
+     "lm_head.weight": "pytorch_model-00002-of-00002.bin",
+     "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+     "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "model.norm.weight": "pytorch_model-00002-of-00002.bin"
+   }
+ }
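
The index maps each tensor name to the shard holding it; layer 21 is the split point, with its attention projections and `mlp.gate_proj` in shard 1 and the rest in shard 2. `from_pretrained` consumes this file automatically, but it is plain JSON and easy to inspect directly, e.g.:

```python
import json
from collections import Counter

with open("checkpoint-304/pytorch_model.bin.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])  # 15008350208 bytes across both shards
print(index["weight_map"]["model.layers.21.mlp.up_proj.weight"])  # pytorch_model-00002-of-00002.bin
print(Counter(index["weight_map"].values()))  # tensor count per shard file
```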
checkpoint-304/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b79fdab1d04d2fef64e6cc48d12f3c84ce5b8c1a21bebdb7dbe2a42d800421e
+ size 15607
checkpoint-304/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31694b994f0607cfe91452414877c57b870598c9a4bd9bae0760f822c13a1461
+ size 15607
checkpoint-304/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7547983ff9227d6a5b68c9efde9fe1bc3541fcc13d36c1688a739942d845584c
+ size 627
checkpoint-304/trainer_state.json ADDED
@@ -0,0 +1,1923 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.9983579638752053,
+   "eval_steps": 31,
+   "global_step": 304,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0,
+       "learning_rate": 5.000000000000001e-07,
+       "loss": 1.1498,
+       "step": 1
+     },
+     {
+       "epoch": 0.0,
+       "eval_loss": 1.1952733993530273,
+       "eval_runtime": 15.2597,
+       "eval_samples_per_second": 6.226,
+       "eval_steps_per_second": 1.573,
+       "step": 1
+     },
+     {
+       "epoch": 0.01,
+       "learning_rate": 1.0000000000000002e-06,
+       "loss": 1.176,
+       "step": 2
+     },
+     {
+       "epoch": 0.01,
+       "learning_rate": 1.5e-06,
+       "loss": 1.1287,
+       "step": 3
+     },
+     {
+       "epoch": 0.01,
+       "learning_rate": 2.0000000000000003e-06,
+       "loss": 1.0172,
+       "step": 4
+     },
+     {
+       "epoch": 0.02,
+       "learning_rate": 2.5e-06,
+       "loss": 0.8997,
+       "step": 5
+     },
+     {
+       "epoch": 0.02,
+       "learning_rate": 3e-06,
+       "loss": 0.8616,
+       "step": 6
+     },
+     {
+       "epoch": 0.02,
+       "learning_rate": 3.5e-06,
+       "loss": 0.7128,
+       "step": 7
+     },
+     {
+       "epoch": 0.03,
+       "learning_rate": 4.000000000000001e-06,
+       "loss": 0.6826,
+       "step": 8
+     },
+     {
+       "epoch": 0.03,
+       "learning_rate": 4.5e-06,
+       "loss": 0.5946,
+       "step": 9
+     },
+     {
+       "epoch": 0.03,
+       "learning_rate": 5e-06,
+       "loss": 0.5616,
+       "step": 10
+     },
+     {
+       "epoch": 0.04,
+       "learning_rate": 4.999965501009142e-06,
+       "loss": 0.4778,
+       "step": 11
+     },
+     {
+       "epoch": 0.04,
+       "learning_rate": 4.999862004988709e-06,
+       "loss": 0.5264,
+       "step": 12
+     },
+     {
+       "epoch": 0.04,
+       "learning_rate": 4.999689514795112e-06,
+       "loss": 0.4518,
+       "step": 13
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 4.9994480351889364e-06,
+       "loss": 0.46,
+       "step": 14
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 4.999137572834828e-06,
+       "loss": 0.4111,
+       "step": 15
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 4.998758136301295e-06,
+       "loss": 0.3646,
+       "step": 16
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 4.99830973606048e-06,
+       "loss": 0.4024,
+       "step": 17
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 4.997792384487867e-06,
+       "loss": 0.4042,
+       "step": 18
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 4.997206095861944e-06,
+       "loss": 0.3659,
+       "step": 19
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 4.996550886363801e-06,
+       "loss": 0.3527,
+       "step": 20
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 4.995826774076693e-06,
+       "loss": 0.3726,
+       "step": 21
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 4.995033778985534e-06,
+       "loss": 0.3551,
+       "step": 22
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 4.994171922976349e-06,
+       "loss": 0.382,
+       "step": 23
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 4.993241229835666e-06,
+       "loss": 0.3694,
+       "step": 24
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 4.992241725249866e-06,
+       "loss": 0.3603,
+       "step": 25
+     },
+     {
+       "epoch": 0.09,
+       "learning_rate": 4.991173436804468e-06,
+       "loss": 0.3383,
+       "step": 26
+     },
+     {
+       "epoch": 0.09,
+       "learning_rate": 4.990036393983372e-06,
+       "loss": 0.3455,
+       "step": 27
+     },
+     {
+       "epoch": 0.09,
+       "learning_rate": 4.9888306281680405e-06,
+       "loss": 0.3428,
+       "step": 28
+     },
+     {
+       "epoch": 0.1,
+       "learning_rate": 4.987556172636637e-06,
+       "loss": 0.3001,
+       "step": 29
+     },
+     {
+       "epoch": 0.1,
+       "learning_rate": 4.986213062563104e-06,
+       "loss": 0.307,
+       "step": 30
+     },
+     {
+       "epoch": 0.1,
+       "learning_rate": 4.984801335016198e-06,
+       "loss": 0.321,
+       "step": 31
+     },
+     {
+       "epoch": 0.1,
+       "eval_loss": 0.3176000714302063,
+       "eval_runtime": 15.512,
+       "eval_samples_per_second": 6.124,
+       "eval_steps_per_second": 1.547,
+       "step": 31
+     },
+     {
+       "epoch": 0.11,
+       "learning_rate": 4.9833210289584574e-06,
+       "loss": 0.2925,
+       "step": 32
+     },
+     {
+       "epoch": 0.11,
+       "learning_rate": 4.981772185245135e-06,
+       "loss": 0.311,
+       "step": 33
+     },
+     {
+       "epoch": 0.11,
+       "learning_rate": 4.980154846623067e-06,
+       "loss": 0.3066,
+       "step": 34
+     },
+     {
+       "epoch": 0.11,
+       "learning_rate": 4.978469057729493e-06,
+       "loss": 0.2879,
+       "step": 35
+     },
+     {
+       "epoch": 0.12,
+       "learning_rate": 4.976714865090827e-06,
+       "loss": 0.3001,
+       "step": 36
+     },
+     {
+       "epoch": 0.12,
+       "learning_rate": 4.974892317121368e-06,
+       "loss": 0.3111,
+       "step": 37
+     },
+     {
+       "epoch": 0.12,
+       "learning_rate": 4.97300146412197e-06,
+       "loss": 0.3067,
+       "step": 38
+     },
+     {
+       "epoch": 0.13,
+       "learning_rate": 4.9710423582786485e-06,
+       "loss": 0.3091,
+       "step": 39
+     },
+     {
+       "epoch": 0.13,
+       "learning_rate": 4.969015053661142e-06,
+       "loss": 0.302,
+       "step": 40
+     },
+     {
+       "epoch": 0.13,
+       "learning_rate": 4.966919606221423e-06,
+       "loss": 0.2991,
+       "step": 41
+     },
+     {
+       "epoch": 0.14,
+       "learning_rate": 4.964756073792148e-06,
+       "loss": 0.3034,
+       "step": 42
+     },
+     {
+       "epoch": 0.14,
+       "learning_rate": 4.9625245160850674e-06,
+       "loss": 0.2877,
+       "step": 43
+     },
+     {
+       "epoch": 0.14,
+       "learning_rate": 4.960224994689371e-06,
+       "loss": 0.283,
+       "step": 44
+     },
+     {
+       "epoch": 0.15,
+       "learning_rate": 4.957857573069992e-06,
+       "loss": 0.2993,
+       "step": 45
+     },
+     {
+       "epoch": 0.15,
+       "learning_rate": 4.955422316565856e-06,
+       "loss": 0.2738,
+       "step": 46
+     },
+     {
+       "epoch": 0.15,
+       "learning_rate": 4.952919292388079e-06,
+       "loss": 0.293,
+       "step": 47
+     },
+     {
+       "epoch": 0.16,
+       "learning_rate": 4.950348569618105e-06,
+       "loss": 0.3053,
+       "step": 48
+     },
+     {
+       "epoch": 0.16,
+       "learning_rate": 4.947710219205808e-06,
+       "loss": 0.2875,
+       "step": 49
+     },
+     {
+       "epoch": 0.16,
+       "learning_rate": 4.9450043139675284e-06,
+       "loss": 0.2798,
+       "step": 50
+     },
+     {
+       "epoch": 0.17,
+       "learning_rate": 4.9422309285840684e-06,
+       "loss": 0.3172,
+       "step": 51
+     },
+     {
+       "epoch": 0.17,
+       "learning_rate": 4.939390139598623e-06,
+       "loss": 0.2713,
+       "step": 52
+     },
+     {
+       "epoch": 0.17,
+       "learning_rate": 4.936482025414677e-06,
+       "loss": 0.2372,
+       "step": 53
+     },
+     {
+       "epoch": 0.18,
+       "learning_rate": 4.933506666293834e-06,
+       "loss": 0.3125,
+       "step": 54
+     },
+     {
+       "epoch": 0.18,
+       "learning_rate": 4.9304641443536015e-06,
+       "loss": 0.2687,
+       "step": 55
+     },
+     {
+       "epoch": 0.18,
+       "learning_rate": 4.927354543565131e-06,
+       "loss": 0.2701,
+       "step": 56
+     },
+     {
+       "epoch": 0.19,
+       "learning_rate": 4.924177949750893e-06,
+       "loss": 0.2681,
+       "step": 57
+     },
+     {
+       "epoch": 0.19,
+       "learning_rate": 4.920934450582311e-06,
+       "loss": 0.2751,
+       "step": 58
+     },
+     {
+       "epoch": 0.19,
+       "learning_rate": 4.917624135577346e-06,
+       "loss": 0.2902,
+       "step": 59
+     },
+     {
+       "epoch": 0.2,
+       "learning_rate": 4.914247096098019e-06,
+       "loss": 0.2618,
+       "step": 60
+     },
+     {
+       "epoch": 0.2,
+       "learning_rate": 4.910803425347892e-06,
+       "loss": 0.2569,
+       "step": 61
+     },
+     {
+       "epoch": 0.2,
+       "learning_rate": 4.907293218369499e-06,
+       "loss": 0.2693,
+       "step": 62
+     },
+     {
+       "epoch": 0.2,
+       "eval_loss": 0.27119407057762146,
+       "eval_runtime": 15.5684,
+       "eval_samples_per_second": 6.102,
+       "eval_steps_per_second": 1.542,
+       "step": 62
+     },
+     {
+       "epoch": 0.21,
+       "learning_rate": 4.903716572041718e-06,
+       "loss": 0.2475,
+       "step": 63
+     },
+     {
+       "epoch": 0.21,
+       "learning_rate": 4.9000735850771e-06,
+       "loss": 0.2492,
+       "step": 64
+     },
+     {
+       "epoch": 0.21,
+       "learning_rate": 4.8963643580191446e-06,
+       "loss": 0.273,
+       "step": 65
+     },
+     {
+       "epoch": 0.22,
+       "learning_rate": 4.8925889932395246e-06,
+       "loss": 0.2193,
+       "step": 66
+     },
+     {
+       "epoch": 0.22,
+       "learning_rate": 4.888747594935259e-06,
+       "loss": 0.2637,
+       "step": 67
+     },
+     {
+       "epoch": 0.22,
+       "learning_rate": 4.88484026912584e-06,
+       "loss": 0.3141,
+       "step": 68
+     },
+     {
+       "epoch": 0.23,
+       "learning_rate": 4.880867123650306e-06,
+       "loss": 0.254,
+       "step": 69
+     },
+     {
+       "epoch": 0.23,
+       "learning_rate": 4.876828268164264e-06,
+       "loss": 0.2778,
+       "step": 70
+     },
+     {
+       "epoch": 0.23,
+       "learning_rate": 4.872723814136866e-06,
+       "loss": 0.2809,
+       "step": 71
+     },
+     {
+       "epoch": 0.24,
+       "learning_rate": 4.868553874847728e-06,
+       "loss": 0.266,
+       "step": 72
+     },
+     {
+       "epoch": 0.24,
+       "learning_rate": 4.864318565383809e-06,
+       "loss": 0.2174,
+       "step": 73
+     },
+     {
+       "epoch": 0.24,
+       "learning_rate": 4.86001800263623e-06,
+       "loss": 0.2732,
+       "step": 74
+     },
+     {
+       "epoch": 0.25,
+       "learning_rate": 4.855652305297052e-06,
+       "loss": 0.2507,
+       "step": 75
+     },
+     {
+       "epoch": 0.25,
+       "learning_rate": 4.8512215938559955e-06,
+       "loss": 0.2514,
+       "step": 76
+     },
+     {
+       "epoch": 0.25,
+       "learning_rate": 4.846725990597122e-06,
+       "loss": 0.268,
+       "step": 77
+     },
+     {
+       "epoch": 0.26,
+       "learning_rate": 4.84216561959545e-06,
+       "loss": 0.228,
+       "step": 78
+     },
+     {
+       "epoch": 0.26,
+       "learning_rate": 4.837540606713538e-06,
+       "loss": 0.2606,
+       "step": 79
+     },
+     {
+       "epoch": 0.26,
+       "learning_rate": 4.832851079598007e-06,
+       "loss": 0.2554,
+       "step": 80
+     },
+     {
+       "epoch": 0.27,
+       "learning_rate": 4.82809716767602e-06,
+       "loss": 0.257,
+       "step": 81
+     },
+     {
+       "epoch": 0.27,
+       "learning_rate": 4.8232790021517094e-06,
+       "loss": 0.2475,
+       "step": 82
+     },
+     {
+       "epoch": 0.27,
+       "learning_rate": 4.818396716002553e-06,
+       "loss": 0.2636,
+       "step": 83
+     },
+     {
+       "epoch": 0.28,
+       "learning_rate": 4.813450443975705e-06,
+       "loss": 0.2076,
+       "step": 84
+     },
+     {
+       "epoch": 0.28,
+       "learning_rate": 4.808440322584283e-06,
+       "loss": 0.2336,
+       "step": 85
+     },
+     {
+       "epoch": 0.28,
+       "learning_rate": 4.803366490103593e-06,
+       "loss": 0.2664,
+       "step": 86
+     },
+     {
+       "epoch": 0.29,
+       "learning_rate": 4.798229086567312e-06,
+       "loss": 0.2559,
+       "step": 87
+     },
+     {
+       "epoch": 0.29,
+       "learning_rate": 4.793028253763633e-06,
+       "loss": 0.2687,
+       "step": 88
+     },
+     {
+       "epoch": 0.29,
+       "learning_rate": 4.787764135231342e-06,
+       "loss": 0.2263,
+       "step": 89
+     },
+     {
+       "epoch": 0.3,
+       "learning_rate": 4.7824368762558595e-06,
+       "loss": 0.2058,
+       "step": 90
+     },
+     {
+       "epoch": 0.3,
+       "learning_rate": 4.7770466238652336e-06,
+       "loss": 0.246,
+       "step": 91
+     },
+     {
+       "epoch": 0.3,
+       "learning_rate": 4.771593526826078e-06,
+       "loss": 0.2592,
+       "step": 92
+     },
+     {
+       "epoch": 0.31,
+       "learning_rate": 4.76607773563947e-06,
+       "loss": 0.2701,
+       "step": 93
+     },
+     {
+       "epoch": 0.31,
+       "eval_loss": 0.25227251648902893,
+       "eval_runtime": 15.5184,
+       "eval_samples_per_second": 6.122,
+       "eval_steps_per_second": 1.547,
+       "step": 93
+     },
+     {
+       "epoch": 0.31,
+       "learning_rate": 4.760499402536792e-06,
+       "loss": 0.2154,
+       "step": 94
+     },
+     {
+       "epoch": 0.31,
+       "learning_rate": 4.754858681475534e-06,
+       "loss": 0.2748,
+       "step": 95
+     },
+     {
+       "epoch": 0.32,
+       "learning_rate": 4.7491557281350455e-06,
+       "loss": 0.2461,
+       "step": 96
+     },
+     {
+       "epoch": 0.32,
+       "learning_rate": 4.743390699912232e-06,
+       "loss": 0.2245,
+       "step": 97
+     },
+     {
+       "epoch": 0.32,
+       "learning_rate": 4.737563755917219e-06,
+       "loss": 0.2446,
+       "step": 98
+     },
+     {
+       "epoch": 0.33,
+       "learning_rate": 4.731675056968958e-06,
+       "loss": 0.2563,
+       "step": 99
+     },
+     {
+       "epoch": 0.33,
+       "learning_rate": 4.7257247655907854e-06,
+       "loss": 0.2512,
+       "step": 100
+     },
+     {
+       "epoch": 0.33,
+       "learning_rate": 4.7197130460059385e-06,
+       "loss": 0.2613,
+       "step": 101
+     },
+     {
+       "epoch": 0.33,
+       "learning_rate": 4.7136400641330245e-06,
+       "loss": 0.2294,
+       "step": 102
+     },
+     {
+       "epoch": 0.34,
+       "learning_rate": 4.7075059875814424e-06,
+       "loss": 0.2162,
+       "step": 103
+     },
+     {
+       "epoch": 0.34,
+       "learning_rate": 4.70131098564675e-06,
+       "loss": 0.251,
+       "step": 104
+     },
+     {
+       "epoch": 0.34,
+       "learning_rate": 4.695055229306001e-06,
+       "loss": 0.2468,
+       "step": 105
+     },
+     {
+       "epoch": 0.35,
+       "learning_rate": 4.6887388912130206e-06,
+       "loss": 0.2467,
+       "step": 106
+     },
+     {
+       "epoch": 0.35,
+       "learning_rate": 4.68236214569364e-06,
+       "loss": 0.2588,
+       "step": 107
+     },
+     {
+       "epoch": 0.35,
+       "learning_rate": 4.675925168740887e-06,
+       "loss": 0.2592,
+       "step": 108
+     },
+     {
+       "epoch": 0.36,
+       "learning_rate": 4.6694281380101304e-06,
+       "loss": 0.2225,
+       "step": 109
+     },
+     {
+       "epoch": 0.36,
+       "learning_rate": 4.662871232814171e-06,
+       "loss": 0.2411,
+       "step": 110
+     },
+     {
+       "epoch": 0.36,
+       "learning_rate": 4.656254634118301e-06,
+       "loss": 0.2512,
+       "step": 111
+     },
+     {
+       "epoch": 0.37,
+       "learning_rate": 4.649578524535302e-06,
+       "loss": 0.2252,
+       "step": 112
+     },
+     {
+       "epoch": 0.37,
+       "learning_rate": 4.642843088320408e-06,
+       "loss": 0.2288,
+       "step": 113
+     },
+     {
+       "epoch": 0.37,
+       "learning_rate": 4.636048511366222e-06,
+       "loss": 0.2339,
+       "step": 114
+     },
+     {
+       "epoch": 0.38,
+       "learning_rate": 4.6291949811975814e-06,
+       "loss": 0.2252,
+       "step": 115
+     },
+     {
+       "epoch": 0.38,
+       "learning_rate": 4.622282686966387e-06,
+       "loss": 0.2497,
+       "step": 116
+     },
+     {
+       "epoch": 0.38,
+       "learning_rate": 4.615311819446379e-06,
+       "loss": 0.2339,
+       "step": 117
+     },
+     {
+       "epoch": 0.39,
+       "learning_rate": 4.6082825710278724e-06,
+       "loss": 0.2946,
+       "step": 118
+     },
+     {
+       "epoch": 0.39,
+       "learning_rate": 4.60119513571245e-06,
+       "loss": 0.2797,
+       "step": 119
+     },
+     {
+       "epoch": 0.39,
+       "learning_rate": 4.594049709107604e-06,
+       "loss": 0.272,
+       "step": 120
+     },
+     {
+       "epoch": 0.4,
+       "learning_rate": 4.58684648842134e-06,
+       "loss": 0.2437,
+       "step": 121
+     },
+     {
+       "epoch": 0.4,
+       "learning_rate": 4.5795856724567344e-06,
+       "loss": 0.2328,
+       "step": 122
+     },
+     {
+       "epoch": 0.4,
+       "learning_rate": 4.572267461606446e-06,
+       "loss": 0.2333,
+       "step": 123
+     },
+     {
+       "epoch": 0.41,
+       "learning_rate": 4.564892057847184e-06,
+       "loss": 0.27,
+       "step": 124
+     },
+     {
+       "epoch": 0.41,
+       "eval_loss": 0.2362014800310135,
+       "eval_runtime": 15.5362,
+       "eval_samples_per_second": 6.115,
+       "eval_steps_per_second": 1.545,
+       "step": 124
+     },
+     {
+       "epoch": 0.41,
+       "learning_rate": 4.5574596647341414e-06,
+       "loss": 0.2202,
+       "step": 125
+     },
+     {
+       "epoch": 0.41,
+       "learning_rate": 4.549970487395365e-06,
+       "loss": 0.2129,
+       "step": 126
+     },
+     {
+       "epoch": 0.42,
+       "learning_rate": 4.542424732526105e-06,
+       "loss": 0.2343,
+       "step": 127
+     },
+     {
+       "epoch": 0.42,
+       "learning_rate": 4.534822608383104e-06,
+       "loss": 0.2527,
+       "step": 128
+     },
+     {
+       "epoch": 0.42,
+       "learning_rate": 4.5271643247788496e-06,
+       "loss": 0.2622,
+       "step": 129
+     },
+     {
+       "epoch": 0.43,
+       "learning_rate": 4.519450093075787e-06,
+       "loss": 0.247,
+       "step": 130
+     },
+     {
+       "epoch": 0.43,
+       "learning_rate": 4.5116801261804846e-06,
+       "loss": 0.2612,
+       "step": 131
+     },
+     {
+       "epoch": 0.43,
+       "learning_rate": 4.503854638537756e-06,
+       "loss": 0.2515,
+       "step": 132
+     },
+     {
+       "epoch": 0.44,
+       "learning_rate": 4.49597384612474e-06,
+       "loss": 0.2164,
+       "step": 133
+     },
+     {
+       "epoch": 0.44,
+       "learning_rate": 4.488037966444948e-06,
+       "loss": 0.2545,
+       "step": 134
+     },
+     {
+       "epoch": 0.44,
+       "learning_rate": 4.48004721852225e-06,
+       "loss": 0.2547,
+       "step": 135
+     },
+     {
+       "epoch": 0.45,
+       "learning_rate": 4.472001822894839e-06,
+       "loss": 0.1869,
+       "step": 136
+     },
+     {
+       "epoch": 0.45,
+       "learning_rate": 4.463902001609139e-06,
+       "loss": 0.2383,
+       "step": 137
+     },
+     {
+       "epoch": 0.45,
+       "learning_rate": 4.455747978213679e-06,
+       "loss": 0.2234,
+       "step": 138
+     },
+     {
+       "epoch": 0.46,
+       "learning_rate": 4.44753997775292e-06,
+       "loss": 0.2356,
+       "step": 139
+     },
+     {
+       "epoch": 0.46,
+       "learning_rate": 4.43927822676105e-06,
+       "loss": 0.232,
+       "step": 140
+     },
+     {
+       "epoch": 0.46,
+       "learning_rate": 4.430962953255725e-06,
+       "loss": 0.2352,
+       "step": 141
+     },
+     {
+       "epoch": 0.47,
+       "learning_rate": 4.4225943867317835e-06,
+       "loss": 0.2199,
+       "step": 142
+     },
+     {
+       "epoch": 0.47,
+       "learning_rate": 4.4141727581549025e-06,
+       "loss": 0.2175,
+       "step": 143
+     },
+     {
+       "epoch": 0.47,
+       "learning_rate": 4.405698299955234e-06,
+       "loss": 0.221,
+       "step": 144
+     },
+     {
+       "epoch": 0.48,
+       "learning_rate": 4.39717124602098e-06,
+       "loss": 0.2522,
+       "step": 145
+     },
+     {
+       "epoch": 0.48,
+       "learning_rate": 4.388591831691948e-06,
+       "loss": 0.2412,
+       "step": 146
+     },
+     {
+       "epoch": 0.48,
+       "learning_rate": 4.3799602937530464e-06,
+       "loss": 0.2176,
+       "step": 147
+     },
+     {
+       "epoch": 0.49,
+       "learning_rate": 4.3712768704277535e-06,
+       "loss": 0.2242,
+       "step": 148
+     },
+     {
+       "epoch": 0.49,
+       "learning_rate": 4.362541801371542e-06,
+       "loss": 0.2156,
+       "step": 149
+     },
+     {
+       "epoch": 0.49,
+       "learning_rate": 4.353755327665268e-06,
+       "loss": 0.2077,
+       "step": 150
+     },
+     {
+       "epoch": 0.5,
+       "learning_rate": 4.344917691808511e-06,
+       "loss": 0.2285,
+       "step": 151
+     },
+     {
+       "epoch": 0.5,
+       "learning_rate": 4.3360291377128864e-06,
+       "loss": 0.2364,
+       "step": 152
+     },
+     {
+       "epoch": 0.5,
+       "learning_rate": 4.32708991069531e-06,
+       "loss": 0.2577,
+       "step": 153
+     },
+     {
+       "epoch": 0.51,
+       "learning_rate": 4.318100257471233e-06,
+       "loss": 0.2347,
+       "step": 154
+     },
+     {
+       "epoch": 0.51,
+       "learning_rate": 4.309060426147826e-06,
+       "loss": 0.2244,
+       "step": 155
+     },
+     {
+       "epoch": 0.51,
+       "eval_loss": 0.22840476036071777,
+       "eval_runtime": 15.5442,
+       "eval_samples_per_second": 6.112,
+       "eval_steps_per_second": 1.544,
+       "step": 155
+     },
+     {
+       "epoch": 0.51,
+       "learning_rate": 4.299970666217135e-06,
+       "loss": 0.2197,
+       "step": 156
+     },
+     {
+       "epoch": 0.52,
+       "learning_rate": 4.290831228549196e-06,
+       "loss": 0.2618,
+       "step": 157
+     },
+     {
+       "epoch": 0.52,
+       "learning_rate": 4.281642365385111e-06,
+       "loss": 0.2498,
+       "step": 158
+     },
+     {
+       "epoch": 0.52,
+       "learning_rate": 4.272404330330084e-06,
+       "loss": 0.2309,
+       "step": 159
+     },
+     {
+       "epoch": 0.53,
+       "learning_rate": 4.263117378346425e-06,
+       "loss": 0.2163,
+       "step": 160
+     },
+     {
+       "epoch": 0.53,
+       "learning_rate": 4.253781765746511e-06,
+       "loss": 0.1968,
+       "step": 161
+     },
+     {
+       "epoch": 0.53,
+       "learning_rate": 4.244397750185714e-06,
+       "loss": 0.2127,
+       "step": 162
+     },
+     {
+       "epoch": 0.54,
+       "learning_rate": 4.234965590655287e-06,
+       "loss": 0.2088,
+       "step": 163
+     },
+     {
+       "epoch": 0.54,
+       "learning_rate": 4.225485547475217e-06,
+       "loss": 0.2082,
+       "step": 164
+     },
+     {
+       "epoch": 0.54,
+       "learning_rate": 4.215957882287044e-06,
+       "loss": 0.2549,
+       "step": 165
+     },
+     {
+       "epoch": 0.55,
+       "learning_rate": 4.206382858046636e-06,
+       "loss": 0.2131,
+       "step": 166
+     },
+     {
+       "epoch": 0.55,
+       "learning_rate": 4.19676073901693e-06,
+       "loss": 0.2048,
+       "step": 167
+     },
+     {
+       "epoch": 0.55,
+       "learning_rate": 4.187091790760644e-06,
+       "loss": 0.2451,
+       "step": 168
+     },
+     {
+       "epoch": 0.56,
+       "learning_rate": 4.177376280132946e-06,
+       "loss": 0.1896,
+       "step": 169
+     },
+     {
+       "epoch": 0.56,
+       "learning_rate": 4.167614475274082e-06,
+       "loss": 0.2253,
+       "step": 170
+     },
+     {
+       "epoch": 0.56,
+       "learning_rate": 4.1578066456019885e-06,
+       "loss": 0.2167,
+       "step": 171
+     },
+     {
+       "epoch": 0.56,
+       "learning_rate": 4.147953061804845e-06,
+       "loss": 0.2262,
+       "step": 172
+     },
+     {
+       "epoch": 0.57,
+       "learning_rate": 4.1380539958336095e-06,
+       "loss": 0.2483,
+       "step": 173
+     },
+     {
+       "epoch": 0.57,
+       "learning_rate": 4.128109720894512e-06,
+       "loss": 0.2824,
+       "step": 174
+     },
+     {
+       "epoch": 0.57,
+       "learning_rate": 4.118120511441512e-06,
+       "loss": 0.2273,
+       "step": 175
+     },
+     {
+       "epoch": 0.58,
+       "learning_rate": 4.108086643168724e-06,
+       "loss": 0.2177,
+       "step": 176
+     },
+     {
+       "epoch": 0.58,
+       "learning_rate": 4.098008393002816e-06,
+       "loss": 0.2147,
+       "step": 177
+     },
+     {
+       "epoch": 0.58,
+       "learning_rate": 4.087886039095353e-06,
+       "loss": 0.1915,
+       "step": 178
+     },
+     {
+       "epoch": 0.59,
+       "learning_rate": 4.077719860815132e-06,
+       "loss": 0.2123,
+       "step": 179
+     },
+     {
+       "epoch": 0.59,
+       "learning_rate": 4.067510138740467e-06,
+       "loss": 0.2418,
+       "step": 180
+     },
+     {
+       "epoch": 0.59,
+       "learning_rate": 4.057257154651444e-06,
+       "loss": 0.2055,
+       "step": 181
+     },
+     {
+       "epoch": 0.6,
+       "learning_rate": 4.046961191522147e-06,
+       "loss": 0.2098,
+       "step": 182
+     },
+     {
+       "epoch": 0.6,
+       "learning_rate": 4.036622533512845e-06,
+       "loss": 0.2304,
+       "step": 183
+     },
+     {
+       "epoch": 0.6,
+       "learning_rate": 4.026241465962154e-06,
+       "loss": 0.2018,
+       "step": 184
+     },
+     {
+       "epoch": 0.61,
+       "learning_rate": 4.0158182753791566e-06,
+       "loss": 0.2009,
+       "step": 185
+     },
+     {
+       "epoch": 0.61,
+       "learning_rate": 4.0053532494354985e-06,
+       "loss": 0.2227,
+       "step": 186
+     },
+     {
+       "epoch": 0.61,
+       "eval_loss": 0.2260431945323944,
+       "eval_runtime": 15.5286,
+       "eval_samples_per_second": 6.118,
+       "eval_steps_per_second": 1.546,
+       "step": 186
+     },
+     {
+       "epoch": 0.61,
+       "learning_rate": 3.994846676957448e-06,
+       "loss": 0.1973,
+       "step": 187
+     },
+     {
+       "epoch": 0.62,
+       "learning_rate": 3.984298847917923e-06,
+       "loss": 0.2192,
+       "step": 188
+     },
+     {
+       "epoch": 0.62,
+       "learning_rate": 3.973710053428487e-06,
+       "loss": 0.2016,
+       "step": 189
+     },
+     {
+       "epoch": 0.62,
+       "learning_rate": 3.963080585731324e-06,
+       "loss": 0.2118,
+       "step": 190
+     },
+     {
+       "epoch": 0.63,
+       "learning_rate": 3.952410738191158e-06,
+       "loss": 0.2361,
+       "step": 191
+     },
+     {
+       "epoch": 0.63,
+       "learning_rate": 3.941700805287169e-06,
+       "loss": 0.2132,
+       "step": 192
+     },
+     {
+       "epoch": 0.63,
+       "learning_rate": 3.9309510826048556e-06,
+       "loss": 0.2117,
+       "step": 193
+     },
+     {
+       "epoch": 0.64,
+       "learning_rate": 3.92016186682789e-06,
+       "loss": 0.2196,
+       "step": 194
+     },
+     {
+       "epoch": 0.64,
+       "learning_rate": 3.909333455729914e-06,
+       "loss": 0.1958,
+       "step": 195
+     },
+     {
+       "epoch": 0.64,
+       "learning_rate": 3.898466148166333e-06,
+       "loss": 0.2318,
+       "step": 196
+     },
+     {
+       "epoch": 0.65,
+       "learning_rate": 3.8875602440660635e-06,
+       "loss": 0.2034,
+       "step": 197
+     },
1249
+ {
1250
+ "epoch": 0.65,
1251
+ "learning_rate": 3.876616044423253e-06,
1252
+ "loss": 0.2119,
1253
+ "step": 198
1254
+ },
1255
+ {
1256
+ "epoch": 0.65,
1257
+ "learning_rate": 3.865633851288975e-06,
1258
+ "loss": 0.2177,
1259
+ "step": 199
1260
+ },
1261
+ {
1262
+ "epoch": 0.66,
1263
+ "learning_rate": 3.854613967762898e-06,
1264
+ "loss": 0.1974,
1265
+ "step": 200
1266
+ },
1267
+ {
1268
+ "epoch": 0.66,
1269
+ "learning_rate": 3.843556697984907e-06,
1270
+ "loss": 0.1902,
1271
+ "step": 201
1272
+ },
1273
+ {
1274
+ "epoch": 0.66,
1275
+ "learning_rate": 3.832462347126722e-06,
1276
+ "loss": 0.1975,
1277
+ "step": 202
1278
+ },
1279
+ {
1280
+ "epoch": 0.67,
1281
+ "learning_rate": 3.821331221383471e-06,
1282
+ "loss": 0.2083,
1283
+ "step": 203
1284
+ },
1285
+ {
1286
+ "epoch": 0.67,
1287
+ "learning_rate": 3.8101636279652375e-06,
1288
+ "loss": 0.2247,
1289
+ "step": 204
1290
+ },
1291
+ {
1292
+ "epoch": 0.67,
1293
+ "learning_rate": 3.798959875088584e-06,
1294
+ "loss": 0.2169,
1295
+ "step": 205
1296
+ },
1297
+ {
1298
+ "epoch": 0.68,
1299
+ "learning_rate": 3.787720271968046e-06,
1300
+ "loss": 0.1854,
1301
+ "step": 206
1302
+ },
1303
+ {
1304
+ "epoch": 0.68,
1305
+ "learning_rate": 3.7764451288075944e-06,
1306
+ "loss": 0.2269,
1307
+ "step": 207
1308
+ },
1309
+ {
1310
+ "epoch": 0.68,
1311
+ "learning_rate": 3.765134756792079e-06,
1312
+ "loss": 0.1946,
1313
+ "step": 208
1314
+ },
1315
+ {
1316
+ "epoch": 0.69,
1317
+ "learning_rate": 3.753789468078636e-06,
1318
+ "loss": 0.1944,
1319
+ "step": 209
1320
+ },
1321
+ {
1322
+ "epoch": 0.69,
1323
+ "learning_rate": 3.742409575788074e-06,
1324
+ "loss": 0.2057,
1325
+ "step": 210
1326
+ },
1327
+ {
1328
+ "epoch": 0.69,
1329
+ "learning_rate": 3.730995393996234e-06,
1330
+ "loss": 0.2427,
1331
+ "step": 211
1332
+ },
1333
+ {
1334
+ "epoch": 0.7,
1335
+ "learning_rate": 3.719547237725319e-06,
1336
+ "loss": 0.227,
1337
+ "step": 212
1338
+ },
1339
+ {
1340
+ "epoch": 0.7,
1341
+ "learning_rate": 3.708065422935198e-06,
1342
+ "loss": 0.2152,
1343
+ "step": 213
1344
+ },
1345
+ {
1346
+ "epoch": 0.7,
1347
+ "learning_rate": 3.6965502665146916e-06,
1348
+ "loss": 0.2177,
1349
+ "step": 214
1350
+ },
1351
+ {
1352
+ "epoch": 0.71,
1353
+ "learning_rate": 3.6850020862728196e-06,
1354
+ "loss": 0.2089,
1355
+ "step": 215
1356
+ },
1357
+ {
1358
+ "epoch": 0.71,
1359
+ "learning_rate": 3.6734212009300346e-06,
1360
+ "loss": 0.2081,
1361
+ "step": 216
1362
+ },
1363
+ {
1364
+ "epoch": 0.71,
1365
+ "learning_rate": 3.661807930109422e-06,
1366
+ "loss": 0.2167,
1367
+ "step": 217
1368
+ },
1369
+ {
1370
+ "epoch": 0.71,
1371
+ "eval_loss": 0.21709737181663513,
1372
+ "eval_runtime": 15.5146,
1373
+ "eval_samples_per_second": 6.123,
1374
+ "eval_steps_per_second": 1.547,
1375
+ "step": 217
1376
+ },
1377
+ {
1378
+ "epoch": 0.72,
1379
+ "learning_rate": 3.650162594327881e-06,
1380
+ "loss": 0.2135,
1381
+ "step": 218
1382
+ },
1383
+ {
1384
+ "epoch": 0.72,
1385
+ "learning_rate": 3.6384855149872776e-06,
1386
+ "loss": 0.1976,
1387
+ "step": 219
1388
+ },
1389
+ {
1390
+ "epoch": 0.72,
1391
+ "learning_rate": 3.6267770143655743e-06,
1392
+ "loss": 0.2073,
1393
+ "step": 220
1394
+ },
1395
+ {
1396
+ "epoch": 0.73,
1397
+ "learning_rate": 3.615037415607937e-06,
1398
+ "loss": 0.2252,
1399
+ "step": 221
1400
+ },
1401
+ {
1402
+ "epoch": 0.73,
1403
+ "learning_rate": 3.603267042717813e-06,
1404
+ "loss": 0.2296,
1405
+ "step": 222
1406
+ },
1407
+ {
1408
+ "epoch": 0.73,
1409
+ "learning_rate": 3.5914662205479923e-06,
1410
+ "loss": 0.181,
1411
+ "step": 223
1412
+ },
1413
+ {
1414
+ "epoch": 0.74,
1415
+ "learning_rate": 3.579635274791639e-06,
1416
+ "loss": 0.2196,
1417
+ "step": 224
1418
+ },
1419
+ {
1420
+ "epoch": 0.74,
1421
+ "learning_rate": 3.567774531973305e-06,
1422
+ "loss": 0.2063,
1423
+ "step": 225
1424
+ },
1425
+ {
1426
+ "epoch": 0.74,
1427
+ "learning_rate": 3.555884319439917e-06,
1428
+ "loss": 0.2313,
1429
+ "step": 226
1430
+ },
1431
+ {
1432
+ "epoch": 0.75,
1433
+ "learning_rate": 3.5439649653517416e-06,
1434
+ "loss": 0.1994,
1435
+ "step": 227
1436
+ },
1437
+ {
1438
+ "epoch": 0.75,
1439
+ "learning_rate": 3.532016798673329e-06,
1440
+ "loss": 0.2143,
1441
+ "step": 228
1442
+ },
1443
+ {
1444
+ "epoch": 0.75,
1445
+ "learning_rate": 3.5200401491644333e-06,
1446
+ "loss": 0.2118,
1447
+ "step": 229
1448
+ },
1449
+ {
1450
+ "epoch": 0.76,
1451
+ "learning_rate": 3.508035347370912e-06,
1452
+ "loss": 0.2081,
1453
+ "step": 230
1454
+ },
1455
+ {
1456
+ "epoch": 0.76,
1457
+ "learning_rate": 3.4960027246156043e-06,
1458
+ "loss": 0.2313,
1459
+ "step": 231
1460
+ },
1461
+ {
1462
+ "epoch": 0.76,
1463
+ "learning_rate": 3.483942612989183e-06,
1464
+ "loss": 0.2071,
1465
+ "step": 232
1466
+ },
1467
+ {
1468
+ "epoch": 0.77,
1469
+ "learning_rate": 3.471855345340992e-06,
1470
+ "loss": 0.2199,
1471
+ "step": 233
1472
+ },
1473
+ {
1474
+ "epoch": 0.77,
1475
+ "learning_rate": 3.4597412552698617e-06,
1476
+ "loss": 0.1879,
1477
+ "step": 234
1478
+ },
1479
+ {
1480
+ "epoch": 0.77,
1481
+ "learning_rate": 3.447600677114898e-06,
1482
+ "loss": 0.1717,
1483
+ "step": 235
1484
+ },
1485
+ {
1486
+ "epoch": 0.78,
1487
+ "learning_rate": 3.4354339459462556e-06,
1488
+ "loss": 0.2273,
1489
+ "step": 236
1490
+ },
1491
+ {
1492
+ "epoch": 0.78,
1493
+ "learning_rate": 3.423241397555893e-06,
1494
+ "loss": 0.2351,
1495
+ "step": 237
1496
+ },
1497
+ {
1498
+ "epoch": 0.78,
1499
+ "learning_rate": 3.4110233684483033e-06,
1500
+ "loss": 0.2165,
1501
+ "step": 238
1502
+ },
1503
+ {
1504
+ "epoch": 0.78,
1505
+ "learning_rate": 3.3987801958312254e-06,
1506
+ "loss": 0.2309,
1507
+ "step": 239
1508
+ },
1509
+ {
1510
+ "epoch": 0.79,
1511
+ "learning_rate": 3.386512217606339e-06,
1512
+ "loss": 0.2023,
1513
+ "step": 240
1514
+ },
1515
+ {
1516
+ "epoch": 0.79,
1517
+ "learning_rate": 3.3742197723599403e-06,
1518
+ "loss": 0.2189,
1519
+ "step": 241
1520
+ },
1521
+ {
1522
+ "epoch": 0.79,
1523
+ "learning_rate": 3.361903199353593e-06,
1524
+ "loss": 0.205,
1525
+ "step": 242
1526
+ },
1527
+ {
1528
+ "epoch": 0.8,
1529
+ "learning_rate": 3.349562838514769e-06,
1530
+ "loss": 0.1912,
1531
+ "step": 243
1532
+ },
1533
+ {
1534
+ "epoch": 0.8,
1535
+ "learning_rate": 3.3371990304274654e-06,
1536
+ "loss": 0.1898,
1537
+ "step": 244
1538
+ },
1539
+ {
1540
+ "epoch": 0.8,
1541
+ "learning_rate": 3.3248121163228037e-06,
1542
+ "loss": 0.2037,
1543
+ "step": 245
1544
+ },
1545
+ {
1546
+ "epoch": 0.81,
1547
+ "learning_rate": 3.3124024380696134e-06,
1548
+ "loss": 0.1959,
1549
+ "step": 246
1550
+ },
1551
+ {
1552
+ "epoch": 0.81,
1553
+ "learning_rate": 3.299970338164995e-06,
1554
+ "loss": 0.1972,
1555
+ "step": 247
1556
+ },
1557
+ {
1558
+ "epoch": 0.81,
1559
+ "learning_rate": 3.28751615972487e-06,
1560
+ "loss": 0.2098,
1561
+ "step": 248
1562
+ },
1563
+ {
1564
+ "epoch": 0.81,
1565
+ "eval_loss": 0.20815877616405487,
1566
+ "eval_runtime": 15.5231,
1567
+ "eval_samples_per_second": 6.12,
1568
+ "eval_steps_per_second": 1.546,
1569
+ "step": 248
1570
+ },
1571
+ {
1572
+ "epoch": 0.82,
1573
+ "learning_rate": 3.2750402464745084e-06,
1574
+ "loss": 0.2322,
1575
+ "step": 249
1576
+ },
1577
+ {
1578
+ "epoch": 0.82,
1579
+ "learning_rate": 3.262542942739044e-06,
1580
+ "loss": 0.1977,
1581
+ "step": 250
1582
+ },
1583
+ {
1584
+ "epoch": 0.82,
1585
+ "learning_rate": 3.2500245934339714e-06,
1586
+ "loss": 0.1983,
1587
+ "step": 251
1588
+ },
1589
+ {
1590
+ "epoch": 0.83,
1591
+ "learning_rate": 3.2374855440556242e-06,
1592
+ "loss": 0.1857,
1593
+ "step": 252
1594
+ },
1595
+ {
1596
+ "epoch": 0.83,
1597
+ "learning_rate": 3.224926140671643e-06,
1598
+ "loss": 0.183,
1599
+ "step": 253
1600
+ },
1601
+ {
1602
+ "epoch": 0.83,
1603
+ "learning_rate": 3.2123467299114216e-06,
1604
+ "loss": 0.2216,
1605
+ "step": 254
1606
+ },
1607
+ {
1608
+ "epoch": 0.84,
1609
+ "learning_rate": 3.199747658956541e-06,
1610
+ "loss": 0.1808,
1611
+ "step": 255
1612
+ },
1613
+ {
1614
+ "epoch": 0.84,
1615
+ "learning_rate": 3.1871292755311887e-06,
1616
+ "loss": 0.2278,
1617
+ "step": 256
1618
+ },
1619
+ {
1620
+ "epoch": 0.84,
1621
+ "learning_rate": 3.174491927892561e-06,
1622
+ "loss": 0.203,
1623
+ "step": 257
1624
+ },
1625
+ {
1626
+ "epoch": 0.85,
1627
+ "learning_rate": 3.1618359648212492e-06,
1628
+ "loss": 0.2209,
1629
+ "step": 258
1630
+ },
1631
+ {
1632
+ "epoch": 0.85,
1633
+ "learning_rate": 3.1491617356116167e-06,
1634
+ "loss": 0.2131,
1635
+ "step": 259
1636
+ },
1637
+ {
1638
+ "epoch": 0.85,
1639
+ "learning_rate": 3.136469590062158e-06,
1640
+ "loss": 0.2111,
1641
+ "step": 260
1642
+ },
1643
+ {
1644
+ "epoch": 0.86,
1645
+ "learning_rate": 3.1237598784658444e-06,
1646
+ "loss": 0.1915,
1647
+ "step": 261
1648
+ },
1649
+ {
1650
+ "epoch": 0.86,
1651
+ "learning_rate": 3.1110329516004546e-06,
1652
+ "loss": 0.2272,
1653
+ "step": 262
1654
+ },
1655
+ {
1656
+ "epoch": 0.86,
1657
+ "learning_rate": 3.0982891607188948e-06,
1658
+ "loss": 0.1784,
1659
+ "step": 263
1660
+ },
1661
+ {
1662
+ "epoch": 0.87,
1663
+ "learning_rate": 3.085528857539506e-06,
1664
+ "loss": 0.2064,
1665
+ "step": 264
1666
+ },
1667
+ {
1668
+ "epoch": 0.87,
1669
+ "learning_rate": 3.0727523942363547e-06,
1670
+ "loss": 0.1857,
1671
+ "step": 265
1672
+ },
1673
+ {
1674
+ "epoch": 0.87,
1675
+ "learning_rate": 3.0599601234295124e-06,
1676
+ "loss": 0.1981,
1677
+ "step": 266
1678
+ },
1679
+ {
1680
+ "epoch": 0.88,
1681
+ "learning_rate": 3.0471523981753266e-06,
1682
+ "loss": 0.2429,
1683
+ "step": 267
1684
+ },
1685
+ {
1686
+ "epoch": 0.88,
1687
+ "learning_rate": 3.0343295719566747e-06,
1688
+ "loss": 0.1755,
1689
+ "step": 268
1690
+ },
1691
+ {
1692
+ "epoch": 0.88,
1693
+ "learning_rate": 3.0214919986732076e-06,
1694
+ "loss": 0.2024,
1695
+ "step": 269
1696
+ },
1697
+ {
1698
+ "epoch": 0.89,
1699
+ "learning_rate": 3.0086400326315853e-06,
1700
+ "loss": 0.1859,
1701
+ "step": 270
1702
+ },
1703
+ {
1704
+ "epoch": 0.89,
1705
+ "learning_rate": 2.9957740285356933e-06,
1706
+ "loss": 0.2056,
1707
+ "step": 271
1708
+ },
1709
+ {
1710
+ "epoch": 0.89,
1711
+ "learning_rate": 2.9828943414768583e-06,
1712
+ "loss": 0.2183,
1713
+ "step": 272
1714
+ },
1715
+ {
1716
+ "epoch": 0.9,
1717
+ "learning_rate": 2.9700013269240463e-06,
1718
+ "loss": 0.1892,
1719
+ "step": 273
1720
+ },
1721
+ {
1722
+ "epoch": 0.9,
1723
+ "learning_rate": 2.957095340714049e-06,
1724
+ "loss": 0.2139,
1725
+ "step": 274
1726
+ },
1727
+ {
1728
+ "epoch": 0.9,
1729
+ "learning_rate": 2.9441767390416665e-06,
1730
+ "loss": 0.2314,
1731
+ "step": 275
1732
+ },
1733
+ {
1734
+ "epoch": 0.91,
1735
+ "learning_rate": 2.9312458784498763e-06,
1736
+ "loss": 0.1897,
1737
+ "step": 276
1738
+ },
1739
+ {
1740
+ "epoch": 0.91,
1741
+ "learning_rate": 2.918303115819992e-06,
1742
+ "loss": 0.1698,
1743
+ "step": 277
1744
+ },
1745
+ {
1746
+ "epoch": 0.91,
1747
+ "learning_rate": 2.9053488083618118e-06,
1748
+ "loss": 0.2147,
1749
+ "step": 278
1750
+ },
1751
+ {
1752
+ "epoch": 0.92,
1753
+ "learning_rate": 2.892383313603765e-06,
1754
+ "loss": 0.1842,
1755
+ "step": 279
1756
+ },
1757
+ {
1758
+ "epoch": 0.92,
1759
+ "eval_loss": 0.20471937954425812,
1760
+ "eval_runtime": 15.5241,
1761
+ "eval_samples_per_second": 6.12,
1762
+ "eval_steps_per_second": 1.546,
1763
+ "step": 279
1764
+ },
1765
+ {
1766
+ "epoch": 0.92,
1767
+ "learning_rate": 2.8794069893830386e-06,
1768
+ "loss": 0.178,
1769
+ "step": 280
1770
+ },
1771
+ {
1772
+ "epoch": 0.92,
1773
+ "learning_rate": 2.8664201938357052e-06,
1774
+ "loss": 0.2245,
1775
+ "step": 281
1776
+ },
1777
+ {
1778
+ "epoch": 0.93,
1779
+ "learning_rate": 2.8534232853868384e-06,
1780
+ "loss": 0.1583,
1781
+ "step": 282
1782
+ },
1783
+ {
1784
+ "epoch": 0.93,
1785
+ "learning_rate": 2.840416622740617e-06,
1786
+ "loss": 0.1752,
1787
+ "step": 283
1788
+ },
1789
+ {
1790
+ "epoch": 0.93,
1791
+ "learning_rate": 2.8274005648704316e-06,
1792
+ "loss": 0.1981,
1793
+ "step": 284
1794
+ },
1795
+ {
1796
+ "epoch": 0.94,
1797
+ "learning_rate": 2.8143754710089694e-06,
1798
+ "loss": 0.2216,
1799
+ "step": 285
1800
+ },
1801
+ {
1802
+ "epoch": 0.94,
1803
+ "learning_rate": 2.8013417006383078e-06,
1804
+ "loss": 0.2075,
1805
+ "step": 286
1806
+ },
1807
+ {
1808
+ "epoch": 0.94,
1809
+ "learning_rate": 2.7882996134799854e-06,
1810
+ "loss": 0.1978,
1811
+ "step": 287
1812
+ },
1813
+ {
1814
+ "epoch": 0.95,
1815
+ "learning_rate": 2.775249569485079e-06,
1816
+ "loss": 0.2302,
1817
+ "step": 288
1818
+ },
1819
+ {
1820
+ "epoch": 0.95,
1821
+ "learning_rate": 2.762191928824267e-06,
1822
+ "loss": 0.2218,
1823
+ "step": 289
1824
+ },
1825
+ {
1826
+ "epoch": 0.95,
1827
+ "learning_rate": 2.7491270518778913e-06,
1828
+ "loss": 0.2033,
1829
+ "step": 290
1830
+ },
1831
+ {
1832
+ "epoch": 0.96,
1833
+ "learning_rate": 2.736055299226007e-06,
1834
+ "loss": 0.2055,
1835
+ "step": 291
1836
+ },
1837
+ {
1838
+ "epoch": 0.96,
1839
+ "learning_rate": 2.722977031638435e-06,
1840
+ "loss": 0.2231,
1841
+ "step": 292
1842
+ },
1843
+ {
1844
+ "epoch": 0.96,
1845
+ "learning_rate": 2.709892610064801e-06,
1846
+ "loss": 0.2233,
1847
+ "step": 293
1848
+ },
1849
+ {
1850
+ "epoch": 0.97,
1851
+ "learning_rate": 2.696802395624579e-06,
1852
+ "loss": 0.2354,
1853
+ "step": 294
1854
+ },
1855
+ {
1856
+ "epoch": 0.97,
1857
+ "learning_rate": 2.683706749597118e-06,
1858
+ "loss": 0.2136,
1859
+ "step": 295
1860
+ },
1861
+ {
1862
+ "epoch": 0.97,
1863
+ "learning_rate": 2.670606033411678e-06,
1864
+ "loss": 0.181,
1865
+ "step": 296
1866
+ },
1867
+ {
1868
+ "epoch": 0.98,
1869
+ "learning_rate": 2.657500608637448e-06,
1870
+ "loss": 0.2051,
1871
+ "step": 297
1872
+ },
1873
+ {
1874
+ "epoch": 0.98,
1875
+ "learning_rate": 2.6443908369735715e-06,
1876
+ "loss": 0.209,
1877
+ "step": 298
1878
+ },
1879
+ {
1880
+ "epoch": 0.98,
1881
+ "learning_rate": 2.631277080239163e-06,
1882
+ "loss": 0.2184,
1883
+ "step": 299
1884
+ },
1885
+ {
1886
+ "epoch": 0.99,
1887
+ "learning_rate": 2.6181597003633218e-06,
1888
+ "loss": 0.1988,
1889
+ "step": 300
1890
+ },
1891
+ {
1892
+ "epoch": 0.99,
1893
+ "learning_rate": 2.605039059375143e-06,
1894
+ "loss": 0.2223,
1895
+ "step": 301
1896
+ },
1897
+ {
1898
+ "epoch": 0.99,
1899
+ "learning_rate": 2.5919155193937244e-06,
1900
+ "loss": 0.2029,
1901
+ "step": 302
1902
+ },
1903
+ {
1904
+ "epoch": 1.0,
1905
+ "learning_rate": 2.578789442618176e-06,
1906
+ "loss": 0.2089,
1907
+ "step": 303
1908
+ },
1909
+ {
1910
+ "epoch": 1.0,
1911
+ "learning_rate": 2.565661191317618e-06,
1912
+ "loss": 0.2065,
1913
+ "step": 304
1914
+ }
1915
+ ],
1916
+ "logging_steps": 1,
1917
+ "max_steps": 608,
1918
+ "num_train_epochs": 2,
1919
+ "save_steps": 500,
1920
+ "total_flos": 1.7027813817943327e+18,
1921
+ "trial_name": null,
1922
+ "trial_params": null
1923
+ }
checkpoint-304/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f17f663326682e34d06194ed091725483adeebe21013fe1752885513b5506a8
+ size 4411
checkpoint-608/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "teknium/OpenHermes-2.5-Mistral-7B",
+ "architectures": [
+ "MistralForCausalLM"
+ ],
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 32768,
+ "model_type": "mistral",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 8,
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 10000.0,
+ "sliding_window": 4096,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.34.1",
+ "use_cache": false,
+ "vocab_size": 32002
+ }
checkpoint-608/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 32000,
+ "transformers_version": "4.34.1"
+ }
checkpoint-608/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9283358f828e552d22cde3331bba60c448519fe00765841d28afef5bb46e83c
+ size 14512135879
checkpoint-608/pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d00e12428ee23606c20c0cb82491d564848080e13e6dc427e94e14700ee502d
+ size 9886765428
checkpoint-608/pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5e0a4e903b17ca571e3d14a89e7bf7d4ed70334289ba4353fe58f9fd95789fd
+ size 5121688491
checkpoint-608/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,298 @@
+ {
+ "metadata": {
+ "total_size": 15008350208
+ },
+ "weight_map": {
+ "lm_head.weight": "pytorch_model-00002-of-00002.bin",
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.norm.weight": "pytorch_model-00002-of-00002.bin"
+ }
+ }
checkpoint-608/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b79fdab1d04d2fef64e6cc48d12f3c84ce5b8c1a21bebdb7dbe2a42d800421e
+ size 15607
checkpoint-608/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31694b994f0607cfe91452414877c57b870598c9a4bd9bae0760f822c13a1461
+ size 15607
checkpoint-608/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2118335bc95c668e5c08172cad5e5298abff867f2c1dbf61efb10cdaf94b2750
+ size 627
checkpoint-608/trainer_state.json ADDED
@@ -0,0 +1,3827 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.9967159277504105,
+ "eval_steps": 31,
+ "global_step": 608,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 1.1498,
+ "step": 1
+ },
+ {
+ "epoch": 0.0,
+ "eval_loss": 1.1952733993530273,
+ "eval_runtime": 15.2597,
+ "eval_samples_per_second": 6.226,
+ "eval_steps_per_second": 1.573,
+ "step": 1
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.176,
+ "step": 2
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 1.5e-06,
+ "loss": 1.1287,
+ "step": 3
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 1.0172,
+ "step": 4
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 2.5e-06,
+ "loss": 0.8997,
+ "step": 5
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 3e-06,
+ "loss": 0.8616,
+ "step": 6
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 3.5e-06,
+ "loss": 0.7128,
+ "step": 7
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.6826,
+ "step": 8
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 4.5e-06,
+ "loss": 0.5946,
+ "step": 9
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 5e-06,
+ "loss": 0.5616,
+ "step": 10
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 4.999965501009142e-06,
+ "loss": 0.4778,
+ "step": 11
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 4.999862004988709e-06,
+ "loss": 0.5264,
+ "step": 12
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 4.999689514795112e-06,
+ "loss": 0.4518,
+ "step": 13
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 4.9994480351889364e-06,
+ "loss": 0.46,
+ "step": 14
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 4.999137572834828e-06,
+ "loss": 0.4111,
+ "step": 15
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 4.998758136301295e-06,
+ "loss": 0.3646,
+ "step": 16
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 4.99830973606048e-06,
+ "loss": 0.4024,
+ "step": 17
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 4.997792384487867e-06,
+ "loss": 0.4042,
+ "step": 18
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 4.997206095861944e-06,
+ "loss": 0.3659,
+ "step": 19
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 4.996550886363801e-06,
+ "loss": 0.3527,
+ "step": 20
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 4.995826774076693e-06,
+ "loss": 0.3726,
+ "step": 21
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 4.995033778985534e-06,
+ "loss": 0.3551,
+ "step": 22
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 4.994171922976349e-06,
+ "loss": 0.382,
+ "step": 23
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 4.993241229835666e-06,
+ "loss": 0.3694,
+ "step": 24
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 4.992241725249866e-06,
+ "loss": 0.3603,
+ "step": 25
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 4.991173436804468e-06,
+ "loss": 0.3383,
+ "step": 26
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 4.990036393983372e-06,
+ "loss": 0.3455,
+ "step": 27
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 4.9888306281680405e-06,
+ "loss": 0.3428,
+ "step": 28
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 4.987556172636637e-06,
+ "loss": 0.3001,
+ "step": 29
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 4.986213062563104e-06,
+ "loss": 0.307,
+ "step": 30
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 4.984801335016198e-06,
+ "loss": 0.321,
+ "step": 31
+ },
+ {
+ "epoch": 0.1,
+ "eval_loss": 0.3176000714302063,
+ "eval_runtime": 15.512,
+ "eval_samples_per_second": 6.124,
+ "eval_steps_per_second": 1.547,
+ "step": 31
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 4.9833210289584574e-06,
+ "loss": 0.2925,
+ "step": 32
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 4.981772185245135e-06,
+ "loss": 0.311,
+ "step": 33
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 4.980154846623067e-06,
+ "loss": 0.3066,
+ "step": 34
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 4.978469057729493e-06,
+ "loss": 0.2879,
+ "step": 35
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 4.976714865090827e-06,
+ "loss": 0.3001,
+ "step": 36
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 4.974892317121368e-06,
+ "loss": 0.3111,
+ "step": 37
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 4.97300146412197e-06,
+ "loss": 0.3067,
+ "step": 38
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 4.9710423582786485e-06,
+ "loss": 0.3091,
+ "step": 39
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 4.969015053661142e-06,
+ "loss": 0.302,
+ "step": 40
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 4.966919606221423e-06,
+ "loss": 0.2991,
+ "step": 41
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 4.964756073792148e-06,
+ "loss": 0.3034,
+ "step": 42
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 4.9625245160850674e-06,
+ "loss": 0.2877,
+ "step": 43
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 4.960224994689371e-06,
+ "loss": 0.283,
+ "step": 44
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 4.957857573069992e-06,
+ "loss": 0.2993,
+ "step": 45
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 4.955422316565856e-06,
+ "loss": 0.2738,
+ "step": 46
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 4.952919292388079e-06,
+ "loss": 0.293,
+ "step": 47
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 4.950348569618105e-06,
+ "loss": 0.3053,
+ "step": 48
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 4.947710219205808e-06,
+ "loss": 0.2875,
+ "step": 49
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 4.9450043139675284e-06,
+ "loss": 0.2798,
+ "step": 50
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 4.9422309285840684e-06,
+ "loss": 0.3172,
+ "step": 51
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 4.939390139598623e-06,
336
+ "loss": 0.2713,
337
+ "step": 52
338
+ },
339
+ {
340
+ "epoch": 0.17,
341
+ "learning_rate": 4.936482025414677e-06,
342
+ "loss": 0.2372,
343
+ "step": 53
344
+ },
345
+ {
346
+ "epoch": 0.18,
347
+ "learning_rate": 4.933506666293834e-06,
348
+ "loss": 0.3125,
349
+ "step": 54
350
+ },
351
+ {
352
+ "epoch": 0.18,
353
+ "learning_rate": 4.9304641443536015e-06,
354
+ "loss": 0.2687,
355
+ "step": 55
356
+ },
357
+ {
358
+ "epoch": 0.18,
359
+ "learning_rate": 4.927354543565131e-06,
360
+ "loss": 0.2701,
361
+ "step": 56
362
+ },
363
+ {
364
+ "epoch": 0.19,
365
+ "learning_rate": 4.924177949750893e-06,
366
+ "loss": 0.2681,
367
+ "step": 57
368
+ },
369
+ {
370
+ "epoch": 0.19,
371
+ "learning_rate": 4.920934450582311e-06,
372
+ "loss": 0.2751,
373
+ "step": 58
374
+ },
375
+ {
376
+ "epoch": 0.19,
377
+ "learning_rate": 4.917624135577346e-06,
378
+ "loss": 0.2902,
379
+ "step": 59
380
+ },
381
+ {
382
+ "epoch": 0.2,
383
+ "learning_rate": 4.914247096098019e-06,
384
+ "loss": 0.2618,
385
+ "step": 60
386
+ },
387
+ {
388
+ "epoch": 0.2,
389
+ "learning_rate": 4.910803425347892e-06,
390
+ "loss": 0.2569,
391
+ "step": 61
392
+ },
393
+ {
394
+ "epoch": 0.2,
395
+ "learning_rate": 4.907293218369499e-06,
396
+ "loss": 0.2693,
397
+ "step": 62
398
+ },
399
+ {
400
+ "epoch": 0.2,
401
+ "eval_loss": 0.27119407057762146,
402
+ "eval_runtime": 15.5684,
403
+ "eval_samples_per_second": 6.102,
404
+ "eval_steps_per_second": 1.542,
405
+ "step": 62
406
+ },
407
+ {
408
+ "epoch": 0.21,
409
+ "learning_rate": 4.903716572041718e-06,
410
+ "loss": 0.2475,
411
+ "step": 63
412
+ },
413
+ {
414
+ "epoch": 0.21,
415
+ "learning_rate": 4.9000735850771e-06,
416
+ "loss": 0.2492,
417
+ "step": 64
418
+ },
419
+ {
420
+ "epoch": 0.21,
421
+ "learning_rate": 4.8963643580191446e-06,
422
+ "loss": 0.273,
423
+ "step": 65
424
+ },
425
+ {
426
+ "epoch": 0.22,
427
+ "learning_rate": 4.8925889932395246e-06,
428
+ "loss": 0.2193,
429
+ "step": 66
430
+ },
431
+ {
432
+ "epoch": 0.22,
433
+ "learning_rate": 4.888747594935259e-06,
434
+ "loss": 0.2637,
435
+ "step": 67
436
+ },
437
+ {
438
+ "epoch": 0.22,
439
+ "learning_rate": 4.88484026912584e-06,
440
+ "loss": 0.3141,
441
+ "step": 68
442
+ },
443
+ {
444
+ "epoch": 0.23,
445
+ "learning_rate": 4.880867123650306e-06,
446
+ "loss": 0.254,
447
+ "step": 69
448
+ },
449
+ {
450
+ "epoch": 0.23,
451
+ "learning_rate": 4.876828268164264e-06,
452
+ "loss": 0.2778,
453
+ "step": 70
454
+ },
455
+ {
456
+ "epoch": 0.23,
457
+ "learning_rate": 4.872723814136866e-06,
458
+ "loss": 0.2809,
459
+ "step": 71
460
+ },
461
+ {
462
+ "epoch": 0.24,
463
+ "learning_rate": 4.868553874847728e-06,
464
+ "loss": 0.266,
465
+ "step": 72
466
+ },
467
+ {
468
+ "epoch": 0.24,
469
+ "learning_rate": 4.864318565383809e-06,
470
+ "loss": 0.2174,
471
+ "step": 73
472
+ },
473
+ {
474
+ "epoch": 0.24,
475
+ "learning_rate": 4.86001800263623e-06,
476
+ "loss": 0.2732,
477
+ "step": 74
478
+ },
479
+ {
480
+ "epoch": 0.25,
481
+ "learning_rate": 4.855652305297052e-06,
482
+ "loss": 0.2507,
483
+ "step": 75
484
+ },
485
+ {
486
+ "epoch": 0.25,
487
+ "learning_rate": 4.8512215938559955e-06,
488
+ "loss": 0.2514,
489
+ "step": 76
490
+ },
491
+ {
492
+ "epoch": 0.25,
493
+ "learning_rate": 4.846725990597122e-06,
494
+ "loss": 0.268,
495
+ "step": 77
496
+ },
497
+ {
498
+ "epoch": 0.26,
499
+ "learning_rate": 4.84216561959545e-06,
500
+ "loss": 0.228,
501
+ "step": 78
502
+ },
503
+ {
504
+ "epoch": 0.26,
505
+ "learning_rate": 4.837540606713538e-06,
506
+ "loss": 0.2606,
507
+ "step": 79
508
+ },
509
+ {
510
+ "epoch": 0.26,
511
+ "learning_rate": 4.832851079598007e-06,
512
+ "loss": 0.2554,
513
+ "step": 80
514
+ },
515
+ {
516
+ "epoch": 0.27,
517
+ "learning_rate": 4.82809716767602e-06,
518
+ "loss": 0.257,
519
+ "step": 81
520
+ },
521
+ {
522
+ "epoch": 0.27,
523
+ "learning_rate": 4.8232790021517094e-06,
524
+ "loss": 0.2475,
525
+ "step": 82
526
+ },
527
+ {
528
+ "epoch": 0.27,
529
+ "learning_rate": 4.818396716002553e-06,
530
+ "loss": 0.2636,
531
+ "step": 83
532
+ },
533
+ {
534
+ "epoch": 0.28,
535
+ "learning_rate": 4.813450443975705e-06,
536
+ "loss": 0.2076,
537
+ "step": 84
538
+ },
539
+ {
540
+ "epoch": 0.28,
541
+ "learning_rate": 4.808440322584283e-06,
542
+ "loss": 0.2336,
543
+ "step": 85
544
+ },
545
+ {
546
+ "epoch": 0.28,
547
+ "learning_rate": 4.803366490103593e-06,
548
+ "loss": 0.2664,
549
+ "step": 86
550
+ },
551
+ {
552
+ "epoch": 0.29,
553
+ "learning_rate": 4.798229086567312e-06,
554
+ "loss": 0.2559,
555
+ "step": 87
556
+ },
557
+ {
558
+ "epoch": 0.29,
559
+ "learning_rate": 4.793028253763633e-06,
560
+ "loss": 0.2687,
561
+ "step": 88
562
+ },
563
+ {
564
+ "epoch": 0.29,
565
+ "learning_rate": 4.787764135231342e-06,
566
+ "loss": 0.2263,
567
+ "step": 89
568
+ },
569
+ {
570
+ "epoch": 0.3,
571
+ "learning_rate": 4.7824368762558595e-06,
572
+ "loss": 0.2058,
573
+ "step": 90
574
+ },
575
+ {
576
+ "epoch": 0.3,
577
+ "learning_rate": 4.7770466238652336e-06,
578
+ "loss": 0.246,
579
+ "step": 91
580
+ },
581
+ {
582
+ "epoch": 0.3,
583
+ "learning_rate": 4.771593526826078e-06,
584
+ "loss": 0.2592,
585
+ "step": 92
586
+ },
587
+ {
588
+ "epoch": 0.31,
589
+ "learning_rate": 4.76607773563947e-06,
590
+ "loss": 0.2701,
591
+ "step": 93
592
+ },
593
+ {
594
+ "epoch": 0.31,
595
+ "eval_loss": 0.25227251648902893,
596
+ "eval_runtime": 15.5184,
597
+ "eval_samples_per_second": 6.122,
598
+ "eval_steps_per_second": 1.547,
599
+ "step": 93
600
+ },
601
+ {
602
+ "epoch": 0.31,
603
+ "learning_rate": 4.760499402536792e-06,
604
+ "loss": 0.2154,
605
+ "step": 94
606
+ },
607
+ {
608
+ "epoch": 0.31,
609
+ "learning_rate": 4.754858681475534e-06,
610
+ "loss": 0.2748,
611
+ "step": 95
612
+ },
613
+ {
614
+ "epoch": 0.32,
615
+ "learning_rate": 4.7491557281350455e-06,
616
+ "loss": 0.2461,
617
+ "step": 96
618
+ },
619
+ {
620
+ "epoch": 0.32,
621
+ "learning_rate": 4.743390699912232e-06,
622
+ "loss": 0.2245,
623
+ "step": 97
624
+ },
625
+ {
626
+ "epoch": 0.32,
627
+ "learning_rate": 4.737563755917219e-06,
628
+ "loss": 0.2446,
629
+ "step": 98
630
+ },
631
+ {
632
+ "epoch": 0.33,
633
+ "learning_rate": 4.731675056968958e-06,
634
+ "loss": 0.2563,
635
+ "step": 99
636
+ },
637
+ {
638
+ "epoch": 0.33,
639
+ "learning_rate": 4.7257247655907854e-06,
640
+ "loss": 0.2512,
641
+ "step": 100
642
+ },
643
+ {
644
+ "epoch": 0.33,
645
+ "learning_rate": 4.7197130460059385e-06,
646
+ "loss": 0.2613,
647
+ "step": 101
648
+ },
649
+ {
650
+ "epoch": 0.33,
651
+ "learning_rate": 4.7136400641330245e-06,
652
+ "loss": 0.2294,
653
+ "step": 102
654
+ },
655
+ {
656
+ "epoch": 0.34,
657
+ "learning_rate": 4.7075059875814424e-06,
658
+ "loss": 0.2162,
659
+ "step": 103
660
+ },
661
+ {
662
+ "epoch": 0.34,
663
+ "learning_rate": 4.70131098564675e-06,
664
+ "loss": 0.251,
665
+ "step": 104
666
+ },
667
+ {
668
+ "epoch": 0.34,
669
+ "learning_rate": 4.695055229306001e-06,
670
+ "loss": 0.2468,
671
+ "step": 105
672
+ },
673
+ {
674
+ "epoch": 0.35,
675
+ "learning_rate": 4.6887388912130206e-06,
676
+ "loss": 0.2467,
677
+ "step": 106
678
+ },
679
+ {
680
+ "epoch": 0.35,
681
+ "learning_rate": 4.68236214569364e-06,
682
+ "loss": 0.2588,
683
+ "step": 107
684
+ },
685
+ {
686
+ "epoch": 0.35,
687
+ "learning_rate": 4.675925168740887e-06,
688
+ "loss": 0.2592,
689
+ "step": 108
690
+ },
691
+ {
692
+ "epoch": 0.36,
693
+ "learning_rate": 4.6694281380101304e-06,
694
+ "loss": 0.2225,
695
+ "step": 109
696
+ },
697
+ {
698
+ "epoch": 0.36,
699
+ "learning_rate": 4.662871232814171e-06,
700
+ "loss": 0.2411,
701
+ "step": 110
702
+ },
703
+ {
704
+ "epoch": 0.36,
705
+ "learning_rate": 4.656254634118301e-06,
706
+ "loss": 0.2512,
707
+ "step": 111
708
+ },
709
+ {
710
+ "epoch": 0.37,
711
+ "learning_rate": 4.649578524535302e-06,
712
+ "loss": 0.2252,
713
+ "step": 112
714
+ },
715
+ {
716
+ "epoch": 0.37,
717
+ "learning_rate": 4.642843088320408e-06,
718
+ "loss": 0.2288,
719
+ "step": 113
720
+ },
721
+ {
722
+ "epoch": 0.37,
723
+ "learning_rate": 4.636048511366222e-06,
724
+ "loss": 0.2339,
725
+ "step": 114
726
+ },
727
+ {
728
+ "epoch": 0.38,
729
+ "learning_rate": 4.6291949811975814e-06,
730
+ "loss": 0.2252,
731
+ "step": 115
732
+ },
733
+ {
734
+ "epoch": 0.38,
735
+ "learning_rate": 4.622282686966387e-06,
736
+ "loss": 0.2497,
737
+ "step": 116
738
+ },
739
+ {
740
+ "epoch": 0.38,
741
+ "learning_rate": 4.615311819446379e-06,
742
+ "loss": 0.2339,
743
+ "step": 117
744
+ },
745
+ {
746
+ "epoch": 0.39,
747
+ "learning_rate": 4.6082825710278724e-06,
748
+ "loss": 0.2946,
749
+ "step": 118
750
+ },
751
+ {
752
+ "epoch": 0.39,
753
+ "learning_rate": 4.60119513571245e-06,
754
+ "loss": 0.2797,
755
+ "step": 119
756
+ },
757
+ {
758
+ "epoch": 0.39,
759
+ "learning_rate": 4.594049709107604e-06,
760
+ "loss": 0.272,
761
+ "step": 120
762
+ },
763
+ {
764
+ "epoch": 0.4,
765
+ "learning_rate": 4.58684648842134e-06,
766
+ "loss": 0.2437,
767
+ "step": 121
768
+ },
769
+ {
770
+ "epoch": 0.4,
771
+ "learning_rate": 4.5795856724567344e-06,
772
+ "loss": 0.2328,
773
+ "step": 122
774
+ },
775
+ {
776
+ "epoch": 0.4,
777
+ "learning_rate": 4.572267461606446e-06,
778
+ "loss": 0.2333,
779
+ "step": 123
780
+ },
781
+ {
782
+ "epoch": 0.41,
783
+ "learning_rate": 4.564892057847184e-06,
784
+ "loss": 0.27,
785
+ "step": 124
786
+ },
787
+ {
788
+ "epoch": 0.41,
789
+ "eval_loss": 0.2362014800310135,
790
+ "eval_runtime": 15.5362,
791
+ "eval_samples_per_second": 6.115,
792
+ "eval_steps_per_second": 1.545,
793
+ "step": 124
794
+ },
795
+ {
796
+ "epoch": 0.41,
797
+ "learning_rate": 4.5574596647341414e-06,
798
+ "loss": 0.2202,
799
+ "step": 125
800
+ },
801
+ {
802
+ "epoch": 0.41,
803
+ "learning_rate": 4.549970487395365e-06,
804
+ "loss": 0.2129,
805
+ "step": 126
806
+ },
807
+ {
808
+ "epoch": 0.42,
809
+ "learning_rate": 4.542424732526105e-06,
810
+ "loss": 0.2343,
811
+ "step": 127
812
+ },
813
+ {
814
+ "epoch": 0.42,
815
+ "learning_rate": 4.534822608383104e-06,
816
+ "loss": 0.2527,
817
+ "step": 128
818
+ },
819
+ {
820
+ "epoch": 0.42,
821
+ "learning_rate": 4.5271643247788496e-06,
822
+ "loss": 0.2622,
823
+ "step": 129
824
+ },
825
+ {
826
+ "epoch": 0.43,
827
+ "learning_rate": 4.519450093075787e-06,
828
+ "loss": 0.247,
829
+ "step": 130
830
+ },
831
+ {
832
+ "epoch": 0.43,
833
+ "learning_rate": 4.5116801261804846e-06,
834
+ "loss": 0.2612,
835
+ "step": 131
836
+ },
837
+ {
838
+ "epoch": 0.43,
839
+ "learning_rate": 4.503854638537756e-06,
840
+ "loss": 0.2515,
841
+ "step": 132
842
+ },
843
+ {
844
+ "epoch": 0.44,
845
+ "learning_rate": 4.49597384612474e-06,
846
+ "loss": 0.2164,
847
+ "step": 133
848
+ },
849
+ {
850
+ "epoch": 0.44,
851
+ "learning_rate": 4.488037966444948e-06,
852
+ "loss": 0.2545,
853
+ "step": 134
854
+ },
855
+ {
856
+ "epoch": 0.44,
857
+ "learning_rate": 4.48004721852225e-06,
858
+ "loss": 0.2547,
859
+ "step": 135
860
+ },
861
+ {
862
+ "epoch": 0.45,
863
+ "learning_rate": 4.472001822894839e-06,
864
+ "loss": 0.1869,
865
+ "step": 136
866
+ },
867
+ {
868
+ "epoch": 0.45,
869
+ "learning_rate": 4.463902001609139e-06,
870
+ "loss": 0.2383,
871
+ "step": 137
872
+ },
873
+ {
874
+ "epoch": 0.45,
875
+ "learning_rate": 4.455747978213679e-06,
876
+ "loss": 0.2234,
877
+ "step": 138
878
+ },
879
+ {
880
+ "epoch": 0.46,
881
+ "learning_rate": 4.44753997775292e-06,
882
+ "loss": 0.2356,
883
+ "step": 139
884
+ },
885
+ {
886
+ "epoch": 0.46,
887
+ "learning_rate": 4.43927822676105e-06,
888
+ "loss": 0.232,
889
+ "step": 140
890
+ },
891
+ {
892
+ "epoch": 0.46,
893
+ "learning_rate": 4.430962953255725e-06,
894
+ "loss": 0.2352,
895
+ "step": 141
896
+ },
897
+ {
898
+ "epoch": 0.47,
899
+ "learning_rate": 4.4225943867317835e-06,
900
+ "loss": 0.2199,
901
+ "step": 142
902
+ },
903
+ {
904
+ "epoch": 0.47,
905
+ "learning_rate": 4.4141727581549025e-06,
906
+ "loss": 0.2175,
907
+ "step": 143
908
+ },
909
+ {
910
+ "epoch": 0.47,
911
+ "learning_rate": 4.405698299955234e-06,
912
+ "loss": 0.221,
913
+ "step": 144
914
+ },
915
+ {
916
+ "epoch": 0.48,
917
+ "learning_rate": 4.39717124602098e-06,
918
+ "loss": 0.2522,
919
+ "step": 145
920
+ },
921
+ {
922
+ "epoch": 0.48,
923
+ "learning_rate": 4.388591831691948e-06,
924
+ "loss": 0.2412,
925
+ "step": 146
926
+ },
927
+ {
928
+ "epoch": 0.48,
929
+ "learning_rate": 4.3799602937530464e-06,
930
+ "loss": 0.2176,
931
+ "step": 147
932
+ },
933
+ {
934
+ "epoch": 0.49,
935
+ "learning_rate": 4.3712768704277535e-06,
936
+ "loss": 0.2242,
937
+ "step": 148
938
+ },
939
+ {
940
+ "epoch": 0.49,
941
+ "learning_rate": 4.362541801371542e-06,
942
+ "loss": 0.2156,
943
+ "step": 149
944
+ },
945
+ {
946
+ "epoch": 0.49,
947
+ "learning_rate": 4.353755327665268e-06,
948
+ "loss": 0.2077,
949
+ "step": 150
950
+ },
951
+ {
952
+ "epoch": 0.5,
953
+ "learning_rate": 4.344917691808511e-06,
954
+ "loss": 0.2285,
955
+ "step": 151
956
+ },
957
+ {
958
+ "epoch": 0.5,
959
+ "learning_rate": 4.3360291377128864e-06,
960
+ "loss": 0.2364,
961
+ "step": 152
962
+ },
963
+ {
964
+ "epoch": 0.5,
965
+ "learning_rate": 4.32708991069531e-06,
966
+ "loss": 0.2577,
967
+ "step": 153
968
+ },
969
+ {
970
+ "epoch": 0.51,
971
+ "learning_rate": 4.318100257471233e-06,
972
+ "loss": 0.2347,
973
+ "step": 154
974
+ },
975
+ {
976
+ "epoch": 0.51,
977
+ "learning_rate": 4.309060426147826e-06,
978
+ "loss": 0.2244,
979
+ "step": 155
980
+ },
981
+ {
982
+ "epoch": 0.51,
983
+ "eval_loss": 0.22840476036071777,
984
+ "eval_runtime": 15.5442,
985
+ "eval_samples_per_second": 6.112,
986
+ "eval_steps_per_second": 1.544,
987
+ "step": 155
988
+ },
989
+ {
990
+ "epoch": 0.51,
991
+ "learning_rate": 4.299970666217135e-06,
992
+ "loss": 0.2197,
993
+ "step": 156
994
+ },
995
+ {
996
+ "epoch": 0.52,
997
+ "learning_rate": 4.290831228549196e-06,
998
+ "loss": 0.2618,
999
+ "step": 157
1000
+ },
1001
+ {
1002
+ "epoch": 0.52,
1003
+ "learning_rate": 4.281642365385111e-06,
1004
+ "loss": 0.2498,
1005
+ "step": 158
1006
+ },
1007
+ {
1008
+ "epoch": 0.52,
1009
+ "learning_rate": 4.272404330330084e-06,
1010
+ "loss": 0.2309,
1011
+ "step": 159
1012
+ },
1013
+ {
1014
+ "epoch": 0.53,
1015
+ "learning_rate": 4.263117378346425e-06,
1016
+ "loss": 0.2163,
1017
+ "step": 160
1018
+ },
1019
+ {
1020
+ "epoch": 0.53,
1021
+ "learning_rate": 4.253781765746511e-06,
1022
+ "loss": 0.1968,
1023
+ "step": 161
1024
+ },
1025
+ {
1026
+ "epoch": 0.53,
1027
+ "learning_rate": 4.244397750185714e-06,
1028
+ "loss": 0.2127,
1029
+ "step": 162
1030
+ },
1031
+ {
1032
+ "epoch": 0.54,
1033
+ "learning_rate": 4.234965590655287e-06,
1034
+ "loss": 0.2088,
1035
+ "step": 163
1036
+ },
1037
+ {
1038
+ "epoch": 0.54,
1039
+ "learning_rate": 4.225485547475217e-06,
1040
+ "loss": 0.2082,
1041
+ "step": 164
1042
+ },
1043
+ {
1044
+ "epoch": 0.54,
1045
+ "learning_rate": 4.215957882287044e-06,
1046
+ "loss": 0.2549,
1047
+ "step": 165
1048
+ },
1049
+ {
1050
+ "epoch": 0.55,
1051
+ "learning_rate": 4.206382858046636e-06,
1052
+ "loss": 0.2131,
1053
+ "step": 166
1054
+ },
1055
+ {
1056
+ "epoch": 0.55,
1057
+ "learning_rate": 4.19676073901693e-06,
1058
+ "loss": 0.2048,
1059
+ "step": 167
1060
+ },
1061
+ {
1062
+ "epoch": 0.55,
1063
+ "learning_rate": 4.187091790760644e-06,
1064
+ "loss": 0.2451,
1065
+ "step": 168
1066
+ },
1067
+ {
1068
+ "epoch": 0.56,
1069
+ "learning_rate": 4.177376280132946e-06,
1070
+ "loss": 0.1896,
1071
+ "step": 169
1072
+ },
1073
+ {
1074
+ "epoch": 0.56,
1075
+ "learning_rate": 4.167614475274082e-06,
1076
+ "loss": 0.2253,
1077
+ "step": 170
1078
+ },
1079
+ {
1080
+ "epoch": 0.56,
1081
+ "learning_rate": 4.1578066456019885e-06,
1082
+ "loss": 0.2167,
1083
+ "step": 171
1084
+ },
1085
+ {
1086
+ "epoch": 0.56,
1087
+ "learning_rate": 4.147953061804845e-06,
1088
+ "loss": 0.2262,
1089
+ "step": 172
1090
+ },
1091
+ {
1092
+ "epoch": 0.57,
1093
+ "learning_rate": 4.1380539958336095e-06,
1094
+ "loss": 0.2483,
1095
+ "step": 173
1096
+ },
1097
+ {
1098
+ "epoch": 0.57,
1099
+ "learning_rate": 4.128109720894512e-06,
1100
+ "loss": 0.2824,
1101
+ "step": 174
1102
+ },
1103
+ {
1104
+ "epoch": 0.57,
1105
+ "learning_rate": 4.118120511441512e-06,
1106
+ "loss": 0.2273,
1107
+ "step": 175
1108
+ },
1109
+ {
1110
+ "epoch": 0.58,
1111
+ "learning_rate": 4.108086643168724e-06,
1112
+ "loss": 0.2177,
1113
+ "step": 176
1114
+ },
1115
+ {
1116
+ "epoch": 0.58,
1117
+ "learning_rate": 4.098008393002816e-06,
1118
+ "loss": 0.2147,
1119
+ "step": 177
1120
+ },
1121
+ {
1122
+ "epoch": 0.58,
1123
+ "learning_rate": 4.087886039095353e-06,
1124
+ "loss": 0.1915,
1125
+ "step": 178
1126
+ },
1127
+ {
1128
+ "epoch": 0.59,
1129
+ "learning_rate": 4.077719860815132e-06,
1130
+ "loss": 0.2123,
1131
+ "step": 179
1132
+ },
1133
+ {
1134
+ "epoch": 0.59,
1135
+ "learning_rate": 4.067510138740467e-06,
1136
+ "loss": 0.2418,
1137
+ "step": 180
1138
+ },
1139
+ {
1140
+ "epoch": 0.59,
1141
+ "learning_rate": 4.057257154651444e-06,
1142
+ "loss": 0.2055,
1143
+ "step": 181
1144
+ },
1145
+ {
1146
+ "epoch": 0.6,
1147
+ "learning_rate": 4.046961191522147e-06,
1148
+ "loss": 0.2098,
1149
+ "step": 182
1150
+ },
1151
+ {
1152
+ "epoch": 0.6,
1153
+ "learning_rate": 4.036622533512845e-06,
1154
+ "loss": 0.2304,
1155
+ "step": 183
1156
+ },
1157
+ {
1158
+ "epoch": 0.6,
1159
+ "learning_rate": 4.026241465962154e-06,
1160
+ "loss": 0.2018,
1161
+ "step": 184
1162
+ },
1163
+ {
1164
+ "epoch": 0.61,
1165
+ "learning_rate": 4.0158182753791566e-06,
1166
+ "loss": 0.2009,
1167
+ "step": 185
1168
+ },
1169
+ {
1170
+ "epoch": 0.61,
1171
+ "learning_rate": 4.0053532494354985e-06,
1172
+ "loss": 0.2227,
1173
+ "step": 186
1174
+ },
1175
+ {
1176
+ "epoch": 0.61,
1177
+ "eval_loss": 0.2260431945323944,
1178
+ "eval_runtime": 15.5286,
1179
+ "eval_samples_per_second": 6.118,
1180
+ "eval_steps_per_second": 1.546,
1181
+ "step": 186
1182
+ },
1183
+ {
1184
+ "epoch": 0.61,
1185
+ "learning_rate": 3.994846676957448e-06,
1186
+ "loss": 0.1973,
1187
+ "step": 187
1188
+ },
1189
+ {
1190
+ "epoch": 0.62,
1191
+ "learning_rate": 3.984298847917923e-06,
1192
+ "loss": 0.2192,
1193
+ "step": 188
1194
+ },
1195
+ {
1196
+ "epoch": 0.62,
1197
+ "learning_rate": 3.973710053428487e-06,
1198
+ "loss": 0.2016,
1199
+ "step": 189
1200
+ },
1201
+ {
1202
+ "epoch": 0.62,
1203
+ "learning_rate": 3.963080585731324e-06,
1204
+ "loss": 0.2118,
1205
+ "step": 190
1206
+ },
1207
+ {
1208
+ "epoch": 0.63,
1209
+ "learning_rate": 3.952410738191158e-06,
1210
+ "loss": 0.2361,
1211
+ "step": 191
1212
+ },
1213
+ {
1214
+ "epoch": 0.63,
1215
+ "learning_rate": 3.941700805287169e-06,
1216
+ "loss": 0.2132,
1217
+ "step": 192
1218
+ },
1219
+ {
1220
+ "epoch": 0.63,
1221
+ "learning_rate": 3.9309510826048556e-06,
1222
+ "loss": 0.2117,
1223
+ "step": 193
1224
+ },
1225
+ {
1226
+ "epoch": 0.64,
1227
+ "learning_rate": 3.92016186682789e-06,
1228
+ "loss": 0.2196,
1229
+ "step": 194
1230
+ },
1231
+ {
1232
+ "epoch": 0.64,
1233
+ "learning_rate": 3.909333455729914e-06,
1234
+ "loss": 0.1958,
1235
+ "step": 195
1236
+ },
1237
+ {
1238
+ "epoch": 0.64,
1239
+ "learning_rate": 3.898466148166333e-06,
1240
+ "loss": 0.2318,
1241
+ "step": 196
1242
+ },
1243
+ {
1244
+ "epoch": 0.65,
1245
+ "learning_rate": 3.8875602440660635e-06,
1246
+ "loss": 0.2034,
1247
+ "step": 197
1248
+ },
1249
+ {
1250
+ "epoch": 0.65,
1251
+ "learning_rate": 3.876616044423253e-06,
1252
+ "loss": 0.2119,
1253
+ "step": 198
1254
+ },
1255
+ {
1256
+ "epoch": 0.65,
1257
+ "learning_rate": 3.865633851288975e-06,
1258
+ "loss": 0.2177,
1259
+ "step": 199
1260
+ },
1261
+ {
1262
+ "epoch": 0.66,
1263
+ "learning_rate": 3.854613967762898e-06,
1264
+ "loss": 0.1974,
1265
+ "step": 200
1266
+ },
1267
+ {
1268
+ "epoch": 0.66,
1269
+ "learning_rate": 3.843556697984907e-06,
1270
+ "loss": 0.1902,
1271
+ "step": 201
1272
+ },
1273
+ {
1274
+ "epoch": 0.66,
1275
+ "learning_rate": 3.832462347126722e-06,
1276
+ "loss": 0.1975,
1277
+ "step": 202
1278
+ },
1279
+ {
1280
+ "epoch": 0.67,
1281
+ "learning_rate": 3.821331221383471e-06,
1282
+ "loss": 0.2083,
1283
+ "step": 203
1284
+ },
1285
+ {
1286
+ "epoch": 0.67,
1287
+ "learning_rate": 3.8101636279652375e-06,
1288
+ "loss": 0.2247,
1289
+ "step": 204
1290
+ },
1291
+ {
1292
+ "epoch": 0.67,
1293
+ "learning_rate": 3.798959875088584e-06,
1294
+ "loss": 0.2169,
1295
+ "step": 205
1296
+ },
1297
+ {
1298
+ "epoch": 0.68,
1299
+ "learning_rate": 3.787720271968046e-06,
1300
+ "loss": 0.1854,
1301
+ "step": 206
1302
+ },
1303
+ {
1304
+ "epoch": 0.68,
1305
+ "learning_rate": 3.7764451288075944e-06,
1306
+ "loss": 0.2269,
1307
+ "step": 207
1308
+ },
1309
+ {
1310
+ "epoch": 0.68,
1311
+ "learning_rate": 3.765134756792079e-06,
1312
+ "loss": 0.1946,
1313
+ "step": 208
1314
+ },
1315
+ {
1316
+ "epoch": 0.69,
1317
+ "learning_rate": 3.753789468078636e-06,
1318
+ "loss": 0.1944,
1319
+ "step": 209
1320
+ },
1321
+ {
1322
+ "epoch": 0.69,
1323
+ "learning_rate": 3.742409575788074e-06,
1324
+ "loss": 0.2057,
1325
+ "step": 210
1326
+ },
1327
+ {
1328
+ "epoch": 0.69,
1329
+ "learning_rate": 3.730995393996234e-06,
1330
+ "loss": 0.2427,
1331
+ "step": 211
1332
+ },
1333
+ {
1334
+ "epoch": 0.7,
1335
+ "learning_rate": 3.719547237725319e-06,
1336
+ "loss": 0.227,
1337
+ "step": 212
1338
+ },
1339
+ {
1340
+ "epoch": 0.7,
1341
+ "learning_rate": 3.708065422935198e-06,
1342
+ "loss": 0.2152,
1343
+ "step": 213
1344
+ },
1345
+ {
1346
+ "epoch": 0.7,
1347
+ "learning_rate": 3.6965502665146916e-06,
1348
+ "loss": 0.2177,
1349
+ "step": 214
1350
+ },
1351
+ {
1352
+ "epoch": 0.71,
1353
+ "learning_rate": 3.6850020862728196e-06,
1354
+ "loss": 0.2089,
1355
+ "step": 215
1356
+ },
1357
+ {
1358
+ "epoch": 0.71,
1359
+ "learning_rate": 3.6734212009300346e-06,
1360
+ "loss": 0.2081,
1361
+ "step": 216
1362
+ },
1363
+ {
1364
+ "epoch": 0.71,
1365
+ "learning_rate": 3.661807930109422e-06,
1366
+ "loss": 0.2167,
1367
+ "step": 217
1368
+ },
1369
+ {
1370
+ "epoch": 0.71,
1371
+ "eval_loss": 0.21709737181663513,
1372
+ "eval_runtime": 15.5146,
1373
+ "eval_samples_per_second": 6.123,
1374
+ "eval_steps_per_second": 1.547,
1375
+ "step": 217
1376
+ },
1377
+ {
1378
+ "epoch": 0.72,
1379
+ "learning_rate": 3.650162594327881e-06,
1380
+ "loss": 0.2135,
1381
+ "step": 218
1382
+ },
1383
+ {
1384
+ "epoch": 0.72,
1385
+ "learning_rate": 3.6384855149872776e-06,
1386
+ "loss": 0.1976,
1387
+ "step": 219
1388
+ },
1389
+ {
1390
+ "epoch": 0.72,
1391
+ "learning_rate": 3.6267770143655743e-06,
1392
+ "loss": 0.2073,
1393
+ "step": 220
1394
+ },
1395
+ {
1396
+ "epoch": 0.73,
1397
+ "learning_rate": 3.615037415607937e-06,
1398
+ "loss": 0.2252,
1399
+ "step": 221
1400
+ },
1401
+ {
1402
+ "epoch": 0.73,
1403
+ "learning_rate": 3.603267042717813e-06,
1404
+ "loss": 0.2296,
1405
+ "step": 222
1406
+ },
1407
+ {
1408
+ "epoch": 0.73,
1409
+ "learning_rate": 3.5914662205479923e-06,
1410
+ "loss": 0.181,
1411
+ "step": 223
1412
+ },
1413
+ {
1414
+ "epoch": 0.74,
1415
+ "learning_rate": 3.579635274791639e-06,
1416
+ "loss": 0.2196,
1417
+ "step": 224
1418
+ },
1419
+ {
1420
+ "epoch": 0.74,
1421
+ "learning_rate": 3.567774531973305e-06,
1422
+ "loss": 0.2063,
1423
+ "step": 225
1424
+ },
1425
+ {
1426
+ "epoch": 0.74,
1427
+ "learning_rate": 3.555884319439917e-06,
1428
+ "loss": 0.2313,
1429
+ "step": 226
1430
+ },
1431
+ {
1432
+ "epoch": 0.75,
1433
+ "learning_rate": 3.5439649653517416e-06,
1434
+ "loss": 0.1994,
1435
+ "step": 227
1436
+ },
1437
+ {
1438
+ "epoch": 0.75,
1439
+ "learning_rate": 3.532016798673329e-06,
1440
+ "loss": 0.2143,
1441
+ "step": 228
1442
+ },
1443
+ {
1444
+ "epoch": 0.75,
1445
+ "learning_rate": 3.5200401491644333e-06,
1446
+ "loss": 0.2118,
1447
+ "step": 229
1448
+ },
1449
+ {
1450
+ "epoch": 0.76,
1451
+ "learning_rate": 3.508035347370912e-06,
1452
+ "loss": 0.2081,
1453
+ "step": 230
1454
+ },
1455
+ {
1456
+ "epoch": 0.76,
1457
+ "learning_rate": 3.4960027246156043e-06,
1458
+ "loss": 0.2313,
1459
+ "step": 231
1460
+ },
1461
+ {
1462
+ "epoch": 0.76,
1463
+ "learning_rate": 3.483942612989183e-06,
1464
+ "loss": 0.2071,
1465
+ "step": 232
1466
+ },
1467
+ {
1468
+ "epoch": 0.77,
1469
+ "learning_rate": 3.471855345340992e-06,
1470
+ "loss": 0.2199,
1471
+ "step": 233
1472
+ },
1473
+ {
1474
+ "epoch": 0.77,
1475
+ "learning_rate": 3.4597412552698617e-06,
1476
+ "loss": 0.1879,
1477
+ "step": 234
1478
+ },
1479
+ {
1480
+ "epoch": 0.77,
1481
+ "learning_rate": 3.447600677114898e-06,
1482
+ "loss": 0.1717,
1483
+ "step": 235
1484
+ },
1485
+ {
1486
+ "epoch": 0.78,
1487
+ "learning_rate": 3.4354339459462556e-06,
1488
+ "loss": 0.2273,
1489
+ "step": 236
1490
+ },
1491
+ {
1492
+ "epoch": 0.78,
1493
+ "learning_rate": 3.423241397555893e-06,
1494
+ "loss": 0.2351,
1495
+ "step": 237
1496
+ },
1497
+ {
1498
+ "epoch": 0.78,
1499
+ "learning_rate": 3.4110233684483033e-06,
1500
+ "loss": 0.2165,
1501
+ "step": 238
1502
+ },
1503
+ {
1504
+ "epoch": 0.78,
1505
+ "learning_rate": 3.3987801958312254e-06,
1506
+ "loss": 0.2309,
1507
+ "step": 239
1508
+ },
1509
+ {
1510
+ "epoch": 0.79,
1511
+ "learning_rate": 3.386512217606339e-06,
1512
+ "loss": 0.2023,
1513
+ "step": 240
1514
+ },
1515
+ {
1516
+ "epoch": 0.79,
1517
+ "learning_rate": 3.3742197723599403e-06,
1518
+ "loss": 0.2189,
1519
+ "step": 241
1520
+ },
1521
+ {
1522
+ "epoch": 0.79,
1523
+ "learning_rate": 3.361903199353593e-06,
1524
+ "loss": 0.205,
1525
+ "step": 242
1526
+ },
1527
+ {
1528
+ "epoch": 0.8,
1529
+ "learning_rate": 3.349562838514769e-06,
1530
+ "loss": 0.1912,
1531
+ "step": 243
1532
+ },
1533
+ {
1534
+ "epoch": 0.8,
1535
+ "learning_rate": 3.3371990304274654e-06,
1536
+ "loss": 0.1898,
1537
+ "step": 244
1538
+ },
1539
+ {
1540
+ "epoch": 0.8,
1541
+ "learning_rate": 3.3248121163228037e-06,
1542
+ "loss": 0.2037,
1543
+ "step": 245
1544
+ },
1545
+ {
1546
+ "epoch": 0.81,
1547
+ "learning_rate": 3.3124024380696134e-06,
1548
+ "loss": 0.1959,
1549
+ "step": 246
1550
+ },
1551
+ {
1552
+ "epoch": 0.81,
1553
+ "learning_rate": 3.299970338164995e-06,
1554
+ "loss": 0.1972,
1555
+ "step": 247
1556
+ },
1557
+ {
1558
+ "epoch": 0.81,
1559
+ "learning_rate": 3.28751615972487e-06,
1560
+ "loss": 0.2098,
1561
+ "step": 248
1562
+ },
1563
+ {
1564
+ "epoch": 0.81,
1565
+ "eval_loss": 0.20815877616405487,
1566
+ "eval_runtime": 15.5231,
1567
+ "eval_samples_per_second": 6.12,
1568
+ "eval_steps_per_second": 1.546,
1569
+ "step": 248
1570
+ },
1571
+ {
1572
+ "epoch": 0.82,
1573
+ "learning_rate": 3.2750402464745084e-06,
1574
+ "loss": 0.2322,
1575
+ "step": 249
1576
+ },
1577
+ {
1578
+ "epoch": 0.82,
1579
+ "learning_rate": 3.262542942739044e-06,
1580
+ "loss": 0.1977,
1581
+ "step": 250
1582
+ },
1583
+ {
1584
+ "epoch": 0.82,
1585
+ "learning_rate": 3.2500245934339714e-06,
1586
+ "loss": 0.1983,
1587
+ "step": 251
1588
+ },
1589
+ {
1590
+ "epoch": 0.83,
1591
+ "learning_rate": 3.2374855440556242e-06,
1592
+ "loss": 0.1857,
1593
+ "step": 252
1594
+ },
1595
+ {
1596
+ "epoch": 0.83,
1597
+ "learning_rate": 3.224926140671643e-06,
1598
+ "loss": 0.183,
1599
+ "step": 253
1600
+ },
1601
+ {
1602
+ "epoch": 0.83,
1603
+ "learning_rate": 3.2123467299114216e-06,
1604
+ "loss": 0.2216,
1605
+ "step": 254
1606
+ },
1607
+ {
1608
+ "epoch": 0.84,
1609
+ "learning_rate": 3.199747658956541e-06,
1610
+ "loss": 0.1808,
1611
+ "step": 255
1612
+ },
1613
+ {
1614
+ "epoch": 0.84,
1615
+ "learning_rate": 3.1871292755311887e-06,
1616
+ "loss": 0.2278,
1617
+ "step": 256
1618
+ },
1619
+ {
1620
+ "epoch": 0.84,
1621
+ "learning_rate": 3.174491927892561e-06,
1622
+ "loss": 0.203,
1623
+ "step": 257
1624
+ },
1625
+ {
1626
+ "epoch": 0.85,
1627
+ "learning_rate": 3.1618359648212492e-06,
1628
+ "loss": 0.2209,
1629
+ "step": 258
1630
+ },
1631
+ {
1632
+ "epoch": 0.85,
1633
+ "learning_rate": 3.1491617356116167e-06,
1634
+ "loss": 0.2131,
1635
+ "step": 259
1636
+ },
1637
+ {
1638
+ "epoch": 0.85,
1639
+ "learning_rate": 3.136469590062158e-06,
1640
+ "loss": 0.2111,
1641
+ "step": 260
1642
+ },
1643
+ {
1644
+ "epoch": 0.86,
1645
+ "learning_rate": 3.1237598784658444e-06,
1646
+ "loss": 0.1915,
1647
+ "step": 261
1648
+ },
1649
+ {
1650
+ "epoch": 0.86,
1651
+ "learning_rate": 3.1110329516004546e-06,
1652
+ "loss": 0.2272,
1653
+ "step": 262
1654
+ },
1655
+ {
1656
+ "epoch": 0.86,
1657
+ "learning_rate": 3.0982891607188948e-06,
1658
+ "loss": 0.1784,
1659
+ "step": 263
1660
+ },
1661
+ {
1662
+ "epoch": 0.87,
1663
+ "learning_rate": 3.085528857539506e-06,
1664
+ "loss": 0.2064,
1665
+ "step": 264
1666
+ },
1667
+ {
1668
+ "epoch": 0.87,
1669
+ "learning_rate": 3.0727523942363547e-06,
1670
+ "loss": 0.1857,
1671
+ "step": 265
1672
+ },
1673
+ {
1674
+ "epoch": 0.87,
1675
+ "learning_rate": 3.0599601234295124e-06,
1676
+ "loss": 0.1981,
1677
+ "step": 266
1678
+ },
1679
+ {
1680
+ "epoch": 0.88,
1681
+ "learning_rate": 3.0471523981753266e-06,
1682
+ "loss": 0.2429,
1683
+ "step": 267
1684
+ },
1685
+ {
1686
+ "epoch": 0.88,
1687
+ "learning_rate": 3.0343295719566747e-06,
1688
+ "loss": 0.1755,
1689
+ "step": 268
1690
+ },
1691
+ {
1692
+ "epoch": 0.88,
1693
+ "learning_rate": 3.0214919986732076e-06,
1694
+ "loss": 0.2024,
1695
+ "step": 269
1696
+ },
1697
+ {
1698
+ "epoch": 0.89,
1699
+ "learning_rate": 3.0086400326315853e-06,
1700
+ "loss": 0.1859,
1701
+ "step": 270
1702
+ },
1703
+ {
1704
+ "epoch": 0.89,
1705
+ "learning_rate": 2.9957740285356933e-06,
1706
+ "loss": 0.2056,
1707
+ "step": 271
1708
+ },
1709
+ {
1710
+ "epoch": 0.89,
1711
+ "learning_rate": 2.9828943414768583e-06,
1712
+ "loss": 0.2183,
1713
+ "step": 272
1714
+ },
1715
+ {
1716
+ "epoch": 0.9,
1717
+ "learning_rate": 2.9700013269240463e-06,
1718
+ "loss": 0.1892,
1719
+ "step": 273
1720
+ },
1721
+ {
1722
+ "epoch": 0.9,
1723
+ "learning_rate": 2.957095340714049e-06,
1724
+ "loss": 0.2139,
1725
+ "step": 274
1726
+ },
1727
+ {
1728
+ "epoch": 0.9,
1729
+ "learning_rate": 2.9441767390416665e-06,
1730
+ "loss": 0.2314,
1731
+ "step": 275
1732
+ },
1733
+ {
1734
+ "epoch": 0.91,
1735
+ "learning_rate": 2.9312458784498763e-06,
1736
+ "loss": 0.1897,
1737
+ "step": 276
1738
+ },
1739
+ {
1740
+ "epoch": 0.91,
1741
+ "learning_rate": 2.918303115819992e-06,
1742
+ "loss": 0.1698,
1743
+ "step": 277
1744
+ },
1745
+ {
1746
+ "epoch": 0.91,
1747
+ "learning_rate": 2.9053488083618118e-06,
1748
+ "loss": 0.2147,
1749
+ "step": 278
1750
+ },
1751
+ {
1752
+ "epoch": 0.92,
1753
+ "learning_rate": 2.892383313603765e-06,
1754
+ "loss": 0.1842,
1755
+ "step": 279
1756
+ },
1757
+ {
1758
+ "epoch": 0.92,
1759
+ "eval_loss": 0.20471937954425812,
1760
+ "eval_runtime": 15.5241,
1761
+ "eval_samples_per_second": 6.12,
1762
+ "eval_steps_per_second": 1.546,
1763
+ "step": 279
1764
+ },
1765
+ {
1766
+ "epoch": 0.92,
1767
+ "learning_rate": 2.8794069893830386e-06,
1768
+ "loss": 0.178,
1769
+ "step": 280
1770
+ },
1771
+ {
1772
+ "epoch": 0.92,
1773
+ "learning_rate": 2.8664201938357052e-06,
1774
+ "loss": 0.2245,
1775
+ "step": 281
1776
+ },
1777
+ {
1778
+ "epoch": 0.93,
1779
+ "learning_rate": 2.8534232853868384e-06,
1780
+ "loss": 0.1583,
1781
+ "step": 282
1782
+ },
1783
+ {
1784
+ "epoch": 0.93,
1785
+ "learning_rate": 2.840416622740617e-06,
1786
+ "loss": 0.1752,
1787
+ "step": 283
1788
+ },
1789
+ {
1790
+ "epoch": 0.93,
1791
+ "learning_rate": 2.8274005648704316e-06,
1792
+ "loss": 0.1981,
1793
+ "step": 284
1794
+ },
1795
+ {
1796
+ "epoch": 0.94,
1797
+ "learning_rate": 2.8143754710089694e-06,
1798
+ "loss": 0.2216,
1799
+ "step": 285
1800
+ },
1801
+ {
1802
+ "epoch": 0.94,
1803
+ "learning_rate": 2.8013417006383078e-06,
1804
+ "loss": 0.2075,
1805
+ "step": 286
1806
+ },
1807
+ {
1808
+ "epoch": 0.94,
1809
+ "learning_rate": 2.7882996134799854e-06,
1810
+ "loss": 0.1978,
1811
+ "step": 287
1812
+ },
1813
+ {
1814
+ "epoch": 0.95,
1815
+ "learning_rate": 2.775249569485079e-06,
1816
+ "loss": 0.2302,
1817
+ "step": 288
1818
+ },
1819
+ {
1820
+ "epoch": 0.95,
1821
+ "learning_rate": 2.762191928824267e-06,
1822
+ "loss": 0.2218,
1823
+ "step": 289
1824
+ },
1825
+ {
1826
+ "epoch": 0.95,
1827
+ "learning_rate": 2.7491270518778913e-06,
1828
+ "loss": 0.2033,
1829
+ "step": 290
1830
+ },
1831
+ {
1832
+ "epoch": 0.96,
1833
+ "learning_rate": 2.736055299226007e-06,
1834
+ "loss": 0.2055,
1835
+ "step": 291
1836
+ },
1837
+ {
1838
+ "epoch": 0.96,
1839
+ "learning_rate": 2.722977031638435e-06,
1840
+ "loss": 0.2231,
1841
+ "step": 292
1842
+ },
1843
+ {
1844
+ "epoch": 0.96,
1845
+ "learning_rate": 2.709892610064801e-06,
1846
+ "loss": 0.2233,
1847
+ "step": 293
1848
+ },
1849
+ {
1850
+ "epoch": 0.97,
1851
+ "learning_rate": 2.696802395624579e-06,
1852
+ "loss": 0.2354,
1853
+ "step": 294
1854
+ },
1855
+ {
1856
+ "epoch": 0.97,
1857
+ "learning_rate": 2.683706749597118e-06,
1858
+ "loss": 0.2136,
1859
+ "step": 295
1860
+ },
1861
+ {
1862
+ "epoch": 0.97,
1863
+ "learning_rate": 2.670606033411678e-06,
1864
+ "loss": 0.181,
1865
+ "step": 296
1866
+ },
1867
+ {
1868
+ "epoch": 0.98,
1869
+ "learning_rate": 2.657500608637448e-06,
1870
+ "loss": 0.2051,
1871
+ "step": 297
1872
+ },
1873
+ {
1874
+ "epoch": 0.98,
1875
+ "learning_rate": 2.6443908369735715e-06,
1876
+ "loss": 0.209,
1877
+ "step": 298
1878
+ },
1879
+ {
1880
+ "epoch": 0.98,
1881
+ "learning_rate": 2.631277080239163e-06,
1882
+ "loss": 0.2184,
1883
+ "step": 299
1884
+ },
1885
+ {
1886
+ "epoch": 0.99,
1887
+ "learning_rate": 2.6181597003633218e-06,
1888
+ "loss": 0.1988,
1889
+ "step": 300
1890
+ },
1891
+ {
1892
+ "epoch": 0.99,
1893
+ "learning_rate": 2.605039059375143e-06,
1894
+ "loss": 0.2223,
1895
+ "step": 301
1896
+ },
1897
+ {
1898
+ "epoch": 0.99,
1899
+ "learning_rate": 2.5919155193937244e-06,
1900
+ "loss": 0.2029,
1901
+ "step": 302
1902
+ },
1903
+ {
1904
+ "epoch": 1.0,
1905
+ "learning_rate": 2.578789442618176e-06,
1906
+ "loss": 0.2089,
1907
+ "step": 303
1908
+ },
1909
+ {
1910
+ "epoch": 1.0,
1911
+ "learning_rate": 2.565661191317618e-06,
1912
+ "loss": 0.2065,
1913
+ "step": 304
1914
+ },
1915
+ {
1916
+ "epoch": 1.0,
1917
+ "learning_rate": 2.5525311278211888e-06,
1918
+ "loss": 0.1977,
1919
+ "step": 305
1920
+ },
1921
+ {
1922
+ "epoch": 1.0,
1923
+ "learning_rate": 2.5393996145080413e-06,
1924
+ "loss": 0.1748,
1925
+ "step": 306
1926
+ },
1927
+ {
1928
+ "epoch": 1.01,
1929
+ "learning_rate": 2.5262670137973413e-06,
1930
+ "loss": 0.2128,
1931
+ "step": 307
1932
+ },
1933
+ {
1934
+ "epoch": 1.01,
1935
+ "learning_rate": 2.5131336881382658e-06,
1936
+ "loss": 0.1876,
1937
+ "step": 308
1938
+ },
1939
+ {
1940
+ "epoch": 1.01,
1941
+ "learning_rate": 2.5e-06,
1942
+ "loss": 0.1947,
1943
+ "step": 309
1944
+ },
1945
+ {
1946
+ "epoch": 1.02,
1947
+ "learning_rate": 2.4868663118617355e-06,
1948
+ "loss": 0.1917,
1949
+ "step": 310
1950
+ },
1951
+ {
1952
+ "epoch": 1.02,
1953
+ "eval_loss": 0.2012825459241867,
1954
+ "eval_runtime": 15.4963,
1955
+ "eval_samples_per_second": 6.131,
1956
+ "eval_steps_per_second": 1.549,
1957
+ "step": 310
1958
+ },
1959
+ {
1960
+ "epoch": 1.02,
1961
+ "learning_rate": 2.47373298620266e-06,
1962
+ "loss": 0.1904,
1963
+ "step": 311
1964
+ },
1965
+ {
1966
+ "epoch": 1.02,
1967
+ "learning_rate": 2.4606003854919595e-06,
1968
+ "loss": 0.1792,
1969
+ "step": 312
1970
+ },
1971
+ {
1972
+ "epoch": 1.03,
1973
+ "learning_rate": 2.4474688721788116e-06,
1974
+ "loss": 0.1886,
1975
+ "step": 313
1976
+ },
1977
+ {
1978
+ "epoch": 1.03,
1979
+ "learning_rate": 2.4343388086823828e-06,
1980
+ "loss": 0.201,
1981
+ "step": 314
1982
+ },
1983
+ {
1984
+ "epoch": 1.03,
1985
+ "learning_rate": 2.421210557381825e-06,
1986
+ "loss": 0.1776,
1987
+ "step": 315
1988
+ },
1989
+ {
1990
+ "epoch": 1.04,
1991
+ "learning_rate": 2.4080844806062764e-06,
1992
+ "loss": 0.1957,
1993
+ "step": 316
1994
+ },
1995
+ {
1996
+ "epoch": 1.04,
1997
+ "learning_rate": 2.3949609406248576e-06,
1998
+ "loss": 0.1744,
1999
+ "step": 317
2000
+ },
2001
+ {
2002
+ "epoch": 1.04,
2003
+ "learning_rate": 2.3818402996366786e-06,
2004
+ "loss": 0.2017,
2005
+ "step": 318
2006
+ },
2007
+ {
2008
+ "epoch": 1.05,
2009
+ "learning_rate": 2.3687229197608373e-06,
2010
+ "loss": 0.1586,
2011
+ "step": 319
2012
+ },
2013
+ {
2014
+ "epoch": 1.05,
2015
+ "learning_rate": 2.3556091630264294e-06,
2016
+ "loss": 0.1481,
2017
+ "step": 320
2018
+ },
2019
+ {
2020
+ "epoch": 1.05,
2021
+ "learning_rate": 2.3424993913625534e-06,
2022
+ "loss": 0.1696,
2023
+ "step": 321
2024
+ },
2025
+ {
2026
+ "epoch": 1.06,
2027
+ "learning_rate": 2.3293939665883233e-06,
2028
+ "loss": 0.1974,
2029
+ "step": 322
2030
+ },
2031
+ {
2032
+ "epoch": 1.06,
2033
+ "learning_rate": 2.3162932504028828e-06,
2034
+ "loss": 0.1698,
2035
+ "step": 323
2036
+ },
2037
+ {
2038
+ "epoch": 1.06,
2039
+ "learning_rate": 2.303197604375422e-06,
2040
+ "loss": 0.184,
2041
+ "step": 324
2042
+ },
2043
+ {
2044
+ "epoch": 1.07,
2045
+ "learning_rate": 2.2901073899351997e-06,
2046
+ "loss": 0.1806,
2047
+ "step": 325
2048
+ },
2049
+ {
2050
+ "epoch": 1.07,
2051
+ "learning_rate": 2.277022968361566e-06,
2052
+ "loss": 0.1741,
2053
+ "step": 326
2054
+ },
2055
+ {
2056
+ "epoch": 1.07,
2057
+ "learning_rate": 2.2639447007739933e-06,
2058
+ "loss": 0.1906,
2059
+ "step": 327
2060
+ },
2061
+ {
2062
+ "epoch": 1.08,
2063
+ "learning_rate": 2.2508729481221096e-06,
2064
+ "loss": 0.19,
2065
+ "step": 328
2066
+ },
2067
+ {
2068
+ "epoch": 1.08,
2069
+ "learning_rate": 2.2378080711757332e-06,
2070
+ "loss": 0.1954,
2071
+ "step": 329
2072
+ },
2073
+ {
2074
+ "epoch": 1.08,
2075
+ "learning_rate": 2.2247504305149217e-06,
2076
+ "loss": 0.1755,
2077
+ "step": 330
2078
+ },
2079
+ {
2080
+ "epoch": 1.09,
2081
+ "learning_rate": 2.2117003865200154e-06,
2082
+ "loss": 0.1932,
2083
+ "step": 331
2084
+ },
2085
+ {
2086
+ "epoch": 1.09,
2087
+ "learning_rate": 2.1986582993616926e-06,
2088
+ "loss": 0.1795,
2089
+ "step": 332
2090
+ },
2091
+ {
2092
+ "epoch": 1.09,
2093
+ "learning_rate": 2.185624528991031e-06,
2094
+ "loss": 0.1586,
2095
+ "step": 333
2096
+ },
2097
+ {
2098
+ "epoch": 1.1,
2099
+ "learning_rate": 2.1725994351295697e-06,
2100
+ "loss": 0.1746,
2101
+ "step": 334
2102
+ },
2103
+ {
2104
+ "epoch": 1.1,
2105
+ "learning_rate": 2.159583377259384e-06,
2106
+ "loss": 0.1744,
2107
+ "step": 335
2108
+ },
2109
+ {
2110
+ "epoch": 1.1,
2111
+ "learning_rate": 2.1465767146131633e-06,
2112
+ "loss": 0.1553,
2113
+ "step": 336
2114
+ },
2115
+ {
2116
+ "epoch": 1.11,
2117
+ "learning_rate": 2.1335798061642956e-06,
2118
+ "loss": 0.1589,
2119
+ "step": 337
2120
+ },
2121
+ {
2122
+ "epoch": 1.11,
2123
+ "learning_rate": 2.1205930106169626e-06,
2124
+ "loss": 0.1624,
2125
+ "step": 338
2126
+ },
2127
+ {
2128
+ "epoch": 1.11,
2129
+ "learning_rate": 2.1076166863962358e-06,
2130
+ "loss": 0.1823,
2131
+ "step": 339
2132
+ },
2133
+ {
2134
+ "epoch": 1.12,
2135
+ "learning_rate": 2.094651191638189e-06,
2136
+ "loss": 0.1658,
2137
+ "step": 340
2138
+ },
2139
+ {
2140
+ "epoch": 1.12,
2141
+ "learning_rate": 2.0816968841800094e-06,
2142
+ "loss": 0.1639,
2143
+ "step": 341
2144
+ },
2145
+ {
2146
+ "epoch": 1.12,
2147
+ "eval_loss": 0.19820626080036163,
2148
+ "eval_runtime": 15.5045,
2149
+ "eval_samples_per_second": 6.127,
2150
+ "eval_steps_per_second": 1.548,
2151
+ "step": 341
2152
+ },
2153
+ {
2154
+ "epoch": 1.12,
2155
+ "learning_rate": 2.0687541215501245e-06,
2156
+ "loss": 0.1776,
2157
+ "step": 342
2158
+ },
2159
+ {
2160
+ "epoch": 1.13,
2161
+ "learning_rate": 2.0558232609583343e-06,
2162
+ "loss": 0.1674,
2163
+ "step": 343
2164
+ },
2165
+ {
2166
+ "epoch": 1.13,
2167
+ "learning_rate": 2.0429046592859524e-06,
2168
+ "loss": 0.2104,
2169
+ "step": 344
2170
+ },
2171
+ {
2172
+ "epoch": 1.13,
2173
+ "learning_rate": 2.0299986730759553e-06,
2174
+ "loss": 0.1598,
2175
+ "step": 345
2176
+ },
2177
+ {
2178
+ "epoch": 1.14,
2179
+ "learning_rate": 2.0171056585231425e-06,
2180
+ "loss": 0.1741,
2181
+ "step": 346
2182
+ },
2183
+ {
2184
+ "epoch": 1.14,
2185
+ "learning_rate": 2.004225971464308e-06,
2186
+ "loss": 0.1508,
2187
+ "step": 347
2188
+ },
2189
+ {
2190
+ "epoch": 1.14,
2191
+ "learning_rate": 1.991359967368416e-06,
2192
+ "loss": 0.1735,
2193
+ "step": 348
2194
+ },
2195
+ {
2196
+ "epoch": 1.15,
2197
+ "learning_rate": 1.9785080013267933e-06,
2198
+ "loss": 0.1834,
2199
+ "step": 349
2200
+ },
2201
+ {
2202
+ "epoch": 1.15,
2203
+ "learning_rate": 1.965670428043326e-06,
2204
+ "loss": 0.1538,
2205
+ "step": 350
2206
+ },
2207
+ {
2208
+ "epoch": 1.15,
2209
+ "learning_rate": 1.952847601824674e-06,
2210
+ "loss": 0.1905,
2211
+ "step": 351
2212
+ },
2213
+ {
2214
+ "epoch": 1.16,
2215
+ "learning_rate": 1.940039876570489e-06,
2216
+ "loss": 0.1745,
2217
+ "step": 352
2218
+ },
2219
+ {
2220
+ "epoch": 1.16,
2221
+ "learning_rate": 1.927247605763647e-06,
2222
+ "loss": 0.1847,
2223
+ "step": 353
2224
+ },
2225
+ {
2226
+ "epoch": 1.16,
2227
+ "learning_rate": 1.914471142460495e-06,
2228
+ "loss": 0.1672,
2229
+ "step": 354
2230
+ },
2231
+ {
2232
+ "epoch": 1.17,
2233
+ "learning_rate": 1.9017108392811065e-06,
2234
+ "loss": 0.1881,
2235
+ "step": 355
2236
+ },
2237
+ {
2238
+ "epoch": 1.17,
2239
+ "learning_rate": 1.888967048399547e-06,
2240
+ "loss": 0.1712,
2241
+ "step": 356
2242
+ },
2243
+ {
2244
+ "epoch": 1.17,
2245
+ "learning_rate": 1.8762401215341569e-06,
2246
+ "loss": 0.1437,
2247
+ "step": 357
2248
+ },
2249
+ {
2250
+ "epoch": 1.18,
2251
+ "learning_rate": 1.8635304099378426e-06,
2252
+ "loss": 0.1589,
2253
+ "step": 358
2254
+ },
2255
+ {
2256
+ "epoch": 1.18,
2257
+ "learning_rate": 1.8508382643883837e-06,
2258
+ "loss": 0.192,
2259
+ "step": 359
2260
+ },
2261
+ {
2262
+ "epoch": 1.18,
2263
+ "learning_rate": 1.8381640351787516e-06,
2264
+ "loss": 0.1691,
2265
+ "step": 360
2266
+ },
2267
+ {
2268
+ "epoch": 1.19,
2269
+ "learning_rate": 1.8255080721074391e-06,
2270
+ "loss": 0.15,
2271
+ "step": 361
2272
+ },
2273
+ {
2274
+ "epoch": 1.19,
2275
+ "learning_rate": 1.8128707244688109e-06,
2276
+ "loss": 0.166,
2277
+ "step": 362
2278
+ },
2279
+ {
2280
+ "epoch": 1.19,
2281
+ "learning_rate": 1.800252341043459e-06,
2282
+ "loss": 0.1661,
2283
+ "step": 363
2284
+ },
2285
+ {
2286
+ "epoch": 1.2,
2287
+ "learning_rate": 1.7876532700885788e-06,
2288
+ "loss": 0.192,
2289
+ "step": 364
2290
+ },
2291
+ {
2292
+ "epoch": 1.2,
2293
+ "learning_rate": 1.7750738593283573e-06,
2294
+ "loss": 0.1597,
2295
+ "step": 365
2296
+ },
2297
+ {
2298
+ "epoch": 1.2,
2299
+ "learning_rate": 1.7625144559443758e-06,
2300
+ "loss": 0.1562,
2301
+ "step": 366
2302
+ },
2303
+ {
2304
+ "epoch": 1.21,
2305
+ "learning_rate": 1.7499754065660288e-06,
2306
+ "loss": 0.1527,
2307
+ "step": 367
2308
+ },
2309
+ {
2310
+ "epoch": 1.21,
2311
+ "learning_rate": 1.7374570572609559e-06,
2312
+ "loss": 0.1631,
2313
+ "step": 368
2314
+ },
2315
+ {
2316
+ "epoch": 1.21,
2317
+ "learning_rate": 1.7249597535254916e-06,
2318
+ "loss": 0.1552,
2319
+ "step": 369
2320
+ },
2321
+ {
2322
+ "epoch": 1.22,
2323
+ "learning_rate": 1.7124838402751304e-06,
2324
+ "loss": 0.1469,
2325
+ "step": 370
2326
+ },
2327
+ {
2328
+ "epoch": 1.22,
2329
+ "learning_rate": 1.7000296618350054e-06,
2330
+ "loss": 0.1476,
2331
+ "step": 371
2332
+ },
2333
+ {
2334
+ "epoch": 1.22,
2335
+ "learning_rate": 1.6875975619303872e-06,
2336
+ "loss": 0.1835,
2337
+ "step": 372
2338
+ },
2339
+ {
2340
+ "epoch": 1.22,
2341
+ "eval_loss": 0.19680581986904144,
2342
+ "eval_runtime": 15.5171,
2343
+ "eval_samples_per_second": 6.122,
2344
+ "eval_steps_per_second": 1.547,
2345
+ "step": 372
2346
+ },
2347
+ {
2348
+ "epoch": 1.22,
2349
+ "learning_rate": 1.6751878836771965e-06,
2350
+ "loss": 0.177,
2351
+ "step": 373
2352
+ },
2353
+ {
2354
+ "epoch": 1.23,
2355
+ "learning_rate": 1.6628009695725348e-06,
2356
+ "loss": 0.1641,
2357
+ "step": 374
2358
+ },
2359
+ {
2360
+ "epoch": 1.23,
2361
+ "learning_rate": 1.650437161485231e-06,
2362
+ "loss": 0.1827,
2363
+ "step": 375
2364
+ },
2365
+ {
2366
+ "epoch": 1.23,
2367
+ "learning_rate": 1.6380968006464073e-06,
2368
+ "loss": 0.1648,
2369
+ "step": 376
2370
+ },
2371
+ {
2372
+ "epoch": 1.24,
2373
+ "learning_rate": 1.6257802276400604e-06,
2374
+ "loss": 0.152,
2375
+ "step": 377
2376
+ },
2377
+ {
2378
+ "epoch": 1.24,
2379
+ "learning_rate": 1.613487782393661e-06,
2380
+ "loss": 0.157,
2381
+ "step": 378
2382
+ },
2383
+ {
2384
+ "epoch": 1.24,
2385
+ "learning_rate": 1.6012198041687748e-06,
2386
+ "loss": 0.166,
2387
+ "step": 379
2388
+ },
2389
+ {
2390
+ "epoch": 1.25,
2391
+ "learning_rate": 1.588976631551697e-06,
2392
+ "loss": 0.1634,
2393
+ "step": 380
2394
+ },
2395
+ {
2396
+ "epoch": 1.25,
2397
+ "learning_rate": 1.5767586024441066e-06,
2398
+ "loss": 0.1697,
2399
+ "step": 381
2400
+ },
2401
+ {
2402
+ "epoch": 1.25,
2403
+ "learning_rate": 1.5645660540537444e-06,
2404
+ "loss": 0.157,
2405
+ "step": 382
2406
+ },
2407
+ {
2408
+ "epoch": 1.26,
2409
+ "learning_rate": 1.552399322885103e-06,
2410
+ "loss": 0.1533,
2411
+ "step": 383
2412
+ },
2413
+ {
2414
+ "epoch": 1.26,
2415
+ "learning_rate": 1.5402587447301387e-06,
2416
+ "loss": 0.1762,
2417
+ "step": 384
2418
+ },
2419
+ {
2420
+ "epoch": 1.26,
2421
+ "learning_rate": 1.5281446546590084e-06,
2422
+ "loss": 0.1389,
2423
+ "step": 385
2424
+ },
2425
+ {
2426
+ "epoch": 1.27,
2427
+ "learning_rate": 1.516057387010818e-06,
2428
+ "loss": 0.1616,
2429
+ "step": 386
2430
+ },
2431
+ {
2432
+ "epoch": 1.27,
2433
+ "learning_rate": 1.5039972753843966e-06,
2434
+ "loss": 0.1534,
2435
+ "step": 387
2436
+ },
2437
+ {
2438
+ "epoch": 1.27,
2439
+ "learning_rate": 1.4919646526290884e-06,
2440
+ "loss": 0.1617,
2441
+ "step": 388
2442
+ },
2443
+ {
2444
+ "epoch": 1.28,
2445
+ "learning_rate": 1.4799598508355678e-06,
2446
+ "loss": 0.1318,
2447
+ "step": 389
2448
+ },
2449
+ {
2450
+ "epoch": 1.28,
2451
+ "learning_rate": 1.4679832013266721e-06,
2452
+ "loss": 0.1536,
2453
+ "step": 390
2454
+ },
2455
+ {
2456
+ "epoch": 1.28,
2457
+ "learning_rate": 1.4560350346482599e-06,
2458
+ "loss": 0.1957,
2459
+ "step": 391
2460
+ },
2461
+ {
2462
+ "epoch": 1.29,
2463
+ "learning_rate": 1.4441156805600842e-06,
2464
+ "loss": 0.1608,
2465
+ "step": 392
2466
+ },
2467
+ {
2468
+ "epoch": 1.29,
2469
+ "learning_rate": 1.4322254680266962e-06,
2470
+ "loss": 0.1706,
2471
+ "step": 393
2472
+ },
2473
+ {
2474
+ "epoch": 1.29,
2475
+ "learning_rate": 1.4203647252083619e-06,
2476
+ "loss": 0.1256,
2477
+ "step": 394
2478
+ },
2479
+ {
2480
+ "epoch": 1.3,
2481
+ "learning_rate": 1.4085337794520087e-06,
2482
+ "loss": 0.1561,
2483
+ "step": 395
2484
+ },
2485
+ {
2486
+ "epoch": 1.3,
2487
+ "learning_rate": 1.3967329572821875e-06,
2488
+ "loss": 0.1634,
2489
+ "step": 396
2490
+ },
2491
+ {
2492
+ "epoch": 1.3,
2493
+ "learning_rate": 1.3849625843920633e-06,
2494
+ "loss": 0.1753,
2495
+ "step": 397
2496
+ },
2497
+ {
2498
+ "epoch": 1.31,
2499
+ "learning_rate": 1.3732229856344259e-06,
2500
+ "loss": 0.1589,
2501
+ "step": 398
2502
+ },
2503
+ {
2504
+ "epoch": 1.31,
2505
+ "learning_rate": 1.3615144850127232e-06,
2506
+ "loss": 0.1764,
2507
+ "step": 399
2508
+ },
2509
+ {
2510
+ "epoch": 1.31,
2511
+ "learning_rate": 1.3498374056721198e-06,
2512
+ "loss": 0.1534,
2513
+ "step": 400
2514
+ },
2515
+ {
2516
+ "epoch": 1.32,
2517
+ "learning_rate": 1.3381920698905788e-06,
2518
+ "loss": 0.159,
2519
+ "step": 401
2520
+ },
2521
+ {
2522
+ "epoch": 1.32,
2523
+ "learning_rate": 1.326578799069966e-06,
2524
+ "loss": 0.152,
2525
+ "step": 402
2526
+ },
2527
+ {
2528
+ "epoch": 1.32,
2529
+ "learning_rate": 1.3149979137271806e-06,
2530
+ "loss": 0.1666,
2531
+ "step": 403
2532
+ },
2533
+ {
2534
+ "epoch": 1.32,
2535
+ "eval_loss": 0.19529137015342712,
2536
+ "eval_runtime": 15.5082,
2537
+ "eval_samples_per_second": 6.126,
2538
+ "eval_steps_per_second": 1.548,
2539
+ "step": 403
2540
+ },
2541
+ {
2542
+ "epoch": 1.33,
2543
+ "learning_rate": 1.3034497334853092e-06,
2544
+ "loss": 0.1778,
2545
+ "step": 404
2546
+ },
2547
+ {
2548
+ "epoch": 1.33,
2549
+ "learning_rate": 1.2919345770648023e-06,
2550
+ "loss": 0.1733,
2551
+ "step": 405
2552
+ },
2553
+ {
2554
+ "epoch": 1.33,
2555
+ "learning_rate": 1.280452762274682e-06,
2556
+ "loss": 0.1577,
2557
+ "step": 406
2558
+ },
2559
+ {
2560
+ "epoch": 1.34,
2561
+ "learning_rate": 1.2690046060037661e-06,
2562
+ "loss": 0.1548,
2563
+ "step": 407
2564
+ },
2565
+ {
2566
+ "epoch": 1.34,
2567
+ "learning_rate": 1.2575904242119264e-06,
2568
+ "loss": 0.1437,
2569
+ "step": 408
2570
+ },
2571
+ {
2572
+ "epoch": 1.34,
2573
+ "learning_rate": 1.2462105319213643e-06,
2574
+ "loss": 0.1652,
2575
+ "step": 409
2576
+ },
2577
+ {
2578
+ "epoch": 1.35,
2579
+ "learning_rate": 1.234865243207921e-06,
2580
+ "loss": 0.167,
2581
+ "step": 410
2582
+ },
2583
+ {
2584
+ "epoch": 1.35,
2585
+ "learning_rate": 1.2235548711924056e-06,
2586
+ "loss": 0.1693,
2587
+ "step": 411
2588
+ },
2589
+ {
2590
+ "epoch": 1.35,
2591
+ "learning_rate": 1.2122797280319543e-06,
2592
+ "loss": 0.1714,
2593
+ "step": 412
2594
+ },
2595
+ {
2596
+ "epoch": 1.36,
2597
+ "learning_rate": 1.2010401249114166e-06,
2598
+ "loss": 0.1752,
2599
+ "step": 413
2600
+ },
2601
+ {
2602
+ "epoch": 1.36,
2603
+ "learning_rate": 1.1898363720347635e-06,
2604
+ "loss": 0.1439,
2605
+ "step": 414
2606
+ },
2607
+ {
2608
+ "epoch": 1.36,
2609
+ "learning_rate": 1.1786687786165302e-06,
2610
+ "loss": 0.1663,
2611
+ "step": 415
2612
+ },
2613
+ {
2614
+ "epoch": 1.37,
2615
+ "learning_rate": 1.167537652873279e-06,
2616
+ "loss": 0.1644,
2617
+ "step": 416
2618
+ },
2619
+ {
2620
+ "epoch": 1.37,
2621
+ "learning_rate": 1.1564433020150946e-06,
2622
+ "loss": 0.1385,
2623
+ "step": 417
2624
+ },
2625
+ {
2626
+ "epoch": 1.37,
2627
+ "learning_rate": 1.1453860322371032e-06,
2628
+ "loss": 0.1617,
2629
+ "step": 418
2630
+ },
2631
+ {
2632
+ "epoch": 1.38,
2633
+ "learning_rate": 1.134366148711025e-06,
2634
+ "loss": 0.1725,
2635
+ "step": 419
2636
+ },
2637
+ {
2638
+ "epoch": 1.38,
2639
+ "learning_rate": 1.1233839555767482e-06,
2640
+ "loss": 0.1487,
2641
+ "step": 420
2642
+ },
2643
+ {
2644
+ "epoch": 1.38,
2645
+ "learning_rate": 1.1124397559339373e-06,
2646
+ "loss": 0.1636,
2647
+ "step": 421
2648
+ },
2649
+ {
2650
+ "epoch": 1.39,
2651
+ "learning_rate": 1.1015338518336672e-06,
2652
+ "loss": 0.1607,
2653
+ "step": 422
2654
+ },
2655
+ {
2656
+ "epoch": 1.39,
2657
+ "learning_rate": 1.0906665442700868e-06,
2658
+ "loss": 0.2252,
2659
+ "step": 423
2660
+ },
2661
+ {
2662
+ "epoch": 1.39,
2663
+ "learning_rate": 1.079838133172111e-06,
2664
+ "loss": 0.1901,
2665
+ "step": 424
2666
+ },
2667
+ {
2668
+ "epoch": 1.4,
2669
+ "learning_rate": 1.0690489173951446e-06,
2670
+ "loss": 0.1576,
2671
+ "step": 425
2672
+ },
2673
+ {
2674
+ "epoch": 1.4,
2675
+ "learning_rate": 1.0582991947128324e-06,
2676
+ "loss": 0.1734,
2677
+ "step": 426
2678
+ },
2679
+ {
2680
+ "epoch": 1.4,
2681
+ "learning_rate": 1.0475892618088426e-06,
2682
+ "loss": 0.1654,
2683
+ "step": 427
2684
+ },
2685
+ {
2686
+ "epoch": 1.41,
2687
+ "learning_rate": 1.0369194142686766e-06,
2688
+ "loss": 0.1608,
2689
+ "step": 428
2690
+ },
2691
+ {
2692
+ "epoch": 1.41,
2693
+ "learning_rate": 1.0262899465715128e-06,
2694
+ "loss": 0.18,
2695
+ "step": 429
2696
+ },
2697
+ {
2698
+ "epoch": 1.41,
2699
+ "learning_rate": 1.0157011520820784e-06,
2700
+ "loss": 0.1379,
2701
+ "step": 430
2702
+ },
2703
+ {
2704
+ "epoch": 1.42,
2705
+ "learning_rate": 1.0051533230425527e-06,
2706
+ "loss": 0.1584,
2707
+ "step": 431
2708
+ },
2709
+ {
2710
+ "epoch": 1.42,
2711
+ "learning_rate": 9.946467505645019e-07,
2712
+ "loss": 0.1488,
2713
+ "step": 432
2714
+ },
2715
+ {
2716
+ "epoch": 1.42,
2717
+ "learning_rate": 9.84181724620844e-07,
2718
+ "loss": 0.1863,
2719
+ "step": 433
2720
+ },
2721
+ {
2722
+ "epoch": 1.43,
2723
+ "learning_rate": 9.73758534037847e-07,
2724
+ "loss": 0.1694,
2725
+ "step": 434
2726
+ },
2727
+ {
2728
+ "epoch": 1.43,
2729
+ "eval_loss": 0.19318194687366486,
2730
+ "eval_runtime": 15.505,
2731
+ "eval_samples_per_second": 6.127,
2732
+ "eval_steps_per_second": 1.548,
2733
+ "step": 434
2734
+ },
2735
+ {
2736
+ "epoch": 1.43,
2737
+ "learning_rate": 9.633774664871557e-07,
2738
+ "loss": 0.1826,
2739
+ "step": 435
2740
+ },
2741
+ {
2742
+ "epoch": 1.43,
2743
+ "learning_rate": 9.530388084778541e-07,
2744
+ "loss": 0.1892,
2745
+ "step": 436
2746
+ },
2747
+ {
2748
+ "epoch": 1.44,
2749
+ "learning_rate": 9.427428453485573e-07,
2750
+ "loss": 0.1377,
2751
+ "step": 437
2752
+ },
2753
+ {
2754
+ "epoch": 1.44,
2755
+ "learning_rate": 9.32489861259534e-07,
2756
+ "loss": 0.1576,
2757
+ "step": 438
2758
+ },
2759
+ {
2760
+ "epoch": 1.44,
2761
+ "learning_rate": 9.222801391848688e-07,
2762
+ "loss": 0.1988,
2763
+ "step": 439
2764
+ },
2765
+ {
2766
+ "epoch": 1.44,
2767
+ "learning_rate": 9.121139609046484e-07,
2768
+ "loss": 0.1426,
2769
+ "step": 440
2770
+ },
2771
+ {
2772
+ "epoch": 1.45,
2773
+ "learning_rate": 9.019916069971857e-07,
2774
+ "loss": 0.1467,
2775
+ "step": 441
2776
+ },
2777
+ {
2778
+ "epoch": 1.45,
2779
+ "learning_rate": 8.919133568312768e-07,
2780
+ "loss": 0.1558,
2781
+ "step": 442
2782
+ },
2783
+ {
2784
+ "epoch": 1.45,
2785
+ "learning_rate": 8.818794885584902e-07,
2786
+ "loss": 0.1593,
2787
+ "step": 443
2788
+ },
2789
+ {
2790
+ "epoch": 1.46,
2791
+ "learning_rate": 8.718902791054895e-07,
2792
+ "loss": 0.1443,
2793
+ "step": 444
2794
+ },
2795
+ {
2796
+ "epoch": 1.46,
2797
+ "learning_rate": 8.619460041663915e-07,
2798
+ "loss": 0.1779,
2799
+ "step": 445
2800
+ },
2801
+ {
2802
+ "epoch": 1.46,
2803
+ "learning_rate": 8.52046938195156e-07,
2804
+ "loss": 0.1454,
2805
+ "step": 446
2806
+ },
2807
+ {
2808
+ "epoch": 1.47,
2809
+ "learning_rate": 8.421933543980126e-07,
2810
+ "loss": 0.1607,
2811
+ "step": 447
2812
+ },
2813
+ {
2814
+ "epoch": 1.47,
2815
+ "learning_rate": 8.323855247259185e-07,
2816
+ "loss": 0.1558,
2817
+ "step": 448
2818
+ },
2819
+ {
2820
+ "epoch": 1.47,
2821
+ "learning_rate": 8.226237198670556e-07,
2822
+ "loss": 0.1748,
2823
+ "step": 449
2824
+ },
2825
+ {
2826
+ "epoch": 1.48,
2827
+ "learning_rate": 8.129082092393562e-07,
2828
+ "loss": 0.1754,
2829
+ "step": 450
2830
+ },
2831
+ {
2832
+ "epoch": 1.48,
2833
+ "learning_rate": 8.032392609830708e-07,
2834
+ "loss": 0.1418,
2835
+ "step": 451
2836
+ },
2837
+ {
2838
+ "epoch": 1.48,
2839
+ "learning_rate": 7.936171419533653e-07,
2840
+ "loss": 0.1553,
2841
+ "step": 452
2842
+ },
2843
+ {
2844
+ "epoch": 1.49,
2845
+ "learning_rate": 7.840421177129564e-07,
2846
+ "loss": 0.1576,
2847
+ "step": 453
2848
+ },
2849
+ {
2850
+ "epoch": 1.49,
2851
+ "learning_rate": 7.745144525247839e-07,
2852
+ "loss": 0.1405,
2853
+ "step": 454
2854
+ },
2855
+ {
2856
+ "epoch": 1.49,
2857
+ "learning_rate": 7.650344093447145e-07,
2858
+ "loss": 0.1495,
2859
+ "step": 455
2860
+ },
2861
+ {
2862
+ "epoch": 1.5,
2863
+ "learning_rate": 7.55602249814287e-07,
2864
+ "loss": 0.1631,
2865
+ "step": 456
2866
+ },
2867
+ {
2868
+ "epoch": 1.5,
2869
+ "learning_rate": 7.462182342534896e-07,
2870
+ "loss": 0.1728,
2871
+ "step": 457
2872
+ },
2873
+ {
2874
+ "epoch": 1.5,
2875
+ "learning_rate": 7.368826216535758e-07,
2876
+ "loss": 0.1756,
2877
+ "step": 458
2878
+ },
2879
+ {
2880
+ "epoch": 1.51,
2881
+ "learning_rate": 7.275956696699169e-07,
2882
+ "loss": 0.164,
2883
+ "step": 459
2884
+ },
2885
+ {
2886
+ "epoch": 1.51,
2887
+ "learning_rate": 7.183576346148899e-07,
2888
+ "loss": 0.1417,
2889
+ "step": 460
2890
+ },
2891
+ {
2892
+ "epoch": 1.51,
2893
+ "learning_rate": 7.091687714508044e-07,
2894
+ "loss": 0.1871,
2895
+ "step": 461
2896
+ },
2897
+ {
2898
+ "epoch": 1.52,
2899
+ "learning_rate": 7.000293337828656e-07,
2900
+ "loss": 0.1651,
2901
+ "step": 462
2902
+ },
2903
+ {
2904
+ "epoch": 1.52,
2905
+ "learning_rate": 6.909395738521745e-07,
2906
+ "loss": 0.1705,
2907
+ "step": 463
2908
+ },
2909
+ {
2910
+ "epoch": 1.52,
2911
+ "learning_rate": 6.818997425287671e-07,
2912
+ "loss": 0.1633,
2913
+ "step": 464
2914
+ },
2915
+ {
2916
+ "epoch": 1.53,
2917
+ "learning_rate": 6.729100893046897e-07,
2918
+ "loss": 0.1461,
2919
+ "step": 465
2920
+ },
2921
+ {
2922
+ "epoch": 1.53,
2923
+ "eval_loss": 0.19294388592243195,
2924
+ "eval_runtime": 15.5273,
2925
+ "eval_samples_per_second": 6.118,
2926
+ "eval_steps_per_second": 1.546,
2927
+ "step": 465
2928
+ },
2929
+ {
2930
+ "epoch": 1.53,
2931
+ "learning_rate": 6.639708622871144e-07,
2932
+ "loss": 0.1411,
2933
+ "step": 466
2934
+ },
2935
+ {
2936
+ "epoch": 1.53,
2937
+ "learning_rate": 6.550823081914892e-07,
2938
+ "loss": 0.1586,
2939
+ "step": 467
2940
+ },
2941
+ {
2942
+ "epoch": 1.54,
2943
+ "learning_rate": 6.462446723347324e-07,
2944
+ "loss": 0.1364,
2945
+ "step": 468
2946
+ },
2947
+ {
2948
+ "epoch": 1.54,
2949
+ "learning_rate": 6.374581986284578e-07,
2950
+ "loss": 0.1651,
2951
+ "step": 469
2952
+ },
2953
+ {
2954
+ "epoch": 1.54,
2955
+ "learning_rate": 6.28723129572247e-07,
2956
+ "loss": 0.1633,
2957
+ "step": 470
2958
+ },
2959
+ {
2960
+ "epoch": 1.55,
2961
+ "learning_rate": 6.200397062469541e-07,
2962
+ "loss": 0.1462,
2963
+ "step": 471
2964
+ },
2965
+ {
2966
+ "epoch": 1.55,
2967
+ "learning_rate": 6.11408168308052e-07,
2968
+ "loss": 0.1466,
2969
+ "step": 472
2970
+ },
2971
+ {
2972
+ "epoch": 1.55,
2973
+ "learning_rate": 6.0282875397902e-07,
2974
+ "loss": 0.1469,
2975
+ "step": 473
2976
+ },
2977
+ {
2978
+ "epoch": 1.56,
2979
+ "learning_rate": 5.943017000447671e-07,
2980
+ "loss": 0.1357,
2981
+ "step": 474
2982
+ },
2983
+ {
2984
+ "epoch": 1.56,
2985
+ "learning_rate": 5.858272418450978e-07,
2986
+ "loss": 0.1592,
2987
+ "step": 475
2988
+ },
2989
+ {
2990
+ "epoch": 1.56,
2991
+ "learning_rate": 5.774056132682168e-07,
2992
+ "loss": 0.1649,
2993
+ "step": 476
2994
+ },
2995
+ {
2996
+ "epoch": 1.57,
2997
+ "learning_rate": 5.690370467442743e-07,
2998
+ "loss": 0.1612,
2999
+ "step": 477
3000
+ },
3001
+ {
3002
+ "epoch": 1.57,
3003
+ "learning_rate": 5.607217732389503e-07,
3004
+ "loss": 0.1916,
3005
+ "step": 478
3006
+ },
3007
+ {
3008
+ "epoch": 1.57,
3009
+ "learning_rate": 5.5246002224708e-07,
3010
+ "loss": 0.2088,
3011
+ "step": 479
3012
+ },
3013
+ {
3014
+ "epoch": 1.58,
3015
+ "learning_rate": 5.442520217863215e-07,
3016
+ "loss": 0.1536,
3017
+ "step": 480
3018
+ },
3019
+ {
3020
+ "epoch": 1.58,
3021
+ "learning_rate": 5.360979983908615e-07,
3022
+ "loss": 0.152,
3023
+ "step": 481
3024
+ },
3025
+ {
3026
+ "epoch": 1.58,
3027
+ "learning_rate": 5.279981771051615e-07,
3028
+ "loss": 0.1389,
3029
+ "step": 482
3030
+ },
3031
+ {
3032
+ "epoch": 1.59,
3033
+ "learning_rate": 5.199527814777509e-07,
3034
+ "loss": 0.1164,
3035
+ "step": 483
3036
+ },
3037
+ {
3038
+ "epoch": 1.59,
3039
+ "learning_rate": 5.119620335550532e-07,
3040
+ "loss": 0.1821,
3041
+ "step": 484
3042
+ },
3043
+ {
3044
+ "epoch": 1.59,
3045
+ "learning_rate": 5.040261538752606e-07,
3046
+ "loss": 0.1525,
3047
+ "step": 485
3048
+ },
3049
+ {
3050
+ "epoch": 1.6,
3051
+ "learning_rate": 4.961453614622453e-07,
3052
+ "loss": 0.1384,
3053
+ "step": 486
3054
+ },
3055
+ {
3056
+ "epoch": 1.6,
3057
+ "learning_rate": 4.883198738195157e-07,
3058
+ "loss": 0.1791,
3059
+ "step": 487
3060
+ },
3061
+ {
3062
+ "epoch": 1.6,
3063
+ "learning_rate": 4.805499069242131e-07,
3064
+ "loss": 0.1391,
3065
+ "step": 488
3066
+ },
3067
+ {
3068
+ "epoch": 1.61,
3069
+ "learning_rate": 4.72835675221151e-07,
3070
+ "loss": 0.1324,
3071
+ "step": 489
3072
+ },
3073
+ {
3074
+ "epoch": 1.61,
3075
+ "learning_rate": 4.651773916168967e-07,
3076
+ "loss": 0.1548,
3077
+ "step": 490
3078
+ },
3079
+ {
3080
+ "epoch": 1.61,
3081
+ "learning_rate": 4.5757526747389506e-07,
3082
+ "loss": 0.1615,
3083
+ "step": 491
3084
+ },
3085
+ {
3086
+ "epoch": 1.62,
3087
+ "learning_rate": 4.5002951260463503e-07,
3088
+ "loss": 0.1391,
3089
+ "step": 492
3090
+ },
3091
+ {
3092
+ "epoch": 1.62,
3093
+ "learning_rate": 4.4254033526585917e-07,
3094
+ "loss": 0.1596,
3095
+ "step": 493
3096
+ },
3097
+ {
3098
+ "epoch": 1.62,
3099
+ "learning_rate": 4.3510794215281595e-07,
3100
+ "loss": 0.1344,
3101
+ "step": 494
3102
+ },
3103
+ {
3104
+ "epoch": 1.63,
3105
+ "learning_rate": 4.277325383935549e-07,
3106
+ "loss": 0.1831,
3107
+ "step": 495
3108
+ },
3109
+ {
3110
+ "epoch": 1.63,
3111
+ "learning_rate": 4.2041432754326593e-07,
3112
+ "loss": 0.1535,
3113
+ "step": 496
3114
+ },
3115
+ {
3116
+ "epoch": 1.63,
3117
+ "eval_loss": 0.1927250325679779,
3118
+ "eval_runtime": 15.5004,
3119
+ "eval_samples_per_second": 6.129,
3120
+ "eval_steps_per_second": 1.548,
3121
+ "step": 496
3122
+ },
3123
+ {
3124
+ "epoch": 1.63,
3125
+ "learning_rate": 4.1315351157866003e-07,
3126
+ "loss": 0.1262,
3127
+ "step": 497
3128
+ },
3129
+ {
3130
+ "epoch": 1.64,
3131
+ "learning_rate": 4.059502908923962e-07,
3132
+ "loss": 0.1685,
3133
+ "step": 498
3134
+ },
3135
+ {
3136
+ "epoch": 1.64,
3137
+ "learning_rate": 3.988048642875503e-07,
3138
+ "loss": 0.1435,
3139
+ "step": 499
3140
+ },
3141
+ {
3142
+ "epoch": 1.64,
3143
+ "learning_rate": 3.917174289721276e-07,
3144
+ "loss": 0.1423,
3145
+ "step": 500
3146
+ },
3147
+ {
3148
+ "epoch": 1.65,
3149
+ "learning_rate": 3.8468818055362153e-07,
3150
+ "loss": 0.1434,
3151
+ "step": 501
3152
+ },
3153
+ {
3154
+ "epoch": 1.65,
3155
+ "learning_rate": 3.7771731303361314e-07,
3156
+ "loss": 0.1656,
3157
+ "step": 502
3158
+ },
3159
+ {
3160
+ "epoch": 1.65,
3161
+ "learning_rate": 3.708050188024187e-07,
3162
+ "loss": 0.1553,
3163
+ "step": 503
3164
+ },
3165
+ {
3166
+ "epoch": 1.66,
3167
+ "learning_rate": 3.639514886337786e-07,
3168
+ "loss": 0.1503,
3169
+ "step": 504
3170
+ },
3171
+ {
3172
+ "epoch": 1.66,
3173
+ "learning_rate": 3.571569116795928e-07,
3174
+ "loss": 0.1227,
3175
+ "step": 505
3176
+ },
3177
+ {
3178
+ "epoch": 1.66,
3179
+ "learning_rate": 3.5042147546469894e-07,
3180
+ "loss": 0.1365,
3181
+ "step": 506
3182
+ },
3183
+ {
3184
+ "epoch": 1.67,
3185
+ "learning_rate": 3.437453658816994e-07,
3186
+ "loss": 0.1508,
3187
+ "step": 507
3188
+ },
3189
+ {
3190
+ "epoch": 1.67,
3191
+ "learning_rate": 3.371287671858292e-07,
3192
+ "loss": 0.1571,
3193
+ "step": 508
3194
+ },
3195
+ {
3196
+ "epoch": 1.67,
3197
+ "learning_rate": 3.3057186198987066e-07,
3198
+ "loss": 0.1518,
3199
+ "step": 509
3200
+ },
3201
+ {
3202
+ "epoch": 1.67,
3203
+ "learning_rate": 3.2407483125911354e-07,
3204
+ "loss": 0.1502,
3205
+ "step": 510
3206
+ },
3207
+ {
3208
+ "epoch": 1.68,
3209
+ "learning_rate": 3.1763785430636073e-07,
3210
+ "loss": 0.1273,
3211
+ "step": 511
3212
+ },
3213
+ {
3214
+ "epoch": 1.68,
3215
+ "learning_rate": 3.112611087869799e-07,
3216
+ "loss": 0.1738,
3217
+ "step": 512
3218
+ },
3219
+ {
3220
+ "epoch": 1.68,
3221
+ "learning_rate": 3.04944770693999e-07,
3222
+ "loss": 0.143,
3223
+ "step": 513
3224
+ },
3225
+ {
3226
+ "epoch": 1.69,
3227
+ "learning_rate": 2.9868901435325033e-07,
3228
+ "loss": 0.1424,
3229
+ "step": 514
3230
+ },
3231
+ {
3232
+ "epoch": 1.69,
3233
+ "learning_rate": 2.924940124185585e-07,
3234
+ "loss": 0.1749,
3235
+ "step": 515
3236
+ },
3237
+ {
3238
+ "epoch": 1.69,
3239
+ "learning_rate": 2.8635993586697555e-07,
3240
+ "loss": 0.1547,
3241
+ "step": 516
3242
+ },
3243
+ {
3244
+ "epoch": 1.7,
3245
+ "learning_rate": 2.80286953994062e-07,
3246
+ "loss": 0.1725,
3247
+ "step": 517
3248
+ },
3249
+ {
3250
+ "epoch": 1.7,
3251
+ "learning_rate": 2.7427523440921534e-07,
3252
+ "loss": 0.1373,
3253
+ "step": 518
3254
+ },
3255
+ {
3256
+ "epoch": 1.7,
3257
+ "learning_rate": 2.6832494303104195e-07,
3258
+ "loss": 0.1434,
3259
+ "step": 519
3260
+ },
3261
+ {
3262
+ "epoch": 1.71,
3263
+ "learning_rate": 2.62436244082781e-07,
3264
+ "loss": 0.147,
3265
+ "step": 520
3266
+ },
3267
+ {
3268
+ "epoch": 1.71,
3269
+ "learning_rate": 2.566093000877687e-07,
3270
+ "loss": 0.1673,
3271
+ "step": 521
3272
+ },
3273
+ {
3274
+ "epoch": 1.71,
3275
+ "learning_rate": 2.5084427186495566e-07,
3276
+ "loss": 0.1618,
3277
+ "step": 522
3278
+ },
3279
+ {
3280
+ "epoch": 1.72,
3281
+ "learning_rate": 2.451413185244661e-07,
3282
+ "loss": 0.1618,
3283
+ "step": 523
3284
+ },
3285
+ {
3286
+ "epoch": 1.72,
3287
+ "learning_rate": 2.3950059746320864e-07,
3288
+ "loss": 0.1507,
3289
+ "step": 524
3290
+ },
3291
+ {
3292
+ "epoch": 1.72,
3293
+ "learning_rate": 2.3392226436053073e-07,
3294
+ "loss": 0.1582,
3295
+ "step": 525
3296
+ },
3297
+ {
3298
+ "epoch": 1.73,
3299
+ "learning_rate": 2.2840647317392218e-07,
3300
+ "loss": 0.1691,
3301
+ "step": 526
3302
+ },
3303
+ {
3304
+ "epoch": 1.73,
3305
+ "learning_rate": 2.2295337613476714e-07,
3306
+ "loss": 0.1419,
3307
+ "step": 527
3308
+ },
3309
+ {
3310
+ "epoch": 1.73,
3311
+ "eval_loss": 0.1925160139799118,
3312
+ "eval_runtime": 15.4967,
3313
+ "eval_samples_per_second": 6.13,
3314
+ "eval_steps_per_second": 1.549,
3315
+ "step": 527
3316
+ },
3317
+ {
3318
+ "epoch": 1.73,
3319
+ "learning_rate": 2.1756312374414113e-07,
3320
+ "loss": 0.1648,
3321
+ "step": 528
3322
+ },
3323
+ {
3324
+ "epoch": 1.74,
3325
+ "learning_rate": 2.1223586476865953e-07,
3326
+ "loss": 0.1488,
3327
+ "step": 529
3328
+ },
3329
+ {
3330
+ "epoch": 1.74,
3331
+ "learning_rate": 2.0697174623636795e-07,
3332
+ "loss": 0.1691,
3333
+ "step": 530
3334
+ },
3335
+ {
3336
+ "epoch": 1.74,
3337
+ "learning_rate": 2.017709134326884e-07,
3338
+ "loss": 0.1669,
3339
+ "step": 531
3340
+ },
3341
+ {
3342
+ "epoch": 1.75,
3343
+ "learning_rate": 1.9663350989640812e-07,
3344
+ "loss": 0.1548,
3345
+ "step": 532
3346
+ },
3347
+ {
3348
+ "epoch": 1.75,
3349
+ "learning_rate": 1.915596774157169e-07,
3350
+ "loss": 0.161,
3351
+ "step": 533
3352
+ },
3353
+ {
3354
+ "epoch": 1.75,
3355
+ "learning_rate": 1.8654955602429499e-07,
3356
+ "loss": 0.1763,
3357
+ "step": 534
3358
+ },
3359
+ {
3360
+ "epoch": 1.76,
3361
+ "learning_rate": 1.8160328399744825e-07,
3362
+ "loss": 0.1813,
3363
+ "step": 535
3364
+ },
3365
+ {
3366
+ "epoch": 1.76,
3367
+ "learning_rate": 1.7672099784829116e-07,
3368
+ "loss": 0.155,
3369
+ "step": 536
3370
+ },
3371
+ {
3372
+ "epoch": 1.76,
3373
+ "learning_rate": 1.719028323239802e-07,
3374
+ "loss": 0.1718,
3375
+ "step": 537
3376
+ },
3377
+ {
3378
+ "epoch": 1.77,
3379
+ "learning_rate": 1.6714892040199383e-07,
3380
+ "loss": 0.1486,
3381
+ "step": 538
3382
+ },
3383
+ {
3384
+ "epoch": 1.77,
3385
+ "learning_rate": 1.6245939328646322e-07,
3386
+ "loss": 0.1465,
3387
+ "step": 539
3388
+ },
3389
+ {
3390
+ "epoch": 1.77,
3391
+ "learning_rate": 1.5783438040455097e-07,
3392
+ "loss": 0.1261,
3393
+ "step": 540
3394
+ },
3395
+ {
3396
+ "epoch": 1.78,
3397
+ "learning_rate": 1.5327400940287868e-07,
3398
+ "loss": 0.2032,
3399
+ "step": 541
3400
+ },
3401
+ {
3402
+ "epoch": 1.78,
3403
+ "learning_rate": 1.4877840614400452e-07,
3404
+ "loss": 0.1648,
3405
+ "step": 542
3406
+ },
3407
+ {
3408
+ "epoch": 1.78,
3409
+ "learning_rate": 1.44347694702949e-07,
3410
+ "loss": 0.1614,
3411
+ "step": 543
3412
+ },
3413
+ {
3414
+ "epoch": 1.79,
3415
+ "learning_rate": 1.399819973637706e-07,
3416
+ "loss": 0.1783,
3417
+ "step": 544
3418
+ },
3419
+ {
3420
+ "epoch": 1.79,
3421
+ "learning_rate": 1.3568143461619193e-07,
3422
+ "loss": 0.1642,
3423
+ "step": 545
3424
+ },
3425
+ {
3426
+ "epoch": 1.79,
3427
+ "learning_rate": 1.3144612515227278e-07,
3428
+ "loss": 0.1615,
3429
+ "step": 546
3430
+ },
3431
+ {
3432
+ "epoch": 1.8,
3433
+ "learning_rate": 1.2727618586313495e-07,
3434
+ "loss": 0.1474,
3435
+ "step": 547
3436
+ },
3437
+ {
3438
+ "epoch": 1.8,
3439
+ "learning_rate": 1.2317173183573616e-07,
3440
+ "loss": 0.1432,
3441
+ "step": 548
3442
+ },
3443
+ {
3444
+ "epoch": 1.8,
3445
+ "learning_rate": 1.1913287634969461e-07,
3446
+ "loss": 0.1582,
3447
+ "step": 549
3448
+ },
3449
+ {
3450
+ "epoch": 1.81,
3451
+ "learning_rate": 1.151597308741606e-07,
3452
+ "loss": 0.1579,
3453
+ "step": 550
3454
+ },
3455
+ {
3456
+ "epoch": 1.81,
3457
+ "learning_rate": 1.1125240506474178e-07,
3458
+ "loss": 0.1533,
3459
+ "step": 551
3460
+ },
3461
+ {
3462
+ "epoch": 1.81,
3463
+ "learning_rate": 1.0741100676047639e-07,
3464
+ "loss": 0.1517,
3465
+ "step": 552
3466
+ },
3467
+ {
3468
+ "epoch": 1.82,
3469
+ "learning_rate": 1.0363564198085624e-07,
3470
+ "loss": 0.1716,
3471
+ "step": 553
3472
+ },
3473
+ {
3474
+ "epoch": 1.82,
3475
+ "learning_rate": 9.992641492290094e-08,
3476
+ "loss": 0.1709,
3477
+ "step": 554
3478
+ },
3479
+ {
3480
+ "epoch": 1.82,
3481
+ "learning_rate": 9.628342795828333e-08,
3482
+ "loss": 0.1439,
3483
+ "step": 555
3484
+ },
3485
+ {
3486
+ "epoch": 1.83,
3487
+ "learning_rate": 9.270678163050218e-08,
3488
+ "loss": 0.1568,
3489
+ "step": 556
3490
+ },
3491
+ {
3492
+ "epoch": 1.83,
3493
+ "learning_rate": 8.919657465210867e-08,
3494
+ "loss": 0.1306,
3495
+ "step": 557
3496
+ },
3497
+ {
3498
+ "epoch": 1.83,
3499
+ "learning_rate": 8.575290390198193e-08,
3500
+ "loss": 0.1612,
3501
+ "step": 558
3502
+ },
3503
+ {
3504
+ "epoch": 1.83,
3505
+ "eval_loss": 0.19228258728981018,
3506
+ "eval_runtime": 15.5027,
3507
+ "eval_samples_per_second": 6.128,
3508
+ "eval_steps_per_second": 1.548,
3509
+ "step": 558
3510
+ },
3511
+ {
3512
+ "epoch": 1.84,
3513
+ "learning_rate": 8.237586442265411e-08,
3514
+ "loss": 0.1491,
3515
+ "step": 559
3516
+ },
3517
+ {
3518
+ "epoch": 1.84,
3519
+ "learning_rate": 7.906554941768896e-08,
3520
+ "loss": 0.1692,
3521
+ "step": 560
3522
+ },
3523
+ {
3524
+ "epoch": 1.84,
3525
+ "learning_rate": 7.582205024910805e-08,
3526
+ "loss": 0.1775,
3527
+ "step": 561
3528
+ },
3529
+ {
3530
+ "epoch": 1.85,
3531
+ "learning_rate": 7.264545643486997e-08,
3532
+ "loss": 0.1526,
3533
+ "step": 562
3534
+ },
3535
+ {
3536
+ "epoch": 1.85,
3537
+ "learning_rate": 6.953585564639903e-08,
3538
+ "loss": 0.1716,
3539
+ "step": 563
3540
+ },
3541
+ {
3542
+ "epoch": 1.85,
3543
+ "learning_rate": 6.649333370616712e-08,
3544
+ "loss": 0.177,
3545
+ "step": 564
3546
+ },
3547
+ {
3548
+ "epoch": 1.86,
3549
+ "learning_rate": 6.351797458532316e-08,
3550
+ "loss": 0.1542,
3551
+ "step": 565
3552
+ },
3553
+ {
3554
+ "epoch": 1.86,
3555
+ "learning_rate": 6.060986040137689e-08,
3556
+ "loss": 0.1603,
3557
+ "step": 566
3558
+ },
3559
+ {
3560
+ "epoch": 1.86,
3561
+ "learning_rate": 5.776907141593235e-08,
3562
+ "loss": 0.1542,
3563
+ "step": 567
3564
+ },
3565
+ {
3566
+ "epoch": 1.87,
3567
+ "learning_rate": 5.4995686032471575e-08,
3568
+ "loss": 0.1629,
3569
+ "step": 568
3570
+ },
3571
+ {
3572
+ "epoch": 1.87,
3573
+ "learning_rate": 5.2289780794192726e-08,
3574
+ "loss": 0.1614,
3575
+ "step": 569
3576
+ },
3577
+ {
3578
+ "epoch": 1.87,
3579
+ "learning_rate": 4.96514303818954e-08,
3580
+ "loss": 0.1267,
3581
+ "step": 570
3582
+ },
3583
+ {
3584
+ "epoch": 1.88,
3585
+ "learning_rate": 4.708070761192146e-08,
3586
+ "loss": 0.1888,
3587
+ "step": 571
3588
+ },
3589
+ {
3590
+ "epoch": 1.88,
3591
+ "learning_rate": 4.457768343414382e-08,
3592
+ "loss": 0.168,
3593
+ "step": 572
3594
+ },
3595
+ {
3596
+ "epoch": 1.88,
3597
+ "learning_rate": 4.2142426930008584e-08,
3598
+ "loss": 0.1492,
3599
+ "step": 573
3600
+ },
3601
+ {
3602
+ "epoch": 1.89,
3603
+ "learning_rate": 3.9775005310629946e-08,
3604
+ "loss": 0.1382,
3605
+ "step": 574
3606
+ },
3607
+ {
3608
+ "epoch": 1.89,
3609
+ "learning_rate": 3.747548391493272e-08,
3610
+ "loss": 0.1682,
3611
+ "step": 575
3612
+ },
3613
+ {
3614
+ "epoch": 1.89,
3615
+ "learning_rate": 3.5243926207851606e-08,
3616
+ "loss": 0.1833,
3617
+ "step": 576
3618
+ },
3619
+ {
3620
+ "epoch": 1.89,
3621
+ "learning_rate": 3.3080393778577305e-08,
3622
+ "loss": 0.1451,
3623
+ "step": 577
3624
+ },
3625
+ {
3626
+ "epoch": 1.9,
3627
+ "learning_rate": 3.09849463388584e-08,
3628
+ "loss": 0.1741,
3629
+ "step": 578
3630
+ },
3631
+ {
3632
+ "epoch": 1.9,
3633
+ "learning_rate": 2.8957641721352735e-08,
3634
+ "loss": 0.1765,
3635
+ "step": 579
3636
+ },
3637
+ {
3638
+ "epoch": 1.9,
3639
+ "learning_rate": 2.6998535878030584e-08,
3640
+ "loss": 0.1684,
3641
+ "step": 580
3642
+ },
3643
+ {
3644
+ "epoch": 1.91,
3645
+ "learning_rate": 2.510768287863202e-08,
3646
+ "loss": 0.1543,
3647
+ "step": 581
3648
+ },
3649
+ {
3650
+ "epoch": 1.91,
3651
+ "learning_rate": 2.3285134909173113e-08,
3652
+ "loss": 0.1558,
3653
+ "step": 582
3654
+ },
3655
+ {
3656
+ "epoch": 1.91,
3657
+ "learning_rate": 2.1530942270506504e-08,
3658
+ "loss": 0.1654,
3659
+ "step": 583
3660
+ },
3661
+ {
3662
+ "epoch": 1.92,
3663
+ "learning_rate": 1.9845153376933102e-08,
3664
+ "loss": 0.1381,
3665
+ "step": 584
3666
+ },
3667
+ {
3668
+ "epoch": 1.92,
3669
+ "learning_rate": 1.822781475486507e-08,
3670
+ "loss": 0.164,
3671
+ "step": 585
3672
+ },
3673
+ {
3674
+ "epoch": 1.92,
3675
+ "learning_rate": 1.6678971041542702e-08,
3676
+ "loss": 0.153,
3677
+ "step": 586
3678
+ },
3679
+ {
3680
+ "epoch": 1.93,
3681
+ "learning_rate": 1.5198664983802346e-08,
3682
+ "loss": 0.1168,
3683
+ "step": 587
3684
+ },
3685
+ {
3686
+ "epoch": 1.93,
3687
+ "learning_rate": 1.3786937436895686e-08,
3688
+ "loss": 0.1425,
3689
+ "step": 588
3690
+ },
3691
+ {
3692
+ "epoch": 1.93,
3693
+ "learning_rate": 1.2443827363363693e-08,
3694
+ "loss": 0.1857,
3695
+ "step": 589
3696
+ },
3697
+ {
3698
+ "epoch": 1.93,
3699
+ "eval_loss": 0.19226355850696564,
3700
+ "eval_runtime": 15.5264,
3701
+ "eval_samples_per_second": 6.119,
3702
+ "eval_steps_per_second": 1.546,
3703
+ "step": 589
3704
+ },
3705
+ {
3706
+ "epoch": 1.94,
3707
+ "learning_rate": 1.1169371831959986e-08,
3708
+ "loss": 0.1804,
3709
+ "step": 590
3710
+ },
3711
+ {
3712
+ "epoch": 1.94,
3713
+ "learning_rate": 9.963606016628325e-09,
3714
+ "loss": 0.1569,
3715
+ "step": 591
3716
+ },
3717
+ {
3718
+ "epoch": 1.94,
3719
+ "learning_rate": 8.826563195531713e-09,
3720
+ "loss": 0.1867,
3721
+ "step": 592
3722
+ },
3723
+ {
3724
+ "epoch": 1.95,
3725
+ "learning_rate": 7.758274750134243e-09,
3726
+ "loss": 0.1755,
3727
+ "step": 593
3728
+ },
3729
+ {
3730
+ "epoch": 1.95,
3731
+ "learning_rate": 6.758770164334572e-09,
3732
+ "loss": 0.1796,
3733
+ "step": 594
3734
+ },
3735
+ {
3736
+ "epoch": 1.95,
3737
+ "learning_rate": 5.828077023651846e-09,
3738
+ "loss": 0.1559,
3739
+ "step": 595
3740
+ },
3741
+ {
3742
+ "epoch": 1.96,
3743
+ "learning_rate": 4.96622101446631e-09,
3744
+ "loss": 0.1851,
3745
+ "step": 596
3746
+ },
3747
+ {
3748
+ "epoch": 1.96,
3749
+ "learning_rate": 4.1732259233071004e-09,
3750
+ "loss": 0.1833,
3751
+ "step": 597
3752
+ },
3753
+ {
3754
+ "epoch": 1.96,
3755
+ "learning_rate": 3.449113636199153e-09,
3756
+ "loss": 0.1786,
3757
+ "step": 598
3758
+ },
3759
+ {
3760
+ "epoch": 1.97,
3761
+ "learning_rate": 2.793904138056469e-09,
3762
+ "loss": 0.1979,
3763
+ "step": 599
3764
+ },
3765
+ {
3766
+ "epoch": 1.97,
3767
+ "learning_rate": 2.2076155121328326e-09,
3768
+ "loss": 0.1566,
3769
+ "step": 600
3770
+ },
3771
+ {
3772
+ "epoch": 1.97,
3773
+ "learning_rate": 1.6902639395208197e-09,
3774
+ "loss": 0.1636,
3775
+ "step": 601
3776
+ },
3777
+ {
3778
+ "epoch": 1.98,
3779
+ "learning_rate": 1.2418636987057697e-09,
3780
+ "loss": 0.1439,
3781
+ "step": 602
3782
+ },
3783
+ {
3784
+ "epoch": 1.98,
3785
+ "learning_rate": 8.624271651727634e-10,
3786
+ "loss": 0.1967,
3787
+ "step": 603
3788
+ },
3789
+ {
3790
+ "epoch": 1.98,
3791
+ "learning_rate": 5.519648110638431e-10,
3792
+ "loss": 0.1715,
3793
+ "step": 604
3794
+ },
3795
+ {
3796
+ "epoch": 1.99,
3797
+ "learning_rate": 3.1048520488907717e-10,
3798
+ "loss": 0.1833,
3799
+ "step": 605
3800
+ },
3801
+ {
3802
+ "epoch": 1.99,
3803
+ "learning_rate": 1.3799501129063698e-10,
3804
+ "loss": 0.1692,
3805
+ "step": 606
3806
+ },
3807
+ {
3808
+ "epoch": 1.99,
3809
+ "learning_rate": 3.4498990858777835e-11,
3810
+ "loss": 0.1697,
3811
+ "step": 607
3812
+ },
3813
+ {
3814
+ "epoch": 2.0,
3815
+ "learning_rate": 0.0,
3816
+ "loss": 0.1712,
3817
+ "step": 608
3818
+ }
3819
+ ],
3820
+ "logging_steps": 1,
3821
+ "max_steps": 608,
3822
+ "num_train_epochs": 2,
3823
+ "save_steps": 500,
3824
+ "total_flos": 3.399970706505597e+18,
3825
+ "trial_name": null,
3826
+ "trial_params": null
3827
+ }
checkpoint-608/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f17f663326682e34d06194ed091725483adeebe21013fe1752885513b5506a8
+ size 4411
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "teknium/OpenHermes-2.5-Mistral-7B",
+ "architectures": [
+ "MistralForCausalLM"
+ ],
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 32768,
+ "model_type": "mistral",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 8,
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 10000.0,
+ "sliding_window": 4096,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.34.1",
+ "use_cache": false,
+ "vocab_size": 32002
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 32000,
+ "transformers_version": "4.34.1"
+ }
pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d00e12428ee23606c20c0cb82491d564848080e13e6dc427e94e14700ee502d
+ size 9886765428
pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5e0a4e903b17ca571e3d14a89e7bf7d4ed70334289ba4353fe58f9fd95789fd
+ size 5121688491
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,298 @@
+ {
+ "metadata": {
+ "total_size": 15008350208
+ },
+ "weight_map": {
+ "lm_head.weight": "pytorch_model-00002-of-00002.bin",
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+ "model.norm.weight": "pytorch_model-00002-of-00002.bin"
+ }
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,61 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32001": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "additional_special_tokens": [],
+ "bos_token": "<s>",
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "trust_remote_code": false,
+ "unk_token": "<unk>",
+ "use_default_system_prompt": true,
+ "use_fast": true
+ }