Upload folder using huggingface_hub
- config.json +2 -2
- model-00001-of-00003.safetensors +2 -2
- model-00002-of-00003.safetensors +1 -1
- model-00003-of-00003.safetensors +2 -2
- model.safetensors.index.json +8 -8
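
A commit like this is typically produced with huggingface_hub's upload_folder, whose default commit message matches the title above. A minimal sketch; the repo id and local folder path below are placeholders, not values from this commit:

from huggingface_hub import upload_folder

# Push a local checkpoint folder (config, shards, index) to the Hub in one commit.
# "your-org/your-model" and "./checkpoint" are hypothetical placeholders.
upload_folder(
    repo_id="your-org/your-model",
    folder_path="./checkpoint",
    commit_message="Upload folder using huggingface_hub",
)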
config.json CHANGED
@@ -1,6 +1,7 @@
 {
   "attention_bias": false,
   "attention_dropout": 0.0,
+  "attention_value_scale": 0.707,
   "bos_token_id": null,
   "eos_token_id": 151645,
   "head_dim": 192,
@@ -47,11 +48,10 @@
     }
   },
   "routed_scaling_factor": 1.0,
-  "router_jitter_noise": 0.0,
   "sliding_window": 128,
   "tie_word_embeddings": false,
   "topk_group": 1,
-  "transformers_version": "5.
+  "transformers_version": "5.7.0.dev0",
   "use_cache": true,
   "v_head_dim": 128,
   "vocab_size": 151669
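
A quick way to confirm the config changes above is a plain JSON read of the updated file; a sketch, assuming the checkpoint has been downloaded to a local "./checkpoint" folder (a placeholder path):

import json

# Read the updated config.json and check the fields touched by this commit.
with open("./checkpoint/config.json") as f:
    cfg = json.load(f)

print(cfg["attention_value_scale"])   # 0.707, newly added
print(cfg["transformers_version"])    # "5.7.0.dev0"
print("router_jitter_noise" in cfg)   # False, the key was removed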
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:1b3f5a618c41b9ba418d56f7d2532f0577aecdb30e8694c426a399a4959f1b6c
+size 1999644448
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:41d75a901bebd8146f8c3682300eeb3768458b6ace41e4738a96a0e2c644a0d4
 size 1998137832
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:532bebad37176d2276e8d860e4be8d0b040d6caac4aba9a82e24cbb923a19f15
+size 684427968
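
The three entries above are git-lfs pointer stubs; the shard bytes live in LFS storage and are identified by the sha256 oid. A sketch for verifying that a downloaded shard matches the new pointer, using only the standard library (the filename and expected digest are taken from the diff above):

import hashlib

# Hash the downloaded shard in 1 MiB chunks and compare to the pointer's oid.
expected = "532bebad37176d2276e8d860e4be8d0b040d6caac4aba9a82e24cbb923a19f15"
h = hashlib.sha256()
with open("model-00003-of-00003.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == expected, "shard does not match its LFS pointer oid"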
model.safetensors.index.json CHANGED
@@ -18,8 +18,6 @@
     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
     "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
     "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.1.mlp.gate.weight": "model-00001-of-00003.safetensors",
-    "model.layers.1.mlp.gate.e_score_correction_bias": "model-00001-of-00003.safetensors",
     "model.layers.1.mlp.experts.0.gate_proj.weight": "model-00001-of-00003.safetensors",
     "model.layers.1.mlp.experts.0.up_proj.weight": "model-00001-of-00003.safetensors",
     "model.layers.1.mlp.experts.1.gate_proj.weight": "model-00001-of-00003.safetensors",
@@ -212,6 +210,8 @@
     "model.layers.1.mlp.experts.61.down_proj.weight": "model-00001-of-00003.safetensors",
     "model.layers.1.mlp.experts.62.down_proj.weight": "model-00001-of-00003.safetensors",
     "model.layers.1.mlp.experts.63.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.gate.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.gate.e_score_correction_bias": "model-00001-of-00003.safetensors",
     "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
     "model.layers.2.self_attn.attention_sink_bias": "model-00001-of-00003.safetensors",
@@ -219,8 +219,6 @@
     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
     "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
     "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
-    "model.layers.2.mlp.gate.weight": "model-00001-of-00003.safetensors",
-    "model.layers.2.mlp.gate.e_score_correction_bias": "model-00001-of-00003.safetensors",
     "model.layers.2.mlp.experts.0.gate_proj.weight": "model-00001-of-00003.safetensors",
     "model.layers.2.mlp.experts.0.up_proj.weight": "model-00001-of-00003.safetensors",
     "model.layers.2.mlp.experts.1.gate_proj.weight": "model-00001-of-00003.safetensors",
@@ -413,6 +411,8 @@
     "model.layers.2.mlp.experts.61.down_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.2.mlp.experts.62.down_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.2.mlp.experts.63.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.2.mlp.gate.weight": "model-00002-of-00003.safetensors",
+    "model.layers.2.mlp.gate.e_score_correction_bias": "model-00002-of-00003.safetensors",
     "model.layers.2.input_layernorm.weight": "model-00002-of-00003.safetensors",
     "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
     "model.layers.3.self_attn.attention_sink_bias": "model-00002-of-00003.safetensors",
@@ -420,8 +420,6 @@
     "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.3.mlp.gate.weight": "model-00002-of-00003.safetensors",
-    "model.layers.3.mlp.gate.e_score_correction_bias": "model-00002-of-00003.safetensors",
     "model.layers.3.mlp.experts.0.gate_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.3.mlp.experts.0.up_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.3.mlp.experts.1.gate_proj.weight": "model-00002-of-00003.safetensors",
@@ -614,6 +612,8 @@
     "model.layers.3.mlp.experts.61.down_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.3.mlp.experts.62.down_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.3.mlp.experts.63.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.3.mlp.gate.weight": "model-00002-of-00003.safetensors",
+    "model.layers.3.mlp.gate.e_score_correction_bias": "model-00002-of-00003.safetensors",
     "model.layers.3.input_layernorm.weight": "model-00002-of-00003.safetensors",
     "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
     "model.layers.4.self_attn.attention_sink_bias": "model-00002-of-00003.safetensors",
@@ -621,8 +621,6 @@
     "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
-    "model.layers.4.mlp.gate.weight": "model-00002-of-00003.safetensors",
-    "model.layers.4.mlp.gate.e_score_correction_bias": "model-00002-of-00003.safetensors",
     "model.layers.4.mlp.experts.0.gate_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.4.mlp.experts.0.up_proj.weight": "model-00002-of-00003.safetensors",
     "model.layers.4.mlp.experts.1.gate_proj.weight": "model-00002-of-00003.safetensors",
@@ -815,6 +813,8 @@
     "model.layers.4.mlp.experts.61.down_proj.weight": "model-00003-of-00003.safetensors",
     "model.layers.4.mlp.experts.62.down_proj.weight": "model-00003-of-00003.safetensors",
     "model.layers.4.mlp.experts.63.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.4.mlp.gate.weight": "model-00003-of-00003.safetensors",
+    "model.layers.4.mlp.gate.e_score_correction_bias": "model-00003-of-00003.safetensors",
     "model.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors",
     "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
     "model.norm.weight": "model-00003-of-00003.safetensors",
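
The index file maps every parameter name to a shard; the net effect of this change is that each layer's mlp.gate.weight and mlp.gate.e_score_correction_bias entries now appear after the expert weights, with layer 4's gate tensors remapped to the third shard. A sketch for resolving a tensor's shard from the index, assuming the standard "weight_map" layout of model.safetensors.index.json:

import json

# Look up which shard file holds a given parameter after this commit.
with open("model.safetensors.index.json") as f:
    index = json.load(f)

shard = index["weight_map"]["model.layers.4.mlp.gate.weight"]
print(shard)  # "model-00003-of-00003.safetensors"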