casinca committed (verified)
Commit bb2ab88 · Parent: 6a676cd

Upload folder using huggingface_hub

config.json CHANGED
@@ -1,6 +1,7 @@
 {
   "attention_bias": false,
   "attention_dropout": 0.0,
+  "attention_value_scale": 0.707,
   "bos_token_id": null,
   "eos_token_id": 151645,
   "head_dim": 192,
@@ -47,11 +48,10 @@
     }
   },
   "routed_scaling_factor": 1.0,
-  "router_jitter_noise": 0.0,
   "sliding_window": 128,
   "tie_word_embeddings": false,
   "topk_group": 1,
-  "transformers_version": "5.6.0.dev0",
+  "transformers_version": "5.7.0.dev0",
   "use_cache": true,
   "v_head_dim": 128,
   "vocab_size": 151669
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:42af2467e4bfa98049267514a080ad74861e15b08495967418358583199cb31b
-size 1999907056
+oid sha256:1b3f5a618c41b9ba418d56f7d2532f0577aecdb30e8694c426a399a4959f1b6c
+size 1999644448
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:793de365a5fe73df01793e6467be00fae4edaef269b05ae94c5090b1448fdc0a
+oid sha256:41d75a901bebd8146f8c3682300eeb3768458b6ace41e4738a96a0e2c644a0d4
 size 1998137832
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2fb38def785a4b9001af6ef78281a8479e5ef85952f2eadedfe5a3bedb6b118a
-size 684165360
+oid sha256:532bebad37176d2276e8d860e4be8d0b040d6caac4aba9a82e24cbb923a19f15
+size 684427968
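
The three .safetensors entries above are Git LFS pointer files, so each diff only swaps the object hash (oid) and byte size of the underlying blob. A minimal sketch, assuming a local copy of a pointer file and a downloaded shard (both paths are hypothetical), that re-hashes the blob and compares it against the pointer:

import hashlib

def verify_lfs_pointer(pointer_path: str, blob_path: str) -> bool:
    """Check a downloaded blob against the oid/size in its LFS pointer file."""
    # Pointer files are "key value" lines: version, oid sha256:<hex>, size <bytes>.
    with open(pointer_path) as f:
        fields = dict(line.split(" ", 1) for line in f.read().splitlines())
    expected_oid = fields["oid"].strip().removeprefix("sha256:")
    expected_size = int(fields["size"])

    # Stream the blob in 1 MiB chunks so large shards don't load into memory.
    h = hashlib.sha256()
    size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_oid and size == expected_size

# e.g. verify_lfs_pointer("model-00001-of-00003.pointer",
#                         "model-00001-of-00003.safetensors")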
model.safetensors.index.json CHANGED
@@ -18,8 +18,6 @@
   "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
   "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
   "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
-  "model.layers.1.mlp.gate.weight": "model-00001-of-00003.safetensors",
-  "model.layers.1.mlp.gate.e_score_correction_bias": "model-00001-of-00003.safetensors",
   "model.layers.1.mlp.experts.0.gate_proj.weight": "model-00001-of-00003.safetensors",
   "model.layers.1.mlp.experts.0.up_proj.weight": "model-00001-of-00003.safetensors",
   "model.layers.1.mlp.experts.1.gate_proj.weight": "model-00001-of-00003.safetensors",
@@ -212,6 +210,8 @@
   "model.layers.1.mlp.experts.61.down_proj.weight": "model-00001-of-00003.safetensors",
   "model.layers.1.mlp.experts.62.down_proj.weight": "model-00001-of-00003.safetensors",
   "model.layers.1.mlp.experts.63.down_proj.weight": "model-00001-of-00003.safetensors",
+  "model.layers.1.mlp.gate.weight": "model-00001-of-00003.safetensors",
+  "model.layers.1.mlp.gate.e_score_correction_bias": "model-00001-of-00003.safetensors",
   "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
   "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
   "model.layers.2.self_attn.attention_sink_bias": "model-00001-of-00003.safetensors",
@@ -219,8 +219,6 @@
   "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
   "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
   "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
-  "model.layers.2.mlp.gate.weight": "model-00001-of-00003.safetensors",
-  "model.layers.2.mlp.gate.e_score_correction_bias": "model-00001-of-00003.safetensors",
   "model.layers.2.mlp.experts.0.gate_proj.weight": "model-00001-of-00003.safetensors",
   "model.layers.2.mlp.experts.0.up_proj.weight": "model-00001-of-00003.safetensors",
   "model.layers.2.mlp.experts.1.gate_proj.weight": "model-00001-of-00003.safetensors",
@@ -413,6 +411,8 @@
   "model.layers.2.mlp.experts.61.down_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.2.mlp.experts.62.down_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.2.mlp.experts.63.down_proj.weight": "model-00002-of-00003.safetensors",
+  "model.layers.2.mlp.gate.weight": "model-00002-of-00003.safetensors",
+  "model.layers.2.mlp.gate.e_score_correction_bias": "model-00002-of-00003.safetensors",
   "model.layers.2.input_layernorm.weight": "model-00002-of-00003.safetensors",
   "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
   "model.layers.3.self_attn.attention_sink_bias": "model-00002-of-00003.safetensors",
@@ -420,8 +420,6 @@
   "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
-  "model.layers.3.mlp.gate.weight": "model-00002-of-00003.safetensors",
-  "model.layers.3.mlp.gate.e_score_correction_bias": "model-00002-of-00003.safetensors",
   "model.layers.3.mlp.experts.0.gate_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.3.mlp.experts.0.up_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.3.mlp.experts.1.gate_proj.weight": "model-00002-of-00003.safetensors",
@@ -614,6 +612,8 @@
   "model.layers.3.mlp.experts.61.down_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.3.mlp.experts.62.down_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.3.mlp.experts.63.down_proj.weight": "model-00002-of-00003.safetensors",
+  "model.layers.3.mlp.gate.weight": "model-00002-of-00003.safetensors",
+  "model.layers.3.mlp.gate.e_score_correction_bias": "model-00002-of-00003.safetensors",
   "model.layers.3.input_layernorm.weight": "model-00002-of-00003.safetensors",
   "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
   "model.layers.4.self_attn.attention_sink_bias": "model-00002-of-00003.safetensors",
@@ -621,8 +621,6 @@
   "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
-  "model.layers.4.mlp.gate.weight": "model-00002-of-00003.safetensors",
-  "model.layers.4.mlp.gate.e_score_correction_bias": "model-00002-of-00003.safetensors",
   "model.layers.4.mlp.experts.0.gate_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.4.mlp.experts.0.up_proj.weight": "model-00002-of-00003.safetensors",
   "model.layers.4.mlp.experts.1.gate_proj.weight": "model-00002-of-00003.safetensors",
@@ -815,6 +813,8 @@
   "model.layers.4.mlp.experts.61.down_proj.weight": "model-00003-of-00003.safetensors",
   "model.layers.4.mlp.experts.62.down_proj.weight": "model-00003-of-00003.safetensors",
   "model.layers.4.mlp.experts.63.down_proj.weight": "model-00003-of-00003.safetensors",
+  "model.layers.4.mlp.gate.weight": "model-00003-of-00003.safetensors",
+  "model.layers.4.mlp.gate.e_score_correction_bias": "model-00003-of-00003.safetensors",
   "model.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors",
   "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
   "model.norm.weight": "model-00003-of-00003.safetensors",