JosephusCheung commited on Dec 16, 2023

Commit

f2cbd96

•

1 Parent(s): e3ce4d1

Upload folder using huggingface_hub

Browse files

Files changed (17) hide show

config.json +33 -0
generation_config.json +8 -0
pytorch_model-00001-of-00008.bin +3 -0
pytorch_model-00002-of-00008.bin +3 -0
pytorch_model-00003-of-00008.bin +3 -0
pytorch_model-00004-of-00008.bin +3 -0
pytorch_model-00005-of-00008.bin +3 -0
pytorch_model-00006-of-00008.bin +3 -0
pytorch_model-00007-of-00008.bin +3 -0
pytorch_model-00008-of-00008.bin +3 -0
pytorch_model.bin.index.json +1002 -0
qwen.tiktoken +0 -0
special_tokens_map.json +9 -0
tokenization_qwen.py +230 -0
tokenizer_config.json +10 -0
visual.bin +3 -0
visual.py +428 -0

config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "/notebooks/moe-8x7b",
+  "architectures": [
+    "MixtralForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 32768,
+  "model_type": "mixtral",
+  "num_attention_heads": 32,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "num_local_experts": 8,
+  "output_router_logits": false,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "router_aux_loss_coef": 0.001,
+  "sliding_window": 8192,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.36.1",
+  "use_cache": true,
+  "vocab_size": 151936
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "pad_token_id": 0,
+  "transformers_version": "4.36.1",
+  "use_cache": false
+}

pytorch_model-00001-of-00008.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d29cb4071cb88b84138ce94f3775aeb5c94eee8a1cde0f8699e0065238c4748f
+size 9988040244

pytorch_model-00002-of-00008.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4d72fc0d1eb1dc67f3d3b824831f513121f16ef77cfe9b4987a7dacdf5e046b
+size 9959835736

pytorch_model-00003-of-00008.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44db8160126b04f1f01f22e231799096f539304cf7dc33f1db92f7872b4cd824
+size 9915711646

pytorch_model-00004-of-00008.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8776cf7902c73fc184702e89feaeaf6a326fb5145cd8671483e79c7d97c8d68e
+size 9915711710

pytorch_model-00005-of-00008.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e4dcc601734371e2416e6d8b885327ff790b6da8f02b33f122f3310853fbfd26
+size 9959835980

pytorch_model-00006-of-00008.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e697da58feb498426d4962850a909649807f7d29bce7e778ae6ce216cc7b8a5e
+size 9915711710

pytorch_model-00007-of-00008.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ef40e6681e96ac38612c6495aceba0628b2d705539906c87ba73e862562e5d4
+size 9915711710

pytorch_model-00008-of-00008.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5f34090895bf1372efef56be4d788ed55e89d975bcc9c1229008856f8a99d97
+size 6473075116

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,1002 @@

+{
+  "metadata": {
+    "total_size": 76043264000
+  },
+  "weight_map": {
+    "lm_head.weight": "pytorch_model-00008-of-00008.bin",
+    "model.embed_tokens.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.block_sparse_moe.gate.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.block_sparse_moe.gate.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.block_sparse_moe.gate.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.input_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.block_sparse_moe.gate.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.input_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.block_sparse_moe.gate.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.input_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.block_sparse_moe.gate.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.input_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.block_sparse_moe.gate.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.input_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.block_sparse_moe.gate.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.input_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.16.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.16.block_sparse_moe.gate.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.input_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00004-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.block_sparse_moe.gate.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.input_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.block_sparse_moe.gate.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.input_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.block_sparse_moe.gate.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.input_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.block_sparse_moe.gate.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.block_sparse_moe.gate.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.input_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.block_sparse_moe.gate.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.21.input_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00005-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.block_sparse_moe.gate.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.input_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.block_sparse_moe.gate.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.input_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.block_sparse_moe.gate.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.input_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.block_sparse_moe.gate.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.input_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00006-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.block_sparse_moe.gate.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.input_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.block_sparse_moe.gate.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.input_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.block_sparse_moe.gate.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.input_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.29.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.29.block_sparse_moe.gate.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.input_layernorm.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00007-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.3.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.3.block_sparse_moe.gate.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.input_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.block_sparse_moe.gate.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.input_layernorm.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.block_sparse_moe.gate.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.input_layernorm.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00008-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.block_sparse_moe.gate.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.input_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.block_sparse_moe.gate.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.input_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.block_sparse_moe.gate.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.input_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.block_sparse_moe.gate.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.block_sparse_moe.gate.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.8.input_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00002-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.0.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.0.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.0.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.1.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.1.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.1.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.2.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.2.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.2.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.3.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.3.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.3.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.4.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.4.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.4.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.5.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.5.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.5.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.6.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.6.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.6.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.7.w1.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.7.w2.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.experts.7.w3.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.block_sparse_moe.gate.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.input_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00003-of-00008.bin",
+    "model.norm.weight": "pytorch_model-00008-of-00008.bin"
+  }
+}

qwen.tiktoken ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

tokenization_qwen.py ADDED Viewed

	@@ -0,0 +1,230 @@

+# Copyright (c) Alibaba Cloud.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tokenization classes for QWen."""
+import base64
+import logging
+import os
+import unicodedata
+from typing import Collection, Dict, List, Set, Tuple, Union
+import tiktoken
+from transformers import PreTrainedTokenizer, AddedToken
+logger = logging.getLogger(__name__)
+# File name of the tiktoken vocabulary, exposed on the tokenizer class as `vocab_files_names`.
+VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
+# Pre-tokenization split pattern handed to `tiktoken.Encoding` as `pat_str`.
+PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+# Surface forms of the three named special tokens.
+ENDOFTEXT = "<|endoftext|>"
+IMSTART = "<|im_start|>"
+IMEND = "<|im_end|>"
+# as the default behavior is changed to allow special tokens in
+# regular texts, the surface forms of special tokens need to be
+# as different as possible to minimize the impact
+EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
+# Order matters: the tokenizer enumerates this tuple to assign special-token ids,
+# so ENDOFTEXT receives the lowest special id, followed by IMSTART, IMEND and the extras.
+SPECIAL_TOKENS = (
+    ENDOFTEXT,
+    IMSTART,
+    IMEND,
+) + EXTRAS
+def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
+    return {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in contents.splitlines() if line)
+    }
+class QWenTokenizer(PreTrainedTokenizer):
+    """QWen tokenizer."""
+    vocab_files_names = VOCAB_FILES_NAMES
+    def __init__(
+        self,
+        vocab_file,
+        errors="replace",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.errors = errors  # how to handle errors in decoding
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+        self.special_tokens = {
+            token: index
+            for index, token in enumerate(
+                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
+            )
+        }
+        enc = tiktoken.Encoding(
+            "Qwen",
+            pat_str=PAT_STR,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        assert (
+            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+        self.decoder = {
+            v: k for k, v in self.mergeable_ranks.items()
+        }  # type: dict[int, bytes|str]
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
+        self.tokenizer = enc  # type: tiktoken.Encoding
+        self.eod_id = self.tokenizer.eot_token
+        self.im_start_id = self.special_tokens[IMSTART]
+        self.im_end_id = self.special_tokens[IMEND]
+    def __len__(self) -> int:
+        return self.tokenizer.n_vocab
+    def get_vocab(self) -> Dict[bytes, int]:
+        return self.mergeable_ranks
+    def convert_tokens_to_ids(
+        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+    ) -> List[int]:
+        ids = []
+        if isinstance(tokens, (str, bytes)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.mergeable_ranks.get(tokens)
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.mergeable_ranks.get(token))
+        return ids
+    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+        if not special_tokens and new_tokens:
+            raise ValueError('Adding regular tokens is not supported')
+        for token in new_tokens:
+            surface_form = token.content if isinstance(token, AddedToken) else token
+            if surface_form not in SPECIAL_TOKENS:
+                raise ValueError('Adding unknown special tokens is not supported')
+        return 0
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        """
+        Save only the vocabulary of the tokenizer (vocabulary).
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        file_path = os.path.join(save_directory, "qwen.tiktoken")
+        with open(file_path, "w", encoding="utf8") as w:
+            for k, v in self.mergeable_ranks.items():
+                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                w.write(line)
+        return (file_path,)
+    def tokenize(
+        self,
+        text: str,
+        allowed_special: Union[Set, str] = "all",
+        disallowed_special: Union[Collection, str] = (),
+        **kwargs,
+    ) -> List[Union[bytes, str]]:
+        """
+        Converts a string in a sequence of tokens.
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            allowed_special (`Literal["all"]` or `set`):
+                The surface forms of the tokens to be encoded as special tokens in regular texts.
+                Default to "all".
+            disallowed_special (`Literal["all"]` or `Collection`):
+                The surface forms of the tokens that should not be in regular texts and trigger errors.
+                Default to an empty tuple.
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the underlying model specific encode method.
+        Returns:
+            `List[bytes|str]`: The list of tokens.
+        """
+        tokens = []
+        text = unicodedata.normalize("NFC", text)
+        # this implementation takes a detour: text -> token id -> token surface forms
+        for t in self.tokenizer.encode(
+            text, allowed_special=allowed_special, disallowed_special=disallowed_special
+        ):
+            tokens.append(self.decoder[t])
+        return tokens
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """
+        Converts a sequence of tokens in a single string.
+        """
+        text = ""
+        temp = b""
+        for t in tokens:
+            if isinstance(t, str):
+                if temp:
+                    text += temp.decode("utf-8", errors=self.errors)
+                    temp = b""
+                text += t
+            elif isinstance(t, bytes):
+                temp += t
+            else:
+                raise TypeError("token should only be of type types or str")
+        if temp:
+            text += temp.decode("utf-8", errors=self.errors)
+        return text
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Converts an id to a token, special tokens included"""
+        if index in self.decoder:
+            return self.decoder[index]
+        raise ValueError("unknown ids")
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Converts a token to an id using the vocab, special tokens included"""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError("unknown token")
+    def _tokenize(self, text: str, **kwargs):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+        Do NOT take care of added tokens.
+        """
+        raise NotImplementedError
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        errors: str = None,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i < self.eod_id]
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "model_max_length": 999999999999999999,
+  "tokenizer_class": "QWenTokenizer",
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_qwen.QWenTokenizer",
+      null
+      ]
+  }
+}

visual.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:219ab65695072fc179a29903587e9744b124aee1bd4d09ec960d22d81f207450
+size 3871401097

visual.py ADDED Viewed

	@@ -0,0 +1,428 @@

+# Copyright (c) Alibaba Cloud.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from collections import OrderedDict
+import math
+import requests
+from io import BytesIO
+from functools import partial
+from PIL import Image
+from typing import Callable, Optional, Sequence, Tuple, List
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.init import trunc_normal_
+from torchvision import transforms
+from torchvision.transforms import InterpolationMode
+from einops import rearrange
+def get_abs_pos(abs_pos, tgt_size):
+    # abs_pos: L, C
+    # tgt_size: M
+    # return: M, C
+    src_size = int(math.sqrt(abs_pos.size(0)))
+    tgt_size = int(math.sqrt(tgt_size))
+    dtype = abs_pos.dtype
+    if src_size != tgt_size:
+        return F.interpolate(
+            abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
+            size=(tgt_size, tgt_size),
+            mode="bicubic",
+            align_corners=False,
+        ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)
+    else:
+        return abs_pos
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    grid_h = np.arange(grid_size, dtype=np.float32)
+    grid_w = np.arange(grid_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    grid = grid.reshape([2, 1, grid_size, grid_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token:
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+    emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+    return emb
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000**omega  # (D/2,)
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+    emb_sin = np.sin(out) # (M, D/2)
+    emb_cos = np.cos(out) # (M, D/2)
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+class Resampler(nn.Module):
+    """
+    A 2D perceiver-resampler network with one cross attention layers by
+        (grid_size**2) learnable queries and 2d sincos pos_emb
+    Outputs:
+        A tensor with the shape of (grid_size**2, embed_dim)
+    """
+    def __init__(
+            self,
+            grid_size,
+            embed_dim,
+            num_heads,
+            kv_dim=None,
+            norm_layer=nn.LayerNorm
+    ):
+        super().__init__()
+        self.num_queries = grid_size ** 2
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.pos_embed = nn.Parameter(
+            torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float()
+        ).requires_grad_(False)
+        self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
+        trunc_normal_(self.query, std=.02)
+        if kv_dim is not None and kv_dim != embed_dim:
+            self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
+        else:
+            self.kv_proj = nn.Identity()
+        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
+        self.ln_q = norm_layer(embed_dim)
+        self.ln_kv = norm_layer(embed_dim)
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+    def forward(self, x, attn_mask=None):
+        pos_embed = get_abs_pos(self.pos_embed, x.size(1))
+        x = self.kv_proj(x)
+        x = self.ln_kv(x).permute(1, 0, 2)
+        N = x.shape[1]
+        q = self.ln_q(self.query)
+        out = self.attn(
+            self._repeat(q, N) + self.pos_embed.unsqueeze(1),
+            x + pos_embed.unsqueeze(1),
+            x,
+            attn_mask=attn_mask)[0]
+        return out.permute(1, 0, 2)
+    def _repeat(self, query, N: int):
+        return query.unsqueeze(1).repeat(1, N, 1)
+class VisualAttention(nn.Module):
+    """self-attention layer class.
+    Self-attention layer takes input with size [s, b, h]
+    and returns output of the same size.
+    """
+    def __init__(self, embed_dim, num_heads,
+                 bias=True, kdim=None, vdim=None):
+        super(VisualAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+        self.num_heads = num_heads
+        # Per attention head and per partition values.
+        assert embed_dim % num_heads == 0
+        self.hidden_size_per_attention_head = embed_dim // num_heads
+        self.num_attention_heads_per_partition = num_heads
+        self.hidden_size_per_partition = embed_dim
+        # Strided linear layer.
+        assert self._qkv_same_embed_dim, 'Only Support SelfAttention Currently'
+        self.in_proj = nn.Linear(embed_dim, 3 * embed_dim)
+        self.out_proj = nn.Linear(embed_dim, embed_dim)
+        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+    def forward(self, query, key, value, attn_mask = None):
+        # query/key/value: [sq, b, h]
+        sq, b, _ = query.size()
+        assert query is key, 'Only Support Self-Attention Currently'
+        sk = sq
+        mixed_x_layer = self.in_proj(query)
+        # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
+        new_tensor_shape = mixed_x_layer.size()[:-1] + \
+            (self.num_attention_heads_per_partition,
+             3 * self.hidden_size_per_attention_head)
+        mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+        # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
+        query_layer, key_layer, value_layer = mixed_x_layer.split(
+            self.hidden_size_per_attention_head, dim=-1)
+        # [sq, b, np, hn] -> [sq, b * np, hn]
+        query_layer = query_layer.view(sq,
+            b * self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head).transpose(0, 1)
+        # [sk, b, np, hn] -> [sk, b * np, hn]
+        key_layer = key_layer.view(sk,
+            b * self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head).transpose(0, 1)
+        q_scaled = query_layer / self.norm_factor
+        if attn_mask is not None:
+            attention_probs = torch.baddbmm(attn_mask, q_scaled, key_layer.transpose(-2, -1))
+        else:
+            attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1))
+        attention_probs = attention_probs.softmax(dim=-1)
+        value_layer = value_layer.view(sk,
+            b * self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head).transpose(0, 1)
+        # matmul: [b * np, sq, hn]
+        context_layer = torch.bmm(attention_probs, value_layer)
+        # change view [b, np, sq, hn]
+        context_layer = context_layer.view(b,
+            self.num_attention_heads_per_partition,
+            sq, self.hidden_size_per_attention_head)
+        # [b, np, sq, hn] --> [sq, b, np, hn]
+        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+        # [sq, b, np, hn] --> [sq, b, hp]
+        new_context_layer_shape = context_layer.size()[:-2] + \
+            (self.hidden_size_per_partition,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+        output = self.out_proj(context_layer)
+        return output
+class VisualAttentionBlock(nn.Module):
+    def __init__(
+            self,
+            d_model: int,
+            n_head: int,
+            mlp_ratio: float = 4.0,
+            act_layer: Callable = nn.GELU,
+            norm_layer: Callable = nn.LayerNorm,
+            is_cross_attention: bool = False,
+    ):
+        super().__init__()
+        self.ln_1 = norm_layer(d_model)
+        if is_cross_attention:
+            self.ln_1_kv = norm_layer(d_model)
+        self.ln_2 = norm_layer(d_model)
+        mlp_width = int(d_model * mlp_ratio)
+        self.attn = VisualAttention(d_model, n_head)
+        self.mlp = nn.Sequential(OrderedDict([
+            ("c_fc", nn.Linear(d_model, mlp_width)),
+            ("gelu", act_layer()),
+            ("c_proj", nn.Linear(mlp_width, d_model))
+        ]))
+    def attention(
+            self,
+            q_x: torch.Tensor,
+            k_x: Optional[torch.Tensor] = None,
+            v_x: Optional[torch.Tensor] = None,
+            attn_mask: Optional[torch.Tensor] = None,
+    ):
+        k_x = k_x if k_x is not None else q_x
+        v_x = v_x if v_x is not None else q_x
+        attn_mask = attn_mask.to(q_x.dtype) if attn_mask is not None else None
+        return self.attn(q_x, k_x, v_x, attn_mask=attn_mask)
+    def forward(
+            self,
+            q_x: torch.Tensor,
+            k_x: Optional[torch.Tensor] = None,
+            v_x: Optional[torch.Tensor] = None,
+            attn_mask: Optional[torch.Tensor] = None,
+    ):
+        k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None
+        v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None
+        x = q_x + self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)
+        x = x + self.mlp(self.ln_2(x))
+        return x
+class TransformerBlock(nn.Module):
+    def __init__(
+            self,
+            width: int,
+            layers: int,
+            heads: int,
+            mlp_ratio: float = 4.0,
+            act_layer: Callable = nn.GELU,
+            norm_layer: Callable = nn.LayerNorm,
+    ):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.ModuleList([
+            VisualAttentionBlock(
+                width, heads, mlp_ratio, act_layer=act_layer, norm_layer=norm_layer)
+            for _ in range(layers)
+        ])
+    def get_cast_dtype(self) -> torch.dtype:
+        return self.resblocks[0].mlp.c_fc.weight.dtype
+    def get_cast_device(self) -> torch.device:
+        return self.resblocks[0].mlp.c_fc.weight.device
+    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
+        for r in self.resblocks:
+            x = r(x, attn_mask=attn_mask)
+        return x
+class VisionTransformer(nn.Module):
+    def __init__(
+            self,
+            image_size: int = 448,
+            patch_size: int = 14,
+            width: int = 1664,
+            layers: int = 48,
+            heads: int = 16,
+            mlp_ratio: float = 4.9231,
+            n_queries: int = 256,
+            output_dim: int = 4096,
+            **kwargs
+    ):
+        super().__init__()
+        image_height, image_width = self.image_size = (image_size, image_size)
+        patch_height, patch_width = self.patch_size = (patch_size, patch_size)
+        self.grid_size = (image_height // patch_height, image_width // patch_width)
+        self.output_dim = output_dim
+        mean = (0.48145466, 0.4578275, 0.40821073)
+        std = (0.26862954, 0.26130258, 0.27577711)
+        self.image_transform = transforms.Compose([
+            transforms.Resize(
+                (image_size, image_size),
+                interpolation=InterpolationMode.BICUBIC
+            ),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=mean, std=std),
+        ])
+        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
+        # class embeddings and positional embeddings
+        scale = width ** -0.5
+        self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
+        norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        act_layer = nn.GELU
+        self.ln_pre = norm_layer(width)
+        self.transformer = TransformerBlock(
+            width,
+            layers,
+            heads,
+            mlp_ratio,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+        )
+        self.attn_pool = Resampler(
+            grid_size=int(math.sqrt(n_queries)),
+            embed_dim=4096,
+            num_heads=4096 // 128,
+            kv_dim=width,
+            norm_layer=norm_layer,
+        )
+        self.ln_post = norm_layer(4096)
+        self.proj = nn.Parameter((output_dim** -0.5) * torch.randn(4096, output_dim))
+    def forward(self, x: torch.Tensor):
+        x = x.to(
+            dtype=self.transformer.get_cast_dtype(),
+            device=self.transformer.get_cast_device(),
+        )
+        # to patches
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        x = x + get_abs_pos(self.positional_embedding, x.size(1))
+        x = self.ln_pre(x)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.attn_pool(x)
+        x = self.ln_post(x)
+        x = x @ self.proj
+        return x
+    def encode(self, image_paths):
+        images = []
+        for image_path in image_paths:
+            if isinstance(image_path, Image.Image):
+                image = image_path
+            elif image_path.startswith("http://") or image_path.startswith("https://"):
+                image = Image.open(requests.get(image_path, stream=True).raw)
+            else:
+                image = Image.open(image_path)
+            image = image.convert("RGB")
+            images.append(self.image_transform(image))
+        images = torch.stack(images, dim=0)
+        return self(images)