feat: ✨ Uploaded artifacts
Browse files- config.json +37 -0
- mergekit_moe_config.yml +101 -0
- model-00001-of-00006.safetensors +3 -0
- model-00002-of-00006.safetensors +3 -0
- model-00003-of-00006.safetensors +3 -0
- model-00004-of-00006.safetensors +3 -0
- model-00005-of-00006.safetensors +3 -0
- model-00006-of-00006.safetensors +3 -0
- model.safetensors.index.json +1 -0
- special_tokens_map.json +24 -0
- tokenizer.json +0 -0
- tokenizer_config.json +47 -0
config.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "nitic-nlp-team/webnavix-llama-base",
|
3 |
+
"architectures": [
|
4 |
+
"MixtralForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 1,
|
9 |
+
"eos_token_id": 2,
|
10 |
+
"head_dim": 128,
|
11 |
+
"hidden_act": "silu",
|
12 |
+
"hidden_size": 2560,
|
13 |
+
"initializer_range": 0.02,
|
14 |
+
"intermediate_size": 6912,
|
15 |
+
"max_position_embeddings": 4096,
|
16 |
+
"mlp_bias": false,
|
17 |
+
"model_type": "mixtral",
|
18 |
+
"num_attention_heads": 20,
|
19 |
+
"num_experts_per_tok": 2,
|
20 |
+
"num_hidden_layers": 32,
|
21 |
+
"num_key_value_heads": 20,
|
22 |
+
"num_local_experts": 8,
|
23 |
+
"output_router_logits": false,
|
24 |
+
"pad_token_id": 0,
|
25 |
+
"pretraining_tp": 1,
|
26 |
+
"rms_norm_eps": 1e-05,
|
27 |
+
"rope_scaling": null,
|
28 |
+
"rope_theta": 10000.0,
|
29 |
+
"router_aux_loss_coef": 0.001,
|
30 |
+
"router_jitter_noise": 0.0,
|
31 |
+
"sliding_window": null,
|
32 |
+
"tie_word_embeddings": false,
|
33 |
+
"torch_dtype": "bfloat16",
|
34 |
+
"transformers_version": "4.47.1",
|
35 |
+
"use_cache": false,
|
36 |
+
"vocab_size": 32000
|
37 |
+
}
|
mergekit_moe_config.yml
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_model: nitic-nlp-team/webnavix-llama-base
|
2 |
+
gate_mode: hidden
|
3 |
+
dtype: bfloat16
|
4 |
+
experts_per_token: 2
|
5 |
+
experts:
|
6 |
+
- source_model: /content/drive/MyDrive/Projects/nitic-nlp-team/webnavix/checkpoints/webnavix/nitic-nlp-team/webnavix-llama-ai-tools/checkpoint-500/
|
7 |
+
positive_prompts:
|
8 |
+
- "Could you please open brandmark?"
|
9 |
+
- "Please help me to create a logo for my brand."
|
10 |
+
- "Change the font size - 2.5, Light Peach color & let the font style be as it is."
|
11 |
+
- "Open Quillbot website."
|
12 |
+
- 'Can you please open the website "Hugging face" stable diffusion?'
|
13 |
+
- "Open bard and login with below credentials: \n\tID - webtasks.navigator@gmail.com \n\tPassword - KEG24qweUHij%^"
|
14 |
+
- "Ask bard to summarize the below news from BBC news \\n\\t\\n\\t- Modi in US: Elon Musk says Tesla to come to India 'as soon as possible'"
|
15 |
+
- 'I want to create a mood board on "African Savanna".'
|
16 |
+
- "Open copy.ai website."
|
17 |
+
- "Got any creative ideas for a 10-year old's birthday?"
|
18 |
+
- source_model: /content/drive/MyDrive/Projects/nitic-nlp-team/webnavix/checkpoints/webnavix/nitic-nlp-team/webnavix-llama-booking/checkpoint-1050/
|
19 |
+
positive_prompts:
|
20 |
+
- "Open momondo.in and login with google using the below details: \\n\\t\\n\\tId: webtasks.navigator@gmail.com \\n\\tPassword: KEG24qweUHij%^"
|
21 |
+
- "Great, How many people will be in your party?"
|
22 |
+
- "Open Cheaptickets website."
|
23 |
+
- "Send me cruise options from Canada/New England for 1st July to 8th July."
|
24 |
+
- "I am searching for one-way flights from Azerbaijan to Turkey."
|
25 |
+
- "Date of journey: 30th June. Number of travelers: 4"
|
26 |
+
- "It will be picked up in Toronto (and vicinity), Ontario, Canada on June 8th at 10:00 AM. Drop-off will take place at 10 AM on 10th June."
|
27 |
+
- "Please open Vrbo website."
|
28 |
+
- "In which location?"
|
29 |
+
- "What will be the price for the 1st option?"
|
30 |
+
- source_model: /content/drive/MyDrive/Projects/nitic-nlp-team/webnavix/checkpoints/webnavix/nitic-nlp-team/webnavix-llama-composing/checkpoint-500/
|
31 |
+
positive_prompts:
|
32 |
+
- "Please open Wattpad."
|
33 |
+
- "Create a new story"
|
34 |
+
- "Below in the text box, write: \\n\\t \\n\\tWhispers in the wind, \\n\\tA dance of words unseen, \\n\\tEmotions unfurling, \\n\\tIn the spaces between. \\n\\t \\n\\tLove's tender embrace, \\n\\tOr heartache's bitter sting, \\n\\tIn ...\") Please select the best action using the correct format, do not provide any other information or explanation."
|
35 |
+
- 'Click on "Publish" and tick the "*You must complete this field to proceed."'
|
36 |
+
- 'Open a new Google Docs document and create a with the bulleted list with the title "Healthy Eating Habits."'
|
37 |
+
- "Add the following bullet points to the list: \\n\\t\\n\\tPrioritize tasks based on importance and urgency. \\n\\t\\n\\tBreak larger tasks into smaller, manageable chunks. \\n\\t\\n\\tSet realistic goals and deadlines. \\n\\t\\n..."
|
38 |
+
- 'Select "Untitled Part 1" paste this "The Enchanted Gears" and click on "Save."'
|
39 |
+
- "I want to create a blog in a few words on Beginner's Guide to Meditation: Finding Inner Peace."
|
40 |
+
- 'Let''s add a subtitle to the document stating, "Enhancing Collaboration and Productivity."'
|
41 |
+
- "For the title, let's use Georgia with a font size of 28, and for the subtitle, let's use Tahoma with a font size of 18."
|
42 |
+
- source_model: /content/drive/MyDrive/Projects/nitic-nlp-team/webnavix/checkpoints/webnavix/nitic-nlp-team/webnavix-llama-information-lookup/checkpoint-500/
|
43 |
+
positive_prompts:
|
44 |
+
- "Please open the HowStuffWorks website."
|
45 |
+
- "Search for the article titled 'How to Prepare for a Hurricane.'"
|
46 |
+
- "Name the contributing writers in this article."
|
47 |
+
- "What is the Pythagorean theorem?"
|
48 |
+
- "Search 'Trading for Beginners'."
|
49 |
+
- "Show me some articles"
|
50 |
+
- 'Can you open the website of the Central Intelligence Agency? [00:34] Please go to country "China". [01:06] What is the population of China?'
|
51 |
+
- "Could you tell me the major rivers in China, listed by length in kilometers?"
|
52 |
+
- 'Go to the "New on Britannica " section. [00:47] Open the article on "Sister Rosetta Tharpe." [01:01] Who was Sister Rosetta Tharpe? [01:39] Whose daughter was Sister Rosetta Tharpe? [02:11] At what age did Sister Rosetta Tharpe start singing and playing the guitar?'
|
53 |
+
- "When did she officially join Lucky Millinder's swing band?"
|
54 |
+
- source_model: /content/drive/MyDrive/Projects/nitic-nlp-team/webnavix/checkpoints/webnavix/nitic-nlp-team/webnavix-llama-shopping/checkpoint-400/
|
55 |
+
positive_prompts:
|
56 |
+
- "Can you please open the Grubhub website?"
|
57 |
+
- "Can you show me some of their Neighborhood Gems options?"
|
58 |
+
- "I am looking for DSLR camera."
|
59 |
+
- "Okay. May I know the price range?"
|
60 |
+
- "Can you find a Samsung 5G smartphone?"
|
61 |
+
- "My budget is between Rs25000 to 30000."
|
62 |
+
- "Select Peach colour and what is the price?"
|
63 |
+
- "What will be the delivery date for the last one if I purchase it today?"
|
64 |
+
- "Please open the eBay website."
|
65 |
+
- "Look for 3 width and 60 gauge."
|
66 |
+
- source_model: /content/drive/MyDrive/Projects/nitic-nlp-team/webnavix/checkpoints/webnavix/nitic-nlp-team/webnavix-llama-social-interaction/checkpoint-150/
|
67 |
+
positive_prompts:
|
68 |
+
- "Open Discourse Org & sign in with google using the below credentials: \\n\\tID - webtasks.navigator@gmail.com \\n\\tPassword - KEG24qweUHij%^"
|
69 |
+
- "Open “Gaming” categories and select 'new topic'."
|
70 |
+
- "Sure, for the title add: \\n\\tBest free games to-play on PC"
|
71 |
+
- "For description add the text below: \\n\\tLooking for some fantastic free games to enjoy on your PC? Look no further! Here’s a list of the best free games that offer incredible experiences without brea..."
|
72 |
+
- "Please create the Topic."
|
73 |
+
- "Browse games & send me some recommended Games"
|
74 |
+
- "How many followers does it have?"
|
75 |
+
- "Go to videos and open the topic “Recommend a great YouTube video”."
|
76 |
+
- "Add a server on discord then select “Create my own” and Please name it as “pina colada's server”."
|
77 |
+
- "Now please go to the “Mirage” server and create an event."
|
78 |
+
- source_model: /content/drive/MyDrive/Projects/nitic-nlp-team/webnavix/checkpoints/webnavix/nitic-nlp-team/webnavix-llama-summarizing/checkpoint-450/
|
79 |
+
positive_prompts:
|
80 |
+
- "Open the Second one and please summarize this article."
|
81 |
+
- 'Please open the New Yorker website. [01:04] Click on "Goings On" [01:24] Open the link to the first article [01:56] Please summarize this article briefly.'
|
82 |
+
- "Please summarize the first two paragraphs."
|
83 |
+
- "Search Jalapeno Falafels Recipe [00:57] Please summarize this article name “Jalapeno Falafels Recipe”"
|
84 |
+
- "Summarize the above article in few lines."
|
85 |
+
- "Open the 2nd article and summarize the first 2 paragraphs of the article."
|
86 |
+
- "Show me few articles and summarize for me."
|
87 |
+
- "Open the Third one and please summarize this article."
|
88 |
+
- "Please summarize the first two paragraphs."
|
89 |
+
- "Provide me with a list of the articles."
|
90 |
+
- source_model: /content/drive/MyDrive/Projects/nitic-nlp-team/webnavix/checkpoints/webnavix/nitic-nlp-team/webnavix-llama-task-management/checkpoint-450/
|
91 |
+
positive_prompts:
|
92 |
+
- "Open the Trello website and please search for “Annotations”."
|
93 |
+
- "Go to “Gmail” and Add “Template”. Can you tell me the title name? Email Empowerment."
|
94 |
+
- "Please Select Notification as watching."
|
95 |
+
- "Create a Workspace in Trello."
|
96 |
+
- "Create a new section as \"Do laundry\" & add the below tasks: \\n\\t- Sort and wash clothes. \\n\\t- Dry and fold clean laundry. \\n\\t- Iron or steam clothes if necessary. \\n\\t- Put away clean clothes in their de..."
|
97 |
+
- "What will be their Due date? This weekend."
|
98 |
+
- "Now, please add the following tasks to the below sections:\\n\\tPlanning\\n\\tTasks: \\n\\t\\n\\t- Research home renovation ideas online \\n\\t\\n\\t- Create a mood board for design inspiration\\n\\t2. Budgeting \\n\\t\\n\\tTasks: \\n\\t..."
|
99 |
+
- "Do you want any of the tasks to be set as a priority?"
|
100 |
+
- "Let's open the Todoist website."
|
101 |
+
- "Could you provide me with the task name and description?"
|
model-00001-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:98121ebe544d0cfcccb231faccabe8c821662dc0c125611448475e7c324744f3
|
3 |
+
size 4972953912
|
model-00002-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3caac6089b6e8e06de100767966702d5b41647e8b6709603203141720792f32f
|
3 |
+
size 4969011496
|
model-00003-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0387270df6546b814d2355eead67c732afd57b7087159df8c7b46e4265c011fc
|
3 |
+
size 4986061816
|
model-00004-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0cde5b24c6b03bc4f92444a97de09f280130504ee5d2c017873d0be097a595b3
|
3 |
+
size 4969011640
|
model-00005-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bced034dde24294a5ebbc74b553d65a3e7b8816715c32d7cd14d38d9d4b2641f
|
3 |
+
size 4986061808
|
model-00006-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:039cc7c63062162d1233cb098a3a22cc078b49dd31da1fd9cad5606fa974733c
|
3 |
+
size 4303161160
|
model.safetensors.index.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"metadata": {"mergekit_version": "0.0.5.2", "total_size": 29186135040}, "weight_map": {"model.embed_tokens.weight": "model-00001-of-00006.safetensors", "model.layers.0.input_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00006.safetensors", "model.layers.0.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00006.safetensors", "model.layers.1.input_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00006.safetensors", "model.layers.1.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00006.safetensors", "model.layers.2.input_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00006.safetensors", "model.layers.2.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00006.safetensors", "model.layers.3.input_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00006.safetensors", "model.layers.3.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00006.safetensors", "model.layers.4.input_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00006.safetensors", "model.layers.4.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00006.safetensors", "model.layers.5.input_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00006.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00006.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00006.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00006.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00006.safetensors", "model.layers.5.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00006.safetensors", "model.layers.6.input_layernorm.weight": "model-00002-of-00006.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00006.safetensors", "model.layers.6.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00006.safetensors", "model.layers.7.input_layernorm.weight": "model-00002-of-00006.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00006.safetensors", "model.layers.7.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00006.safetensors", "model.layers.8.input_layernorm.weight": "model-00002-of-00006.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00006.safetensors", "model.layers.8.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00006.safetensors", "model.layers.9.input_layernorm.weight": "model-00002-of-00006.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00006.safetensors", "model.layers.9.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00006.safetensors", "model.layers.10.input_layernorm.weight": "model-00002-of-00006.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00006.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00006.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00006.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00006.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00006.safetensors", "model.layers.10.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00006.safetensors", "model.layers.11.input_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00006.safetensors", "model.layers.11.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00006.safetensors", "model.layers.12.input_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00006.safetensors", "model.layers.12.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00006.safetensors", "model.layers.13.input_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00006.safetensors", "model.layers.13.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00006.safetensors", "model.layers.14.input_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00006.safetensors", "model.layers.14.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00006.safetensors", "model.layers.15.input_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00006.safetensors", "model.layers.15.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00006.safetensors", "model.layers.16.input_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00006.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00006.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00006.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00006.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.4.w1.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.5.w1.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.6.w1.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.7.w1.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.4.w2.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.5.w2.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.6.w2.weight": "model-00004-of-00006.safetensors", "model.layers.16.block_sparse_moe.experts.7.w2.weight": "model-00004-of-00006.safetensors", "model.layers.17.input_layernorm.weight": "model-00004-of-00006.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.4.w3.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.5.w3.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.6.w3.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.7.w3.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.4.w1.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.5.w1.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.6.w1.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.7.w1.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.4.w2.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.5.w2.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.6.w2.weight": "model-00004-of-00006.safetensors", "model.layers.17.block_sparse_moe.experts.7.w2.weight": "model-00004-of-00006.safetensors", "model.layers.18.input_layernorm.weight": "model-00004-of-00006.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.4.w3.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.5.w3.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.6.w3.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.7.w3.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.4.w1.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.5.w1.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.6.w1.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.7.w1.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.4.w2.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.5.w2.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.6.w2.weight": "model-00004-of-00006.safetensors", "model.layers.18.block_sparse_moe.experts.7.w2.weight": "model-00004-of-00006.safetensors", "model.layers.19.input_layernorm.weight": "model-00004-of-00006.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.4.w3.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.5.w3.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.6.w3.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.7.w3.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.4.w1.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.5.w1.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.6.w1.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.7.w1.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.4.w2.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.5.w2.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.6.w2.weight": "model-00004-of-00006.safetensors", "model.layers.19.block_sparse_moe.experts.7.w2.weight": "model-00004-of-00006.safetensors", "model.layers.20.input_layernorm.weight": "model-00004-of-00006.safetensors", "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.4.w3.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.5.w3.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.6.w3.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.7.w3.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.4.w1.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.5.w1.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.6.w1.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.7.w1.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.4.w2.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.5.w2.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.6.w2.weight": "model-00004-of-00006.safetensors", "model.layers.20.block_sparse_moe.experts.7.w2.weight": "model-00004-of-00006.safetensors", "model.layers.21.input_layernorm.weight": "model-00004-of-00006.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00006.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00006.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00006.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00006.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.4.w3.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.5.w3.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.6.w3.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.7.w3.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.4.w1.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.5.w1.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.6.w1.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.7.w1.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.4.w2.weight": "model-00004-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.5.w2.weight": "model-00005-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.6.w2.weight": "model-00005-of-00006.safetensors", "model.layers.21.block_sparse_moe.experts.7.w2.weight": "model-00005-of-00006.safetensors", "model.layers.22.input_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.4.w3.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.5.w3.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.6.w3.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.7.w3.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.1.w1.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.2.w1.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.4.w1.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.5.w1.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.6.w1.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.7.w1.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.0.w2.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.4.w2.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.5.w2.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.6.w2.weight": "model-00005-of-00006.safetensors", "model.layers.22.block_sparse_moe.experts.7.w2.weight": "model-00005-of-00006.safetensors", "model.layers.23.input_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.4.w3.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.5.w3.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.6.w3.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.7.w3.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.1.w1.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.2.w1.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.4.w1.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.5.w1.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.6.w1.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.7.w1.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.0.w2.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.4.w2.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.5.w2.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.6.w2.weight": "model-00005-of-00006.safetensors", "model.layers.23.block_sparse_moe.experts.7.w2.weight": "model-00005-of-00006.safetensors", "model.layers.24.input_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.4.w3.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.5.w3.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.6.w3.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.7.w3.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.1.w1.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.2.w1.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.4.w1.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.5.w1.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.6.w1.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.7.w1.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.0.w2.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.4.w2.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.5.w2.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.6.w2.weight": "model-00005-of-00006.safetensors", "model.layers.24.block_sparse_moe.experts.7.w2.weight": "model-00005-of-00006.safetensors", "model.layers.25.input_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.4.w3.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.5.w3.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.6.w3.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.7.w3.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.1.w1.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.2.w1.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.4.w1.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.5.w1.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.6.w1.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.7.w1.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.0.w2.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.4.w2.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.5.w2.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.6.w2.weight": "model-00005-of-00006.safetensors", "model.layers.25.block_sparse_moe.experts.7.w2.weight": "model-00005-of-00006.safetensors", "model.layers.26.input_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.4.w3.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.5.w3.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.6.w3.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.7.w3.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.1.w1.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.2.w1.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.4.w1.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.5.w1.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.6.w1.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.7.w1.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.0.w2.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.4.w2.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.5.w2.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.6.w2.weight": "model-00005-of-00006.safetensors", "model.layers.26.block_sparse_moe.experts.7.w2.weight": "model-00005-of-00006.safetensors", "model.layers.27.input_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00006.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00006.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00006.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00006.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.4.w3.weight": "model-00005-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.5.w3.weight": "model-00005-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.6.w3.weight": "model-00005-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.7.w3.weight": "model-00005-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.4.w1.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.5.w1.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.6.w1.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.7.w1.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.4.w2.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.5.w2.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.6.w2.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.experts.7.w2.weight": "model-00006-of-00006.safetensors", "model.layers.28.input_layernorm.weight": "model-00006-of-00006.safetensors", "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00006.safetensors", "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00006.safetensors", "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00006.safetensors", "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00006.safetensors", "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.0.w3.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.1.w3.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.2.w3.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.3.w3.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.4.w3.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.5.w3.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.6.w3.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.7.w3.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.4.w1.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.5.w1.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.6.w1.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.7.w1.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.4.w2.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.5.w2.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.6.w2.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.experts.7.w2.weight": "model-00006-of-00006.safetensors", "model.layers.29.input_layernorm.weight": "model-00006-of-00006.safetensors", "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00006.safetensors", "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00006.safetensors", "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00006.safetensors", "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00006.safetensors", "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.0.w3.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.1.w3.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.2.w3.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.3.w3.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.4.w3.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.5.w3.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.6.w3.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.7.w3.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.4.w1.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.5.w1.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.6.w1.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.7.w1.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.4.w2.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.5.w2.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.6.w2.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.experts.7.w2.weight": "model-00006-of-00006.safetensors", "model.layers.30.input_layernorm.weight": "model-00006-of-00006.safetensors", "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00006.safetensors", "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00006.safetensors", "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00006.safetensors", "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00006.safetensors", "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.0.w3.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.1.w3.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.2.w3.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.3.w3.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.4.w3.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.5.w3.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.6.w3.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.7.w3.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.4.w1.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.5.w1.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.6.w1.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.7.w1.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.4.w2.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.5.w2.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.6.w2.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.experts.7.w2.weight": "model-00006-of-00006.safetensors", "model.layers.31.input_layernorm.weight": "model-00006-of-00006.safetensors", "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00006.safetensors", "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00006.safetensors", "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00006.safetensors", "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00006.safetensors", "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.0.w3.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.1.w3.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.2.w3.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.3.w3.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.4.w3.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.5.w3.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.6.w3.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.7.w3.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.4.w1.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.5.w1.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.6.w1.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.7.w1.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.4.w2.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.5.w2.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.6.w2.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.experts.7.w2.weight": "model-00006-of-00006.safetensors", "model.norm.weight": "model-00006-of-00006.safetensors", "lm_head.weight": "model-00006-of-00006.safetensors", "model.layers.0.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.1.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.2.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.3.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.4.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.5.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.6.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.7.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.8.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.9.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.10.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.11.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.12.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.13.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.14.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.15.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.16.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.17.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.18.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.19.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.20.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.21.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.22.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.23.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.24.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.25.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.26.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.27.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.28.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.29.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.30.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors", "model.layers.31.block_sparse_moe.gate.weight": "model-00006-of-00006.safetensors"}}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "</s>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": "<s>",
|
17 |
+
"unk_token": {
|
18 |
+
"content": "<unk>",
|
19 |
+
"lstrip": false,
|
20 |
+
"normalized": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"single_word": false
|
23 |
+
}
|
24 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"add_prefix_space": null,
|
5 |
+
"added_tokens_decoder": {
|
6 |
+
"0": {
|
7 |
+
"content": "<unk>",
|
8 |
+
"lstrip": false,
|
9 |
+
"normalized": false,
|
10 |
+
"rstrip": false,
|
11 |
+
"single_word": false,
|
12 |
+
"special": true
|
13 |
+
},
|
14 |
+
"1": {
|
15 |
+
"content": "<s>",
|
16 |
+
"lstrip": false,
|
17 |
+
"normalized": false,
|
18 |
+
"rstrip": false,
|
19 |
+
"single_word": false,
|
20 |
+
"special": true
|
21 |
+
},
|
22 |
+
"2": {
|
23 |
+
"content": "</s>",
|
24 |
+
"lstrip": false,
|
25 |
+
"normalized": false,
|
26 |
+
"rstrip": false,
|
27 |
+
"single_word": false,
|
28 |
+
"special": true
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"bos_token": "<s>",
|
32 |
+
"clean_up_tokenization_spaces": false,
|
33 |
+
"eos_token": "</s>",
|
34 |
+
"extra_special_tokens": {},
|
35 |
+
"legacy": false,
|
36 |
+
"max_length": 4096,
|
37 |
+
"model_max_length": 1000000000000000019884624838656,
|
38 |
+
"pad_token": "<s>",
|
39 |
+
"padding_side": "left",
|
40 |
+
"sp_model_kwargs": {},
|
41 |
+
"stride": 0,
|
42 |
+
"tokenizer_class": "LlamaTokenizer",
|
43 |
+
"truncation_side": "right",
|
44 |
+
"truncation_strategy": "longest_first",
|
45 |
+
"unk_token": "<unk>",
|
46 |
+
"use_default_system_prompt": false
|
47 |
+
}
|