diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2b57ea7d98162cb67f956f63b69923e387335a3b --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +--- +language: +- en +- zh +library_name: mlx +license: mit +pipeline_tag: text-generation +base_model: zai-org/GLM-5.1 +tags: +- mlx +--- diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..0093efaa15b9ee3b0d8799ec64933fe0897b6687 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,117 @@ +[gMASK] +{%- if tools -%} +{%- macro tool_to_json(tool) -%} + {%- set ns_tool = namespace(first=true) -%} + {{ '{' -}} + {%- for k, v in tool.items() -%} + {%- if k != 'defer_loading' and k != 'strict' -%} + {%- if not ns_tool.first -%}{{- ', ' -}}{%- endif -%} + {%- set ns_tool.first = false -%} + "{{ k }}": {{ v | tojson(ensure_ascii=False) }} + {%- endif -%} + {%- endfor -%} + {{- '}' -}} +{%- endmacro -%} +<|system|> +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within XML tags: + +{% for tool in tools %} +{%- if 'function' in tool -%} + {%- set tool = tool['function'] -%} +{%- endif -%} +{% if tool.defer_loading is not defined or not tool.defer_loading %} +{{ tool_to_json(tool) }} +{% endif %} +{% endfor %} + + +For each function call, output the function name and arguments within the following XML format: +{function-name}{arg-key-1}{arg-value-1}{arg-key-2}{arg-value-2}...{%- endif -%} +{%- macro visible_text(content) -%} + {%- if content is string -%} + {{- content }} + {%- elif content is iterable and content is not mapping -%} + {%- for item in content -%} + {%- if item is mapping and item.type == 'text' -%} + {{- item.text }} + {%- elif item is string -%} + {{- item }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{- content }} + {%- endif -%} +{%- endmacro -%} +{%- set ns = namespace(last_user_index=-1, thinking_indices='') -%} +{%- for m in messages %} + {%- if m.role == 'user' %} + {%- set ns.last_user_index = loop.index0 -%} + {%- elif m.role == 'assistant' %} + {%- if m.reasoning_content is string %} + {%- set ns.thinking_indices = ns.thinking_indices ~ ',' ~ ns.last_user_index ~ ',' -%} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- set ns.has_thinking = false -%} +{%- for m in messages -%} +{%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }}{% set ns.has_thinking = (',' ~ loop.index0 ~ ',') in ns.thinking_indices -%} +{%- elif m.role == 'assistant' -%} +<|assistant|> +{%- set content = visible_text(m.content) %} +{%- if m.reasoning_content is string %} + {%- set reasoning_content = m.reasoning_content %} +{%- elif '' in content %} + {%- set reasoning_content = content.split('')[0].split('')[-1] %} + {%- set content = content.split('')[-1] %} +{%- elif loop.index0 > ns.last_user_index and not (enable_thinking is defined and not enable_thinking) %} + {%- set reasoning_content = '' %} +{%- elif loop.index0 < ns.last_user_index and ns.has_thinking %} + {%- set reasoning_content = '' %} +{%- endif %} +{%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content is defined -%} +{{ '' + reasoning_content + ''}} +{%- else -%} +{{ '' }} +{%- endif -%} +{%- if content.strip() -%} +{{ content.strip() }} +{%- endif -%} +{% if m.tool_calls %} +{% for tc in m.tool_calls %} +{%- if tc.function %} + {%- set tc = tc.function %} +{%- endif %} +{{- '' + tc.name -}} +{% set _args = tc.arguments %}{% for k, v in _args.items() %}{{ k }}{{ v | tojson(ensure_ascii=False) if v is not string else v }}{% endfor %}{% endfor %} +{% endif %} +{%- elif m.role == 'tool' -%} +{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|observation|>' -}} +{%- endif %} +{%- if m.content is string -%} + {{- '' + m.content + '' -}} +{%- else -%} + {{- '\n' -}} + {% for tr in m.content %} + {%- for tool in tools -%} + {%- if 'function' in tool -%} + {%- set tool = tool['function'] -%} + {%- endif -%} + {%- if tool.name == tr.name -%} + {{- tool_to_json(tool) + '\n' -}} + {%- endif -%} + {%- endfor -%} + {%- endfor -%} + {{- '' -}} +{% endif -%} +{%- elif m.role == 'system' -%} +<|system|>{{ visible_text(m.content) }} +{%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + <|assistant|>{{- '' if (enable_thinking is defined and not enable_thinking) else '' -}} +{%- endif -%} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..afbcd72bfc565ea7e3563d72ed86c482f0cdd443 --- /dev/null +++ b/config.json @@ -0,0 +1,9419 @@ +{ + "architectures": [ + "GlmMoeDsaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": [ + 154820, + 154827, + 154829 + ], + "ep_size": 1, + "first_k_dense_replace": 3, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 6144, + "index_head_dim": 128, + "index_n_heads": 32, + "index_topk": 2048, + "indexer_rope_interleave": true, + "initializer_range": 0.02, + "intermediate_size": 12288, + "kv_lora_rank": 512, + "max_position_embeddings": 202752, + "model_type": "glm_moe_dsa", + "moe_intermediate_size": 2048, + "moe_layer_freq": 1, + "n_group": 1, + "n_routed_experts": 256, + "n_shared_experts": 1, + "norm_topk_prob": true, + "num_attention_heads": 64, + "num_experts_per_tok": 8, + "num_hidden_layers": 78, + "num_key_value_heads": 64, + "num_nextn_predict_layers": 1, + "pad_token_id": 154820, + "pretraining_tp": 1, + "q_lora_rank": 2048, + "qk_head_dim": 256, + "qk_nope_head_dim": 192, + "qk_rope_head_dim": 64, + "quantization": { + "group_size": 64, + "bits": 2, + "mode": "affine", + "model.embed_tokens": { + "group_size": 64, + "bits": 6, + "mode": "affine" + }, + "model.layers.0.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.mlp.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.mlp.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.mlp.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.mlp.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.mlp.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.mlp.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.mlp.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.mlp.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.mlp.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "lm_head": { + "group_size": 64, + "bits": 4, + "mode": "affine" + } + }, + "quantization_config": { + "group_size": 64, + "bits": 2, + "mode": "affine", + "model.embed_tokens": { + "group_size": 64, + "bits": 6, + "mode": "affine" + }, + "model.layers.0.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.0.mlp.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.mlp.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.0.mlp.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.1.mlp.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.mlp.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.1.mlp.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.2.mlp.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.mlp.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.2.mlp.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.3.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.3.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.4.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.4.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.5.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.5.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.6.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.6.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.7.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.7.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.8.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.8.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.9.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.9.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.10.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.10.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.11.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.11.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.12.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.12.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.13.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.13.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.14.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.14.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.15.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.15.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.16.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.16.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.17.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.17.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.18.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.18.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.19.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.19.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.20.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.20.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.21.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.21.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.22.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.22.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.23.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.23.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.24.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.24.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.25.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.25.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.26.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.26.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.27.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.27.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.28.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.28.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.29.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.29.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.30.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.30.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.31.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.31.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.32.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.32.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.33.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.33.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.34.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.34.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.35.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.35.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.36.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.36.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.37.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.37.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.38.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.38.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.39.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.39.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.40.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.40.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.41.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.41.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.42.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.42.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.43.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.43.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.44.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.44.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.45.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.45.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.46.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.46.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.47.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.47.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.48.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.48.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.49.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.49.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.50.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.50.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.51.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.51.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.52.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.52.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.53.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.53.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.54.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.54.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.55.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.55.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.56.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.56.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.57.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.57.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.58.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.58.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.59.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.59.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.60.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.60.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.61.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.61.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.62.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.62.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.63.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.63.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.64.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.64.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.65.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.65.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.66.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.66.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.67.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.67.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.68.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.68.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.69.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.69.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.70.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.70.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.71.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.71.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.72.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.72.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.73.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.73.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.74.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.74.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.75.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.75.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.76.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.76.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.q_a_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.q_b_proj": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.self_attn.kv_a_proj_with_mqa": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.self_attn.embed_q": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.self_attn.unembed_out": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.self_attn.o_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.self_attn.indexer.wq_b": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.self_attn.indexer.wk": { + "group_size": 64, + "bits": 8, + "mode": "affine" + }, + "model.layers.77.mlp.switch_mlp.down_proj": { + "group_size": 64, + "bits": 3, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.gate_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.up_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "model.layers.77.mlp.shared_experts.down_proj": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "lm_head": { + "group_size": 64, + "bits": 4, + "mode": "affine" + } + }, + "rms_norm_eps": 1e-05, + "rope_interleave": true, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "routed_scaling_factor": 2.5, + "scoring_func": "sigmoid", + "tie_word_embeddings": false, + "topk_group": 1, + "topk_method": "noaux_tc", + "transformers_version": "5.4.0", + "use_cache": true, + "v_head_dim": 256, + "vocab_size": 154880 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..453800a061bdc65b75b9dd99ecc66ede543dac89 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "eos_token_id": [ + 154820, + 154827, + 154829 + ], + "pad_token_id": 154820, + "temperature": 1.0, + "top_p": 0.95, + "transformers_version": "5.4.0" +} diff --git a/model-00001-of-00057.safetensors b/model-00001-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bc3b2706d937ed047dd8ee8ed2cf366244c867aa --- /dev/null +++ b/model-00001-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:445e6a6b2022f19315701854642ef934419c1b2b6477ce76cd4b304f1fa21f69 +size 5246709693 diff --git a/model-00002-of-00057.safetensors b/model-00002-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7787554b9f40011d0b0fdac0b2fbcae57a726df3 --- /dev/null +++ b/model-00002-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a219c84714aa2fa4789440a3f0b50ac3bf5b1054f73bf18cd7bf4d025849ea +size 4582454201 diff --git a/model-00003-of-00057.safetensors b/model-00003-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..598e96a75307718380d59965cf1e50737cfd607e --- /dev/null +++ b/model-00003-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df7b98e94fabce83092ca7ffb5b14de9f02cedd85d168681871dabc52920f31b +size 4582454219 diff --git a/model-00004-of-00057.safetensors b/model-00004-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f918dfc12b636d1e60ae09070caf1e128dcedd1f --- /dev/null +++ b/model-00004-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76915dc1da8ab5be31f23373f93c4588dbe7b1875d3778b209f5b31875de2a6a +size 5138374942 diff --git a/model-00006-of-00057.safetensors b/model-00006-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..545e6bd1b78d89be9c12ab08f807ba9d68064029 --- /dev/null +++ b/model-00006-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6202822d5356c069dab988816906756025a0ef963cd22e4f3de18acb939c90b1 +size 4582454252 diff --git a/model-00007-of-00057.safetensors b/model-00007-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a32dbe6571a6f5f6ba51cdafad0a3277ed8f80f2 --- /dev/null +++ b/model-00007-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e46c37bc9954c3d6f20a6985581c5b6f4dc11e6ea0a10ef623a8b4fa3158754d +size 5138375078 diff --git a/model-00008-of-00057.safetensors b/model-00008-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82f146b9a2ad6e68f3ff9b3f45b905d4fcb26668 --- /dev/null +++ b/model-00008-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9f7c86c7b3a188b71aadb3ca24cd2d66894403644aa26d5d8a06be1ac45fad6 +size 4582454273 diff --git a/model-00009-of-00057.safetensors b/model-00009-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..61a7e2218520707558b5e5891c4069d5a84fc766 --- /dev/null +++ b/model-00009-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc1c880c2f9f4d280208eaccc000f805e283b0eb114cc84775ab74626ac559d6 +size 4582454273 diff --git a/model-00010-of-00057.safetensors b/model-00010-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..566b9dc22184e2f646fdddc3bea9dcc4b3e2490b --- /dev/null +++ b/model-00010-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0777032c95f5b08bdfe019fe7f385adb152c23afcd6742516aecee008800c7d +size 5138375032 diff --git a/model-00011-of-00057.safetensors b/model-00011-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ad77db2f19884b7dec3f75b87b5d0c351678448c --- /dev/null +++ b/model-00011-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb2879256fe9ea0a207b0cfc8e15819565e8af1fe7b53f05fa42fa48428518b6 +size 4582454237 diff --git a/model-00013-of-00057.safetensors b/model-00013-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..10c4f64d243f187c3ee29e5dd1e75d7282db290e --- /dev/null +++ b/model-00013-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f0cfe7be9fdfa744ff157d380ff533b8ff88747b46b222d648ddaa8bf6f8cbc +size 5138375062 diff --git a/model-00014-of-00057.safetensors b/model-00014-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..80653cd00a2d896bf890c35fe55eab0138241621 --- /dev/null +++ b/model-00014-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90a5aceb50fa951256d7e357b2a56730ccf3018b2d7407f6914a0cedf4141db7 +size 4582454269 diff --git a/model-00015-of-00057.safetensors b/model-00015-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a7e6620a3fd216f85de5b7314688f2e8fb7b6ab4 --- /dev/null +++ b/model-00015-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5317ba5647f65fc3645f28db87f2fe3304b84ecf9a436a5519c5d8348d8116ce +size 4582454273 diff --git a/model-00016-of-00057.safetensors b/model-00016-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..04a82d1e633a12ff7f046a3052adefb9a2f2cad4 --- /dev/null +++ b/model-00016-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b01dad15394e8ad4ea6bc23272eb626fdf32e6fbf63b53a8b4944adaa2de7c83 +size 5138375060 diff --git a/model-00019-of-00057.safetensors b/model-00019-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2cc00a5009c7b040af44e88bddd1f755083d9586 --- /dev/null +++ b/model-00019-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61069d0f87ac2525b4c658b2105bf4bd6dc559febd3a0374825c79e31691db57 +size 5138375062 diff --git a/model-00022-of-00057.safetensors b/model-00022-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..41fdf10a4f8bd9b6aade179b60f24a7aef5cde19 --- /dev/null +++ b/model-00022-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4f574727e80564fdd4056bb9e0d2d7edbc1c54f29c9ae750bb74b4fdd63da2f +size 5138375034 diff --git a/model-00023-of-00057.safetensors b/model-00023-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..499f7ca489654b0265aaabc7d185e3316d1b12a1 --- /dev/null +++ b/model-00023-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9280c731ce24ec2516f06bb42edb239a7750b87963009dff926b9aa131386f78 +size 4582454249 diff --git a/model-00024-of-00057.safetensors b/model-00024-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8f234a355566167e45dc8104102fc8ecd5aba6d6 --- /dev/null +++ b/model-00024-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9d4d24e51ac29d9ed4610ffc22a78558ccc3b241b243d03ed7c00dc6e5f307f +size 4582454273 diff --git a/model-00026-of-00057.safetensors b/model-00026-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..153c4274877541aec91352b4e9bf161e86d02e2b --- /dev/null +++ b/model-00026-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db8373cfc58958d3b92826760b05b98d21b94088dda22743842acffa2c731533 +size 4582454279 diff --git a/model-00027-of-00057.safetensors b/model-00027-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eac1903bd7ac5e96aa721b9aeb89f4557f73ab3b --- /dev/null +++ b/model-00027-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab666ad80b216e4f5be23ce3c504d06f99f871d28d458a752e750a8f102d0342 +size 4582454273 diff --git a/model-00028-of-00057.safetensors b/model-00028-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6fc9b793c0c1beef6b0b673e40f49bf40e36bff --- /dev/null +++ b/model-00028-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d67fe2177173563278700cc708d391dfe1e69780147507c6729368c5a835b7 +size 5138375052 diff --git a/model-00029-of-00057.safetensors b/model-00029-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..40d52a588ad4d0392e6ada30079159ca1e9a7dd7 --- /dev/null +++ b/model-00029-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c26d8bca2ddd6d2b4037009fdcbb32d9b5a5cd47cd55779c6d44b94a88a58a8a +size 4582454241 diff --git a/model-00030-of-00057.safetensors b/model-00030-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..515f66406a8ddb865b6520e6eaf88e0c8fd606b0 --- /dev/null +++ b/model-00030-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb6ef637dcaa3292e02ba2fb942f6274d310fe0b50e6cd7b782c04b15db13561 +size 4582454273 diff --git a/model-00031-of-00057.safetensors b/model-00031-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..62e6ce089c61da987bb63d9d704677178e975521 --- /dev/null +++ b/model-00031-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b673867b74bbdb31dd7c558ed78f37c7383deb312ce41e1e25be7e0ad722678 +size 5138375074 diff --git a/model-00033-of-00057.safetensors b/model-00033-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..10388a7d2799115e11f29d917815973d53674c66 --- /dev/null +++ b/model-00033-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31dfbc00b5f31bb21c57e133e93bfe8c6b2596267ea2a80c4df6776a4f1c19c0 +size 4582454273 diff --git a/model-00034-of-00057.safetensors b/model-00034-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3f5927a23e40ab1befad91812c71093888c0ac95 --- /dev/null +++ b/model-00034-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3683b99ee031e032a9184a8479c99bd0568363e9f3c662299a5b425ebb1e8c4d +size 5138375074 diff --git a/model-00035-of-00057.safetensors b/model-00035-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..53d4fda2961cdaa9ff178a8abb00ca58adebe1a7 --- /dev/null +++ b/model-00035-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d738e152babed5bd2ea57964c0b435ab1a22647afea487faebdc314b6226a6f +size 4582454243 diff --git a/model-00036-of-00057.safetensors b/model-00036-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..44ad32151088fb69c0b2690dca45f27bd7fbe094 --- /dev/null +++ b/model-00036-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a147e66a7553cb0114fbbdd3584d9c8cd57d40e685327ae26f615af9c520bba6 +size 4582454273 diff --git a/model-00039-of-00057.safetensors b/model-00039-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..05c5e274b4255d87898b67d78f0c187750631b7d --- /dev/null +++ b/model-00039-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed7767792cb66cee619014e184781fb93486c771ef536630921b5688e194597b +size 4582454265 diff --git a/model-00040-of-00057.safetensors b/model-00040-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d877b6f663fb23d437e8a2421770934bcda2aacc --- /dev/null +++ b/model-00040-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:574ae674cdd665f4638b95878e7972a3ecefa1a91366f3312435a7c0ffcfb5c7 +size 5138375048 diff --git a/model-00042-of-00057.safetensors b/model-00042-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1d68f694db400072c0aa1d1032065fb7076c6e41 --- /dev/null +++ b/model-00042-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:260ceb4fdf714f69bd1dea3e9494c511fcee7de58af1da2086afc7d8e27a9136 +size 4582454269 diff --git a/model-00043-of-00057.safetensors b/model-00043-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b026a9339f5593f13b07169400c699a2958d72ed --- /dev/null +++ b/model-00043-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76746e0cb0a2238010295e83066796b95116fae4f1620bd62af22ee7dc91ab9c +size 5138375068 diff --git a/model-00045-of-00057.safetensors b/model-00045-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3a00c2ed11cb76ddb5bec502c322862a4e72033d --- /dev/null +++ b/model-00045-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e7a469be6d5a68b5d3130fde29bf7e364b2e7e38cbf7da42c4a5ce7777e5cb6 +size 4582454273 diff --git a/model-00046-of-00057.safetensors b/model-00046-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cf6cf920d6142858445d02a1d2f8a00f631f2964 --- /dev/null +++ b/model-00046-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed1f563a9b2ab9b723c2802b7786cc37839e3506da7610a94a2264317d91b0a6 +size 5138375026 diff --git a/model-00047-of-00057.safetensors b/model-00047-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8cfc858c82b90ed55e3fd80bac02667e9a46d2b2 --- /dev/null +++ b/model-00047-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c1e27670a171e1d37fbe52ce2694f2d330a200f453f2f3630fd5618d02bfe7 +size 4582454255 diff --git a/model-00048-of-00057.safetensors b/model-00048-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..416c8374bd63899b7bc17ecd97d86edf5afda0ef --- /dev/null +++ b/model-00048-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e8576f9ae9ecf8da1e63b37702506552d4692553d301c2e42f80a7a0bf7c3a9 +size 4582454273 diff --git a/model-00049-of-00057.safetensors b/model-00049-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8c1d7db8dad1b53c4edac85f3cdad6089a9ba485 --- /dev/null +++ b/model-00049-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c176219685208495e0d52adc47dd6d77b02267cacc731ab44002856dc1af35ff +size 5138375014 diff --git a/model-00050-of-00057.safetensors b/model-00050-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b22149e6abb502e1523f0460fe2a4c62bffd60a --- /dev/null +++ b/model-00050-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:783af00b9304b018be7411f83f3ac9da1b2c67dc934c0f8c6ae1ff5492344700 +size 4582454247 diff --git a/model-00051-of-00057.safetensors b/model-00051-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c3f1b7d8d5022924dc8faed3c272e12c9407a562 --- /dev/null +++ b/model-00051-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:100cf1bd912a6c8f0e566ba43a84eb05ab2a8229b34ef77008979cff3fd4c039 +size 4582454261 diff --git a/model-00052-of-00057.safetensors b/model-00052-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4fbf50475f3c93acbed897aa64a12c81aa67fadf --- /dev/null +++ b/model-00052-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6411369d1cce607f4e473cbf1a9bf7da3f7b78e17c2453b4b045476e2dfe175f +size 5138375020 diff --git a/model-00054-of-00057.safetensors b/model-00054-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..45d5bc9e8b26991f574f3411bc73a7fd5c6c2b79 --- /dev/null +++ b/model-00054-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:789b13896db0093b8b80736de9b909d50e90c839cd90c0c9714c2a5b1e76d435 +size 4582454273 diff --git a/model-00055-of-00057.safetensors b/model-00055-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e87148a8f6c48f107d08f5f7e55d0c978c69fff8 --- /dev/null +++ b/model-00055-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6850d99f8f24a4ac892712a83de95991cf06a78948e0c8c919f9161596dc837a +size 5138375030 diff --git a/model-00057-of-00057.safetensors b/model-00057-of-00057.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e2ebe7397f86812a505bae354c97cd4149120017 --- /dev/null +++ b/model-00057-of-00057.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33fd128f5bf031bbaafa0f7163bbe3903f7ae8e18ee16f4d6f23eef5338dd8c4 +size 2975604418 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..2d0e90fa7ba092d6cc834a1be599c48bd01bdd05 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,3960 @@ +{ + "metadata": { + "total_size": 270263390208, + "total_parameters": 743911218432 + }, + "weight_map": { + "lm_head.biases": "model-00057-of-00057.safetensors", + "lm_head.scales": "model-00057-of-00057.safetensors", + "lm_head.weight": "model-00057-of-00057.safetensors", + "model.embed_tokens.biases": "model-00001-of-00057.safetensors", + "model.embed_tokens.scales": "model-00001-of-00057.safetensors", + "model.embed_tokens.weight": "model-00001-of-00057.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.0.mlp.down_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.0.mlp.down_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.0.mlp.gate_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.0.mlp.gate_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.0.mlp.up_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.0.mlp.up_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.embed_q.biases": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.embed_q.scales": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.embed_q.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.indexer.k_norm.bias": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.indexer.k_norm.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.indexer.weights_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.indexer.wk.biases": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.indexer.wk.scales": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.indexer.wk.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.indexer.wq_b.biases": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.indexer.wq_b.scales": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.indexer.wq_b.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.kv_a_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.o_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.o_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.q_a_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.q_a_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.q_a_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.q_a_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.q_b_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.q_b_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.q_b_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.unembed_out.biases": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.unembed_out.scales": "model-00001-of-00057.safetensors", + "model.layers.0.self_attn.unembed_out.weight": "model-00001-of-00057.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.1.mlp.down_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.1.mlp.down_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.1.mlp.gate_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.1.mlp.gate_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.1.mlp.up_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.1.mlp.up_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.embed_q.biases": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.embed_q.scales": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.embed_q.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.indexer.k_norm.bias": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.indexer.k_norm.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.indexer.weights_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.indexer.wk.biases": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.indexer.wk.scales": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.indexer.wk.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.indexer.wq_b.biases": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.indexer.wq_b.scales": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.indexer.wq_b.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.kv_a_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.o_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.o_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.q_a_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.q_a_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.q_a_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.q_a_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.q_b_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.q_b_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.q_b_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.unembed_out.biases": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.unembed_out.scales": "model-00001-of-00057.safetensors", + "model.layers.1.self_attn.unembed_out.weight": "model-00001-of-00057.safetensors", + "model.layers.10.input_layernorm.weight": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.gate.e_score_correction_bias": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.gate.weight": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.biases": "model-00006-of-00057.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00057.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.biases": "model-00006-of-00057.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.scales": "model-00006-of-00057.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00007-of-00057.safetensors", + "model.layers.10.self_attn.embed_q.biases": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.embed_q.scales": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.embed_q.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.indexer.k_norm.bias": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.indexer.k_norm.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.indexer.weights_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.indexer.wk.biases": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.indexer.wk.scales": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.indexer.wk.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.indexer.wq_b.biases": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.indexer.wq_b.scales": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.indexer.wq_b.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.kv_a_layernorm.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.biases": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.scales": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.o_proj.biases": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.o_proj.scales": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.q_a_layernorm.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.q_a_proj.biases": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.q_a_proj.scales": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.q_a_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.q_b_proj.biases": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.q_b_proj.scales": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.q_b_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.unembed_out.biases": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.unembed_out.scales": "model-00006-of-00057.safetensors", + "model.layers.10.self_attn.unembed_out.weight": "model-00006-of-00057.safetensors", + "model.layers.11.input_layernorm.weight": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.gate.e_score_correction_bias": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.gate.weight": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.embed_q.biases": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.embed_q.scales": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.embed_q.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.indexer.k_norm.bias": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.indexer.k_norm.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.indexer.weights_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.indexer.wk.biases": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.indexer.wk.scales": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.indexer.wk.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.indexer.wq_b.biases": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.indexer.wq_b.scales": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.indexer.wq_b.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.kv_a_layernorm.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.biases": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.scales": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.o_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.o_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.q_a_layernorm.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.q_a_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.q_a_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.q_a_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.q_b_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.q_b_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.q_b_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.unembed_out.biases": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.unembed_out.scales": "model-00007-of-00057.safetensors", + "model.layers.11.self_attn.unembed_out.weight": "model-00007-of-00057.safetensors", + "model.layers.12.input_layernorm.weight": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.gate.e_score_correction_bias": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.gate.weight": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.biases": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.scales": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.biases": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.scales": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.biases": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.scales": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.biases": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.scales": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.biases": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.biases": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.scales": "model-00008-of-00057.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00008-of-00057.safetensors", + "model.layers.12.self_attn.embed_q.biases": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.embed_q.scales": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.embed_q.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.indexer.k_norm.bias": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.indexer.k_norm.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.indexer.weights_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.indexer.wk.biases": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.indexer.wk.scales": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.indexer.wk.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.indexer.wq_b.biases": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.indexer.wq_b.scales": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.indexer.wq_b.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.kv_a_layernorm.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.biases": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.scales": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.o_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.o_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.q_a_layernorm.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.q_a_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.q_a_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.q_a_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.q_b_proj.biases": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.q_b_proj.scales": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.q_b_proj.weight": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.unembed_out.biases": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.unembed_out.scales": "model-00007-of-00057.safetensors", + "model.layers.12.self_attn.unembed_out.weight": "model-00007-of-00057.safetensors", + "model.layers.13.input_layernorm.weight": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.gate.e_score_correction_bias": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.gate.weight": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.biases": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.scales": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.biases": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.scales": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.biases": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.scales": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.biases": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.scales": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.biases": "model-00008-of-00057.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00057.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.biases": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.scales": "model-00009-of-00057.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00009-of-00057.safetensors", + "model.layers.13.self_attn.embed_q.biases": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.embed_q.scales": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.embed_q.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.indexer.k_norm.bias": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.indexer.k_norm.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.indexer.weights_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.indexer.wk.biases": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.indexer.wk.scales": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.indexer.wk.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.indexer.wq_b.biases": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.indexer.wq_b.scales": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.indexer.wq_b.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.kv_a_layernorm.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.biases": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.scales": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.o_proj.biases": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.o_proj.scales": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.q_a_layernorm.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.q_a_proj.biases": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.q_a_proj.scales": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.q_a_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.q_b_proj.biases": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.q_b_proj.scales": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.q_b_proj.weight": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.unembed_out.biases": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.unembed_out.scales": "model-00008-of-00057.safetensors", + "model.layers.13.self_attn.unembed_out.weight": "model-00008-of-00057.safetensors", + "model.layers.14.input_layernorm.weight": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.gate.e_score_correction_bias": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.gate.weight": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.biases": "model-00009-of-00057.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00057.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.biases": "model-00009-of-00057.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.scales": "model-00009-of-00057.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00010-of-00057.safetensors", + "model.layers.14.self_attn.embed_q.biases": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.embed_q.scales": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.embed_q.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.indexer.k_norm.bias": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.indexer.k_norm.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.indexer.weights_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.indexer.wk.biases": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.indexer.wk.scales": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.indexer.wk.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.indexer.wq_b.biases": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.indexer.wq_b.scales": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.indexer.wq_b.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.kv_a_layernorm.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.biases": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.scales": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.o_proj.biases": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.o_proj.scales": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.q_a_layernorm.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.q_a_proj.biases": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.q_a_proj.scales": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.q_a_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.q_b_proj.biases": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.q_b_proj.scales": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.q_b_proj.weight": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.unembed_out.biases": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.unembed_out.scales": "model-00009-of-00057.safetensors", + "model.layers.14.self_attn.unembed_out.weight": "model-00009-of-00057.safetensors", + "model.layers.15.input_layernorm.weight": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.gate.e_score_correction_bias": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.gate.weight": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.embed_q.biases": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.embed_q.scales": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.embed_q.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.indexer.k_norm.bias": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.indexer.k_norm.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.indexer.weights_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.indexer.wk.biases": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.indexer.wk.scales": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.indexer.wk.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.indexer.wq_b.biases": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.indexer.wq_b.scales": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.indexer.wq_b.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.kv_a_layernorm.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.biases": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.scales": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.o_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.o_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.q_a_layernorm.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.q_a_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.q_a_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.q_a_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.q_b_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.q_b_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.q_b_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.unembed_out.biases": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.unembed_out.scales": "model-00010-of-00057.safetensors", + "model.layers.15.self_attn.unembed_out.weight": "model-00010-of-00057.safetensors", + "model.layers.16.input_layernorm.weight": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.gate.e_score_correction_bias": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.gate.weight": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.biases": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.scales": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.biases": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.scales": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.biases": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.scales": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.biases": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.scales": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.biases": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.scales": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.biases": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.scales": "model-00011-of-00057.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00011-of-00057.safetensors", + "model.layers.16.self_attn.embed_q.biases": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.embed_q.scales": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.embed_q.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.indexer.k_norm.bias": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.indexer.k_norm.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.indexer.weights_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.indexer.wk.biases": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.indexer.wk.scales": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.indexer.wk.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.indexer.wq_b.biases": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.indexer.wq_b.scales": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.indexer.wq_b.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.kv_a_layernorm.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.biases": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.scales": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.o_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.o_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.q_a_layernorm.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.q_a_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.q_a_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.q_a_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.q_b_proj.biases": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.q_b_proj.scales": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.q_b_proj.weight": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.unembed_out.biases": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.unembed_out.scales": "model-00010-of-00057.safetensors", + "model.layers.16.self_attn.unembed_out.weight": "model-00010-of-00057.safetensors", + "model.layers.17.input_layernorm.weight": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.gate.e_score_correction_bias": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.gate.weight": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.biases": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.scales": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.biases": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.scales": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.biases": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.scales": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.biases": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.scales": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.biases": "model-00011-of-00057.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.scales": "model-00011-of-00057.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.biases": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.scales": "model-00012-of-00057.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00012-of-00057.safetensors", + "model.layers.17.self_attn.embed_q.biases": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.embed_q.scales": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.embed_q.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.indexer.k_norm.bias": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.indexer.k_norm.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.indexer.weights_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.indexer.wk.biases": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.indexer.wk.scales": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.indexer.wk.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.indexer.wq_b.biases": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.indexer.wq_b.scales": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.indexer.wq_b.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.kv_a_layernorm.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.biases": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.scales": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.o_proj.biases": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.o_proj.scales": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.q_a_layernorm.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.q_a_proj.biases": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.q_a_proj.scales": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.q_a_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.q_b_proj.biases": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.q_b_proj.scales": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.q_b_proj.weight": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.unembed_out.biases": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.unembed_out.scales": "model-00011-of-00057.safetensors", + "model.layers.17.self_attn.unembed_out.weight": "model-00011-of-00057.safetensors", + "model.layers.18.input_layernorm.weight": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.gate.e_score_correction_bias": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.gate.weight": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.biases": "model-00012-of-00057.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.scales": "model-00012-of-00057.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.biases": "model-00012-of-00057.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.scales": "model-00012-of-00057.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00013-of-00057.safetensors", + "model.layers.18.self_attn.embed_q.biases": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.embed_q.scales": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.embed_q.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.indexer.k_norm.bias": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.indexer.k_norm.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.indexer.weights_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.indexer.wk.biases": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.indexer.wk.scales": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.indexer.wk.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.indexer.wq_b.biases": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.indexer.wq_b.scales": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.indexer.wq_b.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.kv_a_layernorm.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.biases": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.scales": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.o_proj.biases": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.o_proj.scales": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.q_a_layernorm.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.q_a_proj.biases": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.q_a_proj.scales": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.q_a_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.q_b_proj.biases": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.q_b_proj.scales": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.q_b_proj.weight": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.unembed_out.biases": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.unembed_out.scales": "model-00012-of-00057.safetensors", + "model.layers.18.self_attn.unembed_out.weight": "model-00012-of-00057.safetensors", + "model.layers.19.input_layernorm.weight": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.gate.e_score_correction_bias": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.gate.weight": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.embed_q.biases": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.embed_q.scales": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.embed_q.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.indexer.k_norm.bias": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.indexer.k_norm.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.indexer.weights_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.indexer.wk.biases": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.indexer.wk.scales": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.indexer.wk.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.indexer.wq_b.biases": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.indexer.wq_b.scales": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.indexer.wq_b.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.kv_a_layernorm.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.biases": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.scales": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.o_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.o_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.q_a_layernorm.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.q_a_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.q_a_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.q_a_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.q_b_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.q_b_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.q_b_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.unembed_out.biases": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.unembed_out.scales": "model-00013-of-00057.safetensors", + "model.layers.19.self_attn.unembed_out.weight": "model-00013-of-00057.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.2.mlp.down_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.2.mlp.down_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.2.mlp.gate_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.2.mlp.gate_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.2.mlp.up_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.2.mlp.up_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.embed_q.biases": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.embed_q.scales": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.embed_q.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.indexer.k_norm.bias": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.indexer.k_norm.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.indexer.weights_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.indexer.wk.biases": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.indexer.wk.scales": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.indexer.wk.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.indexer.wq_b.biases": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.indexer.wq_b.scales": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.indexer.wq_b.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.kv_a_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.o_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.o_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.q_a_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.q_a_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.q_a_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.q_a_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.q_b_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.q_b_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.q_b_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.unembed_out.biases": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.unembed_out.scales": "model-00001-of-00057.safetensors", + "model.layers.2.self_attn.unembed_out.weight": "model-00001-of-00057.safetensors", + "model.layers.20.input_layernorm.weight": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.gate.e_score_correction_bias": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.gate.weight": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.biases": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.scales": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.biases": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.scales": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.biases": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.scales": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.biases": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.scales": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.biases": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.scales": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.biases": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.scales": "model-00014-of-00057.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00014-of-00057.safetensors", + "model.layers.20.self_attn.embed_q.biases": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.embed_q.scales": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.embed_q.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.indexer.k_norm.bias": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.indexer.k_norm.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.indexer.weights_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.indexer.wk.biases": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.indexer.wk.scales": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.indexer.wk.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.indexer.wq_b.biases": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.indexer.wq_b.scales": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.indexer.wq_b.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.kv_a_layernorm.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.biases": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.scales": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.o_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.o_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.q_a_layernorm.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.q_a_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.q_a_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.q_a_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.q_b_proj.biases": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.q_b_proj.scales": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.q_b_proj.weight": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.unembed_out.biases": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.unembed_out.scales": "model-00013-of-00057.safetensors", + "model.layers.20.self_attn.unembed_out.weight": "model-00013-of-00057.safetensors", + "model.layers.21.input_layernorm.weight": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.gate.e_score_correction_bias": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.gate.weight": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.biases": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.scales": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.biases": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.scales": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.biases": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.scales": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.biases": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.scales": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.biases": "model-00014-of-00057.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.scales": "model-00014-of-00057.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.biases": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.scales": "model-00015-of-00057.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00015-of-00057.safetensors", + "model.layers.21.self_attn.embed_q.biases": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.embed_q.scales": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.embed_q.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.indexer.k_norm.bias": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.indexer.k_norm.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.indexer.weights_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.indexer.wk.biases": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.indexer.wk.scales": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.indexer.wk.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.indexer.wq_b.biases": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.indexer.wq_b.scales": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.indexer.wq_b.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.kv_a_layernorm.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.biases": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.scales": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.o_proj.biases": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.o_proj.scales": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.q_a_layernorm.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.q_a_proj.biases": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.q_a_proj.scales": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.q_a_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.q_b_proj.biases": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.q_b_proj.scales": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.q_b_proj.weight": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.unembed_out.biases": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.unembed_out.scales": "model-00014-of-00057.safetensors", + "model.layers.21.self_attn.unembed_out.weight": "model-00014-of-00057.safetensors", + "model.layers.22.input_layernorm.weight": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.gate.e_score_correction_bias": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.gate.weight": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.biases": "model-00015-of-00057.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.scales": "model-00015-of-00057.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.biases": "model-00015-of-00057.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.scales": "model-00015-of-00057.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00016-of-00057.safetensors", + "model.layers.22.self_attn.embed_q.biases": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.embed_q.scales": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.embed_q.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.indexer.k_norm.bias": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.indexer.k_norm.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.indexer.weights_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.indexer.wk.biases": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.indexer.wk.scales": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.indexer.wk.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.indexer.wq_b.biases": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.indexer.wq_b.scales": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.indexer.wq_b.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.kv_a_layernorm.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.biases": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.scales": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.o_proj.biases": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.o_proj.scales": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.q_a_layernorm.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.q_a_proj.biases": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.q_a_proj.scales": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.q_a_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.q_b_proj.biases": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.q_b_proj.scales": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.q_b_proj.weight": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.unembed_out.biases": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.unembed_out.scales": "model-00015-of-00057.safetensors", + "model.layers.22.self_attn.unembed_out.weight": "model-00015-of-00057.safetensors", + "model.layers.23.input_layernorm.weight": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.gate.e_score_correction_bias": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.gate.weight": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.embed_q.biases": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.embed_q.scales": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.embed_q.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.indexer.k_norm.bias": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.indexer.k_norm.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.indexer.weights_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.indexer.wk.biases": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.indexer.wk.scales": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.indexer.wk.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.indexer.wq_b.biases": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.indexer.wq_b.scales": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.indexer.wq_b.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.kv_a_layernorm.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.biases": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.scales": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.o_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.o_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.q_a_layernorm.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.q_a_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.q_a_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.q_a_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.q_b_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.q_b_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.q_b_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.unembed_out.biases": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.unembed_out.scales": "model-00016-of-00057.safetensors", + "model.layers.23.self_attn.unembed_out.weight": "model-00016-of-00057.safetensors", + "model.layers.24.input_layernorm.weight": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.gate.e_score_correction_bias": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.gate.weight": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.biases": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.scales": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.biases": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.scales": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.biases": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.scales": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.biases": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.scales": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.biases": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.scales": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.biases": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.scales": "model-00017-of-00057.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00017-of-00057.safetensors", + "model.layers.24.self_attn.embed_q.biases": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.embed_q.scales": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.embed_q.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.indexer.k_norm.bias": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.indexer.k_norm.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.indexer.weights_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.indexer.wk.biases": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.indexer.wk.scales": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.indexer.wk.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.indexer.wq_b.biases": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.indexer.wq_b.scales": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.indexer.wq_b.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.kv_a_layernorm.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.biases": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.scales": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.o_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.o_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.q_a_layernorm.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.q_a_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.q_a_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.q_a_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.q_b_proj.biases": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.q_b_proj.scales": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.q_b_proj.weight": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.unembed_out.biases": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.unembed_out.scales": "model-00016-of-00057.safetensors", + "model.layers.24.self_attn.unembed_out.weight": "model-00016-of-00057.safetensors", + "model.layers.25.input_layernorm.weight": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.gate.e_score_correction_bias": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.gate.weight": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.biases": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.scales": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.biases": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.scales": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.biases": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.scales": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.biases": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.scales": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.biases": "model-00017-of-00057.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.scales": "model-00017-of-00057.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.biases": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.scales": "model-00018-of-00057.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00018-of-00057.safetensors", + "model.layers.25.self_attn.embed_q.biases": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.embed_q.scales": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.embed_q.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.indexer.k_norm.bias": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.indexer.k_norm.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.indexer.weights_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.indexer.wk.biases": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.indexer.wk.scales": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.indexer.wk.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.indexer.wq_b.biases": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.indexer.wq_b.scales": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.indexer.wq_b.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.kv_a_layernorm.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.biases": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.scales": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.o_proj.biases": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.o_proj.scales": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.q_a_layernorm.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.q_a_proj.biases": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.q_a_proj.scales": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.q_a_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.q_b_proj.biases": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.q_b_proj.scales": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.q_b_proj.weight": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.unembed_out.biases": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.unembed_out.scales": "model-00017-of-00057.safetensors", + "model.layers.25.self_attn.unembed_out.weight": "model-00017-of-00057.safetensors", + "model.layers.26.input_layernorm.weight": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.gate.e_score_correction_bias": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.gate.weight": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.biases": "model-00018-of-00057.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.scales": "model-00018-of-00057.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.biases": "model-00018-of-00057.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.scales": "model-00018-of-00057.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00019-of-00057.safetensors", + "model.layers.26.self_attn.embed_q.biases": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.embed_q.scales": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.embed_q.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.indexer.k_norm.bias": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.indexer.k_norm.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.indexer.weights_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.indexer.wk.biases": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.indexer.wk.scales": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.indexer.wk.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.indexer.wq_b.biases": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.indexer.wq_b.scales": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.indexer.wq_b.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.kv_a_layernorm.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.biases": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.scales": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.o_proj.biases": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.o_proj.scales": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.q_a_layernorm.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.q_a_proj.biases": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.q_a_proj.scales": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.q_a_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.q_b_proj.biases": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.q_b_proj.scales": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.q_b_proj.weight": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.unembed_out.biases": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.unembed_out.scales": "model-00018-of-00057.safetensors", + "model.layers.26.self_attn.unembed_out.weight": "model-00018-of-00057.safetensors", + "model.layers.27.input_layernorm.weight": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.gate.e_score_correction_bias": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.gate.weight": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.embed_q.biases": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.embed_q.scales": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.embed_q.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.indexer.k_norm.bias": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.indexer.k_norm.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.indexer.weights_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.indexer.wk.biases": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.indexer.wk.scales": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.indexer.wk.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.indexer.wq_b.biases": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.indexer.wq_b.scales": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.indexer.wq_b.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.kv_a_layernorm.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.biases": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.scales": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.o_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.o_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.q_a_layernorm.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.q_a_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.q_a_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.q_a_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.q_b_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.q_b_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.q_b_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.unembed_out.biases": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.unembed_out.scales": "model-00019-of-00057.safetensors", + "model.layers.27.self_attn.unembed_out.weight": "model-00019-of-00057.safetensors", + "model.layers.28.input_layernorm.weight": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.gate.e_score_correction_bias": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.gate.weight": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.biases": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.scales": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.biases": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.scales": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.biases": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.scales": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.biases": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.scales": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.biases": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.scales": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.biases": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.scales": "model-00020-of-00057.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00020-of-00057.safetensors", + "model.layers.28.self_attn.embed_q.biases": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.embed_q.scales": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.embed_q.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.indexer.k_norm.bias": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.indexer.k_norm.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.indexer.weights_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.indexer.wk.biases": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.indexer.wk.scales": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.indexer.wk.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.indexer.wq_b.biases": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.indexer.wq_b.scales": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.indexer.wq_b.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.kv_a_layernorm.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.biases": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.scales": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.o_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.o_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.q_a_layernorm.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.q_a_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.q_a_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.q_a_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.q_b_proj.biases": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.q_b_proj.scales": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.q_b_proj.weight": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.unembed_out.biases": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.unembed_out.scales": "model-00019-of-00057.safetensors", + "model.layers.28.self_attn.unembed_out.weight": "model-00019-of-00057.safetensors", + "model.layers.29.input_layernorm.weight": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.gate.e_score_correction_bias": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.gate.weight": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.biases": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.scales": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.biases": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.scales": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.biases": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.scales": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.biases": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.scales": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.biases": "model-00020-of-00057.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.scales": "model-00020-of-00057.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.biases": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.scales": "model-00021-of-00057.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00021-of-00057.safetensors", + "model.layers.29.self_attn.embed_q.biases": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.embed_q.scales": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.embed_q.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.indexer.k_norm.bias": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.indexer.k_norm.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.indexer.weights_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.indexer.wk.biases": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.indexer.wk.scales": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.indexer.wk.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.indexer.wq_b.biases": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.indexer.wq_b.scales": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.indexer.wq_b.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.kv_a_layernorm.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.biases": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.scales": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.o_proj.biases": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.o_proj.scales": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.q_a_layernorm.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.q_a_proj.biases": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.q_a_proj.scales": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.q_a_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.q_b_proj.biases": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.q_b_proj.scales": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.q_b_proj.weight": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.unembed_out.biases": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.unembed_out.scales": "model-00020-of-00057.safetensors", + "model.layers.29.self_attn.unembed_out.weight": "model-00020-of-00057.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.gate.e_score_correction_bias": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.gate.weight": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.embed_q.biases": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.embed_q.scales": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.embed_q.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.indexer.k_norm.bias": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.indexer.k_norm.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.indexer.weights_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.indexer.wk.biases": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.indexer.wk.scales": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.indexer.wk.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.indexer.wq_b.biases": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.indexer.wq_b.scales": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.indexer.wq_b.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.kv_a_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.o_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.o_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.q_a_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.q_a_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.q_a_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.q_a_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.q_b_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.q_b_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.q_b_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.unembed_out.biases": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.unembed_out.scales": "model-00001-of-00057.safetensors", + "model.layers.3.self_attn.unembed_out.weight": "model-00001-of-00057.safetensors", + "model.layers.30.input_layernorm.weight": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.gate.e_score_correction_bias": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.gate.weight": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.biases": "model-00021-of-00057.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.scales": "model-00021-of-00057.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.biases": "model-00021-of-00057.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.scales": "model-00021-of-00057.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00022-of-00057.safetensors", + "model.layers.30.self_attn.embed_q.biases": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.embed_q.scales": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.embed_q.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.indexer.k_norm.bias": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.indexer.k_norm.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.indexer.weights_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.indexer.wk.biases": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.indexer.wk.scales": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.indexer.wk.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.indexer.wq_b.biases": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.indexer.wq_b.scales": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.indexer.wq_b.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.kv_a_layernorm.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.biases": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.scales": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.o_proj.biases": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.o_proj.scales": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.q_a_layernorm.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.q_a_proj.biases": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.q_a_proj.scales": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.q_a_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.q_b_proj.biases": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.q_b_proj.scales": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.q_b_proj.weight": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.unembed_out.biases": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.unembed_out.scales": "model-00021-of-00057.safetensors", + "model.layers.30.self_attn.unembed_out.weight": "model-00021-of-00057.safetensors", + "model.layers.31.input_layernorm.weight": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.gate.e_score_correction_bias": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.gate.weight": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.embed_q.biases": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.embed_q.scales": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.embed_q.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.indexer.k_norm.bias": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.indexer.k_norm.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.indexer.weights_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.indexer.wk.biases": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.indexer.wk.scales": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.indexer.wk.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.indexer.wq_b.biases": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.indexer.wq_b.scales": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.indexer.wq_b.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.kv_a_layernorm.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.biases": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.scales": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.o_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.o_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.q_a_layernorm.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.q_a_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.q_a_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.q_a_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.q_b_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.q_b_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.q_b_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.unembed_out.biases": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.unembed_out.scales": "model-00022-of-00057.safetensors", + "model.layers.31.self_attn.unembed_out.weight": "model-00022-of-00057.safetensors", + "model.layers.32.input_layernorm.weight": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.gate.e_score_correction_bias": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.gate.weight": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.biases": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.scales": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.biases": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.scales": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.biases": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.scales": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.biases": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.scales": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.biases": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.scales": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.biases": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.scales": "model-00023-of-00057.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00023-of-00057.safetensors", + "model.layers.32.self_attn.embed_q.biases": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.embed_q.scales": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.embed_q.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.indexer.k_norm.bias": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.indexer.k_norm.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.indexer.weights_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.indexer.wk.biases": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.indexer.wk.scales": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.indexer.wk.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.indexer.wq_b.biases": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.indexer.wq_b.scales": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.indexer.wq_b.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.kv_a_layernorm.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.biases": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.scales": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.o_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.o_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.q_a_layernorm.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.q_a_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.q_a_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.q_a_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.q_b_proj.biases": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.q_b_proj.scales": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.q_b_proj.weight": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.unembed_out.biases": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.unembed_out.scales": "model-00022-of-00057.safetensors", + "model.layers.32.self_attn.unembed_out.weight": "model-00022-of-00057.safetensors", + "model.layers.33.input_layernorm.weight": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.gate.e_score_correction_bias": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.gate.weight": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.biases": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.scales": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.biases": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.scales": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.biases": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.scales": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.biases": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.scales": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.biases": "model-00023-of-00057.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.scales": "model-00023-of-00057.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.biases": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.scales": "model-00024-of-00057.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00024-of-00057.safetensors", + "model.layers.33.self_attn.embed_q.biases": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.embed_q.scales": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.embed_q.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.indexer.k_norm.bias": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.indexer.k_norm.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.indexer.weights_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.indexer.wk.biases": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.indexer.wk.scales": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.indexer.wk.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.indexer.wq_b.biases": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.indexer.wq_b.scales": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.indexer.wq_b.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.kv_a_layernorm.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.biases": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.scales": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.o_proj.biases": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.o_proj.scales": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.q_a_layernorm.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.q_a_proj.biases": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.q_a_proj.scales": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.q_a_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.q_b_proj.biases": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.q_b_proj.scales": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.q_b_proj.weight": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.unembed_out.biases": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.unembed_out.scales": "model-00023-of-00057.safetensors", + "model.layers.33.self_attn.unembed_out.weight": "model-00023-of-00057.safetensors", + "model.layers.34.input_layernorm.weight": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.gate.e_score_correction_bias": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.gate.weight": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.biases": "model-00024-of-00057.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.scales": "model-00024-of-00057.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.biases": "model-00024-of-00057.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.scales": "model-00024-of-00057.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00025-of-00057.safetensors", + "model.layers.34.self_attn.embed_q.biases": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.embed_q.scales": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.embed_q.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.indexer.k_norm.bias": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.indexer.k_norm.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.indexer.weights_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.indexer.wk.biases": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.indexer.wk.scales": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.indexer.wk.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.indexer.wq_b.biases": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.indexer.wq_b.scales": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.indexer.wq_b.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.kv_a_layernorm.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.biases": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.scales": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.o_proj.biases": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.o_proj.scales": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.q_a_layernorm.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.q_a_proj.biases": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.q_a_proj.scales": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.q_a_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.q_b_proj.biases": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.q_b_proj.scales": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.q_b_proj.weight": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.unembed_out.biases": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.unembed_out.scales": "model-00024-of-00057.safetensors", + "model.layers.34.self_attn.unembed_out.weight": "model-00024-of-00057.safetensors", + "model.layers.35.input_layernorm.weight": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.gate.e_score_correction_bias": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.gate.weight": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.embed_q.biases": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.embed_q.scales": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.embed_q.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.indexer.k_norm.bias": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.indexer.k_norm.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.indexer.weights_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.indexer.wk.biases": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.indexer.wk.scales": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.indexer.wk.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.indexer.wq_b.biases": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.indexer.wq_b.scales": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.indexer.wq_b.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.kv_a_layernorm.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.biases": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.scales": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.o_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.o_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.q_a_layernorm.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.q_a_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.q_a_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.q_a_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.q_b_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.q_b_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.q_b_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.unembed_out.biases": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.unembed_out.scales": "model-00025-of-00057.safetensors", + "model.layers.35.self_attn.unembed_out.weight": "model-00025-of-00057.safetensors", + "model.layers.36.input_layernorm.weight": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.gate.e_score_correction_bias": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.gate.weight": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.biases": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.scales": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.biases": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.scales": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.biases": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.scales": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.biases": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.scales": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.biases": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.scales": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.biases": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.scales": "model-00026-of-00057.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.36.post_attention_layernorm.weight": "model-00026-of-00057.safetensors", + "model.layers.36.self_attn.embed_q.biases": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.embed_q.scales": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.embed_q.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.indexer.k_norm.bias": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.indexer.k_norm.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.indexer.weights_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.indexer.wk.biases": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.indexer.wk.scales": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.indexer.wk.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.indexer.wq_b.biases": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.indexer.wq_b.scales": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.indexer.wq_b.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.kv_a_layernorm.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.biases": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.scales": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.o_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.o_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.o_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.q_a_layernorm.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.q_a_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.q_a_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.q_a_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.q_b_proj.biases": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.q_b_proj.scales": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.q_b_proj.weight": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.unembed_out.biases": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.unembed_out.scales": "model-00025-of-00057.safetensors", + "model.layers.36.self_attn.unembed_out.weight": "model-00025-of-00057.safetensors", + "model.layers.37.input_layernorm.weight": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.gate.e_score_correction_bias": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.gate.weight": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.biases": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.scales": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.biases": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.scales": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.biases": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.scales": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.biases": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.scales": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.biases": "model-00026-of-00057.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.scales": "model-00026-of-00057.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.biases": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.scales": "model-00027-of-00057.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.37.post_attention_layernorm.weight": "model-00027-of-00057.safetensors", + "model.layers.37.self_attn.embed_q.biases": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.embed_q.scales": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.embed_q.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.indexer.k_norm.bias": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.indexer.k_norm.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.indexer.weights_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.indexer.wk.biases": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.indexer.wk.scales": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.indexer.wk.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.indexer.wq_b.biases": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.indexer.wq_b.scales": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.indexer.wq_b.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.kv_a_layernorm.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.biases": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.scales": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.o_proj.biases": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.o_proj.scales": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.o_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.q_a_layernorm.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.q_a_proj.biases": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.q_a_proj.scales": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.q_a_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.q_b_proj.biases": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.q_b_proj.scales": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.q_b_proj.weight": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.unembed_out.biases": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.unembed_out.scales": "model-00026-of-00057.safetensors", + "model.layers.37.self_attn.unembed_out.weight": "model-00026-of-00057.safetensors", + "model.layers.38.input_layernorm.weight": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.gate.e_score_correction_bias": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.gate.weight": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.biases": "model-00027-of-00057.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.scales": "model-00027-of-00057.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.biases": "model-00027-of-00057.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.scales": "model-00027-of-00057.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.38.post_attention_layernorm.weight": "model-00028-of-00057.safetensors", + "model.layers.38.self_attn.embed_q.biases": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.embed_q.scales": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.embed_q.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.indexer.k_norm.bias": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.indexer.k_norm.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.indexer.weights_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.indexer.wk.biases": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.indexer.wk.scales": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.indexer.wk.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.indexer.wq_b.biases": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.indexer.wq_b.scales": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.indexer.wq_b.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.kv_a_layernorm.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.biases": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.scales": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.o_proj.biases": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.o_proj.scales": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.o_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.q_a_layernorm.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.q_a_proj.biases": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.q_a_proj.scales": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.q_a_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.q_b_proj.biases": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.q_b_proj.scales": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.q_b_proj.weight": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.unembed_out.biases": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.unembed_out.scales": "model-00027-of-00057.safetensors", + "model.layers.38.self_attn.unembed_out.weight": "model-00027-of-00057.safetensors", + "model.layers.39.input_layernorm.weight": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.gate.e_score_correction_bias": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.gate.weight": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.39.post_attention_layernorm.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.embed_q.biases": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.embed_q.scales": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.embed_q.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.indexer.k_norm.bias": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.indexer.k_norm.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.indexer.weights_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.indexer.wk.biases": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.indexer.wk.scales": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.indexer.wk.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.indexer.wq_b.biases": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.indexer.wq_b.scales": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.indexer.wq_b.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.kv_a_layernorm.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.biases": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.scales": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.o_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.o_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.o_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.q_a_layernorm.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.q_a_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.q_a_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.q_a_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.q_b_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.q_b_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.q_b_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.unembed_out.biases": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.unembed_out.scales": "model-00028-of-00057.safetensors", + "model.layers.39.self_attn.unembed_out.weight": "model-00028-of-00057.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.gate.e_score_correction_bias": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.gate.weight": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.biases": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.scales": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.biases": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.scales": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.biases": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.scales": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.biases": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.scales": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.biases": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.biases": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.scales": "model-00002-of-00057.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00057.safetensors", + "model.layers.4.self_attn.embed_q.biases": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.embed_q.scales": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.embed_q.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.indexer.k_norm.bias": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.indexer.k_norm.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.indexer.weights_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.indexer.wk.biases": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.indexer.wk.scales": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.indexer.wk.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.indexer.wq_b.biases": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.indexer.wq_b.scales": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.indexer.wq_b.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.kv_a_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.o_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.o_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.q_a_layernorm.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.q_a_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.q_a_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.q_a_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.q_b_proj.biases": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.q_b_proj.scales": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.q_b_proj.weight": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.unembed_out.biases": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.unembed_out.scales": "model-00001-of-00057.safetensors", + "model.layers.4.self_attn.unembed_out.weight": "model-00001-of-00057.safetensors", + "model.layers.40.input_layernorm.weight": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.gate.e_score_correction_bias": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.gate.weight": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.biases": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.scales": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.biases": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.scales": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.biases": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.scales": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.biases": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.scales": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.biases": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.scales": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.biases": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.scales": "model-00029-of-00057.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.40.post_attention_layernorm.weight": "model-00029-of-00057.safetensors", + "model.layers.40.self_attn.embed_q.biases": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.embed_q.scales": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.embed_q.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.indexer.k_norm.bias": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.indexer.k_norm.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.indexer.weights_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.indexer.wk.biases": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.indexer.wk.scales": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.indexer.wk.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.indexer.wq_b.biases": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.indexer.wq_b.scales": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.indexer.wq_b.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.kv_a_layernorm.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.biases": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.scales": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.o_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.o_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.o_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.q_a_layernorm.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.q_a_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.q_a_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.q_a_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.q_b_proj.biases": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.q_b_proj.scales": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.q_b_proj.weight": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.unembed_out.biases": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.unembed_out.scales": "model-00028-of-00057.safetensors", + "model.layers.40.self_attn.unembed_out.weight": "model-00028-of-00057.safetensors", + "model.layers.41.input_layernorm.weight": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.gate.e_score_correction_bias": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.gate.weight": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.biases": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.scales": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.biases": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.scales": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.biases": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.scales": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.biases": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.scales": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.biases": "model-00029-of-00057.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.scales": "model-00029-of-00057.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.biases": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.scales": "model-00030-of-00057.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.41.post_attention_layernorm.weight": "model-00030-of-00057.safetensors", + "model.layers.41.self_attn.embed_q.biases": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.embed_q.scales": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.embed_q.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.indexer.k_norm.bias": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.indexer.k_norm.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.indexer.weights_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.indexer.wk.biases": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.indexer.wk.scales": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.indexer.wk.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.indexer.wq_b.biases": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.indexer.wq_b.scales": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.indexer.wq_b.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.kv_a_layernorm.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.biases": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.scales": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.o_proj.biases": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.o_proj.scales": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.o_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.q_a_layernorm.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.q_a_proj.biases": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.q_a_proj.scales": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.q_a_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.q_b_proj.biases": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.q_b_proj.scales": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.q_b_proj.weight": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.unembed_out.biases": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.unembed_out.scales": "model-00029-of-00057.safetensors", + "model.layers.41.self_attn.unembed_out.weight": "model-00029-of-00057.safetensors", + "model.layers.42.input_layernorm.weight": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.gate.e_score_correction_bias": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.gate.weight": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.biases": "model-00030-of-00057.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.scales": "model-00030-of-00057.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.biases": "model-00030-of-00057.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.scales": "model-00030-of-00057.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.42.post_attention_layernorm.weight": "model-00031-of-00057.safetensors", + "model.layers.42.self_attn.embed_q.biases": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.embed_q.scales": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.embed_q.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.indexer.k_norm.bias": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.indexer.k_norm.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.indexer.weights_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.indexer.wk.biases": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.indexer.wk.scales": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.indexer.wk.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.indexer.wq_b.biases": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.indexer.wq_b.scales": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.indexer.wq_b.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.kv_a_layernorm.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.biases": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.scales": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.o_proj.biases": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.o_proj.scales": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.o_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.q_a_layernorm.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.q_a_proj.biases": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.q_a_proj.scales": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.q_a_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.q_b_proj.biases": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.q_b_proj.scales": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.q_b_proj.weight": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.unembed_out.biases": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.unembed_out.scales": "model-00030-of-00057.safetensors", + "model.layers.42.self_attn.unembed_out.weight": "model-00030-of-00057.safetensors", + "model.layers.43.input_layernorm.weight": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.gate.e_score_correction_bias": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.gate.weight": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.43.post_attention_layernorm.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.embed_q.biases": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.embed_q.scales": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.embed_q.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.indexer.k_norm.bias": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.indexer.k_norm.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.indexer.weights_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.indexer.wk.biases": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.indexer.wk.scales": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.indexer.wk.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.indexer.wq_b.biases": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.indexer.wq_b.scales": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.indexer.wq_b.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.kv_a_layernorm.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.biases": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.scales": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.o_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.o_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.o_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.q_a_layernorm.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.q_a_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.q_a_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.q_a_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.q_b_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.q_b_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.q_b_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.unembed_out.biases": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.unembed_out.scales": "model-00031-of-00057.safetensors", + "model.layers.43.self_attn.unembed_out.weight": "model-00031-of-00057.safetensors", + "model.layers.44.input_layernorm.weight": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.gate.e_score_correction_bias": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.gate.weight": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.biases": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.scales": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.biases": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.scales": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.biases": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.scales": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.biases": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.scales": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.biases": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.scales": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.biases": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.scales": "model-00032-of-00057.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.44.post_attention_layernorm.weight": "model-00032-of-00057.safetensors", + "model.layers.44.self_attn.embed_q.biases": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.embed_q.scales": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.embed_q.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.indexer.k_norm.bias": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.indexer.k_norm.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.indexer.weights_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.indexer.wk.biases": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.indexer.wk.scales": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.indexer.wk.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.indexer.wq_b.biases": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.indexer.wq_b.scales": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.indexer.wq_b.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.kv_a_layernorm.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.biases": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.scales": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.o_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.o_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.o_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.q_a_layernorm.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.q_a_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.q_a_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.q_a_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.q_b_proj.biases": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.q_b_proj.scales": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.q_b_proj.weight": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.unembed_out.biases": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.unembed_out.scales": "model-00031-of-00057.safetensors", + "model.layers.44.self_attn.unembed_out.weight": "model-00031-of-00057.safetensors", + "model.layers.45.input_layernorm.weight": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.gate.e_score_correction_bias": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.gate.weight": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.biases": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.scales": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.biases": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.scales": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.biases": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.scales": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.biases": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.scales": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.biases": "model-00032-of-00057.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.scales": "model-00032-of-00057.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.biases": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.scales": "model-00033-of-00057.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.45.post_attention_layernorm.weight": "model-00033-of-00057.safetensors", + "model.layers.45.self_attn.embed_q.biases": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.embed_q.scales": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.embed_q.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.indexer.k_norm.bias": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.indexer.k_norm.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.indexer.weights_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.indexer.wk.biases": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.indexer.wk.scales": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.indexer.wk.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.indexer.wq_b.biases": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.indexer.wq_b.scales": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.indexer.wq_b.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.kv_a_layernorm.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.biases": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.scales": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.o_proj.biases": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.o_proj.scales": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.o_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.q_a_layernorm.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.q_a_proj.biases": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.q_a_proj.scales": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.q_a_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.q_b_proj.biases": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.q_b_proj.scales": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.q_b_proj.weight": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.unembed_out.biases": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.unembed_out.scales": "model-00032-of-00057.safetensors", + "model.layers.45.self_attn.unembed_out.weight": "model-00032-of-00057.safetensors", + "model.layers.46.input_layernorm.weight": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.gate.e_score_correction_bias": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.gate.weight": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.biases": "model-00033-of-00057.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.scales": "model-00033-of-00057.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.biases": "model-00033-of-00057.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.scales": "model-00033-of-00057.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.46.post_attention_layernorm.weight": "model-00034-of-00057.safetensors", + "model.layers.46.self_attn.embed_q.biases": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.embed_q.scales": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.embed_q.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.indexer.k_norm.bias": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.indexer.k_norm.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.indexer.weights_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.indexer.wk.biases": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.indexer.wk.scales": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.indexer.wk.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.indexer.wq_b.biases": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.indexer.wq_b.scales": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.indexer.wq_b.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.kv_a_layernorm.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.biases": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.scales": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.o_proj.biases": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.o_proj.scales": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.o_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.q_a_layernorm.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.q_a_proj.biases": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.q_a_proj.scales": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.q_a_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.q_b_proj.biases": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.q_b_proj.scales": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.q_b_proj.weight": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.unembed_out.biases": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.unembed_out.scales": "model-00033-of-00057.safetensors", + "model.layers.46.self_attn.unembed_out.weight": "model-00033-of-00057.safetensors", + "model.layers.47.input_layernorm.weight": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.gate.e_score_correction_bias": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.gate.weight": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.47.post_attention_layernorm.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.embed_q.biases": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.embed_q.scales": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.embed_q.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.indexer.k_norm.bias": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.indexer.k_norm.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.indexer.weights_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.indexer.wk.biases": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.indexer.wk.scales": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.indexer.wk.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.indexer.wq_b.biases": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.indexer.wq_b.scales": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.indexer.wq_b.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.kv_a_layernorm.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.biases": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.scales": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.o_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.o_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.o_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.q_a_layernorm.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.q_a_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.q_a_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.q_a_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.q_b_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.q_b_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.q_b_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.unembed_out.biases": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.unembed_out.scales": "model-00034-of-00057.safetensors", + "model.layers.47.self_attn.unembed_out.weight": "model-00034-of-00057.safetensors", + "model.layers.48.input_layernorm.weight": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.gate.e_score_correction_bias": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.gate.weight": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.biases": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.scales": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.biases": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.scales": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.biases": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.scales": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.biases": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.scales": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.biases": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.scales": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.biases": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.scales": "model-00035-of-00057.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.48.post_attention_layernorm.weight": "model-00035-of-00057.safetensors", + "model.layers.48.self_attn.embed_q.biases": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.embed_q.scales": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.embed_q.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.indexer.k_norm.bias": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.indexer.k_norm.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.indexer.weights_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.indexer.wk.biases": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.indexer.wk.scales": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.indexer.wk.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.indexer.wq_b.biases": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.indexer.wq_b.scales": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.indexer.wq_b.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.kv_a_layernorm.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.biases": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.scales": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.o_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.o_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.o_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.q_a_layernorm.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.q_a_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.q_a_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.q_a_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.q_b_proj.biases": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.q_b_proj.scales": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.q_b_proj.weight": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.unembed_out.biases": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.unembed_out.scales": "model-00034-of-00057.safetensors", + "model.layers.48.self_attn.unembed_out.weight": "model-00034-of-00057.safetensors", + "model.layers.49.input_layernorm.weight": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.gate.e_score_correction_bias": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.gate.weight": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.biases": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.scales": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.biases": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.scales": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.biases": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.scales": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.biases": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.scales": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.biases": "model-00035-of-00057.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.scales": "model-00035-of-00057.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.biases": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.scales": "model-00036-of-00057.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.49.post_attention_layernorm.weight": "model-00036-of-00057.safetensors", + "model.layers.49.self_attn.embed_q.biases": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.embed_q.scales": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.embed_q.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.indexer.k_norm.bias": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.indexer.k_norm.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.indexer.weights_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.indexer.wk.biases": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.indexer.wk.scales": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.indexer.wk.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.indexer.wq_b.biases": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.indexer.wq_b.scales": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.indexer.wq_b.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.kv_a_layernorm.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.biases": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.scales": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.o_proj.biases": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.o_proj.scales": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.o_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.q_a_layernorm.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.q_a_proj.biases": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.q_a_proj.scales": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.q_a_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.q_b_proj.biases": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.q_b_proj.scales": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.q_b_proj.weight": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.unembed_out.biases": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.unembed_out.scales": "model-00035-of-00057.safetensors", + "model.layers.49.self_attn.unembed_out.weight": "model-00035-of-00057.safetensors", + "model.layers.5.input_layernorm.weight": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.gate.e_score_correction_bias": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.gate.weight": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.biases": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.scales": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.biases": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.scales": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.biases": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.scales": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.biases": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.scales": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.biases": "model-00002-of-00057.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00057.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.biases": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.scales": "model-00003-of-00057.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00057.safetensors", + "model.layers.5.self_attn.embed_q.biases": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.embed_q.scales": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.embed_q.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.indexer.k_norm.bias": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.indexer.k_norm.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.indexer.weights_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.indexer.wk.biases": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.indexer.wk.scales": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.indexer.wk.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.indexer.wq_b.biases": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.indexer.wq_b.scales": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.indexer.wq_b.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.kv_a_layernorm.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.biases": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.scales": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.o_proj.biases": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.o_proj.scales": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.q_a_layernorm.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.q_a_proj.biases": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.q_a_proj.scales": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.q_a_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.q_b_proj.biases": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.q_b_proj.scales": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.q_b_proj.weight": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.unembed_out.biases": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.unembed_out.scales": "model-00002-of-00057.safetensors", + "model.layers.5.self_attn.unembed_out.weight": "model-00002-of-00057.safetensors", + "model.layers.50.input_layernorm.weight": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.gate.e_score_correction_bias": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.gate.weight": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.biases": "model-00036-of-00057.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.scales": "model-00036-of-00057.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.biases": "model-00036-of-00057.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.scales": "model-00036-of-00057.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.50.post_attention_layernorm.weight": "model-00037-of-00057.safetensors", + "model.layers.50.self_attn.embed_q.biases": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.embed_q.scales": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.embed_q.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.indexer.k_norm.bias": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.indexer.k_norm.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.indexer.weights_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.indexer.wk.biases": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.indexer.wk.scales": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.indexer.wk.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.indexer.wq_b.biases": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.indexer.wq_b.scales": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.indexer.wq_b.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.kv_a_layernorm.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.biases": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.scales": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.o_proj.biases": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.o_proj.scales": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.o_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.q_a_layernorm.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.q_a_proj.biases": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.q_a_proj.scales": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.q_a_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.q_b_proj.biases": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.q_b_proj.scales": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.q_b_proj.weight": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.unembed_out.biases": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.unembed_out.scales": "model-00036-of-00057.safetensors", + "model.layers.50.self_attn.unembed_out.weight": "model-00036-of-00057.safetensors", + "model.layers.51.input_layernorm.weight": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.gate.e_score_correction_bias": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.gate.weight": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.51.post_attention_layernorm.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.embed_q.biases": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.embed_q.scales": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.embed_q.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.indexer.k_norm.bias": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.indexer.k_norm.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.indexer.weights_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.indexer.wk.biases": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.indexer.wk.scales": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.indexer.wk.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.indexer.wq_b.biases": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.indexer.wq_b.scales": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.indexer.wq_b.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.kv_a_layernorm.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.biases": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.scales": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.o_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.o_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.o_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.q_a_layernorm.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.q_a_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.q_a_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.q_a_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.q_b_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.q_b_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.q_b_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.unembed_out.biases": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.unembed_out.scales": "model-00037-of-00057.safetensors", + "model.layers.51.self_attn.unembed_out.weight": "model-00037-of-00057.safetensors", + "model.layers.52.input_layernorm.weight": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.gate.e_score_correction_bias": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.gate.weight": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.biases": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.scales": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.biases": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.scales": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.biases": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.scales": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.biases": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.scales": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.biases": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.scales": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.biases": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.scales": "model-00038-of-00057.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.52.post_attention_layernorm.weight": "model-00038-of-00057.safetensors", + "model.layers.52.self_attn.embed_q.biases": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.embed_q.scales": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.embed_q.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.indexer.k_norm.bias": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.indexer.k_norm.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.indexer.weights_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.indexer.wk.biases": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.indexer.wk.scales": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.indexer.wk.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.indexer.wq_b.biases": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.indexer.wq_b.scales": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.indexer.wq_b.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.kv_a_layernorm.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.biases": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.scales": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.o_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.o_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.o_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.q_a_layernorm.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.q_a_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.q_a_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.q_a_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.q_b_proj.biases": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.q_b_proj.scales": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.q_b_proj.weight": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.unembed_out.biases": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.unembed_out.scales": "model-00037-of-00057.safetensors", + "model.layers.52.self_attn.unembed_out.weight": "model-00037-of-00057.safetensors", + "model.layers.53.input_layernorm.weight": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.gate.e_score_correction_bias": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.gate.weight": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.biases": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.scales": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.biases": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.scales": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.biases": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.scales": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.biases": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.scales": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.biases": "model-00038-of-00057.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.scales": "model-00038-of-00057.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.biases": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.scales": "model-00039-of-00057.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.53.post_attention_layernorm.weight": "model-00039-of-00057.safetensors", + "model.layers.53.self_attn.embed_q.biases": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.embed_q.scales": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.embed_q.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.indexer.k_norm.bias": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.indexer.k_norm.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.indexer.weights_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.indexer.wk.biases": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.indexer.wk.scales": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.indexer.wk.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.indexer.wq_b.biases": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.indexer.wq_b.scales": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.indexer.wq_b.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.kv_a_layernorm.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.biases": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.scales": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.o_proj.biases": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.o_proj.scales": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.o_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.q_a_layernorm.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.q_a_proj.biases": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.q_a_proj.scales": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.q_a_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.q_b_proj.biases": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.q_b_proj.scales": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.q_b_proj.weight": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.unembed_out.biases": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.unembed_out.scales": "model-00038-of-00057.safetensors", + "model.layers.53.self_attn.unembed_out.weight": "model-00038-of-00057.safetensors", + "model.layers.54.input_layernorm.weight": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.gate.e_score_correction_bias": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.gate.weight": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.biases": "model-00039-of-00057.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.scales": "model-00039-of-00057.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.biases": "model-00039-of-00057.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.scales": "model-00039-of-00057.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.54.post_attention_layernorm.weight": "model-00040-of-00057.safetensors", + "model.layers.54.self_attn.embed_q.biases": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.embed_q.scales": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.embed_q.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.indexer.k_norm.bias": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.indexer.k_norm.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.indexer.weights_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.indexer.wk.biases": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.indexer.wk.scales": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.indexer.wk.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.indexer.wq_b.biases": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.indexer.wq_b.scales": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.indexer.wq_b.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.kv_a_layernorm.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.biases": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.scales": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.o_proj.biases": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.o_proj.scales": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.o_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.q_a_layernorm.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.q_a_proj.biases": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.q_a_proj.scales": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.q_a_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.q_b_proj.biases": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.q_b_proj.scales": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.q_b_proj.weight": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.unembed_out.biases": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.unembed_out.scales": "model-00039-of-00057.safetensors", + "model.layers.54.self_attn.unembed_out.weight": "model-00039-of-00057.safetensors", + "model.layers.55.input_layernorm.weight": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.gate.e_score_correction_bias": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.gate.weight": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.55.post_attention_layernorm.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.embed_q.biases": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.embed_q.scales": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.embed_q.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.indexer.k_norm.bias": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.indexer.k_norm.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.indexer.weights_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.indexer.wk.biases": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.indexer.wk.scales": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.indexer.wk.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.indexer.wq_b.biases": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.indexer.wq_b.scales": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.indexer.wq_b.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.kv_a_layernorm.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.biases": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.scales": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.o_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.o_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.o_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.q_a_layernorm.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.q_a_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.q_a_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.q_a_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.q_b_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.q_b_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.q_b_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.unembed_out.biases": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.unembed_out.scales": "model-00040-of-00057.safetensors", + "model.layers.55.self_attn.unembed_out.weight": "model-00040-of-00057.safetensors", + "model.layers.56.input_layernorm.weight": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.gate.e_score_correction_bias": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.gate.weight": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.biases": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.scales": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.biases": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.scales": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.biases": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.scales": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.biases": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.scales": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.biases": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.scales": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.biases": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.scales": "model-00041-of-00057.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.56.post_attention_layernorm.weight": "model-00041-of-00057.safetensors", + "model.layers.56.self_attn.embed_q.biases": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.embed_q.scales": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.embed_q.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.indexer.k_norm.bias": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.indexer.k_norm.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.indexer.weights_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.indexer.wk.biases": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.indexer.wk.scales": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.indexer.wk.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.indexer.wq_b.biases": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.indexer.wq_b.scales": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.indexer.wq_b.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.kv_a_layernorm.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.biases": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.scales": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.o_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.o_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.o_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.q_a_layernorm.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.q_a_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.q_a_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.q_a_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.q_b_proj.biases": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.q_b_proj.scales": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.q_b_proj.weight": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.unembed_out.biases": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.unembed_out.scales": "model-00040-of-00057.safetensors", + "model.layers.56.self_attn.unembed_out.weight": "model-00040-of-00057.safetensors", + "model.layers.57.input_layernorm.weight": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.gate.e_score_correction_bias": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.gate.weight": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.biases": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.scales": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.biases": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.scales": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.biases": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.scales": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.biases": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.scales": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.biases": "model-00041-of-00057.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.scales": "model-00041-of-00057.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.biases": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.scales": "model-00042-of-00057.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.57.post_attention_layernorm.weight": "model-00042-of-00057.safetensors", + "model.layers.57.self_attn.embed_q.biases": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.embed_q.scales": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.embed_q.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.indexer.k_norm.bias": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.indexer.k_norm.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.indexer.weights_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.indexer.wk.biases": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.indexer.wk.scales": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.indexer.wk.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.indexer.wq_b.biases": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.indexer.wq_b.scales": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.indexer.wq_b.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.kv_a_layernorm.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.biases": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.scales": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.o_proj.biases": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.o_proj.scales": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.o_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.q_a_layernorm.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.q_a_proj.biases": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.q_a_proj.scales": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.q_a_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.q_b_proj.biases": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.q_b_proj.scales": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.q_b_proj.weight": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.unembed_out.biases": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.unembed_out.scales": "model-00041-of-00057.safetensors", + "model.layers.57.self_attn.unembed_out.weight": "model-00041-of-00057.safetensors", + "model.layers.58.input_layernorm.weight": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.gate.e_score_correction_bias": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.gate.weight": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.biases": "model-00042-of-00057.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.scales": "model-00042-of-00057.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.biases": "model-00042-of-00057.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.scales": "model-00042-of-00057.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.58.post_attention_layernorm.weight": "model-00043-of-00057.safetensors", + "model.layers.58.self_attn.embed_q.biases": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.embed_q.scales": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.embed_q.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.indexer.k_norm.bias": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.indexer.k_norm.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.indexer.weights_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.indexer.wk.biases": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.indexer.wk.scales": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.indexer.wk.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.indexer.wq_b.biases": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.indexer.wq_b.scales": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.indexer.wq_b.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.kv_a_layernorm.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.biases": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.scales": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.o_proj.biases": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.o_proj.scales": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.o_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.q_a_layernorm.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.q_a_proj.biases": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.q_a_proj.scales": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.q_a_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.q_b_proj.biases": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.q_b_proj.scales": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.q_b_proj.weight": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.unembed_out.biases": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.unembed_out.scales": "model-00042-of-00057.safetensors", + "model.layers.58.self_attn.unembed_out.weight": "model-00042-of-00057.safetensors", + "model.layers.59.input_layernorm.weight": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.gate.e_score_correction_bias": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.gate.weight": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.59.post_attention_layernorm.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.embed_q.biases": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.embed_q.scales": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.embed_q.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.indexer.k_norm.bias": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.indexer.k_norm.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.indexer.weights_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.indexer.wk.biases": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.indexer.wk.scales": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.indexer.wk.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.indexer.wq_b.biases": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.indexer.wq_b.scales": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.indexer.wq_b.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.kv_a_layernorm.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.biases": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.scales": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.o_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.o_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.o_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.q_a_layernorm.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.q_a_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.q_a_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.q_a_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.q_b_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.q_b_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.q_b_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.unembed_out.biases": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.unembed_out.scales": "model-00043-of-00057.safetensors", + "model.layers.59.self_attn.unembed_out.weight": "model-00043-of-00057.safetensors", + "model.layers.6.input_layernorm.weight": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.gate.e_score_correction_bias": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.gate.weight": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.biases": "model-00003-of-00057.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00057.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.biases": "model-00003-of-00057.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.scales": "model-00003-of-00057.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00057.safetensors", + "model.layers.6.self_attn.embed_q.biases": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.embed_q.scales": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.embed_q.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.indexer.k_norm.bias": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.indexer.k_norm.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.indexer.weights_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.indexer.wk.biases": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.indexer.wk.scales": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.indexer.wk.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.indexer.wq_b.biases": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.indexer.wq_b.scales": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.indexer.wq_b.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.kv_a_layernorm.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.biases": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.scales": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.o_proj.biases": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.o_proj.scales": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.q_a_layernorm.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.q_a_proj.biases": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.q_a_proj.scales": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.q_a_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.q_b_proj.biases": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.q_b_proj.scales": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.q_b_proj.weight": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.unembed_out.biases": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.unembed_out.scales": "model-00003-of-00057.safetensors", + "model.layers.6.self_attn.unembed_out.weight": "model-00003-of-00057.safetensors", + "model.layers.60.input_layernorm.weight": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.gate.e_score_correction_bias": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.gate.weight": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.biases": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.scales": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.biases": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.scales": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.biases": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.scales": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.biases": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.scales": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.biases": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.scales": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.biases": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.scales": "model-00044-of-00057.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.60.post_attention_layernorm.weight": "model-00044-of-00057.safetensors", + "model.layers.60.self_attn.embed_q.biases": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.embed_q.scales": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.embed_q.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.indexer.k_norm.bias": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.indexer.k_norm.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.indexer.weights_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.indexer.wk.biases": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.indexer.wk.scales": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.indexer.wk.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.indexer.wq_b.biases": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.indexer.wq_b.scales": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.indexer.wq_b.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.kv_a_layernorm.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.biases": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.scales": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.o_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.o_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.o_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.q_a_layernorm.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.q_a_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.q_a_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.q_a_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.q_b_proj.biases": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.q_b_proj.scales": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.q_b_proj.weight": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.unembed_out.biases": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.unembed_out.scales": "model-00043-of-00057.safetensors", + "model.layers.60.self_attn.unembed_out.weight": "model-00043-of-00057.safetensors", + "model.layers.61.input_layernorm.weight": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.gate.e_score_correction_bias": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.gate.weight": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.biases": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.scales": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.biases": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.scales": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.biases": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.scales": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.biases": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.scales": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.biases": "model-00044-of-00057.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.scales": "model-00044-of-00057.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.biases": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.scales": "model-00045-of-00057.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.61.post_attention_layernorm.weight": "model-00045-of-00057.safetensors", + "model.layers.61.self_attn.embed_q.biases": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.embed_q.scales": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.embed_q.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.indexer.k_norm.bias": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.indexer.k_norm.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.indexer.weights_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.indexer.wk.biases": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.indexer.wk.scales": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.indexer.wk.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.indexer.wq_b.biases": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.indexer.wq_b.scales": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.indexer.wq_b.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.kv_a_layernorm.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.biases": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.scales": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.o_proj.biases": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.o_proj.scales": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.o_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.q_a_layernorm.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.q_a_proj.biases": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.q_a_proj.scales": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.q_a_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.q_b_proj.biases": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.q_b_proj.scales": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.q_b_proj.weight": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.unembed_out.biases": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.unembed_out.scales": "model-00044-of-00057.safetensors", + "model.layers.61.self_attn.unembed_out.weight": "model-00044-of-00057.safetensors", + "model.layers.62.input_layernorm.weight": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.gate.e_score_correction_bias": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.gate.weight": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.biases": "model-00045-of-00057.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.scales": "model-00045-of-00057.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.biases": "model-00045-of-00057.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.scales": "model-00045-of-00057.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.62.post_attention_layernorm.weight": "model-00046-of-00057.safetensors", + "model.layers.62.self_attn.embed_q.biases": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.embed_q.scales": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.embed_q.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.indexer.k_norm.bias": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.indexer.k_norm.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.indexer.weights_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.indexer.wk.biases": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.indexer.wk.scales": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.indexer.wk.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.indexer.wq_b.biases": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.indexer.wq_b.scales": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.indexer.wq_b.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.kv_a_layernorm.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.biases": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.scales": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.o_proj.biases": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.o_proj.scales": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.o_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.q_a_layernorm.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.q_a_proj.biases": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.q_a_proj.scales": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.q_a_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.q_b_proj.biases": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.q_b_proj.scales": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.q_b_proj.weight": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.unembed_out.biases": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.unembed_out.scales": "model-00045-of-00057.safetensors", + "model.layers.62.self_attn.unembed_out.weight": "model-00045-of-00057.safetensors", + "model.layers.63.input_layernorm.weight": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.gate.e_score_correction_bias": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.gate.weight": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.63.post_attention_layernorm.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.embed_q.biases": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.embed_q.scales": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.embed_q.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.indexer.k_norm.bias": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.indexer.k_norm.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.indexer.weights_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.indexer.wk.biases": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.indexer.wk.scales": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.indexer.wk.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.indexer.wq_b.biases": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.indexer.wq_b.scales": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.indexer.wq_b.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.kv_a_layernorm.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.biases": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.scales": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.o_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.o_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.o_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.q_a_layernorm.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.q_a_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.q_a_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.q_a_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.q_b_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.q_b_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.q_b_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.unembed_out.biases": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.unembed_out.scales": "model-00046-of-00057.safetensors", + "model.layers.63.self_attn.unembed_out.weight": "model-00046-of-00057.safetensors", + "model.layers.64.input_layernorm.weight": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.gate.e_score_correction_bias": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.gate.weight": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.biases": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.scales": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.biases": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.scales": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.biases": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.scales": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.biases": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.scales": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.biases": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.scales": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.biases": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.scales": "model-00047-of-00057.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.64.post_attention_layernorm.weight": "model-00047-of-00057.safetensors", + "model.layers.64.self_attn.embed_q.biases": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.embed_q.scales": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.embed_q.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.indexer.k_norm.bias": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.indexer.k_norm.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.indexer.weights_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.indexer.wk.biases": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.indexer.wk.scales": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.indexer.wk.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.indexer.wq_b.biases": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.indexer.wq_b.scales": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.indexer.wq_b.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.kv_a_layernorm.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.biases": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.scales": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.o_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.o_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.o_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.q_a_layernorm.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.q_a_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.q_a_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.q_a_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.q_b_proj.biases": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.q_b_proj.scales": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.q_b_proj.weight": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.unembed_out.biases": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.unembed_out.scales": "model-00046-of-00057.safetensors", + "model.layers.64.self_attn.unembed_out.weight": "model-00046-of-00057.safetensors", + "model.layers.65.input_layernorm.weight": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.gate.e_score_correction_bias": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.gate.weight": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.biases": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.scales": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.biases": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.scales": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.biases": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.scales": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.biases": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.scales": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.biases": "model-00047-of-00057.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.scales": "model-00047-of-00057.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.biases": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.scales": "model-00048-of-00057.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.65.post_attention_layernorm.weight": "model-00048-of-00057.safetensors", + "model.layers.65.self_attn.embed_q.biases": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.embed_q.scales": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.embed_q.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.indexer.k_norm.bias": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.indexer.k_norm.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.indexer.weights_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.indexer.wk.biases": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.indexer.wk.scales": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.indexer.wk.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.indexer.wq_b.biases": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.indexer.wq_b.scales": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.indexer.wq_b.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.kv_a_layernorm.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.biases": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.scales": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.o_proj.biases": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.o_proj.scales": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.o_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.q_a_layernorm.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.q_a_proj.biases": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.q_a_proj.scales": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.q_a_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.q_b_proj.biases": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.q_b_proj.scales": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.q_b_proj.weight": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.unembed_out.biases": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.unembed_out.scales": "model-00047-of-00057.safetensors", + "model.layers.65.self_attn.unembed_out.weight": "model-00047-of-00057.safetensors", + "model.layers.66.input_layernorm.weight": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.gate.e_score_correction_bias": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.gate.weight": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.biases": "model-00048-of-00057.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.scales": "model-00048-of-00057.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.biases": "model-00048-of-00057.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.scales": "model-00048-of-00057.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.66.post_attention_layernorm.weight": "model-00049-of-00057.safetensors", + "model.layers.66.self_attn.embed_q.biases": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.embed_q.scales": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.embed_q.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.indexer.k_norm.bias": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.indexer.k_norm.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.indexer.weights_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.indexer.wk.biases": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.indexer.wk.scales": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.indexer.wk.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.indexer.wq_b.biases": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.indexer.wq_b.scales": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.indexer.wq_b.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.kv_a_layernorm.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.biases": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.scales": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.o_proj.biases": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.o_proj.scales": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.o_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.q_a_layernorm.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.q_a_proj.biases": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.q_a_proj.scales": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.q_a_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.q_b_proj.biases": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.q_b_proj.scales": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.q_b_proj.weight": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.unembed_out.biases": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.unembed_out.scales": "model-00048-of-00057.safetensors", + "model.layers.66.self_attn.unembed_out.weight": "model-00048-of-00057.safetensors", + "model.layers.67.input_layernorm.weight": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.gate.e_score_correction_bias": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.gate.weight": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.67.post_attention_layernorm.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.embed_q.biases": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.embed_q.scales": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.embed_q.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.indexer.k_norm.bias": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.indexer.k_norm.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.indexer.weights_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.indexer.wk.biases": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.indexer.wk.scales": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.indexer.wk.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.indexer.wq_b.biases": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.indexer.wq_b.scales": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.indexer.wq_b.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.kv_a_layernorm.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.biases": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.scales": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.o_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.o_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.o_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.q_a_layernorm.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.q_a_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.q_a_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.q_a_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.q_b_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.q_b_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.q_b_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.unembed_out.biases": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.unembed_out.scales": "model-00049-of-00057.safetensors", + "model.layers.67.self_attn.unembed_out.weight": "model-00049-of-00057.safetensors", + "model.layers.68.input_layernorm.weight": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.gate.e_score_correction_bias": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.gate.weight": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.biases": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.scales": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.biases": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.scales": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.biases": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.scales": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.biases": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.scales": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.biases": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.scales": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.biases": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.scales": "model-00050-of-00057.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.68.post_attention_layernorm.weight": "model-00050-of-00057.safetensors", + "model.layers.68.self_attn.embed_q.biases": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.embed_q.scales": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.embed_q.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.indexer.k_norm.bias": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.indexer.k_norm.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.indexer.weights_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.indexer.wk.biases": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.indexer.wk.scales": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.indexer.wk.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.indexer.wq_b.biases": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.indexer.wq_b.scales": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.indexer.wq_b.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.kv_a_layernorm.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.biases": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.scales": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.o_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.o_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.o_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.q_a_layernorm.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.q_a_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.q_a_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.q_a_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.q_b_proj.biases": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.q_b_proj.scales": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.q_b_proj.weight": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.unembed_out.biases": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.unembed_out.scales": "model-00049-of-00057.safetensors", + "model.layers.68.self_attn.unembed_out.weight": "model-00049-of-00057.safetensors", + "model.layers.69.input_layernorm.weight": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.gate.e_score_correction_bias": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.gate.weight": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.biases": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.scales": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.biases": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.scales": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.biases": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.scales": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.biases": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.scales": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.biases": "model-00050-of-00057.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.scales": "model-00050-of-00057.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.biases": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.scales": "model-00051-of-00057.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.69.post_attention_layernorm.weight": "model-00051-of-00057.safetensors", + "model.layers.69.self_attn.embed_q.biases": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.embed_q.scales": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.embed_q.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.indexer.k_norm.bias": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.indexer.k_norm.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.indexer.weights_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.indexer.wk.biases": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.indexer.wk.scales": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.indexer.wk.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.indexer.wq_b.biases": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.indexer.wq_b.scales": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.indexer.wq_b.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.kv_a_layernorm.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.biases": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.scales": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.o_proj.biases": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.o_proj.scales": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.o_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.q_a_layernorm.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.q_a_proj.biases": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.q_a_proj.scales": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.q_a_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.q_b_proj.biases": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.q_b_proj.scales": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.q_b_proj.weight": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.unembed_out.biases": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.unembed_out.scales": "model-00050-of-00057.safetensors", + "model.layers.69.self_attn.unembed_out.weight": "model-00050-of-00057.safetensors", + "model.layers.7.input_layernorm.weight": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.gate.e_score_correction_bias": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.gate.weight": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.embed_q.biases": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.embed_q.scales": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.embed_q.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.indexer.k_norm.bias": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.indexer.k_norm.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.indexer.weights_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.indexer.wk.biases": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.indexer.wk.scales": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.indexer.wk.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.indexer.wq_b.biases": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.indexer.wq_b.scales": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.indexer.wq_b.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.kv_a_layernorm.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.biases": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.scales": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.o_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.o_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.q_a_layernorm.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.q_a_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.q_a_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.q_a_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.q_b_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.q_b_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.q_b_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.unembed_out.biases": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.unembed_out.scales": "model-00004-of-00057.safetensors", + "model.layers.7.self_attn.unembed_out.weight": "model-00004-of-00057.safetensors", + "model.layers.70.input_layernorm.weight": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.gate.e_score_correction_bias": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.gate.weight": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.biases": "model-00051-of-00057.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.scales": "model-00051-of-00057.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.biases": "model-00051-of-00057.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.scales": "model-00051-of-00057.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.70.post_attention_layernorm.weight": "model-00052-of-00057.safetensors", + "model.layers.70.self_attn.embed_q.biases": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.embed_q.scales": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.embed_q.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.indexer.k_norm.bias": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.indexer.k_norm.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.indexer.weights_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.indexer.wk.biases": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.indexer.wk.scales": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.indexer.wk.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.indexer.wq_b.biases": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.indexer.wq_b.scales": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.indexer.wq_b.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.kv_a_layernorm.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.biases": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.scales": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.o_proj.biases": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.o_proj.scales": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.o_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.q_a_layernorm.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.q_a_proj.biases": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.q_a_proj.scales": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.q_a_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.q_b_proj.biases": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.q_b_proj.scales": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.q_b_proj.weight": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.unembed_out.biases": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.unembed_out.scales": "model-00051-of-00057.safetensors", + "model.layers.70.self_attn.unembed_out.weight": "model-00051-of-00057.safetensors", + "model.layers.71.input_layernorm.weight": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.gate.e_score_correction_bias": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.gate.weight": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.71.post_attention_layernorm.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.embed_q.biases": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.embed_q.scales": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.embed_q.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.indexer.k_norm.bias": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.indexer.k_norm.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.indexer.weights_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.indexer.wk.biases": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.indexer.wk.scales": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.indexer.wk.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.indexer.wq_b.biases": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.indexer.wq_b.scales": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.indexer.wq_b.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.kv_a_layernorm.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.biases": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.scales": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.o_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.o_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.o_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.q_a_layernorm.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.q_a_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.q_a_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.q_a_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.q_b_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.q_b_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.q_b_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.unembed_out.biases": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.unembed_out.scales": "model-00052-of-00057.safetensors", + "model.layers.71.self_attn.unembed_out.weight": "model-00052-of-00057.safetensors", + "model.layers.72.input_layernorm.weight": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.gate.e_score_correction_bias": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.gate.weight": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.biases": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.scales": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.biases": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.scales": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.biases": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.scales": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.biases": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.scales": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.biases": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.scales": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.biases": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.scales": "model-00053-of-00057.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.72.post_attention_layernorm.weight": "model-00053-of-00057.safetensors", + "model.layers.72.self_attn.embed_q.biases": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.embed_q.scales": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.embed_q.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.indexer.k_norm.bias": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.indexer.k_norm.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.indexer.weights_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.indexer.wk.biases": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.indexer.wk.scales": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.indexer.wk.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.indexer.wq_b.biases": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.indexer.wq_b.scales": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.indexer.wq_b.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.kv_a_layernorm.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.biases": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.scales": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.o_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.o_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.o_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.q_a_layernorm.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.q_a_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.q_a_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.q_a_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.q_b_proj.biases": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.q_b_proj.scales": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.q_b_proj.weight": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.unembed_out.biases": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.unembed_out.scales": "model-00052-of-00057.safetensors", + "model.layers.72.self_attn.unembed_out.weight": "model-00052-of-00057.safetensors", + "model.layers.73.input_layernorm.weight": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.gate.e_score_correction_bias": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.gate.weight": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.biases": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.scales": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.biases": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.scales": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.biases": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.scales": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.biases": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.scales": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.biases": "model-00053-of-00057.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.scales": "model-00053-of-00057.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.biases": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.scales": "model-00054-of-00057.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.73.post_attention_layernorm.weight": "model-00054-of-00057.safetensors", + "model.layers.73.self_attn.embed_q.biases": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.embed_q.scales": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.embed_q.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.indexer.k_norm.bias": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.indexer.k_norm.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.indexer.weights_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.indexer.wk.biases": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.indexer.wk.scales": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.indexer.wk.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.indexer.wq_b.biases": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.indexer.wq_b.scales": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.indexer.wq_b.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.kv_a_layernorm.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.biases": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.scales": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.o_proj.biases": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.o_proj.scales": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.o_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.q_a_layernorm.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.q_a_proj.biases": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.q_a_proj.scales": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.q_a_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.q_b_proj.biases": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.q_b_proj.scales": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.q_b_proj.weight": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.unembed_out.biases": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.unembed_out.scales": "model-00053-of-00057.safetensors", + "model.layers.73.self_attn.unembed_out.weight": "model-00053-of-00057.safetensors", + "model.layers.74.input_layernorm.weight": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.gate.e_score_correction_bias": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.gate.weight": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.biases": "model-00054-of-00057.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.scales": "model-00054-of-00057.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.biases": "model-00054-of-00057.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.scales": "model-00054-of-00057.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.74.post_attention_layernorm.weight": "model-00055-of-00057.safetensors", + "model.layers.74.self_attn.embed_q.biases": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.embed_q.scales": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.embed_q.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.indexer.k_norm.bias": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.indexer.k_norm.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.indexer.weights_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.indexer.wk.biases": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.indexer.wk.scales": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.indexer.wk.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.indexer.wq_b.biases": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.indexer.wq_b.scales": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.indexer.wq_b.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.kv_a_layernorm.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.biases": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.scales": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.o_proj.biases": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.o_proj.scales": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.o_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.q_a_layernorm.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.q_a_proj.biases": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.q_a_proj.scales": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.q_a_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.q_b_proj.biases": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.q_b_proj.scales": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.q_b_proj.weight": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.unembed_out.biases": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.unembed_out.scales": "model-00054-of-00057.safetensors", + "model.layers.74.self_attn.unembed_out.weight": "model-00054-of-00057.safetensors", + "model.layers.75.input_layernorm.weight": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.gate.e_score_correction_bias": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.gate.weight": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.75.post_attention_layernorm.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.embed_q.biases": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.embed_q.scales": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.embed_q.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.indexer.k_norm.bias": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.indexer.k_norm.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.indexer.weights_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.indexer.wk.biases": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.indexer.wk.scales": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.indexer.wk.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.indexer.wq_b.biases": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.indexer.wq_b.scales": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.indexer.wq_b.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.kv_a_layernorm.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.biases": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.scales": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.o_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.o_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.o_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.q_a_layernorm.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.q_a_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.q_a_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.q_a_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.q_b_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.q_b_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.q_b_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.unembed_out.biases": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.unembed_out.scales": "model-00055-of-00057.safetensors", + "model.layers.75.self_attn.unembed_out.weight": "model-00055-of-00057.safetensors", + "model.layers.76.input_layernorm.weight": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.gate.e_score_correction_bias": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.gate.weight": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.biases": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.scales": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.biases": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.scales": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.biases": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.scales": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.biases": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.scales": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.biases": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.scales": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.biases": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.scales": "model-00056-of-00057.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.76.post_attention_layernorm.weight": "model-00056-of-00057.safetensors", + "model.layers.76.self_attn.embed_q.biases": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.embed_q.scales": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.embed_q.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.indexer.k_norm.bias": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.indexer.k_norm.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.indexer.weights_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.indexer.wk.biases": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.indexer.wk.scales": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.indexer.wk.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.indexer.wq_b.biases": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.indexer.wq_b.scales": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.indexer.wq_b.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.kv_a_layernorm.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.biases": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.scales": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.o_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.o_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.o_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.q_a_layernorm.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.q_a_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.q_a_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.q_a_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.q_b_proj.biases": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.q_b_proj.scales": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.q_b_proj.weight": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.unembed_out.biases": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.unembed_out.scales": "model-00055-of-00057.safetensors", + "model.layers.76.self_attn.unembed_out.weight": "model-00055-of-00057.safetensors", + "model.layers.77.input_layernorm.weight": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.gate.e_score_correction_bias": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.gate.weight": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.biases": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.scales": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.weight": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.biases": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.scales": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.weight": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.biases": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.scales": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.weight": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.biases": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.scales": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.weight": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.biases": "model-00056-of-00057.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.scales": "model-00056-of-00057.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.biases": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.scales": "model-00057-of-00057.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.weight": "model-00057-of-00057.safetensors", + "model.layers.77.post_attention_layernorm.weight": "model-00057-of-00057.safetensors", + "model.layers.77.self_attn.embed_q.biases": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.embed_q.scales": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.embed_q.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.indexer.k_norm.bias": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.indexer.k_norm.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.indexer.weights_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.indexer.wk.biases": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.indexer.wk.scales": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.indexer.wk.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.indexer.wq_b.biases": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.indexer.wq_b.scales": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.indexer.wq_b.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.kv_a_layernorm.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.biases": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.scales": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.o_proj.biases": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.o_proj.scales": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.o_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.q_a_layernorm.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.q_a_proj.biases": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.q_a_proj.scales": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.q_a_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.q_b_proj.biases": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.q_b_proj.scales": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.q_b_proj.weight": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.unembed_out.biases": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.unembed_out.scales": "model-00056-of-00057.safetensors", + "model.layers.77.self_attn.unembed_out.weight": "model-00056-of-00057.safetensors", + "model.layers.8.input_layernorm.weight": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.gate.e_score_correction_bias": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.gate.weight": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.biases": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.scales": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.biases": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.scales": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.biases": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.scales": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.biases": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.scales": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.biases": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.biases": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.scales": "model-00005-of-00057.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00005-of-00057.safetensors", + "model.layers.8.self_attn.embed_q.biases": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.embed_q.scales": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.embed_q.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.indexer.k_norm.bias": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.indexer.k_norm.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.indexer.weights_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.indexer.wk.biases": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.indexer.wk.scales": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.indexer.wk.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.indexer.wq_b.biases": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.indexer.wq_b.scales": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.indexer.wq_b.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.kv_a_layernorm.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.biases": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.scales": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.o_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.o_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.q_a_layernorm.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.q_a_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.q_a_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.q_a_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.q_b_proj.biases": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.q_b_proj.scales": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.q_b_proj.weight": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.unembed_out.biases": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.unembed_out.scales": "model-00004-of-00057.safetensors", + "model.layers.8.self_attn.unembed_out.weight": "model-00004-of-00057.safetensors", + "model.layers.9.input_layernorm.weight": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.gate.e_score_correction_bias": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.gate.weight": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.biases": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.scales": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.biases": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.scales": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.biases": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.scales": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.biases": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.scales": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.biases": "model-00005-of-00057.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00057.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.biases": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.scales": "model-00006-of-00057.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.weight": "model-00006-of-00057.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00006-of-00057.safetensors", + "model.layers.9.self_attn.embed_q.biases": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.embed_q.scales": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.embed_q.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.indexer.k_norm.bias": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.indexer.k_norm.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.indexer.weights_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.indexer.wk.biases": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.indexer.wk.scales": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.indexer.wk.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.indexer.wq_b.biases": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.indexer.wq_b.scales": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.indexer.wq_b.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.kv_a_layernorm.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.biases": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.scales": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.o_proj.biases": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.o_proj.scales": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.q_a_layernorm.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.q_a_proj.biases": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.q_a_proj.scales": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.q_a_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.q_b_proj.biases": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.q_b_proj.scales": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.q_b_proj.weight": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.unembed_out.biases": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.unembed_out.scales": "model-00005-of-00057.safetensors", + "model.layers.9.self_attn.unembed_out.weight": "model-00005-of-00057.safetensors", + "model.norm.weight": "model-00057-of-00057.safetensors" + } +} \ No newline at end of file diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..aba40197a4cdb5607f4ab7a05fb0a4ee8054fd6d --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e773648cb4e65de8660ea6365e10acca112d42a854923df93db4a6f333a82d +size 20217442 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6aa53776c9f7ac98333a470b78a5b732d5343d15 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,34 @@ +{ + "backend": "tokenizers", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|endoftext|>", + "extra_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>", + "<|begin_of_audio|>", + "<|end_of_audio|>", + "<|begin_of_transcription|>", + "<|end_of_transcription|>" + ], + "is_local": true, + "model_max_length": 202752, + "model_specific_special_tokens": {}, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "TokenizersBackend", + "tool_parser_type": "glm47" +}