diff --git a/Llama-3.3-70B-Instruct/ll_4bit/README.md b/Llama-3.3-70B-Instruct/ll_4bit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69b948e165edf9328192beee7bbd9b0f12b987f6 --- /dev/null +++ b/Llama-3.3-70B-Instruct/ll_4bit/README.md @@ -0,0 +1,7 @@ +# Quantized Model Checkpoint + +**Base model:** Llama-3.3-70B-Instruct + +**Average bitwidth:** 4.0 + +See `quantization_config.txt` for full configuration details. diff --git a/Llama-3.3-70B-Instruct/ll_4bit/chat_template.jinja b/Llama-3.3-70B-Instruct/ll_4bit/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..33089ace1be88f22a10fe861ad49718d5d886090 --- /dev/null +++ b/Llama-3.3-70B-Instruct/ll_4bit/chat_template.jinja @@ -0,0 +1,109 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.'
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/Llama-3.3-70B-Instruct/ll_4bit/config.json b/Llama-3.3-70B-Instruct/ll_4bit/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e63a7035337dcb4b48fd88eaa1ae72ee7ddcf27a --- /dev/null +++ b/Llama-3.3-70B-Instruct/ll_4bit/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 28672, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 80, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": false, + "transformers_version": "5.3.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/Llama-3.3-70B-Instruct/ll_4bit/generation_config.json b/Llama-3.3-70B-Instruct/ll_4bit/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..46631145bc4c440dabc4b8b3ae4bad43f1f7d99d --- /dev/null +++ b/Llama-3.3-70B-Instruct/ll_4bit/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.3.0" +} diff --git 
a/Llama-3.3-70B-Instruct/ll_4bit/humming_online_quant_config.json b/Llama-3.3-70B-Instruct/ll_4bit/humming_online_quant_config.json new file mode 100644 index 0000000000000000000000000000000000000000..71f9c5cc25ef014276e683aa28267a8997891b97 --- /dev/null +++ b/Llama-3.3-70B-Instruct/ll_4bit/humming_online_quant_config.json @@ -0,0 +1,5 @@ +{ + "quant_method": "gptq", + "bits": 4, + "group_size": 128 +} \ No newline at end of file diff --git a/Llama-3.3-70B-Instruct/ll_4bit/model.safetensors.index.json b/Llama-3.3-70B-Instruct/ll_4bit/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..1a01e223969ce58bab528610d7b8078cd1f49cd1 --- /dev/null +++ b/Llama-3.3-70B-Instruct/ll_4bit/model.safetensors.index.json @@ -0,0 +1,731 @@ +{ + "metadata": { + "total_parameters": 70553706496, + "total_size": 141107412992 + }, + "weight_map": { + "lm_head.weight": "model-00001-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": 
"model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": 
"model-00001-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": 
"model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.32.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.33.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.33.self_attn.v_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.34.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.35.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.36.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.36.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.36.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.36.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.36.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.36.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.36.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.36.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.36.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.37.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.37.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.37.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.37.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.37.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.37.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.37.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.37.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.37.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.38.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.38.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.38.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.38.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.38.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.38.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.38.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.38.self_attn.q_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.38.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.39.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.39.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.39.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.39.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.39.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.39.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.39.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.39.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.39.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.40.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.40.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.40.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.40.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.40.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.40.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.40.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.40.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.40.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.41.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.41.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.41.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.41.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.41.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.41.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.41.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.41.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.41.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.42.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.42.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.42.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.42.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.42.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.42.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.42.self_attn.o_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.42.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.42.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.43.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.43.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.43.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.43.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.43.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.43.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.43.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.43.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.43.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.44.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.44.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.44.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.44.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.44.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.44.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.44.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.44.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.44.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.45.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.45.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.45.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.45.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.45.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.45.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.45.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.45.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.45.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.46.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.46.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.46.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.46.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.46.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.46.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.46.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.46.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.46.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.47.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.47.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.47.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.47.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.47.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.47.self_attn.k_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.47.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.47.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.47.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.48.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.48.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.48.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.48.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.48.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.48.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.48.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.48.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.48.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.49.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.49.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.49.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.49.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.49.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.49.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.49.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.49.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.49.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.50.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.50.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.50.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.50.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.50.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.50.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.50.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.50.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.50.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.51.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.51.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.51.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.51.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.51.post_attention_layernorm.weight": 
"model-00002-of-00003.safetensors", + "model.layers.51.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.51.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.51.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.51.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.52.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.52.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.52.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.52.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.52.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.52.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.52.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.52.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.52.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.53.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.53.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.53.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.53.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.53.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.53.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.53.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.53.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.53.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.54.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.54.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.54.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.54.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.54.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.54.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.54.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.54.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.54.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.55.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.55.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.55.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.55.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.55.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.55.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.55.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.55.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.55.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.56.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.56.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.56.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.56.mlp.up_proj.weight": 
"model-00003-of-00003.safetensors", + "model.layers.56.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.56.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.56.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.56.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.56.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.57.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.57.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.57.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.57.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.57.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.57.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.57.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.57.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.57.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.58.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.58.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.58.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.58.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.58.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.58.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.58.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.58.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.58.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.59.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.59.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.59.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.59.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.59.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.59.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.59.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.59.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.59.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.60.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.60.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.60.mlp.gate_proj.weight": 
"model-00003-of-00003.safetensors", + "model.layers.60.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.60.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.60.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.60.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.60.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.60.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.61.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.61.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.61.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.61.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.61.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.61.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.61.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.61.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.61.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.62.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.62.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.62.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.62.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.62.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.62.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.62.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.62.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.62.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.63.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.63.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.63.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.63.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.63.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.63.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.63.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.63.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.63.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.64.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.64.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.64.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.64.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.64.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.64.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.64.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.64.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.64.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.65.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.65.mlp.down_proj.weight": 
"model-00003-of-00003.safetensors", + "model.layers.65.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.65.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.65.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.65.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.65.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.65.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.65.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.66.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.66.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.66.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.66.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.66.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.66.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.66.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.66.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.66.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.67.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.67.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.67.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.67.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.67.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.67.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.67.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.67.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.67.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.68.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.68.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.68.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.68.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.68.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.68.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.68.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.68.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.68.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.69.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.69.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.69.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.69.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.69.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.69.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.69.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.69.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.69.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": 
"model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.70.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.70.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.70.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.70.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.70.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.70.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.70.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.70.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.70.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.71.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.71.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.71.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.71.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.71.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.71.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.71.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.71.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.71.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.72.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.72.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.72.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.72.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.72.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.72.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.72.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.72.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.72.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.73.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.73.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.73.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.73.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.73.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.73.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.73.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.73.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.73.self_attn.v_proj.weight": 
"model-00003-of-00003.safetensors", + "model.layers.74.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.74.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.74.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.74.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.74.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.74.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.74.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.74.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.74.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.75.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.75.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.75.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.75.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.75.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.75.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.75.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.75.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.75.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.76.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.76.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.76.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.76.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.76.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.76.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.76.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.76.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.76.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.77.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.77.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.77.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.77.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.77.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.77.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.77.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.77.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.77.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.78.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.78.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.78.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.78.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.78.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.78.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.78.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.78.self_attn.q_proj.weight": 
"model-00003-of-00003.safetensors", + "model.layers.78.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.79.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.79.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.79.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.79.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.79.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.79.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.79.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.79.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.79.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/Llama-3.3-70B-Instruct/ll_4bit/quantization_config.txt b/Llama-3.3-70B-Instruct/ll_4bit/quantization_config.txt new file mode 100644 index 0000000000000000000000000000000000000000..94e66b961c3965d5a9ca23282d8c7aa7f2c7ea57 --- /dev/null +++ b/Llama-3.3-70B-Instruct/ll_4bit/quantization_config.txt @@ -0,0 +1,560 @@ +model.layers.0.self_attn.q_proj: 4 +model.layers.0.self_attn.k_proj: 4 +model.layers.0.self_attn.v_proj: 4 +model.layers.0.self_attn.o_proj: 4 +model.layers.0.mlp.gate_proj: 4 +model.layers.0.mlp.up_proj: 4 +model.layers.0.mlp.down_proj: 4 +model.layers.1.self_attn.q_proj: 4 +model.layers.1.self_attn.k_proj: 4 +model.layers.1.self_attn.v_proj: 4 +model.layers.1.self_attn.o_proj: 4 +model.layers.1.mlp.gate_proj: 4 +model.layers.1.mlp.up_proj: 4 +model.layers.1.mlp.down_proj: 4 +model.layers.2.self_attn.q_proj: 4 +model.layers.2.self_attn.k_proj: 4 +model.layers.2.self_attn.v_proj: 4 +model.layers.2.self_attn.o_proj: 4 +model.layers.2.mlp.gate_proj: 4 +model.layers.2.mlp.up_proj: 4 +model.layers.2.mlp.down_proj: 4 +model.layers.3.self_attn.q_proj: 4 +model.layers.3.self_attn.k_proj: 4 +model.layers.3.self_attn.v_proj: 4 +model.layers.3.self_attn.o_proj: 4 
+model.layers.3.mlp.gate_proj: 4 +model.layers.3.mlp.up_proj: 4 +model.layers.3.mlp.down_proj: 4 +model.layers.4.self_attn.q_proj: 4 +model.layers.4.self_attn.k_proj: 4 +model.layers.4.self_attn.v_proj: 4 +model.layers.4.self_attn.o_proj: 4 +model.layers.4.mlp.gate_proj: 4 +model.layers.4.mlp.up_proj: 4 +model.layers.4.mlp.down_proj: 4 +model.layers.5.self_attn.q_proj: 4 +model.layers.5.self_attn.k_proj: 4 +model.layers.5.self_attn.v_proj: 4 +model.layers.5.self_attn.o_proj: 4 +model.layers.5.mlp.gate_proj: 4 +model.layers.5.mlp.up_proj: 4 +model.layers.5.mlp.down_proj: 4 +model.layers.6.self_attn.q_proj: 4 +model.layers.6.self_attn.k_proj: 4 +model.layers.6.self_attn.v_proj: 4 +model.layers.6.self_attn.o_proj: 4 +model.layers.6.mlp.gate_proj: 4 +model.layers.6.mlp.up_proj: 4 +model.layers.6.mlp.down_proj: 4 +model.layers.7.self_attn.q_proj: 4 +model.layers.7.self_attn.k_proj: 4 +model.layers.7.self_attn.v_proj: 4 +model.layers.7.self_attn.o_proj: 4 +model.layers.7.mlp.gate_proj: 4 +model.layers.7.mlp.up_proj: 4 +model.layers.7.mlp.down_proj: 4 +model.layers.8.self_attn.q_proj: 4 +model.layers.8.self_attn.k_proj: 4 +model.layers.8.self_attn.v_proj: 4 +model.layers.8.self_attn.o_proj: 4 +model.layers.8.mlp.gate_proj: 4 +model.layers.8.mlp.up_proj: 4 +model.layers.8.mlp.down_proj: 4 +model.layers.9.self_attn.q_proj: 4 +model.layers.9.self_attn.k_proj: 4 +model.layers.9.self_attn.v_proj: 4 +model.layers.9.self_attn.o_proj: 4 +model.layers.9.mlp.gate_proj: 4 +model.layers.9.mlp.up_proj: 4 +model.layers.9.mlp.down_proj: 4 +model.layers.10.self_attn.q_proj: 4 +model.layers.10.self_attn.k_proj: 4 +model.layers.10.self_attn.v_proj: 4 +model.layers.10.self_attn.o_proj: 4 +model.layers.10.mlp.gate_proj: 4 +model.layers.10.mlp.up_proj: 4 +model.layers.10.mlp.down_proj: 4 +model.layers.11.self_attn.q_proj: 4 +model.layers.11.self_attn.k_proj: 4 +model.layers.11.self_attn.v_proj: 4 +model.layers.11.self_attn.o_proj: 4 +model.layers.11.mlp.gate_proj: 4 +model.layers.11.mlp.up_proj: 4 +model.layers.11.mlp.down_proj: 4 +model.layers.12.self_attn.q_proj: 4 +model.layers.12.self_attn.k_proj: 4 +model.layers.12.self_attn.v_proj: 4 +model.layers.12.self_attn.o_proj: 4 +model.layers.12.mlp.gate_proj: 4 +model.layers.12.mlp.up_proj: 4 +model.layers.12.mlp.down_proj: 4 +model.layers.13.self_attn.q_proj: 4 +model.layers.13.self_attn.k_proj: 4 +model.layers.13.self_attn.v_proj: 4 +model.layers.13.self_attn.o_proj: 4 +model.layers.13.mlp.gate_proj: 4 +model.layers.13.mlp.up_proj: 4 +model.layers.13.mlp.down_proj: 4 +model.layers.14.self_attn.q_proj: 4 +model.layers.14.self_attn.k_proj: 4 +model.layers.14.self_attn.v_proj: 4 +model.layers.14.self_attn.o_proj: 4 +model.layers.14.mlp.gate_proj: 4 +model.layers.14.mlp.up_proj: 4 +model.layers.14.mlp.down_proj: 4 +model.layers.15.self_attn.q_proj: 4 +model.layers.15.self_attn.k_proj: 4 +model.layers.15.self_attn.v_proj: 4 +model.layers.15.self_attn.o_proj: 4 +model.layers.15.mlp.gate_proj: 4 +model.layers.15.mlp.up_proj: 4 +model.layers.15.mlp.down_proj: 4 +model.layers.16.self_attn.q_proj: 4 +model.layers.16.self_attn.k_proj: 4 +model.layers.16.self_attn.v_proj: 4 +model.layers.16.self_attn.o_proj: 4 +model.layers.16.mlp.gate_proj: 4 +model.layers.16.mlp.up_proj: 4 +model.layers.16.mlp.down_proj: 4 +model.layers.17.self_attn.q_proj: 4 +model.layers.17.self_attn.k_proj: 4 +model.layers.17.self_attn.v_proj: 4 +model.layers.17.self_attn.o_proj: 4 +model.layers.17.mlp.gate_proj: 4 +model.layers.17.mlp.up_proj: 4 +model.layers.17.mlp.down_proj: 4 
+model.layers.18.self_attn.q_proj: 4 +model.layers.18.self_attn.k_proj: 4 +model.layers.18.self_attn.v_proj: 4 +model.layers.18.self_attn.o_proj: 4 +model.layers.18.mlp.gate_proj: 4 +model.layers.18.mlp.up_proj: 4 +model.layers.18.mlp.down_proj: 4 +model.layers.19.self_attn.q_proj: 4 +model.layers.19.self_attn.k_proj: 4 +model.layers.19.self_attn.v_proj: 4 +model.layers.19.self_attn.o_proj: 4 +model.layers.19.mlp.gate_proj: 4 +model.layers.19.mlp.up_proj: 4 +model.layers.19.mlp.down_proj: 4 +model.layers.20.self_attn.q_proj: 4 +model.layers.20.self_attn.k_proj: 4 +model.layers.20.self_attn.v_proj: 4 +model.layers.20.self_attn.o_proj: 4 +model.layers.20.mlp.gate_proj: 4 +model.layers.20.mlp.up_proj: 4 +model.layers.20.mlp.down_proj: 4 +model.layers.21.self_attn.q_proj: 4 +model.layers.21.self_attn.k_proj: 4 +model.layers.21.self_attn.v_proj: 4 +model.layers.21.self_attn.o_proj: 4 +model.layers.21.mlp.gate_proj: 4 +model.layers.21.mlp.up_proj: 4 +model.layers.21.mlp.down_proj: 4 +model.layers.22.self_attn.q_proj: 4 +model.layers.22.self_attn.k_proj: 4 +model.layers.22.self_attn.v_proj: 4 +model.layers.22.self_attn.o_proj: 4 +model.layers.22.mlp.gate_proj: 4 +model.layers.22.mlp.up_proj: 4 +model.layers.22.mlp.down_proj: 4 +model.layers.23.self_attn.q_proj: 4 +model.layers.23.self_attn.k_proj: 4 +model.layers.23.self_attn.v_proj: 4 +model.layers.23.self_attn.o_proj: 4 +model.layers.23.mlp.gate_proj: 4 +model.layers.23.mlp.up_proj: 4 +model.layers.23.mlp.down_proj: 4 +model.layers.24.self_attn.q_proj: 4 +model.layers.24.self_attn.k_proj: 4 +model.layers.24.self_attn.v_proj: 4 +model.layers.24.self_attn.o_proj: 4 +model.layers.24.mlp.gate_proj: 4 +model.layers.24.mlp.up_proj: 4 +model.layers.24.mlp.down_proj: 4 +model.layers.25.self_attn.q_proj: 4 +model.layers.25.self_attn.k_proj: 4 +model.layers.25.self_attn.v_proj: 4 +model.layers.25.self_attn.o_proj: 4 +model.layers.25.mlp.gate_proj: 4 +model.layers.25.mlp.up_proj: 4 +model.layers.25.mlp.down_proj: 4 +model.layers.26.self_attn.q_proj: 4 +model.layers.26.self_attn.k_proj: 4 +model.layers.26.self_attn.v_proj: 4 +model.layers.26.self_attn.o_proj: 4 +model.layers.26.mlp.gate_proj: 4 +model.layers.26.mlp.up_proj: 4 +model.layers.26.mlp.down_proj: 4 +model.layers.27.self_attn.q_proj: 4 +model.layers.27.self_attn.k_proj: 4 +model.layers.27.self_attn.v_proj: 4 +model.layers.27.self_attn.o_proj: 4 +model.layers.27.mlp.gate_proj: 4 +model.layers.27.mlp.up_proj: 4 +model.layers.27.mlp.down_proj: 4 +model.layers.28.self_attn.q_proj: 4 +model.layers.28.self_attn.k_proj: 4 +model.layers.28.self_attn.v_proj: 4 +model.layers.28.self_attn.o_proj: 4 +model.layers.28.mlp.gate_proj: 4 +model.layers.28.mlp.up_proj: 4 +model.layers.28.mlp.down_proj: 4 +model.layers.29.self_attn.q_proj: 4 +model.layers.29.self_attn.k_proj: 4 +model.layers.29.self_attn.v_proj: 4 +model.layers.29.self_attn.o_proj: 4 +model.layers.29.mlp.gate_proj: 4 +model.layers.29.mlp.up_proj: 4 +model.layers.29.mlp.down_proj: 4 +model.layers.30.self_attn.q_proj: 4 +model.layers.30.self_attn.k_proj: 4 +model.layers.30.self_attn.v_proj: 4 +model.layers.30.self_attn.o_proj: 4 +model.layers.30.mlp.gate_proj: 4 +model.layers.30.mlp.up_proj: 4 +model.layers.30.mlp.down_proj: 4 +model.layers.31.self_attn.q_proj: 4 +model.layers.31.self_attn.k_proj: 4 +model.layers.31.self_attn.v_proj: 4 +model.layers.31.self_attn.o_proj: 4 +model.layers.31.mlp.gate_proj: 4 +model.layers.31.mlp.up_proj: 4 +model.layers.31.mlp.down_proj: 4 +model.layers.32.self_attn.q_proj: 4 +model.layers.32.self_attn.k_proj: 4 
+model.layers.32.self_attn.v_proj: 4 +model.layers.32.self_attn.o_proj: 4 +model.layers.32.mlp.gate_proj: 4 +model.layers.32.mlp.up_proj: 4 +model.layers.32.mlp.down_proj: 4 +model.layers.33.self_attn.q_proj: 4 +model.layers.33.self_attn.k_proj: 4 +model.layers.33.self_attn.v_proj: 4 +model.layers.33.self_attn.o_proj: 4 +model.layers.33.mlp.gate_proj: 4 +model.layers.33.mlp.up_proj: 4 +model.layers.33.mlp.down_proj: 4 +model.layers.34.self_attn.q_proj: 4 +model.layers.34.self_attn.k_proj: 4 +model.layers.34.self_attn.v_proj: 4 +model.layers.34.self_attn.o_proj: 4 +model.layers.34.mlp.gate_proj: 4 +model.layers.34.mlp.up_proj: 4 +model.layers.34.mlp.down_proj: 4 +model.layers.35.self_attn.q_proj: 4 +model.layers.35.self_attn.k_proj: 4 +model.layers.35.self_attn.v_proj: 4 +model.layers.35.self_attn.o_proj: 4 +model.layers.35.mlp.gate_proj: 4 +model.layers.35.mlp.up_proj: 4 +model.layers.35.mlp.down_proj: 4 +model.layers.36.self_attn.q_proj: 4 +model.layers.36.self_attn.k_proj: 4 +model.layers.36.self_attn.v_proj: 4 +model.layers.36.self_attn.o_proj: 4 +model.layers.36.mlp.gate_proj: 4 +model.layers.36.mlp.up_proj: 4 +model.layers.36.mlp.down_proj: 4 +model.layers.37.self_attn.q_proj: 4 +model.layers.37.self_attn.k_proj: 4 +model.layers.37.self_attn.v_proj: 4 +model.layers.37.self_attn.o_proj: 4 +model.layers.37.mlp.gate_proj: 4 +model.layers.37.mlp.up_proj: 4 +model.layers.37.mlp.down_proj: 4 +model.layers.38.self_attn.q_proj: 4 +model.layers.38.self_attn.k_proj: 4 +model.layers.38.self_attn.v_proj: 4 +model.layers.38.self_attn.o_proj: 4 +model.layers.38.mlp.gate_proj: 4 +model.layers.38.mlp.up_proj: 4 +model.layers.38.mlp.down_proj: 4 +model.layers.39.self_attn.q_proj: 4 +model.layers.39.self_attn.k_proj: 4 +model.layers.39.self_attn.v_proj: 4 +model.layers.39.self_attn.o_proj: 4 +model.layers.39.mlp.gate_proj: 4 +model.layers.39.mlp.up_proj: 4 +model.layers.39.mlp.down_proj: 4 +model.layers.40.self_attn.q_proj: 4 +model.layers.40.self_attn.k_proj: 4 +model.layers.40.self_attn.v_proj: 4 +model.layers.40.self_attn.o_proj: 4 +model.layers.40.mlp.gate_proj: 4 +model.layers.40.mlp.up_proj: 4 +model.layers.40.mlp.down_proj: 4 +model.layers.41.self_attn.q_proj: 4 +model.layers.41.self_attn.k_proj: 4 +model.layers.41.self_attn.v_proj: 4 +model.layers.41.self_attn.o_proj: 4 +model.layers.41.mlp.gate_proj: 4 +model.layers.41.mlp.up_proj: 4 +model.layers.41.mlp.down_proj: 4 +model.layers.42.self_attn.q_proj: 4 +model.layers.42.self_attn.k_proj: 4 +model.layers.42.self_attn.v_proj: 4 +model.layers.42.self_attn.o_proj: 4 +model.layers.42.mlp.gate_proj: 4 +model.layers.42.mlp.up_proj: 4 +model.layers.42.mlp.down_proj: 4 +model.layers.43.self_attn.q_proj: 4 +model.layers.43.self_attn.k_proj: 4 +model.layers.43.self_attn.v_proj: 4 +model.layers.43.self_attn.o_proj: 4 +model.layers.43.mlp.gate_proj: 4 +model.layers.43.mlp.up_proj: 4 +model.layers.43.mlp.down_proj: 4 +model.layers.44.self_attn.q_proj: 4 +model.layers.44.self_attn.k_proj: 4 +model.layers.44.self_attn.v_proj: 4 +model.layers.44.self_attn.o_proj: 4 +model.layers.44.mlp.gate_proj: 4 +model.layers.44.mlp.up_proj: 4 +model.layers.44.mlp.down_proj: 4 +model.layers.45.self_attn.q_proj: 4 +model.layers.45.self_attn.k_proj: 4 +model.layers.45.self_attn.v_proj: 4 +model.layers.45.self_attn.o_proj: 4 +model.layers.45.mlp.gate_proj: 4 +model.layers.45.mlp.up_proj: 4 +model.layers.45.mlp.down_proj: 4 +model.layers.46.self_attn.q_proj: 4 +model.layers.46.self_attn.k_proj: 4 +model.layers.46.self_attn.v_proj: 4 +model.layers.46.self_attn.o_proj: 4 
+model.layers.46.mlp.gate_proj: 4 +model.layers.46.mlp.up_proj: 4 +model.layers.46.mlp.down_proj: 4 +model.layers.47.self_attn.q_proj: 4 +model.layers.47.self_attn.k_proj: 4 +model.layers.47.self_attn.v_proj: 4 +model.layers.47.self_attn.o_proj: 4 +model.layers.47.mlp.gate_proj: 4 +model.layers.47.mlp.up_proj: 4 +model.layers.47.mlp.down_proj: 4 +model.layers.48.self_attn.q_proj: 4 +model.layers.48.self_attn.k_proj: 4 +model.layers.48.self_attn.v_proj: 4 +model.layers.48.self_attn.o_proj: 4 +model.layers.48.mlp.gate_proj: 4 +model.layers.48.mlp.up_proj: 4 +model.layers.48.mlp.down_proj: 4 +model.layers.49.self_attn.q_proj: 4 +model.layers.49.self_attn.k_proj: 4 +model.layers.49.self_attn.v_proj: 4 +model.layers.49.self_attn.o_proj: 4 +model.layers.49.mlp.gate_proj: 4 +model.layers.49.mlp.up_proj: 4 +model.layers.49.mlp.down_proj: 4 +model.layers.50.self_attn.q_proj: 4 +model.layers.50.self_attn.k_proj: 4 +model.layers.50.self_attn.v_proj: 4 +model.layers.50.self_attn.o_proj: 4 +model.layers.50.mlp.gate_proj: 4 +model.layers.50.mlp.up_proj: 4 +model.layers.50.mlp.down_proj: 4 +model.layers.51.self_attn.q_proj: 4 +model.layers.51.self_attn.k_proj: 4 +model.layers.51.self_attn.v_proj: 4 +model.layers.51.self_attn.o_proj: 4 +model.layers.51.mlp.gate_proj: 4 +model.layers.51.mlp.up_proj: 4 +model.layers.51.mlp.down_proj: 4 +model.layers.52.self_attn.q_proj: 4 +model.layers.52.self_attn.k_proj: 4 +model.layers.52.self_attn.v_proj: 4 +model.layers.52.self_attn.o_proj: 4 +model.layers.52.mlp.gate_proj: 4 +model.layers.52.mlp.up_proj: 4 +model.layers.52.mlp.down_proj: 4 +model.layers.53.self_attn.q_proj: 4 +model.layers.53.self_attn.k_proj: 4 +model.layers.53.self_attn.v_proj: 4 +model.layers.53.self_attn.o_proj: 4 +model.layers.53.mlp.gate_proj: 4 +model.layers.53.mlp.up_proj: 4 +model.layers.53.mlp.down_proj: 4 +model.layers.54.self_attn.q_proj: 4 +model.layers.54.self_attn.k_proj: 4 +model.layers.54.self_attn.v_proj: 4 +model.layers.54.self_attn.o_proj: 4 +model.layers.54.mlp.gate_proj: 4 +model.layers.54.mlp.up_proj: 4 +model.layers.54.mlp.down_proj: 4 +model.layers.55.self_attn.q_proj: 4 +model.layers.55.self_attn.k_proj: 4 +model.layers.55.self_attn.v_proj: 4 +model.layers.55.self_attn.o_proj: 4 +model.layers.55.mlp.gate_proj: 4 +model.layers.55.mlp.up_proj: 4 +model.layers.55.mlp.down_proj: 4 +model.layers.56.self_attn.q_proj: 4 +model.layers.56.self_attn.k_proj: 4 +model.layers.56.self_attn.v_proj: 4 +model.layers.56.self_attn.o_proj: 4 +model.layers.56.mlp.gate_proj: 4 +model.layers.56.mlp.up_proj: 4 +model.layers.56.mlp.down_proj: 4 +model.layers.57.self_attn.q_proj: 4 +model.layers.57.self_attn.k_proj: 4 +model.layers.57.self_attn.v_proj: 4 +model.layers.57.self_attn.o_proj: 4 +model.layers.57.mlp.gate_proj: 4 +model.layers.57.mlp.up_proj: 4 +model.layers.57.mlp.down_proj: 4 +model.layers.58.self_attn.q_proj: 4 +model.layers.58.self_attn.k_proj: 4 +model.layers.58.self_attn.v_proj: 4 +model.layers.58.self_attn.o_proj: 4 +model.layers.58.mlp.gate_proj: 4 +model.layers.58.mlp.up_proj: 4 +model.layers.58.mlp.down_proj: 4 +model.layers.59.self_attn.q_proj: 4 +model.layers.59.self_attn.k_proj: 4 +model.layers.59.self_attn.v_proj: 4 +model.layers.59.self_attn.o_proj: 4 +model.layers.59.mlp.gate_proj: 4 +model.layers.59.mlp.up_proj: 4 +model.layers.59.mlp.down_proj: 4 +model.layers.60.self_attn.q_proj: 4 +model.layers.60.self_attn.k_proj: 4 +model.layers.60.self_attn.v_proj: 4 +model.layers.60.self_attn.o_proj: 4 +model.layers.60.mlp.gate_proj: 4 +model.layers.60.mlp.up_proj: 4 
+model.layers.60.mlp.down_proj: 4 +model.layers.61.self_attn.q_proj: 4 +model.layers.61.self_attn.k_proj: 4 +model.layers.61.self_attn.v_proj: 4 +model.layers.61.self_attn.o_proj: 4 +model.layers.61.mlp.gate_proj: 4 +model.layers.61.mlp.up_proj: 4 +model.layers.61.mlp.down_proj: 4 +model.layers.62.self_attn.q_proj: 4 +model.layers.62.self_attn.k_proj: 4 +model.layers.62.self_attn.v_proj: 4 +model.layers.62.self_attn.o_proj: 4 +model.layers.62.mlp.gate_proj: 4 +model.layers.62.mlp.up_proj: 4 +model.layers.62.mlp.down_proj: 4 +model.layers.63.self_attn.q_proj: 4 +model.layers.63.self_attn.k_proj: 4 +model.layers.63.self_attn.v_proj: 4 +model.layers.63.self_attn.o_proj: 4 +model.layers.63.mlp.gate_proj: 4 +model.layers.63.mlp.up_proj: 4 +model.layers.63.mlp.down_proj: 4 +model.layers.64.self_attn.q_proj: 4 +model.layers.64.self_attn.k_proj: 4 +model.layers.64.self_attn.v_proj: 4 +model.layers.64.self_attn.o_proj: 4 +model.layers.64.mlp.gate_proj: 4 +model.layers.64.mlp.up_proj: 4 +model.layers.64.mlp.down_proj: 4 +model.layers.65.self_attn.q_proj: 4 +model.layers.65.self_attn.k_proj: 4 +model.layers.65.self_attn.v_proj: 4 +model.layers.65.self_attn.o_proj: 4 +model.layers.65.mlp.gate_proj: 4 +model.layers.65.mlp.up_proj: 4 +model.layers.65.mlp.down_proj: 4 +model.layers.66.self_attn.q_proj: 4 +model.layers.66.self_attn.k_proj: 4 +model.layers.66.self_attn.v_proj: 4 +model.layers.66.self_attn.o_proj: 4 +model.layers.66.mlp.gate_proj: 4 +model.layers.66.mlp.up_proj: 4 +model.layers.66.mlp.down_proj: 4 +model.layers.67.self_attn.q_proj: 4 +model.layers.67.self_attn.k_proj: 4 +model.layers.67.self_attn.v_proj: 4 +model.layers.67.self_attn.o_proj: 4 +model.layers.67.mlp.gate_proj: 4 +model.layers.67.mlp.up_proj: 4 +model.layers.67.mlp.down_proj: 4 +model.layers.68.self_attn.q_proj: 4 +model.layers.68.self_attn.k_proj: 4 +model.layers.68.self_attn.v_proj: 4 +model.layers.68.self_attn.o_proj: 4 +model.layers.68.mlp.gate_proj: 4 +model.layers.68.mlp.up_proj: 4 +model.layers.68.mlp.down_proj: 4 +model.layers.69.self_attn.q_proj: 4 +model.layers.69.self_attn.k_proj: 4 +model.layers.69.self_attn.v_proj: 4 +model.layers.69.self_attn.o_proj: 4 +model.layers.69.mlp.gate_proj: 4 +model.layers.69.mlp.up_proj: 4 +model.layers.69.mlp.down_proj: 4 +model.layers.70.self_attn.q_proj: 4 +model.layers.70.self_attn.k_proj: 4 +model.layers.70.self_attn.v_proj: 4 +model.layers.70.self_attn.o_proj: 4 +model.layers.70.mlp.gate_proj: 4 +model.layers.70.mlp.up_proj: 4 +model.layers.70.mlp.down_proj: 4 +model.layers.71.self_attn.q_proj: 4 +model.layers.71.self_attn.k_proj: 4 +model.layers.71.self_attn.v_proj: 4 +model.layers.71.self_attn.o_proj: 4 +model.layers.71.mlp.gate_proj: 4 +model.layers.71.mlp.up_proj: 4 +model.layers.71.mlp.down_proj: 4 +model.layers.72.self_attn.q_proj: 4 +model.layers.72.self_attn.k_proj: 4 +model.layers.72.self_attn.v_proj: 4 +model.layers.72.self_attn.o_proj: 4 +model.layers.72.mlp.gate_proj: 4 +model.layers.72.mlp.up_proj: 4 +model.layers.72.mlp.down_proj: 4 +model.layers.73.self_attn.q_proj: 4 +model.layers.73.self_attn.k_proj: 4 +model.layers.73.self_attn.v_proj: 4 +model.layers.73.self_attn.o_proj: 4 +model.layers.73.mlp.gate_proj: 4 +model.layers.73.mlp.up_proj: 4 +model.layers.73.mlp.down_proj: 4 +model.layers.74.self_attn.q_proj: 4 +model.layers.74.self_attn.k_proj: 4 +model.layers.74.self_attn.v_proj: 4 +model.layers.74.self_attn.o_proj: 4 +model.layers.74.mlp.gate_proj: 4 +model.layers.74.mlp.up_proj: 4 +model.layers.74.mlp.down_proj: 4 +model.layers.75.self_attn.q_proj: 4 
+model.layers.75.self_attn.k_proj: 4 +model.layers.75.self_attn.v_proj: 4 +model.layers.75.self_attn.o_proj: 4 +model.layers.75.mlp.gate_proj: 4 +model.layers.75.mlp.up_proj: 4 +model.layers.75.mlp.down_proj: 4 +model.layers.76.self_attn.q_proj: 4 +model.layers.76.self_attn.k_proj: 4 +model.layers.76.self_attn.v_proj: 4 +model.layers.76.self_attn.o_proj: 4 +model.layers.76.mlp.gate_proj: 4 +model.layers.76.mlp.up_proj: 4 +model.layers.76.mlp.down_proj: 4 +model.layers.77.self_attn.q_proj: 4 +model.layers.77.self_attn.k_proj: 4 +model.layers.77.self_attn.v_proj: 4 +model.layers.77.self_attn.o_proj: 4 +model.layers.77.mlp.gate_proj: 4 +model.layers.77.mlp.up_proj: 4 +model.layers.77.mlp.down_proj: 4 +model.layers.78.self_attn.q_proj: 4 +model.layers.78.self_attn.k_proj: 4 +model.layers.78.self_attn.v_proj: 4 +model.layers.78.self_attn.o_proj: 4 +model.layers.78.mlp.gate_proj: 4 +model.layers.78.mlp.up_proj: 4 +model.layers.78.mlp.down_proj: 4 +model.layers.79.self_attn.q_proj: 4 +model.layers.79.self_attn.k_proj: 4 +model.layers.79.self_attn.v_proj: 4 +model.layers.79.self_attn.o_proj: 4 +model.layers.79.mlp.gate_proj: 4 +model.layers.79.mlp.up_proj: 4 +model.layers.79.mlp.down_proj: 4 diff --git a/Llama-3.3-70B-Instruct/ll_4bit/tokenizer_config.json b/Llama-3.3-70B-Instruct/ll_4bit/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8213417e51fa3bca8617f3231d8e41d17e398214 --- /dev/null +++ b/Llama-3.3-70B-Instruct/ll_4bit/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/Qwen3-32B/ll_4bit/README.md b/Qwen3-32B/ll_4bit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69b948e165edf9328192beee7bbd9b0f12b987f6 --- /dev/null +++ b/Qwen3-32B/ll_4bit/README.md @@ -0,0 +1,7 @@ +# Quantized Model Checkpoint + +**Base model:** unknown + +**Average bitwidth:** unknown + +See `quantization_config.txt` for full configuration details. 
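Both `ll_4bit` checkpoints ship a `quantization_config.txt` in the same plain-text format: one `module.path: bitwidth` pair per line, covering every attention and MLP projection. That makes the README's "Average bitwidth: unknown" placeholder straightforward to fill in. The sketch below is illustrative only; the `summarize_bitwidths` helper is an assumption and not part of this diff:

```python
# Minimal sketch: summarize a quantization_config.txt whose lines look like
#   model.layers.0.self_attn.q_proj: 4
# The helper name and usage are assumptions, not shipped with these checkpoints.
from collections import Counter
from pathlib import Path


def summarize_bitwidths(path: str) -> float:
    """Print a per-bitwidth tally and return the average bitwidth."""
    bits = []
    for line in Path(path).read_text().splitlines():
        line = line.strip()
        if not line:
            continue
        _module, _, bitwidth = line.rpartition(": ")
        bits.append(int(bitwidth))
    for bw, count in sorted(Counter(bits).items()):
        print(f"{count} modules at {bw}-bit")
    return sum(bits) / len(bits)


avg = summarize_bitwidths("Llama-3.3-70B-Instruct/ll_4bit/quantization_config.txt")
print(f"average bitwidth: {avg:.2f}")
```

For the file above, all 560 entries (80 layers × 7 projections) are `4`, so the average comes out to exactly 4.00.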
diff --git a/Qwen3-32B/ll_4bit/chat_template.jinja b/Qwen3-32B/ll_4bit/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..01be9b307daa2d425f7c168c9fb145a286e0afb4 --- /dev/null +++ b/Qwen3-32B/ll_4bit/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in content %} + {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- set content = content.split('</think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + 
{{- '<think>\n\n</think>\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/Qwen3-32B/ll_4bit/config.json b/Qwen3-32B/ll_4bit/config.json new file mode 100644 index 0000000000000000000000000000000000000000..232e9bf5c353986af38d4bf9de4683158089b93c --- /dev/null +++ b/Qwen3-32B/ll_4bit/config.json @@ -0,0 +1,99 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "float16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 25600, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 64, + "model_type": "qwen3", + "num_attention_heads": 64, + "num_hidden_layers": 64, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.3.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/Qwen3-32B/ll_4bit/generation_config.json b/Qwen3-32B/ll_4bit/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cfb3c1a14c48fda643794324f7c126c3c83bcd3e --- /dev/null +++ b/Qwen3-32B/ll_4bit/generation_config.json @@ -0,0 +1,13 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.3.0" +} diff --git a/Qwen3-32B/ll_4bit/humming_online_quant_config.json b/Qwen3-32B/ll_4bit/humming_online_quant_config.json new file mode 100644 index 0000000000000000000000000000000000000000..71f9c5cc25ef014276e683aa28267a8997891b97 --- /dev/null +++ b/Qwen3-32B/ll_4bit/humming_online_quant_config.json @@ -0,0 +1,5 @@ +{ + "quant_method": "gptq", + "bits": 4, + "group_size": 128 +} \ No newline at end of file diff --git a/Qwen3-32B/ll_4bit/model.safetensors.index.json b/Qwen3-32B/ll_4bit/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8b2a0b459d8cf056d279cb6405e9ac0770396d01 --- /dev/null +++ 
b/Qwen3-32B/ll_4bit/model.safetensors.index.json @@ -0,0 +1,715 @@ +{ + "metadata": { + "total_parameters": 32762123264, + "total_size": 65524246528 + }, + "weight_map": { + "lm_head.weight": "model-00001-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.22.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.26.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.31.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.32.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.32.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.32.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.33.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.33.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.33.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.33.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.34.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.34.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.34.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.35.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.35.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.35.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.36.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.36.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.36.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.36.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.36.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.36.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.36.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.36.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.36.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.36.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.36.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.37.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.37.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.37.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.37.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.37.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.37.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.37.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.37.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.37.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.37.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.37.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.38.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.38.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.38.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.38.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.38.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.38.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.38.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.38.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.38.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.38.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.38.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.39.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.39.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.39.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.39.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.39.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.39.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.39.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.39.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.39.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.39.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.39.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.40.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.40.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.40.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.40.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.40.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.40.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.40.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.40.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.40.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.40.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.40.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.41.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.41.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.41.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.41.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.41.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.41.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.41.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.41.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.41.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.41.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.41.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.42.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.42.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.42.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.42.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.42.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.42.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.42.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.42.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.42.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.42.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.42.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.43.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.43.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.43.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.43.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.43.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.43.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.43.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.43.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.43.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.43.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.43.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.44.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.44.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.44.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.44.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.44.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.44.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.44.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.44.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.44.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.44.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.44.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.45.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.45.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.45.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.45.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.45.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.45.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.45.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.45.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.45.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.45.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.45.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.46.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.46.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.46.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.46.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.46.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.46.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.46.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.46.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.46.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.46.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.46.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.47.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.47.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.47.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.47.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.47.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.47.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.47.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.47.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.47.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.47.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.47.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.48.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.48.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.48.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.48.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.48.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.48.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.48.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.48.self_attn.o_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.48.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.48.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.48.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.49.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.49.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.49.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.49.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.49.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.49.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.49.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.49.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.49.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.49.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.49.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.50.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.50.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.50.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.50.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.50.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.50.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.50.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.50.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.50.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.50.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.50.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.51.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.51.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.51.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.51.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.51.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.51.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.51.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.51.self_attn.o_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.51.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.51.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.51.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.52.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.52.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.52.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.52.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.52.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.52.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.52.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.52.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.52.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.52.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.52.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.53.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.53.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.53.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.53.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.53.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.53.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.53.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.53.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.53.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.53.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.53.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.54.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.54.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.54.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.54.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.54.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.54.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.54.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.54.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.54.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.54.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.54.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.55.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.55.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.55.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.55.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.55.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.55.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.55.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.55.self_attn.o_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.55.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.55.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.55.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.56.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.56.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.56.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.56.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.56.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.56.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.56.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.56.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.56.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.56.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.56.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.57.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.57.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.57.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.57.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.57.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.57.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.57.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.57.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.57.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.57.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.57.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.58.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.58.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.58.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.58.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.58.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.58.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.58.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.58.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.58.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.58.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.58.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.59.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.59.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.59.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.59.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.59.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.59.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.59.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.59.self_attn.o_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.59.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.59.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.59.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.60.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.60.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.60.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.60.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.60.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.60.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.60.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.60.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.60.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.60.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.60.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.61.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.61.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.61.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.61.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.61.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.61.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.61.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.61.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.61.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.61.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.61.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.62.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.62.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.62.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.62.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.62.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.62.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.62.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.62.self_attn.o_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.62.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.62.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.62.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.63.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.63.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.63.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.63.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.63.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.63.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.63.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.63.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.63.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.layers.63.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.63.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", 
+ "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/Qwen3-32B/ll_4bit/quantization_config.txt b/Qwen3-32B/ll_4bit/quantization_config.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb782a8e5e5dab52b3267a020d3128febc5738c8 --- /dev/null +++ b/Qwen3-32B/ll_4bit/quantization_config.txt @@ -0,0 +1,280 @@ +model.layers.0.mlp.gate_proj: 4 +model.layers.0.mlp.up_proj: 4 +model.layers.0.self_attn.k_proj: 4 +model.layers.0.self_attn.q_proj: 4 +model.layers.0.self_attn.v_proj: 4 +model.layers.1.mlp.gate_proj: 4 +model.layers.1.mlp.up_proj: 4 +model.layers.1.self_attn.k_proj: 4 +model.layers.1.self_attn.q_proj: 4 +model.layers.1.self_attn.v_proj: 4 +model.layers.2.mlp.gate_proj: 4 +model.layers.2.mlp.up_proj: 4 +model.layers.2.self_attn.k_proj: 4 +model.layers.2.self_attn.q_proj: 4 +model.layers.2.self_attn.v_proj: 4 +model.layers.3.mlp.gate_proj: 4 +model.layers.3.mlp.up_proj: 4 +model.layers.3.self_attn.k_proj: 4 +model.layers.3.self_attn.q_proj: 4 +model.layers.3.self_attn.v_proj: 4 +model.layers.4.mlp.gate_proj: 4 +model.layers.4.mlp.up_proj: 4 +model.layers.4.self_attn.k_proj: 4 +model.layers.4.self_attn.q_proj: 4 +model.layers.4.self_attn.v_proj: 4 +model.layers.5.mlp.gate_proj: 4 +model.layers.5.mlp.up_proj: 4 +model.layers.5.self_attn.k_proj: 4 +model.layers.5.self_attn.q_proj: 4 +model.layers.5.self_attn.v_proj: 4 +model.layers.6.mlp.gate_proj: 4 +model.layers.6.mlp.up_proj: 4 +model.layers.6.self_attn.k_proj: 4 +model.layers.6.self_attn.q_proj: 4 +model.layers.6.self_attn.v_proj: 4 +model.layers.7.mlp.gate_proj: 4 +model.layers.7.mlp.up_proj: 4 +model.layers.7.self_attn.k_proj: 4 +model.layers.7.self_attn.q_proj: 4 +model.layers.7.self_attn.v_proj: 4 +model.layers.8.mlp.gate_proj: 4 +model.layers.8.mlp.up_proj: 4 +model.layers.8.self_attn.k_proj: 4 +model.layers.8.self_attn.q_proj: 4 +model.layers.8.self_attn.v_proj: 4 +model.layers.9.mlp.gate_proj: 4 +model.layers.9.mlp.up_proj: 4 +model.layers.9.self_attn.k_proj: 4 +model.layers.9.self_attn.q_proj: 4 +model.layers.9.self_attn.v_proj: 4 +model.layers.10.mlp.gate_proj: 4 +model.layers.10.mlp.up_proj: 4 +model.layers.10.self_attn.k_proj: 4 +model.layers.10.self_attn.q_proj: 4 +model.layers.10.self_attn.v_proj: 4 +model.layers.11.mlp.gate_proj: 4 +model.layers.11.mlp.up_proj: 4 +model.layers.11.self_attn.k_proj: 4 +model.layers.11.self_attn.q_proj: 4 +model.layers.11.self_attn.v_proj: 4 +model.layers.12.mlp.gate_proj: 4 +model.layers.12.mlp.up_proj: 4 +model.layers.12.self_attn.k_proj: 4 +model.layers.12.self_attn.q_proj: 4 +model.layers.12.self_attn.v_proj: 4 +model.layers.13.mlp.gate_proj: 4 +model.layers.13.mlp.up_proj: 4 +model.layers.13.self_attn.k_proj: 4 +model.layers.13.self_attn.q_proj: 4 +model.layers.13.self_attn.v_proj: 4 +model.layers.14.mlp.gate_proj: 4 +model.layers.14.mlp.up_proj: 4 +model.layers.14.self_attn.k_proj: 4 +model.layers.14.self_attn.q_proj: 4 +model.layers.14.self_attn.v_proj: 4 +model.layers.15.mlp.gate_proj: 4 +model.layers.15.mlp.up_proj: 4 +model.layers.15.self_attn.k_proj: 4 +model.layers.15.self_attn.q_proj: 4 +model.layers.15.self_attn.v_proj: 4 +model.layers.16.mlp.gate_proj: 4 +model.layers.16.mlp.up_proj: 4 +model.layers.16.self_attn.k_proj: 4 +model.layers.16.self_attn.q_proj: 4 +model.layers.16.self_attn.v_proj: 4 
+model.layers.17.mlp.gate_proj: 4 +model.layers.17.mlp.up_proj: 4 +model.layers.17.self_attn.k_proj: 4 +model.layers.17.self_attn.q_proj: 4 +model.layers.17.self_attn.v_proj: 4 +model.layers.18.mlp.gate_proj: 4 +model.layers.18.mlp.up_proj: 4 +model.layers.18.self_attn.k_proj: 4 +model.layers.18.self_attn.q_proj: 4 +model.layers.18.self_attn.v_proj: 4 +model.layers.19.mlp.gate_proj: 4 +model.layers.19.mlp.up_proj: 4 +model.layers.19.self_attn.k_proj: 4 +model.layers.19.self_attn.q_proj: 4 +model.layers.19.self_attn.v_proj: 4 +model.layers.20.mlp.gate_proj: 4 +model.layers.20.mlp.up_proj: 4 +model.layers.20.self_attn.k_proj: 4 +model.layers.20.self_attn.q_proj: 4 +model.layers.20.self_attn.v_proj: 4 +model.layers.21.mlp.gate_proj: 4 +model.layers.21.mlp.up_proj: 4 +model.layers.21.self_attn.k_proj: 4 +model.layers.21.self_attn.q_proj: 4 +model.layers.21.self_attn.v_proj: 4 +model.layers.22.mlp.gate_proj: 4 +model.layers.22.mlp.up_proj: 4 +model.layers.22.self_attn.k_proj: 4 +model.layers.22.self_attn.q_proj: 4 +model.layers.22.self_attn.v_proj: 4 +model.layers.23.mlp.gate_proj: 4 +model.layers.23.mlp.up_proj: 4 +model.layers.23.self_attn.k_proj: 4 +model.layers.23.self_attn.q_proj: 4 +model.layers.23.self_attn.v_proj: 4 +model.layers.24.mlp.gate_proj: 4 +model.layers.24.mlp.up_proj: 4 +model.layers.24.self_attn.k_proj: 4 +model.layers.24.self_attn.q_proj: 4 +model.layers.24.self_attn.v_proj: 4 +model.layers.25.mlp.gate_proj: 4 +model.layers.25.mlp.up_proj: 4 +model.layers.25.self_attn.k_proj: 4 +model.layers.25.self_attn.q_proj: 4 +model.layers.25.self_attn.v_proj: 4 +model.layers.26.mlp.gate_proj: 4 +model.layers.26.mlp.up_proj: 4 +model.layers.26.self_attn.k_proj: 4 +model.layers.26.self_attn.q_proj: 4 +model.layers.26.self_attn.v_proj: 4 +model.layers.27.mlp.gate_proj: 4 +model.layers.27.mlp.up_proj: 4 +model.layers.27.self_attn.k_proj: 4 +model.layers.27.self_attn.q_proj: 4 +model.layers.27.self_attn.v_proj: 4 +model.layers.28.mlp.gate_proj: 4 +model.layers.28.mlp.up_proj: 4 +model.layers.28.self_attn.k_proj: 4 +model.layers.28.self_attn.q_proj: 4 +model.layers.28.self_attn.v_proj: 4 +model.layers.29.mlp.gate_proj: 4 +model.layers.29.mlp.up_proj: 4 +model.layers.29.self_attn.k_proj: 4 +model.layers.29.self_attn.q_proj: 4 +model.layers.29.self_attn.v_proj: 4 +model.layers.30.mlp.gate_proj: 4 +model.layers.30.mlp.up_proj: 4 +model.layers.30.self_attn.k_proj: 4 +model.layers.30.self_attn.q_proj: 4 +model.layers.30.self_attn.v_proj: 4 +model.layers.31.mlp.gate_proj: 4 +model.layers.31.mlp.up_proj: 4 +model.layers.31.self_attn.k_proj: 4 +model.layers.31.self_attn.q_proj: 4 +model.layers.31.self_attn.v_proj: 4 +model.layers.32.mlp.gate_proj: 4 +model.layers.32.mlp.up_proj: 4 +model.layers.32.self_attn.k_proj: 4 +model.layers.32.self_attn.q_proj: 4 +model.layers.32.self_attn.v_proj: 4 +model.layers.33.mlp.gate_proj: 4 +model.layers.33.mlp.up_proj: 4 +model.layers.33.self_attn.k_proj: 4 +model.layers.33.self_attn.q_proj: 4 +model.layers.33.self_attn.v_proj: 4 +model.layers.34.mlp.gate_proj: 4 +model.layers.34.mlp.up_proj: 4 +model.layers.34.self_attn.k_proj: 4 +model.layers.34.self_attn.q_proj: 4 +model.layers.34.self_attn.v_proj: 4 +model.layers.35.mlp.gate_proj: 4 +model.layers.35.mlp.up_proj: 4 +model.layers.35.self_attn.k_proj: 4 +model.layers.35.self_attn.q_proj: 4 +model.layers.35.self_attn.v_proj: 4 +model.layers.36.mlp.gate_proj: 4 +model.layers.36.mlp.up_proj: 4 +model.layers.36.self_attn.k_proj: 4 +model.layers.36.self_attn.q_proj: 4 +model.layers.36.self_attn.v_proj: 4 
+model.layers.37.mlp.gate_proj: 4 +model.layers.37.mlp.up_proj: 4 +model.layers.37.self_attn.k_proj: 4 +model.layers.37.self_attn.q_proj: 4 +model.layers.37.self_attn.v_proj: 4 +model.layers.38.mlp.gate_proj: 4 +model.layers.38.mlp.up_proj: 4 +model.layers.38.self_attn.k_proj: 4 +model.layers.38.self_attn.q_proj: 4 +model.layers.38.self_attn.v_proj: 4 +model.layers.39.mlp.gate_proj: 4 +model.layers.39.mlp.up_proj: 4 +model.layers.39.self_attn.k_proj: 4 +model.layers.39.self_attn.q_proj: 4 +model.layers.39.self_attn.v_proj: 4 +model.layers.0.mlp.down_proj: 4 +model.layers.1.mlp.down_proj: 4 +model.layers.0.self_attn.o_proj: 4 +model.layers.2.self_attn.o_proj: 4 +model.layers.1.self_attn.o_proj: 4 +model.layers.29.mlp.down_proj: 4 +model.layers.21.mlp.down_proj: 4 +model.layers.3.self_attn.o_proj: 4 +model.layers.2.mlp.down_proj: 4 +model.layers.3.mlp.down_proj: 4 +model.layers.8.mlp.down_proj: 4 +model.layers.7.mlp.down_proj: 4 +model.layers.6.mlp.down_proj: 4 +model.layers.13.mlp.down_proj: 4 +model.layers.16.mlp.down_proj: 4 +model.layers.22.self_attn.o_proj: 4 +model.layers.38.mlp.down_proj: 4 +model.layers.16.self_attn.o_proj: 4 +model.layers.15.self_attn.o_proj: 4 +model.layers.7.self_attn.o_proj: 4 +model.layers.27.self_attn.o_proj: 4 +model.layers.28.mlp.down_proj: 4 +model.layers.5.self_attn.o_proj: 4 +model.layers.6.self_attn.o_proj: 4 +model.layers.12.mlp.down_proj: 4 +model.layers.23.self_attn.o_proj: 4 +model.layers.8.self_attn.o_proj: 4 +model.layers.19.self_attn.o_proj: 4 +model.layers.4.self_attn.o_proj: 4 +model.layers.4.mlp.down_proj: 4 +model.layers.5.mlp.down_proj: 4 +model.layers.9.mlp.down_proj: 4 +model.layers.9.self_attn.o_proj: 4 +model.layers.30.mlp.down_proj: 4 +model.layers.34.mlp.down_proj: 4 +model.layers.22.mlp.down_proj: 4 +model.layers.18.self_attn.o_proj: 4 +model.layers.20.self_attn.o_proj: 4 +model.layers.21.self_attn.o_proj: 4 +model.layers.38.self_attn.o_proj: 4 +model.layers.14.mlp.down_proj: 4 +model.layers.25.self_attn.o_proj: 4 +model.layers.11.mlp.down_proj: 4 +model.layers.10.self_attn.o_proj: 4 +model.layers.17.self_attn.o_proj: 4 +model.layers.34.self_attn.o_proj: 4 +model.layers.29.self_attn.o_proj: 4 +model.layers.39.self_attn.o_proj: 4 +model.layers.14.self_attn.o_proj: 4 +model.layers.20.mlp.down_proj: 4 +model.layers.32.mlp.down_proj: 4 +model.layers.37.self_attn.o_proj: 4 +model.layers.10.mlp.down_proj: 4 +model.layers.15.mlp.down_proj: 4 +model.layers.11.self_attn.o_proj: 4 +model.layers.12.self_attn.o_proj: 4 +model.layers.13.self_attn.o_proj: 4 +model.layers.18.mlp.down_proj: 4 +model.layers.31.mlp.down_proj: 4 +model.layers.32.self_attn.o_proj: 4 +model.layers.37.mlp.down_proj: 4 +model.layers.23.mlp.down_proj: 4 +model.layers.25.mlp.down_proj: 4 +model.layers.28.self_attn.o_proj: 4 +model.layers.24.mlp.down_proj: 4 +model.layers.17.mlp.down_proj: 4 +model.layers.19.mlp.down_proj: 4 +model.layers.26.self_attn.o_proj: 4 +model.layers.26.mlp.down_proj: 4 +model.layers.27.mlp.down_proj: 4 +model.layers.31.self_attn.o_proj: 4 +model.layers.24.self_attn.o_proj: 4 +model.layers.33.self_attn.o_proj: 4 +model.layers.30.self_attn.o_proj: 4 +model.layers.33.mlp.down_proj: 4 +model.layers.36.self_attn.o_proj: 4 +model.layers.39.mlp.down_proj: 4 +model.layers.35.mlp.down_proj: 4 +model.layers.36.mlp.down_proj: 4 +model.layers.35.self_attn.o_proj: 4 diff --git a/Qwen3-32B/ll_4bit/tokenizer_config.json b/Qwen3-32B/ll_4bit/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/Qwen3-32B/ll_4bit/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/Qwen3-8B/ll_4bit/README.md b/Qwen3-8B/ll_4bit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69b948e165edf9328192beee7bbd9b0f12b987f6 --- /dev/null +++ b/Qwen3-8B/ll_4bit/README.md @@ -0,0 +1,7 @@ +# Quantized Model Checkpoint + +**Base model:** unknown + +**Average bitwidth:** unknown + +See `quantization_config.txt` for full configuration details. diff --git a/Qwen3-8B/ll_4bit/chat_template.jinja b/Qwen3-8B/ll_4bit/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..01be9b307daa2d425f7c168c9fb145a286e0afb4 --- /dev/null +++ b/Qwen3-8B/ll_4bit/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in content %} + {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- set content = content.split('</think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think>\n\n</think>\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/Qwen3-8B/ll_4bit/config.json b/Qwen3-8B/ll_4bit/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ba589300345eda2a40e727e7bc0bdc65b087579b --- /dev/null +++ b/Qwen3-8B/ll_4bit/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "float16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.3.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/Qwen3-8B/ll_4bit/generation_config.json b/Qwen3-8B/ll_4bit/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cfb3c1a14c48fda643794324f7c126c3c83bcd3e --- /dev/null +++ b/Qwen3-8B/ll_4bit/generation_config.json @@ -0,0 +1,13 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.3.0" +} diff --git 
a/Qwen3-8B/ll_4bit/humming_online_quant_config.json b/Qwen3-8B/ll_4bit/humming_online_quant_config.json new file mode 100644 index 0000000000000000000000000000000000000000..71f9c5cc25ef014276e683aa28267a8997891b97 --- /dev/null +++ b/Qwen3-8B/ll_4bit/humming_online_quant_config.json @@ -0,0 +1,5 @@ +{ + "quant_method": "gptq", + "bits": 4, + "group_size": 128 +} \ No newline at end of file diff --git a/Qwen3-8B/ll_4bit/quantization_config.txt b/Qwen3-8B/ll_4bit/quantization_config.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ca0894605558e83f82a42fc72e0a9cd606d0222 --- /dev/null +++ b/Qwen3-8B/ll_4bit/quantization_config.txt @@ -0,0 +1,252 @@ +model.layers.23.self_attn.k_proj: 4 +model.layers.22.self_attn.o_proj: 4 +model.layers.4.self_attn.o_proj: 4 +model.layers.1.mlp.down_proj: 4 +model.layers.11.mlp.up_proj: 4 +model.layers.5.self_attn.k_proj: 4 +model.layers.14.self_attn.q_proj: 4 +model.layers.17.mlp.down_proj: 4 +model.layers.33.mlp.gate_proj: 4 +model.layers.5.self_attn.v_proj: 4 +model.layers.23.self_attn.v_proj: 4 +model.layers.27.mlp.gate_proj: 4 +model.layers.2.mlp.up_proj: 4 +model.layers.15.self_attn.q_proj: 4 +model.layers.23.self_attn.o_proj: 4 +model.layers.21.mlp.up_proj: 4 +model.layers.22.self_attn.k_proj: 4 +model.layers.4.mlp.down_proj: 4 +model.layers.4.self_attn.k_proj: 4 +model.layers.5.self_attn.o_proj: 4 +model.layers.31.mlp.up_proj: 4 +model.layers.4.self_attn.v_proj: 4 +model.layers.22.self_attn.v_proj: 4 +model.layers.22.mlp.gate_proj: 4 +model.layers.8.mlp.gate_proj: 4 +model.layers.12.mlp.down_proj: 4 +model.layers.7.mlp.gate_proj: 4 +model.layers.35.self_attn.o_proj: 4 +model.layers.21.self_attn.v_proj: 4 +model.layers.34.self_attn.k_proj: 4 +model.layers.7.self_attn.v_proj: 4 +model.layers.7.self_attn.k_proj: 4 +model.layers.6.self_attn.o_proj: 4 +model.layers.34.self_attn.v_proj: 4 +model.layers.20.self_attn.o_proj: 4 +model.layers.13.mlp.up_proj: 4 +model.layers.21.self_attn.k_proj: 4 +model.layers.35.mlp.down_proj: 4 +model.layers.28.self_attn.q_proj: 4 +model.layers.11.mlp.gate_proj: 4 +model.layers.16.self_attn.q_proj: 4 +model.layers.21.mlp.down_proj: 4 +model.layers.35.self_attn.k_proj: 4 +model.layers.33.mlp.up_proj: 4 +model.layers.20.self_attn.v_proj: 4 +model.layers.34.self_attn.o_proj: 4 +model.layers.6.self_attn.v_proj: 4 +model.layers.28.mlp.gate_proj: 4 +model.layers.2.mlp.gate_proj: 4 +model.layers.18.mlp.down_proj: 4 +model.layers.17.self_attn.q_proj: 4 +model.layers.0.mlp.up_proj: 4 +model.layers.24.mlp.down_proj: 4 +model.layers.23.mlp.up_proj: 4 +model.layers.7.self_attn.o_proj: 4 +model.layers.6.self_attn.k_proj: 4 +model.layers.29.self_attn.q_proj: 4 +model.layers.20.self_attn.k_proj: 4 +model.layers.30.mlp.down_proj: 4 +model.layers.14.mlp.gate_proj: 4 +model.layers.21.self_attn.o_proj: 4 +model.layers.35.self_attn.v_proj: 4 +model.layers.6.mlp.up_proj: 4 +model.layers.9.self_attn.q_proj: 4 +model.layers.0.self_attn.k_proj: 4 +model.layers.9.mlp.up_proj: 4 +model.layers.1.self_attn.o_proj: 4 +model.layers.33.self_attn.v_proj: 4 +model.layers.27.self_attn.o_proj: 4 +model.layers.26.self_attn.k_proj: 4 +model.layers.19.self_attn.o_proj: 4 +model.layers.0.mlp.gate_proj: 4 +model.layers.11.self_attn.q_proj: 4 +model.layers.18.self_attn.k_proj: 4 +model.layers.25.mlp.up_proj: 4 +model.layers.26.mlp.down_proj: 4 +model.layers.18.self_attn.v_proj: 4 +model.layers.35.mlp.up_proj: 4 +model.layers.32.self_attn.o_proj: 4 +model.layers.26.self_attn.v_proj: 4 +model.layers.33.self_attn.k_proj: 4 
+model.layers.32.mlp.down_proj: 4 +model.layers.16.mlp.gate_proj: 4 +model.layers.0.self_attn.v_proj: 4 +model.layers.15.mlp.up_proj: 4 +model.layers.19.self_attn.k_proj: 4 +model.layers.10.self_attn.q_proj: 4 +model.layers.5.mlp.gate_proj: 4 +model.layers.18.self_attn.o_proj: 4 +model.layers.0.self_attn.o_proj: 4 +model.layers.1.self_attn.k_proj: 4 +model.layers.8.self_attn.q_proj: 4 +model.layers.27.self_attn.k_proj: 4 +model.layers.26.self_attn.o_proj: 4 +model.layers.32.self_attn.v_proj: 4 +model.layers.9.mlp.down_proj: 4 +model.layers.32.self_attn.k_proj: 4 +model.layers.13.mlp.gate_proj: 4 +model.layers.27.self_attn.v_proj: 4 +model.layers.33.self_attn.o_proj: 4 +model.layers.1.self_attn.v_proj: 4 +model.layers.23.mlp.down_proj: 4 +model.layers.19.self_attn.v_proj: 4 +model.layers.2.self_attn.v_proj: 4 +model.layers.6.mlp.down_proj: 4 +model.layers.31.self_attn.k_proj: 4 +model.layers.24.self_attn.v_proj: 4 +model.layers.30.self_attn.o_proj: 4 +model.layers.24.self_attn.k_proj: 4 +model.layers.4.mlp.up_proj: 4 +model.layers.25.self_attn.o_proj: 4 +model.layers.31.self_attn.v_proj: 4 +model.layers.20.mlp.gate_proj: 4 +model.layers.3.self_attn.o_proj: 4 +model.layers.2.self_attn.k_proj: 4 +model.layers.10.mlp.down_proj: 4 +model.layers.34.mlp.gate_proj: 4 +model.layers.13.self_attn.q_proj: 4 +model.layers.28.mlp.up_proj: 4 +model.layers.27.mlp.up_proj: 4 +model.layers.3.mlp.down_proj: 4 +model.layers.3.self_attn.v_proj: 4 +model.layers.19.mlp.gate_proj: 4 +model.layers.31.self_attn.o_proj: 4 +model.layers.25.self_attn.v_proj: 4 +model.layers.30.self_attn.k_proj: 4 +model.layers.29.mlp.down_proj: 4 +model.layers.18.mlp.up_proj: 4 +model.layers.15.mlp.down_proj: 4 +model.layers.17.mlp.up_proj: 4 +model.layers.31.mlp.gate_proj: 4 +model.layers.12.self_attn.q_proj: 4 +model.layers.30.self_attn.v_proj: 4 +model.layers.24.self_attn.o_proj: 4 +model.layers.25.self_attn.k_proj: 4 +model.layers.25.mlp.gate_proj: 4 +model.layers.3.self_attn.k_proj: 4 +model.layers.2.self_attn.o_proj: 4 +model.layers.2.mlp.down_proj: 4 +model.layers.16.self_attn.o_proj: 4 +model.layers.18.mlp.gate_proj: 4 +model.layers.17.self_attn.k_proj: 4 +model.layers.6.self_attn.q_proj: 4 +model.layers.28.mlp.down_proj: 4 +model.layers.28.self_attn.o_proj: 4 +model.layers.20.self_attn.q_proj: 4 +model.layers.29.self_attn.k_proj: 4 +model.layers.29.self_attn.v_proj: 4 +model.layers.14.mlp.down_proj: 4 +model.layers.35.self_attn.q_proj: 4 +model.layers.30.mlp.gate_proj: 4 +model.layers.24.mlp.gate_proj: 4 +model.layers.14.mlp.up_proj: 4 +model.layers.17.self_attn.v_proj: 4 +model.layers.7.self_attn.q_proj: 4 +model.layers.28.self_attn.k_proj: 4 +model.layers.21.self_attn.q_proj: 4 +model.layers.29.self_attn.o_proj: 4 +model.layers.7.mlp.down_proj: 4 +model.layers.16.self_attn.k_proj: 4 +model.layers.17.self_attn.o_proj: 4 +model.layers.34.mlp.up_proj: 4 +model.layers.21.mlp.gate_proj: 4 +model.layers.16.self_attn.v_proj: 4 +model.layers.24.mlp.up_proj: 4 +model.layers.34.self_attn.q_proj: 4 +model.layers.8.mlp.up_proj: 4 +model.layers.11.mlp.down_proj: 4 +model.layers.7.mlp.up_proj: 4 +model.layers.35.mlp.gate_proj: 4 +model.layers.28.self_attn.v_proj: 4 +model.layers.4.mlp.gate_proj: 4 +model.layers.16.mlp.up_proj: 4 +model.layers.15.self_attn.v_proj: 4 +model.layers.19.mlp.up_proj: 4 +model.layers.8.mlp.down_proj: 4 +model.layers.12.mlp.gate_proj: 4 +model.layers.15.self_attn.k_proj: 4 +model.layers.14.self_attn.o_proj: 4 +model.layers.22.self_attn.q_proj: 4 +model.layers.22.mlp.down_proj: 4 +model.layers.4.self_attn.q_proj: 
4 +model.layers.14.self_attn.v_proj: 4 +model.layers.26.mlp.up_proj: 4 +model.layers.29.mlp.up_proj: 4 +model.layers.5.mlp.up_proj: 4 +model.layers.1.mlp.gate_proj: 4 +model.layers.27.mlp.down_proj: 4 +model.layers.23.self_attn.q_proj: 4 +model.layers.5.self_attn.q_proj: 4 +model.layers.33.mlp.down_proj: 4 +model.layers.17.mlp.gate_proj: 4 +model.layers.15.self_attn.o_proj: 4 +model.layers.14.self_attn.k_proj: 4 +model.layers.12.self_attn.k_proj: 4 +model.layers.13.self_attn.o_proj: 4 +model.layers.29.mlp.gate_proj: 4 +model.layers.25.self_attn.q_proj: 4 +model.layers.30.mlp.up_proj: 4 +model.layers.3.mlp.gate_proj: 4 +model.layers.19.mlp.down_proj: 4 +model.layers.3.self_attn.q_proj: 4 +model.layers.20.mlp.up_proj: 4 +model.layers.30.self_attn.q_proj: 4 +model.layers.25.mlp.down_proj: 4 +model.layers.12.self_attn.v_proj: 4 +model.layers.3.mlp.up_proj: 4 +model.layers.31.mlp.down_proj: 4 +model.layers.15.mlp.gate_proj: 4 +model.layers.24.self_attn.q_proj: 4 +model.layers.2.self_attn.q_proj: 4 +model.layers.6.mlp.gate_proj: 4 +model.layers.12.self_attn.o_proj: 4 +model.layers.13.self_attn.k_proj: 4 +model.layers.13.self_attn.v_proj: 4 +model.layers.34.mlp.down_proj: 4 +model.layers.10.mlp.gate_proj: 4 +model.layers.10.mlp.up_proj: 4 +model.layers.20.mlp.down_proj: 4 +model.layers.31.self_attn.q_proj: 4 +model.layers.22.mlp.up_proj: 4 +model.layers.32.self_attn.q_proj: 4 +model.layers.8.self_attn.v_proj: 4 +model.layers.5.mlp.down_proj: 4 +model.layers.10.self_attn.v_proj: 4 +model.layers.1.mlp.up_proj: 4 +model.layers.11.self_attn.o_proj: 4 +model.layers.10.self_attn.k_proj: 4 +model.layers.19.self_attn.q_proj: 4 +model.layers.23.mlp.gate_proj: 4 +model.layers.8.self_attn.k_proj: 4 +model.layers.32.mlp.up_proj: 4 +model.layers.1.self_attn.q_proj: 4 +model.layers.9.self_attn.o_proj: 4 +model.layers.9.mlp.gate_proj: 4 +model.layers.13.mlp.down_proj: 4 +model.layers.27.self_attn.q_proj: 4 +model.layers.0.mlp.down_proj: 4 +model.layers.11.self_attn.v_proj: 4 +model.layers.33.self_attn.q_proj: 4 +model.layers.9.self_attn.v_proj: 4 +model.layers.12.mlp.up_proj: 4 +model.layers.8.self_attn.o_proj: 4 +model.layers.0.self_attn.q_proj: 4 +model.layers.9.self_attn.k_proj: 4 +model.layers.26.self_attn.q_proj: 4 +model.layers.16.mlp.down_proj: 4 +model.layers.32.mlp.gate_proj: 4 +model.layers.18.self_attn.q_proj: 4 +model.layers.11.self_attn.k_proj: 4 +model.layers.10.self_attn.o_proj: 4 +model.layers.26.mlp.gate_proj: 4 diff --git a/Qwen3-8B/ll_4bit/tokenizer_config.json b/Qwen3-8B/ll_4bit/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d75d3bb5300d205e48769cc1999073ab5971214 --- /dev/null +++ b/Qwen3-8B/ll_4bit/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "add_prefix_space": false, + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "extra_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "is_local": false, + "model_max_length": 131072, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null +} diff --git a/Qwen3.5-27B/ll_4bit/README.md b/Qwen3.5-27B/ll_4bit/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..69b948e165edf9328192beee7bbd9b0f12b987f6 --- /dev/null +++ b/Qwen3.5-27B/ll_4bit/README.md @@ -0,0 +1,7 @@ +# Quantized Model Checkpoint + +**Base model:** unknown + +**Average bitwidth:** unknown + +See `quantization_config.txt` for full configuration details. diff --git a/Qwen3.5-27B/ll_4bit/chat_template.jinja b/Qwen3.5-27B/ll_4bit/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..a585dec894e63da457d9440ec6aa7caa16d20860 --- /dev/null +++ b/Qwen3.5-27B/ll_4bit/chat_template.jinja @@ -0,0 +1,154 @@ +{%- set image_count = namespace(value=0) %} +{%- set video_count = namespace(value=0) %} +{%- macro render_content(content, do_vision_count, is_system_content=false) %} + {%- if content is string %} + {{- content }} + {%- elif content is iterable and content is not mapping %} + {%- for item in content %} + {%- if 'image' in item or 'image_url' in item or item.type == 'image' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain images.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set image_count.value = image_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Picture ' ~ image_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|image_pad|><|vision_end|>' }} + {%- elif 'video' in item or item.type == 'video' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain videos.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set video_count.value = video_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Video ' ~ video_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|video_pad|><|vision_end|>' }} + {%- elif 'text' in item %} + {{- item.text }} + {%- else %} + {{- raise_exception('Unexpected item type in content.') }} + {%- endif %} + {%- endfor %} + {%- elif content is none or content is undefined %} + {{- '' }} + {%- else %} + {{- raise_exception('Unexpected content type.') }} + {%- endif %} +{%- endmacro %} +{%- if not messages %} + {{- raise_exception('No messages provided.') }} +{%- endif %} +{%- if tools and tools is iterable and tools is not mapping %} + {{- '<|im_start|>system\n' }} + {{- "# Tools\n\nYou have access to the following functions:\n\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {%- if content %} + {{- '\n\n' + content }} + {%- endif %} + {%- endif %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for
message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" %} + {%- set content = render_content(message.content, false)|trim %} + {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if ns.multi_step_tool %} + {{- raise_exception('No user query found in messages.') }} +{%- endif %} +{%- for message in messages %} + {%- set content = render_content(message.content, true)|trim %} + {%- if message.role == "system" %} + {%- if not loop.first %} + {{- raise_exception('System message must be at the beginning.') }} + {%- endif %} + {%- elif message.role == "user" %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in content %} + {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- set content = content.split('</think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- set reasoning_content = reasoning_content|trim %} + {%- if loop.index0 > ns.last_query_index %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if loop.first %} + {%- if content|trim %} + {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }} + {%- else %} + {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }} + {%- endif %} + {%- else %} + {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }} + {%- endif %} + {%- if tool_call.arguments is defined %} + {%- for args_name, args_value in tool_call.arguments|items %} + {{- '<parameter=' + args_name + '>\n' }} + {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} + {{- args_value }} + {{- '\n</parameter>\n' }} + {%- endfor %} + {%- endif %} + {{- '</function>\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- content }} + {{- '\n</tool_response>' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- raise_exception('Unexpected message role.') }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think>\n\n</think>\n\n' }} + {%- else %} + {{- '<think>\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/Qwen3.5-27B/ll_4bit/config.json b/Qwen3.5-27B/ll_4bit/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6c0fc6f0ed2ea5d07e1c36677cfab84cd8f27351 --- /dev/null +++ b/Qwen3.5-27B/ll_4bit/config.json @@ -0,0 +1,138 @@ +{ + "vision_start_token_id": 248053, + "video_token_id": 248057, + "image_token_id": 248056, + "architectures": [ + "Qwen3_5ForConditionalGeneration" + ], + "model_type": "qwen3_5", + "vision_end_token_id": 248054, + "tie_word_embeddings": false,
+ "vision_config": { + "deepstack_visual_indexes": [], + "depth": 27, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "in_channels": 3, + "initializer_range": 0.02, + "intermediate_size": 4304, + "model_type": "qwen3_5", + "num_heads": 16, + "num_position_embeddings": 2304, + "out_hidden_size": 5120, + "patch_size": 16, + "spatial_merge_size": 2, + "temporal_patch_size": 2 + }, + "transformers_version": "4.57.0.dev0", + "text_config": { + "attention_bias": false, + "attention_dropout": 0.0, + "attn_output_gate": true, + "bos_token_id": null, + "dtype": "float16", + "eos_token_id": 248044, + "full_attention_interval": 4, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 17408, + "layer_types": [ + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention" + ], + "linear_conv_kernel_dim": 4, + "linear_key_head_dim": 128, + "linear_num_key_heads": 16, + "linear_num_value_heads": 48, + "linear_value_head_dim": 128, + "mamba_ssm_dtype": "float32", + "max_position_embeddings": 262144, + "mlp_only_layers": [], + "mtp_num_hidden_layers": 1, + "mtp_use_dedicated_embeddings": false, + "num_attention_heads": 24, + "num_hidden_layers": 64, + "num_key_value_heads": 4, + "pad_token_id": null, + "partial_rotary_factor": 0.25, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "mrope_interleaved": true, + "mrope_section": [ + 11, + 11, + 10 + ], + "partial_rotary_factor": 0.25, + "rope_theta": 10000000, + "rope_type": "default" + }, + "use_cache": false, + "vocab_size": 248320, + "model_type": "qwen3_5_text" + } +} \ No newline at end of file diff --git a/Qwen3.5-27B/ll_4bit/generation_config.json b/Qwen3.5-27B/ll_4bit/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1068c09fbcc050fcccf2066dda235127d9bad05e --- /dev/null +++ b/Qwen3.5-27B/ll_4bit/generation_config.json @@ -0,0 +1,13 @@ +{ + "bos_token_id": 248044, + "do_sample": true, + "eos_token_id": [ + 248046, + 248044 + ], + "pad_token_id": 248044, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.3.0" +} diff --git a/Qwen3.5-27B/ll_4bit/humming_online_quant_config.json 
b/Qwen3.5-27B/ll_4bit/humming_online_quant_config.json new file mode 100644 index 0000000000000000000000000000000000000000..71f9c5cc25ef014276e683aa28267a8997891b97 --- /dev/null +++ b/Qwen3.5-27B/ll_4bit/humming_online_quant_config.json @@ -0,0 +1,5 @@ +{ + "quant_method": "gptq", + "bits": 4, + "group_size": 128 +} \ No newline at end of file diff --git a/Qwen3.5-27B/ll_4bit/model-00001-of-00002.safetensors b/Qwen3.5-27B/ll_4bit/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aa3f7b261cfa8a98c4312a2547101fdd5d33c491 --- /dev/null +++ b/Qwen3.5-27B/ll_4bit/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:710d2e34d9313fe8c77e9dbb1425d9d61560271c23c509c4fa4c1cc168b859d7 +size 49825162192 diff --git a/Qwen3.5-27B/ll_4bit/model.safetensors.index.json b/Qwen3.5-27B/ll_4bit/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..70fe4d0836094f2d2b8dd7c40966b8baa44d5114 --- /dev/null +++ b/Qwen3.5-27B/ll_4bit/model.safetensors.index.json @@ -0,0 +1,859 @@ +{ + "metadata": { + "total_parameters": 26895998464, + "total_size": 53791996928 + }, + "weight_map": { + "lm_head.weight": "model-00001-of-00002.safetensors", + "model.language_model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.norm.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.12.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.input_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.out_proj.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.22.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.A_log": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.32.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.mlp.gate_proj.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.35.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.38.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.mlp.gate_proj.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.42.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.dt_bias": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.input_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.48.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.out_proj.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.52.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.mlp.down_proj.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.55.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.57.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.58.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.mlp.down_proj.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.60.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.A_log": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.conv1d.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.dt_bias": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.in_proj_a.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.in_proj_b.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.in_proj_qkv.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.in_proj_z.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.A_log": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.conv1d.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.dt_bias": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.in_proj_a.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.in_proj_b.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.in_proj_qkv.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.in_proj_z.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.A_log": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.conv1d.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.dt_bias": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.in_proj_a.weight": "model-00002-of-00002.safetensors", + 
"model.language_model.layers.62.linear_attn.in_proj_b.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.in_proj_qkv.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.in_proj_z.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.8.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/Qwen3.5-27B/ll_4bit/preprocessor_config.json b/Qwen3.5-27B/ll_4bit/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2ea84a437d448ff71b08df68fdd949d5cc4ebb64 --- /dev/null +++ b/Qwen3.5-27B/ll_4bit/preprocessor_config.json @@ -0,0 +1,21 @@ +{ + "size": { + "longest_edge": 16777216, + "shortest_edge": 65536 + }, + "patch_size": 16, + "temporal_patch_size": 2, + "merge_size": 2, + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "processor_class": "Qwen3VLProcessor", + "image_processor_type": "Qwen2VLImageProcessorFast" +} \ No newline at end of file diff --git a/Qwen3.5-27B/ll_4bit/quantization_config.txt b/Qwen3.5-27B/ll_4bit/quantization_config.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dc784298206b6747d4776f10db7482e6556328c --- /dev/null +++ b/Qwen3.5-27B/ll_4bit/quantization_config.txt @@ -0,0 +1,432 @@ +model.layers.0.mlp.gate_proj: 4 +model.layers.0.mlp.up_proj: 4 +model.layers.1.mlp.gate_proj: 4 +model.layers.1.mlp.up_proj: 4 +model.layers.2.mlp.gate_proj: 4 +model.layers.2.mlp.up_proj: 4 +model.layers.3.mlp.gate_proj: 4 +model.layers.3.mlp.up_proj: 4 +model.layers.4.mlp.gate_proj: 4 +model.layers.4.mlp.up_proj: 4 +model.layers.5.mlp.gate_proj: 4 +model.layers.5.mlp.up_proj: 4 +model.layers.6.mlp.gate_proj: 4 
+model.layers.6.mlp.up_proj: 4 +model.layers.7.mlp.gate_proj: 4 +model.layers.7.mlp.up_proj: 4 +model.layers.8.mlp.gate_proj: 4 +model.layers.8.mlp.up_proj: 4 +model.layers.9.mlp.gate_proj: 4 +model.layers.9.mlp.up_proj: 4 +model.layers.10.mlp.gate_proj: 4 +model.layers.10.mlp.up_proj: 4 +model.layers.11.mlp.gate_proj: 4 +model.layers.11.mlp.up_proj: 4 +model.layers.12.mlp.gate_proj: 4 +model.layers.12.mlp.up_proj: 4 +model.layers.13.mlp.gate_proj: 4 +model.layers.13.mlp.up_proj: 4 +model.layers.14.mlp.gate_proj: 4 +model.layers.14.mlp.up_proj: 4 +model.layers.15.mlp.gate_proj: 4 +model.layers.15.mlp.up_proj: 4 +model.layers.16.mlp.gate_proj: 4 +model.layers.16.mlp.up_proj: 4 +model.layers.17.mlp.gate_proj: 4 +model.layers.17.mlp.up_proj: 4 +model.layers.18.mlp.gate_proj: 4 +model.layers.18.mlp.up_proj: 4 +model.layers.19.mlp.gate_proj: 4 +model.layers.19.mlp.up_proj: 4 +model.layers.20.mlp.gate_proj: 4 +model.layers.20.mlp.up_proj: 4 +model.layers.21.mlp.gate_proj: 4 +model.layers.21.mlp.up_proj: 4 +model.layers.22.mlp.gate_proj: 4 +model.layers.22.mlp.up_proj: 4 +model.layers.23.mlp.gate_proj: 4 +model.layers.23.mlp.up_proj: 4 +model.layers.24.mlp.gate_proj: 4 +model.layers.24.mlp.up_proj: 4 +model.layers.25.mlp.gate_proj: 4 +model.layers.25.mlp.up_proj: 4 +model.layers.26.mlp.gate_proj: 4 +model.layers.26.mlp.up_proj: 4 +model.layers.27.mlp.gate_proj: 4 +model.layers.27.mlp.up_proj: 4 +model.layers.28.mlp.gate_proj: 4 +model.layers.28.mlp.up_proj: 4 +model.layers.29.mlp.gate_proj: 4 +model.layers.29.mlp.up_proj: 4 +model.layers.30.mlp.gate_proj: 4 +model.layers.30.mlp.up_proj: 4 +model.layers.31.mlp.gate_proj: 4 +model.layers.31.mlp.up_proj: 4 +model.layers.32.mlp.gate_proj: 4 +model.layers.32.mlp.up_proj: 4 +model.layers.33.mlp.gate_proj: 4 +model.layers.33.mlp.up_proj: 4 +model.layers.34.mlp.gate_proj: 4 +model.layers.34.mlp.up_proj: 4 +model.layers.35.mlp.gate_proj: 4 +model.layers.35.mlp.up_proj: 4 +model.layers.36.mlp.gate_proj: 4 +model.layers.36.mlp.up_proj: 4 +model.layers.37.mlp.gate_proj: 4 +model.layers.37.mlp.up_proj: 4 +model.layers.38.mlp.gate_proj: 4 +model.layers.38.mlp.up_proj: 4 +model.layers.39.mlp.gate_proj: 4 +model.layers.39.mlp.up_proj: 4 +model.layers.40.mlp.gate_proj: 4 +model.layers.40.mlp.up_proj: 4 +model.layers.41.mlp.gate_proj: 4 +model.layers.41.mlp.up_proj: 4 +model.layers.42.mlp.gate_proj: 4 +model.layers.42.mlp.up_proj: 4 +model.layers.43.mlp.gate_proj: 4 +model.layers.43.mlp.up_proj: 4 +model.layers.44.mlp.gate_proj: 4 +model.layers.44.mlp.up_proj: 4 +model.layers.45.mlp.gate_proj: 4 +model.layers.45.mlp.up_proj: 4 +model.layers.46.mlp.gate_proj: 4 +model.layers.46.mlp.up_proj: 4 +model.layers.47.mlp.gate_proj: 4 +model.layers.47.mlp.up_proj: 4 +model.layers.48.mlp.gate_proj: 4 +model.layers.48.mlp.up_proj: 4 +model.layers.49.mlp.gate_proj: 4 +model.layers.49.mlp.up_proj: 4 +model.layers.50.mlp.gate_proj: 4 +model.layers.50.mlp.up_proj: 4 +model.layers.51.mlp.gate_proj: 4 +model.layers.51.mlp.up_proj: 4 +model.layers.52.mlp.gate_proj: 4 +model.layers.52.mlp.up_proj: 4 +model.layers.53.mlp.gate_proj: 4 +model.layers.53.mlp.up_proj: 4 +model.layers.54.mlp.gate_proj: 4 +model.layers.54.mlp.up_proj: 4 +model.layers.55.mlp.gate_proj: 4 +model.layers.55.mlp.up_proj: 4 +model.layers.56.mlp.gate_proj: 4 +model.layers.56.mlp.up_proj: 4 +model.layers.57.mlp.gate_proj: 4 +model.layers.57.mlp.up_proj: 4 +model.layers.58.mlp.gate_proj: 4 +model.layers.58.mlp.up_proj: 4 +model.layers.59.mlp.gate_proj: 4 +model.layers.59.mlp.up_proj: 4 +model.layers.60.mlp.gate_proj: 
4 +model.layers.60.mlp.up_proj: 4 +model.layers.61.mlp.gate_proj: 4 +model.layers.61.mlp.up_proj: 4 +model.layers.62.mlp.gate_proj: 4 +model.layers.62.mlp.up_proj: 4 +model.layers.63.mlp.gate_proj: 4 +model.layers.63.mlp.up_proj: 4 +model.layers.61.linear_attn.in_proj_b: 4 +model.layers.36.linear_attn.in_proj_qkv: 4 +model.layers.60.linear_attn.in_proj_a: 4 +model.layers.60.linear_attn.out_proj: 4 +model.layers.54.linear_attn.in_proj_z: 4 +model.layers.12.linear_attn.in_proj_qkv: 4 +model.layers.56.linear_attn.in_proj_b: 4 +model.layers.9.linear_attn.in_proj_z: 4 +model.layers.57.linear_attn.in_proj_a: 4 +model.layers.52.linear_attn.in_proj_z: 4 +model.layers.50.linear_attn.in_proj_b: 4 +model.layers.52.mlp.down_proj: 4 +model.layers.27.mlp.down_proj: 4 +model.layers.3.mlp.down_proj: 4 +model.layers.4.linear_attn.in_proj_qkv: 4 +model.layers.38.linear_attn.in_proj_b: 4 +model.layers.46.mlp.down_proj: 4 +model.layers.26.linear_attn.in_proj_qkv: 4 +model.layers.33.mlp.down_proj: 4 +model.layers.33.linear_attn.out_proj: 4 +model.layers.22.linear_attn.in_proj_qkv: 4 +model.layers.1.linear_attn.in_proj_b: 4 +model.layers.58.linear_attn.out_proj: 4 +model.layers.0.linear_attn.in_proj_qkv: 4 +model.layers.0.linear_attn.in_proj_a: 4 +model.layers.42.linear_attn.out_proj: 4 +model.layers.34.linear_attn.in_proj_b: 4 +model.layers.29.linear_attn.out_proj: 4 +model.layers.36.linear_attn.in_proj_z: 4 +model.layers.22.linear_attn.out_proj: 4 +model.layers.6.mlp.down_proj: 4 +model.layers.33.linear_attn.in_proj_a: 4 +model.layers.43.mlp.down_proj: 4 +model.layers.49.linear_attn.out_proj: 4 +model.layers.32.linear_attn.in_proj_b: 4 +model.layers.16.linear_attn.in_proj_qkv: 4 +model.layers.30.linear_attn.in_proj_z: 4 +model.layers.36.mlp.down_proj: 4 +model.layers.5.linear_attn.in_proj_z: 4 +model.layers.57.mlp.down_proj: 4 +model.layers.32.linear_attn.in_proj_qkv: 4 +model.layers.53.linear_attn.out_proj: 4 +model.layers.6.linear_attn.in_proj_a: 4 +model.layers.58.linear_attn.in_proj_z: 4 +model.layers.38.linear_attn.out_proj: 4 +model.layers.22.mlp.down_proj: 4 +model.layers.20.linear_attn.in_proj_z: 4 +model.layers.58.mlp.down_proj: 4 +model.layers.22.linear_attn.in_proj_b: 4 +model.layers.62.linear_attn.in_proj_qkv: 4 +model.layers.9.mlp.down_proj: 4 +model.layers.48.linear_attn.in_proj_z: 4 +model.layers.17.linear_attn.in_proj_z: 4 +model.layers.46.linear_attn.in_proj_qkv: 4 +model.layers.24.linear_attn.out_proj: 4 +model.layers.14.linear_attn.in_proj_a: 4 +model.layers.39.mlp.down_proj: 4 +model.layers.13.linear_attn.in_proj_b: 4 +model.layers.12.linear_attn.in_proj_a: 4 +model.layers.56.linear_attn.in_proj_qkv: 4 +model.layers.44.linear_attn.out_proj: 4 +model.layers.26.linear_attn.in_proj_z: 4 +model.layers.8.linear_attn.in_proj_qkv: 4 +model.layers.24.linear_attn.in_proj_b: 4 +model.layers.25.linear_attn.in_proj_a: 4 +model.layers.11.mlp.down_proj: 4 +model.layers.49.mlp.down_proj: 4 +model.layers.41.linear_attn.in_proj_a: 4 +model.layers.40.linear_attn.in_proj_b: 4 +model.layers.42.linear_attn.in_proj_z: 4 +model.layers.29.linear_attn.in_proj_a: 4 +model.layers.52.linear_attn.in_proj_qkv: 4 +model.layers.28.linear_attn.in_proj_b: 4 +model.layers.28.mlp.down_proj: 4 +model.layers.61.mlp.down_proj: 4 +model.layers.42.linear_attn.in_proj_qkv: 4 +model.layers.1.linear_attn.out_proj: 4 +model.layers.14.mlp.down_proj: 4 +model.layers.18.linear_attn.in_proj_a: 4 +model.layers.46.linear_attn.in_proj_b: 4 +model.layers.44.linear_attn.in_proj_z: 4 +model.layers.17.linear_attn.out_proj: 4 
+model.layers.57.linear_attn.in_proj_b: 4 +model.layers.56.linear_attn.in_proj_a: 4 +model.layers.29.linear_attn.in_proj_qkv: 4 +model.layers.8.linear_attn.in_proj_z: 4 +model.layers.62.linear_attn.in_proj_z: 4 +model.layers.60.linear_attn.in_proj_b: 4 +model.layers.61.linear_attn.in_proj_a: 4 +model.layers.16.mlp.down_proj: 4 +model.layers.61.linear_attn.out_proj: 4 +model.layers.6.linear_attn.out_proj: 4 +model.layers.38.linear_attn.in_proj_a: 4 +model.layers.45.linear_attn.in_proj_qkv: 4 +model.layers.63.mlp.down_proj: 4 +model.layers.10.linear_attn.out_proj: 4 +model.layers.53.linear_attn.in_proj_z: 4 +model.layers.61.linear_attn.in_proj_qkv: 4 +model.layers.50.linear_attn.in_proj_a: 4 +model.layers.52.linear_attn.out_proj: 4 +model.layers.34.linear_attn.in_proj_a: 4 +model.layers.37.linear_attn.in_proj_z: 4 +model.layers.0.linear_attn.in_proj_b: 4 +model.layers.2.linear_attn.in_proj_z: 4 +model.layers.41.linear_attn.in_proj_qkv: 4 +model.layers.1.linear_attn.in_proj_a: 4 +model.layers.48.linear_attn.out_proj: 4 +model.layers.6.linear_attn.in_proj_b: 4 +model.layers.4.linear_attn.in_proj_z: 4 +model.layers.28.linear_attn.out_proj: 4 +model.layers.32.linear_attn.out_proj: 4 +model.layers.13.mlp.down_proj: 4 +model.layers.32.linear_attn.in_proj_a: 4 +model.layers.33.linear_attn.in_proj_b: 4 +model.layers.16.linear_attn.in_proj_z: 4 +model.layers.14.linear_attn.in_proj_b: 4 +model.layers.49.linear_attn.in_proj_z: 4 +model.layers.25.linear_attn.in_proj_qkv: 4 +model.layers.34.linear_attn.out_proj: 4 +model.layers.21.linear_attn.in_proj_z: 4 +model.layers.22.linear_attn.in_proj_a: 4 +model.layers.45.linear_attn.out_proj: 4 +model.layers.34.mlp.down_proj: 4 +model.layers.25.linear_attn.in_proj_b: 4 +model.layers.49.linear_attn.in_proj_qkv: 4 +model.layers.4.mlp.down_proj: 4 +model.layers.41.mlp.down_proj: 4 +model.layers.24.linear_attn.in_proj_a: 4 +model.layers.25.linear_attn.out_proj: 4 +model.layers.10.linear_attn.in_proj_z: 4 +model.layers.12.linear_attn.in_proj_b: 4 +model.layers.20.mlp.down_proj: 4 +model.layers.54.linear_attn.out_proj: 4 +model.layers.55.mlp.down_proj: 4 +model.layers.13.linear_attn.in_proj_a: 4 +model.layers.28.linear_attn.in_proj_a: 4 +model.layers.16.linear_attn.out_proj: 4 +model.layers.29.linear_attn.in_proj_b: 4 +model.layers.0.linear_attn.out_proj: 4 +model.layers.40.linear_attn.in_proj_a: 4 +model.layers.19.mlp.down_proj: 4 +model.layers.41.linear_attn.in_proj_b: 4 +model.layers.25.mlp.down_proj: 4 +model.layers.46.linear_attn.in_proj_a: 4 +model.layers.18.linear_attn.in_proj_b: 4 +model.layers.50.mlp.down_proj: 4 +model.layers.45.linear_attn.in_proj_z: 4 +model.layers.21.linear_attn.in_proj_qkv: 4 +model.layers.31.mlp.down_proj: 4 +model.layers.1.mlp.down_proj: 4 +model.layers.44.mlp.down_proj: 4 +model.layers.57.linear_attn.in_proj_qkv: 4 +model.layers.53.linear_attn.in_proj_a: 4 +model.layers.52.linear_attn.in_proj_b: 4 +model.layers.50.linear_attn.in_proj_z: 4 +model.layers.20.linear_attn.out_proj: 4 +model.layers.18.mlp.down_proj: 4 +model.layers.9.linear_attn.in_proj_qkv: 4 +model.layers.38.linear_attn.in_proj_z: 4 +model.layers.62.linear_attn.in_proj_a: 4 +model.layers.24.mlp.down_proj: 4 +model.layers.51.mlp.down_proj: 4 +model.layers.61.linear_attn.in_proj_z: 4 +model.layers.9.linear_attn.in_proj_b: 4 +model.layers.30.mlp.down_proj: 4 +model.layers.0.mlp.down_proj: 4 +model.layers.8.linear_attn.in_proj_a: 4 +model.layers.40.linear_attn.out_proj: 4 +model.layers.54.linear_attn.in_proj_b: 4 +model.layers.45.mlp.down_proj: 4 
+model.layers.56.linear_attn.in_proj_z: 4 +model.layers.32.linear_attn.in_proj_z: 4 +model.layers.30.linear_attn.in_proj_b: 4 +model.layers.4.linear_attn.in_proj_a: 4 +model.layers.18.linear_attn.out_proj: 4 +model.layers.58.linear_attn.in_proj_b: 4 +model.layers.5.linear_attn.in_proj_b: 4 +model.layers.2.linear_attn.in_proj_a: 4 +model.layers.35.mlp.down_proj: 4 +model.layers.13.linear_attn.out_proj: 4 +model.layers.5.mlp.down_proj: 4 +model.layers.1.linear_attn.in_proj_z: 4 +model.layers.40.mlp.down_proj: 4 +model.layers.5.linear_attn.out_proj: 4 +model.layers.34.linear_attn.in_proj_z: 4 +model.layers.21.mlp.down_proj: 4 +model.layers.36.linear_attn.in_proj_b: 4 +model.layers.37.linear_attn.in_proj_a: 4 +model.layers.54.mlp.down_proj: 4 +model.layers.53.linear_attn.in_proj_qkv: 4 +model.layers.62.linear_attn.out_proj: 4 +model.layers.10.linear_attn.in_proj_a: 4 +model.layers.13.linear_attn.in_proj_z: 4 +model.layers.26.linear_attn.in_proj_b: 4 +model.layers.24.linear_attn.in_proj_z: 4 +model.layers.5.linear_attn.in_proj_qkv: 4 +model.layers.37.linear_attn.in_proj_qkv: 4 +model.layers.21.linear_attn.in_proj_a: 4 +model.layers.20.linear_attn.in_proj_b: 4 +model.layers.22.linear_attn.in_proj_z: 4 +model.layers.49.linear_attn.in_proj_a: 4 +model.layers.8.linear_attn.out_proj: 4 +model.layers.16.linear_attn.in_proj_a: 4 +model.layers.12.mlp.down_proj: 4 +model.layers.48.linear_attn.in_proj_b: 4 +model.layers.17.linear_attn.in_proj_b: 4 +model.layers.13.linear_attn.in_proj_qkv: 4 +model.layers.46.linear_attn.out_proj: 4 +model.layers.17.linear_attn.in_proj_qkv: 4 +model.layers.46.linear_attn.in_proj_z: 4 +model.layers.44.linear_attn.in_proj_b: 4 +model.layers.37.linear_attn.out_proj: 4 +model.layers.45.linear_attn.in_proj_a: 4 +model.layers.33.linear_attn.in_proj_qkv: 4 +model.layers.57.linear_attn.out_proj: 4 +model.layers.40.linear_attn.in_proj_z: 4 +model.layers.1.linear_attn.in_proj_qkv: 4 +model.layers.17.mlp.down_proj: 4 +model.layers.42.linear_attn.in_proj_b: 4 +model.layers.62.mlp.down_proj: 4 +model.layers.28.linear_attn.in_proj_z: 4 +model.layers.26.linear_attn.out_proj: 4 +model.layers.48.linear_attn.in_proj_qkv: 4 +model.layers.48.mlp.down_proj: 4 +model.layers.41.linear_attn.out_proj: 4 +model.layers.10.linear_attn.in_proj_qkv: 4 +model.layers.34.linear_attn.in_proj_qkv: 4 +model.layers.52.linear_attn.in_proj_a: 4 +model.layers.30.linear_attn.out_proj: 4 +model.layers.53.linear_attn.in_proj_b: 4 +model.layers.29.mlp.down_proj: 4 +model.layers.24.linear_attn.in_proj_qkv: 4 +model.layers.54.linear_attn.in_proj_a: 4 +model.layers.60.mlp.down_proj: 4 +model.layers.8.linear_attn.in_proj_b: 4 +model.layers.6.linear_attn.in_proj_qkv: 4 +model.layers.57.linear_attn.in_proj_z: 4 +model.layers.9.linear_attn.in_proj_a: 4 +model.layers.50.linear_attn.out_proj: 4 +model.layers.15.mlp.down_proj: 4 +model.layers.58.linear_attn.in_proj_qkv: 4 +model.layers.62.linear_attn.in_proj_b: 4 +model.layers.21.linear_attn.out_proj: 4 +model.layers.60.linear_attn.in_proj_z: 4 +model.layers.59.mlp.down_proj: 4 +model.layers.5.linear_attn.in_proj_a: 4 +model.layers.4.linear_attn.out_proj: 4 +model.layers.58.linear_attn.in_proj_a: 4 +model.layers.6.linear_attn.in_proj_z: 4 +model.layers.4.linear_attn.in_proj_b: 4 +model.layers.8.mlp.down_proj: 4 +model.layers.33.linear_attn.in_proj_z: 4 +model.layers.2.linear_attn.in_proj_qkv: 4 +model.layers.30.linear_attn.in_proj_a: 4 +model.layers.38.mlp.down_proj: 4 +model.layers.20.linear_attn.in_proj_qkv: 4 +model.layers.12.linear_attn.out_proj: 4 
+model.layers.37.linear_attn.in_proj_b: 4 +model.layers.36.linear_attn.in_proj_a: 4 +model.layers.30.linear_attn.in_proj_qkv: 4 +model.layers.14.linear_attn.in_proj_qkv: 4 +model.layers.0.linear_attn.in_proj_z: 4 +model.layers.2.linear_attn.in_proj_b: 4 +model.layers.10.mlp.down_proj: 4 +model.layers.44.linear_attn.in_proj_qkv: 4 +model.layers.9.linear_attn.out_proj: 4 +model.layers.26.linear_attn.in_proj_a: 4 +model.layers.25.linear_attn.in_proj_z: 4 +model.layers.38.linear_attn.in_proj_qkv: 4 +model.layers.10.linear_attn.in_proj_b: 4 +model.layers.60.linear_attn.in_proj_qkv: 4 +model.layers.12.linear_attn.in_proj_z: 4 +model.layers.7.mlp.down_proj: 4 +model.layers.28.linear_attn.in_proj_qkv: 4 +model.layers.17.linear_attn.in_proj_a: 4 +model.layers.48.linear_attn.in_proj_a: 4 +model.layers.42.mlp.down_proj: 4 +model.layers.14.linear_attn.out_proj: 4 +model.layers.16.linear_attn.in_proj_b: 4 +model.layers.14.linear_attn.in_proj_z: 4 +model.layers.37.mlp.down_proj: 4 +model.layers.49.linear_attn.in_proj_b: 4 +model.layers.20.linear_attn.in_proj_a: 4 +model.layers.54.linear_attn.in_proj_qkv: 4 +model.layers.56.mlp.down_proj: 4 +model.layers.21.linear_attn.in_proj_b: 4 +model.layers.23.mlp.down_proj: 4 +model.layers.2.linear_attn.out_proj: 4 +model.layers.18.linear_attn.in_proj_z: 4 +model.layers.45.linear_attn.in_proj_b: 4 +model.layers.50.linear_attn.in_proj_qkv: 4 +model.layers.44.linear_attn.in_proj_a: 4 +model.layers.56.linear_attn.out_proj: 4 +model.layers.36.linear_attn.out_proj: 4 +model.layers.53.mlp.down_proj: 4 +model.layers.29.linear_attn.in_proj_z: 4 +model.layers.26.mlp.down_proj: 4 +model.layers.2.mlp.down_proj: 4 +model.layers.41.linear_attn.in_proj_z: 4 +model.layers.18.linear_attn.in_proj_qkv: 4 +model.layers.47.mlp.down_proj: 4 +model.layers.42.linear_attn.in_proj_a: 4 +model.layers.40.linear_attn.in_proj_qkv: 4 +model.layers.32.mlp.down_proj: 4 diff --git a/Qwen3.5-27B/ll_4bit/tokenizer_config.json b/Qwen3.5-27B/ll_4bit/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6be6ce1780cf43bd47577fbb76e74aee6db89f21 --- /dev/null +++ b/Qwen3.5-27B/ll_4bit/tokenizer_config.json @@ -0,0 +1,31 @@ +{ + "add_prefix_space": false, + "audio_bos_token": "<|audio_start|>", + "audio_eos_token": "<|audio_end|>", + "audio_token": "<|audio_pad|>", + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "image_token": "<|image_pad|>", + "is_local": false, + "model_max_length": 262144, + "model_specific_special_tokens": { + "audio_bos_token": "<|audio_start|>", + "audio_eos_token": "<|audio_end|>", + "audio_token": "<|audio_pad|>", + "image_token": "<|image_pad|>", + "video_token": "<|video_pad|>", + "vision_bos_token": "<|vision_start|>", + "vision_eos_token": "<|vision_end|>" + }, + "pad_token": "<|endoftext|>", + "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + "split_special_tokens": false, + "tokenizer_class": "TokenizersBackend", + "unk_token": null, + "video_token": "<|video_pad|>", + "vision_bos_token": "<|vision_start|>", + "vision_eos_token": "<|vision_end|>" +} diff --git a/Qwen3.5-27B/ll_4bit/video_preprocessor_config.json b/Qwen3.5-27B/ll_4bit/video_preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3ba673a5ad7d4d13f54155ecd38b2a94a6dac8fe --- /dev/null +++ 
b/Qwen3.5-27B/ll_4bit/video_preprocessor_config.json @@ -0,0 +1,21 @@ +{ + "size": { + "longest_edge": 25165824, + "shortest_edge": 4096 + }, + "patch_size": 16, + "temporal_patch_size": 2, + "merge_size": 2, + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "processor_class": "Qwen3VLProcessor", + "video_processor_type": "Qwen3VLVideoProcessor" +} \ No newline at end of file diff --git a/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/README.md b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e6daf3807e2876ad9583a97e7c2d585088f85267 --- /dev/null +++ b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/README.md @@ -0,0 +1,18 @@ +# Quantized Model Checkpoint + +**Base model:** Qwen/Qwen3.5-27B + +**Average bitwidth:** 5.1871 + +**Sensitivity method:** linear + +**Constraints:** +- max_kl: 0.005 +- min_eap: 0.985 + +**Metrics:** +- final_kl: 0.001887 +- final_eap: 0.985100 +- final_etl: 0.014900 + +See `quantization_config.txt` for full configuration details. diff --git a/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/chat_template.jinja b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..a585dec894e63da457d9440ec6aa7caa16d20860 --- /dev/null +++ b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/chat_template.jinja @@ -0,0 +1,154 @@ +{%- set image_count = namespace(value=0) %} +{%- set video_count = namespace(value=0) %} +{%- macro render_content(content, do_vision_count, is_system_content=false) %} + {%- if content is string %} + {{- content }} + {%- elif content is iterable and content is not mapping %} + {%- for item in content %} + {%- if 'image' in item or 'image_url' in item or item.type == 'image' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain images.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set image_count.value = image_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Picture ' ~ image_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|image_pad|><|vision_end|>' }} + {%- elif 'video' in item or item.type == 'video' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain videos.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set video_count.value = video_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Video ' ~ video_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|video_pad|><|vision_end|>' }} + {%- elif 'text' in item %} + {{- item.text }} + {%- else %} + {{- raise_exception('Unexpected item type in content.') }} + {%- endif %} + {%- endfor %} + {%- elif content is none or content is undefined %} + {{- '' }} + {%- else %} + {{- raise_exception('Unexpected content type.') }} + {%- endif %} +{%- endmacro %} +{%- if not messages %} + {{- raise_exception('No messages provided.') }} +{%- endif %} +{%- if tools and tools is iterable and tools is not mapping %} + {{- '<|im_start|>system\n' }} + {{- "# Tools\n\nYou have access to the following functions:\n\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO 
suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {%- if content %} + {{- '\n\n' + content }} + {%- endif %} + {%- endif %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" %} + {%- set content = render_content(message.content, false)|trim %} + {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if ns.multi_step_tool %} + {{- raise_exception('No user query found in messages.') }} +{%- endif %} +{%- for message in messages %} + {%- set content = render_content(message.content, true)|trim %} + {%- if message.role == "system" %} + {%- if not loop.first %} + {{- raise_exception('System message must be at the beginning.') }} + {%- endif %} + {%- elif message.role == "user" %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in content %} + {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- set content = content.split('</think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- set reasoning_content = reasoning_content|trim %} + {%- if loop.index0 > ns.last_query_index %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if loop.first %} + {%- if content|trim %} + {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }} + {%- else %} + {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }} + {%- endif %} + {%- else %} + {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }} + {%- endif %} + {%- if tool_call.arguments is defined %} + {%- for args_name, args_value in tool_call.arguments|items %} + {{- '<parameter=' + args_name + '>\n' }} + {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} + {{- args_value }} + {{- '\n</parameter>\n' }} + {%- endfor %} + {%- endif %} + {{- '</function>\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- 
'<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- raise_exception('Unexpected message role.') }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/config.json b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6c0fc6f0ed2ea5d07e1c36677cfab84cd8f27351 --- /dev/null +++ b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/config.json @@ -0,0 +1,138 @@ +{ + "vision_start_token_id": 248053, + "video_token_id": 248057, + "image_token_id": 248056, + "architectures": [ + "Qwen3_5ForConditionalGeneration" + ], + "model_type": "qwen3_5", + "vision_end_token_id": 248054, + "tie_word_embeddings": false, + "vision_config": { + "deepstack_visual_indexes": [], + "depth": 27, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "in_channels": 3, + "initializer_range": 0.02, + "intermediate_size": 4304, + "model_type": "qwen3_5", + "num_heads": 16, + "num_position_embeddings": 2304, + "out_hidden_size": 5120, + "patch_size": 16, + "spatial_merge_size": 2, + "temporal_patch_size": 2 + }, + "transformers_version": "4.57.0.dev0", + "text_config": { + "attention_bias": false, + "attention_dropout": 0.0, + "attn_output_gate": true, + "bos_token_id": null, + "dtype": "float16", + "eos_token_id": 248044, + "full_attention_interval": 4, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 17408, + "layer_types": [ + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention" + ], + "linear_conv_kernel_dim": 4, + "linear_key_head_dim": 128, + "linear_num_key_heads": 16, + "linear_num_value_heads": 48, + 
"linear_value_head_dim": 128, + "mamba_ssm_dtype": "float32", + "max_position_embeddings": 262144, + "mlp_only_layers": [], + "mtp_num_hidden_layers": 1, + "mtp_use_dedicated_embeddings": false, + "num_attention_heads": 24, + "num_hidden_layers": 64, + "num_key_value_heads": 4, + "pad_token_id": null, + "partial_rotary_factor": 0.25, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "mrope_interleaved": true, + "mrope_section": [ + 11, + 11, + 10 + ], + "partial_rotary_factor": 0.25, + "rope_theta": 10000000, + "rope_type": "default" + }, + "use_cache": false, + "vocab_size": 248320, + "model_type": "qwen3_5_text" + } +} \ No newline at end of file diff --git a/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/generation_config.json b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1068c09fbcc050fcccf2066dda235127d9bad05e --- /dev/null +++ b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/generation_config.json @@ -0,0 +1,13 @@ +{ + "bos_token_id": 248044, + "do_sample": true, + "eos_token_id": [ + 248046, + 248044 + ], + "pad_token_id": 248044, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "5.3.0" +} diff --git a/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/humming_online_quant_config.json b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/humming_online_quant_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7880677abaf367bd1f16b6a3d6bf81a5f46eb278 --- /dev/null +++ b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/humming_online_quant_config.json @@ -0,0 +1,457 @@ +{ + "quant_method": "gptq", + "bits": 5, + "group_size": 128, + "dynamic": { + "+:model\\.layers\\.60\\.mlp\\.gate_proj": { + "bits": 4 + }, + "+:model\\.layers\\.60\\.mlp\\.up_proj": { + "bits": 4 + }, + "+:model\\.layers\\.61\\.mlp\\.gate_proj": { + "bits": 4 + }, + "+:model\\.layers\\.61\\.mlp\\.up_proj": { + "bits": 4 + }, + "+:model\\.layers\\.61\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.60\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.56\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.9\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.57\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.50\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.38\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.1\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.0\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.34\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.33\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.32\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.30\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.5\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.57\\.mlp\\.down_proj": { + "bits": 4 + }, + "+:model\\.layers\\.6\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.20\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.22\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.17\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + 
"+:model\\.layers\\.14\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.13\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.12\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.26\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.24\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.25\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.41\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.40\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.29\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.28\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.61\\.mlp\\.down_proj": { + "bits": 4 + }, + "+:model\\.layers\\.1\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.18\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.46\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.17\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.57\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.56\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.8\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.60\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.61\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.6\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.38\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.10\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.50\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.34\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.0\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.2\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.1\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.6\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.4\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.32\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.33\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.16\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.14\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.21\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.22\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.25\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.24\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.10\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.12\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.13\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.28\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.16\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.29\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.0\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.40\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.41\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.46\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.18\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + 
"+:model\\.layers\\.53\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.52\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.20\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.62\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.9\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.8\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.54\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.32\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.30\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.4\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.18\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.58\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.5\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.2\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.13\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.1\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.5\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.34\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.36\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.37\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.10\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.13\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.26\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.24\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.21\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.20\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.22\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.49\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.8\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.16\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.48\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.17\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.44\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.45\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.1\\.linear_attn\\.in_proj_qkv": { + "bits": 6 + }, + "+:model\\.layers\\.42\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.62\\.mlp\\.down_proj": { + "bits": 4 + }, + "+:model\\.layers\\.28\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.52\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.53\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.54\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.60\\.mlp\\.down_proj": { + "bits": 4 + }, + "+:model\\.layers\\.8\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.9\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.62\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.21\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.59\\.mlp\\.down_proj": { + "bits": 4 + }, + "+:model\\.layers\\.5\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.4\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.58\\.linear_attn\\.in_proj_a": { + "bits": 8 + 
}, + "+:model\\.layers\\.6\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.4\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.33\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.30\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.12\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.37\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.36\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.0\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.2\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.9\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.26\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.25\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.10\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.12\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.17\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.48\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.14\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.16\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.14\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.49\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.20\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.21\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.2\\.linear_attn\\.out_proj": { + "bits": 6 + }, + "+:model\\.layers\\.18\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.45\\.linear_attn\\.in_proj_b": { + "bits": 8 + }, + "+:model\\.layers\\.44\\.linear_attn\\.in_proj_a": { + "bits": 8 + }, + "+:model\\.layers\\.29\\.linear_attn\\.in_proj_z": { + "bits": 6 + }, + "+:model\\.layers\\.42\\.linear_attn\\.in_proj_a": { + "bits": 8 + } + } +} \ No newline at end of file diff --git a/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/model.safetensors.index.json b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..70fe4d0836094f2d2b8dd7c40966b8baa44d5114 --- /dev/null +++ b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/model.safetensors.index.json @@ -0,0 +1,859 @@ +{ + "metadata": { + "total_parameters": 26895998464, + "total_size": 53791996928 + }, + "weight_map": { + "lm_head.weight": "model-00001-of-00002.safetensors", + "model.language_model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.0.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.input_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.k_proj.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.21.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.A_log": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.in_proj_z.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.30.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.k_norm.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.31.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.32.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.33.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.34.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.34.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.35.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.36.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.input_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.37.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.38.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.39.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.39.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.40.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.in_proj_qkv.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.41.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.42.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.43.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.44.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.44.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.45.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.46.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.46.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.47.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.48.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.linear_attn.out_proj.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.49.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.49.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.50.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.51.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.51.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.52.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.53.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.A_log": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.54.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.55.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.56.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + 
"model.language_model.layers.56.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.57.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.58.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.58.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.o_proj.weight": 
"model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.59.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.60.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.A_log": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.conv1d.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.dt_bias": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.in_proj_a.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.in_proj_b.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.in_proj_qkv.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.in_proj_z.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.linear_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.60.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.A_log": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.conv1d.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.dt_bias": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.in_proj_a.weight": "model-00002-of-00002.safetensors", + 
"model.language_model.layers.61.linear_attn.in_proj_b.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.in_proj_qkv.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.in_proj_z.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.linear_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.61.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.A_log": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.conv1d.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.dt_bias": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.in_proj_a.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.in_proj_b.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.in_proj_qkv.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.in_proj_z.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.linear_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.62.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.k_norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.q_norm.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.63.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.mlp.gate_proj.weight": 
"model-00001-of-00002.safetensors", + "model.language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.A_log": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.conv1d.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.dt_bias": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.in_proj_a.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.in_proj_b.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.in_proj_qkv.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.in_proj_z.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.norm.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.linear_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.language_model.norm.weight": 
"model-00002-of-00002.safetensors" + } +} diff --git a/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/preprocessor_config.json b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2ea84a437d448ff71b08df68fdd949d5cc4ebb64 --- /dev/null +++ b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/preprocessor_config.json @@ -0,0 +1,21 @@ +{ + "size": { + "longest_edge": 16777216, + "shortest_edge": 65536 + }, + "patch_size": 16, + "temporal_patch_size": 2, + "merge_size": 2, + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "processor_class": "Qwen3VLProcessor", + "image_processor_type": "Qwen2VLImageProcessorFast" +} \ No newline at end of file diff --git a/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/quantization_config.txt b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/quantization_config.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2d60c0295877ca3d87de949eab131c1c624b964 --- /dev/null +++ b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/quantization_config.txt @@ -0,0 +1,878 @@ +# Model: Qwen/Qwen3.5-27B +# Layer directory: /nfs/scistore19/alistgrp/mhelcig/data/search/4_5_6_7_8bit_asym_g128/Qwen3.5-27B/4bit/ +# Sensitivity method: linear +# Estimation method: linear +# Available bitwidths: [4, 5, 6, 7, 8] +# Bitwidth map: {4: 4.156, 5: 5.156, 6: 6.156, 7: 7.156, 8: 8.156} +# +# Layer groups: 368 groups (fused layers share bitwidth) +# block_0:mlp.gate_proj,mlp.up_proj.block_0:mlp.gate_proj,mlp.up_proj: group 0, 2 layers +# block_10:mlp.gate_proj,mlp.up_proj.block_10:mlp.gate_proj,mlp.up_proj: group 10, 2 layers +# block_11:mlp.gate_proj,mlp.up_proj.block_11:mlp.gate_proj,mlp.up_proj: group 11, 2 layers +# block_12:mlp.gate_proj,mlp.up_proj.block_12:mlp.gate_proj,mlp.up_proj: group 12, 2 layers +# block_13:mlp.gate_proj,mlp.up_proj.block_13:mlp.gate_proj,mlp.up_proj: group 13, 2 layers +# block_14:mlp.gate_proj,mlp.up_proj.block_14:mlp.gate_proj,mlp.up_proj: group 14, 2 layers +# block_15:mlp.gate_proj,mlp.up_proj.block_15:mlp.gate_proj,mlp.up_proj: group 15, 2 layers +# block_16:mlp.gate_proj,mlp.up_proj.block_16:mlp.gate_proj,mlp.up_proj: group 16, 2 layers +# block_17:mlp.gate_proj,mlp.up_proj.block_17:mlp.gate_proj,mlp.up_proj: group 17, 2 layers +# block_18:mlp.gate_proj,mlp.up_proj.block_18:mlp.gate_proj,mlp.up_proj: group 18, 2 layers +# block_19:mlp.gate_proj,mlp.up_proj.block_19:mlp.gate_proj,mlp.up_proj: group 19, 2 layers +# block_1:mlp.gate_proj,mlp.up_proj.block_1:mlp.gate_proj,mlp.up_proj: group 1, 2 layers +# block_20:mlp.gate_proj,mlp.up_proj.block_20:mlp.gate_proj,mlp.up_proj: group 20, 2 layers +# block_21:mlp.gate_proj,mlp.up_proj.block_21:mlp.gate_proj,mlp.up_proj: group 21, 2 layers +# block_22:mlp.gate_proj,mlp.up_proj.block_22:mlp.gate_proj,mlp.up_proj: group 22, 2 layers +# block_23:mlp.gate_proj,mlp.up_proj.block_23:mlp.gate_proj,mlp.up_proj: group 23, 2 layers +# block_24:mlp.gate_proj,mlp.up_proj.block_24:mlp.gate_proj,mlp.up_proj: group 24, 2 layers +# block_25:mlp.gate_proj,mlp.up_proj.block_25:mlp.gate_proj,mlp.up_proj: group 25, 2 layers +# block_26:mlp.gate_proj,mlp.up_proj.block_26:mlp.gate_proj,mlp.up_proj: group 26, 2 layers +# block_27:mlp.gate_proj,mlp.up_proj.block_27:mlp.gate_proj,mlp.up_proj: group 27, 2 layers +# 
block_28:mlp.gate_proj,mlp.up_proj.block_28:mlp.gate_proj,mlp.up_proj: group 28, 2 layers +# block_29:mlp.gate_proj,mlp.up_proj.block_29:mlp.gate_proj,mlp.up_proj: group 29, 2 layers +# block_2:mlp.gate_proj,mlp.up_proj.block_2:mlp.gate_proj,mlp.up_proj: group 2, 2 layers +# block_30:mlp.gate_proj,mlp.up_proj.block_30:mlp.gate_proj,mlp.up_proj: group 30, 2 layers +# block_31:mlp.gate_proj,mlp.up_proj.block_31:mlp.gate_proj,mlp.up_proj: group 31, 2 layers +# block_32:mlp.gate_proj,mlp.up_proj.block_32:mlp.gate_proj,mlp.up_proj: group 32, 2 layers +# block_33:mlp.gate_proj,mlp.up_proj.block_33:mlp.gate_proj,mlp.up_proj: group 33, 2 layers +# block_34:mlp.gate_proj,mlp.up_proj.block_34:mlp.gate_proj,mlp.up_proj: group 34, 2 layers +# block_35:mlp.gate_proj,mlp.up_proj.block_35:mlp.gate_proj,mlp.up_proj: group 35, 2 layers +# block_36:mlp.gate_proj,mlp.up_proj.block_36:mlp.gate_proj,mlp.up_proj: group 36, 2 layers +# block_37:mlp.gate_proj,mlp.up_proj.block_37:mlp.gate_proj,mlp.up_proj: group 37, 2 layers +# block_38:mlp.gate_proj,mlp.up_proj.block_38:mlp.gate_proj,mlp.up_proj: group 38, 2 layers +# block_39:mlp.gate_proj,mlp.up_proj.block_39:mlp.gate_proj,mlp.up_proj: group 39, 2 layers +# block_3:mlp.gate_proj,mlp.up_proj.block_3:mlp.gate_proj,mlp.up_proj: group 3, 2 layers +# block_40:mlp.gate_proj,mlp.up_proj.block_40:mlp.gate_proj,mlp.up_proj: group 40, 2 layers +# block_41:mlp.gate_proj,mlp.up_proj.block_41:mlp.gate_proj,mlp.up_proj: group 41, 2 layers +# block_42:mlp.gate_proj,mlp.up_proj.block_42:mlp.gate_proj,mlp.up_proj: group 42, 2 layers +# block_43:mlp.gate_proj,mlp.up_proj.block_43:mlp.gate_proj,mlp.up_proj: group 43, 2 layers +# block_44:mlp.gate_proj,mlp.up_proj.block_44:mlp.gate_proj,mlp.up_proj: group 44, 2 layers +# block_45:mlp.gate_proj,mlp.up_proj.block_45:mlp.gate_proj,mlp.up_proj: group 45, 2 layers +# block_46:mlp.gate_proj,mlp.up_proj.block_46:mlp.gate_proj,mlp.up_proj: group 46, 2 layers +# block_47:mlp.gate_proj,mlp.up_proj.block_47:mlp.gate_proj,mlp.up_proj: group 47, 2 layers +# block_48:mlp.gate_proj,mlp.up_proj.block_48:mlp.gate_proj,mlp.up_proj: group 48, 2 layers +# block_49:mlp.gate_proj,mlp.up_proj.block_49:mlp.gate_proj,mlp.up_proj: group 49, 2 layers +# block_4:mlp.gate_proj,mlp.up_proj.block_4:mlp.gate_proj,mlp.up_proj: group 4, 2 layers +# block_50:mlp.gate_proj,mlp.up_proj.block_50:mlp.gate_proj,mlp.up_proj: group 50, 2 layers +# block_51:mlp.gate_proj,mlp.up_proj.block_51:mlp.gate_proj,mlp.up_proj: group 51, 2 layers +# block_52:mlp.gate_proj,mlp.up_proj.block_52:mlp.gate_proj,mlp.up_proj: group 52, 2 layers +# block_53:mlp.gate_proj,mlp.up_proj.block_53:mlp.gate_proj,mlp.up_proj: group 53, 2 layers +# block_54:mlp.gate_proj,mlp.up_proj.block_54:mlp.gate_proj,mlp.up_proj: group 54, 2 layers +# block_55:mlp.gate_proj,mlp.up_proj.block_55:mlp.gate_proj,mlp.up_proj: group 55, 2 layers +# block_56:mlp.gate_proj,mlp.up_proj.block_56:mlp.gate_proj,mlp.up_proj: group 56, 2 layers +# block_57:mlp.gate_proj,mlp.up_proj.block_57:mlp.gate_proj,mlp.up_proj: group 57, 2 layers +# block_58:mlp.gate_proj,mlp.up_proj.block_58:mlp.gate_proj,mlp.up_proj: group 58, 2 layers +# block_59:mlp.gate_proj,mlp.up_proj.block_59:mlp.gate_proj,mlp.up_proj: group 59, 2 layers +# block_5:mlp.gate_proj,mlp.up_proj.block_5:mlp.gate_proj,mlp.up_proj: group 5, 2 layers +# block_60:mlp.gate_proj,mlp.up_proj.block_60:mlp.gate_proj,mlp.up_proj: group 60, 2 layers +# block_61:mlp.gate_proj,mlp.up_proj.block_61:mlp.gate_proj,mlp.up_proj: group 61, 2 layers +# 
block_62:mlp.gate_proj,mlp.up_proj.block_62:mlp.gate_proj,mlp.up_proj: group 62, 2 layers +# block_63:mlp.gate_proj,mlp.up_proj.block_63:mlp.gate_proj,mlp.up_proj: group 63, 2 layers +# block_6:mlp.gate_proj,mlp.up_proj.block_6:mlp.gate_proj,mlp.up_proj: group 6, 2 layers +# block_7:mlp.gate_proj,mlp.up_proj.block_7:mlp.gate_proj,mlp.up_proj: group 7, 2 layers +# block_8:mlp.gate_proj,mlp.up_proj.block_8:mlp.gate_proj,mlp.up_proj: group 8, 2 layers +# block_9:mlp.gate_proj,mlp.up_proj.block_9:mlp.gate_proj,mlp.up_proj: group 9, 2 layers +# Fused in model.layers.0: +# - linear_attn.in_proj_qkv (group 87, 1 layers) +# - linear_attn.in_proj_a (group 88, 1 layers) +# - linear_attn.in_proj_b (group 166, 1 layers) +# - linear_attn.out_proj (group 202, 1 layers) +# - mlp.down_proj (group 229, 1 layers) +# - linear_attn.in_proj_z (group 325, 1 layers) +# Fused in model.layers.1: +# - linear_attn.in_proj_b (group 85, 1 layers) +# - linear_attn.out_proj (group 140, 1 layers) +# - linear_attn.in_proj_a (group 169, 1 layers) +# - mlp.down_proj (group 213, 1 layers) +# - linear_attn.in_proj_z (group 245, 1 layers) +# - linear_attn.in_proj_qkv (group 280, 1 layers) +# Fused in model.layers.10: +# - linear_attn.out_proj (group 159, 1 layers) +# - linear_attn.in_proj_z (group 193, 1 layers) +# - linear_attn.in_proj_a (group 255, 1 layers) +# - linear_attn.in_proj_qkv (group 289, 1 layers) +# - mlp.down_proj (group 327, 1 layers) +# - linear_attn.in_proj_b (group 333, 1 layers) +# model.layers.11.mlp.down_proj: group 129, 1 layers +# Fused in model.layers.12: +# - linear_attn.in_proj_qkv (group 69, 1 layers) +# - linear_attn.in_proj_a (group 122, 1 layers) +# - linear_attn.in_proj_b (group 194, 1 layers) +# - mlp.down_proj (group 267, 1 layers) +# - linear_attn.out_proj (group 320, 1 layers) +# - linear_attn.in_proj_z (group 335, 1 layers) +# Fused in model.layers.13: +# - linear_attn.in_proj_b (group 121, 1 layers) +# - mlp.down_proj (group 175, 1 layers) +# - linear_attn.in_proj_a (group 198, 1 layers) +# - linear_attn.out_proj (group 243, 1 layers) +# - linear_attn.in_proj_z (group 256, 1 layers) +# - linear_attn.in_proj_qkv (group 270, 1 layers) +# Fused in model.layers.14: +# - linear_attn.in_proj_a (group 119, 1 layers) +# - mlp.down_proj (group 141, 1 layers) +# - linear_attn.in_proj_b (group 179, 1 layers) +# - linear_attn.in_proj_qkv (group 324, 1 layers) +# - linear_attn.out_proj (group 341, 1 layers) +# - linear_attn.in_proj_z (group 343, 1 layers) +# model.layers.15.mlp.down_proj: group 303, 1 layers +# Fused in model.layers.16: +# - linear_attn.in_proj_qkv (group 99, 1 layers) +# - mlp.down_proj (group 153, 1 layers) +# - linear_attn.in_proj_z (group 178, 1 layers) +# - linear_attn.out_proj (group 200, 1 layers) +# - linear_attn.in_proj_a (group 266, 1 layers) +# - linear_attn.in_proj_b (group 342, 1 layers) +# Fused in model.layers.17: +# - linear_attn.in_proj_z (group 116, 1 layers) +# - linear_attn.out_proj (group 145, 1 layers) +# - linear_attn.in_proj_b (group 269, 1 layers) +# - linear_attn.in_proj_qkv (group 272, 1 layers) +# - mlp.down_proj (group 281, 1 layers) +# - linear_attn.in_proj_a (group 338, 1 layers) +# Fused in model.layers.18: +# - linear_attn.in_proj_a (group 142, 1 layers) +# - linear_attn.in_proj_b (group 208, 1 layers) +# - mlp.down_proj (group 220, 1 layers) +# - linear_attn.out_proj (group 238, 1 layers) +# - linear_attn.in_proj_z (group 352, 1 layers) +# - linear_attn.in_proj_qkv (group 363, 1 layers) +# model.layers.19.mlp.down_proj: group 204, 1 layers +# Fused in 
model.layers.2: +# - linear_attn.in_proj_z (group 167, 1 layers) +# - linear_attn.in_proj_a (group 241, 1 layers) +# - linear_attn.in_proj_qkv (group 316, 1 layers) +# - linear_attn.in_proj_b (group 326, 1 layers) +# - linear_attn.out_proj (group 351, 1 layers) +# - mlp.down_proj (group 361, 1 layers) +# Fused in model.layers.20: +# - linear_attn.in_proj_z (group 110, 1 layers) +# - mlp.down_proj (group 195, 1 layers) +# - linear_attn.out_proj (group 219, 1 layers) +# - linear_attn.in_proj_b (group 262, 1 layers) +# - linear_attn.in_proj_qkv (group 319, 1 layers) +# - linear_attn.in_proj_a (group 346, 1 layers) +# Fused in model.layers.21: +# - linear_attn.in_proj_z (group 183, 1 layers) +# - linear_attn.in_proj_qkv (group 211, 1 layers) +# - mlp.down_proj (group 249, 1 layers) +# - linear_attn.in_proj_a (group 261, 1 layers) +# - linear_attn.out_proj (group 306, 1 layers) +# - linear_attn.in_proj_b (group 349, 1 layers) +# Fused in model.layers.22: +# - linear_attn.in_proj_qkv (group 84, 1 layers) +# - linear_attn.out_proj (group 93, 1 layers) +# - mlp.down_proj (group 109, 1 layers) +# - linear_attn.in_proj_b (group 112, 1 layers) +# - linear_attn.in_proj_a (group 184, 1 layers) +# - linear_attn.in_proj_z (group 263, 1 layers) +# model.layers.23.mlp.down_proj: group 350, 1 layers +# Fused in model.layers.24: +# - linear_attn.out_proj (group 118, 1 layers) +# - linear_attn.in_proj_b (group 127, 1 layers) +# - linear_attn.in_proj_a (group 191, 1 layers) +# - mlp.down_proj (group 224, 1 layers) +# - linear_attn.in_proj_z (group 258, 1 layers) +# - linear_attn.in_proj_qkv (group 295, 1 layers) +# Fused in model.layers.25: +# - linear_attn.in_proj_a (group 128, 1 layers) +# - linear_attn.in_proj_qkv (group 181, 1 layers) +# - linear_attn.in_proj_b (group 187, 1 layers) +# - linear_attn.out_proj (group 192, 1 layers) +# - mlp.down_proj (group 206, 1 layers) +# - linear_attn.in_proj_z (group 331, 1 layers) +# Fused in model.layers.26: +# - linear_attn.in_proj_qkv (group 81, 1 layers) +# - linear_attn.in_proj_z (group 125, 1 layers) +# - linear_attn.in_proj_b (group 257, 1 layers) +# - linear_attn.out_proj (group 285, 1 layers) +# - linear_attn.in_proj_a (group 330, 1 layers) +# - mlp.down_proj (group 360, 1 layers) +# model.layers.27.mlp.down_proj: group 76, 1 layers +# Fused in model.layers.28: +# - linear_attn.in_proj_b (group 136, 1 layers) +# - mlp.down_proj (group 137, 1 layers) +# - linear_attn.out_proj (group 173, 1 layers) +# - linear_attn.in_proj_a (group 199, 1 layers) +# - linear_attn.in_proj_z (group 284, 1 layers) +# - linear_attn.in_proj_qkv (group 337, 1 layers) +# Fused in model.layers.29: +# - linear_attn.out_proj (group 91, 1 layers) +# - linear_attn.in_proj_a (group 134, 1 layers) +# - linear_attn.in_proj_qkv (group 148, 1 layers) +# - linear_attn.in_proj_b (group 201, 1 layers) +# - mlp.down_proj (group 294, 1 layers) +# - linear_attn.in_proj_z (group 359, 1 layers) +# model.layers.3.mlp.down_proj: group 77, 1 layers +# Fused in model.layers.30: +# - linear_attn.in_proj_z (group 100, 1 layers) +# - mlp.down_proj (group 228, 1 layers) +# - linear_attn.in_proj_b (group 236, 1 layers) +# - linear_attn.out_proj (group 292, 1 layers) +# - linear_attn.in_proj_a (group 317, 1 layers) +# - linear_attn.in_proj_qkv (group 323, 1 layers) +# model.layers.31.mlp.down_proj: group 212, 1 layers +# Fused in model.layers.32: +# - linear_attn.in_proj_b (group 98, 1 layers) +# - linear_attn.in_proj_qkv (group 104, 1 layers) +# - linear_attn.out_proj (group 174, 1 layers) +# - 
linear_attn.in_proj_a (group 176, 1 layers) +# - linear_attn.in_proj_z (group 235, 1 layers) +# - mlp.down_proj (group 367, 1 layers) +# Fused in model.layers.33: +# - mlp.down_proj (group 82, 1 layers) +# - linear_attn.out_proj (group 83, 1 layers) +# - linear_attn.in_proj_a (group 95, 1 layers) +# - linear_attn.in_proj_b (group 177, 1 layers) +# - linear_attn.in_proj_qkv (group 277, 1 layers) +# - linear_attn.in_proj_z (group 315, 1 layers) +# Fused in model.layers.34: +# - linear_attn.in_proj_b (group 90, 1 layers) +# - linear_attn.in_proj_a (group 164, 1 layers) +# - linear_attn.out_proj (group 182, 1 layers) +# - mlp.down_proj (group 186, 1 layers) +# - linear_attn.in_proj_z (group 248, 1 layers) +# - linear_attn.in_proj_qkv (group 290, 1 layers) +# model.layers.35.mlp.down_proj: group 242, 1 layers +# Fused in model.layers.36: +# - linear_attn.in_proj_qkv (group 65, 1 layers) +# - linear_attn.in_proj_z (group 92, 1 layers) +# - mlp.down_proj (group 101, 1 layers) +# - linear_attn.in_proj_b (group 250, 1 layers) +# - linear_attn.in_proj_a (group 322, 1 layers) +# - linear_attn.out_proj (group 357, 1 layers) +# Fused in model.layers.37: +# - linear_attn.in_proj_z (group 165, 1 layers) +# - linear_attn.in_proj_a (group 251, 1 layers) +# - linear_attn.in_proj_qkv (group 260, 1 layers) +# - linear_attn.out_proj (group 275, 1 layers) +# - linear_attn.in_proj_b (group 321, 1 layers) +# - mlp.down_proj (group 344, 1 layers) +# Fused in model.layers.38: +# - linear_attn.in_proj_b (group 79, 1 layers) +# - linear_attn.out_proj (group 108, 1 layers) +# - linear_attn.in_proj_a (group 156, 1 layers) +# - linear_attn.in_proj_z (group 222, 1 layers) +# - mlp.down_proj (group 318, 1 layers) +# - linear_attn.in_proj_qkv (group 332, 1 layers) +# model.layers.39.mlp.down_proj: group 120, 1 layers +# Fused in model.layers.4: +# - linear_attn.in_proj_qkv (group 78, 1 layers) +# - linear_attn.in_proj_z (group 172, 1 layers) +# - mlp.down_proj (group 189, 1 layers) +# - linear_attn.in_proj_a (group 237, 1 layers) +# - linear_attn.out_proj (group 310, 1 layers) +# - linear_attn.in_proj_b (group 313, 1 layers) +# Fused in model.layers.40: +# - linear_attn.in_proj_b (group 132, 1 layers) +# - linear_attn.in_proj_a (group 203, 1 layers) +# - linear_attn.out_proj (group 231, 1 layers) +# - mlp.down_proj (group 246, 1 layers) +# - linear_attn.in_proj_z (group 279, 1 layers) +# - linear_attn.in_proj_qkv (group 366, 1 layers) +# Fused in model.layers.41: +# - linear_attn.in_proj_a (group 131, 1 layers) +# - linear_attn.in_proj_qkv (group 168, 1 layers) +# - mlp.down_proj (group 190, 1 layers) +# - linear_attn.in_proj_b (group 205, 1 layers) +# - linear_attn.out_proj (group 288, 1 layers) +# - linear_attn.in_proj_z (group 362, 1 layers) +# Fused in model.layers.42: +# - linear_attn.out_proj (group 89, 1 layers) +# - linear_attn.in_proj_z (group 133, 1 layers) +# - linear_attn.in_proj_qkv (group 139, 1 layers) +# - linear_attn.in_proj_b (group 282, 1 layers) +# - mlp.down_proj (group 340, 1 layers) +# - linear_attn.in_proj_a (group 365, 1 layers) +# model.layers.43.mlp.down_proj: group 96, 1 layers +# Fused in model.layers.44: +# - linear_attn.out_proj (group 124, 1 layers) +# - linear_attn.in_proj_z (group 144, 1 layers) +# - mlp.down_proj (group 214, 1 layers) +# - linear_attn.in_proj_b (group 274, 1 layers) +# - linear_attn.in_proj_qkv (group 328, 1 layers) +# - linear_attn.in_proj_a (group 355, 1 layers) +# Fused in model.layers.45: +# - linear_attn.in_proj_qkv (group 157, 1 layers) +# - linear_attn.out_proj 
(group 185, 1 layers)
+# - linear_attn.in_proj_z (group 210, 1 layers)
+# - mlp.down_proj (group 233, 1 layers)
+# - linear_attn.in_proj_a (group 276, 1 layers)
+# - linear_attn.in_proj_b (group 353, 1 layers)
+# Fused in model.layers.46:
+# - mlp.down_proj (group 80, 1 layers)
+# - linear_attn.in_proj_qkv (group 117, 1 layers)
+# - linear_attn.in_proj_b (group 143, 1 layers)
+# - linear_attn.in_proj_a (group 207, 1 layers)
+# - linear_attn.out_proj (group 271, 1 layers)
+# - linear_attn.in_proj_z (group 273, 1 layers)
+# model.layers.47.mlp.down_proj: group 364, 1 layers
+# Fused in model.layers.48:
+# - linear_attn.in_proj_z (group 115, 1 layers)
+# - linear_attn.out_proj (group 170, 1 layers)
+# - linear_attn.in_proj_b (group 268, 1 layers)
+# - linear_attn.in_proj_qkv (group 286, 1 layers)
+# - mlp.down_proj (group 287, 1 layers)
+# - linear_attn.in_proj_a (group 339, 1 layers)
+# Fused in model.layers.49:
+# - linear_attn.out_proj (group 97, 1 layers)
+# - mlp.down_proj (group 130, 1 layers)
+# - linear_attn.in_proj_z (group 180, 1 layers)
+# - linear_attn.in_proj_qkv (group 188, 1 layers)
+# - linear_attn.in_proj_a (group 264, 1 layers)
+# - linear_attn.in_proj_b (group 345, 1 layers)
+# Fused in model.layers.5:
+# - linear_attn.in_proj_z (group 102, 1 layers)
+# - linear_attn.in_proj_b (group 240, 1 layers)
+# - mlp.down_proj (group 244, 1 layers)
+# - linear_attn.out_proj (group 247, 1 layers)
+# - linear_attn.in_proj_qkv (group 259, 1 layers)
+# - linear_attn.in_proj_a (group 309, 1 layers)
+# Fused in model.layers.50:
+# - linear_attn.in_proj_b (group 74, 1 layers)
+# - linear_attn.in_proj_a (group 162, 1 layers)
+# - mlp.down_proj (group 209, 1 layers)
+# - linear_attn.in_proj_z (group 218, 1 layers)
+# - linear_attn.out_proj (group 302, 1 layers)
+# - linear_attn.in_proj_qkv (group 354, 1 layers)
+# model.layers.51.mlp.down_proj: group 225, 1 layers
+# Fused in model.layers.52:
+# - linear_attn.in_proj_z (group 73, 1 layers)
+# - mlp.down_proj (group 75, 1 layers)
+# - linear_attn.in_proj_qkv (group 135, 1 layers)
+# - linear_attn.out_proj (group 163, 1 layers)
+# - linear_attn.in_proj_b (group 217, 1 layers)
+# - linear_attn.in_proj_a (group 291, 1 layers)
+# Fused in model.layers.53:
+# - linear_attn.out_proj (group 105, 1 layers)
+# - linear_attn.in_proj_z (group 160, 1 layers)
+# - linear_attn.in_proj_a (group 216, 1 layers)
+# - linear_attn.in_proj_qkv (group 253, 1 layers)
+# - linear_attn.in_proj_b (group 293, 1 layers)
+# - mlp.down_proj (group 358, 1 layers)
+# Fused in model.layers.54:
+# - linear_attn.in_proj_z (group 68, 1 layers)
+# - linear_attn.out_proj (group 196, 1 layers)
+# - linear_attn.in_proj_b (group 232, 1 layers)
+# - mlp.down_proj (group 252, 1 layers)
+# - linear_attn.in_proj_a (group 296, 1 layers)
+# - linear_attn.in_proj_qkv (group 347, 1 layers)
+# model.layers.55.mlp.down_proj: group 197, 1 layers
+# Fused in model.layers.56:
+# - linear_attn.in_proj_b (group 70, 1 layers)
+# - linear_attn.in_proj_qkv (group 123, 1 layers)
+# - linear_attn.in_proj_a (group 147, 1 layers)
+# - linear_attn.in_proj_z (group 234, 1 layers)
+# - mlp.down_proj (group 348, 1 layers)
+# - linear_attn.out_proj (group 356, 1 layers)
+# Fused in model.layers.57:
+# - linear_attn.in_proj_a (group 72, 1 layers)
+# - mlp.down_proj (group 103, 1 layers)
+# - linear_attn.in_proj_b (group 146, 1 layers)
+# - linear_attn.in_proj_qkv (group 215, 1 layers)
+# - linear_attn.out_proj (group 278, 1 layers)
+# - linear_attn.in_proj_z (group 300, 1 layers)
+# Fused in model.layers.58:
+# - linear_attn.out_proj (group 86, 1 layers)
+# - linear_attn.in_proj_z (group 107, 1 layers)
+# - mlp.down_proj (group 111, 1 layers)
+# - linear_attn.in_proj_b (group 239, 1 layers)
+# - linear_attn.in_proj_qkv (group 304, 1 layers)
+# - linear_attn.in_proj_a (group 311, 1 layers)
+# model.layers.59.mlp.down_proj: group 308, 1 layers
+# Fused in model.layers.6:
+# - mlp.down_proj (group 94, 1 layers)
+# - linear_attn.in_proj_a (group 106, 1 layers)
+# - linear_attn.out_proj (group 155, 1 layers)
+# - linear_attn.in_proj_b (group 171, 1 layers)
+# - linear_attn.in_proj_qkv (group 299, 1 layers)
+# - linear_attn.in_proj_z (group 312, 1 layers)
+# Fused in model.layers.60:
+# - linear_attn.in_proj_a (group 66, 1 layers)
+# - linear_attn.out_proj (group 67, 1 layers)
+# - linear_attn.in_proj_b (group 151, 1 layers)
+# - mlp.down_proj (group 297, 1 layers)
+# - linear_attn.in_proj_z (group 307, 1 layers)
+# - linear_attn.in_proj_qkv (group 334, 1 layers)
+# Fused in model.layers.61:
+# - linear_attn.in_proj_b (group 64, 1 layers)
+# - mlp.down_proj (group 138, 1 layers)
+# - linear_attn.in_proj_a (group 152, 1 layers)
+# - linear_attn.out_proj (group 154, 1 layers)
+# - linear_attn.in_proj_qkv (group 161, 1 layers)
+# - linear_attn.in_proj_z (group 226, 1 layers)
+# Fused in model.layers.62:
+# - linear_attn.in_proj_qkv (group 113, 1 layers)
+# - linear_attn.in_proj_z (group 150, 1 layers)
+# - linear_attn.in_proj_a (group 223, 1 layers)
+# - linear_attn.out_proj (group 254, 1 layers)
+# - mlp.down_proj (group 283, 1 layers)
+# - linear_attn.in_proj_b (group 305, 1 layers)
+# model.layers.63.mlp.down_proj: group 158, 1 layers
+# model.layers.7.mlp.down_proj: group 336, 1 layers
+# Fused in model.layers.8:
+# - linear_attn.in_proj_qkv (group 126, 1 layers)
+# - linear_attn.in_proj_z (group 149, 1 layers)
+# - linear_attn.in_proj_a (group 230, 1 layers)
+# - linear_attn.out_proj (group 265, 1 layers)
+# - linear_attn.in_proj_b (group 298, 1 layers)
+# - mlp.down_proj (group 314, 1 layers)
+# Fused in model.layers.9:
+# - linear_attn.in_proj_z (group 71, 1 layers)
+# - mlp.down_proj (group 114, 1 layers)
+# - linear_attn.in_proj_qkv (group 221, 1 layers)
+# - linear_attn.in_proj_b (group 227, 1 layers)
+# - linear_attn.in_proj_a (group 301, 1 layers)
+# - linear_attn.out_proj (group 329, 1 layers)
+#
+# Mode: binary_search_constraint (measured)
+# Constraint max_kl: 0.005
+# Constraint min_eap: 0.985
+# Weights: nll=0.0, kl=1.0, eap=0.0
+#
+# Average bitwidth: 5.1871
+# Total params: 22672834560
+# Total bits: 117606302351
+# Final KL: 0.001887
+# Final EAP: 0.985100
+# Final ETL: 0.014900
+# Satisfies constraints: True
+# Solver calls: 9
+# Evaluations: 9
+#
+# Bitwidth distribution:
+# 8-bit: 96 layers (22.2%)
+# 6-bit: 45 layers (10.4%)
+# 5-bit: 282 layers (65.3%)
+# 4-bit: 9 layers (2.1%)
+#
+model.layers.0.mlp.gate_proj: 5
+model.layers.0.mlp.up_proj: 5
+model.layers.1.mlp.gate_proj: 5
+model.layers.1.mlp.up_proj: 5
+model.layers.2.mlp.gate_proj: 5
+model.layers.2.mlp.up_proj: 5
+model.layers.3.mlp.gate_proj: 5
+model.layers.3.mlp.up_proj: 5
+model.layers.4.mlp.gate_proj: 5
+model.layers.4.mlp.up_proj: 5
+model.layers.5.mlp.gate_proj: 5
+model.layers.5.mlp.up_proj: 5
+model.layers.6.mlp.gate_proj: 5
+model.layers.6.mlp.up_proj: 5
+model.layers.7.mlp.gate_proj: 5
+model.layers.7.mlp.up_proj: 5
+model.layers.8.mlp.gate_proj: 5
+model.layers.8.mlp.up_proj: 5
+model.layers.9.mlp.gate_proj: 5
+model.layers.9.mlp.up_proj: 5
+model.layers.10.mlp.gate_proj: 5
+model.layers.10.mlp.up_proj: 5
+model.layers.11.mlp.gate_proj: 5
+model.layers.11.mlp.up_proj: 5
+model.layers.12.mlp.gate_proj: 5
+model.layers.12.mlp.up_proj: 5
+model.layers.13.mlp.gate_proj: 5
+model.layers.13.mlp.up_proj: 5
+model.layers.14.mlp.gate_proj: 5
+model.layers.14.mlp.up_proj: 5
+model.layers.15.mlp.gate_proj: 5
+model.layers.15.mlp.up_proj: 5
+model.layers.16.mlp.gate_proj: 5
+model.layers.16.mlp.up_proj: 5
+model.layers.17.mlp.gate_proj: 5
+model.layers.17.mlp.up_proj: 5
+model.layers.18.mlp.gate_proj: 5
+model.layers.18.mlp.up_proj: 5
+model.layers.19.mlp.gate_proj: 5
+model.layers.19.mlp.up_proj: 5
+model.layers.20.mlp.gate_proj: 5
+model.layers.20.mlp.up_proj: 5
+model.layers.21.mlp.gate_proj: 5
+model.layers.21.mlp.up_proj: 5
+model.layers.22.mlp.gate_proj: 5
+model.layers.22.mlp.up_proj: 5
+model.layers.23.mlp.gate_proj: 5
+model.layers.23.mlp.up_proj: 5
+model.layers.24.mlp.gate_proj: 5
+model.layers.24.mlp.up_proj: 5
+model.layers.25.mlp.gate_proj: 5
+model.layers.25.mlp.up_proj: 5
+model.layers.26.mlp.gate_proj: 5
+model.layers.26.mlp.up_proj: 5
+model.layers.27.mlp.gate_proj: 5
+model.layers.27.mlp.up_proj: 5
+model.layers.28.mlp.gate_proj: 5
+model.layers.28.mlp.up_proj: 5
+model.layers.29.mlp.gate_proj: 5
+model.layers.29.mlp.up_proj: 5
+model.layers.30.mlp.gate_proj: 5
+model.layers.30.mlp.up_proj: 5
+model.layers.31.mlp.gate_proj: 5
+model.layers.31.mlp.up_proj: 5
+model.layers.32.mlp.gate_proj: 5
+model.layers.32.mlp.up_proj: 5
+model.layers.33.mlp.gate_proj: 5
+model.layers.33.mlp.up_proj: 5
+model.layers.34.mlp.gate_proj: 5
+model.layers.34.mlp.up_proj: 5
+model.layers.35.mlp.gate_proj: 5
+model.layers.35.mlp.up_proj: 5
+model.layers.36.mlp.gate_proj: 5
+model.layers.36.mlp.up_proj: 5
+model.layers.37.mlp.gate_proj: 5
+model.layers.37.mlp.up_proj: 5
+model.layers.38.mlp.gate_proj: 5
+model.layers.38.mlp.up_proj: 5
+model.layers.39.mlp.gate_proj: 5
+model.layers.39.mlp.up_proj: 5
+model.layers.40.mlp.gate_proj: 5
+model.layers.40.mlp.up_proj: 5
+model.layers.41.mlp.gate_proj: 5
+model.layers.41.mlp.up_proj: 5
+model.layers.42.mlp.gate_proj: 5
+model.layers.42.mlp.up_proj: 5
+model.layers.43.mlp.gate_proj: 5
+model.layers.43.mlp.up_proj: 5
+model.layers.44.mlp.gate_proj: 5
+model.layers.44.mlp.up_proj: 5
+model.layers.45.mlp.gate_proj: 5
+model.layers.45.mlp.up_proj: 5
+model.layers.46.mlp.gate_proj: 5
+model.layers.46.mlp.up_proj: 5
+model.layers.47.mlp.gate_proj: 5
+model.layers.47.mlp.up_proj: 5
+model.layers.48.mlp.gate_proj: 5
+model.layers.48.mlp.up_proj: 5
+model.layers.49.mlp.gate_proj: 5
+model.layers.49.mlp.up_proj: 5
+model.layers.50.mlp.gate_proj: 5
+model.layers.50.mlp.up_proj: 5
+model.layers.51.mlp.gate_proj: 5
+model.layers.51.mlp.up_proj: 5
+model.layers.52.mlp.gate_proj: 5
+model.layers.52.mlp.up_proj: 5
+model.layers.53.mlp.gate_proj: 5
+model.layers.53.mlp.up_proj: 5
+model.layers.54.mlp.gate_proj: 5
+model.layers.54.mlp.up_proj: 5
+model.layers.55.mlp.gate_proj: 5
+model.layers.55.mlp.up_proj: 5
+model.layers.56.mlp.gate_proj: 5
+model.layers.56.mlp.up_proj: 5
+model.layers.57.mlp.gate_proj: 5
+model.layers.57.mlp.up_proj: 5
+model.layers.58.mlp.gate_proj: 5
+model.layers.58.mlp.up_proj: 5
+model.layers.59.mlp.gate_proj: 5
+model.layers.59.mlp.up_proj: 5
+model.layers.60.mlp.gate_proj: 4
+model.layers.60.mlp.up_proj: 4
+model.layers.61.mlp.gate_proj: 4
+model.layers.61.mlp.up_proj: 4
+model.layers.62.mlp.gate_proj: 5
+model.layers.62.mlp.up_proj: 5
+model.layers.63.mlp.gate_proj: 5
+model.layers.63.mlp.up_proj: 5
+model.layers.61.linear_attn.in_proj_b: 8
+model.layers.36.linear_attn.in_proj_qkv: 5
+model.layers.60.linear_attn.in_proj_a: 8
+model.layers.60.linear_attn.out_proj: 5
+model.layers.54.linear_attn.in_proj_z: 5
+model.layers.12.linear_attn.in_proj_qkv: 5
+model.layers.56.linear_attn.in_proj_b: 8
+model.layers.9.linear_attn.in_proj_z: 6
+model.layers.57.linear_attn.in_proj_a: 8
+model.layers.52.linear_attn.in_proj_z: 5
+model.layers.50.linear_attn.in_proj_b: 8
+model.layers.52.mlp.down_proj: 5
+model.layers.27.mlp.down_proj: 5
+model.layers.3.mlp.down_proj: 5
+model.layers.4.linear_attn.in_proj_qkv: 5
+model.layers.38.linear_attn.in_proj_b: 8
+model.layers.46.mlp.down_proj: 5
+model.layers.26.linear_attn.in_proj_qkv: 5
+model.layers.33.mlp.down_proj: 5
+model.layers.33.linear_attn.out_proj: 5
+model.layers.22.linear_attn.in_proj_qkv: 5
+model.layers.1.linear_attn.in_proj_b: 8
+model.layers.58.linear_attn.out_proj: 5
+model.layers.0.linear_attn.in_proj_qkv: 5
+model.layers.0.linear_attn.in_proj_a: 8
+model.layers.42.linear_attn.out_proj: 5
+model.layers.34.linear_attn.in_proj_b: 8
+model.layers.29.linear_attn.out_proj: 5
+model.layers.36.linear_attn.in_proj_z: 5
+model.layers.22.linear_attn.out_proj: 5
+model.layers.6.mlp.down_proj: 5
+model.layers.33.linear_attn.in_proj_a: 8
+model.layers.43.mlp.down_proj: 5
+model.layers.49.linear_attn.out_proj: 5
+model.layers.32.linear_attn.in_proj_b: 8
+model.layers.16.linear_attn.in_proj_qkv: 5
+model.layers.30.linear_attn.in_proj_z: 6
+model.layers.36.mlp.down_proj: 5
+model.layers.5.linear_attn.in_proj_z: 6
+model.layers.57.mlp.down_proj: 4
+model.layers.32.linear_attn.in_proj_qkv: 5
+model.layers.53.linear_attn.out_proj: 5
+model.layers.6.linear_attn.in_proj_a: 8
+model.layers.58.linear_attn.in_proj_z: 5
+model.layers.38.linear_attn.out_proj: 5
+model.layers.22.mlp.down_proj: 5
+model.layers.20.linear_attn.in_proj_z: 6
+model.layers.58.mlp.down_proj: 5
+model.layers.22.linear_attn.in_proj_b: 8
+model.layers.62.linear_attn.in_proj_qkv: 5
+model.layers.9.mlp.down_proj: 5
+model.layers.48.linear_attn.in_proj_z: 5
+model.layers.17.linear_attn.in_proj_z: 6
+model.layers.46.linear_attn.in_proj_qkv: 5
+model.layers.24.linear_attn.out_proj: 5
+model.layers.14.linear_attn.in_proj_a: 8
+model.layers.39.mlp.down_proj: 5
+model.layers.13.linear_attn.in_proj_b: 8
+model.layers.12.linear_attn.in_proj_a: 8
+model.layers.56.linear_attn.in_proj_qkv: 5
+model.layers.44.linear_attn.out_proj: 5
+model.layers.26.linear_attn.in_proj_z: 6
+model.layers.8.linear_attn.in_proj_qkv: 5
+model.layers.24.linear_attn.in_proj_b: 8
+model.layers.25.linear_attn.in_proj_a: 8
+model.layers.11.mlp.down_proj: 5
+model.layers.49.mlp.down_proj: 5
+model.layers.41.linear_attn.in_proj_a: 8
+model.layers.40.linear_attn.in_proj_b: 8
+model.layers.42.linear_attn.in_proj_z: 5
+model.layers.29.linear_attn.in_proj_a: 8
+model.layers.52.linear_attn.in_proj_qkv: 5
+model.layers.28.linear_attn.in_proj_b: 8
+model.layers.28.mlp.down_proj: 5
+model.layers.61.mlp.down_proj: 4
+model.layers.42.linear_attn.in_proj_qkv: 5
+model.layers.1.linear_attn.out_proj: 6
+model.layers.14.mlp.down_proj: 5
+model.layers.18.linear_attn.in_proj_a: 8
+model.layers.46.linear_attn.in_proj_b: 8
+model.layers.44.linear_attn.in_proj_z: 5
+model.layers.17.linear_attn.out_proj: 6
+model.layers.57.linear_attn.in_proj_b: 8
+model.layers.56.linear_attn.in_proj_a: 8
+model.layers.29.linear_attn.in_proj_qkv: 5
+model.layers.8.linear_attn.in_proj_z: 6
+model.layers.62.linear_attn.in_proj_z: 5
+model.layers.60.linear_attn.in_proj_b: 8
+model.layers.61.linear_attn.in_proj_a: 8
+model.layers.16.mlp.down_proj: 5
+model.layers.61.linear_attn.out_proj: 5
+model.layers.6.linear_attn.out_proj: 6
+model.layers.38.linear_attn.in_proj_a: 8
+model.layers.45.linear_attn.in_proj_qkv: 5
+model.layers.63.mlp.down_proj: 5
+model.layers.10.linear_attn.out_proj: 6
+model.layers.53.linear_attn.in_proj_z: 5
+model.layers.61.linear_attn.in_proj_qkv: 5
+model.layers.50.linear_attn.in_proj_a: 8
+model.layers.52.linear_attn.out_proj: 5
+model.layers.34.linear_attn.in_proj_a: 8
+model.layers.37.linear_attn.in_proj_z: 5
+model.layers.0.linear_attn.in_proj_b: 8
+model.layers.2.linear_attn.in_proj_z: 6
+model.layers.41.linear_attn.in_proj_qkv: 5
+model.layers.1.linear_attn.in_proj_a: 8
+model.layers.48.linear_attn.out_proj: 5
+model.layers.6.linear_attn.in_proj_b: 8
+model.layers.4.linear_attn.in_proj_z: 6
+model.layers.28.linear_attn.out_proj: 5
+model.layers.32.linear_attn.out_proj: 5
+model.layers.13.mlp.down_proj: 5
+model.layers.32.linear_attn.in_proj_a: 8
+model.layers.33.linear_attn.in_proj_b: 8
+model.layers.16.linear_attn.in_proj_z: 6
+model.layers.14.linear_attn.in_proj_b: 8
+model.layers.49.linear_attn.in_proj_z: 5
+model.layers.25.linear_attn.in_proj_qkv: 5
+model.layers.34.linear_attn.out_proj: 5
+model.layers.21.linear_attn.in_proj_z: 6
+model.layers.22.linear_attn.in_proj_a: 8
+model.layers.45.linear_attn.out_proj: 5
+model.layers.34.mlp.down_proj: 5
+model.layers.25.linear_attn.in_proj_b: 8
+model.layers.49.linear_attn.in_proj_qkv: 5
+model.layers.4.mlp.down_proj: 5
+model.layers.41.mlp.down_proj: 5
+model.layers.24.linear_attn.in_proj_a: 8
+model.layers.25.linear_attn.out_proj: 5
+model.layers.10.linear_attn.in_proj_z: 6
+model.layers.12.linear_attn.in_proj_b: 8
+model.layers.20.mlp.down_proj: 5
+model.layers.54.linear_attn.out_proj: 5
+model.layers.55.mlp.down_proj: 5
+model.layers.13.linear_attn.in_proj_a: 8
+model.layers.28.linear_attn.in_proj_a: 8
+model.layers.16.linear_attn.out_proj: 6
+model.layers.29.linear_attn.in_proj_b: 8
+model.layers.0.linear_attn.out_proj: 6
+model.layers.40.linear_attn.in_proj_a: 8
+model.layers.19.mlp.down_proj: 5
+model.layers.41.linear_attn.in_proj_b: 8
+model.layers.25.mlp.down_proj: 5
+model.layers.46.linear_attn.in_proj_a: 8
+model.layers.18.linear_attn.in_proj_b: 8
+model.layers.50.mlp.down_proj: 5
+model.layers.45.linear_attn.in_proj_z: 5
+model.layers.21.linear_attn.in_proj_qkv: 5
+model.layers.31.mlp.down_proj: 5
+model.layers.1.mlp.down_proj: 5
+model.layers.44.mlp.down_proj: 5
+model.layers.57.linear_attn.in_proj_qkv: 5
+model.layers.53.linear_attn.in_proj_a: 8
+model.layers.52.linear_attn.in_proj_b: 8
+model.layers.50.linear_attn.in_proj_z: 5
+model.layers.20.linear_attn.out_proj: 6
+model.layers.18.mlp.down_proj: 5
+model.layers.9.linear_attn.in_proj_qkv: 5
+model.layers.38.linear_attn.in_proj_z: 5
+model.layers.62.linear_attn.in_proj_a: 8
+model.layers.24.mlp.down_proj: 5
+model.layers.51.mlp.down_proj: 5
+model.layers.61.linear_attn.in_proj_z: 5
+model.layers.9.linear_attn.in_proj_b: 8
+model.layers.30.mlp.down_proj: 5
+model.layers.0.mlp.down_proj: 5
+model.layers.8.linear_attn.in_proj_a: 8
+model.layers.40.linear_attn.out_proj: 5
+model.layers.54.linear_attn.in_proj_b: 8
+model.layers.45.mlp.down_proj: 5
+model.layers.56.linear_attn.in_proj_z: 5
+model.layers.32.linear_attn.in_proj_z: 6
+model.layers.30.linear_attn.in_proj_b: 8
+model.layers.4.linear_attn.in_proj_a: 8
+model.layers.18.linear_attn.out_proj: 6
+model.layers.58.linear_attn.in_proj_b: 8
+model.layers.5.linear_attn.in_proj_b: 8
+model.layers.2.linear_attn.in_proj_a: 8
+model.layers.35.mlp.down_proj: 5
+model.layers.13.linear_attn.out_proj: 6
+model.layers.5.mlp.down_proj: 5
+model.layers.1.linear_attn.in_proj_z: 6
+model.layers.40.mlp.down_proj: 5
+model.layers.5.linear_attn.out_proj: 6
+model.layers.34.linear_attn.in_proj_z: 6
+model.layers.21.mlp.down_proj: 5
+model.layers.36.linear_attn.in_proj_b: 8
+model.layers.37.linear_attn.in_proj_a: 8
+model.layers.54.mlp.down_proj: 5
+model.layers.53.linear_attn.in_proj_qkv: 5
+model.layers.62.linear_attn.out_proj: 5
+model.layers.10.linear_attn.in_proj_a: 8
+model.layers.13.linear_attn.in_proj_z: 6
+model.layers.26.linear_attn.in_proj_b: 8
+model.layers.24.linear_attn.in_proj_z: 6
+model.layers.5.linear_attn.in_proj_qkv: 5
+model.layers.37.linear_attn.in_proj_qkv: 5
+model.layers.21.linear_attn.in_proj_a: 8
+model.layers.20.linear_attn.in_proj_b: 8
+model.layers.22.linear_attn.in_proj_z: 6
+model.layers.49.linear_attn.in_proj_a: 8
+model.layers.8.linear_attn.out_proj: 6
+model.layers.16.linear_attn.in_proj_a: 8
+model.layers.12.mlp.down_proj: 5
+model.layers.48.linear_attn.in_proj_b: 8
+model.layers.17.linear_attn.in_proj_b: 8
+model.layers.13.linear_attn.in_proj_qkv: 5
+model.layers.46.linear_attn.out_proj: 5
+model.layers.17.linear_attn.in_proj_qkv: 5
+model.layers.46.linear_attn.in_proj_z: 5
+model.layers.44.linear_attn.in_proj_b: 8
+model.layers.37.linear_attn.out_proj: 5
+model.layers.45.linear_attn.in_proj_a: 8
+model.layers.33.linear_attn.in_proj_qkv: 5
+model.layers.57.linear_attn.out_proj: 5
+model.layers.40.linear_attn.in_proj_z: 5
+model.layers.1.linear_attn.in_proj_qkv: 6
+model.layers.17.mlp.down_proj: 5
+model.layers.42.linear_attn.in_proj_b: 8
+model.layers.62.mlp.down_proj: 4
+model.layers.28.linear_attn.in_proj_z: 6
+model.layers.26.linear_attn.out_proj: 5
+model.layers.48.linear_attn.in_proj_qkv: 5
+model.layers.48.mlp.down_proj: 5
+model.layers.41.linear_attn.out_proj: 5
+model.layers.10.linear_attn.in_proj_qkv: 5
+model.layers.34.linear_attn.in_proj_qkv: 5
+model.layers.52.linear_attn.in_proj_a: 8
+model.layers.30.linear_attn.out_proj: 5
+model.layers.53.linear_attn.in_proj_b: 8
+model.layers.29.mlp.down_proj: 5
+model.layers.24.linear_attn.in_proj_qkv: 5
+model.layers.54.linear_attn.in_proj_a: 8
+model.layers.60.mlp.down_proj: 4
+model.layers.8.linear_attn.in_proj_b: 8
+model.layers.6.linear_attn.in_proj_qkv: 5
+model.layers.57.linear_attn.in_proj_z: 5
+model.layers.9.linear_attn.in_proj_a: 8
+model.layers.50.linear_attn.out_proj: 5
+model.layers.15.mlp.down_proj: 5
+model.layers.58.linear_attn.in_proj_qkv: 5
+model.layers.62.linear_attn.in_proj_b: 8
+model.layers.21.linear_attn.out_proj: 6
+model.layers.60.linear_attn.in_proj_z: 5
+model.layers.59.mlp.down_proj: 4
+model.layers.5.linear_attn.in_proj_a: 8
+model.layers.4.linear_attn.out_proj: 6
+model.layers.58.linear_attn.in_proj_a: 8
+model.layers.6.linear_attn.in_proj_z: 6
+model.layers.4.linear_attn.in_proj_b: 8
+model.layers.8.mlp.down_proj: 5
+model.layers.33.linear_attn.in_proj_z: 6
+model.layers.2.linear_attn.in_proj_qkv: 5
+model.layers.30.linear_attn.in_proj_a: 8
+model.layers.38.mlp.down_proj: 5
+model.layers.20.linear_attn.in_proj_qkv: 5
+model.layers.12.linear_attn.out_proj: 6
+model.layers.37.linear_attn.in_proj_b: 8
+model.layers.36.linear_attn.in_proj_a: 8
+model.layers.30.linear_attn.in_proj_qkv: 5
+model.layers.14.linear_attn.in_proj_qkv: 5
+model.layers.0.linear_attn.in_proj_z: 6
+model.layers.2.linear_attn.in_proj_b: 8
+model.layers.10.mlp.down_proj: 5
+model.layers.44.linear_attn.in_proj_qkv: 5
+model.layers.9.linear_attn.out_proj: 6
+model.layers.26.linear_attn.in_proj_a: 8
+model.layers.25.linear_attn.in_proj_z: 6
+model.layers.38.linear_attn.in_proj_qkv: 5
+model.layers.10.linear_attn.in_proj_b: 8
+model.layers.60.linear_attn.in_proj_qkv: 5
+model.layers.12.linear_attn.in_proj_z: 6
+model.layers.7.mlp.down_proj: 5
+model.layers.28.linear_attn.in_proj_qkv: 5
+model.layers.17.linear_attn.in_proj_a: 8
+model.layers.48.linear_attn.in_proj_a: 8
+model.layers.42.mlp.down_proj: 5
+model.layers.14.linear_attn.out_proj: 6
+model.layers.16.linear_attn.in_proj_b: 8
+model.layers.14.linear_attn.in_proj_z: 6
+model.layers.37.mlp.down_proj: 5
+model.layers.49.linear_attn.in_proj_b: 8
+model.layers.20.linear_attn.in_proj_a: 8
+model.layers.54.linear_attn.in_proj_qkv: 5
+model.layers.56.mlp.down_proj: 5
+model.layers.21.linear_attn.in_proj_b: 8
+model.layers.23.mlp.down_proj: 5
+model.layers.2.linear_attn.out_proj: 6
+model.layers.18.linear_attn.in_proj_z: 6
+model.layers.45.linear_attn.in_proj_b: 8
+model.layers.50.linear_attn.in_proj_qkv: 5
+model.layers.44.linear_attn.in_proj_a: 8
+model.layers.56.linear_attn.out_proj: 5
+model.layers.36.linear_attn.out_proj: 5
+model.layers.53.mlp.down_proj: 5
+model.layers.29.linear_attn.in_proj_z: 6
+model.layers.26.mlp.down_proj: 5
+model.layers.2.mlp.down_proj: 5
+model.layers.41.linear_attn.in_proj_z: 5
+model.layers.18.linear_attn.in_proj_qkv: 5
+model.layers.47.mlp.down_proj: 5
+model.layers.42.linear_attn.in_proj_a: 8
+model.layers.40.linear_attn.in_proj_qkv: 5
+model.layers.32.mlp.down_proj: 5
diff --git a/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/tokenizer_config.json b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6be6ce1780cf43bd47577fbb76e74aee6db89f21
--- /dev/null
+++ b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/tokenizer_config.json
@@ -0,0 +1,31 @@
+{
+  "add_prefix_space": false,
+  "audio_bos_token": "<|audio_start|>",
+  "audio_eos_token": "<|audio_end|>",
+  "audio_token": "<|audio_pad|>",
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "image_token": "<|image_pad|>",
+  "is_local": false,
+  "model_max_length": 262144,
+  "model_specific_special_tokens": {
+    "audio_bos_token": "<|audio_start|>",
+    "audio_eos_token": "<|audio_end|>",
+    "audio_token": "<|audio_pad|>",
+    "image_token": "<|image_pad|>",
+    "video_token": "<|video_pad|>",
+    "vision_bos_token": "<|vision_start|>",
+    "vision_eos_token": "<|vision_end|>"
+  },
+  "pad_token": "<|endoftext|>",
+  "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+  "split_special_tokens": false,
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": null,
+  "video_token": "<|video_pad|>",
+  "vision_bos_token": "<|vision_start|>",
+  "vision_eos_token": "<|vision_end|>"
+}
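Editor's note: the `quantization_config.txt` above pairs a solver report (KL/EAP constraints, bitwidth distribution) with a flat `layer: bits` map. The reported average is parameter-weighted, and the report's own totals confirm it: 117606302351 total bits / 22672834560 total params ≈ 5.1871. Below is a minimal sketch of that check, assuming the file format shown above; the per-layer parameter counts are not stored in the file and must be supplied by the caller (e.g. derived from the unquantized model's state_dict shapes), so `param_counts` here is a hypothetical input.

```python
# Editor's sketch (not part of the checkpoint): sanity-check a
# quantization_config.txt like the one above.
import re

def average_bitwidth(config_text: str, param_counts: dict[str, int]) -> float:
    """Parameter-weighted mean of the per-layer bit assignments."""
    total_bits = 0
    total_params = 0
    for raw in config_text.splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue  # skip the solver-report comment block
        name, bits = re.match(r"(\S+):\s*(\d+)", line).groups()
        total_bits += param_counts[name] * int(bits)
        total_params += param_counts[name]
    return total_bits / total_params

# The report's headline number is consistent with its own totals:
assert round(117606302351 / 22672834560, 4) == 5.1871
```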
diff --git a/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/video_preprocessor_config.json b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/video_preprocessor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3ba673a5ad7d4d13f54155ecd38b2a94a6dac8fe
--- /dev/null
+++ b/Qwen3.5-27B/ll_bsearch_kl0.005_eap0.985_lin_bw5.19_4-5-6-8bit_grouped_seed42/video_preprocessor_config.json
@@ -0,0 +1,21 @@
+{
+  "size": {
+    "longest_edge": 25165824,
+    "shortest_edge": 4096
+  },
+  "patch_size": 16,
+  "temporal_patch_size": 2,
+  "merge_size": 2,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "Qwen3VLProcessor",
+  "video_processor_type": "Qwen3VLVideoProcessor"
+}
\ No newline at end of file
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00001-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00001-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..33498ea035c18a5c6517a22eb43dabfc4c539128
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00001-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:811f0cc20a13e264e2cf10d9f370974735ebe24e3a5780af578288aa5be49b81
+size 4584408792
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00006-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00006-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3aadcd20bab03ecc03ddcf834f86d70e56d16b4c
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00006-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a3e05127ff7055e9e5973668121d314c7c1181f9ff27edd0f8ad16c2ac716a1
+size 4664167384
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00010-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00010-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..34d4d17b45ecdc58fb7712f4b6553791f31b9d6b
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00010-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9addab7bdb80f98acc88a5e4de2f8e811e62ef4c64dfdeb0a3e0d03bc3f10bfc
+size 4664134384
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00011-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00011-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0c7956c39f909cc5e60973e80edf6163c81ff5c7
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00011-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c836f78c332014ddee87ce1c3e785ad9e745d7b812ae852a0b94797475bc386
+size 4664167384
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00013-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00013-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a4848522dc5f728d086506fcd78b4991c99259ad
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00013-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9ec46b2d4823e38e963ab3683507f33e7bcfbcf8028f2df4a1abf1f455075ef
+size 4999711704
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00014-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00014-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..05fba471617702eb8c1cacfbc98ab529c25af700
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00014-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf4b53980cb73c5d6716e5c1e9901cf3e1c25947dad98790d677792b4a38cc1d
+size 4966157032
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00015-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00015-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0df70cc384fa77bca08789c4a4be1a4ba3caa02d
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00015-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c405a59c9a022b8ea52716d168be1b1336cbeb08dc53369861558b9ff894c5d
+size 4664134384
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00018-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00018-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0d4c9c095ea50ab896f0ff99f87e9f2e85bdce46
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00018-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb637f85a83e2e1f79c9b90f6918ed08ea03ba0ee75f4fca7e1131234bf43557
+size 4999711704
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00019-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00019-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6ef2045ef64a5093bdd102797f27fd4220a67d8f
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00019-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f4e18a1dc811fa4074495356d801459e7f7920ae4aebf4453241cab819b4358
+size 4966157032
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00022-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00022-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a47e40a1a6152eee8dee25caad84862d7280beda
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00022-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e039b5cb65929ec0ad929a65476ee4a2c8ed224b1943127cff37b6416223a9d0
+size 4664167376
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00023-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00023-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3dc5ab5e9dc6518d00735dc02a73b83ce5ac4f70
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00023-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad75d44ce277e15a5f42b269f948ab5faa5fee15936497efa84d3d82136099d2
+size 4999711704
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00024-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00024-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b102a8fff6bade4a0dfa0b692dd6ae6b3f010c48
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00024-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:215dc7cab538ca5e56d7da030a01cf8a8c73107eedb42b44a9aac50e611e31d3
+size 4966157032
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00026-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00026-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ef668a974a7d4cb9affa7b288ee7cb85bd9c643e
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00026-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:145a7b63968a1fcfd0e8bd342563d9dd41a72aeb26cc1f2b80ceb79f71432b19
+size 4664167384
diff --git a/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00029-of-00030.safetensors b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00029-of-00030.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8687b42550305d81adf93e3eee774539143948b9
--- /dev/null
+++ b/humming/Llama-3.3-70B-Instruct/Llama-3.3-70B-Instruct/ll_bsearch_kl0.01_eap0.985_sha_bw6.55_4-5-6-7-8bit_seed42/model-00029-of-00030.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0a199692ecf760e90fd1a6f3ebcb867ff8c558303fbe7e818d46ad88f84f2df
+size 4966173512
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00002-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00002-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..90c8782f45b9e55b668380f0febb7b982d6b50f5
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00002-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89b3955bcd3767c42ddb4e830136893cb2787132d360166239cee957a5de6d23
+size 4875989640
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00004-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00004-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..89fb44dccc81aa315391619b768ed55a1eeb23bd
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00004-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b9dbedd22b1dbcb9471108e5411ccabff3418c850e0c9149ec8474bbccb174a
+size 4875989696
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00007-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00007-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0169aaa2ff1660a4f3c2d35a5f126ba26f9aec7e
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00007-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cb9ec01b52fa85fbc37c443e439d4ed2a6124d82d6641f020b4c3e41ccc4134
+size 4875989696
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00010-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00010-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f6ae36eb94ed0dd7e8c528c4a44cddd97120a6d9
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00010-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df0e39f9035e52a8c021adef81f220433c233f17eb62ed48d6795784f0490238
+size 4875989696
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00011-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00011-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e8782469b620fa1df4d14da47789d6eff4121cb5
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00011-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fad98056554d27ef2154efff9c1f9ad89241371b02a75d03ad81c5b3cb6e2562
+size 4875989696
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00012-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00012-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..993667737e754245f9f73457c6ac2d84c3112249
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00012-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25e050568252c82d2e64d5d62264783ad888b0df355ac4673e91fbc04d99c44f
+size 4875989696
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00013-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00013-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b3bee82bd5110870abde06c33ff42f1aefb5a4ee
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bsearch_kl0.005_eap0.99_sha_bw4.98_4-5-6-7-8bit_seed42/model-00013-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23b7fb013d4449db09e80adabe125418e5c2ff21efb435259055adeb9c19bd1a
+size 4875989696
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00001-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00001-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c2b0d73382aa9ee35ac1d3ef3b7f43e233394e76
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00001-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a4d4e7c83617d9ae753a7d29eb5198e3b7f56989a692f25538d435dbab5a383
+size 4932307544
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00002-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00002-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bda3cf6b20bb390689191cbf4373bf60648a69e9
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00002-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd82d75f6714709eca1ee19e634013a35f38ff59bfa5c0d82d60b2098093440c
+size 4875989640
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00008-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00008-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0f7fce98453bea8587be76074085f243c47a4878
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00008-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f49b39e539d682e774efdf69e79bb30361478275dea6206b6c1841f1897c89f
+size 4875989696
diff --git a/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00013-of-00014.safetensors b/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00013-of-00014.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b3bee82bd5110870abde06c33ff42f1aefb5a4ee
--- /dev/null
+++ b/humming/Qwen3-32B/Qwen3-32B/ll_bw8.16_8bit/model-00013-of-00014.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23b7fb013d4449db09e80adabe125418e5c2ff21efb435259055adeb9c19bd1a
+size 4875989696
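Editor's note: the `+version` / `+oid` / `+size` stanzas above are Git LFS pointer files, not the model weights themselves; each records only the LFS spec version, a sha256 object ID, and the payload size in bytes. A small illustrative parser (the filename is a hypothetical local checkout of one of the shards above, i.e. before `git lfs pull` replaces the pointer with the real file):

```python
# Editor's sketch: read the three fields of a Git LFS pointer file.
from pathlib import Path

def read_lfs_pointer(path: str) -> dict[str, str]:
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")  # "oid sha256:..." -> ("oid", "sha256:...")
        fields[key] = value
    return fields

ptr = read_lfs_pointer("model-00013-of-00014.safetensors")  # hypothetical local copy
assert ptr["version"] == "https://git-lfs.github.com/spec/v1"
print(ptr["oid"], int(ptr["size"]))  # sha256:<digest>, payload size in bytes
```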
diff --git a/humming/Qwen3-8B/ll_sha_bw8.16_8bit/README.md b/humming/Qwen3-8B/ll_sha_bw8.16_8bit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d70d302176840f0eaa956b5bf961008db5fadb16
--- /dev/null
+++ b/humming/Qwen3-8B/ll_sha_bw8.16_8bit/README.md
@@ -0,0 +1,9 @@
+# Quantized Model Checkpoint
+
+**Base model:** Qwen/Qwen3-8B
+
+**Average bitwidth:** 8.156
+
+**Sensitivity method:** shapley
+
+See `quantization_config.txt` for full configuration details.
diff --git a/humming/Qwen3-8B/ll_sha_bw8.16_8bit/added_tokens.json b/humming/Qwen3-8B/ll_sha_bw8.16_8bit/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..b54f9135e44c1e81047e8d05cb027af8bc039eed
--- /dev/null
+++ b/humming/Qwen3-8B/ll_sha_bw8.16_8bit/added_tokens.json
@@ -0,0 +1,28 @@
+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}
diff --git a/humming/Qwen3-8B/ll_sha_bw8.16_8bit/chat_template.jinja b/humming/Qwen3-8B/ll_sha_bw8.16_8bit/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..01be9b307daa2d425f7c168c9fb145a286e0afb4
--- /dev/null
+++ b/humming/Qwen3-8B/ll_sha_bw8.16_8bit/chat_template.jinja
@@ -0,0 +1,89 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
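Editor's note: the file above is the stock Qwen3 chat template; its final branch emits an empty `<think>\n\n</think>` block when `enable_thinking` is false, which suppresses the model's reasoning phase. A usage sketch, assuming the checkpoint directory shown in this diff exists locally so `AutoTokenizer` can pick up this template:

```python
# Editor's sketch: exercising the enable_thinking branch at the bottom
# of the chat template above.
from transformers import AutoTokenizer

# Assumed local path to the quantized checkpoint directory in this diff.
tok = AutoTokenizer.from_pretrained("humming/Qwen3-8B/ll_sha_bw8.16_8bit")
messages = [{"role": "user", "content": "Hello"}]

# With enable_thinking=False the rendered prompt ends in an empty
# "<think>\n\n</think>" block after the assistant header.
text = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
print(text)
```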
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors", + "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/humming/Qwen3-8B/ll_sha_bw8.16_8bit/quantization_config.txt b/humming/Qwen3-8B/ll_sha_bw8.16_8bit/quantization_config.txt new file mode 100644 index 0000000000000000000000000000000000000000..a36288aaeb969f74481af6be05d56817dbe7d373 --- /dev/null +++ b/humming/Qwen3-8B/ll_sha_bw8.16_8bit/quantization_config.txt @@ -0,0 +1,266 @@ +# Model: Qwen/Qwen3-8B +# Layer directory: /nfs/scistore19/alistgrp/mhelcig/local/data/search/4_5_6_7_8bit_asym_g128/Qwen3-8B/4bit +# Sensitivity method: shapley +# Estimation method: permutation_separate +# Available bitwidths: [4, 5, 6, 7, 8] +# Bitwidth map: {4: 4.156, 5: 5.156, 6: 6.156, 7: 7.156, 8: 8.156} +# +# Average bitwidth: 8.156 +# Total params: 8 +# Total bits: 8 +# +# Bitwidth distribution: +# 8-bit: 252 +# +model.layers.23.self_attn.k_proj: 8 +model.layers.22.self_attn.o_proj: 8 
+model.layers.4.self_attn.o_proj: 8
+model.layers.1.mlp.down_proj: 8
+model.layers.11.mlp.up_proj: 8
+model.layers.5.self_attn.k_proj: 8
+model.layers.14.self_attn.q_proj: 8
+model.layers.17.mlp.down_proj: 8
+model.layers.33.mlp.gate_proj: 8
+model.layers.5.self_attn.v_proj: 8
+model.layers.23.self_attn.v_proj: 8
+model.layers.27.mlp.gate_proj: 8
+model.layers.2.mlp.up_proj: 8
+model.layers.15.self_attn.q_proj: 8
+model.layers.23.self_attn.o_proj: 8
+model.layers.21.mlp.up_proj: 8
+model.layers.22.self_attn.k_proj: 8
+model.layers.4.mlp.down_proj: 8
+model.layers.4.self_attn.k_proj: 8
+model.layers.5.self_attn.o_proj: 8
+model.layers.31.mlp.up_proj: 8
+model.layers.4.self_attn.v_proj: 8
+model.layers.22.self_attn.v_proj: 8
+model.layers.22.mlp.gate_proj: 8
+model.layers.8.mlp.gate_proj: 8
+model.layers.12.mlp.down_proj: 8
+model.layers.7.mlp.gate_proj: 8
+model.layers.35.self_attn.o_proj: 8
+model.layers.21.self_attn.v_proj: 8
+model.layers.34.self_attn.k_proj: 8
+model.layers.7.self_attn.v_proj: 8
+model.layers.7.self_attn.k_proj: 8
+model.layers.6.self_attn.o_proj: 8
+model.layers.34.self_attn.v_proj: 8
+model.layers.20.self_attn.o_proj: 8
+model.layers.13.mlp.up_proj: 8
+model.layers.21.self_attn.k_proj: 8
+model.layers.35.mlp.down_proj: 8
+model.layers.28.self_attn.q_proj: 8
+model.layers.11.mlp.gate_proj: 8
+model.layers.16.self_attn.q_proj: 8
+model.layers.21.mlp.down_proj: 8
+model.layers.35.self_attn.k_proj: 8
+model.layers.33.mlp.up_proj: 8
+model.layers.20.self_attn.v_proj: 8
+model.layers.34.self_attn.o_proj: 8
+model.layers.6.self_attn.v_proj: 8
+model.layers.28.mlp.gate_proj: 8
+model.layers.2.mlp.gate_proj: 8
+model.layers.18.mlp.down_proj: 8
+model.layers.17.self_attn.q_proj: 8
+model.layers.0.mlp.up_proj: 8
+model.layers.24.mlp.down_proj: 8
+model.layers.23.mlp.up_proj: 8
+model.layers.7.self_attn.o_proj: 8
+model.layers.6.self_attn.k_proj: 8
+model.layers.29.self_attn.q_proj: 8
+model.layers.20.self_attn.k_proj: 8
+model.layers.30.mlp.down_proj: 8
+model.layers.14.mlp.gate_proj: 8
+model.layers.21.self_attn.o_proj: 8
+model.layers.35.self_attn.v_proj: 8
+model.layers.6.mlp.up_proj: 8
+model.layers.9.self_attn.q_proj: 8
+model.layers.0.self_attn.k_proj: 8
+model.layers.9.mlp.up_proj: 8
+model.layers.1.self_attn.o_proj: 8
+model.layers.33.self_attn.v_proj: 8
+model.layers.27.self_attn.o_proj: 8
+model.layers.26.self_attn.k_proj: 8
+model.layers.19.self_attn.o_proj: 8
+model.layers.0.mlp.gate_proj: 8
+model.layers.11.self_attn.q_proj: 8
+model.layers.18.self_attn.k_proj: 8
+model.layers.25.mlp.up_proj: 8
+model.layers.26.mlp.down_proj: 8
+model.layers.18.self_attn.v_proj: 8
+model.layers.35.mlp.up_proj: 8
+model.layers.32.self_attn.o_proj: 8
+model.layers.26.self_attn.v_proj: 8
+model.layers.33.self_attn.k_proj: 8
+model.layers.32.mlp.down_proj: 8
+model.layers.16.mlp.gate_proj: 8
+model.layers.0.self_attn.v_proj: 8
+model.layers.15.mlp.up_proj: 8
+model.layers.19.self_attn.k_proj: 8
+model.layers.10.self_attn.q_proj: 8
+model.layers.5.mlp.gate_proj: 8
+model.layers.18.self_attn.o_proj: 8
+model.layers.0.self_attn.o_proj: 8
+model.layers.1.self_attn.k_proj: 8
+model.layers.8.self_attn.q_proj: 8
+model.layers.27.self_attn.k_proj: 8
+model.layers.26.self_attn.o_proj: 8
+model.layers.32.self_attn.v_proj: 8
+model.layers.9.mlp.down_proj: 8
+model.layers.32.self_attn.k_proj: 8
+model.layers.13.mlp.gate_proj: 8
+model.layers.27.self_attn.v_proj: 8
+model.layers.33.self_attn.o_proj: 8
+model.layers.1.self_attn.v_proj: 8
+model.layers.23.mlp.down_proj: 8
+model.layers.19.self_attn.v_proj: 8
+model.layers.2.self_attn.v_proj: 8
+model.layers.6.mlp.down_proj: 8
+model.layers.31.self_attn.k_proj: 8
+model.layers.24.self_attn.v_proj: 8
+model.layers.30.self_attn.o_proj: 8
+model.layers.24.self_attn.k_proj: 8
+model.layers.4.mlp.up_proj: 8
+model.layers.25.self_attn.o_proj: 8
+model.layers.31.self_attn.v_proj: 8
+model.layers.20.mlp.gate_proj: 8
+model.layers.3.self_attn.o_proj: 8
+model.layers.2.self_attn.k_proj: 8
+model.layers.10.mlp.down_proj: 8
+model.layers.34.mlp.gate_proj: 8
+model.layers.13.self_attn.q_proj: 8
+model.layers.28.mlp.up_proj: 8
+model.layers.27.mlp.up_proj: 8
+model.layers.3.mlp.down_proj: 8
+model.layers.3.self_attn.v_proj: 8
+model.layers.19.mlp.gate_proj: 8
+model.layers.31.self_attn.o_proj: 8
+model.layers.25.self_attn.v_proj: 8
+model.layers.30.self_attn.k_proj: 8
+model.layers.29.mlp.down_proj: 8
+model.layers.18.mlp.up_proj: 8
+model.layers.15.mlp.down_proj: 8
+model.layers.17.mlp.up_proj: 8
+model.layers.31.mlp.gate_proj: 8
+model.layers.12.self_attn.q_proj: 8
+model.layers.30.self_attn.v_proj: 8
+model.layers.24.self_attn.o_proj: 8
+model.layers.25.self_attn.k_proj: 8
+model.layers.25.mlp.gate_proj: 8
+model.layers.3.self_attn.k_proj: 8
+model.layers.2.self_attn.o_proj: 8
+model.layers.2.mlp.down_proj: 8
+model.layers.16.self_attn.o_proj: 8
+model.layers.18.mlp.gate_proj: 8
+model.layers.17.self_attn.k_proj: 8
+model.layers.6.self_attn.q_proj: 8
+model.layers.28.mlp.down_proj: 8
+model.layers.28.self_attn.o_proj: 8
+model.layers.20.self_attn.q_proj: 8
+model.layers.29.self_attn.k_proj: 8
+model.layers.29.self_attn.v_proj: 8
+model.layers.14.mlp.down_proj: 8
+model.layers.35.self_attn.q_proj: 8
+model.layers.30.mlp.gate_proj: 8
+model.layers.24.mlp.gate_proj: 8
+model.layers.14.mlp.up_proj: 8
+model.layers.17.self_attn.v_proj: 8
+model.layers.7.self_attn.q_proj: 8
+model.layers.28.self_attn.k_proj: 8
+model.layers.21.self_attn.q_proj: 8
+model.layers.29.self_attn.o_proj: 8
+model.layers.7.mlp.down_proj: 8
+model.layers.16.self_attn.k_proj: 8
+model.layers.17.self_attn.o_proj: 8
+model.layers.34.mlp.up_proj: 8
+model.layers.21.mlp.gate_proj: 8
+model.layers.16.self_attn.v_proj: 8
+model.layers.24.mlp.up_proj: 8
+model.layers.34.self_attn.q_proj: 8
+model.layers.8.mlp.up_proj: 8
+model.layers.11.mlp.down_proj: 8
+model.layers.7.mlp.up_proj: 8
+model.layers.35.mlp.gate_proj: 8
+model.layers.28.self_attn.v_proj: 8
+model.layers.4.mlp.gate_proj: 8
+model.layers.16.mlp.up_proj: 8
+model.layers.15.self_attn.v_proj: 8
+model.layers.19.mlp.up_proj: 8
+model.layers.8.mlp.down_proj: 8
+model.layers.12.mlp.gate_proj: 8
+model.layers.15.self_attn.k_proj: 8
+model.layers.14.self_attn.o_proj: 8
+model.layers.22.self_attn.q_proj: 8
+model.layers.22.mlp.down_proj: 8
+model.layers.4.self_attn.q_proj: 8
+model.layers.14.self_attn.v_proj: 8
+model.layers.26.mlp.up_proj: 8
+model.layers.29.mlp.up_proj: 8
+model.layers.5.mlp.up_proj: 8
+model.layers.1.mlp.gate_proj: 8
+model.layers.27.mlp.down_proj: 8
+model.layers.23.self_attn.q_proj: 8
+model.layers.5.self_attn.q_proj: 8
+model.layers.33.mlp.down_proj: 8
+model.layers.17.mlp.gate_proj: 8
+model.layers.15.self_attn.o_proj: 8
+model.layers.14.self_attn.k_proj: 8
+model.layers.12.self_attn.k_proj: 8
+model.layers.13.self_attn.o_proj: 8
+model.layers.29.mlp.gate_proj: 8
+model.layers.25.self_attn.q_proj: 8
+model.layers.30.mlp.up_proj: 8
+model.layers.3.mlp.gate_proj: 8
+model.layers.19.mlp.down_proj: 8
+model.layers.3.self_attn.q_proj: 8
+model.layers.20.mlp.up_proj: 8
+model.layers.30.self_attn.q_proj: 8
+model.layers.25.mlp.down_proj: 8
+model.layers.12.self_attn.v_proj: 8
+model.layers.3.mlp.up_proj: 8
+model.layers.31.mlp.down_proj: 8
+model.layers.15.mlp.gate_proj: 8
+model.layers.24.self_attn.q_proj: 8
+model.layers.2.self_attn.q_proj: 8
+model.layers.6.mlp.gate_proj: 8
+model.layers.12.self_attn.o_proj: 8
+model.layers.13.self_attn.k_proj: 8
+model.layers.13.self_attn.v_proj: 8
+model.layers.34.mlp.down_proj: 8
+model.layers.10.mlp.gate_proj: 8
+model.layers.10.mlp.up_proj: 8
+model.layers.20.mlp.down_proj: 8
+model.layers.31.self_attn.q_proj: 8
+model.layers.22.mlp.up_proj: 8
+model.layers.32.self_attn.q_proj: 8
+model.layers.8.self_attn.v_proj: 8
+model.layers.5.mlp.down_proj: 8
+model.layers.10.self_attn.v_proj: 8
+model.layers.1.mlp.up_proj: 8
+model.layers.11.self_attn.o_proj: 8
+model.layers.10.self_attn.k_proj: 8
+model.layers.19.self_attn.q_proj: 8
+model.layers.23.mlp.gate_proj: 8
+model.layers.8.self_attn.k_proj: 8
+model.layers.32.mlp.up_proj: 8
+model.layers.1.self_attn.q_proj: 8
+model.layers.9.self_attn.o_proj: 8
+model.layers.9.mlp.gate_proj: 8
+model.layers.13.mlp.down_proj: 8
+model.layers.27.self_attn.q_proj: 8
+model.layers.0.mlp.down_proj: 8
+model.layers.11.self_attn.v_proj: 8
+model.layers.33.self_attn.q_proj: 8
+model.layers.9.self_attn.v_proj: 8
+model.layers.12.mlp.up_proj: 8
+model.layers.8.self_attn.o_proj: 8
+model.layers.0.self_attn.q_proj: 8
+model.layers.9.self_attn.k_proj: 8
+model.layers.26.self_attn.q_proj: 8
+model.layers.16.mlp.down_proj: 8
+model.layers.32.mlp.gate_proj: 8
+model.layers.18.self_attn.q_proj: 8
+model.layers.11.self_attn.k_proj: 8
+model.layers.10.self_attn.o_proj: 8
+model.layers.26.mlp.gate_proj: 8