bearzi committed on
Commit
ffd54db
·
verified ·
1 Parent(s): 403d37d

Upload Trinity-Mini-oQ4

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: arcee-ai/Trinity-Mini
3
+ library_name: mlx
4
+ pipeline_tag: text-generation
5
+ license: apache-2.0
6
+ tags:
7
+ - mlx
8
+ - omlx
9
+ - oq
10
+ - oq4
11
+ - quantized
12
+ ---
13
+
14
+ # Trinity-Mini-oQ4
15
+
16
+ oQ4 mixed-precision MLX quantization of [arcee-ai/Trinity-Mini](https://huggingface.co/arcee-ai/Trinity-Mini), produced via [oMLX](https://github.com/jundot/omlx).
17
+
18
+ - **Quantization:** oQ4 (sensitivity-driven mixed precision, group_size=64)
19
+ - **Format:** MLX safetensors
20
+ - **Compatible with:** mlx-lm, mlx-vlm, oMLX on Apple Silicon
21
+
22
+ ## Usage
23
+
24
+ ```python
25
+ from mlx_lm import load, generate
26
+ model, tokenizer = load("bearzi/Trinity-Mini-oQ4")
27
+ prompt = tokenizer.apply_chat_template(
28
+ [{"role": "user", "content": "Hello"}],
29
+ add_generation_prompt=True,
30
+ )
31
+ text = generate(model, tokenizer, prompt=prompt, max_tokens=512, verbose=True)
32
+ ```
33
+
34
+ ## About oQ
35
+
36
+ oQ measures per-layer quantization sensitivity through calibration and allocates bits where they matter most — critical layers stay at higher precision, tolerant layers compress aggressively. Target averages of 2/3/4/6/8 bits are provided; actual per-layer bits vary by measured sensitivity.
37
+
38
+ See [oQ documentation](https://github.com/jundot/omlx/blob/main/docs/oQ_Quantization.md).
39
+
40
+ Comparative benchmarks and feedback are welcome — please open a discussion on this repository.
chat_template.jinja ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- for message in messages %}
18
+ {%- if message.content is string %}
19
+ {%- set content = message.content %}
20
+ {%- else %}
21
+ {%- set content = '' %}
22
+ {%- endif %}
23
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
24
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
25
+ {%- elif message.role == "assistant" %}
26
+ {%- if '</think>' in content %}
27
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
28
+ {%- endif %}
29
+ {{- '<|im_start|>' + message.role + '\n' }}
30
+ {% generation %}
31
+ {{- content }}
32
+ {%- if message.tool_calls %}
33
+ {%- for tool_call in message.tool_calls %}
34
+ {%- if (loop.first and content) or (not loop.first) %}
35
+ {{- '\n' }}
36
+ {%- endif %}
37
+ {%- if tool_call.function %}
38
+ {%- set tool_call = tool_call.function %}
39
+ {%- endif %}
40
+ {{- '<tool_call>\n{"name": "' }}
41
+ {{- tool_call.name }}
42
+ {{- '", "arguments": ' }}
43
+ {%- if tool_call.arguments is string %}
44
+ {{- tool_call.arguments }}
45
+ {%- else %}
46
+ {{- tool_call.arguments | tojson }}
47
+ {%- endif %}
48
+ {{- '}\n</tool_call>' }}
49
+ {%- endfor %}
50
+ {%- endif %}
51
+ {{- '<|im_end|>' }}
52
+ {% endgeneration %}
53
+ {{- '\n' }}
54
+ {%- elif message.role == "tool" %}
55
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
56
+ {{- '<|im_start|>user' }}
57
+ {%- endif %}
58
+ {{- '\n<tool_response>\n' }}
59
+ {{- content }}
60
+ {{- '\n</tool_response>' }}
61
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
62
+ {{- '<|im_end|>\n' }}
63
+ {%- endif %}
64
+ {%- endif %}
65
+ {%- endfor %}
66
+ {%- if add_generation_prompt %}
67
+ {{- '<|im_start|>assistant\n<think>\n' }}
68
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,2620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "AfmoeForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_afmoe.AfmoeConfig",
8
+ "AutoModel": "modeling_afmoe.AfmoeModel",
9
+ "AutoModelForCausalLM": "modeling_afmoe.AfmoeForCausalLM"
10
+ },
11
+ "dtype": "bfloat16",
12
+ "global_attn_every_n_layers": 4,
13
+ "head_dim": 128,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 2048,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 6144,
18
+ "layer_types": [
19
+ "sliding_attention",
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "full_attention",
23
+ "sliding_attention",
24
+ "sliding_attention",
25
+ "sliding_attention",
26
+ "full_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "full_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "sliding_attention",
38
+ "full_attention",
39
+ "sliding_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "full_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "full_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "sliding_attention",
50
+ "full_attention"
51
+ ],
52
+ "load_balance_coeff": 0.001,
53
+ "max_position_embeddings": 131072,
54
+ "model_type": "afmoe",
55
+ "moe_intermediate_size": 1024,
56
+ "mup_enabled": true,
57
+ "n_group": 1,
58
+ "num_attention_heads": 32,
59
+ "num_dense_layers": 2,
60
+ "num_expert_groups": 1,
61
+ "num_experts": 128,
62
+ "num_experts_per_tok": 8,
63
+ "num_hidden_layers": 32,
64
+ "num_key_value_heads": 4,
65
+ "num_limited_groups": 1,
66
+ "num_shared_experts": 1,
67
+ "rms_norm_eps": 1e-05,
68
+ "rope_scaling": null,
69
+ "rope_theta": 10000,
70
+ "route_norm": true,
71
+ "route_scale": 2.826,
72
+ "score_func": "sigmoid",
73
+ "sliding_window": 2048,
74
+ "tie_word_embeddings": false,
75
+ "topk_group": 1,
76
+ "transformers_version": "4.57.3",
77
+ "use_cache": true,
78
+ "use_grouped_mm": true,
79
+ "vocab_size": 200192,
80
+ "quantization": {
81
+ "group_size": 64,
82
+ "bits": 4,
83
+ "mode": "affine",
84
+ "lm_head": {
85
+ "bits": 8,
86
+ "group_size": 64,
87
+ "mode": "affine"
88
+ },
89
+ "model.embed_tokens": {
90
+ "bits": 8,
91
+ "group_size": 64,
92
+ "mode": "affine"
93
+ },
94
+ "model.layers.0.mlp.down_proj": {
95
+ "bits": 8,
96
+ "group_size": 64,
97
+ "mode": "affine"
98
+ },
99
+ "model.layers.0.mlp.gate_proj": {
100
+ "bits": 8,
101
+ "group_size": 64,
102
+ "mode": "affine"
103
+ },
104
+ "model.layers.0.mlp.up_proj": {
105
+ "bits": 8,
106
+ "group_size": 64,
107
+ "mode": "affine"
108
+ },
109
+ "model.layers.0.self_attn.gate_proj": {
110
+ "bits": 8,
111
+ "group_size": 64,
112
+ "mode": "affine"
113
+ },
114
+ "model.layers.0.self_attn.k_proj": {
115
+ "bits": 8,
116
+ "group_size": 64,
117
+ "mode": "affine"
118
+ },
119
+ "model.layers.0.self_attn.o_proj": {
120
+ "bits": 8,
121
+ "group_size": 64,
122
+ "mode": "affine"
123
+ },
124
+ "model.layers.0.self_attn.q_proj": {
125
+ "bits": 8,
126
+ "group_size": 64,
127
+ "mode": "affine"
128
+ },
129
+ "model.layers.0.self_attn.v_proj": {
130
+ "bits": 8,
131
+ "group_size": 64,
132
+ "mode": "affine"
133
+ },
134
+ "model.layers.1.mlp.down_proj": {
135
+ "bits": 6,
136
+ "group_size": 64,
137
+ "mode": "affine"
138
+ },
139
+ "model.layers.1.mlp.gate_proj": {
140
+ "bits": 5,
141
+ "group_size": 64,
142
+ "mode": "affine"
143
+ },
144
+ "model.layers.1.mlp.up_proj": {
145
+ "bits": 5,
146
+ "group_size": 64,
147
+ "mode": "affine"
148
+ },
149
+ "model.layers.1.self_attn.gate_proj": {
150
+ "bits": 5,
151
+ "group_size": 64,
152
+ "mode": "affine"
153
+ },
154
+ "model.layers.1.self_attn.k_proj": {
155
+ "bits": 6,
156
+ "group_size": 64,
157
+ "mode": "affine"
158
+ },
159
+ "model.layers.1.self_attn.o_proj": {
160
+ "bits": 5,
161
+ "group_size": 64,
162
+ "mode": "affine"
163
+ },
164
+ "model.layers.1.self_attn.q_proj": {
165
+ "bits": 6,
166
+ "group_size": 64,
167
+ "mode": "affine"
168
+ },
169
+ "model.layers.1.self_attn.v_proj": {
170
+ "bits": 6,
171
+ "group_size": 64,
172
+ "mode": "affine"
173
+ },
174
+ "model.layers.10.mlp.shared_experts.down_proj": {
175
+ "bits": 8,
176
+ "group_size": 64,
177
+ "mode": "affine"
178
+ },
179
+ "model.layers.10.mlp.shared_experts.gate_proj": {
180
+ "bits": 8,
181
+ "group_size": 64,
182
+ "mode": "affine"
183
+ },
184
+ "model.layers.10.mlp.shared_experts.up_proj": {
185
+ "bits": 8,
186
+ "group_size": 64,
187
+ "mode": "affine"
188
+ },
189
+ "model.layers.10.self_attn.gate_proj": {
190
+ "bits": 5,
191
+ "group_size": 64,
192
+ "mode": "affine"
193
+ },
194
+ "model.layers.10.self_attn.k_proj": {
195
+ "bits": 5,
196
+ "group_size": 64,
197
+ "mode": "affine"
198
+ },
199
+ "model.layers.10.self_attn.o_proj": {
200
+ "bits": 5,
201
+ "group_size": 64,
202
+ "mode": "affine"
203
+ },
204
+ "model.layers.10.self_attn.q_proj": {
205
+ "bits": 5,
206
+ "group_size": 64,
207
+ "mode": "affine"
208
+ },
209
+ "model.layers.10.self_attn.v_proj": {
210
+ "bits": 5,
211
+ "group_size": 64,
212
+ "mode": "affine"
213
+ },
214
+ "model.layers.11.mlp.shared_experts.down_proj": {
215
+ "bits": 8,
216
+ "group_size": 64,
217
+ "mode": "affine"
218
+ },
219
+ "model.layers.11.mlp.shared_experts.gate_proj": {
220
+ "bits": 8,
221
+ "group_size": 64,
222
+ "mode": "affine"
223
+ },
224
+ "model.layers.11.mlp.shared_experts.up_proj": {
225
+ "bits": 8,
226
+ "group_size": 64,
227
+ "mode": "affine"
228
+ },
229
+ "model.layers.11.self_attn.gate_proj": {
230
+ "bits": 5,
231
+ "group_size": 64,
232
+ "mode": "affine"
233
+ },
234
+ "model.layers.11.self_attn.k_proj": {
235
+ "bits": 5,
236
+ "group_size": 64,
237
+ "mode": "affine"
238
+ },
239
+ "model.layers.11.self_attn.o_proj": {
240
+ "bits": 5,
241
+ "group_size": 64,
242
+ "mode": "affine"
243
+ },
244
+ "model.layers.11.self_attn.q_proj": {
245
+ "bits": 5,
246
+ "group_size": 64,
247
+ "mode": "affine"
248
+ },
249
+ "model.layers.11.self_attn.v_proj": {
250
+ "bits": 5,
251
+ "group_size": 64,
252
+ "mode": "affine"
253
+ },
254
+ "model.layers.12.mlp.shared_experts.down_proj": {
255
+ "bits": 8,
256
+ "group_size": 64,
257
+ "mode": "affine"
258
+ },
259
+ "model.layers.12.mlp.shared_experts.gate_proj": {
260
+ "bits": 8,
261
+ "group_size": 64,
262
+ "mode": "affine"
263
+ },
264
+ "model.layers.12.mlp.shared_experts.up_proj": {
265
+ "bits": 8,
266
+ "group_size": 64,
267
+ "mode": "affine"
268
+ },
269
+ "model.layers.12.self_attn.gate_proj": {
270
+ "bits": 5,
271
+ "group_size": 64,
272
+ "mode": "affine"
273
+ },
274
+ "model.layers.12.self_attn.k_proj": {
275
+ "bits": 5,
276
+ "group_size": 64,
277
+ "mode": "affine"
278
+ },
279
+ "model.layers.12.self_attn.o_proj": {
280
+ "bits": 5,
281
+ "group_size": 64,
282
+ "mode": "affine"
283
+ },
284
+ "model.layers.12.self_attn.q_proj": {
285
+ "bits": 5,
286
+ "group_size": 64,
287
+ "mode": "affine"
288
+ },
289
+ "model.layers.12.self_attn.v_proj": {
290
+ "bits": 5,
291
+ "group_size": 64,
292
+ "mode": "affine"
293
+ },
294
+ "model.layers.13.mlp.shared_experts.down_proj": {
295
+ "bits": 8,
296
+ "group_size": 64,
297
+ "mode": "affine"
298
+ },
299
+ "model.layers.13.mlp.shared_experts.gate_proj": {
300
+ "bits": 8,
301
+ "group_size": 64,
302
+ "mode": "affine"
303
+ },
304
+ "model.layers.13.mlp.shared_experts.up_proj": {
305
+ "bits": 8,
306
+ "group_size": 64,
307
+ "mode": "affine"
308
+ },
309
+ "model.layers.13.self_attn.gate_proj": {
310
+ "bits": 5,
311
+ "group_size": 64,
312
+ "mode": "affine"
313
+ },
314
+ "model.layers.13.self_attn.k_proj": {
315
+ "bits": 5,
316
+ "group_size": 64,
317
+ "mode": "affine"
318
+ },
319
+ "model.layers.13.self_attn.o_proj": {
320
+ "bits": 5,
321
+ "group_size": 64,
322
+ "mode": "affine"
323
+ },
324
+ "model.layers.13.self_attn.q_proj": {
325
+ "bits": 5,
326
+ "group_size": 64,
327
+ "mode": "affine"
328
+ },
329
+ "model.layers.13.self_attn.v_proj": {
330
+ "bits": 5,
331
+ "group_size": 64,
332
+ "mode": "affine"
333
+ },
334
+ "model.layers.14.mlp.shared_experts.down_proj": {
335
+ "bits": 8,
336
+ "group_size": 64,
337
+ "mode": "affine"
338
+ },
339
+ "model.layers.14.mlp.shared_experts.gate_proj": {
340
+ "bits": 8,
341
+ "group_size": 64,
342
+ "mode": "affine"
343
+ },
344
+ "model.layers.14.mlp.shared_experts.up_proj": {
345
+ "bits": 8,
346
+ "group_size": 64,
347
+ "mode": "affine"
348
+ },
349
+ "model.layers.14.self_attn.gate_proj": {
350
+ "bits": 5,
351
+ "group_size": 64,
352
+ "mode": "affine"
353
+ },
354
+ "model.layers.14.self_attn.k_proj": {
355
+ "bits": 5,
356
+ "group_size": 64,
357
+ "mode": "affine"
358
+ },
359
+ "model.layers.14.self_attn.o_proj": {
360
+ "bits": 5,
361
+ "group_size": 64,
362
+ "mode": "affine"
363
+ },
364
+ "model.layers.14.self_attn.q_proj": {
365
+ "bits": 5,
366
+ "group_size": 64,
367
+ "mode": "affine"
368
+ },
369
+ "model.layers.14.self_attn.v_proj": {
370
+ "bits": 5,
371
+ "group_size": 64,
372
+ "mode": "affine"
373
+ },
374
+ "model.layers.15.mlp.shared_experts.down_proj": {
375
+ "bits": 8,
376
+ "group_size": 64,
377
+ "mode": "affine"
378
+ },
379
+ "model.layers.15.mlp.shared_experts.gate_proj": {
380
+ "bits": 8,
381
+ "group_size": 64,
382
+ "mode": "affine"
383
+ },
384
+ "model.layers.15.mlp.shared_experts.up_proj": {
385
+ "bits": 8,
386
+ "group_size": 64,
387
+ "mode": "affine"
388
+ },
389
+ "model.layers.15.self_attn.gate_proj": {
390
+ "bits": 5,
391
+ "group_size": 64,
392
+ "mode": "affine"
393
+ },
394
+ "model.layers.15.self_attn.k_proj": {
395
+ "bits": 5,
396
+ "group_size": 64,
397
+ "mode": "affine"
398
+ },
399
+ "model.layers.15.self_attn.o_proj": {
400
+ "bits": 5,
401
+ "group_size": 64,
402
+ "mode": "affine"
403
+ },
404
+ "model.layers.15.self_attn.q_proj": {
405
+ "bits": 5,
406
+ "group_size": 64,
407
+ "mode": "affine"
408
+ },
409
+ "model.layers.15.self_attn.v_proj": {
410
+ "bits": 5,
411
+ "group_size": 64,
412
+ "mode": "affine"
413
+ },
414
+ "model.layers.16.mlp.shared_experts.down_proj": {
415
+ "bits": 8,
416
+ "group_size": 64,
417
+ "mode": "affine"
418
+ },
419
+ "model.layers.16.mlp.shared_experts.gate_proj": {
420
+ "bits": 8,
421
+ "group_size": 64,
422
+ "mode": "affine"
423
+ },
424
+ "model.layers.16.mlp.shared_experts.up_proj": {
425
+ "bits": 8,
426
+ "group_size": 64,
427
+ "mode": "affine"
428
+ },
429
+ "model.layers.16.self_attn.gate_proj": {
430
+ "bits": 5,
431
+ "group_size": 64,
432
+ "mode": "affine"
433
+ },
434
+ "model.layers.16.self_attn.k_proj": {
435
+ "bits": 5,
436
+ "group_size": 64,
437
+ "mode": "affine"
438
+ },
439
+ "model.layers.16.self_attn.o_proj": {
440
+ "bits": 5,
441
+ "group_size": 64,
442
+ "mode": "affine"
443
+ },
444
+ "model.layers.16.self_attn.q_proj": {
445
+ "bits": 5,
446
+ "group_size": 64,
447
+ "mode": "affine"
448
+ },
449
+ "model.layers.16.self_attn.v_proj": {
450
+ "bits": 5,
451
+ "group_size": 64,
452
+ "mode": "affine"
453
+ },
454
+ "model.layers.17.mlp.shared_experts.down_proj": {
455
+ "bits": 8,
456
+ "group_size": 64,
457
+ "mode": "affine"
458
+ },
459
+ "model.layers.17.mlp.shared_experts.gate_proj": {
460
+ "bits": 8,
461
+ "group_size": 64,
462
+ "mode": "affine"
463
+ },
464
+ "model.layers.17.mlp.shared_experts.up_proj": {
465
+ "bits": 8,
466
+ "group_size": 64,
467
+ "mode": "affine"
468
+ },
469
+ "model.layers.17.self_attn.gate_proj": {
470
+ "bits": 5,
471
+ "group_size": 64,
472
+ "mode": "affine"
473
+ },
474
+ "model.layers.17.self_attn.k_proj": {
475
+ "bits": 5,
476
+ "group_size": 64,
477
+ "mode": "affine"
478
+ },
479
+ "model.layers.17.self_attn.o_proj": {
480
+ "bits": 5,
481
+ "group_size": 64,
482
+ "mode": "affine"
483
+ },
484
+ "model.layers.17.self_attn.q_proj": {
485
+ "bits": 5,
486
+ "group_size": 64,
487
+ "mode": "affine"
488
+ },
489
+ "model.layers.17.self_attn.v_proj": {
490
+ "bits": 5,
491
+ "group_size": 64,
492
+ "mode": "affine"
493
+ },
494
+ "model.layers.18.mlp.shared_experts.down_proj": {
495
+ "bits": 8,
496
+ "group_size": 64,
497
+ "mode": "affine"
498
+ },
499
+ "model.layers.18.mlp.shared_experts.gate_proj": {
500
+ "bits": 8,
501
+ "group_size": 64,
502
+ "mode": "affine"
503
+ },
504
+ "model.layers.18.mlp.shared_experts.up_proj": {
505
+ "bits": 8,
506
+ "group_size": 64,
507
+ "mode": "affine"
508
+ },
509
+ "model.layers.18.self_attn.gate_proj": {
510
+ "bits": 5,
511
+ "group_size": 64,
512
+ "mode": "affine"
513
+ },
514
+ "model.layers.18.self_attn.k_proj": {
515
+ "bits": 5,
516
+ "group_size": 64,
517
+ "mode": "affine"
518
+ },
519
+ "model.layers.18.self_attn.o_proj": {
520
+ "bits": 5,
521
+ "group_size": 64,
522
+ "mode": "affine"
523
+ },
524
+ "model.layers.18.self_attn.q_proj": {
525
+ "bits": 5,
526
+ "group_size": 64,
527
+ "mode": "affine"
528
+ },
529
+ "model.layers.18.self_attn.v_proj": {
530
+ "bits": 5,
531
+ "group_size": 64,
532
+ "mode": "affine"
533
+ },
534
+ "model.layers.19.mlp.shared_experts.down_proj": {
535
+ "bits": 8,
536
+ "group_size": 64,
537
+ "mode": "affine"
538
+ },
539
+ "model.layers.19.mlp.shared_experts.gate_proj": {
540
+ "bits": 8,
541
+ "group_size": 64,
542
+ "mode": "affine"
543
+ },
544
+ "model.layers.19.mlp.shared_experts.up_proj": {
545
+ "bits": 8,
546
+ "group_size": 64,
547
+ "mode": "affine"
548
+ },
549
+ "model.layers.19.self_attn.gate_proj": {
550
+ "bits": 5,
551
+ "group_size": 64,
552
+ "mode": "affine"
553
+ },
554
+ "model.layers.19.self_attn.k_proj": {
555
+ "bits": 5,
556
+ "group_size": 64,
557
+ "mode": "affine"
558
+ },
559
+ "model.layers.19.self_attn.o_proj": {
560
+ "bits": 5,
561
+ "group_size": 64,
562
+ "mode": "affine"
563
+ },
564
+ "model.layers.19.self_attn.q_proj": {
565
+ "bits": 5,
566
+ "group_size": 64,
567
+ "mode": "affine"
568
+ },
569
+ "model.layers.19.self_attn.v_proj": {
570
+ "bits": 5,
571
+ "group_size": 64,
572
+ "mode": "affine"
573
+ },
574
+ "model.layers.2.mlp.shared_experts.down_proj": {
575
+ "bits": 8,
576
+ "group_size": 64,
577
+ "mode": "affine"
578
+ },
579
+ "model.layers.2.mlp.shared_experts.gate_proj": {
580
+ "bits": 8,
581
+ "group_size": 64,
582
+ "mode": "affine"
583
+ },
584
+ "model.layers.2.mlp.shared_experts.up_proj": {
585
+ "bits": 8,
586
+ "group_size": 64,
587
+ "mode": "affine"
588
+ },
589
+ "model.layers.2.self_attn.gate_proj": {
590
+ "bits": 5,
591
+ "group_size": 64,
592
+ "mode": "affine"
593
+ },
594
+ "model.layers.2.self_attn.k_proj": {
595
+ "bits": 5,
596
+ "group_size": 64,
597
+ "mode": "affine"
598
+ },
599
+ "model.layers.2.self_attn.o_proj": {
600
+ "bits": 5,
601
+ "group_size": 64,
602
+ "mode": "affine"
603
+ },
604
+ "model.layers.2.self_attn.q_proj": {
605
+ "bits": 5,
606
+ "group_size": 64,
607
+ "mode": "affine"
608
+ },
609
+ "model.layers.2.self_attn.v_proj": {
610
+ "bits": 5,
611
+ "group_size": 64,
612
+ "mode": "affine"
613
+ },
614
+ "model.layers.20.mlp.shared_experts.down_proj": {
615
+ "bits": 8,
616
+ "group_size": 64,
617
+ "mode": "affine"
618
+ },
619
+ "model.layers.20.mlp.shared_experts.gate_proj": {
620
+ "bits": 8,
621
+ "group_size": 64,
622
+ "mode": "affine"
623
+ },
624
+ "model.layers.20.mlp.shared_experts.up_proj": {
625
+ "bits": 8,
626
+ "group_size": 64,
627
+ "mode": "affine"
628
+ },
629
+ "model.layers.20.self_attn.gate_proj": {
630
+ "bits": 5,
631
+ "group_size": 64,
632
+ "mode": "affine"
633
+ },
634
+ "model.layers.20.self_attn.k_proj": {
635
+ "bits": 5,
636
+ "group_size": 64,
637
+ "mode": "affine"
638
+ },
639
+ "model.layers.20.self_attn.o_proj": {
640
+ "bits": 5,
641
+ "group_size": 64,
642
+ "mode": "affine"
643
+ },
644
+ "model.layers.20.self_attn.q_proj": {
645
+ "bits": 5,
646
+ "group_size": 64,
647
+ "mode": "affine"
648
+ },
649
+ "model.layers.20.self_attn.v_proj": {
650
+ "bits": 5,
651
+ "group_size": 64,
652
+ "mode": "affine"
653
+ },
654
+ "model.layers.21.mlp.shared_experts.down_proj": {
655
+ "bits": 8,
656
+ "group_size": 64,
657
+ "mode": "affine"
658
+ },
659
+ "model.layers.21.mlp.shared_experts.gate_proj": {
660
+ "bits": 8,
661
+ "group_size": 64,
662
+ "mode": "affine"
663
+ },
664
+ "model.layers.21.mlp.shared_experts.up_proj": {
665
+ "bits": 8,
666
+ "group_size": 64,
667
+ "mode": "affine"
668
+ },
669
+ "model.layers.21.self_attn.gate_proj": {
670
+ "bits": 5,
671
+ "group_size": 64,
672
+ "mode": "affine"
673
+ },
674
+ "model.layers.21.self_attn.k_proj": {
675
+ "bits": 5,
676
+ "group_size": 64,
677
+ "mode": "affine"
678
+ },
679
+ "model.layers.21.self_attn.o_proj": {
680
+ "bits": 5,
681
+ "group_size": 64,
682
+ "mode": "affine"
683
+ },
684
+ "model.layers.21.self_attn.q_proj": {
685
+ "bits": 5,
686
+ "group_size": 64,
687
+ "mode": "affine"
688
+ },
689
+ "model.layers.21.self_attn.v_proj": {
690
+ "bits": 5,
691
+ "group_size": 64,
692
+ "mode": "affine"
693
+ },
694
+ "model.layers.22.mlp.shared_experts.down_proj": {
695
+ "bits": 8,
696
+ "group_size": 64,
697
+ "mode": "affine"
698
+ },
699
+ "model.layers.22.mlp.shared_experts.gate_proj": {
700
+ "bits": 8,
701
+ "group_size": 64,
702
+ "mode": "affine"
703
+ },
704
+ "model.layers.22.mlp.shared_experts.up_proj": {
705
+ "bits": 8,
706
+ "group_size": 64,
707
+ "mode": "affine"
708
+ },
709
+ "model.layers.22.self_attn.gate_proj": {
710
+ "bits": 5,
711
+ "group_size": 64,
712
+ "mode": "affine"
713
+ },
714
+ "model.layers.22.self_attn.k_proj": {
715
+ "bits": 5,
716
+ "group_size": 64,
717
+ "mode": "affine"
718
+ },
719
+ "model.layers.22.self_attn.o_proj": {
720
+ "bits": 5,
721
+ "group_size": 64,
722
+ "mode": "affine"
723
+ },
724
+ "model.layers.22.self_attn.q_proj": {
725
+ "bits": 5,
726
+ "group_size": 64,
727
+ "mode": "affine"
728
+ },
729
+ "model.layers.22.self_attn.v_proj": {
730
+ "bits": 5,
731
+ "group_size": 64,
732
+ "mode": "affine"
733
+ },
734
+ "model.layers.23.mlp.shared_experts.down_proj": {
735
+ "bits": 8,
736
+ "group_size": 64,
737
+ "mode": "affine"
738
+ },
739
+ "model.layers.23.mlp.shared_experts.gate_proj": {
740
+ "bits": 8,
741
+ "group_size": 64,
742
+ "mode": "affine"
743
+ },
744
+ "model.layers.23.mlp.shared_experts.up_proj": {
745
+ "bits": 8,
746
+ "group_size": 64,
747
+ "mode": "affine"
748
+ },
749
+ "model.layers.23.self_attn.gate_proj": {
750
+ "bits": 5,
751
+ "group_size": 64,
752
+ "mode": "affine"
753
+ },
754
+ "model.layers.23.self_attn.k_proj": {
755
+ "bits": 5,
756
+ "group_size": 64,
757
+ "mode": "affine"
758
+ },
759
+ "model.layers.23.self_attn.o_proj": {
760
+ "bits": 5,
761
+ "group_size": 64,
762
+ "mode": "affine"
763
+ },
764
+ "model.layers.23.self_attn.q_proj": {
765
+ "bits": 5,
766
+ "group_size": 64,
767
+ "mode": "affine"
768
+ },
769
+ "model.layers.23.self_attn.v_proj": {
770
+ "bits": 5,
771
+ "group_size": 64,
772
+ "mode": "affine"
773
+ },
774
+ "model.layers.24.mlp.shared_experts.down_proj": {
775
+ "bits": 8,
776
+ "group_size": 64,
777
+ "mode": "affine"
778
+ },
779
+ "model.layers.24.mlp.shared_experts.gate_proj": {
780
+ "bits": 8,
781
+ "group_size": 64,
782
+ "mode": "affine"
783
+ },
784
+ "model.layers.24.mlp.shared_experts.up_proj": {
785
+ "bits": 8,
786
+ "group_size": 64,
787
+ "mode": "affine"
788
+ },
789
+ "model.layers.24.self_attn.gate_proj": {
790
+ "bits": 5,
791
+ "group_size": 64,
792
+ "mode": "affine"
793
+ },
794
+ "model.layers.24.self_attn.k_proj": {
795
+ "bits": 5,
796
+ "group_size": 64,
797
+ "mode": "affine"
798
+ },
799
+ "model.layers.24.self_attn.o_proj": {
800
+ "bits": 5,
801
+ "group_size": 64,
802
+ "mode": "affine"
803
+ },
804
+ "model.layers.24.self_attn.q_proj": {
805
+ "bits": 5,
806
+ "group_size": 64,
807
+ "mode": "affine"
808
+ },
809
+ "model.layers.24.self_attn.v_proj": {
810
+ "bits": 5,
811
+ "group_size": 64,
812
+ "mode": "affine"
813
+ },
814
+ "model.layers.25.mlp.shared_experts.down_proj": {
815
+ "bits": 8,
816
+ "group_size": 64,
817
+ "mode": "affine"
818
+ },
819
+ "model.layers.25.mlp.shared_experts.gate_proj": {
820
+ "bits": 8,
821
+ "group_size": 64,
822
+ "mode": "affine"
823
+ },
824
+ "model.layers.25.mlp.shared_experts.up_proj": {
825
+ "bits": 8,
826
+ "group_size": 64,
827
+ "mode": "affine"
828
+ },
829
+ "model.layers.25.self_attn.gate_proj": {
830
+ "bits": 5,
831
+ "group_size": 64,
832
+ "mode": "affine"
833
+ },
834
+ "model.layers.25.self_attn.k_proj": {
835
+ "bits": 5,
836
+ "group_size": 64,
837
+ "mode": "affine"
838
+ },
839
+ "model.layers.25.self_attn.o_proj": {
840
+ "bits": 5,
841
+ "group_size": 64,
842
+ "mode": "affine"
843
+ },
844
+ "model.layers.25.self_attn.q_proj": {
845
+ "bits": 5,
846
+ "group_size": 64,
847
+ "mode": "affine"
848
+ },
849
+ "model.layers.25.self_attn.v_proj": {
850
+ "bits": 5,
851
+ "group_size": 64,
852
+ "mode": "affine"
853
+ },
854
+ "model.layers.26.mlp.shared_experts.down_proj": {
855
+ "bits": 8,
856
+ "group_size": 64,
857
+ "mode": "affine"
858
+ },
859
+ "model.layers.26.mlp.shared_experts.gate_proj": {
860
+ "bits": 8,
861
+ "group_size": 64,
862
+ "mode": "affine"
863
+ },
864
+ "model.layers.26.mlp.shared_experts.up_proj": {
865
+ "bits": 8,
866
+ "group_size": 64,
867
+ "mode": "affine"
868
+ },
869
+ "model.layers.26.self_attn.gate_proj": {
870
+ "bits": 5,
871
+ "group_size": 64,
872
+ "mode": "affine"
873
+ },
874
+ "model.layers.26.self_attn.k_proj": {
875
+ "bits": 6,
876
+ "group_size": 64,
877
+ "mode": "affine"
878
+ },
879
+ "model.layers.26.self_attn.o_proj": {
880
+ "bits": 5,
881
+ "group_size": 64,
882
+ "mode": "affine"
883
+ },
884
+ "model.layers.26.self_attn.q_proj": {
885
+ "bits": 6,
886
+ "group_size": 64,
887
+ "mode": "affine"
888
+ },
889
+ "model.layers.26.self_attn.v_proj": {
890
+ "bits": 6,
891
+ "group_size": 64,
892
+ "mode": "affine"
893
+ },
894
+ "model.layers.27.mlp.shared_experts.down_proj": {
895
+ "bits": 8,
896
+ "group_size": 64,
897
+ "mode": "affine"
898
+ },
899
+ "model.layers.27.mlp.shared_experts.gate_proj": {
900
+ "bits": 8,
901
+ "group_size": 64,
902
+ "mode": "affine"
903
+ },
904
+ "model.layers.27.mlp.shared_experts.up_proj": {
905
+ "bits": 8,
906
+ "group_size": 64,
907
+ "mode": "affine"
908
+ },
909
+ "model.layers.27.self_attn.gate_proj": {
910
+ "bits": 5,
911
+ "group_size": 64,
912
+ "mode": "affine"
913
+ },
914
+ "model.layers.27.self_attn.k_proj": {
915
+ "bits": 6,
916
+ "group_size": 64,
917
+ "mode": "affine"
918
+ },
919
+ "model.layers.27.self_attn.o_proj": {
920
+ "bits": 5,
921
+ "group_size": 64,
922
+ "mode": "affine"
923
+ },
924
+ "model.layers.27.self_attn.q_proj": {
925
+ "bits": 6,
926
+ "group_size": 64,
927
+ "mode": "affine"
928
+ },
929
+ "model.layers.27.self_attn.v_proj": {
930
+ "bits": 6,
931
+ "group_size": 64,
932
+ "mode": "affine"
933
+ },
934
+ "model.layers.28.mlp.shared_experts.down_proj": {
935
+ "bits": 8,
936
+ "group_size": 64,
937
+ "mode": "affine"
938
+ },
939
+ "model.layers.28.mlp.shared_experts.gate_proj": {
940
+ "bits": 8,
941
+ "group_size": 64,
942
+ "mode": "affine"
943
+ },
944
+ "model.layers.28.mlp.shared_experts.up_proj": {
945
+ "bits": 8,
946
+ "group_size": 64,
947
+ "mode": "affine"
948
+ },
949
+ "model.layers.28.self_attn.gate_proj": {
950
+ "bits": 5,
951
+ "group_size": 64,
952
+ "mode": "affine"
953
+ },
954
+ "model.layers.28.self_attn.k_proj": {
955
+ "bits": 6,
956
+ "group_size": 64,
957
+ "mode": "affine"
958
+ },
959
+ "model.layers.28.self_attn.o_proj": {
960
+ "bits": 5,
961
+ "group_size": 64,
962
+ "mode": "affine"
963
+ },
964
+ "model.layers.28.self_attn.q_proj": {
965
+ "bits": 6,
966
+ "group_size": 64,
967
+ "mode": "affine"
968
+ },
969
+ "model.layers.28.self_attn.v_proj": {
970
+ "bits": 6,
971
+ "group_size": 64,
972
+ "mode": "affine"
973
+ },
974
+ "model.layers.29.mlp.shared_experts.down_proj": {
975
+ "bits": 8,
976
+ "group_size": 64,
977
+ "mode": "affine"
978
+ },
979
+ "model.layers.29.mlp.shared_experts.gate_proj": {
980
+ "bits": 8,
981
+ "group_size": 64,
982
+ "mode": "affine"
983
+ },
984
+ "model.layers.29.mlp.shared_experts.up_proj": {
985
+ "bits": 8,
986
+ "group_size": 64,
987
+ "mode": "affine"
988
+ },
989
+ "model.layers.29.self_attn.gate_proj": {
990
+ "bits": 5,
991
+ "group_size": 64,
992
+ "mode": "affine"
993
+ },
994
+ "model.layers.29.self_attn.k_proj": {
995
+ "bits": 6,
996
+ "group_size": 64,
997
+ "mode": "affine"
998
+ },
999
+ "model.layers.29.self_attn.o_proj": {
1000
+ "bits": 5,
1001
+ "group_size": 64,
1002
+ "mode": "affine"
1003
+ },
1004
+ "model.layers.29.self_attn.q_proj": {
1005
+ "bits": 6,
1006
+ "group_size": 64,
1007
+ "mode": "affine"
1008
+ },
1009
+ "model.layers.29.self_attn.v_proj": {
1010
+ "bits": 6,
1011
+ "group_size": 64,
1012
+ "mode": "affine"
1013
+ },
1014
+ "model.layers.3.mlp.shared_experts.down_proj": {
1015
+ "bits": 8,
1016
+ "group_size": 64,
1017
+ "mode": "affine"
1018
+ },
1019
+ "model.layers.3.mlp.shared_experts.gate_proj": {
1020
+ "bits": 8,
1021
+ "group_size": 64,
1022
+ "mode": "affine"
1023
+ },
1024
+ "model.layers.3.mlp.shared_experts.up_proj": {
1025
+ "bits": 8,
1026
+ "group_size": 64,
1027
+ "mode": "affine"
1028
+ },
1029
+ "model.layers.3.self_attn.k_proj": {
1030
+ "bits": 5,
1031
+ "group_size": 64,
1032
+ "mode": "affine"
1033
+ },
1034
+ "model.layers.3.self_attn.v_proj": {
1035
+ "bits": 5,
1036
+ "group_size": 64,
1037
+ "mode": "affine"
1038
+ },
1039
+ "model.layers.30.mlp.shared_experts.down_proj": {
1040
+ "bits": 8,
1041
+ "group_size": 64,
1042
+ "mode": "affine"
1043
+ },
1044
+ "model.layers.30.mlp.shared_experts.gate_proj": {
1045
+ "bits": 8,
1046
+ "group_size": 64,
1047
+ "mode": "affine"
1048
+ },
1049
+ "model.layers.30.mlp.shared_experts.up_proj": {
1050
+ "bits": 8,
1051
+ "group_size": 64,
1052
+ "mode": "affine"
1053
+ },
1054
+ "model.layers.30.self_attn.gate_proj": {
1055
+ "bits": 5,
1056
+ "group_size": 64,
1057
+ "mode": "affine"
1058
+ },
1059
+ "model.layers.30.self_attn.k_proj": {
1060
+ "bits": 6,
1061
+ "group_size": 64,
1062
+ "mode": "affine"
1063
+ },
1064
+ "model.layers.30.self_attn.o_proj": {
1065
+ "bits": 5,
1066
+ "group_size": 64,
1067
+ "mode": "affine"
1068
+ },
1069
+ "model.layers.30.self_attn.q_proj": {
1070
+ "bits": 6,
1071
+ "group_size": 64,
1072
+ "mode": "affine"
1073
+ },
1074
+ "model.layers.30.self_attn.v_proj": {
1075
+ "bits": 6,
1076
+ "group_size": 64,
1077
+ "mode": "affine"
1078
+ },
1079
+ "model.layers.31.mlp.shared_experts.down_proj": {
1080
+ "bits": 8,
1081
+ "group_size": 64,
1082
+ "mode": "affine"
1083
+ },
1084
+ "model.layers.31.mlp.shared_experts.gate_proj": {
1085
+ "bits": 8,
1086
+ "group_size": 64,
1087
+ "mode": "affine"
1088
+ },
1089
+ "model.layers.31.mlp.shared_experts.up_proj": {
1090
+ "bits": 8,
1091
+ "group_size": 64,
1092
+ "mode": "affine"
1093
+ },
1094
+ "model.layers.31.self_attn.gate_proj": {
1095
+ "bits": 5,
1096
+ "group_size": 64,
1097
+ "mode": "affine"
1098
+ },
1099
+ "model.layers.31.self_attn.k_proj": {
1100
+ "bits": 6,
1101
+ "group_size": 64,
1102
+ "mode": "affine"
1103
+ },
1104
+ "model.layers.31.self_attn.o_proj": {
1105
+ "bits": 5,
1106
+ "group_size": 64,
1107
+ "mode": "affine"
1108
+ },
1109
+ "model.layers.31.self_attn.q_proj": {
1110
+ "bits": 6,
1111
+ "group_size": 64,
1112
+ "mode": "affine"
1113
+ },
1114
+ "model.layers.31.self_attn.v_proj": {
1115
+ "bits": 6,
1116
+ "group_size": 64,
1117
+ "mode": "affine"
1118
+ },
1119
+ "model.layers.4.mlp.shared_experts.down_proj": {
1120
+ "bits": 8,
1121
+ "group_size": 64,
1122
+ "mode": "affine"
1123
+ },
1124
+ "model.layers.4.mlp.shared_experts.gate_proj": {
1125
+ "bits": 8,
1126
+ "group_size": 64,
1127
+ "mode": "affine"
1128
+ },
1129
+ "model.layers.4.mlp.shared_experts.up_proj": {
1130
+ "bits": 8,
1131
+ "group_size": 64,
1132
+ "mode": "affine"
1133
+ },
1134
+ "model.layers.4.self_attn.gate_proj": {
1135
+ "bits": 5,
1136
+ "group_size": 64,
1137
+ "mode": "affine"
1138
+ },
1139
+ "model.layers.4.self_attn.k_proj": {
1140
+ "bits": 5,
1141
+ "group_size": 64,
1142
+ "mode": "affine"
1143
+ },
1144
+ "model.layers.4.self_attn.v_proj": {
1145
+ "bits": 5,
1146
+ "group_size": 64,
1147
+ "mode": "affine"
1148
+ },
1149
+ "model.layers.5.mlp.shared_experts.down_proj": {
1150
+ "bits": 8,
1151
+ "group_size": 64,
1152
+ "mode": "affine"
1153
+ },
1154
+ "model.layers.5.mlp.shared_experts.gate_proj": {
1155
+ "bits": 8,
1156
+ "group_size": 64,
1157
+ "mode": "affine"
1158
+ },
1159
+ "model.layers.5.mlp.shared_experts.up_proj": {
1160
+ "bits": 8,
1161
+ "group_size": 64,
1162
+ "mode": "affine"
1163
+ },
1164
+ "model.layers.5.self_attn.gate_proj": {
1165
+ "bits": 5,
1166
+ "group_size": 64,
1167
+ "mode": "affine"
1168
+ },
1169
+ "model.layers.5.self_attn.k_proj": {
1170
+ "bits": 5,
1171
+ "group_size": 64,
1172
+ "mode": "affine"
1173
+ },
1174
+ "model.layers.5.self_attn.o_proj": {
1175
+ "bits": 5,
1176
+ "group_size": 64,
1177
+ "mode": "affine"
1178
+ },
1179
+ "model.layers.5.self_attn.q_proj": {
1180
+ "bits": 5,
1181
+ "group_size": 64,
1182
+ "mode": "affine"
1183
+ },
1184
+ "model.layers.5.self_attn.v_proj": {
1185
+ "bits": 5,
1186
+ "group_size": 64,
1187
+ "mode": "affine"
1188
+ },
1189
+ "model.layers.6.mlp.shared_experts.down_proj": {
1190
+ "bits": 8,
1191
+ "group_size": 64,
1192
+ "mode": "affine"
1193
+ },
1194
+ "model.layers.6.mlp.shared_experts.gate_proj": {
1195
+ "bits": 8,
1196
+ "group_size": 64,
1197
+ "mode": "affine"
1198
+ },
1199
+ "model.layers.6.mlp.shared_experts.up_proj": {
1200
+ "bits": 8,
1201
+ "group_size": 64,
1202
+ "mode": "affine"
1203
+ },
1204
+ "model.layers.6.self_attn.gate_proj": {
1205
+ "bits": 5,
1206
+ "group_size": 64,
1207
+ "mode": "affine"
1208
+ },
1209
+ "model.layers.6.self_attn.k_proj": {
1210
+ "bits": 5,
1211
+ "group_size": 64,
1212
+ "mode": "affine"
1213
+ },
1214
+ "model.layers.6.self_attn.o_proj": {
1215
+ "bits": 5,
1216
+ "group_size": 64,
1217
+ "mode": "affine"
1218
+ },
1219
+ "model.layers.6.self_attn.q_proj": {
1220
+ "bits": 5,
1221
+ "group_size": 64,
1222
+ "mode": "affine"
1223
+ },
1224
+ "model.layers.6.self_attn.v_proj": {
1225
+ "bits": 5,
1226
+ "group_size": 64,
1227
+ "mode": "affine"
1228
+ },
1229
+ "model.layers.7.mlp.shared_experts.down_proj": {
1230
+ "bits": 8,
1231
+ "group_size": 64,
1232
+ "mode": "affine"
1233
+ },
1234
+ "model.layers.7.mlp.shared_experts.gate_proj": {
1235
+ "bits": 8,
1236
+ "group_size": 64,
1237
+ "mode": "affine"
1238
+ },
1239
+ "model.layers.7.mlp.shared_experts.up_proj": {
1240
+ "bits": 8,
1241
+ "group_size": 64,
1242
+ "mode": "affine"
1243
+ },
1244
+ "model.layers.7.self_attn.gate_proj": {
1245
+ "bits": 5,
1246
+ "group_size": 64,
1247
+ "mode": "affine"
1248
+ },
1249
+ "model.layers.7.self_attn.k_proj": {
1250
+ "bits": 5,
1251
+ "group_size": 64,
1252
+ "mode": "affine"
1253
+ },
1254
+ "model.layers.7.self_attn.o_proj": {
1255
+ "bits": 5,
1256
+ "group_size": 64,
1257
+ "mode": "affine"
1258
+ },
1259
+ "model.layers.7.self_attn.q_proj": {
1260
+ "bits": 5,
1261
+ "group_size": 64,
1262
+ "mode": "affine"
1263
+ },
1264
+ "model.layers.7.self_attn.v_proj": {
1265
+ "bits": 5,
1266
+ "group_size": 64,
1267
+ "mode": "affine"
1268
+ },
1269
+ "model.layers.8.mlp.shared_experts.down_proj": {
1270
+ "bits": 8,
1271
+ "group_size": 64,
1272
+ "mode": "affine"
1273
+ },
1274
+ "model.layers.8.mlp.shared_experts.gate_proj": {
1275
+ "bits": 8,
1276
+ "group_size": 64,
1277
+ "mode": "affine"
1278
+ },
1279
+ "model.layers.8.mlp.shared_experts.up_proj": {
1280
+ "bits": 8,
1281
+ "group_size": 64,
1282
+ "mode": "affine"
1283
+ },
1284
+ "model.layers.8.self_attn.gate_proj": {
1285
+ "bits": 5,
1286
+ "group_size": 64,
1287
+ "mode": "affine"
1288
+ },
1289
+ "model.layers.8.self_attn.k_proj": {
1290
+ "bits": 5,
1291
+ "group_size": 64,
1292
+ "mode": "affine"
1293
+ },
1294
+ "model.layers.8.self_attn.o_proj": {
1295
+ "bits": 5,
1296
+ "group_size": 64,
1297
+ "mode": "affine"
1298
+ },
1299
+ "model.layers.8.self_attn.q_proj": {
1300
+ "bits": 5,
1301
+ "group_size": 64,
1302
+ "mode": "affine"
1303
+ },
1304
+ "model.layers.8.self_attn.v_proj": {
1305
+ "bits": 5,
1306
+ "group_size": 64,
1307
+ "mode": "affine"
1308
+ },
1309
+ "model.layers.9.mlp.shared_experts.down_proj": {
1310
+ "bits": 8,
1311
+ "group_size": 64,
1312
+ "mode": "affine"
1313
+ },
1314
+ "model.layers.9.mlp.shared_experts.gate_proj": {
1315
+ "bits": 8,
1316
+ "group_size": 64,
1317
+ "mode": "affine"
1318
+ },
1319
+ "model.layers.9.mlp.shared_experts.up_proj": {
1320
+ "bits": 8,
1321
+ "group_size": 64,
1322
+ "mode": "affine"
1323
+ },
1324
+ "model.layers.9.self_attn.gate_proj": {
1325
+ "bits": 5,
1326
+ "group_size": 64,
1327
+ "mode": "affine"
1328
+ },
1329
+ "model.layers.9.self_attn.k_proj": {
1330
+ "bits": 5,
1331
+ "group_size": 64,
1332
+ "mode": "affine"
1333
+ },
1334
+ "model.layers.9.self_attn.o_proj": {
1335
+ "bits": 5,
1336
+ "group_size": 64,
1337
+ "mode": "affine"
1338
+ },
1339
+ "model.layers.9.self_attn.q_proj": {
1340
+ "bits": 5,
1341
+ "group_size": 64,
1342
+ "mode": "affine"
1343
+ },
1344
+ "model.layers.9.self_attn.v_proj": {
1345
+ "bits": 5,
1346
+ "group_size": 64,
1347
+ "mode": "affine"
1348
+ }
1349
+ },
1350
+ "quantization_config": {
1351
+ "group_size": 64,
1352
+ "bits": 4,
1353
+ "mode": "affine",
1354
+ "lm_head": {
1355
+ "bits": 8,
1356
+ "group_size": 64,
1357
+ "mode": "affine"
1358
+ },
1359
+ "model.embed_tokens": {
1360
+ "bits": 8,
1361
+ "group_size": 64,
1362
+ "mode": "affine"
1363
+ },
1364
+ "model.layers.0.mlp.down_proj": {
1365
+ "bits": 8,
1366
+ "group_size": 64,
1367
+ "mode": "affine"
1368
+ },
1369
+ "model.layers.0.mlp.gate_proj": {
1370
+ "bits": 8,
1371
+ "group_size": 64,
1372
+ "mode": "affine"
1373
+ },
1374
+ "model.layers.0.mlp.up_proj": {
1375
+ "bits": 8,
1376
+ "group_size": 64,
1377
+ "mode": "affine"
1378
+ },
1379
+ "model.layers.0.self_attn.gate_proj": {
1380
+ "bits": 8,
1381
+ "group_size": 64,
1382
+ "mode": "affine"
1383
+ },
1384
+ "model.layers.0.self_attn.k_proj": {
1385
+ "bits": 8,
1386
+ "group_size": 64,
1387
+ "mode": "affine"
1388
+ },
1389
+ "model.layers.0.self_attn.o_proj": {
1390
+ "bits": 8,
1391
+ "group_size": 64,
1392
+ "mode": "affine"
1393
+ },
1394
+ "model.layers.0.self_attn.q_proj": {
1395
+ "bits": 8,
1396
+ "group_size": 64,
1397
+ "mode": "affine"
1398
+ },
1399
+ "model.layers.0.self_attn.v_proj": {
1400
+ "bits": 8,
1401
+ "group_size": 64,
1402
+ "mode": "affine"
1403
+ },
1404
+ "model.layers.1.mlp.down_proj": {
1405
+ "bits": 6,
1406
+ "group_size": 64,
1407
+ "mode": "affine"
1408
+ },
1409
+ "model.layers.1.mlp.gate_proj": {
1410
+ "bits": 5,
1411
+ "group_size": 64,
1412
+ "mode": "affine"
1413
+ },
1414
+ "model.layers.1.mlp.up_proj": {
1415
+ "bits": 5,
1416
+ "group_size": 64,
1417
+ "mode": "affine"
1418
+ },
1419
+ "model.layers.1.self_attn.gate_proj": {
1420
+ "bits": 5,
1421
+ "group_size": 64,
1422
+ "mode": "affine"
1423
+ },
1424
+ "model.layers.1.self_attn.k_proj": {
1425
+ "bits": 6,
1426
+ "group_size": 64,
1427
+ "mode": "affine"
1428
+ },
1429
+ "model.layers.1.self_attn.o_proj": {
1430
+ "bits": 5,
1431
+ "group_size": 64,
1432
+ "mode": "affine"
1433
+ },
1434
+ "model.layers.1.self_attn.q_proj": {
1435
+ "bits": 6,
1436
+ "group_size": 64,
1437
+ "mode": "affine"
1438
+ },
1439
+ "model.layers.1.self_attn.v_proj": {
1440
+ "bits": 6,
1441
+ "group_size": 64,
1442
+ "mode": "affine"
1443
+ },
1444
+ "model.layers.10.mlp.shared_experts.down_proj": {
1445
+ "bits": 8,
1446
+ "group_size": 64,
1447
+ "mode": "affine"
1448
+ },
1449
+ "model.layers.10.mlp.shared_experts.gate_proj": {
1450
+ "bits": 8,
1451
+ "group_size": 64,
1452
+ "mode": "affine"
1453
+ },
1454
+ "model.layers.10.mlp.shared_experts.up_proj": {
1455
+ "bits": 8,
1456
+ "group_size": 64,
1457
+ "mode": "affine"
1458
+ },
1459
+ "model.layers.10.self_attn.gate_proj": {
1460
+ "bits": 5,
1461
+ "group_size": 64,
1462
+ "mode": "affine"
1463
+ },
1464
+ "model.layers.10.self_attn.k_proj": {
1465
+ "bits": 5,
1466
+ "group_size": 64,
1467
+ "mode": "affine"
1468
+ },
1469
+ "model.layers.10.self_attn.o_proj": {
1470
+ "bits": 5,
1471
+ "group_size": 64,
1472
+ "mode": "affine"
1473
+ },
1474
+ "model.layers.10.self_attn.q_proj": {
1475
+ "bits": 5,
1476
+ "group_size": 64,
1477
+ "mode": "affine"
1478
+ },
1479
+ "model.layers.10.self_attn.v_proj": {
1480
+ "bits": 5,
1481
+ "group_size": 64,
1482
+ "mode": "affine"
1483
+ },
1484
+ "model.layers.11.mlp.shared_experts.down_proj": {
1485
+ "bits": 8,
1486
+ "group_size": 64,
1487
+ "mode": "affine"
1488
+ },
1489
+ "model.layers.11.mlp.shared_experts.gate_proj": {
1490
+ "bits": 8,
1491
+ "group_size": 64,
1492
+ "mode": "affine"
1493
+ },
1494
+ "model.layers.11.mlp.shared_experts.up_proj": {
1495
+ "bits": 8,
1496
+ "group_size": 64,
1497
+ "mode": "affine"
1498
+ },
1499
+ "model.layers.11.self_attn.gate_proj": {
1500
+ "bits": 5,
1501
+ "group_size": 64,
1502
+ "mode": "affine"
1503
+ },
1504
+ "model.layers.11.self_attn.k_proj": {
1505
+ "bits": 5,
1506
+ "group_size": 64,
1507
+ "mode": "affine"
1508
+ },
1509
+ "model.layers.11.self_attn.o_proj": {
1510
+ "bits": 5,
1511
+ "group_size": 64,
1512
+ "mode": "affine"
1513
+ },
1514
+ "model.layers.11.self_attn.q_proj": {
1515
+ "bits": 5,
1516
+ "group_size": 64,
1517
+ "mode": "affine"
1518
+ },
1519
+ "model.layers.11.self_attn.v_proj": {
1520
+ "bits": 5,
1521
+ "group_size": 64,
1522
+ "mode": "affine"
1523
+ },
1524
+ "model.layers.12.mlp.shared_experts.down_proj": {
1525
+ "bits": 8,
1526
+ "group_size": 64,
1527
+ "mode": "affine"
1528
+ },
1529
+ "model.layers.12.mlp.shared_experts.gate_proj": {
1530
+ "bits": 8,
1531
+ "group_size": 64,
1532
+ "mode": "affine"
1533
+ },
1534
+ "model.layers.12.mlp.shared_experts.up_proj": {
1535
+ "bits": 8,
1536
+ "group_size": 64,
1537
+ "mode": "affine"
1538
+ },
1539
+ "model.layers.12.self_attn.gate_proj": {
1540
+ "bits": 5,
1541
+ "group_size": 64,
1542
+ "mode": "affine"
1543
+ },
1544
+ "model.layers.12.self_attn.k_proj": {
1545
+ "bits": 5,
1546
+ "group_size": 64,
1547
+ "mode": "affine"
1548
+ },
1549
+ "model.layers.12.self_attn.o_proj": {
1550
+ "bits": 5,
1551
+ "group_size": 64,
1552
+ "mode": "affine"
1553
+ },
1554
+ "model.layers.12.self_attn.q_proj": {
1555
+ "bits": 5,
1556
+ "group_size": 64,
1557
+ "mode": "affine"
1558
+ },
1559
+ "model.layers.12.self_attn.v_proj": {
1560
+ "bits": 5,
1561
+ "group_size": 64,
1562
+ "mode": "affine"
1563
+ },
1564
+ "model.layers.13.mlp.shared_experts.down_proj": {
1565
+ "bits": 8,
1566
+ "group_size": 64,
1567
+ "mode": "affine"
1568
+ },
1569
+ "model.layers.13.mlp.shared_experts.gate_proj": {
1570
+ "bits": 8,
1571
+ "group_size": 64,
1572
+ "mode": "affine"
1573
+ },
1574
+ "model.layers.13.mlp.shared_experts.up_proj": {
1575
+ "bits": 8,
1576
+ "group_size": 64,
1577
+ "mode": "affine"
1578
+ },
1579
+ "model.layers.13.self_attn.gate_proj": {
1580
+ "bits": 5,
1581
+ "group_size": 64,
1582
+ "mode": "affine"
1583
+ },
1584
+ "model.layers.13.self_attn.k_proj": {
1585
+ "bits": 5,
1586
+ "group_size": 64,
1587
+ "mode": "affine"
1588
+ },
1589
+ "model.layers.13.self_attn.o_proj": {
1590
+ "bits": 5,
1591
+ "group_size": 64,
1592
+ "mode": "affine"
1593
+ },
1594
+ "model.layers.13.self_attn.q_proj": {
1595
+ "bits": 5,
1596
+ "group_size": 64,
1597
+ "mode": "affine"
1598
+ },
1599
+ "model.layers.13.self_attn.v_proj": {
1600
+ "bits": 5,
1601
+ "group_size": 64,
1602
+ "mode": "affine"
1603
+ },
1604
+ "model.layers.14.mlp.shared_experts.down_proj": {
1605
+ "bits": 8,
1606
+ "group_size": 64,
1607
+ "mode": "affine"
1608
+ },
1609
+ "model.layers.14.mlp.shared_experts.gate_proj": {
1610
+ "bits": 8,
1611
+ "group_size": 64,
1612
+ "mode": "affine"
1613
+ },
1614
+ "model.layers.14.mlp.shared_experts.up_proj": {
1615
+ "bits": 8,
1616
+ "group_size": 64,
1617
+ "mode": "affine"
1618
+ },
1619
+ "model.layers.14.self_attn.gate_proj": {
1620
+ "bits": 5,
1621
+ "group_size": 64,
1622
+ "mode": "affine"
1623
+ },
1624
+ "model.layers.14.self_attn.k_proj": {
1625
+ "bits": 5,
1626
+ "group_size": 64,
1627
+ "mode": "affine"
1628
+ },
1629
+ "model.layers.14.self_attn.o_proj": {
1630
+ "bits": 5,
1631
+ "group_size": 64,
1632
+ "mode": "affine"
1633
+ },
1634
+ "model.layers.14.self_attn.q_proj": {
1635
+ "bits": 5,
1636
+ "group_size": 64,
1637
+ "mode": "affine"
1638
+ },
1639
+ "model.layers.14.self_attn.v_proj": {
1640
+ "bits": 5,
1641
+ "group_size": 64,
1642
+ "mode": "affine"
1643
+ },
1644
+ "model.layers.15.mlp.shared_experts.down_proj": {
1645
+ "bits": 8,
1646
+ "group_size": 64,
1647
+ "mode": "affine"
1648
+ },
1649
+ "model.layers.15.mlp.shared_experts.gate_proj": {
1650
+ "bits": 8,
1651
+ "group_size": 64,
1652
+ "mode": "affine"
1653
+ },
1654
+ "model.layers.15.mlp.shared_experts.up_proj": {
1655
+ "bits": 8,
1656
+ "group_size": 64,
1657
+ "mode": "affine"
1658
+ },
1659
+ "model.layers.15.self_attn.gate_proj": {
1660
+ "bits": 5,
1661
+ "group_size": 64,
1662
+ "mode": "affine"
1663
+ },
1664
+ "model.layers.15.self_attn.k_proj": {
1665
+ "bits": 5,
1666
+ "group_size": 64,
1667
+ "mode": "affine"
1668
+ },
1669
+ "model.layers.15.self_attn.o_proj": {
1670
+ "bits": 5,
1671
+ "group_size": 64,
1672
+ "mode": "affine"
1673
+ },
1674
+ "model.layers.15.self_attn.q_proj": {
1675
+ "bits": 5,
1676
+ "group_size": 64,
1677
+ "mode": "affine"
1678
+ },
1679
+ "model.layers.15.self_attn.v_proj": {
1680
+ "bits": 5,
1681
+ "group_size": 64,
1682
+ "mode": "affine"
1683
+ },
1684
+ "model.layers.16.mlp.shared_experts.down_proj": {
1685
+ "bits": 8,
1686
+ "group_size": 64,
1687
+ "mode": "affine"
1688
+ },
1689
+ "model.layers.16.mlp.shared_experts.gate_proj": {
1690
+ "bits": 8,
1691
+ "group_size": 64,
1692
+ "mode": "affine"
1693
+ },
1694
+ "model.layers.16.mlp.shared_experts.up_proj": {
1695
+ "bits": 8,
1696
+ "group_size": 64,
1697
+ "mode": "affine"
1698
+ },
1699
+ "model.layers.16.self_attn.gate_proj": {
1700
+ "bits": 5,
1701
+ "group_size": 64,
1702
+ "mode": "affine"
1703
+ },
1704
+ "model.layers.16.self_attn.k_proj": {
1705
+ "bits": 5,
1706
+ "group_size": 64,
1707
+ "mode": "affine"
1708
+ },
1709
+ "model.layers.16.self_attn.o_proj": {
1710
+ "bits": 5,
1711
+ "group_size": 64,
1712
+ "mode": "affine"
1713
+ },
1714
+ "model.layers.16.self_attn.q_proj": {
1715
+ "bits": 5,
1716
+ "group_size": 64,
1717
+ "mode": "affine"
1718
+ },
1719
+ "model.layers.16.self_attn.v_proj": {
1720
+ "bits": 5,
1721
+ "group_size": 64,
1722
+ "mode": "affine"
1723
+ },
1724
+ "model.layers.17.mlp.shared_experts.down_proj": {
1725
+ "bits": 8,
1726
+ "group_size": 64,
1727
+ "mode": "affine"
1728
+ },
1729
+ "model.layers.17.mlp.shared_experts.gate_proj": {
1730
+ "bits": 8,
1731
+ "group_size": 64,
1732
+ "mode": "affine"
1733
+ },
1734
+ "model.layers.17.mlp.shared_experts.up_proj": {
1735
+ "bits": 8,
1736
+ "group_size": 64,
1737
+ "mode": "affine"
1738
+ },
1739
+ "model.layers.17.self_attn.gate_proj": {
1740
+ "bits": 5,
1741
+ "group_size": 64,
1742
+ "mode": "affine"
1743
+ },
1744
+ "model.layers.17.self_attn.k_proj": {
1745
+ "bits": 5,
1746
+ "group_size": 64,
1747
+ "mode": "affine"
1748
+ },
1749
+ "model.layers.17.self_attn.o_proj": {
1750
+ "bits": 5,
1751
+ "group_size": 64,
1752
+ "mode": "affine"
1753
+ },
1754
+ "model.layers.17.self_attn.q_proj": {
1755
+ "bits": 5,
1756
+ "group_size": 64,
1757
+ "mode": "affine"
1758
+ },
1759
+ "model.layers.17.self_attn.v_proj": {
1760
+ "bits": 5,
1761
+ "group_size": 64,
1762
+ "mode": "affine"
1763
+ },
1764
+ "model.layers.18.mlp.shared_experts.down_proj": {
1765
+ "bits": 8,
1766
+ "group_size": 64,
1767
+ "mode": "affine"
1768
+ },
1769
+ "model.layers.18.mlp.shared_experts.gate_proj": {
1770
+ "bits": 8,
1771
+ "group_size": 64,
1772
+ "mode": "affine"
1773
+ },
1774
+ "model.layers.18.mlp.shared_experts.up_proj": {
1775
+ "bits": 8,
1776
+ "group_size": 64,
1777
+ "mode": "affine"
1778
+ },
1779
+ "model.layers.18.self_attn.gate_proj": {
1780
+ "bits": 5,
1781
+ "group_size": 64,
1782
+ "mode": "affine"
1783
+ },
1784
+ "model.layers.18.self_attn.k_proj": {
1785
+ "bits": 5,
1786
+ "group_size": 64,
1787
+ "mode": "affine"
1788
+ },
1789
+ "model.layers.18.self_attn.o_proj": {
1790
+ "bits": 5,
1791
+ "group_size": 64,
1792
+ "mode": "affine"
1793
+ },
1794
+ "model.layers.18.self_attn.q_proj": {
1795
+ "bits": 5,
1796
+ "group_size": 64,
1797
+ "mode": "affine"
1798
+ },
1799
+ "model.layers.18.self_attn.v_proj": {
1800
+ "bits": 5,
1801
+ "group_size": 64,
1802
+ "mode": "affine"
1803
+ },
1804
+ "model.layers.19.mlp.shared_experts.down_proj": {
1805
+ "bits": 8,
1806
+ "group_size": 64,
1807
+ "mode": "affine"
1808
+ },
1809
+ "model.layers.19.mlp.shared_experts.gate_proj": {
1810
+ "bits": 8,
1811
+ "group_size": 64,
1812
+ "mode": "affine"
1813
+ },
1814
+ "model.layers.19.mlp.shared_experts.up_proj": {
1815
+ "bits": 8,
1816
+ "group_size": 64,
1817
+ "mode": "affine"
1818
+ },
1819
+ "model.layers.19.self_attn.gate_proj": {
1820
+ "bits": 5,
1821
+ "group_size": 64,
1822
+ "mode": "affine"
1823
+ },
1824
+ "model.layers.19.self_attn.k_proj": {
1825
+ "bits": 5,
1826
+ "group_size": 64,
1827
+ "mode": "affine"
1828
+ },
1829
+ "model.layers.19.self_attn.o_proj": {
1830
+ "bits": 5,
1831
+ "group_size": 64,
1832
+ "mode": "affine"
1833
+ },
1834
+ "model.layers.19.self_attn.q_proj": {
1835
+ "bits": 5,
1836
+ "group_size": 64,
1837
+ "mode": "affine"
1838
+ },
1839
+ "model.layers.19.self_attn.v_proj": {
1840
+ "bits": 5,
1841
+ "group_size": 64,
1842
+ "mode": "affine"
1843
+ },
1844
+ "model.layers.2.mlp.shared_experts.down_proj": {
1845
+ "bits": 8,
1846
+ "group_size": 64,
1847
+ "mode": "affine"
1848
+ },
1849
+ "model.layers.2.mlp.shared_experts.gate_proj": {
1850
+ "bits": 8,
1851
+ "group_size": 64,
1852
+ "mode": "affine"
1853
+ },
1854
+ "model.layers.2.mlp.shared_experts.up_proj": {
1855
+ "bits": 8,
1856
+ "group_size": 64,
1857
+ "mode": "affine"
1858
+ },
1859
+ "model.layers.2.self_attn.gate_proj": {
1860
+ "bits": 5,
1861
+ "group_size": 64,
1862
+ "mode": "affine"
1863
+ },
1864
+ "model.layers.2.self_attn.k_proj": {
1865
+ "bits": 5,
1866
+ "group_size": 64,
1867
+ "mode": "affine"
1868
+ },
1869
+ "model.layers.2.self_attn.o_proj": {
1870
+ "bits": 5,
1871
+ "group_size": 64,
1872
+ "mode": "affine"
1873
+ },
1874
+ "model.layers.2.self_attn.q_proj": {
1875
+ "bits": 5,
1876
+ "group_size": 64,
1877
+ "mode": "affine"
1878
+ },
1879
+ "model.layers.2.self_attn.v_proj": {
1880
+ "bits": 5,
1881
+ "group_size": 64,
1882
+ "mode": "affine"
1883
+ },
1884
+ "model.layers.20.mlp.shared_experts.down_proj": {
1885
+ "bits": 8,
1886
+ "group_size": 64,
1887
+ "mode": "affine"
1888
+ },
1889
+ "model.layers.20.mlp.shared_experts.gate_proj": {
1890
+ "bits": 8,
1891
+ "group_size": 64,
1892
+ "mode": "affine"
1893
+ },
1894
+ "model.layers.20.mlp.shared_experts.up_proj": {
1895
+ "bits": 8,
1896
+ "group_size": 64,
1897
+ "mode": "affine"
1898
+ },
1899
+ "model.layers.20.self_attn.gate_proj": {
1900
+ "bits": 5,
1901
+ "group_size": 64,
1902
+ "mode": "affine"
1903
+ },
1904
+ "model.layers.20.self_attn.k_proj": {
1905
+ "bits": 5,
1906
+ "group_size": 64,
1907
+ "mode": "affine"
1908
+ },
1909
+ "model.layers.20.self_attn.o_proj": {
1910
+ "bits": 5,
1911
+ "group_size": 64,
1912
+ "mode": "affine"
1913
+ },
1914
+ "model.layers.20.self_attn.q_proj": {
1915
+ "bits": 5,
1916
+ "group_size": 64,
1917
+ "mode": "affine"
1918
+ },
1919
+ "model.layers.20.self_attn.v_proj": {
1920
+ "bits": 5,
1921
+ "group_size": 64,
1922
+ "mode": "affine"
1923
+ },
1924
+ "model.layers.21.mlp.shared_experts.down_proj": {
1925
+ "bits": 8,
1926
+ "group_size": 64,
1927
+ "mode": "affine"
1928
+ },
1929
+ "model.layers.21.mlp.shared_experts.gate_proj": {
1930
+ "bits": 8,
1931
+ "group_size": 64,
1932
+ "mode": "affine"
1933
+ },
1934
+ "model.layers.21.mlp.shared_experts.up_proj": {
1935
+ "bits": 8,
1936
+ "group_size": 64,
1937
+ "mode": "affine"
1938
+ },
1939
+ "model.layers.21.self_attn.gate_proj": {
1940
+ "bits": 5,
1941
+ "group_size": 64,
1942
+ "mode": "affine"
1943
+ },
1944
+ "model.layers.21.self_attn.k_proj": {
1945
+ "bits": 5,
1946
+ "group_size": 64,
1947
+ "mode": "affine"
1948
+ },
1949
+ "model.layers.21.self_attn.o_proj": {
1950
+ "bits": 5,
1951
+ "group_size": 64,
1952
+ "mode": "affine"
1953
+ },
1954
+ "model.layers.21.self_attn.q_proj": {
1955
+ "bits": 5,
1956
+ "group_size": 64,
1957
+ "mode": "affine"
1958
+ },
1959
+ "model.layers.21.self_attn.v_proj": {
1960
+ "bits": 5,
1961
+ "group_size": 64,
1962
+ "mode": "affine"
1963
+ },
1964
+ "model.layers.22.mlp.shared_experts.down_proj": {
1965
+ "bits": 8,
1966
+ "group_size": 64,
1967
+ "mode": "affine"
1968
+ },
1969
+ "model.layers.22.mlp.shared_experts.gate_proj": {
1970
+ "bits": 8,
1971
+ "group_size": 64,
1972
+ "mode": "affine"
1973
+ },
1974
+ "model.layers.22.mlp.shared_experts.up_proj": {
1975
+ "bits": 8,
1976
+ "group_size": 64,
1977
+ "mode": "affine"
1978
+ },
1979
+ "model.layers.22.self_attn.gate_proj": {
1980
+ "bits": 5,
1981
+ "group_size": 64,
1982
+ "mode": "affine"
1983
+ },
1984
+ "model.layers.22.self_attn.k_proj": {
1985
+ "bits": 5,
1986
+ "group_size": 64,
1987
+ "mode": "affine"
1988
+ },
1989
+ "model.layers.22.self_attn.o_proj": {
1990
+ "bits": 5,
1991
+ "group_size": 64,
1992
+ "mode": "affine"
1993
+ },
1994
+ "model.layers.22.self_attn.q_proj": {
1995
+ "bits": 5,
1996
+ "group_size": 64,
1997
+ "mode": "affine"
1998
+ },
1999
+ "model.layers.22.self_attn.v_proj": {
2000
+ "bits": 5,
2001
+ "group_size": 64,
2002
+ "mode": "affine"
2003
+ },
2004
+ "model.layers.23.mlp.shared_experts.down_proj": {
2005
+ "bits": 8,
2006
+ "group_size": 64,
2007
+ "mode": "affine"
2008
+ },
2009
+ "model.layers.23.mlp.shared_experts.gate_proj": {
2010
+ "bits": 8,
2011
+ "group_size": 64,
2012
+ "mode": "affine"
2013
+ },
2014
+ "model.layers.23.mlp.shared_experts.up_proj": {
2015
+ "bits": 8,
2016
+ "group_size": 64,
2017
+ "mode": "affine"
2018
+ },
2019
+ "model.layers.23.self_attn.gate_proj": {
2020
+ "bits": 5,
2021
+ "group_size": 64,
2022
+ "mode": "affine"
2023
+ },
2024
+ "model.layers.23.self_attn.k_proj": {
2025
+ "bits": 5,
2026
+ "group_size": 64,
2027
+ "mode": "affine"
2028
+ },
2029
+ "model.layers.23.self_attn.o_proj": {
2030
+ "bits": 5,
2031
+ "group_size": 64,
2032
+ "mode": "affine"
2033
+ },
2034
+ "model.layers.23.self_attn.q_proj": {
2035
+ "bits": 5,
2036
+ "group_size": 64,
2037
+ "mode": "affine"
2038
+ },
2039
+ "model.layers.23.self_attn.v_proj": {
2040
+ "bits": 5,
2041
+ "group_size": 64,
2042
+ "mode": "affine"
2043
+ },
2044
+ "model.layers.24.mlp.shared_experts.down_proj": {
2045
+ "bits": 8,
2046
+ "group_size": 64,
2047
+ "mode": "affine"
2048
+ },
2049
+ "model.layers.24.mlp.shared_experts.gate_proj": {
2050
+ "bits": 8,
2051
+ "group_size": 64,
2052
+ "mode": "affine"
2053
+ },
2054
+ "model.layers.24.mlp.shared_experts.up_proj": {
2055
+ "bits": 8,
2056
+ "group_size": 64,
2057
+ "mode": "affine"
2058
+ },
2059
+ "model.layers.24.self_attn.gate_proj": {
2060
+ "bits": 5,
2061
+ "group_size": 64,
2062
+ "mode": "affine"
2063
+ },
2064
+ "model.layers.24.self_attn.k_proj": {
2065
+ "bits": 5,
2066
+ "group_size": 64,
2067
+ "mode": "affine"
2068
+ },
2069
+ "model.layers.24.self_attn.o_proj": {
2070
+ "bits": 5,
2071
+ "group_size": 64,
2072
+ "mode": "affine"
2073
+ },
2074
+ "model.layers.24.self_attn.q_proj": {
2075
+ "bits": 5,
2076
+ "group_size": 64,
2077
+ "mode": "affine"
2078
+ },
2079
+ "model.layers.24.self_attn.v_proj": {
2080
+ "bits": 5,
2081
+ "group_size": 64,
2082
+ "mode": "affine"
2083
+ },
2084
+ "model.layers.25.mlp.shared_experts.down_proj": {
2085
+ "bits": 8,
2086
+ "group_size": 64,
2087
+ "mode": "affine"
2088
+ },
2089
+ "model.layers.25.mlp.shared_experts.gate_proj": {
2090
+ "bits": 8,
2091
+ "group_size": 64,
2092
+ "mode": "affine"
2093
+ },
2094
+ "model.layers.25.mlp.shared_experts.up_proj": {
2095
+ "bits": 8,
2096
+ "group_size": 64,
2097
+ "mode": "affine"
2098
+ },
2099
+ "model.layers.25.self_attn.gate_proj": {
2100
+ "bits": 5,
2101
+ "group_size": 64,
2102
+ "mode": "affine"
2103
+ },
2104
+ "model.layers.25.self_attn.k_proj": {
2105
+ "bits": 5,
2106
+ "group_size": 64,
2107
+ "mode": "affine"
2108
+ },
2109
+ "model.layers.25.self_attn.o_proj": {
2110
+ "bits": 5,
2111
+ "group_size": 64,
2112
+ "mode": "affine"
2113
+ },
2114
+ "model.layers.25.self_attn.q_proj": {
2115
+ "bits": 5,
2116
+ "group_size": 64,
2117
+ "mode": "affine"
2118
+ },
2119
+ "model.layers.25.self_attn.v_proj": {
2120
+ "bits": 5,
2121
+ "group_size": 64,
2122
+ "mode": "affine"
2123
+ },
2124
+ "model.layers.26.mlp.shared_experts.down_proj": {
2125
+ "bits": 8,
2126
+ "group_size": 64,
2127
+ "mode": "affine"
2128
+ },
2129
+ "model.layers.26.mlp.shared_experts.gate_proj": {
2130
+ "bits": 8,
2131
+ "group_size": 64,
2132
+ "mode": "affine"
2133
+ },
2134
+ "model.layers.26.mlp.shared_experts.up_proj": {
2135
+ "bits": 8,
2136
+ "group_size": 64,
2137
+ "mode": "affine"
2138
+ },
2139
+ "model.layers.26.self_attn.gate_proj": {
2140
+ "bits": 5,
2141
+ "group_size": 64,
2142
+ "mode": "affine"
2143
+ },
2144
+ "model.layers.26.self_attn.k_proj": {
2145
+ "bits": 6,
2146
+ "group_size": 64,
2147
+ "mode": "affine"
2148
+ },
2149
+ "model.layers.26.self_attn.o_proj": {
2150
+ "bits": 5,
2151
+ "group_size": 64,
2152
+ "mode": "affine"
2153
+ },
2154
+ "model.layers.26.self_attn.q_proj": {
2155
+ "bits": 6,
2156
+ "group_size": 64,
2157
+ "mode": "affine"
2158
+ },
2159
+ "model.layers.26.self_attn.v_proj": {
2160
+ "bits": 6,
2161
+ "group_size": 64,
2162
+ "mode": "affine"
2163
+ },
2164
+ "model.layers.27.mlp.shared_experts.down_proj": {
2165
+ "bits": 8,
2166
+ "group_size": 64,
2167
+ "mode": "affine"
2168
+ },
2169
+ "model.layers.27.mlp.shared_experts.gate_proj": {
2170
+ "bits": 8,
2171
+ "group_size": 64,
2172
+ "mode": "affine"
2173
+ },
2174
+ "model.layers.27.mlp.shared_experts.up_proj": {
2175
+ "bits": 8,
2176
+ "group_size": 64,
2177
+ "mode": "affine"
2178
+ },
2179
+ "model.layers.27.self_attn.gate_proj": {
2180
+ "bits": 5,
2181
+ "group_size": 64,
2182
+ "mode": "affine"
2183
+ },
2184
+ "model.layers.27.self_attn.k_proj": {
2185
+ "bits": 6,
2186
+ "group_size": 64,
2187
+ "mode": "affine"
2188
+ },
2189
+ "model.layers.27.self_attn.o_proj": {
2190
+ "bits": 5,
2191
+ "group_size": 64,
2192
+ "mode": "affine"
2193
+ },
2194
+ "model.layers.27.self_attn.q_proj": {
2195
+ "bits": 6,
2196
+ "group_size": 64,
2197
+ "mode": "affine"
2198
+ },
2199
+ "model.layers.27.self_attn.v_proj": {
2200
+ "bits": 6,
2201
+ "group_size": 64,
2202
+ "mode": "affine"
2203
+ },
2204
+ "model.layers.28.mlp.shared_experts.down_proj": {
2205
+ "bits": 8,
2206
+ "group_size": 64,
2207
+ "mode": "affine"
2208
+ },
2209
+ "model.layers.28.mlp.shared_experts.gate_proj": {
2210
+ "bits": 8,
2211
+ "group_size": 64,
2212
+ "mode": "affine"
2213
+ },
2214
+ "model.layers.28.mlp.shared_experts.up_proj": {
2215
+ "bits": 8,
2216
+ "group_size": 64,
2217
+ "mode": "affine"
2218
+ },
2219
+ "model.layers.28.self_attn.gate_proj": {
2220
+ "bits": 5,
2221
+ "group_size": 64,
2222
+ "mode": "affine"
2223
+ },
2224
+ "model.layers.28.self_attn.k_proj": {
2225
+ "bits": 6,
2226
+ "group_size": 64,
2227
+ "mode": "affine"
2228
+ },
2229
+ "model.layers.28.self_attn.o_proj": {
2230
+ "bits": 5,
2231
+ "group_size": 64,
2232
+ "mode": "affine"
2233
+ },
2234
+ "model.layers.28.self_attn.q_proj": {
2235
+ "bits": 6,
2236
+ "group_size": 64,
2237
+ "mode": "affine"
2238
+ },
2239
+ "model.layers.28.self_attn.v_proj": {
2240
+ "bits": 6,
2241
+ "group_size": 64,
2242
+ "mode": "affine"
2243
+ },
2244
+ "model.layers.29.mlp.shared_experts.down_proj": {
2245
+ "bits": 8,
2246
+ "group_size": 64,
2247
+ "mode": "affine"
2248
+ },
2249
+ "model.layers.29.mlp.shared_experts.gate_proj": {
2250
+ "bits": 8,
2251
+ "group_size": 64,
2252
+ "mode": "affine"
2253
+ },
2254
+ "model.layers.29.mlp.shared_experts.up_proj": {
2255
+ "bits": 8,
2256
+ "group_size": 64,
2257
+ "mode": "affine"
2258
+ },
2259
+ "model.layers.29.self_attn.gate_proj": {
2260
+ "bits": 5,
2261
+ "group_size": 64,
2262
+ "mode": "affine"
2263
+ },
2264
+ "model.layers.29.self_attn.k_proj": {
2265
+ "bits": 6,
2266
+ "group_size": 64,
2267
+ "mode": "affine"
2268
+ },
2269
+ "model.layers.29.self_attn.o_proj": {
2270
+ "bits": 5,
2271
+ "group_size": 64,
2272
+ "mode": "affine"
2273
+ },
2274
+ "model.layers.29.self_attn.q_proj": {
2275
+ "bits": 6,
2276
+ "group_size": 64,
2277
+ "mode": "affine"
2278
+ },
2279
+ "model.layers.29.self_attn.v_proj": {
2280
+ "bits": 6,
2281
+ "group_size": 64,
2282
+ "mode": "affine"
2283
+ },
2284
+ "model.layers.3.mlp.shared_experts.down_proj": {
2285
+ "bits": 8,
2286
+ "group_size": 64,
2287
+ "mode": "affine"
2288
+ },
2289
+ "model.layers.3.mlp.shared_experts.gate_proj": {
2290
+ "bits": 8,
2291
+ "group_size": 64,
2292
+ "mode": "affine"
2293
+ },
2294
+ "model.layers.3.mlp.shared_experts.up_proj": {
2295
+ "bits": 8,
2296
+ "group_size": 64,
2297
+ "mode": "affine"
2298
+ },
2299
+ "model.layers.3.self_attn.k_proj": {
2300
+ "bits": 5,
2301
+ "group_size": 64,
2302
+ "mode": "affine"
2303
+ },
2304
+ "model.layers.3.self_attn.v_proj": {
2305
+ "bits": 5,
2306
+ "group_size": 64,
2307
+ "mode": "affine"
2308
+ },
2309
+ "model.layers.30.mlp.shared_experts.down_proj": {
2310
+ "bits": 8,
2311
+ "group_size": 64,
2312
+ "mode": "affine"
2313
+ },
2314
+ "model.layers.30.mlp.shared_experts.gate_proj": {
2315
+ "bits": 8,
2316
+ "group_size": 64,
2317
+ "mode": "affine"
2318
+ },
2319
+ "model.layers.30.mlp.shared_experts.up_proj": {
2320
+ "bits": 8,
2321
+ "group_size": 64,
2322
+ "mode": "affine"
2323
+ },
2324
+ "model.layers.30.self_attn.gate_proj": {
2325
+ "bits": 5,
2326
+ "group_size": 64,
2327
+ "mode": "affine"
2328
+ },
2329
+ "model.layers.30.self_attn.k_proj": {
2330
+ "bits": 6,
2331
+ "group_size": 64,
2332
+ "mode": "affine"
2333
+ },
2334
+ "model.layers.30.self_attn.o_proj": {
2335
+ "bits": 5,
2336
+ "group_size": 64,
2337
+ "mode": "affine"
2338
+ },
2339
+ "model.layers.30.self_attn.q_proj": {
2340
+ "bits": 6,
2341
+ "group_size": 64,
2342
+ "mode": "affine"
2343
+ },
2344
+ "model.layers.30.self_attn.v_proj": {
2345
+ "bits": 6,
2346
+ "group_size": 64,
2347
+ "mode": "affine"
2348
+ },
2349
+ "model.layers.31.mlp.shared_experts.down_proj": {
2350
+ "bits": 8,
2351
+ "group_size": 64,
2352
+ "mode": "affine"
2353
+ },
2354
+ "model.layers.31.mlp.shared_experts.gate_proj": {
2355
+ "bits": 8,
2356
+ "group_size": 64,
2357
+ "mode": "affine"
2358
+ },
2359
+ "model.layers.31.mlp.shared_experts.up_proj": {
2360
+ "bits": 8,
2361
+ "group_size": 64,
2362
+ "mode": "affine"
2363
+ },
2364
+ "model.layers.31.self_attn.gate_proj": {
2365
+ "bits": 5,
2366
+ "group_size": 64,
2367
+ "mode": "affine"
2368
+ },
2369
+ "model.layers.31.self_attn.k_proj": {
2370
+ "bits": 6,
2371
+ "group_size": 64,
2372
+ "mode": "affine"
2373
+ },
2374
+ "model.layers.31.self_attn.o_proj": {
2375
+ "bits": 5,
2376
+ "group_size": 64,
2377
+ "mode": "affine"
2378
+ },
2379
+ "model.layers.31.self_attn.q_proj": {
2380
+ "bits": 6,
2381
+ "group_size": 64,
2382
+ "mode": "affine"
2383
+ },
2384
+ "model.layers.31.self_attn.v_proj": {
2385
+ "bits": 6,
2386
+ "group_size": 64,
2387
+ "mode": "affine"
2388
+ },
2389
+ "model.layers.4.mlp.shared_experts.down_proj": {
2390
+ "bits": 8,
2391
+ "group_size": 64,
2392
+ "mode": "affine"
2393
+ },
2394
+ "model.layers.4.mlp.shared_experts.gate_proj": {
2395
+ "bits": 8,
2396
+ "group_size": 64,
2397
+ "mode": "affine"
2398
+ },
2399
+ "model.layers.4.mlp.shared_experts.up_proj": {
2400
+ "bits": 8,
2401
+ "group_size": 64,
2402
+ "mode": "affine"
2403
+ },
2404
+ "model.layers.4.self_attn.gate_proj": {
2405
+ "bits": 5,
2406
+ "group_size": 64,
2407
+ "mode": "affine"
2408
+ },
2409
+ "model.layers.4.self_attn.k_proj": {
2410
+ "bits": 5,
2411
+ "group_size": 64,
2412
+ "mode": "affine"
2413
+ },
2414
+ "model.layers.4.self_attn.v_proj": {
2415
+ "bits": 5,
2416
+ "group_size": 64,
2417
+ "mode": "affine"
2418
+ },
2419
+ "model.layers.5.mlp.shared_experts.down_proj": {
2420
+ "bits": 8,
2421
+ "group_size": 64,
2422
+ "mode": "affine"
2423
+ },
2424
+ "model.layers.5.mlp.shared_experts.gate_proj": {
2425
+ "bits": 8,
2426
+ "group_size": 64,
2427
+ "mode": "affine"
2428
+ },
2429
+ "model.layers.5.mlp.shared_experts.up_proj": {
2430
+ "bits": 8,
2431
+ "group_size": 64,
2432
+ "mode": "affine"
2433
+ },
2434
+ "model.layers.5.self_attn.gate_proj": {
2435
+ "bits": 5,
2436
+ "group_size": 64,
2437
+ "mode": "affine"
2438
+ },
2439
+ "model.layers.5.self_attn.k_proj": {
2440
+ "bits": 5,
2441
+ "group_size": 64,
2442
+ "mode": "affine"
2443
+ },
2444
+ "model.layers.5.self_attn.o_proj": {
2445
+ "bits": 5,
2446
+ "group_size": 64,
2447
+ "mode": "affine"
2448
+ },
2449
+ "model.layers.5.self_attn.q_proj": {
2450
+ "bits": 5,
2451
+ "group_size": 64,
2452
+ "mode": "affine"
2453
+ },
2454
+ "model.layers.5.self_attn.v_proj": {
2455
+ "bits": 5,
2456
+ "group_size": 64,
2457
+ "mode": "affine"
2458
+ },
2459
+ "model.layers.6.mlp.shared_experts.down_proj": {
2460
+ "bits": 8,
2461
+ "group_size": 64,
2462
+ "mode": "affine"
2463
+ },
2464
+ "model.layers.6.mlp.shared_experts.gate_proj": {
2465
+ "bits": 8,
2466
+ "group_size": 64,
2467
+ "mode": "affine"
2468
+ },
2469
+ "model.layers.6.mlp.shared_experts.up_proj": {
2470
+ "bits": 8,
2471
+ "group_size": 64,
2472
+ "mode": "affine"
2473
+ },
2474
+ "model.layers.6.self_attn.gate_proj": {
2475
+ "bits": 5,
2476
+ "group_size": 64,
2477
+ "mode": "affine"
2478
+ },
2479
+ "model.layers.6.self_attn.k_proj": {
2480
+ "bits": 5,
2481
+ "group_size": 64,
2482
+ "mode": "affine"
2483
+ },
2484
+ "model.layers.6.self_attn.o_proj": {
2485
+ "bits": 5,
2486
+ "group_size": 64,
2487
+ "mode": "affine"
2488
+ },
2489
+ "model.layers.6.self_attn.q_proj": {
2490
+ "bits": 5,
2491
+ "group_size": 64,
2492
+ "mode": "affine"
2493
+ },
2494
+ "model.layers.6.self_attn.v_proj": {
2495
+ "bits": 5,
2496
+ "group_size": 64,
2497
+ "mode": "affine"
2498
+ },
2499
+ "model.layers.7.mlp.shared_experts.down_proj": {
2500
+ "bits": 8,
2501
+ "group_size": 64,
2502
+ "mode": "affine"
2503
+ },
2504
+ "model.layers.7.mlp.shared_experts.gate_proj": {
2505
+ "bits": 8,
2506
+ "group_size": 64,
2507
+ "mode": "affine"
2508
+ },
2509
+ "model.layers.7.mlp.shared_experts.up_proj": {
2510
+ "bits": 8,
2511
+ "group_size": 64,
2512
+ "mode": "affine"
2513
+ },
2514
+ "model.layers.7.self_attn.gate_proj": {
2515
+ "bits": 5,
2516
+ "group_size": 64,
2517
+ "mode": "affine"
2518
+ },
2519
+ "model.layers.7.self_attn.k_proj": {
2520
+ "bits": 5,
2521
+ "group_size": 64,
2522
+ "mode": "affine"
2523
+ },
2524
+ "model.layers.7.self_attn.o_proj": {
2525
+ "bits": 5,
2526
+ "group_size": 64,
2527
+ "mode": "affine"
2528
+ },
2529
+ "model.layers.7.self_attn.q_proj": {
2530
+ "bits": 5,
2531
+ "group_size": 64,
2532
+ "mode": "affine"
2533
+ },
2534
+ "model.layers.7.self_attn.v_proj": {
2535
+ "bits": 5,
2536
+ "group_size": 64,
2537
+ "mode": "affine"
2538
+ },
2539
+ "model.layers.8.mlp.shared_experts.down_proj": {
2540
+ "bits": 8,
2541
+ "group_size": 64,
2542
+ "mode": "affine"
2543
+ },
2544
+ "model.layers.8.mlp.shared_experts.gate_proj": {
2545
+ "bits": 8,
2546
+ "group_size": 64,
2547
+ "mode": "affine"
2548
+ },
2549
+ "model.layers.8.mlp.shared_experts.up_proj": {
2550
+ "bits": 8,
2551
+ "group_size": 64,
2552
+ "mode": "affine"
2553
+ },
2554
+ "model.layers.8.self_attn.gate_proj": {
2555
+ "bits": 5,
2556
+ "group_size": 64,
2557
+ "mode": "affine"
2558
+ },
2559
+ "model.layers.8.self_attn.k_proj": {
2560
+ "bits": 5,
2561
+ "group_size": 64,
2562
+ "mode": "affine"
2563
+ },
2564
+ "model.layers.8.self_attn.o_proj": {
2565
+ "bits": 5,
2566
+ "group_size": 64,
2567
+ "mode": "affine"
2568
+ },
2569
+ "model.layers.8.self_attn.q_proj": {
2570
+ "bits": 5,
2571
+ "group_size": 64,
2572
+ "mode": "affine"
2573
+ },
2574
+ "model.layers.8.self_attn.v_proj": {
2575
+ "bits": 5,
2576
+ "group_size": 64,
2577
+ "mode": "affine"
2578
+ },
2579
+ "model.layers.9.mlp.shared_experts.down_proj": {
2580
+ "bits": 8,
2581
+ "group_size": 64,
2582
+ "mode": "affine"
2583
+ },
2584
+ "model.layers.9.mlp.shared_experts.gate_proj": {
2585
+ "bits": 8,
2586
+ "group_size": 64,
2587
+ "mode": "affine"
2588
+ },
2589
+ "model.layers.9.mlp.shared_experts.up_proj": {
2590
+ "bits": 8,
2591
+ "group_size": 64,
2592
+ "mode": "affine"
2593
+ },
2594
+ "model.layers.9.self_attn.gate_proj": {
2595
+ "bits": 5,
2596
+ "group_size": 64,
2597
+ "mode": "affine"
2598
+ },
2599
+ "model.layers.9.self_attn.k_proj": {
2600
+ "bits": 5,
2601
+ "group_size": 64,
2602
+ "mode": "affine"
2603
+ },
2604
+ "model.layers.9.self_attn.o_proj": {
2605
+ "bits": 5,
2606
+ "group_size": 64,
2607
+ "mode": "affine"
2608
+ },
2609
+ "model.layers.9.self_attn.q_proj": {
2610
+ "bits": 5,
2611
+ "group_size": 64,
2612
+ "mode": "affine"
2613
+ },
2614
+ "model.layers.9.self_attn.v_proj": {
2615
+ "bits": 5,
2616
+ "group_size": 64,
2617
+ "mode": "affine"
2618
+ }
2619
+ }
2620
+ }
configuration_afmoe.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ from transformers.configuration_utils import PretrainedConfig
16
+ from transformers.modeling_rope_utils import rope_config_validation
17
+ from transformers.configuration_utils import layer_type_validation
18
+ from transformers.utils import logging
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
class AfmoeConfig(PretrainedConfig):
    """
    Configuration container for the "afmoe" model type.

    Every constructor argument is stored on the instance under the same name;
    `tie_word_embeddings` and any extra keyword arguments are forwarded to
    `PretrainedConfig.__init__`.

    Derived values:
        layer_types: when not supplied, one entry per hidden layer is generated;
            every `global_attn_every_n_layers`-th layer (counting from 1) is
            `"full_attention"` and all other layers are `"sliding_attention"`.
        num_key_value_heads: falls back to `num_attention_heads` when `None`.
        rope_scaling: if it carries a legacy `"type"` key, that value is copied
            to `"rope_type"` (the dict is updated in place) before validation.

    Args:
        n_group (`int`, *optional*, defaults to 1):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to 1):
            Number of selected groups for each token (for each token, ensuring the selected experts is only within
            `topk_group` groups).
    """

    model_type = "afmoe"
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        num_hidden_layers: int = 32,
        vocab_size: int = 200192,
        hidden_size: int = 2048,
        intermediate_size: int = 6144,
        moe_intermediate_size=1408,
        num_dense_layers=1,
        num_attention_heads=16,
        num_key_value_heads=None,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=16384,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        num_experts=64,
        num_experts_per_tok=6,
        num_shared_experts=2,
        num_expert_groups=1,
        num_limited_groups=1,
        score_func="sigmoid",
        route_norm=True,
        route_scale=1.0,
        global_attn_every_n_layers=4,
        sliding_window=1024,
        mup_enabled=False,
        layer_types=None,
        attention_dropout: float = 0.0,
        n_group: int = 1,
        topk_group: int = 1,
        **kwargs,
    ):
        # --- Core transformer dimensions and numerics ---
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_dense_layers = num_dense_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # --- Mixture-of-experts routing ---
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts = num_experts
        self.num_shared_experts = num_shared_experts
        self.num_expert_groups = num_expert_groups
        self.num_limited_groups = num_limited_groups
        self.score_func = score_func
        self.route_norm = route_norm
        self.route_scale = route_scale

        # --- Attention layout ---
        self.attention_dropout = attention_dropout
        self.global_attn_every_n_layers = global_attn_every_n_layers
        self.sliding_window = sliding_window
        if layer_types is None:
            # Layers are counted from 1: each multiple of `global_attn_every_n_layers`
            # attends globally, the rest use the sliding window.
            layer_types = [
                "full_attention" if layer_number % global_attn_every_n_layers == 0 else "sliding_attention"
                for layer_number in range(1, self.num_hidden_layers + 1)
            ]
        self.layer_types = layer_types
        layer_type_validation(self.layer_types)

        # --- muP ---
        self.mup_enabled = mup_enabled

        # Without an explicit KV head count, use one KV head per attention head.
        self.num_key_value_heads = num_attention_heads if num_key_value_heads is None else num_key_value_heads

        # Mirror the legacy "type" key into "rope_type" before validating the rope settings.
        scaling = self.rope_scaling
        if scaling is not None and "type" in scaling:
            scaling["rope_type"] = scaling["type"]
        rope_config_validation(self)

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
131
+
132
+
133
+ __all__ = ["AfmoeConfig"]
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.57.1",
4
+ "temperature": 0.15,
5
+ "top_p": 0.75,
6
+ "top_k": 50,
7
+ "min_p": 0.06
8
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8440bde0fd6cb42add9571166a5932b4501826a27c196f3ee5ebccec486dc731
3
+ size 5091649223
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28dab33737787970991f5807bebacbfc1633872fae41005223b257596110fe5b
3
+ size 5133840764
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eac27e3d14ec9e02230a7a214260f975269217e59fbf26dd21a74876ef3c507
3
+ size 5133840766
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_afmoe.py ADDED
@@ -0,0 +1,680 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch import nn
6
+
7
+ from transformers.activations import ACT2FN
8
+ from transformers.generation import GenerationMixin
9
+ from transformers.modeling_outputs import (
10
+ MoeCausalLMOutputWithPast,
11
+ MoeModelOutputWithPast,
12
+ )
13
+ from transformers.modeling_utils import PreTrainedModel, ALL_ATTENTION_FUNCTIONS
14
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
15
+ from transformers.masking_utils import (
16
+ create_causal_mask,
17
+ create_sliding_window_causal_mask,
18
+ )
19
+ from transformers.modeling_layers import GradientCheckpointingLayer
20
+ from transformers.processing_utils import Unpack
21
+ from transformers.utils import TransformersKwargs
22
+ from transformers.cache_utils import Cache, DynamicCache
23
+ from transformers.integrations import use_kernel_forward_from_hub
24
+
25
+
26
# Prefer the package-relative import (works when this module is loaded as part
# of a package). When the module has no parent package the relative import
# raises ImportError, so fall back to a plain top-level import. Only ImportError
# is caught: any other failure inside configuration_afmoe should surface as-is.
try:
    from .configuration_afmoe import AfmoeConfig
except ImportError:
    from configuration_afmoe import AfmoeConfig
30
+
31
class AfmoeRotaryEmbedding(nn.Module):
    """
    Produces the cos/sin tables used for rotary position embeddings.

    The inverse frequencies and the attention scaling factor come from the
    initializer registered in `ROPE_INIT_FUNCTIONS` under the configured rope type.
    """

    def __init__(self, config: AfmoeConfig, device=None):
        super().__init__()
        # BC: older configs store the rope type under "type" instead of "rope_type".
        scaling_cfg = getattr(config, "rope_scaling", None)
        if scaling_cfg is None:
            self.rope_type = "default"
        else:
            self.rope_type = scaling_cfg.get("rope_type", scaling_cfg.get("type"))
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        initial_inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", initial_inv_freq, persistent=False)
        # Kept so the dynamic update below can restore the initial frequencies.
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        Recompute `inv_freq` for dynamic RoPE variants when either:
        1 - the requested length exceeds the cached sequence length (allow scaling), or
        2 - the requested length is back inside the original range after the cache
            was grown (avoid losing precision with small sequences).
        """
        needed_len = torch.max(position_ids) + 1

        # Case 1: growth beyond what is cached.
        if needed_len > self.max_seq_len_cached:
            grown_inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=needed_len)
            # Upstream TODO: re-registering the buffer may break with compilation.
            self.register_buffer("inv_freq", grown_inv_freq, persistent=False)
            self.max_seq_len_cached = needed_len

        # Case 2: reset to the original frequencies.
        if needed_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:
            # The registered buffer follows the module across devices, but this plain
            # attribute copy does not, so move it explicitly first.
            self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`."""
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block: outer product of inverse frequencies and positions.
        batch = position_ids.shape[0]
        inv_freq_col = self.inv_freq[None, :, None].float().expand(batch, -1, 1)
        positions_row = position_ids[:, None, :].float()

        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        raw_device = x.device.type
        if isinstance(raw_device, str) and raw_device != "mps":
            autocast_device = raw_device
        else:
            autocast_device = "cpu"
        with torch.autocast(device_type=autocast_device, enabled=False):
            angles = (inv_freq_col.float().to(x.device) @ positions_row.float()).transpose(1, 2)
            table = torch.cat((angles, angles), dim=-1)
            cos = table.cos()
            sin = table.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
91
+
92
+
93
def rotate_half(x):
    """Split the last dimension at its midpoint and return `cat(-second_half, first_half)`."""
    midpoint = x.shape[-1] // 2
    first_half, second_half = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-second_half, first_half), dim=-1)
98
+
99
+
100
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Rotate the query and key tensors with the given rotary cos/sin tables.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension at which a singleton axis is inserted into `cos` and `sin` so they broadcast against
            `q` and `k`. For example, with `cos`/`sin` of shape [batch_size, seq_len, head_dim]: use
            `unsqueeze_dim=1` when `q` and `k` are [batch_size, heads, seq_len, head_dim], and
            `unsqueeze_dim=2` when they are [batch_size, seq_len, heads, head_dim].
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Same formula for queries and keys: x * cos + rotate_half(x) * sin.
        return (states * cos_b) + (rotate_half(states) * sin_b)

    return _rotate(q), _rotate(k)
125
+
126
+
127
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head axis; equivalent to
    torch.repeat_interleave(x, dim=1, repeats=n_rep). The input goes from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
    The input tensor itself is returned when `n_rep == 1`.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the heads.
    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
139
+
140
@use_kernel_forward_from_hub("RMSNorm")
class AfmoeRMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learnable scale."""

    def __init__(self, hidden_size: int, eps: float):
        """
        AfmoeRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize in float32, cast back to the input dtype, then apply the scale."""
        original_dtype = hidden_states.dtype
        as_fp32 = hidden_states.to(torch.float32)
        mean_square = as_fp32.pow(2).mean(-1, keepdim=True)
        normalized = as_fp32 * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
159
+
160
+
161
+
162
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    """
    Plain (non-fused) scaled dot-product attention.

    Keys and values are first repeated `module.num_key_value_groups` times along the
    head axis. When `attention_mask` is given, it is cropped on its last axis to the key
    length and added to the scaled scores. Softmax runs in float32 and is cast back to
    the query dtype; dropout is active only when `module.training` is true.

    Returns:
        `(attn_output, attn_weights)`; `attn_output` has axes 1 and 2 swapped relative to
        the attention product and is made contiguous.
    """
    num_groups = module.num_key_value_groups
    keys = repeat_kv(key, num_groups)
    values = repeat_kv(value, num_groups)

    scores = torch.matmul(query, keys.transpose(2, 3)) * scaling
    if attention_mask is not None:
        scores = scores + attention_mask[:, :, :, : keys.shape[-2]]

    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)

    context = torch.matmul(probs, values).transpose(1, 2).contiguous()
    return context, probs
190
+
191
+
192
class AfmoeMLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""

    def __init__(self, config, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        # A falsy `intermediate_size` (None or 0) selects the config default.
        self.intermediate_size = intermediate_size or config.intermediate_size

        model_dim, inner_dim = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(model_dim, inner_dim, bias=False)
        self.up_proj = nn.Linear(model_dim, inner_dim, bias=False)
        self.down_proj = nn.Linear(inner_dim, model_dim, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        gated = self.act_fn(self.gate_proj(x))
        return self.down_proj(gated * self.up_proj(x))
205
+
206
+
207
class AfmoeTokenChoiceRouter(nn.Module):
    """Token-choice top-K router for MoE routing.

    Each token is scored against every expert by a bias-free linear gate. The
    `top_k` best experts per token are returned together with their routing weights.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.num_experts = config.num_experts
        self.score_func = config.score_func
        self.route_norm = config.route_norm
        self.route_scale = config.route_scale
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)

    def forward(self, hidden_states, expert_bias: Optional[torch.Tensor] = None):
        """
        Args:
            hidden_states: 3-D tensor whose last axis is the hidden dimension; the first
                two axes are flattened into a single token axis.
            expert_bias: optional per-expert offset added to the scores for expert
                *selection* only; the returned weights are always the unbiased scores.

        Returns:
            `(top_scores, selected_experts)`, each of shape (num_tokens, top_k).
        """
        _, _, hidden_dim = hidden_states.shape
        # reshape (not view) so non-contiguous inputs are accepted as well.
        hidden_states = hidden_states.reshape(-1, hidden_dim)

        scores = self.gate(hidden_states)

        # Apply scoring function in float32 for stability
        if self.score_func == "sigmoid":
            scores = torch.sigmoid(scores.to(torch.float32))
        else:
            scores = F.softmax(scores.to(torch.float32), dim=-1)

        if expert_bias is not None:
            # The bias steers which experts are picked but does not change their weights.
            _, selected_experts = torch.topk(scores + expert_bias, k=self.top_k, dim=1)
            top_scores = scores.gather(dim=1, index=selected_experts)
        else:
            top_scores, selected_experts = torch.topk(scores, k=self.top_k, dim=1)

        # Normalize weights if using sigmoid
        if self.score_func == "sigmoid" and self.route_norm:
            # The tiny constant guards against division by zero.
            denominator = top_scores.sum(dim=-1, keepdim=True) + 1e-20
            top_scores = top_scores / denominator

        top_scores = top_scores * self.route_scale
        return top_scores, selected_experts
245
+
246
class AfmoeMoE(nn.Module):
    """
    Mixture-of-experts feed-forward layer.

    A router picks `num_experts_per_tok` experts per token; their weighted outputs are
    added to the output of the optional shared expert MLP (zeros when there is none).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.router = AfmoeTokenChoiceRouter(config)

        # One wide MLP stands in for all shared experts; absent when none are configured.
        self.shared_experts = (
            AfmoeMLP(config, config.moe_intermediate_size * config.num_shared_experts)
            if config.num_shared_experts > 0
            else None
        )
        routed_experts = []
        for _ in range(config.num_experts):
            routed_experts.append(AfmoeMLP(config, intermediate_size=config.moe_intermediate_size))
        self.experts = nn.ModuleList(routed_experts)
        # Non-trainable per-expert bias handed to the router on every forward pass.
        self.expert_bias = nn.Parameter(torch.zeros(config.num_experts, dtype=torch.float32), requires_grad=False)

    def forward(self, hidden_states):
        batch_size, seq_len, hidden_dim = hidden_states.shape
        flat_tokens = hidden_states.view(-1, hidden_dim)

        # Routing decisions: per-token weights and expert ids.
        routing_weights, expert_choices = self.router(hidden_states, self.expert_bias)

        # Base of the result: shared-expert output, or zeros without shared experts.
        if self.shared_experts is None:
            combined = torch.zeros_like(flat_tokens)
        else:
            combined = self.shared_experts(flat_tokens)

        # Sort the flattened (token, slot) assignments by expert id so that each
        # expert's rows are grouped; the stable sort keeps token order within an expert.
        flat_choices = expert_choices.view(-1)
        order = torch.argsort(flat_choices, stable=True)
        sorted_weights = routing_weights.view(-1)[order]
        sorted_experts = flat_choices[order]
        # Each token owns `num_experts_per_tok` consecutive slots, so integer division
        # maps a slot index back to its token row.
        source_token = order // self.config.num_experts_per_tok

        # Gather the input row for every assignment.
        row_index = source_token.unsqueeze(-1).expand(-1, hidden_dim)
        expert_inputs = torch.gather(flat_tokens, dim=0, index=row_index)

        expert_outputs = torch.zeros_like(expert_inputs)
        for expert_id in range(self.config.num_experts):
            rows = sorted_experts == expert_id
            if not rows.any():
                continue
            expert_outputs[rows] = self.experts[expert_id](expert_inputs[rows])

        # Weight in float32, then return to the activation dtype.
        weighted = (expert_outputs.to(torch.float32) * sorted_weights.unsqueeze(-1)).to(hidden_states.dtype)

        # Scatter back to original positions, accumulating onto the shared output.
        combined = combined.scatter_add(dim=0, index=row_index, src=weighted)

        return combined.view(batch_size, seq_len, hidden_dim)
310
+
311
+
312
class AfmoeAttention(nn.Module):
    """Multi-headed attention with local/global pattern and gating.

    Each layer is either a "sliding_attention" (local) layer or some other
    layer type, as given by ``config.layer_types[layer_idx]``. Local layers
    apply rotary position embeddings and pass ``config.sliding_window`` to the
    attention backend; other layers skip the rotary step and pass
    ``sliding_window=None``. The attention output is multiplied element-wise
    by ``sigmoid(gate_proj(hidden_states))`` before the output projection.
    """

    def __init__(self, config: AfmoeConfig, layer_idx: int):
        """Build the projections, per-head norms and output gate.

        Args:
            config: Model configuration. Reads ``hidden_size``,
                ``num_attention_heads``, ``num_key_value_heads``,
                ``attention_dropout``, ``layer_types``, ``sliding_window``,
                ``rms_norm_eps`` and, optionally, ``head_dim``.
            layer_idx: Index of this layer; selects the entry of
                ``config.layer_types`` and is passed to the KV cache on update.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        # Use the derived head size when `head_dim` is missing *or* explicitly
        # set to None; a plain getattr default would let None through.
        self.head_dim = (
            getattr(config, "head_dim", None)
            or config.hidden_size // config.num_attention_heads
        )
        self.num_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads

        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_local_attention = config.layer_types[layer_idx] == "sliding_attention"
        # Only local layers restrict the attention span.
        self.sliding_window = config.sliding_window if self.is_local_attention else None

        self.q_proj = nn.Linear(
            config.hidden_size, self.num_heads * self.head_dim, bias=False
        )
        self.k_proj = nn.Linear(
            config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False
        )
        self.v_proj = nn.Linear(
            config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False
        )
        self.o_proj = nn.Linear(
            self.num_heads * self.head_dim, config.hidden_size, bias=False
        )

        # RMSNorm over the head dimension of queries and keys.
        self.q_norm = AfmoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = AfmoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)

        # Produces the sigmoid gate applied to the attention output.
        self.gate_proj = nn.Linear(
            config.hidden_size, self.num_heads * self.head_dim, bias=False
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Run gated attention over ``hidden_states``.

        Args:
            hidden_states: Input activations; the last dimension is projected
                by the q/k/v/gate linears (assumed (batch, seq, hidden) --
                TODO confirm against caller).
            position_embeddings: ``(cos, sin)`` pair, used only on local
                (sliding-window) layers.
            attention_mask: Mask forwarded unchanged to the attention backend.
            past_key_value: Optional KV cache; when given, it is updated with
                this layer's keys/values and its returned tensors are used.
            cache_position: Forwarded to the cache update as ``cache_position``.
            **kwargs: Forwarded unchanged to the attention backend.

        Returns:
            The output of ``o_proj`` applied to the gated attention result.
        """
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(hidden_shape)
        value_states = self.v_proj(hidden_states).view(hidden_shape)
        gate_states = self.gate_proj(hidden_states)

        # Normalise per head before moving the head axis forward.
        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Rotary embeddings are applied on local-attention layers only.
        if self.is_local_attention:
            cos, sin = position_embeddings
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # Pick the configured backend, defaulting to the eager implementation.
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[
                self.config._attn_implementation
            ]

        output, _ = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask=attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        # `reshape` (rather than `view`) also accepts a non-contiguous backend
        # output; for contiguous tensors it is equivalent to `view`.
        output = output.reshape(*input_shape, -1).contiguous()
        output = output * F.sigmoid(gate_states)
        return self.o_proj(output)
403
+
404
+
405
class AfmoeDecoderLayer(GradientCheckpointingLayer):
    """One decoder block: attention and feed-forward, each wrapped in two norms.

    Both sub-blocks follow the same pattern: normalise, apply the sub-module,
    normalise again, then add the residual. The feed-forward is an
    ``AfmoeMoE`` when ``layer_idx >= config.num_dense_layers`` and an
    ``AfmoeMLP`` otherwise.
    """

    def __init__(self, config: AfmoeConfig, layer_idx: int):
        """Create the attention module, the four norms and the feed-forward.

        Args:
            config: Model configuration (reads ``hidden_size``,
                ``layer_types``, ``rms_norm_eps`` and ``num_dense_layers``).
            layer_idx: Position of this layer in the stack.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.layer_idx = layer_idx

        self.self_attn = AfmoeAttention(config=config, layer_idx=layer_idx)
        # Used by the parent model to pick the matching attention mask.
        self.attention_type = config.layer_types[layer_idx]

        # Dual normalization for attention
        self.input_layernorm = AfmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = AfmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Dual normalization for FFN
        self.pre_mlp_layernorm = AfmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_mlp_layernorm = AfmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # MoE or dense FFN
        self.moe_enabled = layer_idx >= config.num_dense_layers
        if self.moe_enabled:
            self.mlp = AfmoeMoE(config)
        else:
            self.mlp = AfmoeMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        """Apply attention and feed-forward with residual connections.

        Args:
            hidden_states: Input activations for this layer.
            attention_mask: Mask passed to the attention module.
            position_ids: Passed through to the attention module by keyword.
            past_key_value: Optional KV cache passed to the attention module.
            use_cache: Passed through to the attention module by keyword.
            cache_position: Passed to the attention module.
            position_embeddings: ``(cos, sin)`` pair passed to the attention
                module.
            **kwargs: Extra keyword arguments passed to the attention module.

        Returns:
            The layer output, same variable shape as produced by the residual
            additions with ``hidden_states``.
        """
        residual = hidden_states

        # Self Attention with dual normalization
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        # FFN with dual normalization
        residual = hidden_states
        hidden_states = self.pre_mlp_layernorm(hidden_states)

        # `self.mlp` is already the MoE or dense module chosen in __init__,
        # so a single call covers both cases.
        hidden_states = self.mlp(hidden_states)

        hidden_states = self.post_mlp_layernorm(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states
469
+
470
+
471
class AfmoePreTrainedModel(PreTrainedModel):
    """Shared base class carrying the class-level settings for Afmoe models."""

    config_class = AfmoeConfig
    base_model_prefix = "model"

    # Device-placement related settings.
    _no_split_modules = ["AfmoeDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]

    # Module names listed here are the normalisation layers of the model.
    _keep_in_fp32_modules = [
        "input_layernorm", "post_attention_layernorm",
        "pre_mlp_layernorm", "post_mlp_layernorm",
        "q_norm", "k_norm",
        "norm",
    ]

    # Capability flags.
    _supports_sdpa = True
    _supports_attention_backend = True
    supports_gradient_checkpointing = True
488
+
489
+
490
class AfmoeModel(AfmoePreTrainedModel):
    """Decoder stack: token embedding, ``AfmoeDecoderLayer`` list, final norm."""

    _no_split_modules = ["AfmoeDecoderLayer"]

    def __init__(self, config: AfmoeConfig):
        """Build embeddings, decoder layers, final norm and rotary embedding.

        Args:
            config: Model configuration (reads ``pad_token_id``,
                ``vocab_size``, ``hidden_size``, ``num_hidden_layers`` and
                ``rms_norm_eps``).
        """
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                AfmoeDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = AfmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = AfmoeRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with ``value``."""
        self.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeModelOutputWithPast:
        """Run the decoder stack.

        Args:
            input_ids: Token ids. Exactly one of ``input_ids`` and
                ``inputs_embeds`` must be given; it defaults to ``None`` so
                that an ``inputs_embeds``-only call reaches the check below
                instead of failing on a missing argument.
            attention_mask: Either a mask tensor, or a dict keyed by
                ``"full_attention"`` / ``"sliding_attention"`` that has
                already been prepared (in which case it is used as-is).
            position_ids: Positions; derived from ``cache_position`` if None.
            past_key_values: KV cache. A ``DynamicCache`` is created when
                ``use_cache`` is truthy and none is supplied.
            inputs_embeds: Pre-computed embeddings, used instead of
                ``input_ids``.
            use_cache: Whether a cache should be created/used.
            cache_position: Positions of the new tokens; if None, built from
                the cache's current length and the input length.
            **kwargs: Forwarded to every decoder layer.

        Returns:
            ``MoeModelOutputWithPast`` with ``last_hidden_state`` and
            ``past_key_values`` set.

        Raises:
            ValueError: If both or neither of ``input_ids`` and
                ``inputs_embeds`` are provided.
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # It may already have been prepared by e.g. `generate`
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
            }
            # One mask per layer type; each layer picks its own below.
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
            }

        hidden_states = inputs_embeds

        # Apply muP input scaling if enabled
        if self.config.mup_enabled:
            hidden_states = hidden_states * (self.config.hidden_size**0.5)

        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_value=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )
593
+
594
+
595
class AfmoeForCausalLM(AfmoePreTrainedModel, GenerationMixin):
    """``AfmoeModel`` with a linear language-modelling head on top."""

    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        """Create the base model and the bias-free ``lm_head`` projection."""
        super().__init__(config)
        self.model = AfmoeModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the base model's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the base model's token embedding module."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modelling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modelling head."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped base model."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped base model."""
        return self.model

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        token_type_ids: Optional[torch.Tensor] = None,  # will be ignored
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
        """Compute logits (and optionally the loss) for the given inputs.

        Args:
            input_ids: Token ids. Defaults to ``None`` so the model can be
                called with ``inputs_embeds`` only; the base model validates
                that exactly one of the two is supplied.
            attention_mask: Forwarded to the base model.
            position_ids: Forwarded to the base model.
            past_key_values: KV cache forwarded to the base model.
            inputs_embeds: Pre-computed embeddings forwarded to the base model.
            labels: If given, ``self.loss_function`` is called with the
                logits, these labels and ``self.vocab_size``.
            use_cache: Forwarded to the base model.
            cache_position: Forwarded to the base model.
            logits_to_keep: If an int ``k``, logits are computed for the
                slice ``[-k:]`` of the sequence axis (``0`` therefore keeps
                every position); otherwise it is used directly as the index
                into the sequence axis.
            token_type_ids: Accepted and ignored.
            **kwargs: Forwarded to the base model and to the loss function.

        Returns:
            ``MoeCausalLMOutputWithPast`` holding the loss, the logits and the
            corresponding fields of the base model output.
        """
        outputs: MoeModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        return MoeCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )
674
+
675
+
676
+ __all__ = [
677
+ "AfmoeForCausalLM",
678
+ "AfmoeModel",
679
+ "AfmoePreTrainedModel",
680
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|im_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|pad|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0058acf26a6e7228298b0c9fed2a87fcb3f6cb5f84752cfde101b9e68b380918
3
+ size 14614841
tokenizer_config.json ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<|begin_of_text|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<|end_of_text|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "<|im_start|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "3": {
31
+ "content": "<|im_end|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "4": {
39
+ "content": "<|eot_id|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "5": {
47
+ "content": "<|start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "6": {
55
+ "content": "<|channel|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "7": {
63
+ "content": "<|message|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "8": {
71
+ "content": "<|end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "9": {
79
+ "content": "<|fitm_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "10": {
87
+ "content": "<|fitm_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "11": {
95
+ "content": "<|fitm_hole|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "12": {
103
+ "content": "<|pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "13": {
111
+ "content": "<think>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": false
117
+ },
118
+ "14": {
119
+ "content": "</think>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "15": {
127
+ "content": "<|reserved_special_2|>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": true
133
+ },
134
+ "16": {
135
+ "content": "<|reserved_special_3|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": true
141
+ },
142
+ "17": {
143
+ "content": "<|reserved_special_4|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": true
149
+ },
150
+ "18": {
151
+ "content": "<|reserved_special_5|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": true
157
+ },
158
+ "19": {
159
+ "content": "<|reserved_special_6|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": true
165
+ },
166
+ "20": {
167
+ "content": "<|reserved_special_7|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": true
173
+ },
174
+ "21": {
175
+ "content": "<|reserved_special_8|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": true
181
+ },
182
+ "22": {
183
+ "content": "<|reserved_special_9|>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": true
189
+ },
190
+ "23": {
191
+ "content": "<|reserved_special_10|>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": true
197
+ },
198
+ "24": {
199
+ "content": "<|reserved_special_11|>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": true
205
+ },
206
+ "25": {
207
+ "content": "<|reserved_special_12|>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": true
213
+ },
214
+ "26": {
215
+ "content": "<|reserved_special_13|>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "27": {
223
+ "content": "<|reserved_special_14|>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "28": {
231
+ "content": "<|reserved_special_15|>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "29": {
239
+ "content": "<|reserved_special_16|>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "30": {
247
+ "content": "<|reserved_special_17|>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "31": {
255
+ "content": "<|reserved_special_18|>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ }
262
+ },
263
+ "bos_token": "<|begin_of_text|>",
264
+ "clean_up_tokenization_spaces": false,
265
+ "eos_token": "<|im_end|>",
266
+ "extra_special_tokens": {},
267
+ "model_max_length": 65536,
268
+ "pad_token": "<|pad|>",
269
+ "tokenizer_class": "PreTrainedTokenizerFast",
270
+ "use_default_system_prompt": false
271
+ }