wenhuach commited on
Commit
16511cd
·
verified ·
1 Parent(s): b377d30

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. chat_template.jinja +3 -0
  3. config.json +1540 -0
  4. configuration_deepseek.py +199 -0
  5. generation_config.json +9 -0
  6. model-00001-of-00072.safetensors +3 -0
  7. model-00002-of-00072.safetensors +3 -0
  8. model-00003-of-00072.safetensors +3 -0
  9. model-00004-of-00072.safetensors +3 -0
  10. model-00005-of-00072.safetensors +3 -0
  11. model-00006-of-00072.safetensors +3 -0
  12. model-00007-of-00072.safetensors +3 -0
  13. model-00008-of-00072.safetensors +3 -0
  14. model-00009-of-00072.safetensors +3 -0
  15. model-00010-of-00072.safetensors +3 -0
  16. model-00011-of-00072.safetensors +3 -0
  17. model-00012-of-00072.safetensors +3 -0
  18. model-00013-of-00072.safetensors +3 -0
  19. model-00014-of-00072.safetensors +3 -0
  20. model-00015-of-00072.safetensors +3 -0
  21. model-00016-of-00072.safetensors +3 -0
  22. model-00017-of-00072.safetensors +3 -0
  23. model-00018-of-00072.safetensors +3 -0
  24. model-00019-of-00072.safetensors +3 -0
  25. model-00020-of-00072.safetensors +3 -0
  26. model-00021-of-00072.safetensors +3 -0
  27. model-00022-of-00072.safetensors +3 -0
  28. model-00023-of-00072.safetensors +3 -0
  29. model-00024-of-00072.safetensors +3 -0
  30. model-00025-of-00072.safetensors +3 -0
  31. model-00026-of-00072.safetensors +3 -0
  32. model-00027-of-00072.safetensors +3 -0
  33. model-00028-of-00072.safetensors +3 -0
  34. model-00029-of-00072.safetensors +3 -0
  35. model-00030-of-00072.safetensors +3 -0
  36. model-00031-of-00072.safetensors +3 -0
  37. model-00032-of-00072.safetensors +3 -0
  38. model-00033-of-00072.safetensors +3 -0
  39. model-00034-of-00072.safetensors +3 -0
  40. model-00035-of-00072.safetensors +3 -0
  41. model-00036-of-00072.safetensors +3 -0
  42. model-00037-of-00072.safetensors +3 -0
  43. model-00038-of-00072.safetensors +3 -0
  44. model-00039-of-00072.safetensors +3 -0
  45. model-00040-of-00072.safetensors +3 -0
  46. model-00041-of-00072.safetensors +3 -0
  47. model-00042-of-00072.safetensors +3 -0
  48. model-00043-of-00072.safetensors +3 -0
  49. model-00044-of-00072.safetensors +3 -0
  50. model-00045-of-00072.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if not thinking is defined %}{% set thinking = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, system_prompt='', is_first_sp=true, is_last_user=false) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '
2
+
3
+ ' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{%- set ns.is_first = false -%}{%- set ns.is_last_user = true -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}{%- if ns.is_last_user %}{{'<|Assistant|></think>'}}{%- endif %}{%- set ns.is_last_user = false -%}{%- set ns.is_first = false %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|tool▁calls▁begin|><|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- else %}{{message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'<|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none) %}{%- if ns.is_last_user %}{{'<|Assistant|>'}}{%- if message['prefix'] is defined and message['prefix'] and thinking %}{{'<think>'}} {%- else %}{{'</think>'}}{%- endif %}{%- endif %}{%- set ns.is_last_user = false -%}{%- if ns.is_tool %}{{message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{%- set content = message['content'] -%}{%- if '</think>' in content %}{%- set content = content.split('</think>', 1)[1] -%}{%- endif %}{{content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_last_user = false -%}{%- set ns.is_tool = true 
-%}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endfor -%}{%- if add_generation_prompt and ns.is_last_user and not ns.is_tool %}{{'<|Assistant|>'}}{%- if not thinking %}{{'</think>'}}{%- else %}{{'<think>'}}{%- endif %}{% endif %}
config.json ADDED
@@ -0,0 +1,1540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DeepseekV3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "attn_module_list_cfg": [],
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_deepseek.DeepseekV3Config",
10
+ "AutoModel": "modeling_deepseek.DeepseekV3Model",
11
+ "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
12
+ },
13
+ "bos_token_id": 0,
14
+ "dtype": "bfloat16",
15
+ "eos_token_id": 1,
16
+ "ep_size": 1,
17
+ "first_k_dense_replace": 3,
18
+ "head_dim": 64,
19
+ "hidden_act": "silu",
20
+ "hidden_size": 7168,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 18432,
23
+ "kv_lora_rank": 512,
24
+ "max_position_embeddings": 163840,
25
+ "model_type": "deepseek_v3",
26
+ "moe_intermediate_size": 2048,
27
+ "moe_layer_freq": 1,
28
+ "n_group": 8,
29
+ "n_routed_experts": 256,
30
+ "n_shared_experts": 1,
31
+ "norm_topk_prob": true,
32
+ "num_attention_heads": 128,
33
+ "num_experts_per_tok": 8,
34
+ "num_hidden_layers": 61,
35
+ "num_key_value_heads": 128,
36
+ "num_nextn_predict_layers": 1,
37
+ "pretraining_tp": 1,
38
+ "q_lora_rank": 1536,
39
+ "qk_head_dim": 192,
40
+ "qk_nope_head_dim": 128,
41
+ "qk_rope_head_dim": 64,
42
+ "quantization_config": {
43
+ "autoround_version": "0.8.0.dev",
44
+ "bits": 4,
45
+ "data_type": "int",
46
+ "extra_config": {
47
+ "model.layers.0.mlp.down_proj": {
48
+ "bits": 8
49
+ },
50
+ "model.layers.0.mlp.gate_proj": {
51
+ "bits": 8
52
+ },
53
+ "model.layers.0.mlp.up_proj": {
54
+ "bits": 8
55
+ },
56
+ "model.layers.0.self_attn.kv_a_proj_with_mqa": {
57
+ "bits": 8
58
+ },
59
+ "model.layers.0.self_attn.kv_b_proj": {
60
+ "bits": 8
61
+ },
62
+ "model.layers.0.self_attn.o_proj": {
63
+ "bits": 8
64
+ },
65
+ "model.layers.0.self_attn.q_a_proj": {
66
+ "bits": 8
67
+ },
68
+ "model.layers.0.self_attn.q_b_proj": {
69
+ "bits": 8
70
+ },
71
+ "model.layers.1.mlp.down_proj": {
72
+ "bits": 8
73
+ },
74
+ "model.layers.1.mlp.gate_proj": {
75
+ "bits": 8
76
+ },
77
+ "model.layers.1.mlp.up_proj": {
78
+ "bits": 8
79
+ },
80
+ "model.layers.1.self_attn.kv_a_proj_with_mqa": {
81
+ "bits": 8
82
+ },
83
+ "model.layers.1.self_attn.kv_b_proj": {
84
+ "bits": 8
85
+ },
86
+ "model.layers.1.self_attn.o_proj": {
87
+ "bits": 8
88
+ },
89
+ "model.layers.1.self_attn.q_a_proj": {
90
+ "bits": 8
91
+ },
92
+ "model.layers.1.self_attn.q_b_proj": {
93
+ "bits": 8
94
+ },
95
+ "model.layers.10.mlp.shared_experts.down_proj": {
96
+ "bits": 8
97
+ },
98
+ "model.layers.10.mlp.shared_experts.gate_proj": {
99
+ "bits": 8
100
+ },
101
+ "model.layers.10.mlp.shared_experts.up_proj": {
102
+ "bits": 8
103
+ },
104
+ "model.layers.10.self_attn.kv_a_proj_with_mqa": {
105
+ "bits": 8
106
+ },
107
+ "model.layers.10.self_attn.kv_b_proj": {
108
+ "bits": 8
109
+ },
110
+ "model.layers.10.self_attn.o_proj": {
111
+ "bits": 8
112
+ },
113
+ "model.layers.10.self_attn.q_a_proj": {
114
+ "bits": 8
115
+ },
116
+ "model.layers.10.self_attn.q_b_proj": {
117
+ "bits": 8
118
+ },
119
+ "model.layers.11.mlp.shared_experts.down_proj": {
120
+ "bits": 8
121
+ },
122
+ "model.layers.11.mlp.shared_experts.gate_proj": {
123
+ "bits": 8
124
+ },
125
+ "model.layers.11.mlp.shared_experts.up_proj": {
126
+ "bits": 8
127
+ },
128
+ "model.layers.11.self_attn.kv_a_proj_with_mqa": {
129
+ "bits": 8
130
+ },
131
+ "model.layers.11.self_attn.kv_b_proj": {
132
+ "bits": 8
133
+ },
134
+ "model.layers.11.self_attn.o_proj": {
135
+ "bits": 8
136
+ },
137
+ "model.layers.11.self_attn.q_a_proj": {
138
+ "bits": 8
139
+ },
140
+ "model.layers.11.self_attn.q_b_proj": {
141
+ "bits": 8
142
+ },
143
+ "model.layers.12.mlp.shared_experts.down_proj": {
144
+ "bits": 8
145
+ },
146
+ "model.layers.12.mlp.shared_experts.gate_proj": {
147
+ "bits": 8
148
+ },
149
+ "model.layers.12.mlp.shared_experts.up_proj": {
150
+ "bits": 8
151
+ },
152
+ "model.layers.12.self_attn.kv_a_proj_with_mqa": {
153
+ "bits": 8
154
+ },
155
+ "model.layers.12.self_attn.kv_b_proj": {
156
+ "bits": 8
157
+ },
158
+ "model.layers.12.self_attn.o_proj": {
159
+ "bits": 8
160
+ },
161
+ "model.layers.12.self_attn.q_a_proj": {
162
+ "bits": 8
163
+ },
164
+ "model.layers.12.self_attn.q_b_proj": {
165
+ "bits": 8
166
+ },
167
+ "model.layers.13.mlp.shared_experts.down_proj": {
168
+ "bits": 8
169
+ },
170
+ "model.layers.13.mlp.shared_experts.gate_proj": {
171
+ "bits": 8
172
+ },
173
+ "model.layers.13.mlp.shared_experts.up_proj": {
174
+ "bits": 8
175
+ },
176
+ "model.layers.13.self_attn.kv_a_proj_with_mqa": {
177
+ "bits": 8
178
+ },
179
+ "model.layers.13.self_attn.kv_b_proj": {
180
+ "bits": 8
181
+ },
182
+ "model.layers.13.self_attn.o_proj": {
183
+ "bits": 8
184
+ },
185
+ "model.layers.13.self_attn.q_a_proj": {
186
+ "bits": 8
187
+ },
188
+ "model.layers.13.self_attn.q_b_proj": {
189
+ "bits": 8
190
+ },
191
+ "model.layers.14.mlp.shared_experts.down_proj": {
192
+ "bits": 8
193
+ },
194
+ "model.layers.14.mlp.shared_experts.gate_proj": {
195
+ "bits": 8
196
+ },
197
+ "model.layers.14.mlp.shared_experts.up_proj": {
198
+ "bits": 8
199
+ },
200
+ "model.layers.14.self_attn.kv_a_proj_with_mqa": {
201
+ "bits": 8
202
+ },
203
+ "model.layers.14.self_attn.kv_b_proj": {
204
+ "bits": 8
205
+ },
206
+ "model.layers.14.self_attn.o_proj": {
207
+ "bits": 8
208
+ },
209
+ "model.layers.14.self_attn.q_a_proj": {
210
+ "bits": 8
211
+ },
212
+ "model.layers.14.self_attn.q_b_proj": {
213
+ "bits": 8
214
+ },
215
+ "model.layers.15.mlp.shared_experts.down_proj": {
216
+ "bits": 8
217
+ },
218
+ "model.layers.15.mlp.shared_experts.gate_proj": {
219
+ "bits": 8
220
+ },
221
+ "model.layers.15.mlp.shared_experts.up_proj": {
222
+ "bits": 8
223
+ },
224
+ "model.layers.15.self_attn.kv_a_proj_with_mqa": {
225
+ "bits": 8
226
+ },
227
+ "model.layers.15.self_attn.kv_b_proj": {
228
+ "bits": 8
229
+ },
230
+ "model.layers.15.self_attn.o_proj": {
231
+ "bits": 8
232
+ },
233
+ "model.layers.15.self_attn.q_a_proj": {
234
+ "bits": 8
235
+ },
236
+ "model.layers.15.self_attn.q_b_proj": {
237
+ "bits": 8
238
+ },
239
+ "model.layers.16.mlp.shared_experts.down_proj": {
240
+ "bits": 8
241
+ },
242
+ "model.layers.16.mlp.shared_experts.gate_proj": {
243
+ "bits": 8
244
+ },
245
+ "model.layers.16.mlp.shared_experts.up_proj": {
246
+ "bits": 8
247
+ },
248
+ "model.layers.16.self_attn.kv_a_proj_with_mqa": {
249
+ "bits": 8
250
+ },
251
+ "model.layers.16.self_attn.kv_b_proj": {
252
+ "bits": 8
253
+ },
254
+ "model.layers.16.self_attn.o_proj": {
255
+ "bits": 8
256
+ },
257
+ "model.layers.16.self_attn.q_a_proj": {
258
+ "bits": 8
259
+ },
260
+ "model.layers.16.self_attn.q_b_proj": {
261
+ "bits": 8
262
+ },
263
+ "model.layers.17.mlp.shared_experts.down_proj": {
264
+ "bits": 8
265
+ },
266
+ "model.layers.17.mlp.shared_experts.gate_proj": {
267
+ "bits": 8
268
+ },
269
+ "model.layers.17.mlp.shared_experts.up_proj": {
270
+ "bits": 8
271
+ },
272
+ "model.layers.17.self_attn.kv_a_proj_with_mqa": {
273
+ "bits": 8
274
+ },
275
+ "model.layers.17.self_attn.kv_b_proj": {
276
+ "bits": 8
277
+ },
278
+ "model.layers.17.self_attn.o_proj": {
279
+ "bits": 8
280
+ },
281
+ "model.layers.17.self_attn.q_a_proj": {
282
+ "bits": 8
283
+ },
284
+ "model.layers.17.self_attn.q_b_proj": {
285
+ "bits": 8
286
+ },
287
+ "model.layers.18.mlp.shared_experts.down_proj": {
288
+ "bits": 8
289
+ },
290
+ "model.layers.18.mlp.shared_experts.gate_proj": {
291
+ "bits": 8
292
+ },
293
+ "model.layers.18.mlp.shared_experts.up_proj": {
294
+ "bits": 8
295
+ },
296
+ "model.layers.18.self_attn.kv_a_proj_with_mqa": {
297
+ "bits": 8
298
+ },
299
+ "model.layers.18.self_attn.kv_b_proj": {
300
+ "bits": 8
301
+ },
302
+ "model.layers.18.self_attn.o_proj": {
303
+ "bits": 8
304
+ },
305
+ "model.layers.18.self_attn.q_a_proj": {
306
+ "bits": 8
307
+ },
308
+ "model.layers.18.self_attn.q_b_proj": {
309
+ "bits": 8
310
+ },
311
+ "model.layers.19.mlp.shared_experts.down_proj": {
312
+ "bits": 8
313
+ },
314
+ "model.layers.19.mlp.shared_experts.gate_proj": {
315
+ "bits": 8
316
+ },
317
+ "model.layers.19.mlp.shared_experts.up_proj": {
318
+ "bits": 8
319
+ },
320
+ "model.layers.19.self_attn.kv_a_proj_with_mqa": {
321
+ "bits": 8
322
+ },
323
+ "model.layers.19.self_attn.kv_b_proj": {
324
+ "bits": 8
325
+ },
326
+ "model.layers.19.self_attn.o_proj": {
327
+ "bits": 8
328
+ },
329
+ "model.layers.19.self_attn.q_a_proj": {
330
+ "bits": 8
331
+ },
332
+ "model.layers.19.self_attn.q_b_proj": {
333
+ "bits": 8
334
+ },
335
+ "model.layers.2.mlp.down_proj": {
336
+ "bits": 8
337
+ },
338
+ "model.layers.2.mlp.gate_proj": {
339
+ "bits": 8
340
+ },
341
+ "model.layers.2.mlp.up_proj": {
342
+ "bits": 8
343
+ },
344
+ "model.layers.2.self_attn.kv_a_proj_with_mqa": {
345
+ "bits": 8
346
+ },
347
+ "model.layers.2.self_attn.kv_b_proj": {
348
+ "bits": 8
349
+ },
350
+ "model.layers.2.self_attn.o_proj": {
351
+ "bits": 8
352
+ },
353
+ "model.layers.2.self_attn.q_a_proj": {
354
+ "bits": 8
355
+ },
356
+ "model.layers.2.self_attn.q_b_proj": {
357
+ "bits": 8
358
+ },
359
+ "model.layers.20.mlp.shared_experts.down_proj": {
360
+ "bits": 8
361
+ },
362
+ "model.layers.20.mlp.shared_experts.gate_proj": {
363
+ "bits": 8
364
+ },
365
+ "model.layers.20.mlp.shared_experts.up_proj": {
366
+ "bits": 8
367
+ },
368
+ "model.layers.20.self_attn.kv_a_proj_with_mqa": {
369
+ "bits": 8
370
+ },
371
+ "model.layers.20.self_attn.kv_b_proj": {
372
+ "bits": 8
373
+ },
374
+ "model.layers.20.self_attn.o_proj": {
375
+ "bits": 8
376
+ },
377
+ "model.layers.20.self_attn.q_a_proj": {
378
+ "bits": 8
379
+ },
380
+ "model.layers.20.self_attn.q_b_proj": {
381
+ "bits": 8
382
+ },
383
+ "model.layers.21.mlp.shared_experts.down_proj": {
384
+ "bits": 8
385
+ },
386
+ "model.layers.21.mlp.shared_experts.gate_proj": {
387
+ "bits": 8
388
+ },
389
+ "model.layers.21.mlp.shared_experts.up_proj": {
390
+ "bits": 8
391
+ },
392
+ "model.layers.21.self_attn.kv_a_proj_with_mqa": {
393
+ "bits": 8
394
+ },
395
+ "model.layers.21.self_attn.kv_b_proj": {
396
+ "bits": 8
397
+ },
398
+ "model.layers.21.self_attn.o_proj": {
399
+ "bits": 8
400
+ },
401
+ "model.layers.21.self_attn.q_a_proj": {
402
+ "bits": 8
403
+ },
404
+ "model.layers.21.self_attn.q_b_proj": {
405
+ "bits": 8
406
+ },
407
+ "model.layers.22.mlp.shared_experts.down_proj": {
408
+ "bits": 8
409
+ },
410
+ "model.layers.22.mlp.shared_experts.gate_proj": {
411
+ "bits": 8
412
+ },
413
+ "model.layers.22.mlp.shared_experts.up_proj": {
414
+ "bits": 8
415
+ },
416
+ "model.layers.22.self_attn.kv_a_proj_with_mqa": {
417
+ "bits": 8
418
+ },
419
+ "model.layers.22.self_attn.kv_b_proj": {
420
+ "bits": 8
421
+ },
422
+ "model.layers.22.self_attn.o_proj": {
423
+ "bits": 8
424
+ },
425
+ "model.layers.22.self_attn.q_a_proj": {
426
+ "bits": 8
427
+ },
428
+ "model.layers.22.self_attn.q_b_proj": {
429
+ "bits": 8
430
+ },
431
+ "model.layers.23.mlp.shared_experts.down_proj": {
432
+ "bits": 8
433
+ },
434
+ "model.layers.23.mlp.shared_experts.gate_proj": {
435
+ "bits": 8
436
+ },
437
+ "model.layers.23.mlp.shared_experts.up_proj": {
438
+ "bits": 8
439
+ },
440
+ "model.layers.23.self_attn.kv_a_proj_with_mqa": {
441
+ "bits": 8
442
+ },
443
+ "model.layers.23.self_attn.kv_b_proj": {
444
+ "bits": 8
445
+ },
446
+ "model.layers.23.self_attn.o_proj": {
447
+ "bits": 8
448
+ },
449
+ "model.layers.23.self_attn.q_a_proj": {
450
+ "bits": 8
451
+ },
452
+ "model.layers.23.self_attn.q_b_proj": {
453
+ "bits": 8
454
+ },
455
+ "model.layers.24.mlp.shared_experts.down_proj": {
456
+ "bits": 8
457
+ },
458
+ "model.layers.24.mlp.shared_experts.gate_proj": {
459
+ "bits": 8
460
+ },
461
+ "model.layers.24.mlp.shared_experts.up_proj": {
462
+ "bits": 8
463
+ },
464
+ "model.layers.24.self_attn.kv_a_proj_with_mqa": {
465
+ "bits": 8
466
+ },
467
+ "model.layers.24.self_attn.kv_b_proj": {
468
+ "bits": 8
469
+ },
470
+ "model.layers.24.self_attn.o_proj": {
471
+ "bits": 8
472
+ },
473
+ "model.layers.24.self_attn.q_a_proj": {
474
+ "bits": 8
475
+ },
476
+ "model.layers.24.self_attn.q_b_proj": {
477
+ "bits": 8
478
+ },
479
+ "model.layers.25.mlp.shared_experts.down_proj": {
480
+ "bits": 8
481
+ },
482
+ "model.layers.25.mlp.shared_experts.gate_proj": {
483
+ "bits": 8
484
+ },
485
+ "model.layers.25.mlp.shared_experts.up_proj": {
486
+ "bits": 8
487
+ },
488
+ "model.layers.25.self_attn.kv_a_proj_with_mqa": {
489
+ "bits": 8
490
+ },
491
+ "model.layers.25.self_attn.kv_b_proj": {
492
+ "bits": 8
493
+ },
494
+ "model.layers.25.self_attn.o_proj": {
495
+ "bits": 8
496
+ },
497
+ "model.layers.25.self_attn.q_a_proj": {
498
+ "bits": 8
499
+ },
500
+ "model.layers.25.self_attn.q_b_proj": {
501
+ "bits": 8
502
+ },
503
+ "model.layers.26.mlp.shared_experts.down_proj": {
504
+ "bits": 8
505
+ },
506
+ "model.layers.26.mlp.shared_experts.gate_proj": {
507
+ "bits": 8
508
+ },
509
+ "model.layers.26.mlp.shared_experts.up_proj": {
510
+ "bits": 8
511
+ },
512
+ "model.layers.26.self_attn.kv_a_proj_with_mqa": {
513
+ "bits": 8
514
+ },
515
+ "model.layers.26.self_attn.kv_b_proj": {
516
+ "bits": 8
517
+ },
518
+ "model.layers.26.self_attn.o_proj": {
519
+ "bits": 8
520
+ },
521
+ "model.layers.26.self_attn.q_a_proj": {
522
+ "bits": 8
523
+ },
524
+ "model.layers.26.self_attn.q_b_proj": {
525
+ "bits": 8
526
+ },
527
+ "model.layers.27.mlp.shared_experts.down_proj": {
528
+ "bits": 8
529
+ },
530
+ "model.layers.27.mlp.shared_experts.gate_proj": {
531
+ "bits": 8
532
+ },
533
+ "model.layers.27.mlp.shared_experts.up_proj": {
534
+ "bits": 8
535
+ },
536
+ "model.layers.27.self_attn.kv_a_proj_with_mqa": {
537
+ "bits": 8
538
+ },
539
+ "model.layers.27.self_attn.kv_b_proj": {
540
+ "bits": 8
541
+ },
542
+ "model.layers.27.self_attn.o_proj": {
543
+ "bits": 8
544
+ },
545
+ "model.layers.27.self_attn.q_a_proj": {
546
+ "bits": 8
547
+ },
548
+ "model.layers.27.self_attn.q_b_proj": {
549
+ "bits": 8
550
+ },
551
+ "model.layers.28.mlp.shared_experts.down_proj": {
552
+ "bits": 8
553
+ },
554
+ "model.layers.28.mlp.shared_experts.gate_proj": {
555
+ "bits": 8
556
+ },
557
+ "model.layers.28.mlp.shared_experts.up_proj": {
558
+ "bits": 8
559
+ },
560
+ "model.layers.28.self_attn.kv_a_proj_with_mqa": {
561
+ "bits": 8
562
+ },
563
+ "model.layers.28.self_attn.kv_b_proj": {
564
+ "bits": 8
565
+ },
566
+ "model.layers.28.self_attn.o_proj": {
567
+ "bits": 8
568
+ },
569
+ "model.layers.28.self_attn.q_a_proj": {
570
+ "bits": 8
571
+ },
572
+ "model.layers.28.self_attn.q_b_proj": {
573
+ "bits": 8
574
+ },
575
+ "model.layers.29.mlp.shared_experts.down_proj": {
576
+ "bits": 8
577
+ },
578
+ "model.layers.29.mlp.shared_experts.gate_proj": {
579
+ "bits": 8
580
+ },
581
+ "model.layers.29.mlp.shared_experts.up_proj": {
582
+ "bits": 8
583
+ },
584
+ "model.layers.29.self_attn.kv_a_proj_with_mqa": {
585
+ "bits": 8
586
+ },
587
+ "model.layers.29.self_attn.kv_b_proj": {
588
+ "bits": 8
589
+ },
590
+ "model.layers.29.self_attn.o_proj": {
591
+ "bits": 8
592
+ },
593
+ "model.layers.29.self_attn.q_a_proj": {
594
+ "bits": 8
595
+ },
596
+ "model.layers.29.self_attn.q_b_proj": {
597
+ "bits": 8
598
+ },
599
+ "model.layers.3.mlp.shared_experts.down_proj": {
600
+ "bits": 8
601
+ },
602
+ "model.layers.3.mlp.shared_experts.gate_proj": {
603
+ "bits": 8
604
+ },
605
+ "model.layers.3.mlp.shared_experts.up_proj": {
606
+ "bits": 8
607
+ },
608
+ "model.layers.3.self_attn.kv_a_proj_with_mqa": {
609
+ "bits": 8
610
+ },
611
+ "model.layers.3.self_attn.kv_b_proj": {
612
+ "bits": 8
613
+ },
614
+ "model.layers.3.self_attn.o_proj": {
615
+ "bits": 8
616
+ },
617
+ "model.layers.3.self_attn.q_a_proj": {
618
+ "bits": 8
619
+ },
620
+ "model.layers.3.self_attn.q_b_proj": {
621
+ "bits": 8
622
+ },
623
+ "model.layers.30.mlp.shared_experts.down_proj": {
624
+ "bits": 8
625
+ },
626
+ "model.layers.30.mlp.shared_experts.gate_proj": {
627
+ "bits": 8
628
+ },
629
+ "model.layers.30.mlp.shared_experts.up_proj": {
630
+ "bits": 8
631
+ },
632
+ "model.layers.30.self_attn.kv_a_proj_with_mqa": {
633
+ "bits": 8
634
+ },
635
+ "model.layers.30.self_attn.kv_b_proj": {
636
+ "bits": 8
637
+ },
638
+ "model.layers.30.self_attn.o_proj": {
639
+ "bits": 8
640
+ },
641
+ "model.layers.30.self_attn.q_a_proj": {
642
+ "bits": 8
643
+ },
644
+ "model.layers.30.self_attn.q_b_proj": {
645
+ "bits": 8
646
+ },
647
+ "model.layers.31.mlp.shared_experts.down_proj": {
648
+ "bits": 8
649
+ },
650
+ "model.layers.31.mlp.shared_experts.gate_proj": {
651
+ "bits": 8
652
+ },
653
+ "model.layers.31.mlp.shared_experts.up_proj": {
654
+ "bits": 8
655
+ },
656
+ "model.layers.31.self_attn.kv_a_proj_with_mqa": {
657
+ "bits": 8
658
+ },
659
+ "model.layers.31.self_attn.kv_b_proj": {
660
+ "bits": 8
661
+ },
662
+ "model.layers.31.self_attn.o_proj": {
663
+ "bits": 8
664
+ },
665
+ "model.layers.31.self_attn.q_a_proj": {
666
+ "bits": 8
667
+ },
668
+ "model.layers.31.self_attn.q_b_proj": {
669
+ "bits": 8
670
+ },
671
+ "model.layers.32.mlp.shared_experts.down_proj": {
672
+ "bits": 8
673
+ },
674
+ "model.layers.32.mlp.shared_experts.gate_proj": {
675
+ "bits": 8
676
+ },
677
+ "model.layers.32.mlp.shared_experts.up_proj": {
678
+ "bits": 8
679
+ },
680
+ "model.layers.32.self_attn.kv_a_proj_with_mqa": {
681
+ "bits": 8
682
+ },
683
+ "model.layers.32.self_attn.kv_b_proj": {
684
+ "bits": 8
685
+ },
686
+ "model.layers.32.self_attn.o_proj": {
687
+ "bits": 8
688
+ },
689
+ "model.layers.32.self_attn.q_a_proj": {
690
+ "bits": 8
691
+ },
692
+ "model.layers.32.self_attn.q_b_proj": {
693
+ "bits": 8
694
+ },
695
+ "model.layers.33.mlp.shared_experts.down_proj": {
696
+ "bits": 8
697
+ },
698
+ "model.layers.33.mlp.shared_experts.gate_proj": {
699
+ "bits": 8
700
+ },
701
+ "model.layers.33.mlp.shared_experts.up_proj": {
702
+ "bits": 8
703
+ },
704
+ "model.layers.33.self_attn.kv_a_proj_with_mqa": {
705
+ "bits": 8
706
+ },
707
+ "model.layers.33.self_attn.kv_b_proj": {
708
+ "bits": 8
709
+ },
710
+ "model.layers.33.self_attn.o_proj": {
711
+ "bits": 8
712
+ },
713
+ "model.layers.33.self_attn.q_a_proj": {
714
+ "bits": 8
715
+ },
716
+ "model.layers.33.self_attn.q_b_proj": {
717
+ "bits": 8
718
+ },
719
+ "model.layers.34.mlp.shared_experts.down_proj": {
720
+ "bits": 8
721
+ },
722
+ "model.layers.34.mlp.shared_experts.gate_proj": {
723
+ "bits": 8
724
+ },
725
+ "model.layers.34.mlp.shared_experts.up_proj": {
726
+ "bits": 8
727
+ },
728
+ "model.layers.34.self_attn.kv_a_proj_with_mqa": {
729
+ "bits": 8
730
+ },
731
+ "model.layers.34.self_attn.kv_b_proj": {
732
+ "bits": 8
733
+ },
734
+ "model.layers.34.self_attn.o_proj": {
735
+ "bits": 8
736
+ },
737
+ "model.layers.34.self_attn.q_a_proj": {
738
+ "bits": 8
739
+ },
740
+ "model.layers.34.self_attn.q_b_proj": {
741
+ "bits": 8
742
+ },
743
+ "model.layers.35.mlp.shared_experts.down_proj": {
744
+ "bits": 8
745
+ },
746
+ "model.layers.35.mlp.shared_experts.gate_proj": {
747
+ "bits": 8
748
+ },
749
+ "model.layers.35.mlp.shared_experts.up_proj": {
750
+ "bits": 8
751
+ },
752
+ "model.layers.35.self_attn.kv_a_proj_with_mqa": {
753
+ "bits": 8
754
+ },
755
+ "model.layers.35.self_attn.kv_b_proj": {
756
+ "bits": 8
757
+ },
758
+ "model.layers.35.self_attn.o_proj": {
759
+ "bits": 8
760
+ },
761
+ "model.layers.35.self_attn.q_a_proj": {
762
+ "bits": 8
763
+ },
764
+ "model.layers.35.self_attn.q_b_proj": {
765
+ "bits": 8
766
+ },
767
+ "model.layers.36.mlp.shared_experts.down_proj": {
768
+ "bits": 8
769
+ },
770
+ "model.layers.36.mlp.shared_experts.gate_proj": {
771
+ "bits": 8
772
+ },
773
+ "model.layers.36.mlp.shared_experts.up_proj": {
774
+ "bits": 8
775
+ },
776
+ "model.layers.36.self_attn.kv_a_proj_with_mqa": {
777
+ "bits": 8
778
+ },
779
+ "model.layers.36.self_attn.kv_b_proj": {
780
+ "bits": 8
781
+ },
782
+ "model.layers.36.self_attn.o_proj": {
783
+ "bits": 8
784
+ },
785
+ "model.layers.36.self_attn.q_a_proj": {
786
+ "bits": 8
787
+ },
788
+ "model.layers.36.self_attn.q_b_proj": {
789
+ "bits": 8
790
+ },
791
+ "model.layers.37.mlp.shared_experts.down_proj": {
792
+ "bits": 8
793
+ },
794
+ "model.layers.37.mlp.shared_experts.gate_proj": {
795
+ "bits": 8
796
+ },
797
+ "model.layers.37.mlp.shared_experts.up_proj": {
798
+ "bits": 8
799
+ },
800
+ "model.layers.37.self_attn.kv_a_proj_with_mqa": {
801
+ "bits": 8
802
+ },
803
+ "model.layers.37.self_attn.kv_b_proj": {
804
+ "bits": 8
805
+ },
806
+ "model.layers.37.self_attn.o_proj": {
807
+ "bits": 8
808
+ },
809
+ "model.layers.37.self_attn.q_a_proj": {
810
+ "bits": 8
811
+ },
812
+ "model.layers.37.self_attn.q_b_proj": {
813
+ "bits": 8
814
+ },
815
+ "model.layers.38.mlp.shared_experts.down_proj": {
816
+ "bits": 8
817
+ },
818
+ "model.layers.38.mlp.shared_experts.gate_proj": {
819
+ "bits": 8
820
+ },
821
+ "model.layers.38.mlp.shared_experts.up_proj": {
822
+ "bits": 8
823
+ },
824
+ "model.layers.38.self_attn.kv_a_proj_with_mqa": {
825
+ "bits": 8
826
+ },
827
+ "model.layers.38.self_attn.kv_b_proj": {
828
+ "bits": 8
829
+ },
830
+ "model.layers.38.self_attn.o_proj": {
831
+ "bits": 8
832
+ },
833
+ "model.layers.38.self_attn.q_a_proj": {
834
+ "bits": 8
835
+ },
836
+ "model.layers.38.self_attn.q_b_proj": {
837
+ "bits": 8
838
+ },
839
+ "model.layers.39.mlp.shared_experts.down_proj": {
840
+ "bits": 8
841
+ },
842
+ "model.layers.39.mlp.shared_experts.gate_proj": {
843
+ "bits": 8
844
+ },
845
+ "model.layers.39.mlp.shared_experts.up_proj": {
846
+ "bits": 8
847
+ },
848
+ "model.layers.39.self_attn.kv_a_proj_with_mqa": {
849
+ "bits": 8
850
+ },
851
+ "model.layers.39.self_attn.kv_b_proj": {
852
+ "bits": 8
853
+ },
854
+ "model.layers.39.self_attn.o_proj": {
855
+ "bits": 8
856
+ },
857
+ "model.layers.39.self_attn.q_a_proj": {
858
+ "bits": 8
859
+ },
860
+ "model.layers.39.self_attn.q_b_proj": {
861
+ "bits": 8
862
+ },
863
+ "model.layers.4.mlp.shared_experts.down_proj": {
864
+ "bits": 8
865
+ },
866
+ "model.layers.4.mlp.shared_experts.gate_proj": {
867
+ "bits": 8
868
+ },
869
+ "model.layers.4.mlp.shared_experts.up_proj": {
870
+ "bits": 8
871
+ },
872
+ "model.layers.4.self_attn.kv_a_proj_with_mqa": {
873
+ "bits": 8
874
+ },
875
+ "model.layers.4.self_attn.kv_b_proj": {
876
+ "bits": 8
877
+ },
878
+ "model.layers.4.self_attn.o_proj": {
879
+ "bits": 8
880
+ },
881
+ "model.layers.4.self_attn.q_a_proj": {
882
+ "bits": 8
883
+ },
884
+ "model.layers.4.self_attn.q_b_proj": {
885
+ "bits": 8
886
+ },
887
+ "model.layers.40.mlp.shared_experts.down_proj": {
888
+ "bits": 8
889
+ },
890
+ "model.layers.40.mlp.shared_experts.gate_proj": {
891
+ "bits": 8
892
+ },
893
+ "model.layers.40.mlp.shared_experts.up_proj": {
894
+ "bits": 8
895
+ },
896
+ "model.layers.40.self_attn.kv_a_proj_with_mqa": {
897
+ "bits": 8
898
+ },
899
+ "model.layers.40.self_attn.kv_b_proj": {
900
+ "bits": 8
901
+ },
902
+ "model.layers.40.self_attn.o_proj": {
903
+ "bits": 8
904
+ },
905
+ "model.layers.40.self_attn.q_a_proj": {
906
+ "bits": 8
907
+ },
908
+ "model.layers.40.self_attn.q_b_proj": {
909
+ "bits": 8
910
+ },
911
+ "model.layers.41.mlp.shared_experts.down_proj": {
912
+ "bits": 8
913
+ },
914
+ "model.layers.41.mlp.shared_experts.gate_proj": {
915
+ "bits": 8
916
+ },
917
+ "model.layers.41.mlp.shared_experts.up_proj": {
918
+ "bits": 8
919
+ },
920
+ "model.layers.41.self_attn.kv_a_proj_with_mqa": {
921
+ "bits": 8
922
+ },
923
+ "model.layers.41.self_attn.kv_b_proj": {
924
+ "bits": 8
925
+ },
926
+ "model.layers.41.self_attn.o_proj": {
927
+ "bits": 8
928
+ },
929
+ "model.layers.41.self_attn.q_a_proj": {
930
+ "bits": 8
931
+ },
932
+ "model.layers.41.self_attn.q_b_proj": {
933
+ "bits": 8
934
+ },
935
+ "model.layers.42.mlp.shared_experts.down_proj": {
936
+ "bits": 8
937
+ },
938
+ "model.layers.42.mlp.shared_experts.gate_proj": {
939
+ "bits": 8
940
+ },
941
+ "model.layers.42.mlp.shared_experts.up_proj": {
942
+ "bits": 8
943
+ },
944
+ "model.layers.42.self_attn.kv_a_proj_with_mqa": {
945
+ "bits": 8
946
+ },
947
+ "model.layers.42.self_attn.kv_b_proj": {
948
+ "bits": 8
949
+ },
950
+ "model.layers.42.self_attn.o_proj": {
951
+ "bits": 8
952
+ },
953
+ "model.layers.42.self_attn.q_a_proj": {
954
+ "bits": 8
955
+ },
956
+ "model.layers.42.self_attn.q_b_proj": {
957
+ "bits": 8
958
+ },
959
+ "model.layers.43.mlp.shared_experts.down_proj": {
960
+ "bits": 8
961
+ },
962
+ "model.layers.43.mlp.shared_experts.gate_proj": {
963
+ "bits": 8
964
+ },
965
+ "model.layers.43.mlp.shared_experts.up_proj": {
966
+ "bits": 8
967
+ },
968
+ "model.layers.43.self_attn.kv_a_proj_with_mqa": {
969
+ "bits": 8
970
+ },
971
+ "model.layers.43.self_attn.kv_b_proj": {
972
+ "bits": 8
973
+ },
974
+ "model.layers.43.self_attn.o_proj": {
975
+ "bits": 8
976
+ },
977
+ "model.layers.43.self_attn.q_a_proj": {
978
+ "bits": 8
979
+ },
980
+ "model.layers.43.self_attn.q_b_proj": {
981
+ "bits": 8
982
+ },
983
+ "model.layers.44.mlp.shared_experts.down_proj": {
984
+ "bits": 8
985
+ },
986
+ "model.layers.44.mlp.shared_experts.gate_proj": {
987
+ "bits": 8
988
+ },
989
+ "model.layers.44.mlp.shared_experts.up_proj": {
990
+ "bits": 8
991
+ },
992
+ "model.layers.44.self_attn.kv_a_proj_with_mqa": {
993
+ "bits": 8
994
+ },
995
+ "model.layers.44.self_attn.kv_b_proj": {
996
+ "bits": 8
997
+ },
998
+ "model.layers.44.self_attn.o_proj": {
999
+ "bits": 8
1000
+ },
1001
+ "model.layers.44.self_attn.q_a_proj": {
1002
+ "bits": 8
1003
+ },
1004
+ "model.layers.44.self_attn.q_b_proj": {
1005
+ "bits": 8
1006
+ },
1007
+ "model.layers.45.mlp.shared_experts.down_proj": {
1008
+ "bits": 8
1009
+ },
1010
+ "model.layers.45.mlp.shared_experts.gate_proj": {
1011
+ "bits": 8
1012
+ },
1013
+ "model.layers.45.mlp.shared_experts.up_proj": {
1014
+ "bits": 8
1015
+ },
1016
+ "model.layers.45.self_attn.kv_a_proj_with_mqa": {
1017
+ "bits": 8
1018
+ },
1019
+ "model.layers.45.self_attn.kv_b_proj": {
1020
+ "bits": 8
1021
+ },
1022
+ "model.layers.45.self_attn.o_proj": {
1023
+ "bits": 8
1024
+ },
1025
+ "model.layers.45.self_attn.q_a_proj": {
1026
+ "bits": 8
1027
+ },
1028
+ "model.layers.45.self_attn.q_b_proj": {
1029
+ "bits": 8
1030
+ },
1031
+ "model.layers.46.mlp.shared_experts.down_proj": {
1032
+ "bits": 8
1033
+ },
1034
+ "model.layers.46.mlp.shared_experts.gate_proj": {
1035
+ "bits": 8
1036
+ },
1037
+ "model.layers.46.mlp.shared_experts.up_proj": {
1038
+ "bits": 8
1039
+ },
1040
+ "model.layers.46.self_attn.kv_a_proj_with_mqa": {
1041
+ "bits": 8
1042
+ },
1043
+ "model.layers.46.self_attn.kv_b_proj": {
1044
+ "bits": 8
1045
+ },
1046
+ "model.layers.46.self_attn.o_proj": {
1047
+ "bits": 8
1048
+ },
1049
+ "model.layers.46.self_attn.q_a_proj": {
1050
+ "bits": 8
1051
+ },
1052
+ "model.layers.46.self_attn.q_b_proj": {
1053
+ "bits": 8
1054
+ },
1055
+ "model.layers.47.mlp.shared_experts.down_proj": {
1056
+ "bits": 8
1057
+ },
1058
+ "model.layers.47.mlp.shared_experts.gate_proj": {
1059
+ "bits": 8
1060
+ },
1061
+ "model.layers.47.mlp.shared_experts.up_proj": {
1062
+ "bits": 8
1063
+ },
1064
+ "model.layers.47.self_attn.kv_a_proj_with_mqa": {
1065
+ "bits": 8
1066
+ },
1067
+ "model.layers.47.self_attn.kv_b_proj": {
1068
+ "bits": 8
1069
+ },
1070
+ "model.layers.47.self_attn.o_proj": {
1071
+ "bits": 8
1072
+ },
1073
+ "model.layers.47.self_attn.q_a_proj": {
1074
+ "bits": 8
1075
+ },
1076
+ "model.layers.47.self_attn.q_b_proj": {
1077
+ "bits": 8
1078
+ },
1079
+ "model.layers.48.mlp.shared_experts.down_proj": {
1080
+ "bits": 8
1081
+ },
1082
+ "model.layers.48.mlp.shared_experts.gate_proj": {
1083
+ "bits": 8
1084
+ },
1085
+ "model.layers.48.mlp.shared_experts.up_proj": {
1086
+ "bits": 8
1087
+ },
1088
+ "model.layers.48.self_attn.kv_a_proj_with_mqa": {
1089
+ "bits": 8
1090
+ },
1091
+ "model.layers.48.self_attn.kv_b_proj": {
1092
+ "bits": 8
1093
+ },
1094
+ "model.layers.48.self_attn.o_proj": {
1095
+ "bits": 8
1096
+ },
1097
+ "model.layers.48.self_attn.q_a_proj": {
1098
+ "bits": 8
1099
+ },
1100
+ "model.layers.48.self_attn.q_b_proj": {
1101
+ "bits": 8
1102
+ },
1103
+ "model.layers.49.mlp.shared_experts.down_proj": {
1104
+ "bits": 8
1105
+ },
1106
+ "model.layers.49.mlp.shared_experts.gate_proj": {
1107
+ "bits": 8
1108
+ },
1109
+ "model.layers.49.mlp.shared_experts.up_proj": {
1110
+ "bits": 8
1111
+ },
1112
+ "model.layers.49.self_attn.kv_a_proj_with_mqa": {
1113
+ "bits": 8
1114
+ },
1115
+ "model.layers.49.self_attn.kv_b_proj": {
1116
+ "bits": 8
1117
+ },
1118
+ "model.layers.49.self_attn.o_proj": {
1119
+ "bits": 8
1120
+ },
1121
+ "model.layers.49.self_attn.q_a_proj": {
1122
+ "bits": 8
1123
+ },
1124
+ "model.layers.49.self_attn.q_b_proj": {
1125
+ "bits": 8
1126
+ },
1127
+ "model.layers.5.mlp.shared_experts.down_proj": {
1128
+ "bits": 8
1129
+ },
1130
+ "model.layers.5.mlp.shared_experts.gate_proj": {
1131
+ "bits": 8
1132
+ },
1133
+ "model.layers.5.mlp.shared_experts.up_proj": {
1134
+ "bits": 8
1135
+ },
1136
+ "model.layers.5.self_attn.kv_a_proj_with_mqa": {
1137
+ "bits": 8
1138
+ },
1139
+ "model.layers.5.self_attn.kv_b_proj": {
1140
+ "bits": 8
1141
+ },
1142
+ "model.layers.5.self_attn.o_proj": {
1143
+ "bits": 8
1144
+ },
1145
+ "model.layers.5.self_attn.q_a_proj": {
1146
+ "bits": 8
1147
+ },
1148
+ "model.layers.5.self_attn.q_b_proj": {
1149
+ "bits": 8
1150
+ },
1151
+ "model.layers.50.mlp.shared_experts.down_proj": {
1152
+ "bits": 8
1153
+ },
1154
+ "model.layers.50.mlp.shared_experts.gate_proj": {
1155
+ "bits": 8
1156
+ },
1157
+ "model.layers.50.mlp.shared_experts.up_proj": {
1158
+ "bits": 8
1159
+ },
1160
+ "model.layers.50.self_attn.kv_a_proj_with_mqa": {
1161
+ "bits": 8
1162
+ },
1163
+ "model.layers.50.self_attn.kv_b_proj": {
1164
+ "bits": 8
1165
+ },
1166
+ "model.layers.50.self_attn.o_proj": {
1167
+ "bits": 8
1168
+ },
1169
+ "model.layers.50.self_attn.q_a_proj": {
1170
+ "bits": 8
1171
+ },
1172
+ "model.layers.50.self_attn.q_b_proj": {
1173
+ "bits": 8
1174
+ },
1175
+ "model.layers.51.mlp.shared_experts.down_proj": {
1176
+ "bits": 8
1177
+ },
1178
+ "model.layers.51.mlp.shared_experts.gate_proj": {
1179
+ "bits": 8
1180
+ },
1181
+ "model.layers.51.mlp.shared_experts.up_proj": {
1182
+ "bits": 8
1183
+ },
1184
+ "model.layers.51.self_attn.kv_a_proj_with_mqa": {
1185
+ "bits": 8
1186
+ },
1187
+ "model.layers.51.self_attn.kv_b_proj": {
1188
+ "bits": 8
1189
+ },
1190
+ "model.layers.51.self_attn.o_proj": {
1191
+ "bits": 8
1192
+ },
1193
+ "model.layers.51.self_attn.q_a_proj": {
1194
+ "bits": 8
1195
+ },
1196
+ "model.layers.51.self_attn.q_b_proj": {
1197
+ "bits": 8
1198
+ },
1199
+ "model.layers.52.mlp.shared_experts.down_proj": {
1200
+ "bits": 8
1201
+ },
1202
+ "model.layers.52.mlp.shared_experts.gate_proj": {
1203
+ "bits": 8
1204
+ },
1205
+ "model.layers.52.mlp.shared_experts.up_proj": {
1206
+ "bits": 8
1207
+ },
1208
+ "model.layers.52.self_attn.kv_a_proj_with_mqa": {
1209
+ "bits": 8
1210
+ },
1211
+ "model.layers.52.self_attn.kv_b_proj": {
1212
+ "bits": 8
1213
+ },
1214
+ "model.layers.52.self_attn.o_proj": {
1215
+ "bits": 8
1216
+ },
1217
+ "model.layers.52.self_attn.q_a_proj": {
1218
+ "bits": 8
1219
+ },
1220
+ "model.layers.52.self_attn.q_b_proj": {
1221
+ "bits": 8
1222
+ },
1223
+ "model.layers.53.mlp.shared_experts.down_proj": {
1224
+ "bits": 8
1225
+ },
1226
+ "model.layers.53.mlp.shared_experts.gate_proj": {
1227
+ "bits": 8
1228
+ },
1229
+ "model.layers.53.mlp.shared_experts.up_proj": {
1230
+ "bits": 8
1231
+ },
1232
+ "model.layers.53.self_attn.kv_a_proj_with_mqa": {
1233
+ "bits": 8
1234
+ },
1235
+ "model.layers.53.self_attn.kv_b_proj": {
1236
+ "bits": 8
1237
+ },
1238
+ "model.layers.53.self_attn.o_proj": {
1239
+ "bits": 8
1240
+ },
1241
+ "model.layers.53.self_attn.q_a_proj": {
1242
+ "bits": 8
1243
+ },
1244
+ "model.layers.53.self_attn.q_b_proj": {
1245
+ "bits": 8
1246
+ },
1247
+ "model.layers.54.mlp.shared_experts.down_proj": {
1248
+ "bits": 8
1249
+ },
1250
+ "model.layers.54.mlp.shared_experts.gate_proj": {
1251
+ "bits": 8
1252
+ },
1253
+ "model.layers.54.mlp.shared_experts.up_proj": {
1254
+ "bits": 8
1255
+ },
1256
+ "model.layers.54.self_attn.kv_a_proj_with_mqa": {
1257
+ "bits": 8
1258
+ },
1259
+ "model.layers.54.self_attn.kv_b_proj": {
1260
+ "bits": 8
1261
+ },
1262
+ "model.layers.54.self_attn.o_proj": {
1263
+ "bits": 8
1264
+ },
1265
+ "model.layers.54.self_attn.q_a_proj": {
1266
+ "bits": 8
1267
+ },
1268
+ "model.layers.54.self_attn.q_b_proj": {
1269
+ "bits": 8
1270
+ },
1271
+ "model.layers.55.mlp.shared_experts.down_proj": {
1272
+ "bits": 8
1273
+ },
1274
+ "model.layers.55.mlp.shared_experts.gate_proj": {
1275
+ "bits": 8
1276
+ },
1277
+ "model.layers.55.mlp.shared_experts.up_proj": {
1278
+ "bits": 8
1279
+ },
1280
+ "model.layers.55.self_attn.kv_a_proj_with_mqa": {
1281
+ "bits": 8
1282
+ },
1283
+ "model.layers.55.self_attn.kv_b_proj": {
1284
+ "bits": 8
1285
+ },
1286
+ "model.layers.55.self_attn.o_proj": {
1287
+ "bits": 8
1288
+ },
1289
+ "model.layers.55.self_attn.q_a_proj": {
1290
+ "bits": 8
1291
+ },
1292
+ "model.layers.55.self_attn.q_b_proj": {
1293
+ "bits": 8
1294
+ },
1295
+ "model.layers.56.mlp.shared_experts.down_proj": {
1296
+ "bits": 8
1297
+ },
1298
+ "model.layers.56.mlp.shared_experts.gate_proj": {
1299
+ "bits": 8
1300
+ },
1301
+ "model.layers.56.mlp.shared_experts.up_proj": {
1302
+ "bits": 8
1303
+ },
1304
+ "model.layers.56.self_attn.kv_a_proj_with_mqa": {
1305
+ "bits": 8
1306
+ },
1307
+ "model.layers.56.self_attn.kv_b_proj": {
1308
+ "bits": 8
1309
+ },
1310
+ "model.layers.56.self_attn.o_proj": {
1311
+ "bits": 8
1312
+ },
1313
+ "model.layers.56.self_attn.q_a_proj": {
1314
+ "bits": 8
1315
+ },
1316
+ "model.layers.56.self_attn.q_b_proj": {
1317
+ "bits": 8
1318
+ },
1319
+ "model.layers.57.mlp.shared_experts.down_proj": {
1320
+ "bits": 8
1321
+ },
1322
+ "model.layers.57.mlp.shared_experts.gate_proj": {
1323
+ "bits": 8
1324
+ },
1325
+ "model.layers.57.mlp.shared_experts.up_proj": {
1326
+ "bits": 8
1327
+ },
1328
+ "model.layers.57.self_attn.kv_a_proj_with_mqa": {
1329
+ "bits": 8
1330
+ },
1331
+ "model.layers.57.self_attn.kv_b_proj": {
1332
+ "bits": 8
1333
+ },
1334
+ "model.layers.57.self_attn.o_proj": {
1335
+ "bits": 8
1336
+ },
1337
+ "model.layers.57.self_attn.q_a_proj": {
1338
+ "bits": 8
1339
+ },
1340
+ "model.layers.57.self_attn.q_b_proj": {
1341
+ "bits": 8
1342
+ },
1343
+ "model.layers.58.mlp.shared_experts.down_proj": {
1344
+ "bits": 8
1345
+ },
1346
+ "model.layers.58.mlp.shared_experts.gate_proj": {
1347
+ "bits": 8
1348
+ },
1349
+ "model.layers.58.mlp.shared_experts.up_proj": {
1350
+ "bits": 8
1351
+ },
1352
+ "model.layers.58.self_attn.kv_a_proj_with_mqa": {
1353
+ "bits": 8
1354
+ },
1355
+ "model.layers.58.self_attn.kv_b_proj": {
1356
+ "bits": 8
1357
+ },
1358
+ "model.layers.58.self_attn.o_proj": {
1359
+ "bits": 8
1360
+ },
1361
+ "model.layers.58.self_attn.q_a_proj": {
1362
+ "bits": 8
1363
+ },
1364
+ "model.layers.58.self_attn.q_b_proj": {
1365
+ "bits": 8
1366
+ },
1367
+ "model.layers.59.mlp.shared_experts.down_proj": {
1368
+ "bits": 8
1369
+ },
1370
+ "model.layers.59.mlp.shared_experts.gate_proj": {
1371
+ "bits": 8
1372
+ },
1373
+ "model.layers.59.mlp.shared_experts.up_proj": {
1374
+ "bits": 8
1375
+ },
1376
+ "model.layers.59.self_attn.kv_a_proj_with_mqa": {
1377
+ "bits": 8
1378
+ },
1379
+ "model.layers.59.self_attn.kv_b_proj": {
1380
+ "bits": 8
1381
+ },
1382
+ "model.layers.59.self_attn.o_proj": {
1383
+ "bits": 8
1384
+ },
1385
+ "model.layers.59.self_attn.q_a_proj": {
1386
+ "bits": 8
1387
+ },
1388
+ "model.layers.59.self_attn.q_b_proj": {
1389
+ "bits": 8
1390
+ },
1391
+ "model.layers.6.mlp.shared_experts.down_proj": {
1392
+ "bits": 8
1393
+ },
1394
+ "model.layers.6.mlp.shared_experts.gate_proj": {
1395
+ "bits": 8
1396
+ },
1397
+ "model.layers.6.mlp.shared_experts.up_proj": {
1398
+ "bits": 8
1399
+ },
1400
+ "model.layers.6.self_attn.kv_a_proj_with_mqa": {
1401
+ "bits": 8
1402
+ },
1403
+ "model.layers.6.self_attn.kv_b_proj": {
1404
+ "bits": 8
1405
+ },
1406
+ "model.layers.6.self_attn.o_proj": {
1407
+ "bits": 8
1408
+ },
1409
+ "model.layers.6.self_attn.q_a_proj": {
1410
+ "bits": 8
1411
+ },
1412
+ "model.layers.6.self_attn.q_b_proj": {
1413
+ "bits": 8
1414
+ },
1415
+ "model.layers.60.mlp.shared_experts.down_proj": {
1416
+ "bits": 8
1417
+ },
1418
+ "model.layers.60.mlp.shared_experts.gate_proj": {
1419
+ "bits": 8
1420
+ },
1421
+ "model.layers.60.mlp.shared_experts.up_proj": {
1422
+ "bits": 8
1423
+ },
1424
+ "model.layers.60.self_attn.kv_a_proj_with_mqa": {
1425
+ "bits": 8
1426
+ },
1427
+ "model.layers.60.self_attn.kv_b_proj": {
1428
+ "bits": 8
1429
+ },
1430
+ "model.layers.60.self_attn.o_proj": {
1431
+ "bits": 8
1432
+ },
1433
+ "model.layers.60.self_attn.q_a_proj": {
1434
+ "bits": 8
1435
+ },
1436
+ "model.layers.60.self_attn.q_b_proj": {
1437
+ "bits": 8
1438
+ },
1439
+ "model.layers.7.mlp.shared_experts.down_proj": {
1440
+ "bits": 8
1441
+ },
1442
+ "model.layers.7.mlp.shared_experts.gate_proj": {
1443
+ "bits": 8
1444
+ },
1445
+ "model.layers.7.mlp.shared_experts.up_proj": {
1446
+ "bits": 8
1447
+ },
1448
+ "model.layers.7.self_attn.kv_a_proj_with_mqa": {
1449
+ "bits": 8
1450
+ },
1451
+ "model.layers.7.self_attn.kv_b_proj": {
1452
+ "bits": 8
1453
+ },
1454
+ "model.layers.7.self_attn.o_proj": {
1455
+ "bits": 8
1456
+ },
1457
+ "model.layers.7.self_attn.q_a_proj": {
1458
+ "bits": 8
1459
+ },
1460
+ "model.layers.7.self_attn.q_b_proj": {
1461
+ "bits": 8
1462
+ },
1463
+ "model.layers.8.mlp.shared_experts.down_proj": {
1464
+ "bits": 8
1465
+ },
1466
+ "model.layers.8.mlp.shared_experts.gate_proj": {
1467
+ "bits": 8
1468
+ },
1469
+ "model.layers.8.mlp.shared_experts.up_proj": {
1470
+ "bits": 8
1471
+ },
1472
+ "model.layers.8.self_attn.kv_a_proj_with_mqa": {
1473
+ "bits": 8
1474
+ },
1475
+ "model.layers.8.self_attn.kv_b_proj": {
1476
+ "bits": 8
1477
+ },
1478
+ "model.layers.8.self_attn.o_proj": {
1479
+ "bits": 8
1480
+ },
1481
+ "model.layers.8.self_attn.q_a_proj": {
1482
+ "bits": 8
1483
+ },
1484
+ "model.layers.8.self_attn.q_b_proj": {
1485
+ "bits": 8
1486
+ },
1487
+ "model.layers.9.mlp.shared_experts.down_proj": {
1488
+ "bits": 8
1489
+ },
1490
+ "model.layers.9.mlp.shared_experts.gate_proj": {
1491
+ "bits": 8
1492
+ },
1493
+ "model.layers.9.mlp.shared_experts.up_proj": {
1494
+ "bits": 8
1495
+ },
1496
+ "model.layers.9.self_attn.kv_a_proj_with_mqa": {
1497
+ "bits": 8
1498
+ },
1499
+ "model.layers.9.self_attn.kv_b_proj": {
1500
+ "bits": 8
1501
+ },
1502
+ "model.layers.9.self_attn.o_proj": {
1503
+ "bits": 8
1504
+ },
1505
+ "model.layers.9.self_attn.q_a_proj": {
1506
+ "bits": 8
1507
+ },
1508
+ "model.layers.9.self_attn.q_b_proj": {
1509
+ "bits": 8
1510
+ }
1511
+ },
1512
+ "group_size": 128,
1513
+ "iters": 0,
1514
+ "packing_format": "auto_round:auto_gptq",
1515
+ "quant_method": "auto-round",
1516
+ "sym": true
1517
+ },
1518
+ "rms_norm_eps": 1e-06,
1519
+ "rope_interleave": true,
1520
+ "rope_scaling": {
1521
+ "beta_fast": 32.0,
1522
+ "beta_slow": 1.0,
1523
+ "factor": 40.0,
1524
+ "mscale": 1.0,
1525
+ "mscale_all_dim": 1.0,
1526
+ "original_max_position_embeddings": 4096,
1527
+ "rope_type": "yarn",
1528
+ "type": "yarn"
1529
+ },
1530
+ "rope_theta": 10000,
1531
+ "routed_scaling_factor": 2.5,
1532
+ "scoring_func": "sigmoid",
1533
+ "tie_word_embeddings": false,
1534
+ "topk_group": 4,
1535
+ "topk_method": "noaux_tc",
1536
+ "transformers_version": "4.56.1",
1537
+ "use_cache": true,
1538
+ "v_head_dim": 128,
1539
+ "vocab_size": 129280
1540
+ }
configuration_deepseek.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.utils import logging
3
+
4
+ logger = logging.get_logger(__name__)
5
+
6
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
7
class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a
    DeepSeek model according to the specified arguments, defining the model architecture. The defaults listed below
    are the literal defaults of `__init__`.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every named argument is stored unchanged as an attribute of the same name; this class performs no validation.
    Descriptions marked "presumably" are inferred from the argument name only and should be confirmed against the
    modeling code, which is not part of this file.

    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`DeepseekV3Model`].
        hidden_size (`int`, *optional*, defaults to 7168):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 18432):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 61):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of nextn predict layers in the DeepSeekV3 Model.
        num_attention_heads (`int`, *optional*, defaults to 128):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 128):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that group. For more details check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed explicitly, the value falls back to
            `num_attention_heads`.
        n_shared_experts (`int`, *optional*, defaults to 1):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to 256):
            Number of routed experts, None means dense model.
        ep_size (`int`, *optional*, defaults to 1):
            Presumably the expert-parallel group size.
        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
            Scaling factor of routed experts.
        kv_lora_rank (`int`, *optional*, defaults to 512):
            Presumably the rank of the low-rank key/value projection.
        q_lora_rank (`int`, *optional*, defaults to 1536):
            Presumably the rank of the low-rank query projection.
        qk_rope_head_dim (`int`, *optional*, defaults to 64):
            Presumably the per-head query/key dimension that receives rotary position embeddings.
        v_head_dim (`int`, *optional*, defaults to 128):
            Presumably the per-head value dimension.
        qk_nope_head_dim (`int`, *optional*, defaults to 128):
            Presumably the per-head query/key dimension that does not receive rotary position embeddings.
        topk_method (`str`, *optional*, defaults to `"noaux_tc"`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to 8):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to 4):
            Number of selected groups for each token (for each token, ensuring the selected experts are only within
            `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to 8):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 3):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to `"sigmoid"`):
            Method of computing expert weights.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings, e.g.
            `{"type": strategy name, "factor": scaling factor}`. The dictionary is stored as given and is not
            validated here. When using this flag, don't update `max_position_embeddings` to the expected new
            maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        kwargs (*optional*):
            Any further keyword arguments, forwarded unchanged to `PretrainedConfig.__init__`. Note that
            `aux_loss_alpha` and `seq_aux` are not named parameters of this class; if supplied they travel
            through here.

    ```python
    >>> from transformers import DeepseekV3Config

    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()

    >>> # Accessing a field of the configuration
    >>> configuration.num_hidden_layers
    61
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method="noaux_tc",
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func="sigmoid",
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads

        # Mixture-of-experts layout and attention projection sizes.
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim

        # Expert routing.
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func

        # For backward compatibility: an explicit None means "same as the
        # number of attention heads".
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Token ids, embedding tying and any extra kwargs are handled by the
        # base class; it is called last, after all fields above are set.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "do_sample": true,
5
+ "eos_token_id": 1,
6
+ "temperature": 0.6,
7
+ "top_p": 0.95,
8
+ "transformers_version": "4.56.1"
9
+ }
model-00001-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:703ef06b868c2d598dab11427aa7a00ecf6a99d49bffa25410cc2c5c90e292ca
3
+ size 4995710664
model-00002-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bc655ced1b36105733ed8b148aa2f4dd37fa6d921b94f9d154e332b3f824979
3
+ size 4999591680
model-00003-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da42565e1aef66ac3265d4789587a2125a92341f083ef789267f580a68bee74f
3
+ size 4995762096
model-00004-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f5b07d9c2cd64fe4ba2f6985b1912ed9afc8faa8314058ce404b41c70a1e10c
3
+ size 4999591192
model-00005-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:142a194a48965b06b4ba5b2939e73ac510522950458d8d8a25e8a6c4e4034690
3
+ size 4999591176
model-00006-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:013aac65ba5a34aa916d0f35243062455a2cd97334a7a1a8bf00f570f1813486
3
+ size 4999591384
model-00007-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ee5edbf308abb53e16ef1c667c9fe30a149078292cd8dc1066e5071548ea67d
3
+ size 4999591608
model-00008-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:139cb00ce3d086a36a1aed2fc091a0de023072e9d2313ef780502c4511345cad
3
+ size 4995762400
model-00009-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c81a75f64d7b4102eb1b76d3971c7685580bcf9aaee1ca7163a394f914f61c4
3
+ size 4999591192
model-00010-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b32f2783bfc0cad2f37977c6ac41bac957d1e956721e002fce98226d68f26d9
3
+ size 4999592560
model-00011-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e91365a964adc6db75d592580510da52e75f61fcf8709631a09f09314cec7cac
3
+ size 4999593080
model-00012-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ac245373666959a07089b8a043332049588019e09440ad6f7e86f4c011892b5
3
+ size 4999593480
model-00013-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7845cacea27c54f8e9f63ea02c57cb08a9b8dd04062d24b95ae7f82d88d81955
3
+ size 4999593536
model-00014-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b9991c5ffb98ed8de71fe4c2fef86912311a332bb1dc5f589d30496387f8c64
3
+ size 4995764144
model-00015-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e078c0f9bca278d9baba37e14a0c3b33f336e8a9e1bc10602de0b97d0f09dea
3
+ size 4999593096
model-00016-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29c5c3733e761a0efa6632a6e98f9d9431c1b590b617d5acb7a92884783ecddc
3
+ size 4999593080
model-00017-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3853be036896f2c72c83099ee9189d5a6f9401efeb99797c56b9256d3619341f
3
+ size 4999593248
model-00018-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c1b3e03d649d9783c9661588750ba72b98485bfd3430935da3c8608409a4b4e
3
+ size 4999593512
model-00019-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe76912734a193a5150f16487bf86142302453eed321cbb9a63e588f87a4ec2b
3
+ size 4995764408
model-00020-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:712af5e0fd470826e8c2f9aae3d6150d23dc6355389a6123a7400ac8db2cab10
3
+ size 4999593096
model-00021-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a89d1673ac5a2b408e0fc387abd17b428a3154d27f04adb3e071d019b8870b97
3
+ size 4999593080
model-00022-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1098bd43cc06f981f31e6b06e72aee9a25612669e6df03ce1ac426d878b55b28
3
+ size 4999593080
model-00023-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c691792a6ce68b151874b91d93130a4b5320fa1e8cbf873bd397c3f7cb88ed9
3
+ size 4999593440
model-00024-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a09419e6a04c5d389fcb15cde7b2710e45a0b67b394b9311950c53ba8abd4447
3
+ size 4999593512
model-00025-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0619c06e8fdd0e7dd8440fe34cfa2e98fa429b9a41b7d068b25bed2fb3d8b57c
3
+ size 4995764216
model-00026-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa9c55e657b4a8464fa8df39711b614ecc3c0984b607172161156fd2062384e1
3
+ size 4999593096
model-00027-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:367ce952d1a57435332148a8eb41ed1671d152a6705366337f83ba5e3e799e73
3
+ size 4999593080
model-00028-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d31fc80cd3f0aa58b0b491b3b43df641cdda8fabbeb99cf2f8e59ceef39b012
3
+ size 4999593208
model-00029-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff1224ef6ff84448c60fa4b9fe301250c15388b5c6cbfcb58d329ad332886b1a
3
+ size 4999593512
model-00030-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02a68fd7e0c834184057e08985a6ec3d74a238e4621697a6cedce7ade9f88302
3
+ size 4999205472
model-00031-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a47d4636cd955c53e7b97314f74145f34b925afc405af452ba5fb4c950455d74
3
+ size 4996152080
model-00032-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8913646eb8d9913a6c5ad3c525409e7a488831e1a5219950ecd3765fbb6a7f1
3
+ size 4999593080
model-00033-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684c02ae0841f49e40db1f3ae60c426cd49922c49524cdaa4d3e25556c9a8760
3
+ size 4999593080
model-00034-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc18c532945bc84ba94e0a0c44d8394ce6a7fb6540511d83ad5842c6e9819bcb
3
+ size 4999593400
model-00035-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c389187026d9ed6bcfc92a0fbf7cd02f355d73032e403718c62261022fb6c0da
3
+ size 4999593512
model-00036-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bf48b79855218765f8cf6e3bbf4be0e357ea076bba7b0f3e7cc7e85434b70f1
3
+ size 4995764256
model-00037-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:400161123244e49d1254d4ce168cf6fa2430200b34343f9f31b3485e097fd032
3
+ size 4999593096
model-00038-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81366286a3b9a7f61bd1c80fc176fe3499113c2c42955c94e70d725b9435d854
3
+ size 4999593080
model-00039-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a21429275f2cbfe668ba7e17241da12a85fc5a83edfa9da8a84ec7ff81e1abe
3
+ size 4999593160
model-00040-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8240647013d396e134efd31c0e15e8fd6aa9a739af9f55b06d9bce95fdfb353
3
+ size 4999593512
model-00041-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4d57fcef5604a3e1657cf703eec11c7724424cf207b64027fc21e053ec61119
3
+ size 4993807032
model-00042-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4eb63475587d22f8f0d5ec3faaad624f6428c0eba5f03dbb21bb916718d26f49
3
+ size 4993923416
model-00043-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f07929aa534df63c44db768d5ff3126e8a260d5e84a87bebac06cce62ebc26fe
3
+ size 4999593096
model-00044-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30efdee55ca95b5325edc8b3b3ddd5f122ce25ec83126cc7e811324f426ef482
3
+ size 4999593080
model-00045-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e5a0619efb94ef12fed6a2439aa8f76779be45a3eab1e7378fdd37c8af0e478
3
+ size 4999593360