yujiepan commited on
Commit
6b02277
·
verified ·
1 Parent(s): 35499df

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.meta.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "torch": "2.11.0",
3
+ "transformers": "5.5.0"
4
+ }
README.md ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ base_model:
4
+ - zai-org/GLM-5.1
5
+ ---
6
+
7
+ This tiny model is intended for debugging. It is randomly initialized using the configuration adapted from [zai-org/GLM-5.1](https://huggingface.co/zai-org/GLM-5.1).
8
+
9
+ | File path | Size |
10
+ |------|------|
11
+ | model.safetensors | 17.6MB |
12
+
13
+
14
+ ### Example usage:
15
+
16
+ - vLLM
17
+
18
+ ```bash
19
+ # Multi-token prediction is supported
20
+ model_id=tiny-random/glm-5.1
21
+ vllm serve $model_id \
22
+ --tensor-parallel-size 2 \
23
+ --speculative-config.method mtp \
24
+ --speculative-config.num_speculative_tokens 1 \
25
+ --tool-call-parser glm47 \
26
+ --reasoning-parser glm45 \
27
+ --enable-auto-tool-choice
28
+ ```
29
+
30
+ - SGLang
31
+
32
+ ```bash
33
+ # Multi-token prediction is supported
34
+ model_id=tiny-random/glm-5.1
35
+ python3 -m sglang.launch_server --model-path $model_id --tp-size 2 \
36
+ --tool-call-parser glm47 \
37
+ --reasoning-parser glm45 \
38
+ --speculative-algorithm EAGLE \
39
+ --speculative-num-steps 3 \
40
+ --speculative-eagle-topk 1 \
41
+ --speculative-num-draft-tokens 4
42
+ ```
43
+
44
+ - Transformers
45
+
46
+ ```python
47
+ import torch
48
+ from transformers import AutoModelForCausalLM, AutoTokenizer
49
+
50
+ model_id = "tiny-random/glm-5.1"
51
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
52
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
53
+ input_ids = torch.randint(1000, 2000, size=(1, 2333), dtype=torch.long).to(device) # trigger DSA
54
+ model = AutoModelForCausalLM.from_pretrained(
55
+ model_id,
56
+ dtype=torch.bfloat16,
57
+ device_map=device,
58
+ )
59
+ generated_ids = model.generate(input_ids, max_new_tokens=8)
60
+ output_text = tokenizer.decode(generated_ids[0][input_ids.shape[1]:])
61
+ print(output_text)
62
+ ```
63
+
64
+ ### Code used to create this repo:
65
+
66
+ <details>
67
+ <summary>Click to expand</summary>
68
+
69
+ ```python
70
+ import json
71
+ from copy import deepcopy
72
+ from pathlib import Path
73
+
74
+ import accelerate
75
+ import torch
76
+ import torch.nn as nn
77
+ from huggingface_hub import file_exists, hf_hub_download
78
+ from transformers import (
79
+ AutoConfig,
80
+ AutoModelForCausalLM,
81
+ AutoProcessor,
82
+ GenerationConfig,
83
+ set_seed,
84
+ )
85
+
86
+ source_model_id = "zai-org/GLM-5.1"
87
+ save_folder = "/tmp/tiny-random/glm-51"
88
+
89
+ processor = AutoProcessor.from_pretrained(
90
+ source_model_id, trust_remote_code=True)
91
+ processor.save_pretrained(save_folder)
92
+
93
+ with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
94
+ config_json: dict = json.load(f)
95
+
96
+ config_json.update({
97
+ "first_k_dense_replace": 1,
98
+ "mlp_layer_types": ['dense'] + ['sparse'],
99
+ "hidden_size": 8,
100
+ "index_n_heads": 4,
101
+ "intermediate_size": 32,
102
+ "moe_intermediate_size": 32,
103
+ "num_hidden_layers": 2,
104
+ "num_attention_heads": 8,
105
+ 'num_key_value_heads': 8,
106
+ 'q_lora_rank': 32,
107
+ 'tie_word_embeddings': False,
108
+ })
109
+ with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
110
+ json.dump(config_json, f, indent=2)
111
+
112
+ config = AutoConfig.from_pretrained(
113
+ save_folder,
114
+ trust_remote_code=True,
115
+ )
116
+ print(config)
117
+ torch.set_default_dtype(torch.bfloat16)
118
+ model = AutoModelForCausalLM.from_config(config, dtype=torch.bfloat16)
119
+ torch.set_default_dtype(torch.float32)
120
+
121
+ if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
122
+ model.generation_config = GenerationConfig.from_pretrained(
123
+ source_model_id, trust_remote_code=True,
124
+ )
125
+ model.generation_config.do_sample = True
126
+ print(model.generation_config)
127
+
128
+ model = model.cpu()
129
+ set_seed(42)
130
+ n_params = sum(p.numel() for p in model.parameters())
131
+ with torch.no_grad():
132
+ for name, p in sorted(model.named_parameters()):
133
+ torch.nn.init.normal_(p, 0, 0.2)
134
+ print(name, p.shape, p.numel() / n_params * 100, '%')
135
+ # MTP
136
+ set_seed(42)
137
+ model.model.layers.append(nn.ModuleDict(dict(
138
+ shared_head=nn.ModuleDict(dict(
139
+ norm=nn.RMSNorm(config.hidden_size),
140
+ # head=deepcopy(model.model.embed_tokens),
141
+ )),
142
+ # embed_tokens=deepcopy(model.model.embed_tokens),
143
+ eh_proj=nn.Linear(config.hidden_size * 2,
144
+ config.hidden_size, bias=False),
145
+ enorm=nn.RMSNorm(config.hidden_size),
146
+ hnorm=nn.RMSNorm(config.hidden_size),
147
+ input_layernorm=nn.RMSNorm(config.hidden_size),
148
+ post_attention_layernorm=nn.RMSNorm(config.hidden_size),
149
+ self_attn=deepcopy(model.model.layers[1].self_attn),
150
+ mlp=deepcopy(model.model.layers[1].mlp),
151
+ )))
152
+ for i in range(1, len(model.model.layers)):
153
+ model.model.layers[i].mlp.gate.e_score_correction_bias = torch.rand_like(
154
+ model.model.layers[i].mlp.gate.e_score_correction_bias).float()
155
+ model.save_pretrained(save_folder)
156
+ print(model)
157
+ ```
158
+
159
+ </details>
160
+
161
+ ### Printing the model:
162
+
163
+ <details><summary>Click to expand</summary>
164
+
165
+ ```text
166
+ GlmMoeDsaForCausalLM(
167
+ (model): GlmMoeDsaModel(
168
+ (embed_tokens): Embedding(154880, 8, padding_idx=154820)
169
+ (layers): ModuleList(
170
+ (0): GlmMoeDsaDecoderLayer(
171
+ (self_attn): GlmMoeDsaAttention(
172
+ (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
173
+ (q_a_layernorm): GlmMoeDsaRMSNorm((32,), eps=1e-06)
174
+ (q_b_proj): Linear(in_features=32, out_features=2048, bias=False)
175
+ (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
176
+ (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
177
+ (kv_b_proj): Linear(in_features=512, out_features=3584, bias=False)
178
+ (o_proj): Linear(in_features=2048, out_features=8, bias=False)
179
+ (indexer): GlmMoeDsaIndexer(
180
+ (wq_b): Linear(in_features=32, out_features=512, bias=False)
181
+ (wk): Linear(in_features=8, out_features=128, bias=False)
182
+ (k_norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
183
+ (weights_proj): Linear(in_features=8, out_features=4, bias=False)
184
+ )
185
+ )
186
+ (mlp): GlmMoeDsaMLP(
187
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
188
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
189
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
190
+ (act_fn): SiLUActivation()
191
+ )
192
+ (input_layernorm): GlmMoeDsaRMSNorm((8,), eps=1e-05)
193
+ (post_attention_layernorm): GlmMoeDsaRMSNorm((8,), eps=1e-05)
194
+ )
195
+ (1): GlmMoeDsaDecoderLayer(
196
+ (self_attn): GlmMoeDsaAttention(
197
+ (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
198
+ (q_a_layernorm): GlmMoeDsaRMSNorm((32,), eps=1e-06)
199
+ (q_b_proj): Linear(in_features=32, out_features=2048, bias=False)
200
+ (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
201
+ (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
202
+ (kv_b_proj): Linear(in_features=512, out_features=3584, bias=False)
203
+ (o_proj): Linear(in_features=2048, out_features=8, bias=False)
204
+ (indexer): GlmMoeDsaIndexer(
205
+ (wq_b): Linear(in_features=32, out_features=512, bias=False)
206
+ (wk): Linear(in_features=8, out_features=128, bias=False)
207
+ (k_norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
208
+ (weights_proj): Linear(in_features=8, out_features=4, bias=False)
209
+ )
210
+ )
211
+ (mlp): GlmMoeDsaMoE(
212
+ (experts): GlmMoeDsaNaiveMoe(
213
+ (act_fn): SiLUActivation()
214
+ )
215
+ (gate): GlmMoeDsaTopkRouter()
216
+ (shared_experts): GlmMoeDsaMLP(
217
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
218
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
219
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
220
+ (act_fn): SiLUActivation()
221
+ )
222
+ )
223
+ (input_layernorm): GlmMoeDsaRMSNorm((8,), eps=1e-05)
224
+ (post_attention_layernorm): GlmMoeDsaRMSNorm((8,), eps=1e-05)
225
+ )
226
+ (2): ModuleDict(
227
+ (shared_head): ModuleDict(
228
+ (norm): RMSNorm((8,), eps=None, elementwise_affine=True)
229
+ )
230
+ (eh_proj): Linear(in_features=16, out_features=8, bias=False)
231
+ (enorm): RMSNorm((8,), eps=None, elementwise_affine=True)
232
+ (hnorm): RMSNorm((8,), eps=None, elementwise_affine=True)
233
+ (input_layernorm): RMSNorm((8,), eps=None, elementwise_affine=True)
234
+ (post_attention_layernorm): RMSNorm((8,), eps=None, elementwise_affine=True)
235
+ (self_attn): GlmMoeDsaAttention(
236
+ (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
237
+ (q_a_layernorm): GlmMoeDsaRMSNorm((32,), eps=1e-06)
238
+ (q_b_proj): Linear(in_features=32, out_features=2048, bias=False)
239
+ (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
240
+ (kv_a_layernorm): GlmMoeDsaRMSNorm((512,), eps=1e-06)
241
+ (kv_b_proj): Linear(in_features=512, out_features=3584, bias=False)
242
+ (o_proj): Linear(in_features=2048, out_features=8, bias=False)
243
+ (indexer): GlmMoeDsaIndexer(
244
+ (wq_b): Linear(in_features=32, out_features=512, bias=False)
245
+ (wk): Linear(in_features=8, out_features=128, bias=False)
246
+ (k_norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
247
+ (weights_proj): Linear(in_features=8, out_features=4, bias=False)
248
+ )
249
+ )
250
+ (mlp): GlmMoeDsaMoE(
251
+ (experts): GlmMoeDsaNaiveMoe(
252
+ (act_fn): SiLUActivation()
253
+ )
254
+ (gate): GlmMoeDsaTopkRouter()
255
+ (shared_experts): GlmMoeDsaMLP(
256
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
257
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
258
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
259
+ (act_fn): SiLUActivation()
260
+ )
261
+ )
262
+ )
263
+ )
264
+ (norm): GlmMoeDsaRMSNorm((8,), eps=1e-05)
265
+ (rotary_emb): GlmMoeDsaRotaryEmbedding()
266
+ )
267
+ (lm_head): Linear(in_features=8, out_features=154880, bias=False)
268
+ )
269
+ ```
270
+
271
+ </details>
272
+
273
+ ### Test environment:
274
+
275
+ - torch: 2.11.0
276
+ - transformers: 5.5.0
chat_template.jinja ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [gMASK]<sop>
2
+ {%- if tools -%}
3
+ {%- macro tool_to_json(tool) -%}
4
+ {%- set ns_tool = namespace(first=true) -%}
5
+ {{ '{' -}}
6
+ {%- for k, v in tool.items() -%}
7
+ {%- if k != 'defer_loading' and k != 'strict' -%}
8
+ {%- if not ns_tool.first -%}{{- ', ' -}}{%- endif -%}
9
+ {%- set ns_tool.first = false -%}
10
+ "{{ k }}": {{ v | tojson(ensure_ascii=False) }}
11
+ {%- endif -%}
12
+ {%- endfor -%}
13
+ {{- '}' -}}
14
+ {%- endmacro -%}
15
+ <|system|>
16
+ # Tools
17
+
18
+ You may call one or more functions to assist with the user query.
19
+
20
+ You are provided with function signatures within <tools></tools> XML tags:
21
+ <tools>
22
+ {% for tool in tools %}
23
+ {%- if 'function' in tool -%}
24
+ {%- set tool = tool['function'] -%}
25
+ {%- endif -%}
26
+ {% if tool.defer_loading is not defined or not tool.defer_loading %}
27
+ {{ tool_to_json(tool) }}
28
+ {% endif %}
29
+ {% endfor %}
30
+ </tools>
31
+
32
+ For each function call, output the function name and arguments within the following XML format:
33
+ <tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>{%- endif -%}
34
+ {%- macro visible_text(content) -%}
35
+ {%- if content is string -%}
36
+ {{- content }}
37
+ {%- elif content is iterable and content is not mapping -%}
38
+ {%- for item in content -%}
39
+ {%- if item is mapping and item.type == 'text' -%}
40
+ {{- item.text }}
41
+ {%- elif item is string -%}
42
+ {{- item }}
43
+ {%- endif -%}
44
+ {%- endfor -%}
45
+ {%- else -%}
46
+ {{- content }}
47
+ {%- endif -%}
48
+ {%- endmacro -%}
49
+ {%- set ns = namespace(last_user_index=-1, thinking_indices='') -%}
50
+ {%- for m in messages %}
51
+ {%- if m.role == 'user' %}
52
+ {%- set ns.last_user_index = loop.index0 -%}
53
+ {%- elif m.role == 'assistant' %}
54
+ {%- if m.reasoning_content is string %}
55
+ {%- set ns.thinking_indices = ns.thinking_indices ~ ',' ~ ns.last_user_index ~ ',' -%}
56
+ {%- endif %}
57
+ {%- endif %}
58
+ {%- endfor %}
59
+ {%- set ns.has_thinking = false -%}
60
+ {%- for m in messages -%}
61
+ {%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }}{% set ns.has_thinking = (',' ~ loop.index0 ~ ',') in ns.thinking_indices -%}
62
+ {%- elif m.role == 'assistant' -%}
63
+ <|assistant|>
64
+ {%- set content = visible_text(m.content) %}
65
+ {%- if m.reasoning_content is string %}
66
+ {%- set reasoning_content = m.reasoning_content %}
67
+ {%- elif '</think>' in content %}
68
+ {%- set reasoning_content = content.split('</think>')[0].split('<think>')[-1] %}
69
+ {%- set content = content.split('</think>')[-1] %}
70
+ {%- elif loop.index0 > ns.last_user_index and not (enable_thinking is defined and not enable_thinking) %}
71
+ {%- set reasoning_content = '' %}
72
+ {%- elif loop.index0 < ns.last_user_index and ns.has_thinking %}
73
+ {%- set reasoning_content = '' %}
74
+ {%- endif %}
75
+ {%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content is defined -%}
76
+ {{ '<think>' + reasoning_content + '</think>'}}
77
+ {%- else -%}
78
+ {{ '</think>' }}
79
+ {%- endif -%}
80
+ {%- if content.strip() -%}
81
+ {{ content.strip() }}
82
+ {%- endif -%}
83
+ {% if m.tool_calls %}
84
+ {% for tc in m.tool_calls %}
85
+ {%- if tc.function %}
86
+ {%- set tc = tc.function %}
87
+ {%- endif %}
88
+ {{- '<tool_call>' + tc.name -}}
89
+ {% set _args = tc.arguments %}{% for k, v in _args.items() %}<arg_key>{{ k }}</arg_key><arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>{% endfor %}</tool_call>{% endfor %}
90
+ {% endif %}
91
+ {%- elif m.role == 'tool' -%}
92
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
93
+ {{- '<|observation|>' -}}
94
+ {%- endif %}
95
+ {%- if m.content is string -%}
96
+ {{- '<tool_response>' + m.content + '</tool_response>' -}}
97
+ {%- elif m.content is iterable and m.content is not mapping and m.content and m.content.0.type == "tool_reference" -%}
98
+ {{- '<tool_response><tools>\n' -}}
99
+ {% for tr in m.content %}
100
+ {%- for tool in tools -%}
101
+ {%- if 'function' in tool -%}
102
+ {%- set tool = tool['function'] -%}
103
+ {%- endif -%}
104
+ {%- if tool.name == tr.name -%}
105
+ {{- tool_to_json(tool) + '\n' -}}
106
+ {%- endif -%}
107
+ {%- endfor -%}
108
+ {%- endfor -%}
109
+ {{- '</tools></tool_response>' -}}
110
+ {%- else -%}
111
+ {{- '<tool_response>' + visible_text(m.content) + '</tool_response>' -}}
112
+ {% endif -%}
113
+ {%- elif m.role == 'system' -%}
114
+ <|system|>{{ visible_text(m.content) }}
115
+ {%- endif -%}
116
+ {%- endfor -%}
117
+ {%- if add_generation_prompt -%}
118
+ <|assistant|>{{- '</think>' if (enable_thinking is defined and not enable_thinking) else '<think>' -}}
119
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GlmMoeDsaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": [
10
+ 154820,
11
+ 154827,
12
+ 154829
13
+ ],
14
+ "ep_size": 1,
15
+ "first_k_dense_replace": 1,
16
+ "hidden_act": "silu",
17
+ "hidden_size": 8,
18
+ "index_head_dim": 128,
19
+ "index_n_heads": 4,
20
+ "index_topk": 2048,
21
+ "indexer_rope_interleave": true,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 32,
24
+ "kv_lora_rank": 512,
25
+ "max_position_embeddings": 202752,
26
+ "mlp_layer_types": [
27
+ "dense",
28
+ "sparse"
29
+ ],
30
+ "model_type": "glm_moe_dsa",
31
+ "moe_intermediate_size": 32,
32
+ "moe_layer_freq": 1,
33
+ "n_group": 1,
34
+ "n_routed_experts": 256,
35
+ "n_shared_experts": 1,
36
+ "norm_topk_prob": true,
37
+ "num_attention_heads": 8,
38
+ "num_experts_per_tok": 8,
39
+ "num_hidden_layers": 2,
40
+ "num_key_value_heads": 8,
41
+ "num_nextn_predict_layers": 1,
42
+ "pad_token_id": 154820,
43
+ "pretraining_tp": 1,
44
+ "q_lora_rank": 32,
45
+ "qk_head_dim": 256,
46
+ "qk_nope_head_dim": 192,
47
+ "qk_rope_head_dim": 64,
48
+ "rms_norm_eps": 1e-05,
49
+ "rope_interleave": true,
50
+ "rope_parameters": {
51
+ "rope_theta": 1000000,
52
+ "rope_type": "default"
53
+ },
54
+ "routed_scaling_factor": 2.5,
55
+ "scoring_func": "sigmoid",
56
+ "tie_word_embeddings": false,
57
+ "topk_group": 1,
58
+ "topk_method": "noaux_tc",
59
+ "transformers_version": "5.5.0",
60
+ "use_cache": true,
61
+ "v_head_dim": 256,
62
+ "vocab_size": 154880
63
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 154820,
6
+ 154827,
7
+ 154829
8
+ ],
9
+ "pad_token_id": 154820,
10
+ "temperature": 1.0,
11
+ "top_p": 0.95,
12
+ "transformers_version": "5.5.0"
13
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fde54d8d9a380c57a28e4871e487fe90b2b5a53b2c6c6d93d4f0bac83c875394
3
+ size 17581680
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19e773648cb4e65de8660ea6365e10acca112d42a854923df93db4a6f333a82d
3
+ size 20217442
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "clean_up_tokenization_spaces": false,
4
+ "do_lower_case": false,
5
+ "eos_token": "<|endoftext|>",
6
+ "extra_special_tokens": [
7
+ "<|endoftext|>",
8
+ "[MASK]",
9
+ "[gMASK]",
10
+ "[sMASK]",
11
+ "<sop>",
12
+ "<eop>",
13
+ "<|system|>",
14
+ "<|user|>",
15
+ "<|assistant|>",
16
+ "<|observation|>",
17
+ "<|begin_of_image|>",
18
+ "<|end_of_image|>",
19
+ "<|begin_of_video|>",
20
+ "<|end_of_video|>",
21
+ "<|begin_of_audio|>",
22
+ "<|end_of_audio|>",
23
+ "<|begin_of_transcription|>",
24
+ "<|end_of_transcription|>"
25
+ ],
26
+ "is_local": false,
27
+ "model_max_length": 202752,
28
+ "model_specific_special_tokens": {},
29
+ "pad_token": "<|endoftext|>",
30
+ "padding_side": "left",
31
+ "remove_space": false,
32
+ "tokenizer_class": "TokenizersBackend"
33
+ }