Upload DeepseekV4ForCausalLM
Browse files- README.md +9 -0
- chat_template.jinja +198 -0
- config.json +75 -0
- generation_config.json +9 -0
- model.safetensors +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +14 -0
README.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: transformers
|
| 3 |
+
tags:
|
| 4 |
+
- trl
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# Tiny DeepseekV4ForCausalLM
|
| 8 |
+
|
| 9 |
+
This is a minimal model built for unit tests in the [TRL](https://github.com/huggingface/trl) library.
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- macro render_tools_block(eff_tools) -%}
|
| 2 |
+
{%- set tl_ns = namespace(lines=[]) -%}
|
| 3 |
+
{%- for t in eff_tools -%}
|
| 4 |
+
{%- if t.function is defined -%}
|
| 5 |
+
{%- set tl_ns.lines = tl_ns.lines + [t.function | tojson(ensure_ascii=false)] -%}
|
| 6 |
+
{%- else -%}
|
| 7 |
+
{%- set tl_ns.lines = tl_ns.lines + [t | tojson(ensure_ascii=false)] -%}
|
| 8 |
+
{%- endif -%}
|
| 9 |
+
{%- endfor -%}
|
| 10 |
+
{{- "\n\n## Tools\n\nYou have access to a set of tools to help answer the user's question. You can invoke tools by writing a \"<|DSML|tool_calls>\" block like the following:\n\n<|DSML|tool_calls>\n<|DSML|invoke name=\"$TOOL_NAME\">\n<|DSML|parameter name=\"$PARAMETER_NAME\" string=\"true|false\">$PARAMETER_VALUE</|DSML|parameter>\n...\n</|DSML|invoke>\n<|DSML|invoke name=\"$TOOL_NAME2\">\n...\n</|DSML|invoke>\n</|DSML|tool_calls>\n\nString parameters should be specified as is and set `string=\"true\"`. For all other types (numbers, booleans, arrays, objects), pass the value in JSON format and set `string=\"false\"`.\n\nIf thinking_mode is enabled (triggered by <think>), you MUST output your complete reasoning inside <think>...</think> BEFORE any tool calls or final response.\n\nOtherwise, output directly after </think> with tool calls or final response.\n\n### Available Tool Schemas\n\n" -}}
|
| 11 |
+
{{- tl_ns.lines | join("\n") -}}
|
| 12 |
+
{{- "\n\nYou MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.\n" -}}
|
| 13 |
+
{%- endmacro -%}
|
| 14 |
+
|
| 15 |
+
{%- macro render_response_format(rf) -%}
|
| 16 |
+
{{- "\n\n## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n" -}}
|
| 17 |
+
{{- rf | tojson(ensure_ascii=false) -}}
|
| 18 |
+
{%- endmacro -%}
|
| 19 |
+
|
| 20 |
+
{%- if thinking_mode is not defined -%}{%- set thinking_mode = "thinking" -%}{%- endif -%}
|
| 21 |
+
{%- if drop_thinking is not defined -%}{%- set drop_thinking = true -%}{%- endif -%}
|
| 22 |
+
{%- if reasoning_effort is not defined -%}{%- set reasoning_effort = none -%}{%- endif -%}
|
| 23 |
+
{%- if tools is not defined -%}{%- set tools = none -%}{%- endif -%}
|
| 24 |
+
{%- if add_generation_prompt is not defined -%}{%- set add_generation_prompt = false -%}{%- endif -%}
|
| 25 |
+
|
| 26 |
+
{%- set tools_ns = namespace(has_any=false) -%}
|
| 27 |
+
{%- if tools -%}{%- set tools_ns.has_any = true -%}{%- endif -%}
|
| 28 |
+
{%- for m in messages -%}
|
| 29 |
+
{%- if m.tools -%}{%- set tools_ns.has_any = true -%}{%- endif -%}
|
| 30 |
+
{%- endfor -%}
|
| 31 |
+
{#- Reasoning from earlier turns is dropped only when drop_thinking is set AND no tools were supplied, either through the top-level `tools` variable or through a `tools` field on any message (tools_ns.has_any). -#}
{%- set effective_drop = drop_thinking and (not tools_ns.has_any) -%}
|
| 32 |
+
|
| 33 |
+
{%- set mns = namespace(list=[]) -%}
|
| 34 |
+
{%- for msg in messages -%}
|
| 35 |
+
{%- if msg.role == "tool" -%}
|
| 36 |
+
{%- set tblock = {"type": "tool_result", "tool_use_id": msg.get("tool_call_id", ""), "content": msg.content} -%}
|
| 37 |
+
{%- if mns.list|length > 0 and mns.list[-1].role == "user" and "content_blocks" in mns.list[-1] -%}
|
| 38 |
+
{%- set last = mns.list[-1] -%}
|
| 39 |
+
{%- set mns.list = mns.list[:-1] + [dict(last, content_blocks=last.content_blocks + [tblock])] -%}
|
| 40 |
+
{%- else -%}
|
| 41 |
+
{%- set mns.list = mns.list + [{"role": "user", "content_blocks": [tblock]}] -%}
|
| 42 |
+
{%- endif -%}
|
| 43 |
+
{%- elif msg.role == "user" -%}
|
| 44 |
+
{#- Coerce a null/empty `content` to "" so the later join does not render it as the literal text "None"; this mirrors the `or ""` guard used when rendering system/developer and assistant content. -#}
{%- set text_block = {"type": "text", "text": msg.get("content", "") or ""} -%}
|
| 45 |
+
{%- if mns.list|length > 0 and mns.list[-1].role == "user" and "content_blocks" in mns.list[-1] and mns.list[-1].get("task") is none -%}
|
| 46 |
+
{%- set last = mns.list[-1] -%}
|
| 47 |
+
{%- set mns.list = mns.list[:-1] + [dict(last, content_blocks=last.content_blocks + [text_block])] -%}
|
| 48 |
+
{%- else -%}
|
| 49 |
+
{%- set new_msg = {"role": "user", "content": msg.get("content", ""), "content_blocks": [text_block]} -%}
|
| 50 |
+
{%- if msg.get("task") is not none -%}
|
| 51 |
+
{%- set new_msg = dict(new_msg, task=msg.task) -%}
|
| 52 |
+
{%- endif -%}
|
| 53 |
+
{%- if msg.get("wo_eos") is not none -%}
|
| 54 |
+
{%- set new_msg = dict(new_msg, wo_eos=msg.wo_eos) -%}
|
| 55 |
+
{%- endif -%}
|
| 56 |
+
{%- set mns.list = mns.list + [new_msg] -%}
|
| 57 |
+
{%- endif -%}
|
| 58 |
+
{%- else -%}
|
| 59 |
+
{%- set mns.list = mns.list + [msg] -%}
|
| 60 |
+
{%- endif -%}
|
| 61 |
+
{%- endfor -%}
|
| 62 |
+
|
| 63 |
+
{%- set lu = namespace(idx=-1) -%}
|
| 64 |
+
{%- for m in mns.list -%}
|
| 65 |
+
{%- if m.role == "user" or m.role == "developer" -%}
|
| 66 |
+
{%- set lu.idx = loop.index0 -%}
|
| 67 |
+
{%- endif -%}
|
| 68 |
+
{%- endfor -%}
|
| 69 |
+
|
| 70 |
+
{%- set fns = namespace(list=[], lu_idx=-1) -%}
|
| 71 |
+
{%- if thinking_mode == "thinking" and effective_drop -%}
|
| 72 |
+
{%- for m in mns.list -%}
|
| 73 |
+
{#- Skip developer messages positioned before the last user/developer message (lu.idx); every other message is copied into fns.list. -#}
{%- if not (m.role == "developer" and loop.index0 < lu.idx) -%}
|
| 74 |
+
{%- if loop.index0 == lu.idx -%}{%- set fns.lu_idx = fns.list|length -%}{%- endif -%}
|
| 75 |
+
{%- set fns.list = fns.list + [m] -%}
|
| 76 |
+
{%- endif -%}
|
| 77 |
+
{%- endfor -%}
|
| 78 |
+
{%- else -%}
|
| 79 |
+
{%- set fns.list = mns.list -%}
|
| 80 |
+
{%- set fns.lu_idx = lu.idx -%}
|
| 81 |
+
{%- endif -%}
|
| 82 |
+
|
| 83 |
+
{%- set att = namespace(idx=-1, sys=-1) -%}
|
| 84 |
+
{%- if tools -%}
|
| 85 |
+
{%- for m in fns.list -%}
|
| 86 |
+
{%- if m.role == "developer" and att.idx == -1 -%}{%- set att.idx = loop.index0 -%}{%- endif -%}
|
| 87 |
+
{%- if m.role == "system" and att.sys == -1 -%}{%- set att.sys = loop.index0 -%}{%- endif -%}
|
| 88 |
+
{%- endfor -%}
|
| 89 |
+
{#- Top-level `tools` are attached to the first developer message; when there is none, fall back to the first system message. NOTE(review): if neither role is present att.idx stays -1, so no message index matches and the top-level tools are never rendered; confirm this is intended. -#}
{%- if att.idx == -1 -%}{%- set att.idx = att.sys -%}{%- endif -%}
|
| 90 |
+
{%- endif -%}
|
| 91 |
+
|
| 92 |
+
{{- "<|begin▁of▁sentence|>" -}}
|
| 93 |
+
|
| 94 |
+
{%- if thinking_mode == "thinking" and reasoning_effort == "max" -%}
|
| 95 |
+
{{- "Reasoning Effort: Absolute maximum with no shortcuts permitted.\nYou MUST be very thorough in your thinking and comprehensively decompose the problem to resolve the root cause, rigorously stress-testing your logic against all potential paths, edge cases, and adversarial scenarios.\nExplicitly write out your entire deliberation process, documenting every intermediate step, considered alternative, and rejected hypothesis to ensure absolutely no assumption is left unchecked.\n\n" -}}
|
| 96 |
+
{%- endif -%}
|
| 97 |
+
|
| 98 |
+
{%- for msg in fns.list -%}
|
| 99 |
+
{%- set idx = loop.index0 -%}
|
| 100 |
+
{%- set is_last = (idx == fns.list|length - 1) -%}
|
| 101 |
+
{%- set next_role = (fns.list[idx + 1].role) if (not is_last) else none -%}
|
| 102 |
+
{%- set prev_has_task = (idx > 0) and (fns.list[idx - 1].get("task") is not none) -%}
|
| 103 |
+
|
| 104 |
+
{%- set eff_tools = none -%}
|
| 105 |
+
{%- if msg.tools -%}{%- set eff_tools = msg.tools -%}
|
| 106 |
+
{%- elif idx == att.idx -%}{%- set eff_tools = tools -%}{%- endif -%}
|
| 107 |
+
|
| 108 |
+
{%- if msg.role == "system" or msg.role == "developer" -%}
|
| 109 |
+
{%- if msg.role == "developer" -%}{{- "<|User|>" -}}{%- endif -%}
|
| 110 |
+
{{- (msg.get("content", "") or "") -}}
|
| 111 |
+
{%- if eff_tools -%}{{- render_tools_block(eff_tools) -}}{%- endif -%}
|
| 112 |
+
{%- if msg.response_format is defined and msg.response_format -%}{{- render_response_format(msg.response_format) -}}{%- endif -%}
|
| 113 |
+
|
| 114 |
+
{%- elif msg.role == "user" -%}
|
| 115 |
+
{{- "<|User|>" -}}
|
| 116 |
+
{%- set parts_ns = namespace(parts=[]) -%}
|
| 117 |
+
{%- for b in msg.content_blocks -%}
|
| 118 |
+
{%- if b.type == "text" -%}
|
| 119 |
+
{%- set parts_ns.parts = parts_ns.parts + [b.get("text", "")] -%}
|
| 120 |
+
{%- elif b.type == "tool_result" -%}
|
| 121 |
+
{%- set tc_content = b.get("content", "") -%}
|
| 122 |
+
{%- if tc_content is iterable and tc_content is not string and tc_content is not mapping -%}
|
| 123 |
+
{%- set txt_ns = namespace(texts=[]) -%}
|
| 124 |
+
{%- for sub in tc_content -%}
|
| 125 |
+
{%- if sub.type == "text" -%}
|
| 126 |
+
{%- set txt_ns.texts = txt_ns.texts + [sub.get("text", "")] -%}
|
| 127 |
+
{%- else -%}
|
| 128 |
+
{%- set txt_ns.texts = txt_ns.texts + ["[Unsupported " ~ sub.type ~ "]"] -%}
|
| 129 |
+
{%- endif -%}
|
| 130 |
+
{%- endfor -%}
|
| 131 |
+
{%- set tc_content = txt_ns.texts | join("\n\n") -%}
|
| 132 |
+
{%- endif -%}
|
| 133 |
+
{%- set parts_ns.parts = parts_ns.parts + ["<tool_result>" ~ tc_content ~ "</tool_result>"] -%}
|
| 134 |
+
{%- else -%}
|
| 135 |
+
{%- set parts_ns.parts = parts_ns.parts + ["[Unsupported " ~ b.type ~ "]"] -%}
|
| 136 |
+
{%- endif -%}
|
| 137 |
+
{%- endfor -%}
|
| 138 |
+
{{- parts_ns.parts | join("\n\n") -}}
|
| 139 |
+
|
| 140 |
+
{%- elif msg.role == "latest_reminder" -%}
|
| 141 |
+
{#- Render the reminder body; a missing or null `content` becomes "" instead of the literal text "None", consistent with how system and assistant content are rendered. -#}
{{- "<|latest_reminder|>" -}}{{- msg.get("content", "") or "" -}}
|
| 142 |
+
|
| 143 |
+
{%- elif msg.role == "assistant" -%}
|
| 144 |
+
{%- set rc = msg.get("reasoning_content", "") or "" -%}
|
| 145 |
+
{#- Emit reasoning_content plus the closing think tag only when: thinking mode is on, the previous message carried no task, and either reasoning is not being dropped or this assistant message comes after the last user/developer message (fns.lu_idx). -#}
{%- if (thinking_mode == "thinking") and (not prev_has_task) and ((not effective_drop) or idx > fns.lu_idx) -%}
|
| 146 |
+
{{- rc -}}{{- "</think>" -}}
|
| 147 |
+
{%- endif -%}
|
| 148 |
+
{{- msg.get("content", "") or "" -}}
|
| 149 |
+
{%- if msg.tool_calls -%}
|
| 150 |
+
{{- "\n\n<|DSML|tool_calls>\n" -}}
|
| 151 |
+
{%- set tc_ns = namespace(lines=[]) -%}
|
| 152 |
+
{%- for tc in msg.tool_calls -%}
|
| 153 |
+
{%- if tc.function is defined -%}
|
| 154 |
+
{%- set tc_name = tc.function.name -%}{%- set tc_args = tc.function.arguments -%}
|
| 155 |
+
{%- else -%}
|
| 156 |
+
{%- set tc_name = tc.name -%}{%- set tc_args = tc.arguments -%}
|
| 157 |
+
{%- endif -%}
|
| 158 |
+
{%- set p_ns = namespace(lines=[]) -%}
|
| 159 |
+
{%- if tc_args is mapping -%}
|
| 160 |
+
{%- for key, value in tc_args.items() -%}
|
| 161 |
+
{%- if value is string -%}
|
| 162 |
+
{%- set p_ns.lines = p_ns.lines + ['<|DSML|parameter name="' ~ key ~ '" string="true">' ~ value ~ '</|DSML|parameter>'] -%}
|
| 163 |
+
{%- else -%}
|
| 164 |
+
{%- set p_ns.lines = p_ns.lines + ['<|DSML|parameter name="' ~ key ~ '" string="false">' ~ (value | tojson(ensure_ascii=false)) ~ '</|DSML|parameter>'] -%}
|
| 165 |
+
{%- endif -%}
|
| 166 |
+
{%- endfor -%}
|
| 167 |
+
{%- else -%}
|
| 168 |
+
{%- set p_ns.lines = p_ns.lines + ['<|DSML|parameter name="arguments" string="true">' ~ (tc_args | string) ~ '</|DSML|parameter>'] -%}
|
| 169 |
+
{%- endif -%}
|
| 170 |
+
{%- set tc_ns.lines = tc_ns.lines + ['<|DSML|invoke name="' ~ tc_name ~ '">\n' ~ (p_ns.lines | join("\n")) ~ '\n</|DSML|invoke>'] -%}
|
| 171 |
+
{%- endfor -%}
|
| 172 |
+
{{- tc_ns.lines | join("\n") -}}{{- "\n</|DSML|tool_calls>" -}}
|
| 173 |
+
{%- endif -%}
|
| 174 |
+
{%- if not msg.get("wo_eos") -%}{{- "<|end▁of▁sentence|>" -}}{%- endif -%}
|
| 175 |
+
|
| 176 |
+
{%- else -%}
|
| 177 |
+
{{- raise_exception("Unknown role: " ~ msg.role) -}}
|
| 178 |
+
{%- endif -%}
|
| 179 |
+
|
| 180 |
+
{%- set need_transition = is_last or (next_role == "assistant") or (next_role == "latest_reminder") -%}
|
| 181 |
+
{%- set this_task = msg.get("task", none) -%}
|
| 182 |
+
|
| 183 |
+
{%- if need_transition and this_task is not none -%}
|
| 184 |
+
{%- set task_tokens = {"action": "<|action|>", "query": "<|query|>", "authority": "<|authority|>", "domain": "<|domain|>", "title": "<|title|>", "read_url": "<|read_url|>"} -%}
|
| 185 |
+
{%- if this_task not in task_tokens -%}{{- raise_exception("Invalid task: " ~ this_task) -}}{%- endif -%}
|
| 186 |
+
{%- if this_task == "action" -%}
|
| 187 |
+
{{- "<|Assistant|>" -}}{{- "<think>" if thinking_mode == "thinking" else "</think>" -}}
|
| 188 |
+
{%- endif -%}
|
| 189 |
+
{{- task_tokens[this_task] -}}
|
| 190 |
+
{%- elif need_transition and (msg.role == "user" or msg.role == "developer") and not (is_last and not add_generation_prompt) -%}
|
| 191 |
+
{{- "<|Assistant|>" -}}
|
| 192 |
+
{%- if thinking_mode == "thinking" -%}
|
| 193 |
+
{#- Open a reasoning span with the opening think tag, unless reasoning is being dropped and this turn precedes the last user/developer message (idx < fns.lu_idx); in that case emit the closing think tag directly. -#}
{{- "<think>" if (not effective_drop) or idx >= fns.lu_idx else "</think>" -}}
|
| 194 |
+
{%- else -%}
|
| 195 |
+
{{- "</think>" -}}
|
| 196 |
+
{%- endif -%}
|
| 197 |
+
{%- endif -%}
|
| 198 |
+
{%- endfor -%}
|
config.json
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"DeepseekV4ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"compress_ratios": [
|
| 9 |
+
0,
|
| 10 |
+
0
|
| 11 |
+
],
|
| 12 |
+
"compress_rope_parameters": {
|
| 13 |
+
"partial_rotary_factor": 0.125,
|
| 14 |
+
"rope_theta": 160000.0,
|
| 15 |
+
"rope_type": "default"
|
| 16 |
+
},
|
| 17 |
+
"compress_rope_theta": 160000.0,
|
| 18 |
+
"dtype": "bfloat16",
|
| 19 |
+
"eos_token_id": 1,
|
| 20 |
+
"first_k_dense_replace": null,
|
| 21 |
+
"hc_eps": 1e-06,
|
| 22 |
+
"hc_mult": 4,
|
| 23 |
+
"hc_sinkhorn_iters": 20,
|
| 24 |
+
"head_dim": 512,
|
| 25 |
+
"hidden_act": "silu",
|
| 26 |
+
"hidden_size": 8,
|
| 27 |
+
"index_head_dim": 128,
|
| 28 |
+
"index_n_heads": 64,
|
| 29 |
+
"index_topk": 512,
|
| 30 |
+
"initializer_range": 0.02,
|
| 31 |
+
"intermediate_size": 32,
|
| 32 |
+
"kv_lora_rank": null,
|
| 33 |
+
"max_position_embeddings": 1048576,
|
| 34 |
+
"model_type": "deepseek_v4",
|
| 35 |
+
"moe_intermediate_size": 2048,
|
| 36 |
+
"n_group": null,
|
| 37 |
+
"n_routed_experts": 256,
|
| 38 |
+
"n_shared_experts": 1,
|
| 39 |
+
"norm_topk_prob": true,
|
| 40 |
+
"num_attention_heads": 4,
|
| 41 |
+
"num_experts_per_tok": 6,
|
| 42 |
+
"num_hash_layers": 3,
|
| 43 |
+
"num_hidden_layers": 2,
|
| 44 |
+
"num_key_value_heads": 2,
|
| 45 |
+
"num_nextn_predict_layers": 1,
|
| 46 |
+
"o_groups": 8,
|
| 47 |
+
"o_lora_rank": 1024,
|
| 48 |
+
"output_router_logits": false,
|
| 49 |
+
"pad_token_id": null,
|
| 50 |
+
"partial_rotary_factor": 0.125,
|
| 51 |
+
"pretraining_tp": 1,
|
| 52 |
+
"q_lora_rank": 1024,
|
| 53 |
+
"qk_nope_head_dim": 448,
|
| 54 |
+
"qk_rope_head_dim": 64,
|
| 55 |
+
"rms_norm_eps": 1e-06,
|
| 56 |
+
"rope_interleave": true,
|
| 57 |
+
"rope_parameters": {
|
| 58 |
+
"partial_rotary_factor": 0.125,
|
| 59 |
+
"rope_theta": 10000.0,
|
| 60 |
+
"rope_type": "default"
|
| 61 |
+
},
|
| 62 |
+
"rope_theta": 10000.0,
|
| 63 |
+
"routed_scaling_factor": 1.5,
|
| 64 |
+
"router_aux_loss_coef": 0.001,
|
| 65 |
+
"router_jitter_noise": 0.0,
|
| 66 |
+
"scoring_func": "sqrtsoftplus",
|
| 67 |
+
"sliding_window": 128,
|
| 68 |
+
"swiglu_limit": 10.0,
|
| 69 |
+
"tie_word_embeddings": false,
|
| 70 |
+
"topk_group": null,
|
| 71 |
+
"transformers_version": "5.7.0.dev0",
|
| 72 |
+
"use_cache": true,
|
| 73 |
+
"v_head_dim": null,
|
| 74 |
+
"vocab_size": 129280
|
| 75 |
+
}
|
generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 0,
|
| 4 |
+
"do_sample": true,
|
| 5 |
+
"eos_token_id": 1,
|
| 6 |
+
"temperature": 1.0,
|
| 7 |
+
"top_p": 1.0,
|
| 8 |
+
"transformers_version": "5.7.0.dev0"
|
| 9 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:53f6bf42b1d190e81a90dc7b9f30eaa4b3fa650313fe4785d26b4cb11bf7adef
|
| 3 |
+
size 83997562
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"bos_token": "<|begin▁of▁sentence|>",
|
| 4 |
+
"clean_up_tokenization_spaces": false,
|
| 5 |
+
"eos_token": "<|end▁of▁sentence|>",
|
| 6 |
+
"is_local": false,
|
| 7 |
+
"legacy": true,
|
| 8 |
+
"local_files_only": false,
|
| 9 |
+
"model_max_length": 1048576,
|
| 10 |
+
"pad_token": "<|end▁of▁sentence|>",
|
| 11 |
+
"sp_model_kwargs": {},
|
| 12 |
+
"tokenizer_class": "TokenizersBackend",
|
| 13 |
+
"unk_token": null
|
| 14 |
+
}
|