aiqtech and zhiyuan8 committed on
Commit
e3b1a57
·
verified ·
0 Parent(s):

Duplicate from moonshotai/Kimi-Linear-48B-A3B-Instruct


Co-authored-by: LiZhiyuan <zhiyuan8@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ figures/arch.png filter=lfs diff=lfs merge=lfs -text
+ figures/perf_speed.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,113 @@
+ ---
+ license: mit
+ pipeline_tag: text-generation
+ library_name: transformers
+ ---
+
+ <div align="center">
+ <a href="https://huggingface.co/papers/2510.26692"><img width="80%" src="figures/banner.png"></a>
+ </div>
+
+ <div align="center">
+ <a href="https://huggingface.co/papers/2510.26692"><img src="figures/logo.png" height="16" width="16" style="display: inline-block; vertical-align: middle; margin: 2px;"><b style="display: inline-block;"> Tech Report</b></a> |
+ <a href="https://github.com/MoonshotAI/Kimi-Linear"><img src="figures/github.png" height="16" width="16" style="display: inline-block; vertical-align: middle; margin: 2px;"><b style="display: inline-block;"> Code</b></a> |
+ <a href="https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct"><img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" height="16" width="16" style="display: inline-block; vertical-align: middle; margin: 2px;"><b style="display: inline-block;"> HuggingFace</b></a>
+ </div>
+
+ <div align="center">
+ <img width="90%" src="figures/perf_speed.png">
+ <p><em><b>(a)</b> On MMLU-Pro (4k context length), Kimi Linear scores 51.0 at a speed comparable to full attention. On RULER (128k context length), it is Pareto-optimal, reaching 84.3 with a 3.98x speedup. <b>(b)</b> Kimi Linear achieves 6.3x faster TPOT than MLA, offering significant speedups at long sequence lengths (1M tokens).</em></p>
+ </div>
+
+ ## Overview
+
+ Kimi Linear is a hybrid linear attention architecture that outperforms traditional full attention methods across various contexts, including short, long, and reinforcement learning (RL) scaling regimes.
+ At its core is Kimi Delta Attention (KDA), a refined version of [Gated DeltaNet](https://arxiv.org/abs/2412.06464) that introduces a more efficient gating mechanism to optimize the use of finite-state RNN memory.
+
+ Kimi Linear achieves superior performance and hardware efficiency, especially for long-context tasks. It reduces the need for large KV caches by up to 75% (in this model, only the 7 global-attention layers out of 27 keep a growing KV cache) and boosts decoding throughput by up to $6\times$ for contexts as long as 1M tokens.
+
+ We open-source the KDA kernel in [FLA](https://github.com/fla-org/flash-linear-attention/tree/main/fla/ops/kda), and release two model checkpoints (Base and Instruct), both trained on 5.7T tokens.
+
+ | **Model** | **#Total Params** | **#Activated Params** | **Context Length** | **Download Link** |
+ | :------------------: | :---------------: | :-------------------: | :----------------: | :------------------------------------------------------------------------------: |
+ | Kimi-Linear-Base | 48B | 3B | 1M | [🤗 Hugging Face](https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Base) |
+ | Kimi-Linear-Instruct | 48B | 3B | 1M | [🤗 Hugging Face](https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct) |
+
+ ## Key Features
+
+ - **Kimi Delta Attention (KDA):** A linear attention mechanism that refines the gated delta rule with fine-grained gating.
+ - **Hybrid Architecture:** A 3:1 ratio of KDA to global MLA layers reduces memory usage while matching or surpassing full-attention quality (see the config sketch below).
+ - **Superior Performance:** Outperforms full attention on a wide range of tasks, including long-context and RL-style benchmarks, in fair comparisons at the 1.4T-token training scale.
+ - **High Throughput:** Achieves up to 6&times; faster decoding and significantly reduces time per output token (TPOT).
+
+ <div align="center">
+ <img width="60%" src="figures/arch.png">
+ </div>
+
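+ The per-layer split between KDA and global-attention (MLA) blocks is recorded in `config.json` under `linear_attn_config`. As a minimal sketch (it only reads fields shipped in this repo's config; the layer indices there are 1-based), you can inspect the hybrid layout like this:
+
+ ```py
+ from transformers import AutoConfig
+
+ config = AutoConfig.from_pretrained(
+     "moonshotai/Kimi-Linear-48B-A3B-Instruct", trust_remote_code=True
+ )
+
+ kda_layers = set(config.linear_attn_config["kda_layers"])
+ # Every layer that is not a KDA layer uses global MLA attention (3:1 ratio overall).
+ for layer in range(1, config.num_hidden_layers + 1):
+     kind = "KDA" if layer in kda_layers else "MLA (full attention)"
+     print(f"layer {layer:2d}: {kind}")
+ ```
+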
+ ## Usage
+
+ ### Inference with Hugging Face Transformers
+
+ To use the Kimi Linear model, we recommend the following environment:
+
+ * `python` >= 3.10
+ * `torch` >= 2.6
+ * `fla-core` >= 0.4.0
+
+ ```shell
+ pip install -U fla-core
+ ```
+
+ Example Code:
+ ```py
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_name = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype="auto",
+     device_map="auto",
+     trust_remote_code=True
+ )
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+ messages = [
+     {"role": "system", "content": "You are a helpful assistant provided by Moonshot-AI."},
+     {"role": "user", "content": "Is 123 a prime?"}
+ ]
+ input_ids = tokenizer.apply_chat_template(
+     messages,
+     add_generation_prompt=True,
+     return_tensors="pt"
+ ).to(model.device)
+ generated_ids = model.generate(inputs=input_ids, max_new_tokens=500)
+ response = tokenizer.batch_decode(generated_ids)[0]
+ print(response)
+ ```
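+
+ If you prefer to stream tokens as they are generated, a minimal variant of the example above (using transformers' built-in `TextStreamer`; it reuses `model`, `tokenizer`, and `input_ids` from the snippet) is:
+
+ ```py
+ from transformers import TextStreamer
+
+ # Prints decoded tokens to stdout incrementally instead of waiting for generate() to finish.
+ streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+ _ = model.generate(inputs=input_ids, max_new_tokens=500, streamer=streamer)
+ ```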
+
+ ### Deployment
+
+ For deployment, you can use the latest vLLM to create an OpenAI-compatible API endpoint.
+
+ ```sh
+ vllm serve moonshotai/Kimi-Linear-48B-A3B-Instruct \
+   --port 8000 \
+   --tensor-parallel-size 4 \
+   --max-model-len 1048576 \
+   --trust-remote-code
+ ```
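+
+ Once the server is running, any OpenAI-compatible client can talk to it. A small sketch using the `openai` Python package (the base URL and model name follow the `vllm serve` command above; vLLM ignores the API key unless one was configured):
+
+ ```py
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+ response = client.chat.completions.create(
+     model="moonshotai/Kimi-Linear-48B-A3B-Instruct",
+     messages=[{"role": "user", "content": "Is 123 a prime?"}],
+     max_tokens=200,
+ )
+ print(response.choices[0].message.content)
+ ```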
+
+ ### Citation
+
+ If you find our work useful, please cite:
+ ```bibtex
+ @misc{team2025kimi,
+   title = {Kimi Linear: An Expressive, Efficient Attention Architecture},
+   author = {Zhang, Yu and Lin, Zongyu and Yao, Xingcheng and Hu, Jiaxi and Meng, Fanqing and Liu, Chengyin and Men, Xin and Yang, Songlin and Li, Zhiyuan and Li, Wentao and Lu, Enzhe and Liu, Weizhou and Chen, Yanru and Xu, Weixin and Yu, Longhui and Wang, Yejie and Fan, Yu and Zhong, Longguang and Yuan, Enming and Zhang, Dehao and Zhang, Yizhi and T. Liu, Y. and Wang, Haiming and Fang, Shengjun and He, Weiran and Liu, Shaowei and Li, Yiwei and Su, Jianlin and Qiu, Jiezhong and Pang, Bo and Yan, Junjie and Jiang, Zhejun and Huang, Weixiao and Yin, Bohong and You, Jiacheng and Wei, Chu and Wang, Zhengtao and Hong, Chao and Chen, Yutian and Chen, Guanduo and Wang, Yucheng and Zheng, Huabin and Wang, Feng and Liu, Yibo and Dong, Mengnan and Zhang, Zheng and Pan, Siyuan and Wu, Wenhao and Wu, Yuhao and Guan, Longyu and Tao, Jiawen and Fu, Guohong and Xu, Xinran and Wang, Yuzhi and Lai, Guokun and Wu, Yuxin and Zhou, Xinyu and Yang, Zhilin and Du, Yulun},
+   year = {2025},
+   eprint = {2510.26692},
+   archivePrefix = {arXiv},
+   primaryClass = {cs.CL}
+ }
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,48 @@
1
+ {% macro render_content(msg) -%}
2
+ {%- set c = msg.get('content') -%}
3
+ {%- if c is string -%}
4
+ {{ c }}
5
+ {%- elif c is not none -%}
6
+ {% for content in c -%}
7
+ {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
8
+ <|media_start|>image<|media_content|><|media_pad|><|media_end|>
9
+ {% else -%}
10
+ {{ content['text'] }}
11
+ {%- endif -%}
12
+ {%- endfor -%}
13
+ {%- endif -%}
14
+ {%- endmacro %}
15
+
16
+
17
+ {%- if tools -%}
18
+ <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
19
+ {%- endif -%}
20
+ {% for message in messages %}
21
+ {%- set role_name = message.get('name') or message['role'] -%}
22
+ {%- if message['role'] == 'user' -%}
23
+ <|im_user|>{{role_name}}<|im_middle|>
24
+ {%- elif message['role'] == 'assistant' -%}
25
+ <|im_assistant|>{{role_name}}<|im_middle|>
26
+ {%- else -%}
27
+ <|im_system|>{{role_name}}<|im_middle|>
28
+ {%- endif -%}
29
+
30
+ {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
31
+ {{render_content(message)}}<|tool_calls_section_begin|>
32
+ {%- for tool_call in message['tool_calls'] -%}
33
+ {%- set formatted_id = tool_call['id'] -%}
34
+ <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
35
+ {%- endfor -%}
36
+ <|tool_calls_section_end|>
37
+ {%- elif message['role'] == 'tool' -%}
38
+ {%- set tool_call_id = message.tool_call_id -%}
39
+ ## Return of {{ tool_call_id }}
40
+ {{render_content(message)}}
41
+ {%- elif message['content'] is not none -%}
42
+ {{render_content(message)}}
43
+ {%- endif -%}
44
+ <|im_end|>
45
+ {%- endfor -%}
46
+ {%- if add_generation_prompt -%}
47
+ <|im_assistant|>assistant<|im_middle|>
48
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,86 @@
1
+ {
2
+ "architectures": [
3
+ "KimiLinearForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_kimi.KimiLinearConfig",
7
+ "AutoModel": "modeling_kimi.KimiLinearModel",
8
+ "AutoModelForCausalLM": "modeling_kimi.KimiLinearForCausalLM"
9
+ },
10
+ "bos_token_id": 163584,
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 163586,
13
+ "first_k_dense_replace": 1,
14
+ "head_dim": 72,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 2304,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 9216,
19
+ "kv_lora_rank": 512,
20
+ "linear_attn_config": {
21
+ "full_attn_layers": [
22
+ 4,
23
+ 8,
24
+ 12,
25
+ 16,
26
+ 20,
27
+ 24,
28
+ 27
29
+ ],
30
+ "head_dim": 128,
31
+ "kda_layers": [
32
+ 1,
33
+ 2,
34
+ 3,
35
+ 5,
36
+ 6,
37
+ 7,
38
+ 9,
39
+ 10,
40
+ 11,
41
+ 13,
42
+ 14,
43
+ 15,
44
+ 17,
45
+ 18,
46
+ 19,
47
+ 21,
48
+ 22,
49
+ 23,
50
+ 25,
51
+ 26
52
+ ],
53
+ "num_heads": 32,
54
+ "short_conv_kernel_size": 4
55
+ },
56
+ "mla_use_nope": true,
57
+ "model_max_length": 1048576,
58
+ "model_type": "kimi_linear",
59
+ "moe_intermediate_size": 1024,
60
+ "moe_layer_freq": 1,
61
+ "moe_renormalize": true,
62
+ "moe_router_activation_func": "sigmoid",
63
+ "num_attention_heads": 32,
64
+ "num_expert_group": 1,
65
+ "num_experts": 256,
66
+ "num_experts_per_token": 8,
67
+ "num_hidden_layers": 27,
68
+ "num_key_value_heads": 32,
69
+ "num_nextn_predict_layers": 0,
70
+ "num_shared_experts": 1,
71
+ "pad_token_id": 163839,
72
+ "q_lora_rank": null,
73
+ "qk_nope_head_dim": 128,
74
+ "qk_rope_head_dim": 64,
75
+ "rms_norm_eps": 1e-05,
76
+ "rope_scaling": null,
77
+ "rope_theta": 10000.0,
78
+ "routed_scaling_factor": 2.446,
79
+ "tie_word_embeddings": false,
80
+ "topk_group": 1,
81
+ "transformers_version": "4.57.1",
82
+ "use_cache": true,
83
+ "use_grouped_topk": true,
84
+ "v_head_dim": 128,
85
+ "vocab_size": 163840
86
+ }
configuration_kimi.py ADDED
@@ -0,0 +1,140 @@
1
+ # coding=utf-8
2
+ from typing import Optional
3
+
4
+ from transformers.configuration_utils import PretrainedConfig
5
+
6
+
7
+ class KimiLinearConfig(PretrainedConfig):
8
+ model_type = "kimi_linear"
9
+ keys_to_ignore_at_inference = ["past_key_values"]
10
+
11
+ def __init__(
12
+ self,
13
+ model_type="kimi_linear",
14
+ vocab_size=163840,
15
+ hidden_size=4096,
16
+ head_dim=None,
17
+ intermediate_size=11008,
18
+ num_hidden_layers=32,
19
+ num_attention_heads=32,
20
+ num_key_value_heads=None,
21
+ hidden_act="silu",
22
+ initializer_range=0.02,
23
+ rms_norm_eps=1e-6,
24
+ use_cache=True,
25
+ pad_token_id=0,
26
+ bos_token_id=1,
27
+ eos_token_id=2,
28
+ rope_theta=10000.0,
29
+ rope_scaling=None,
30
+ tie_word_embeddings=False,
31
+ moe_intermediate_size: Optional[int] = None,
32
+ moe_renormalize: bool = True,
33
+ moe_router_activation_func: str = "sigmoid",
34
+ num_experts: Optional[int] = None,
35
+ num_experts_per_token: Optional[int] = None,
36
+ num_shared_experts: int = 0,
37
+ routed_scaling_factor: float = 1.0,
38
+ first_k_dense_replace: int = 0,
39
+ moe_layer_freq: int = 1,
40
+ use_grouped_topk: bool = True,
41
+ num_expert_group: int = 1,
42
+ topk_group: int = 1,
43
+ q_lora_rank: Optional[int] = None,
44
+ kv_lora_rank: Optional[int] = None,
45
+ qk_nope_head_dim: Optional[int] = None,
46
+ qk_rope_head_dim: Optional[int] = None,
47
+ v_head_dim: Optional[int] = None,
48
+ mla_use_nope: Optional[bool] = False,
49
+ num_nextn_predict_layers: int = 0,
50
+ linear_attn_config: Optional[dict] = None,
51
+ **kwargs,
52
+ ):
53
+ self.model_type = model_type
54
+ self.vocab_size = vocab_size
55
+ self.hidden_size = hidden_size
56
+ self.head_dim = (
57
+ head_dim if head_dim is not None else hidden_size // num_attention_heads
58
+ )
59
+ self.intermediate_size = intermediate_size
60
+ self.num_hidden_layers = num_hidden_layers
61
+ self.num_attention_heads = num_attention_heads
62
+
63
+ # for backward compatibility
64
+ if num_key_value_heads is None:
65
+ num_key_value_heads = num_attention_heads
66
+
67
+ self.num_key_value_heads = num_key_value_heads
68
+ self.hidden_act = hidden_act
69
+ self.initializer_range = initializer_range
70
+ self.rms_norm_eps = rms_norm_eps
71
+ self.use_cache = use_cache
72
+ self.rope_theta = rope_theta
73
+ self.rope_scaling = rope_scaling
74
+
75
+ self.q_lora_rank = q_lora_rank
76
+ self.kv_lora_rank = kv_lora_rank
77
+ self.qk_nope_head_dim = qk_nope_head_dim
78
+ self.qk_rope_head_dim = qk_rope_head_dim
79
+ self.v_head_dim = v_head_dim
80
+ self.mla_use_nope = mla_use_nope
81
+ # moe config
82
+ self.num_experts = num_experts
83
+ self.num_experts_per_token = num_experts_per_token
84
+ self.moe_renormalize = moe_renormalize
85
+ self.num_shared_experts = num_shared_experts
86
+ self.routed_scaling_factor = routed_scaling_factor
87
+ self.moe_router_activation_func = moe_router_activation_func
88
+ assert self.moe_router_activation_func in ("softmax", "sigmoid")
89
+ self.moe_intermediate_size = moe_intermediate_size
90
+ self.first_k_dense_replace = first_k_dense_replace
91
+ self.moe_layer_freq = moe_layer_freq
92
+ self.use_grouped_topk = use_grouped_topk
93
+ self.num_expert_group = num_expert_group
94
+ self.topk_group = topk_group
95
+ self.num_nextn_predict_layers = num_nextn_predict_layers
96
+
97
+ if linear_attn_config is not None:
98
+ assert linear_attn_config["kda_layers"] is not None
99
+ assert linear_attn_config["full_attn_layers"] is not None
100
+ self.linear_attn_config = linear_attn_config
101
+
102
+ super().__init__(
103
+ pad_token_id=pad_token_id,
104
+ bos_token_id=bos_token_id,
105
+ eos_token_id=eos_token_id,
106
+ tie_word_embeddings=tie_word_embeddings,
107
+ **kwargs,
108
+ )
109
+
110
+ @property
111
+ def is_mla(self):
112
+ return (
113
+ self.q_lora_rank is not None
114
+ or self.kv_lora_rank is not None
115
+ or self.qk_nope_head_dim is not None
116
+ or self.qk_rope_head_dim is not None
117
+ or self.v_head_dim is not None
118
+ or self.mla_use_nope is True
119
+ )
120
+
121
+ @property
122
+ def is_moe(self):
123
+ return self.num_experts is not None
124
+
125
+ @property
126
+ def is_linear_attn(self) -> bool:
127
+ return not (
128
+ self.linear_attn_config is None
129
+ or (
130
+ isinstance(self.linear_attn_config, dict)
131
+ and self.linear_attn_config["kda_layers"] is not None
132
+ and len(self.linear_attn_config["kda_layers"]) == 0
133
+ )
134
+ )
135
+
136
+ def is_kda_layer(self, layer_idx: int):
137
+ return (
138
+ self.linear_attn_config is not None
139
+ and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
140
+ )
figures/arch.png ADDED

Git LFS Details

  • SHA256: 132ae021fa4661ed39e7be784d46f05f22b82aabb9afd2bab8dbdc0a5a61cba0
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
figures/banner.png ADDED
figures/github.png ADDED
figures/logo.png ADDED
figures/perf_speed.png ADDED

Git LFS Details

  • SHA256: f8951e618db41ae57fa0cec4845d7b275dffbd7f9db12c6496bfea536c625aea
  • Pointer size: 131 Bytes
  • Size of remote file: 160 kB
generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 163584,
4
+ "eos_token_id": 163586,
5
+ "pad_token_id": 163839,
6
+ "transformers_version": "4.57.1"
7
+ }
model-00001-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5c908aa3b86b6486080b577cb7aa8dbe9ca7cb18789653768017e602b61a7f
3
+ size 4999482712
model-00002-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fcb34e9ebe2434f32761c06ef17a465157308e6e583eb7eb70cc25e57cd2cb0
3
+ size 4999923264
model-00003-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f35d2a95dd1e3170fd642d0db4d0d07933985ef59041494652092cc27893e231
3
+ size 4997138040
model-00004-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eda18b6226777bb9a07584dfa64986ac4f28a26cee3203f16ffb14deef9ef48b
3
+ size 4997148016
model-00005-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:652e8a43d493105176807d256af0a5c56e45c6d783e6c8221832918f3425c0a0
3
+ size 4999923296
model-00006-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77f5dc551436c934f0991eee0c319e0f33689ea3c10cb8cb8f48acc32238526f
3
+ size 4997138040
model-00007-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3980c7efccdc6a27633eb8909afc381cb780cccaa7be0347d1645496ea3eb5a2
3
+ size 4997148128
model-00008-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dab91b8eaed9874c75de99a4a08669f520fa3e2c8977175333db552504a1c5d3
3
+ size 4999924384
model-00009-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13f6ae84d557682ec4a0fc8b6090d4f89cdd26e5e216445cc9d77a65c7f4c90b
3
+ size 4997139104
model-00010-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d58ab0a201e26ff429b9d18678a76f3a3284ad977719e055f94d892133ee247b
3
+ size 4997149016
model-00011-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:142aa90317af104b2d9f5a6ae4dc661f4a7f7c152f83d6c2477de8037be92201
3
+ size 4999924408
model-00012-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb1d4fe2d94a04898eb8300b5144923e5540a75091ee5f4c8b67936a69d91780
3
+ size 4997139104
model-00013-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12fb6f2dea889d460f33f7fbb55f76d7beb468698cf56707f4d77a9ab69461d3
3
+ size 4997148992
model-00014-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf178169e0dfdc721492f1a98a5be2ef5f66fd8569039f5a77819641a5a1b32d
3
+ size 4999924440
model-00015-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e65460934e8794faeadd6d8cbeffd23fcdbf07d9c61ca92ef97afc95d0ccdaa
3
+ size 4997139104
model-00016-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05cc77846f94d50dc09180f5844f01aee38e489b1fd833d8c7aec6a62214ef03
3
+ size 4997148960
model-00017-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eaf238a2f1a971ef311c445309de323992da59287d55759cf2a4a3a85ca6a1cc
3
+ size 4999924472
model-00018-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:315cdb6964a975522cdb755cf5eb76b46478346b015113e241f81127ad9e6fd4
3
+ size 4997139104
model-00019-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbdc2c77e41baa76a2c2b3ced0a59fe7587e95ca3d1acc75247b88a80dee3041
3
+ size 4999934384
model-00020-of-00020.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f1e4a9194d045e01c90ed2697939bcedd533b6aa1f1b97b0ae0a5932e5a4bc7
3
+ size 3280687152
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_kimi.py ADDED
@@ -0,0 +1,1028 @@
1
+ import math
2
+ from collections.abc import Callable
3
+ from typing import Any, List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import transformers
8
+ from einops import rearrange
9
+ from packaging import version
10
+ from torch import nn
11
+ from transformers.activations import ACT2FN
12
+ from transformers.cache_utils import Cache
13
+ from transformers.generation import GenerationMixin
14
+ from transformers.masking_utils import create_causal_mask
15
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
16
+ from transformers.modeling_outputs import (BaseModelOutputWithPast,
17
+ CausalLMOutputWithPast)
18
+ from transformers.modeling_utils import (ALL_ATTENTION_FUNCTIONS,
19
+ PreTrainedModel)
20
+ from transformers.processing_utils import Unpack
21
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
22
+ from transformers.utils import (TransformersKwargs, auto_docstring,
23
+ can_return_tuple, logging)
24
+ from transformers.utils.generic import OutputRecorder, check_model_inputs
25
+
26
+ try:
27
+ from fla.layers.utils import get_unpad_data, index_first_axis, pad_input
28
+ from fla.modules import FusedRMSNormGated, ShortConvolution
29
+ from fla.ops.kda import chunk_kda, fused_recurrent_kda
30
+ from fla.ops.kda.gate import fused_kda_gate
31
+ except ImportError:
32
+ raise ImportError("Please run `pip install -U fla-core`")
33
+
34
+ from .configuration_kimi import KimiLinearConfig
35
+
36
+ assert version.parse(transformers.__version__) >= version.parse("4.56.0"), \
37
+ "Please upgrade transformers to >= 4.56.0"
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ class KimiDynamicCache:
43
+ """
44
+ Dynamic cache for Kimi model.
45
+ Inspired by Qwen3-Next
46
+ """
47
+ is_compileable = False
48
+
49
+ def __init__(self, config: KimiLinearConfig):
50
+ super().__init__()
51
+ self.config = config
52
+
53
+ if config.linear_attn_config is not None:
54
+ self.layer_types = []
55
+ for i in range(config.num_hidden_layers):
56
+ if config.is_kda_layer(i):
57
+ self.layer_types.append("linear_attention")
58
+ else:
59
+ self.layer_types.append("full_attention")
60
+ else:
61
+ self.layer_types = ["full_attention"] * config.num_hidden_layers
62
+
63
+ self.transformer_layers = [
64
+ i for i in range(config.num_hidden_layers) if self.layer_types[i] == "full_attention"
65
+ ]
66
+
67
+ linear_layers = [i for i in range(
68
+ config.num_hidden_layers) if self.layer_types[i] == "linear_attention"]
69
+ self.last_linear_layer = linear_layers[-1] if linear_layers else -1
70
+
71
+ self.conv_states = [None for _ in range(config.num_hidden_layers)]
72
+ self.recurrent_states = [None for _ in range(config.num_hidden_layers)]
73
+ self.key_cache = [None for _ in range(config.num_hidden_layers)]
74
+ self.value_cache = [None for _ in range(config.num_hidden_layers)]
75
+
76
+ def __len__(self):
77
+ return len(self.layer_types)
78
+
79
+ def update(
80
+ self,
81
+ key_states: torch.Tensor,
82
+ value_states: torch.Tensor,
83
+ layer_idx: int,
84
+ cache_kwargs: Optional[dict[str, Any]] = None,
85
+ ) -> tuple[torch.Tensor, torch.Tensor]:
86
+ if self.key_cache[layer_idx] is None:
87
+ self.key_cache[layer_idx] = key_states
88
+ self.value_cache[layer_idx] = value_states
89
+ else:
90
+ self.key_cache[layer_idx] = torch.cat(
91
+ [self.key_cache[layer_idx], key_states], dim=2)
92
+ self.value_cache[layer_idx] = torch.cat(
93
+ [self.value_cache[layer_idx], value_states], dim=2)
94
+
95
+ return self.key_cache[layer_idx], self.value_cache[layer_idx]
96
+
97
+ def reorder_cache(self, beam_idx: torch.LongTensor):
98
+ """Reorders the cache for beam search, given the selected beam indices."""
99
+ for layer_idx in range(len(self.key_cache)):
100
+ if self.key_cache[layer_idx] is not None:
101
+ device = self.key_cache[layer_idx].device
102
+ beam_idx = beam_idx.to(device)
103
+ self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(
104
+ 0, beam_idx)
105
+ self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(
106
+ 0, beam_idx)
107
+
108
+ if self.conv_states[layer_idx] is not None:
109
+ device = self.conv_states[layer_idx][0].device
110
+ beam_idx = beam_idx.to(device)
111
+ q_conv, k_conv, v_conv = self.conv_states[layer_idx]
112
+ self.conv_states[layer_idx] = (
113
+ q_conv.index_select(0, beam_idx),
114
+ k_conv.index_select(0, beam_idx),
115
+ v_conv.index_select(0, beam_idx)
116
+ )
117
+ self.recurrent_states[layer_idx] = self.recurrent_states[layer_idx].index_select(
118
+ 0, beam_idx)
119
+
120
+ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
121
+ """Returns the sequence length of the cached states. A layer index can be optionally passed."""
122
+ # take any layer that contains cache and not empty tensor
123
+ layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
124
+ if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx] is None:
125
+ return 0
126
+ return self.key_cache[layer_idx].shape[-2]
127
+
128
+ def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
129
+ """
130
+ Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
131
+ the given layer at `layer_idx`.
132
+ The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns for each layer.
133
+ """
134
+ kv_offset = 0
135
+ query_length = cache_position.shape[0]
136
+ past_seen_tokens = self.get_seq_length(layer_idx)
137
+ kv_length = query_length + past_seen_tokens
138
+ return kv_length, kv_offset
139
+
140
+ @property
141
+ def has_previous_state(self):
142
+ """We have a previous state if the last linear (conv) layer was already updated."""
143
+ if self.last_linear_layer == -1:
144
+ return False
145
+ return self.conv_states[self.last_linear_layer] is not None
146
+
147
+
148
+ class KimiRMSNorm(nn.Module):
149
+ def __init__(self, hidden_size, eps=1e-6):
150
+ """
151
+ KimiRMSNorm is equivalent to T5LayerNorm
152
+ """
153
+ super().__init__()
154
+ self.weight = nn.Parameter(torch.ones(hidden_size))
155
+ self.variance_epsilon = eps
156
+
157
+ def forward(self, hidden_states):
158
+ input_dtype = hidden_states.dtype
159
+ hidden_states = hidden_states.to(torch.float32)
160
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
161
+ hidden_states = hidden_states * \
162
+ torch.rsqrt(variance + self.variance_epsilon)
163
+ return self.weight * hidden_states.to(input_dtype)
164
+
165
+
166
+ ALL_LAYERNORM_LAYERS.append(KimiRMSNorm)
167
+
168
+
169
+ class KimiBlockSparseMLP(nn.Module):
170
+ def __init__(self, config: KimiLinearConfig, hidden_size=None, intermediate_size=None):
171
+ super().__init__()
172
+ self.config = config
173
+ self.ffn_dim = config.intermediate_size if intermediate_size is None else intermediate_size
174
+ self.hidden_dim = config.hidden_size if hidden_size is None else hidden_size
175
+
176
+ self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) # gate
177
+ self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False) # down
178
+ self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) # up
179
+
180
+ self.act_fn = ACT2FN[config.hidden_act]
181
+
182
+ def forward(self, hidden_states):
183
+ current_hidden_states = self.act_fn(
184
+ self.w1(hidden_states)) * self.w3(hidden_states)
185
+ current_hidden_states = self.w2(current_hidden_states)
186
+ return current_hidden_states
187
+
188
+
189
+ class KimiMLP(nn.Module):
190
+ def __init__(self, config: KimiLinearConfig, hidden_size=None, intermediate_size=None):
191
+ super().__init__()
192
+ self.config = config
193
+ self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
194
+ self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
195
+ self.gate_proj = nn.Linear(
196
+ self.hidden_size, self.intermediate_size, bias=False)
197
+ self.up_proj = nn.Linear(
198
+ self.hidden_size, self.intermediate_size, bias=False)
199
+ self.down_proj = nn.Linear(
200
+ self.intermediate_size, self.hidden_size, bias=False)
201
+ self.act_fn = ACT2FN[config.hidden_act]
202
+
203
+ def forward(self, x):
204
+ down_proj = self.down_proj(self.act_fn(
205
+ self.gate_proj(x)) * self.up_proj(x))
206
+ return down_proj
207
+
208
+
209
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
210
+ """
211
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
212
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
213
+ """
214
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
215
+ if n_rep == 1:
216
+ return hidden_states
217
+ hidden_states = hidden_states[:, :, None, :, :].expand(
218
+ batch, num_key_value_heads, n_rep, slen, head_dim)
219
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
220
+
221
+
222
+ def eager_attention_forward(
223
+ module: nn.Module,
224
+ query: torch.Tensor,
225
+ key: torch.Tensor,
226
+ value: torch.Tensor,
227
+ attention_mask: Optional[torch.Tensor],
228
+ scaling: float,
229
+ dropout: float = 0.0,
230
+ **kwargs: Unpack[TransformersKwargs],
231
+ ):
232
+ key_states = repeat_kv(key, module.num_key_value_groups)
233
+ value_states = repeat_kv(value, module.num_key_value_groups)
234
+
235
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
236
+ if attention_mask is not None:
237
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
238
+ attn_weights = attn_weights + causal_mask
239
+
240
+ attn_weights = nn.functional.softmax(
241
+ attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
242
+ attn_weights = nn.functional.dropout(
243
+ attn_weights, p=dropout, training=module.training)
244
+ attn_output = torch.matmul(attn_weights, value_states)
245
+ attn_output = attn_output.transpose(1, 2).contiguous()
246
+
247
+ return attn_output, attn_weights
248
+
249
+
250
+ class KimiMLAAttention(nn.Module):
251
+ """
252
+ Multi-Latent Attention adapted from deepseek-v3
253
+ """
254
+
255
+ def __init__(self, config: KimiLinearConfig, layer_idx: int):
256
+ nn.Module.__init__(self)
257
+ self.config = config
258
+ self.layer_idx = layer_idx
259
+ self.hidden_size = config.hidden_size
260
+ self.num_heads = config.num_attention_heads
261
+ self.num_key_value_heads = config.num_key_value_heads
262
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
263
+
264
+ self.rope_theta = config.rope_theta
265
+ self.attention_dropout = getattr(config, "attention_dropout", 0.0)
266
+
267
+ try:
268
+ self.q_lora_rank = config.q_lora_rank
269
+ self.qk_rope_head_dim = config.qk_rope_head_dim
270
+ self.kv_lora_rank = config.kv_lora_rank
271
+ self.v_head_dim = config.v_head_dim
272
+ self.qk_nope_head_dim = config.qk_nope_head_dim
273
+ self.q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
274
+ self.use_nope = config.mla_use_nope
275
+ self.scaling = self.q_head_dim ** (-0.5)
276
+ except Exception as e:
277
+ raise ValueError(
278
+ f"Kimi MLA config is not found or not properly formatted: {e}")
279
+
280
+ assert self.q_lora_rank is None
281
+ self.q_proj = nn.Linear(
282
+ self.hidden_size, self.num_heads * self.q_head_dim, bias=False,
283
+ )
284
+ self.kv_a_proj_with_mqa = nn.Linear(
285
+ self.hidden_size,
286
+ self.kv_lora_rank + self.qk_rope_head_dim,
287
+ bias=False,
288
+ )
289
+ self.kv_a_layernorm = KimiRMSNorm(self.kv_lora_rank)
290
+ self.kv_b_proj = nn.Linear(
291
+ self.kv_lora_rank,
292
+ self.num_heads
293
+ * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
294
+ bias=False,
295
+ )
296
+ self.o_proj = nn.Linear(
297
+ self.num_heads * self.v_head_dim,
298
+ self.hidden_size,
299
+ bias=False,
300
+ )
301
+ self.is_causal = True
302
+ assert self.use_nope
303
+
304
+ def forward(
305
+ self,
306
+ hidden_states: torch.Tensor,
307
+ attention_mask: Optional[torch.Tensor] = None,
308
+ past_key_values: Optional[Cache] = None,
309
+ **kwargs,
310
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
311
+ batch_size, seq_length = hidden_states.shape[:-1]
312
+ query_shape = (batch_size, seq_length, -1, self.q_head_dim)
313
+ key_shape = (batch_size, seq_length, -1,
314
+ self.qk_nope_head_dim + self.v_head_dim)
315
+
316
+ q_states = self.q_proj(hidden_states)
317
+ q_states = q_states.view(query_shape).transpose(1, 2)
318
+ q_pass, q_rot = torch.split(
319
+ q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
320
+
321
+ compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
322
+ k_pass, k_rot = torch.split(
323
+ compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
324
+
325
+ k_pass = self.kv_b_proj(self.kv_a_layernorm(
326
+ k_pass)).view(key_shape).transpose(1, 2)
327
+ k_pass, value_states = torch.split(
328
+ k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
329
+
330
+ k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)
331
+ k_rot = k_rot.expand(*k_pass.shape[:-1], -1)
332
+
333
+ query_states = torch.cat((q_pass, q_rot), dim=-1)
334
+ key_states = torch.cat((k_pass, k_rot), dim=-1)
335
+
336
+ if past_key_values is not None:
337
+ key_states, value_states = past_key_values.update(
338
+ key_states, value_states, self.layer_idx)
339
+
340
+ if self.config._attn_implementation == "flash_attention_2" and self.q_head_dim != self.v_head_dim:
341
+ value_states = F.pad(
342
+ value_states, [0, self.q_head_dim - self.v_head_dim])
343
+
344
+ attention_interface: Callable = eager_attention_forward
345
+ if self.config._attn_implementation != "eager":
346
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
347
+
348
+ attn_output, _ = attention_interface(
349
+ self,
350
+ query_states,
351
+ key_states,
352
+ value_states,
353
+ attention_mask,
354
+ dropout=0.0 if not self.training else self.attention_dropout,
355
+ scaling=self.scaling,
356
+ **kwargs,
357
+ )
358
+
359
+ if self.config._attn_implementation == "flash_attention_2" and self.q_head_dim != self.v_head_dim:
360
+ attn_output = attn_output[:, :, :, : self.v_head_dim]
361
+
362
+ attn_output = attn_output.reshape(
363
+ batch_size, seq_length, -1).contiguous()
364
+ attn_output = self.o_proj(attn_output)
365
+ return attn_output
366
+
367
+
368
+ class KimiDeltaAttention(nn.Module):
369
+ def __init__(self, config: KimiLinearConfig, layer_idx: int):
370
+ super().__init__()
371
+ self.config = config
372
+ self.mode = "chunk"
373
+
374
+ self.hidden_size = config.hidden_size
375
+ self.conv_size = config.linear_attn_config["short_conv_kernel_size"]
376
+ self.head_dim = config.linear_attn_config["head_dim"]
377
+ self.num_heads = config.linear_attn_config["num_heads"]
378
+ self.head_k_dim = self.head_dim
379
+ self.num_k_heads = self.num_heads
380
+
381
+ self.layer_idx = layer_idx
382
+
383
+ assert self.mode in [
384
+ 'chunk', 'fused_recurrent'], f"Unsupported mode `{self.mode}`."
385
+
386
+ projection_k_size = self.head_k_dim * self.num_k_heads
387
+ projection_size = self.head_dim * self.num_heads
388
+
389
+ self.q_proj = nn.Linear(
390
+ self.hidden_size, projection_k_size, bias=False)
391
+ self.k_proj = nn.Linear(
392
+ self.hidden_size, projection_k_size, bias=False)
393
+ self.v_proj = nn.Linear(self.hidden_size, projection_size, bias=False)
394
+
395
+ self.q_conv1d = ShortConvolution(
396
+ hidden_size=projection_k_size,
397
+ kernel_size=self.conv_size,
398
+ activation='silu',
399
+ )
400
+ self.k_conv1d = ShortConvolution(
401
+ hidden_size=projection_k_size,
402
+ kernel_size=self.conv_size,
403
+ activation='silu'
404
+ )
405
+ self.v_conv1d = ShortConvolution(
406
+ hidden_size=projection_size,
407
+ kernel_size=self.conv_size,
408
+ activation='silu'
409
+ )
410
+
411
+ self.A_log = torch.nn.Parameter(torch.log(torch.empty(
412
+ self.num_heads, dtype=torch.float32).uniform_(1, 16)).view(1, 1, -1, 1))
413
+
414
+ self.f_a_proj = nn.Linear(self.hidden_size, self.head_dim, bias=False)
415
+ self.f_b_proj = nn.Linear(self.head_dim, projection_size, bias=False)
416
+
417
+ self.dt_bias = nn.Parameter(
418
+ torch.empty(projection_size, dtype=torch.float32))
419
+
420
+ self.b_proj = nn.Linear(self.hidden_size, self.num_heads, bias=False)
421
+
422
+ self.g_a_proj = nn.Linear(self.hidden_size, self.head_dim, bias=False)
423
+ self.g_b_proj = nn.Linear(self.head_dim, projection_size, bias=False)
424
+
425
+ self.o_norm = FusedRMSNormGated(
426
+ self.head_dim, eps=config.rms_norm_eps, activation='sigmoid')
427
+ self.o_proj = nn.Linear(projection_size, self.hidden_size, bias=False)
428
+
429
+ def forward(
430
+ self,
431
+ hidden_states: torch.Tensor,
432
+ attention_mask: Optional[torch.Tensor] = None,
433
+ cache_params: Optional[KimiDynamicCache] = None,
434
+ **kwargs: Unpack[dict]
435
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
436
+ if attention_mask is not None:
437
+ if attention_mask.dim() != 2:
438
+ attention_mask = kwargs.get("padding_mask", None)
439
+
440
+ if attention_mask is not None and attention_mask.dim() != 2:
441
+ raise ValueError(
442
+ "attention_mask must be a 0-1 matrix of shape [batch_size, seq_len] "
443
+ "(0 = padding). 3D masks are not supported here."
444
+ )
445
+ use_cache = cache_params is not None
446
+ batch_size, q_len, _ = hidden_states.shape
447
+ mode = 'fused_recurrent' if q_len <= 64 else self.mode
448
+ if self.training:
449
+ assert mode == 'chunk', "Only chunk mode is supported in training."
450
+
451
+ cu_seqlens = kwargs.get('cu_seqlens', None)
452
+ indices = None
453
+ if attention_mask is not None:
454
+ indices, cu_seqlens, _ = get_unpad_data(attention_mask[:, -q_len:])
455
+ hidden_states = index_first_axis(
456
+ rearrange(hidden_states, "b s ... -> (b s) ..."), indices).unsqueeze(0)
457
+
458
+ conv_state_q, conv_state_k, conv_state_v = None, None, None
459
+ recurrent_state = None
460
+ if cache_params is not None:
461
+ if cache_params.conv_states[self.layer_idx] is not None:
462
+ conv_state_q, conv_state_k, conv_state_v = cache_params.conv_states[
463
+ self.layer_idx]
464
+ recurrent_state = cache_params.recurrent_states[self.layer_idx]
465
+ q, conv_state_q = self.q_conv1d(
466
+ x=self.q_proj(hidden_states),
467
+ cache=conv_state_q,
468
+ output_final_state=use_cache,
469
+ cu_seqlens=cu_seqlens
470
+ )
471
+ k, conv_state_k = self.k_conv1d(
472
+ x=self.k_proj(hidden_states),
473
+ cache=conv_state_k,
474
+ output_final_state=use_cache,
475
+ cu_seqlens=cu_seqlens
476
+ )
477
+ v, conv_state_v = self.v_conv1d(
478
+ x=self.v_proj(hidden_states),
479
+ cache=conv_state_v,
480
+ output_final_state=use_cache,
481
+ cu_seqlens=cu_seqlens
482
+ )
483
+ g = self.f_b_proj(self.f_a_proj(hidden_states))
484
+ g = fused_kda_gate(g, self.A_log, self.head_dim, g_bias=self.dt_bias)
485
+ beta = self.b_proj(hidden_states).float().sigmoid()
486
+
487
+ q, k = map(lambda x: rearrange(
488
+ x, '... (h d) -> ... h d', d=self.head_k_dim), (q, k))
489
+ v = rearrange(v, '... (h d) -> ... h d', d=self.head_dim)
490
+
491
+ if mode == 'chunk':
492
+ o, recurrent_state = chunk_kda(
493
+ q=q,
494
+ k=k,
495
+ v=v,
496
+ g=g,
497
+ beta=beta,
498
+ initial_state=recurrent_state,
499
+ output_final_state=True,
500
+ use_qk_l2norm_in_kernel=True,
501
+ cu_seqlens=cu_seqlens,
502
+ )
503
+ else:
504
+ o, recurrent_state = fused_recurrent_kda(
505
+ q=q,
506
+ k=k,
507
+ v=v,
508
+ g=g,
509
+ beta=beta,
510
+ initial_state=recurrent_state,
511
+ output_final_state=True,
512
+ use_qk_l2norm_in_kernel=True,
513
+ cu_seqlens=cu_seqlens,
514
+ )
515
+ if cache_params is not None:
516
+ cache_params.recurrent_states[self.layer_idx] = recurrent_state
517
+ cache_params.conv_states[self.layer_idx] = (
518
+ conv_state_q, conv_state_k, conv_state_v)
519
+
520
+ g = self.g_b_proj(self.g_a_proj(hidden_states))
521
+ g = rearrange(g, '... (h d) -> ... h d', d=self.head_dim)
522
+ o = self.o_norm(o, g)
523
+
524
+ o = rearrange(o, 'b t h d -> b t (h d)')
525
+ o = self.o_proj(o)
526
+ if attention_mask is not None:
527
+ o = pad_input(o.squeeze(0), indices, batch_size, q_len)
528
+
529
+ return o
530
+
531
+
532
+ class KimiMoEGate(nn.Module):
533
+ """
534
+ MoEGate adapted from Deepseek-V3.
535
+ Parameter correspondences:
536
+ num_experts -> n_routed_experts
537
+ num_experts_per_token -> num_experts_per_tok
538
+ num_expert_group -> n_group
539
+ moe_router_activation_func -> scoring_func
540
+ """
541
+
542
+ def __init__(self, config: KimiLinearConfig):
543
+ super().__init__()
544
+ self.config = config
545
+ self.top_k = config.num_experts_per_token
546
+ self.num_experts = config.num_experts
547
+ self.routed_scaling_factor = config.routed_scaling_factor
548
+ self.moe_router_activation_func = config.moe_router_activation_func
549
+ self.num_expert_group = getattr(config, "num_expert_group", 1)
550
+ self.topk_group = getattr(config, "topk_group", 1)
551
+
552
+ # topk selection algorithm
553
+ self.moe_renormalize = config.moe_renormalize
554
+ self.gating_dim = config.hidden_size
555
+ self.weight = nn.Parameter(
556
+ torch.empty((self.num_experts, self.gating_dim))
557
+ )
558
+
559
+ self.e_score_correction_bias = nn.Parameter(
560
+ torch.empty((self.num_experts))
561
+ )
562
+ self.reset_parameters()
563
+
564
+ def reset_parameters(self) -> None:
565
+ import torch.nn.init as init
566
+
567
+ init.kaiming_uniform_(self.weight, a=math.sqrt(5))
568
+
569
+ def forward(self, hidden_states):
570
+ bsz, seq_len, h = hidden_states.shape
571
+ # compute gating score
572
+ hidden_states = hidden_states.view(-1, h)
573
+ logits = F.linear(
574
+ hidden_states.type(torch.float32), self.weight.type(
575
+ torch.float32), None
576
+ )
577
+ if self.moe_router_activation_func == "sigmoid":
578
+ scores = logits.sigmoid()
579
+ elif self.moe_router_activation_func == "softmax":
580
+ scores = logits.softmax(dim=1)
581
+ else:
582
+ raise NotImplementedError(
583
+ f"Unsupported scoring function for MoE gating: {self.moe_router_activation_func}"
584
+ )
585
+
586
+ # select top-k experts
587
+ assert not self.training
588
+ scores_for_choice = scores.view(bsz * seq_len, -1)
589
+ scores_for_choice += self.e_score_correction_bias.unsqueeze(0)
590
+ group_scores = (
591
+ scores_for_choice.view(
592
+ bsz * seq_len, self.num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
593
+ ) # [n, num_expert_group]
594
+ group_idx = torch.topk(
595
+ group_scores, k=self.topk_group, dim=-1, sorted=False
596
+ )[
597
+ 1
598
+ ] # [n, top_k_group]
599
+ group_mask = torch.zeros_like(group_scores) # [n, num_expert_group]
600
+ group_mask.scatter_(1, group_idx, 1) # [n, num_expert_group]
601
+ score_mask = (
602
+ group_mask.unsqueeze(-1)
603
+ .expand(
604
+ bsz * seq_len, self.num_expert_group, self.num_experts // self.num_expert_group
605
+ )
606
+ .reshape(bsz * seq_len, -1)
607
+ ) # [n, e]
608
+ tmp_scores = scores_for_choice.masked_fill(
609
+ ~score_mask.bool(), 0.0) # [n, e]
610
+ _, topk_idx = torch.topk(
611
+ tmp_scores, k=self.top_k, dim=-1, sorted=False
612
+ )
613
+ topk_weight = scores.gather(1, topk_idx)
614
+
615
+ # norm gate to sum 1
616
+ if self.top_k > 1 and self.moe_renormalize:
617
+ denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
618
+ topk_weight = topk_weight / denominator
619
+ # must multiply the scaling factor
620
+ topk_weight = topk_weight * self.routed_scaling_factor
621
+
622
+ return topk_idx, topk_weight
623
+
624
+
625
+ class KimiSparseMoeBlock(nn.Module):
626
+ """
627
+ Adapted from Deepseek-V3's MOE implementation
628
+ The namings are consistent with Kimi's version.
629
+ """
630
+
631
+ def __init__(self, config: KimiLinearConfig):
632
+ super().__init__()
633
+ self.config = config
634
+ self.hidden_dim = config.hidden_size
635
+ self.num_experts = config.num_experts
636
+ self.top_k = config.num_experts_per_token
637
+ self.moe_renormalize = config.moe_renormalize
638
+
639
+ self.ep_size = 1
640
+ self.experts_per_rank = config.num_experts
641
+ self.ep_rank = 0
642
+ self.experts = nn.ModuleList(
643
+ [
644
+ KimiBlockSparseMLP(
645
+ config, intermediate_size=config.moe_intermediate_size
646
+ )
647
+ for _ in range(config.num_experts)
648
+ ]
649
+ )
650
+ self.gate = KimiMoEGate(config)
651
+ if config.num_shared_experts is not None:
652
+ intermediate_size = config.moe_intermediate_size * config.num_shared_experts
653
+ self.shared_experts = KimiMLP(
654
+ config=config, intermediate_size=intermediate_size
655
+ )
656
+
657
+ def forward(self, hidden_states):
658
+ identity = hidden_states
659
+ orig_shape = hidden_states.shape
660
+ topk_idx, topk_weight = self.gate(hidden_states)
661
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
662
+ flat_topk_idx = topk_idx.view(-1)
663
+ if not self.training:
664
+ y = self.moe_infer(hidden_states, topk_idx,
665
+ topk_weight).view(*orig_shape)
666
+ else:
667
+ raise NotImplementedError(
668
+ "Training mode is not supported in KimiSparseMoeBlock")
669
+ if self.config.num_shared_experts is not None:
670
+ y = y + self.shared_experts(identity)
671
+ return y
672
+
673
+ @torch.no_grad()
674
+ def moe_infer(self, x, topk_ids, topk_weight):
675
+ cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
676
+ cnts.scatter_(1, topk_ids, 1)
677
+ tokens_per_expert = cnts.sum(dim=0)
678
+ idxs = topk_ids.view(-1).argsort()
679
+ sorted_tokens = x[idxs // topk_ids.shape[1]]
680
+
681
+ tokens_per_expert = tokens_per_expert.cpu().numpy()
682
+
683
+ outputs = []
684
+ start_idx = 0
685
+ for i, num_tokens in enumerate(tokens_per_expert):
686
+ end_idx = start_idx + num_tokens
687
+ if num_tokens == 0:
688
+ continue
689
+ expert = self.experts[i + self.ep_rank * self.experts_per_rank]
690
+ tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
691
+ expert_out = expert(tokens_for_this_expert)
692
+ outputs.append(expert_out)
693
+ start_idx = end_idx
694
+
695
+ outs = torch.cat(outputs, dim=0) if len(
696
+ outputs) else sorted_tokens.new_empty(0)
697
+
698
+ new_x = torch.empty_like(outs)
699
+ new_x[idxs] = outs
700
+ final_out = (
701
+ new_x.view(*topk_ids.shape, -1)
702
+ .type(topk_weight.dtype)
703
+ .mul_(topk_weight.unsqueeze(dim=-1))
704
+ .sum(dim=1)
705
+ .type(new_x.dtype)
706
+ )
707
+ return final_out
708
+
709
+
710
+ class KimiDecoderLayer(nn.Module):
711
+ def __init__(self, config: KimiLinearConfig, layer_idx: int):
712
+ super().__init__()
713
+ self.hidden_size = config.hidden_size
714
+ self.config = config
715
+ if config.is_kda_layer(layer_idx):
716
+ self.is_linear_attn = True
717
+ self.self_attn = KimiDeltaAttention(
718
+ config=config, layer_idx=layer_idx)
719
+ elif config.is_mla:
720
+ self.is_linear_attn = False
721
+ self.self_attn = KimiMLAAttention(
722
+ config=config, layer_idx=layer_idx)
723
+ else:
724
+ raise NotImplementedError
725
+ if (
726
+ config.num_experts is not None
727
+ and layer_idx >= config.first_k_dense_replace
728
+ and layer_idx % getattr(config, "moe_layer_freq", 1) == 0
729
+ ):
730
+ self.block_sparse_moe = KimiSparseMoeBlock(config)
731
+ else:
732
+ self.mlp = KimiMLP(config)
733
+ self.input_layernorm = KimiRMSNorm(
734
+ config.hidden_size, eps=config.rms_norm_eps)
735
+ self.post_attention_layernorm = KimiRMSNorm(
736
+ config.hidden_size, eps=config.rms_norm_eps)
737
+
738
+ def forward(
739
+ self,
740
+ hidden_states: torch.Tensor,
741
+ attention_mask: Optional[torch.Tensor] = None,
742
+ position_ids: Optional[torch.LongTensor] = None,
743
+ past_key_values: Optional[Tuple[torch.Tensor]] = None,
744
+ output_attentions: Optional[bool] = False,
745
+ use_cache: Optional[bool] = False,
746
+ **kwargs: Unpack[FlashAttentionKwargs],
747
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
748
+ """
749
+ Args:
750
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
751
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
752
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
753
+ output_attentions (`bool`, *optional*):
754
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
755
+ returned tensors for more detail.
756
+ use_cache (`bool`, *optional*):
757
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
758
+ (see `past_key_values`).
759
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
760
+ """
761
+
762
+ residual = hidden_states
763
+
764
+ hidden_states = self.input_layernorm(hidden_states)
765
+
766
+ # Self Attention
767
+ if self.is_linear_attn is False:
768
+ hidden_states = self.self_attn(
769
+ hidden_states=hidden_states,
770
+ attention_mask=attention_mask,
771
+ position_ids=position_ids,
772
+ past_key_values=past_key_values,
773
+ output_attentions=output_attentions,
774
+ use_cache=use_cache,
775
+ **kwargs,
776
+ )
777
+ else:
778
+ hidden_states = self.self_attn(
779
+ hidden_states=hidden_states,
780
+ attention_mask=attention_mask,
781
+ cache_params=past_key_values,
782
+ output_attentions=output_attentions,
783
+ use_cache=use_cache,
784
+ **kwargs,
785
+ )
786
+ hidden_states = residual + hidden_states
787
+
788
+ # Fully Connected
789
+ residual = hidden_states
790
+ hidden_states = self.post_attention_layernorm(hidden_states)
791
+ if hasattr(self, "block_sparse_moe"):
792
+ hidden_states = self.block_sparse_moe(hidden_states)
793
+ else:
794
+ hidden_states = self.mlp(hidden_states)
795
+ hidden_states = residual + hidden_states
796
+
797
+ return hidden_states
798
+
799
+
800
+ class KimiPreTrainedModel(PreTrainedModel):
801
+ config_class = KimiLinearConfig
802
+ base_model_prefix = "model"
803
+ supports_gradient_checkpointing = True
804
+ _no_split_modules = ["KimiDecoderLayer"]
805
+ _skip_keys_device_placement = "past_key_values"
806
+ _supports_flash_attn_2 = True
807
+ _can_record_outputs = {
808
+ "router_logits": OutputRecorder(KimiBlockSparseMLP, index=1),
809
+ "hidden_states": KimiDecoderLayer,
810
+ "attentions": KimiMLAAttention,
811
+ }
812
+ _is_stateful = True
813
+
814
+ def _init_weights(self, module):
815
+ std = self.config.initializer_range
816
+ if isinstance(module, nn.Linear):
817
+ module.weight.data.normal_(mean=0.0, std=std)
818
+ if module.bias is not None:
819
+ module.bias.data.zero_()
820
+ elif isinstance(module, nn.Embedding):
821
+ module.weight.data.normal_(mean=0.0, std=std)
822
+ if module.padding_idx is not None:
823
+ module.weight.data[module.padding_idx].zero_()
824
+
825
+
826
+ class KimiLinearModel(KimiPreTrainedModel):
827
+ def __init__(self, config: KimiLinearConfig):
828
+ super().__init__(config)
829
+ self.padding_idx = config.pad_token_id
830
+ self.vocab_size = config.vocab_size
831
+
832
+ self.embed_tokens = nn.Embedding(
833
+ config.vocab_size, config.hidden_size, self.padding_idx)
834
+ self.layers = nn.ModuleList([KimiDecoderLayer(
835
+ config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
836
+ self.norm = KimiRMSNorm(
837
+ config.hidden_size, eps=config.rms_norm_eps)
838
+
839
+ if getattr(config, "_attn_implementation", None) is not None:
840
+ if config._attn_implementation != "flash_attention_2":
841
+ logger.warning_once(
842
+ f"Ignoring the provided attention implementation {config._attn_implementation}")
843
+ logger.warning_once("Using flash_attention_2 backend instead.")
844
+ config._attn_implementation = "flash_attention_2"
845
+ else:
846
+ config._attn_implementation = "flash_attention_2"
847
+
848
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
849
+ self.gradient_checkpointing = False
850
+ # Initialize weights and apply final processing
851
+ self.post_init()
852
+
853
+ def _update_linear_attn_mask(self, attention_mask, cache_position):
854
+ """
855
+ NOTE: Left-padding is assumed when building the linear-attention mask.
856
+ There is no need to zero out states when
857
+ 1. running a cached (decoding) forward pass, or
858
+ 2. attending to all inputs (no padding present).
859
+ """
860
+ linear_attn_mask = attention_mask
861
+ if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
862
+ linear_attn_mask = None
863
+ return linear_attn_mask
864
+
865
+ @check_model_inputs
866
+ @auto_docstring
867
+ def forward(
868
+ self,
869
+ input_ids: torch.LongTensor = None,
870
+ attention_mask: Optional[torch.Tensor] = None,
871
+ position_ids: Optional[torch.LongTensor] = None,
872
+ past_key_values: Optional[Cache] = None,
873
+ inputs_embeds: Optional[torch.FloatTensor] = None,
874
+ cache_position: Optional[torch.LongTensor] = None,
875
+ use_cache: Optional[bool] = None,
876
+ **kwargs: Unpack[TransformersKwargs],
877
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
878
+
879
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
880
+
881
+ if (input_ids is None) ^ (inputs_embeds is not None):
882
+ raise ValueError(
883
+ "You must specify exactly one of input_ids or inputs_embeds")
884
+
885
+ # Get inputs_embeds
886
+ if inputs_embeds is None:
887
+ inputs_embeds = self.embed_tokens(input_ids)
888
+
889
+ if use_cache and past_key_values is None:
890
+ past_key_values = KimiDynamicCache(config=self.config)
891
+
892
+ if cache_position is None:
893
+ past_seen_tokens = past_key_values.get_seq_length(
894
+ ) if past_key_values is not None else 0
895
+ cache_position: torch.Tensor = torch.arange(
896
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
897
+ )
898
+
899
+ if position_ids is None:
900
+ position_ids = cache_position.unsqueeze(0)
901
+
902
+ causal_mask = create_causal_mask(
903
+ config=self.config,
904
+ input_embeds=inputs_embeds,
905
+ attention_mask=attention_mask,
906
+ cache_position=cache_position,
907
+ past_key_values=past_key_values,
908
+ position_ids=position_ids,
909
+ )
910
+ linear_attn_mask = self._update_linear_attn_mask(
911
+ attention_mask, cache_position)
912
+
913
+ hidden_states = inputs_embeds
914
+ if past_key_values is not None:
915
+ assert isinstance(past_key_values, KimiDynamicCache)
916
+
917
+ for decoder_layer in self.layers:
918
+ layer_mask = linear_attn_mask if decoder_layer.is_linear_attn else causal_mask
919
+
920
+ hidden_states = decoder_layer(
921
+ hidden_states,
922
+ attention_mask=layer_mask,
923
+ past_key_values=past_key_values,
924
+ cache_position=cache_position,
925
+ **kwargs,
926
+ )
927
+
928
+ hidden_states = self.norm(hidden_states)
929
+
930
+ return BaseModelOutputWithPast(
931
+ last_hidden_state=hidden_states,
932
+ past_key_values=past_key_values,
933
+ )
934
+
935
+
936
+ class KimiLinearForCausalLM(KimiPreTrainedModel, GenerationMixin):
937
+ _tied_weights_keys = ["lm_head.weight"]
938
+
939
+ def __init__(self, config):
940
+ super().__init__(config)
941
+ self.model = KimiLinearModel(config)
942
+ self.vocab_size = config.vocab_size
943
+ self.lm_head = nn.Linear(
944
+ config.hidden_size, config.vocab_size, bias=False)
945
+
946
+ # Initialize weights and apply final processing
947
+ self.post_init()
948
+
949
+ @can_return_tuple
950
+ @auto_docstring
951
+ def forward(
952
+ self,
953
+ input_ids: torch.LongTensor = None,
954
+ attention_mask: Optional[torch.Tensor] = None,
955
+ position_ids: Optional[torch.LongTensor] = None,
956
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
957
+ inputs_embeds: Optional[torch.FloatTensor] = None,
958
+ labels: Optional[torch.LongTensor] = None,
959
+ use_cache: Optional[bool] = None,
960
+ output_attentions: Optional[bool] = None,
961
+ output_hidden_states: Optional[bool] = None,
962
+ generation_mode: Optional[bool] = None,
963
+ return_dict: Optional[bool] = None,
964
+ cache_position: Optional[torch.LongTensor] = None,
965
+ **kwargs: Unpack[TransformersKwargs],
966
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
967
+ r"""
968
+ Args:
969
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
970
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
971
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
972
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
973
+
974
+ Returns:
975
+
976
+ Example:
977
+
978
+ ```python
979
+ >>> from transformers import AutoTokenizer, KimiLinearForCausalLM
980
+
981
+ >>> model = KimiLinearForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
982
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
983
+
984
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
985
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
986
+
987
+ >>> # Generate
988
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
989
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
990
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
991
+ ```"""
992
+
993
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
994
+ output_hidden_states = (
995
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
996
+ )
997
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
998
+
999
+ outputs = self.model(
1000
+ input_ids=input_ids,
1001
+ attention_mask=attention_mask,
1002
+ position_ids=position_ids,
1003
+ past_key_values=past_key_values,
1004
+ inputs_embeds=inputs_embeds,
1005
+ use_cache=use_cache,
1006
+ output_attentions=output_attentions,
1007
+ output_hidden_states=output_hidden_states,
1008
+ return_dict=return_dict,
1009
+ cache_position=cache_position,
1010
+ )
1011
+
1012
+ hidden_states = outputs.last_hidden_state
1013
+ if generation_mode:
1014
+ hidden_states = hidden_states[:, -1:]
1015
+ logits = self.lm_head(hidden_states)
1016
+
1017
+ loss = None
1018
+ if labels is not None:
1019
+ loss = self.loss_function(
1020
+ logits, labels, self.vocab_size, **kwargs)
1021
+
1022
+ return CausalLMOutputWithPast(
1023
+ loss=loss,
1024
+ logits=logits,
1025
+ past_key_values=outputs.past_key_values,
1026
+ hidden_states=outputs.hidden_states,
1027
+ attentions=outputs.attentions,
1028
+ )
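For quick orientation, here is a minimal usage sketch for the classes added above. It is not part of the commit; the repository id, dtype, and generation settings are illustrative assumptions, and loading the custom `KimiLinearForCausalLM` code requires `trust_remote_code=True`.

```python
# Minimal sketch (assumption: the repo's auto_map routes AutoModel/AutoTokenizer
# to the custom classes added in this commit).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",       # keep the checkpoint's dtype
    device_map="auto",
    trust_remote_code=True,   # required: KimiLinearForCausalLM is custom code
)

prompt = "Hey, are you conscious? Can you talk to me?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])
```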
special_tokens_map.json ADDED
@@ -0,0 +1,260 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "[extra_id_0]",
4
+ "[extra_id_1]",
5
+ "[extra_id_2]",
6
+ "[extra_id_3]",
7
+ "[start_header_id]",
8
+ "[end_header_id]",
9
+ "[extra_id_4]",
10
+ "[EOT]",
11
+ "[extra_id_5]",
12
+ "[extra_id_6]",
13
+ "[extra_id_7]",
14
+ "[extra_id_8]",
15
+ "[extra_id_9]",
16
+ "[extra_id_10]",
17
+ "[extra_id_11]",
18
+ "[extra_id_12]",
19
+ "[extra_id_13]",
20
+ "[extra_id_14]",
21
+ "[extra_id_15]",
22
+ "[extra_id_16]",
23
+ "[extra_id_17]",
24
+ "[extra_id_18]",
25
+ "[extra_id_19]",
26
+ "[extra_id_20]",
27
+ "[extra_id_21]",
28
+ "[extra_id_22]",
29
+ "[extra_id_23]",
30
+ "[extra_id_24]",
31
+ "[extra_id_25]",
32
+ "[extra_id_26]",
33
+ "[extra_id_27]",
34
+ "[extra_id_28]",
35
+ "[extra_id_29]",
36
+ "[extra_id_30]",
37
+ "[extra_id_31]",
38
+ "[extra_id_32]",
39
+ "[extra_id_33]",
40
+ "[extra_id_34]",
41
+ "[extra_id_35]",
42
+ "[extra_id_36]",
43
+ "[extra_id_37]",
44
+ "[extra_id_38]",
45
+ "[extra_id_39]",
46
+ "[extra_id_40]",
47
+ "[extra_id_41]",
48
+ "[extra_id_42]",
49
+ "[extra_id_43]",
50
+ "[extra_id_44]",
51
+ "[extra_id_45]",
52
+ "[extra_id_46]",
53
+ "[extra_id_47]",
54
+ "[extra_id_48]",
55
+ "[extra_id_49]",
56
+ "[extra_id_50]",
57
+ "[extra_id_51]",
58
+ "[extra_id_52]",
59
+ "[extra_id_53]",
60
+ "[extra_id_54]",
61
+ "[extra_id_55]",
62
+ "[extra_id_56]",
63
+ "[extra_id_57]",
64
+ "[extra_id_58]",
65
+ "[extra_id_59]",
66
+ "[extra_id_60]",
67
+ "[extra_id_61]",
68
+ "[extra_id_62]",
69
+ "[extra_id_63]",
70
+ "[extra_id_64]",
71
+ "[extra_id_65]",
72
+ "[extra_id_66]",
73
+ "[extra_id_67]",
74
+ "[extra_id_68]",
75
+ "[extra_id_69]",
76
+ "[extra_id_70]",
77
+ "[extra_id_71]",
78
+ "[extra_id_72]",
79
+ "[extra_id_73]",
80
+ "[extra_id_74]",
81
+ "[extra_id_75]",
82
+ "[extra_id_76]",
83
+ "[extra_id_77]",
84
+ "[extra_id_78]",
85
+ "[extra_id_79]",
86
+ "[extra_id_80]",
87
+ "[extra_id_81]",
88
+ "[extra_id_82]",
89
+ "[extra_id_83]",
90
+ "[extra_id_84]",
91
+ "[extra_id_85]",
92
+ "[extra_id_86]",
93
+ "[extra_id_87]",
94
+ "[extra_id_88]",
95
+ "[extra_id_89]",
96
+ "[extra_id_90]",
97
+ "[extra_id_91]",
98
+ "[extra_id_92]",
99
+ "[extra_id_93]",
100
+ "[extra_id_94]",
101
+ "[extra_id_95]",
102
+ "[extra_id_96]",
103
+ "[extra_id_97]",
104
+ "[extra_id_98]",
105
+ "[extra_id_99]",
106
+ "[extra_id_100]",
107
+ "[extra_id_101]",
108
+ "[extra_id_102]",
109
+ "[extra_id_103]",
110
+ "[extra_id_104]",
111
+ "[extra_id_105]",
112
+ "[extra_id_106]",
113
+ "[extra_id_107]",
114
+ "[extra_id_108]",
115
+ "[extra_id_109]",
116
+ "[extra_id_110]",
117
+ "[extra_id_111]",
118
+ "[extra_id_112]",
119
+ "[extra_id_113]",
120
+ "[extra_id_114]",
121
+ "[extra_id_115]",
122
+ "[extra_id_116]",
123
+ "[extra_id_117]",
124
+ "[extra_id_118]",
125
+ "[extra_id_119]",
126
+ "[extra_id_120]",
127
+ "[extra_id_121]",
128
+ "[extra_id_122]",
129
+ "[extra_id_123]",
130
+ "[extra_id_124]",
131
+ "[extra_id_125]",
132
+ "[extra_id_126]",
133
+ "[extra_id_127]",
134
+ "[extra_id_128]",
135
+ "[extra_id_129]",
136
+ "[extra_id_130]",
137
+ "[extra_id_131]",
138
+ "[extra_id_132]",
139
+ "[extra_id_133]",
140
+ "[extra_id_134]",
141
+ "[extra_id_135]",
142
+ "[extra_id_136]",
143
+ "[extra_id_137]",
144
+ "[extra_id_138]",
145
+ "[extra_id_139]",
146
+ "[extra_id_140]",
147
+ "[extra_id_141]",
148
+ "[extra_id_142]",
149
+ "[extra_id_143]",
150
+ "[extra_id_144]",
151
+ "[extra_id_145]",
152
+ "[extra_id_146]",
153
+ "[extra_id_147]",
154
+ "[extra_id_148]",
155
+ "[extra_id_149]",
156
+ "[extra_id_150]",
157
+ "[extra_id_151]",
158
+ "[extra_id_152]",
159
+ "[extra_id_153]",
160
+ "[extra_id_154]",
161
+ "[extra_id_155]",
162
+ "[extra_id_156]",
163
+ "[extra_id_157]",
164
+ "[extra_id_158]",
165
+ "[extra_id_159]",
166
+ "[extra_id_160]",
167
+ "[extra_id_161]",
168
+ "[extra_id_162]",
169
+ "[extra_id_163]",
170
+ "[extra_id_164]",
171
+ "[extra_id_165]",
172
+ "[extra_id_166]",
173
+ "[extra_id_167]",
174
+ "[extra_id_168]",
175
+ "[extra_id_169]",
176
+ "[extra_id_170]",
177
+ "[extra_id_171]",
178
+ "[extra_id_172]",
179
+ "[extra_id_173]",
180
+ "[extra_id_174]",
181
+ "[extra_id_175]",
182
+ "[extra_id_176]",
183
+ "[extra_id_177]",
184
+ "[extra_id_178]",
185
+ "[extra_id_179]",
186
+ "[extra_id_180]",
187
+ "[extra_id_181]",
188
+ "[extra_id_182]",
189
+ "[extra_id_183]",
190
+ "[extra_id_184]",
191
+ "[extra_id_185]",
192
+ "[extra_id_186]",
193
+ "[extra_id_187]",
194
+ "[extra_id_188]",
195
+ "[extra_id_189]",
196
+ "[extra_id_190]",
197
+ "[extra_id_191]",
198
+ "[extra_id_192]",
199
+ "[extra_id_193]",
200
+ "[extra_id_194]",
201
+ "[extra_id_195]",
202
+ "[extra_id_196]",
203
+ "[extra_id_197]",
204
+ "[extra_id_198]",
205
+ "[extra_id_199]",
206
+ "[extra_id_200]",
207
+ "[extra_id_201]",
208
+ "[extra_id_202]",
209
+ "[extra_id_203]",
210
+ "[extra_id_204]",
211
+ "[extra_id_205]",
212
+ "[extra_id_206]",
213
+ "[extra_id_207]",
214
+ "[extra_id_208]",
215
+ "[extra_id_209]",
216
+ "[extra_id_210]",
217
+ "[extra_id_211]",
218
+ "[extra_id_212]",
219
+ "[extra_id_213]",
220
+ "[extra_id_214]",
221
+ "[extra_id_215]",
222
+ "[extra_id_216]",
223
+ "[extra_id_217]",
224
+ "[extra_id_218]",
225
+ "[extra_id_219]",
226
+ "[extra_id_220]",
227
+ "[extra_id_221]",
228
+ "[extra_id_222]",
229
+ "[extra_id_223]",
230
+ "[extra_id_224]",
231
+ "[extra_id_225]",
232
+ "[extra_id_226]",
233
+ "[extra_id_227]",
234
+ "[extra_id_228]",
235
+ "[extra_id_229]",
236
+ "[extra_id_230]",
237
+ "[extra_id_231]",
238
+ "[extra_id_232]",
239
+ "[extra_id_233]",
240
+ "[extra_id_234]",
241
+ "[extra_id_235]",
242
+ "[extra_id_236]",
243
+ "[extra_id_237]",
244
+ "[extra_id_238]",
245
+ "[extra_id_239]",
246
+ "[extra_id_240]",
247
+ "[extra_id_241]",
248
+ "[extra_id_242]",
249
+ "[extra_id_243]",
250
+ "[extra_id_244]",
251
+ "[extra_id_245]",
252
+ "[extra_id_246]",
253
+ "[extra_id_247]",
254
+ "[extra_id_248]"
255
+ ],
256
+ "bos_token": "[BOS]",
257
+ "eos_token": "[EOS]",
258
+ "pad_token": "[extra_id_250]",
259
+ "unk_token": "[extra_id_249]"
260
+ }
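As a sanity check, the role assignments in `special_tokens_map.json` can be inspected on a loaded tokenizer; this illustration reuses the `tokenizer` object from the sketch above and makes no claims beyond what the file states.

```python
# Illustration only: the file above reserves the [extra_id_*] placeholders (plus header
# and [EOT] markers) as additional special tokens and names the bos/eos/pad/unk roles.
print(tokenizer.special_tokens_map["bos_token"])   # "[BOS]"
print(tokenizer.special_tokens_map["eos_token"])   # "[EOS]"
print(len(tokenizer.additional_special_tokens))    # number of extra special tokens registered
```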
tiktoken.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
3
+ size 2795286
tokenization_kimi.py ADDED
@@ -0,0 +1,347 @@
1
+ import os
2
+ import tiktoken
3
+
4
+ from logging import getLogger
5
+ from pathlib import Path
6
+ from typing import (
7
+ cast,
8
+ Tuple,
9
+ Dict,
10
+ Iterator,
11
+ List,
12
+ Union,
13
+ Optional,
14
+ )
15
+ from shutil import copyfile
16
+ from tiktoken.load import load_tiktoken_bpe
17
+ from tokenizers import AddedToken, pre_tokenizers, Regex
18
+ from transformers.tokenization_utils import PreTrainedTokenizer
19
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
20
+ from typing import Any
21
+
22
+
23
+ logger = getLogger(__name__)
24
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
25
+
26
+
27
+ class TikTokenTokenizer(PreTrainedTokenizer):
28
+ """
29
+ Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
30
+
31
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
32
+ this superclass for more information regarding those methods.
33
+
34
+ Args:
35
+ vocab_file (`str`):
36
+ The path to the Tiktoken model file.
37
+ bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
38
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
39
+ eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
40
+ The end of sequence token.
41
+ unk_token (`str` or `tokenizers.AddedToken`, *optional*):
42
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
43
+ token instead. The second to last item in special_tokens.
44
+ pad_token (`str` or `tokenizers.AddedToken`, *optional*):
45
+ The token used for padding, for example when batching sequences of different lengths.
46
+ additional_special_tokens (list of `str`, *optional*):
47
+ A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
48
+ skipped when decoding if `skip_special_tokens` is set to `True`.
49
+ """
50
+
51
+ vocab_files_names = VOCAB_FILES_NAMES
52
+
53
+ model_input_names = ["input_ids", "attention_mask"]
54
+
55
+ special_tokens: Dict[str, int]
56
+
57
+ num_reserved_special_tokens = 256
58
+
59
+ pat_str = "|".join(
60
+ [
61
+ r"""[\p{Han}]+""",
62
+ r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
63
+ r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
64
+ r"""\p{N}{1,3}""",
65
+ r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
66
+ r"""\s*[\r\n]+""",
67
+ r"""\s+(?!\S)""",
68
+ r"""\s+""",
69
+ ]
70
+ )
71
+
72
+ def __init__(
73
+ self,
74
+ vocab_file,
75
+ bos_token: Union[str, AddedToken]="[BOS]",
76
+ eos_token: Union[str, AddedToken]="[EOS]",
77
+ unk_token: Union[str, AddedToken, None]=None,
78
+ pad_token: Union[str, AddedToken, None]=None,
79
+ additional_special_tokens: List[str]=None,
80
+ added_tokens_decoder: Optional[dict] = None,
81
+ **kwargs,
82
+ ):
83
+ assert os.path.isfile(vocab_file), vocab_file
84
+
85
+ if additional_special_tokens is None:
86
+ additional_special_tokens = [
87
+ "<|im_end|>",
88
+ "<|im_user|>",
89
+ "<|im_assistant|>",
90
+ "<|start_header_id|>",
91
+ "<|end_header_id|>",
92
+ "[EOT]",
93
+ "<|im_system|>",
94
+ "<|im_middle|>",
95
+ ]
96
+
97
+ special_tokens_mapping = {
98
+ i: added_tokens_decoder[i].content for i in (added_tokens_decoder or {})
99
+ }
100
+
101
+ self.vocab_file = vocab_file
102
+ mergeable_ranks = load_tiktoken_bpe(vocab_file)
103
+ num_base_tokens = len(mergeable_ranks)
104
+ self.special_tokens = {
105
+ special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
106
+ for i in range(
107
+ num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
108
+ )
109
+ }
110
+
111
+
112
+
113
+ self.model = tiktoken.Encoding(
114
+ name=Path(vocab_file).name,
115
+ pat_str=self.pat_str,
116
+ mergeable_ranks=mergeable_ranks,
117
+ special_tokens=self.special_tokens,
118
+ )
119
+ logger.info(f"Reloaded tiktoken model from {vocab_file}")
120
+
121
+ self.n_words: int = self.model.n_vocab
122
+ # BOS / EOS token IDs
123
+ self.bos_id: int = self.special_tokens[str(bos_token)]
124
+ self.eos_id: int = self.special_tokens[str(eos_token)]
125
+ logger.info(
126
+ f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
127
+ )
128
+
129
+ self.pad_id: int = self.special_tokens[str(pad_token)]
130
+ self.unk_id: int = self.special_tokens[str(unk_token)]
131
+
132
+ self.byte_encoder = bytes_to_unicode()
133
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
134
+
135
+ self.decoder = {}
136
+ for i in range(self.n_words):
137
+ # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
138
+ decoding = ''.join([
139
+ self.byte_encoder[ord(char)] for char in
140
+ self.model.decode_single_token_bytes(i).decode('latin-1')
141
+ ])
142
+ self.decoder[i] = decoding
143
+
144
+ self.encoder = {}
145
+ for i in range(self.n_words):
146
+ if i in self.decoder:
147
+ self.encoder[self.decoder[i]] = i
148
+
149
+ super().__init__(
150
+ bos_token=bos_token,
151
+ eos_token=eos_token,
152
+ unk_token=unk_token,
153
+ pad_token=pad_token,
154
+ additional_special_tokens=additional_special_tokens,
155
+ **kwargs,
156
+ )
157
+ self.all_special_ids_set = set(self.all_special_ids)
158
+
159
+ def encode(
160
+ self,
161
+ text: str,
162
+ allow_special_tokens: bool = True,
163
+ **kwargs
164
+ ) -> List[int]:
165
+ """
166
+ Encodes a string into a list of token IDs.
167
+
168
+ Args:
169
+ text (str): The input string to be encoded.
170
+
171
+ Returns:
172
+ list[int]: A list of token IDs.
173
+ """
174
+ # If there are other args, we should call super().encode because there is a lot of code
175
+ # to handle those args. super().encode will eventually call _tokenize and _convert_token_to_id.
176
+ # NOTE: our encode method is not compatible with the super().encode method,
177
+ # e.g. split_special_tokens' default is True in our encode method.
178
+ if len(kwargs) > 0:
179
+ logger.warning( f"Calling super().encode with {kwargs}" )
180
+ return super().encode(text, **kwargs)
181
+
182
+ assert type(text) is str
183
+
184
+ # The tiktoken tokenizer can handle <=400k chars without
185
+ # pyo3_runtime.PanicException.
186
+ TIKTOKEN_MAX_ENCODE_CHARS = 400_000
187
+
188
+ # https://github.com/openai/tiktoken/issues/195
189
+ # Here we iterate over subsequences and split if we exceed the limit
190
+ # of max consecutive non-whitespace or whitespace characters.
191
+ MAX_NO_WHITESPACES_CHARS = 25_000
192
+
193
+ texts = self.pre_tokenizer_process(text)
194
+
195
+ all_substrs = []
196
+ for text in texts:
197
+ substrs = (
198
+ substr
199
+ for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
200
+ for substr in self._split_whitespaces_or_nonwhitespaces(
201
+ text[i: i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
202
+ )
203
+ )
204
+ all_substrs.extend(substrs)
205
+
206
+ t: List[int] = []
207
+ for substr in all_substrs:
208
+ if allow_special_tokens:
209
+ t.extend(
210
+ # we should consider special token as a common token
211
+ self.model.encode(
212
+ substr,
213
+ allowed_special="all",
214
+ )
215
+ )
216
+ else:
217
+ t.extend(
218
+ # we should consider special token as a common token
219
+ self.model.encode(
220
+ substr,
221
+ disallowed_special=(),
222
+ )
223
+ )
224
+
225
+ return t
226
+
227
+ def decode(
228
+ self,
229
+ token_ids: Union[int, List[int]],
230
+ **kwargs
231
+ ) -> str:
232
+ """
233
+ Decodes a list of token IDs into a string.
234
+
235
+ Args:
236
+ token_ids (List[int]): The list of token IDs to be decoded.
237
+
238
+ Returns:
239
+ str: The decoded string.
240
+ """
241
+ # If there are other args, we should call super().decode because there is a lot of code
242
+ # to handle those args. super().decode will eventually call convert_tokens_to_string and _convert_id_to_token.
243
+ if len(kwargs) > 0:
244
+ return super().decode(token_ids, **kwargs)
245
+
246
+ if type(token_ids) is int:
247
+ token_ids = [token_ids]
248
+
249
+ return self.model.decode(cast(List[int], token_ids))
250
+
251
+ @staticmethod
252
+ def _split_whitespaces_or_nonwhitespaces(
253
+ s: str, max_consecutive_slice_len: int
254
+ ) -> Iterator[str]:
255
+ """
256
+ Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
257
+ consecutive whitespaces or consecutive non-whitespaces.
258
+ """
259
+ current_slice_len = 0
260
+ current_slice_is_space = s[0].isspace() if len(s) > 0 else False
261
+ slice_start = 0
262
+
263
+ for i in range(len(s)):
264
+ is_now_space = s[i].isspace()
265
+
266
+ if current_slice_is_space ^ is_now_space:
267
+ current_slice_len = 1
268
+ current_slice_is_space = is_now_space
269
+ else:
270
+ current_slice_len += 1
271
+ if current_slice_len > max_consecutive_slice_len:
272
+ yield s[slice_start:i]
273
+ slice_start = i
274
+ current_slice_len = 1
275
+ yield s[slice_start:]
276
+
277
+ def pre_tokenizer_process(self, text: str) -> List[str]:
278
+ """
279
+ Pre-tokenizes the input text into a list of text chunks.
280
+ This method is used to split the input text into smaller chunks for internal processing.
281
+ """
282
+ return [text]
283
+
284
+
285
+ """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
286
+ @property
287
+ def vocab_size(self) -> int:
288
+ return self.n_words
289
+
290
+ def get_vocab(self) -> Dict[str, int]:
291
+ return self.encoder
292
+
293
+ def _tokenize(self, text: str, **kwargs) -> List[str]:
294
+ return [
295
+ self.decoder[t]
296
+ for t in self.encode(text)
297
+ ]
298
+
299
+ def _convert_token_to_id(self, token: str) -> int:
300
+ return self.encoder.get(token, self.unk_id)
301
+
302
+ def _convert_id_to_token(self, index: int) -> str:
303
+ return self.decoder.get(index)
304
+
305
+ @staticmethod
306
+ def clean_up_tokenization(out_string: str) -> str:
307
+ return out_string
308
+
309
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
310
+ text = ''.join(tokens)
311
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', 'replace')
312
+ return text
313
+
314
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
315
+ if not os.path.isdir(save_directory):
316
+ raise ValueError(f"vocabulary path ({save_directory}) should be a directory")
317
+ out_vocab_file = os.path.join(
318
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
319
+ )
320
+
321
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
322
+ copyfile(self.vocab_file, out_vocab_file)
323
+
324
+ return (out_vocab_file,)
325
+
326
+
327
+
328
+ def apply_chat_template(
329
+ self, conversation, tools: Optional[list[dict]] = None,
330
+ tokenize: bool = False,
331
+ add_generation_prompt: bool = True,
332
+ **kwargs
333
+ ):
334
+ tools = deep_sort_dict(tools)
335
+ return super().apply_chat_template(conversation,
336
+ tools=tools,
337
+ tokenize=tokenize,
338
+ add_generation_prompt=add_generation_prompt,
339
+ **kwargs)
340
+
341
+
342
+ def deep_sort_dict(obj: Any) -> Any:
343
+ if isinstance(obj, dict):
344
+ return {k: deep_sort_dict(v) for k, v in sorted(obj.items())}
345
+ if isinstance(obj, list):
346
+ return [deep_sort_dict(item) for item in obj]
347
+ return obj
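Below is a minimal, hedged sketch of exercising `TikTokenTokenizer` directly. The local path is an assumption; loading goes through `AutoTokenizer` because `tokenizer_config.json` (next file) maps `AutoTokenizer` to `tokenization_kimi.TikTokenTokenizer`.

```python
# Sketch only (assumption: the repo files, including tiktoken.model, are in ./kimi-linear).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./kimi-linear", trust_remote_code=True)

ids = tok.encode("Hello, Kimi Linear!")   # tiktoken BPE over the pat_str defined above
print(ids)
print(tok.decode(ids))                    # decodes straight through tiktoken

# By default encode() maps special-token text to single special IDs (allow_special_tokens=True);
# with allow_special_tokens=False the same text is tokenized as ordinary byte-level pieces.
print(tok.encode("<|im_end|>"))
print(tok.encode("<|im_end|>", allow_special_tokens=False))
```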
tokenizer_config.json ADDED
@@ -0,0 +1,164 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "163584": {
4
+ "content": "[BOS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "163585": {
12
+ "content": "[EOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "163586": {
20
+ "content": "<|im_end|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "163587": {
28
+ "content": "<|im_user|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "163588": {
36
+ "content": "<|im_assistant|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "163590": {
44
+ "content": "<|start_header_id|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "163591": {
52
+ "content": "<|end_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "163593": {
60
+ "content": "[EOT]",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "163594": {
68
+ "content": "<|im_system|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "163595": {
76
+ "content": "<|tool_calls_section_begin|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "163596": {
84
+ "content": "<|tool_calls_section_end|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "163597": {
92
+ "content": "<|tool_call_begin|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "163598": {
100
+ "content": "<|tool_call_argument_begin|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "163599": {
108
+ "content": "<|tool_call_end|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "163601": {
116
+ "content": "<|im_middle|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "163838": {
124
+ "content": "[UNK]",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "163839": {
132
+ "content": "[PAD]",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ }
139
+ },
140
+ "additional_special_tokens": [
141
+ "<|im_end|>",
142
+ "<|im_user|>",
143
+ "<|im_assistant|>",
144
+ "<|start_header_id|>",
145
+ "<|end_header_id|>",
146
+ "[EOT]",
147
+ "<|im_system|>",
148
+ "<|im_middle|>"
149
+ ],
150
+ "bos_token": "[BOS]",
151
+ "clean_up_tokenization_spaces": false,
152
+ "eos_token": "[EOS]",
153
+ "extra_special_tokens": {},
154
+ "model_max_length": 1000000000000000019884624838656,
155
+ "pad_token": "[PAD]",
156
+ "tokenizer_class": "TikTokenTokenizer",
157
+ "unk_token": "[UNK]",
158
+ "auto_map": {
159
+ "AutoTokenizer": [
160
+ "tokenization_kimi.TikTokenTokenizer",
161
+ null
162
+ ]
163
+ }
164
+ }
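Finally, a hedged sketch of chat formatting, continuing from `tok` above: the special tokens registered in this config (e.g. `<|im_user|>`, `<|im_middle|>`, `<|im_end|>`) are consumed by `apply_chat_template`, assuming the repository also ships the chat template that method renders.

```python
# Assumption: a chat template is available to apply_chat_template (shipped elsewhere in the repo).
messages = [
    {"role": "system", "content": "You are Kimi, a helpful assistant."},
    {"role": "user", "content": "Hello! What can you do?"},
]
prompt_ids = tok.apply_chat_template(
    messages,
    add_generation_prompt=True,  # default in the TikTokenTokenizer override
    tokenize=True,
    return_tensors="pt",
)
print(prompt_ids.shape)          # (1, prompt_length)
```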