HyperAccel committed on
Commit
530ff1a
·
verified ·
1 Parent(s): aea997a

Upload tiny-random deepseek_v32 model

Browse files
configuration_deepseek_v32.py CHANGED
@@ -98,6 +98,12 @@ class DeepseekV32Config(PretrainedConfig):
98
  Whether to use a bias in the query, key, value and output projection layers during self-attention.
99
  attention_dropout (`float`, *optional*, defaults to 0.0):
100
  The dropout ratio for the attention probabilities.
 
 
 
 
 
 
101
 
102
  ```python
103
  >>> from transformers import DeepseekV32Model, DeepseekV32Config
@@ -152,6 +158,9 @@ class DeepseekV32Config(PretrainedConfig):
152
  rope_scaling=None,
153
  attention_bias=False,
154
  attention_dropout=0.0,
 
 
 
155
  **kwargs,
156
  ):
157
  self.vocab_size = vocab_size
@@ -192,6 +201,9 @@ class DeepseekV32Config(PretrainedConfig):
192
  self.rope_scaling = rope_scaling
193
  self.attention_bias = attention_bias
194
  self.attention_dropout = attention_dropout
 
 
 
195
 
196
  super().__init__(
197
  pad_token_id=pad_token_id,
 
98
  Whether to use a bias in the query, key, value and output projection layers during self-attention.
99
  attention_dropout (`float`, *optional*, defaults to 0.0):
100
  The dropout ratio for the attention probabilities.
101
+ index_n_heads (`int`, *optional*, defaults to 64):
102
+ Number of attention heads used in the sparse attention indexer.
103
+ index_head_dim (`int`, *optional*, defaults to 128):
104
+ Dimension of each head in the sparse attention indexer.
105
+ index_topk (`int`, *optional*, defaults to 2048):
106
+ Number of top-k key-value positions selected by the sparse attention indexer.
107
 
108
  ```python
109
  >>> from transformers import DeepseekV32Model, DeepseekV32Config
 
158
  rope_scaling=None,
159
  attention_bias=False,
160
  attention_dropout=0.0,
161
+ index_n_heads=64,
162
+ index_head_dim=128,
163
+ index_topk=2048,
164
  **kwargs,
165
  ):
166
  self.vocab_size = vocab_size
 
201
  self.rope_scaling = rope_scaling
202
  self.attention_bias = attention_bias
203
  self.attention_dropout = attention_dropout
204
+ self.index_n_heads = index_n_heads
205
+ self.index_head_dim = index_head_dim
206
+ self.index_topk = index_topk
207
 
208
  super().__init__(
209
  pad_token_id=pad_token_id,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7630389ac118c34d12846521ff5102b4ba0b97fa733ad63eb780c38aed731f0
3
- size 545819392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa9a2cfe783c2448f7d7e3d0d2149fe388261955b0c412f1f343a15ba17369e2
3
+ size 546248736
modeling_deepseek_v32.py CHANGED
@@ -336,39 +336,39 @@ def rotate_half(x):
336
  return torch.cat((-x2, x1), dim=-1)
337
 
338
 
339
- # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
340
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
341
- """Applies Rotary Position Embedding to the query and key tensors.
 
 
342
 
343
  Args:
344
- q (`torch.Tensor`): The query tensor.
345
- k (`torch.Tensor`): The key tensor.
346
- cos (`torch.Tensor`): The cosine part of the rotary embedding.
347
- sin (`torch.Tensor`): The sine part of the rotary embedding.
348
- position_ids (`torch.Tensor`):
349
- The position indices of the tokens corresponding to the query and key tensors. For example, this can be
350
- used to pass offsetted position ids when working with a KV-cache.
351
- unsqueeze_dim (`int`, *optional*, defaults to 1):
352
- The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
353
- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
354
- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
355
- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
356
- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
357
- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
358
- Returns:
359
- `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
360
  """
361
- cos = cos[position_ids].unsqueeze(unsqueeze_dim)
362
- sin = sin[position_ids].unsqueeze(unsqueeze_dim)
363
-
364
- b, h, s, d = q.shape
365
- q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
 
 
 
 
 
 
 
 
 
366
 
367
- b, h, s, d = k.shape
368
- k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
369
 
370
- q_embed = (q * cos) + (rotate_half(q) * sin)
371
- k_embed = (k * cos) + (rotate_half(k) * sin)
 
 
372
  return q_embed, k_embed
373
 
374
 
@@ -610,6 +610,128 @@ class DeepseekV32MoE(nn.Module):
610
  return final_out
611
 
612
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
613
  # Copied from transformers.models.llama.modeling_llama.repeat_kv
614
  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
615
  """
@@ -696,6 +818,9 @@ class DeepseekV32Attention(nn.Module):
696
  mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
697
  self.softmax_scale = self.softmax_scale * mscale * mscale
698
 
 
 
 
699
  def _init_rope(self):
700
  if self.config.rope_scaling is None:
701
  self.rotary_emb = DeepseekV32RotaryEmbedding(
@@ -767,8 +892,10 @@ class DeepseekV32Attention(nn.Module):
767
 
768
  if self.q_lora_rank is None:
769
  q = self.q_proj(hidden_states)
 
770
  else:
771
- q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
 
772
  q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
773
  q_nope, q_pe = torch.split(
774
  q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
@@ -823,12 +950,27 @@ class DeepseekV32Attention(nn.Module):
823
  f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
824
  f" {attn_weights.size()}"
825
  )
826
- assert attention_mask is not None
827
  if attention_mask is not None:
828
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
829
  raise ValueError(
830
  f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
831
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
832
  attn_weights = attn_weights + attention_mask
833
 
834
  # upcast attention to fp32
@@ -903,7 +1045,8 @@ class DeepseekV32FlashAttention2(DeepseekV32Attention):
903
  if self.q_lora_rank is None:
904
  q = self.q_proj(hidden_states)
905
  else:
906
- q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
 
907
  q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
908
  q_nope, q_pe = torch.split(
909
  q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
 
336
  return torch.cat((-x2, x1), dim=-1)
337
 
338
 
339
def apply_rotary_emb(x, cos, sin, position_ids, unsqueeze_dim=1, interleaved=True):
    """Apply rotary positional embeddings via complex multiplication.

    Mirrors DeepSeek-V3.2-Exp/inference/model.py ``apply_rotary_emb``:
    uses ``torch.view_as_complex`` / ``torch.view_as_real`` for the rotation.

    Args:
        x: Input tensor of shape [batch, heads, seq_len, rope_dim].
        cos, sin: Cached cos/sin tables of shape [seq_len, rope_dim]; only the
            first rope_dim // 2 entries are used (the tables store each value
            duplicated across both halves).
        position_ids: Position indices of shape [batch, seq_len].
        unsqueeze_dim: Dimension to unsqueeze so cos/sin broadcast against x
            (default 1, the heads dimension).
        interleaved: If True, consecutive pairs of the last dim are
            (real, imag). If False, the first half is real and the second
            half imaginary.

    Returns:
        The rotated tensor, cast back to x's original dtype.
    """
    dtype = x.dtype
    shape = x.shape
    half = cos.shape[-1] // 2
    # torch.complex only accepts float32/float64 operands, so upcast the
    # (possibly bf16/fp16) cos/sin tables before building the rotation factors.
    cos_pos = cos[position_ids][..., :half].unsqueeze(unsqueeze_dim).float()
    sin_pos = sin[position_ids][..., :half].unsqueeze(unsqueeze_dim).float()
    freqs_cis = torch.complex(cos_pos, sin_pos)

    if not interleaved:
        # Re-pack [real_half | imag_half] into interleaved (real, imag) pairs.
        x = x.view(*shape[:-1], 2, -1).transpose(-1, -2).contiguous()
    x_complex = torch.view_as_complex(x.float().view(*shape[:-1], -1, 2))
    y = torch.view_as_real(x_complex * freqs_cis).flatten(-2)
    if not interleaved:
        # Undo the packing: even lanes back to the first half, odd to the second.
        y = torch.cat([y[..., 0::2], y[..., 1::2]], dim=-1)
    return y.to(dtype)
366
 
 
 
367
 
368
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Apply Rotary Position Embedding to the query and key tensors.

    Both tensors use the interleaved (real, imag) pair layout; the actual
    rotation is delegated to ``apply_rotary_emb``.
    """
    q_embed, k_embed = (
        apply_rotary_emb(t, cos, sin, position_ids, unsqueeze_dim, interleaved=True)
        for t in (q, k)
    )
    return q_embed, k_embed
373
 
374
 
 
610
  return final_out
611
 
612
 
613
def hadamard_transform(x: torch.Tensor, scale: float) -> torch.Tensor:
    """Fast Walsh-Hadamard transform along the last dimension.

    Pure-PyTorch butterfly decomposition: O(n log n) work instead of a dense
    n x n Hadamard matrix multiply.

    Args:
        x: Input tensor whose last dimension must be a power of two.
        scale: Multiplier applied to the transformed output (use ``n ** -0.5``
            for an orthonormal transform).

    Returns:
        The transformed tensor, same shape as ``x``.

    Raises:
        ValueError: If the last dimension is not a positive power of two
            (previously this surfaced as an obscure ``unflatten`` shape
            error part-way through the loop).
    """
    n = x.size(-1)
    if n <= 0 or n & (n - 1):
        raise ValueError(f"hadamard_transform requires a power-of-two last dimension, got {n}")
    h = 1
    while h < n:
        # One butterfly stage: merge adjacent blocks of size h into sums and
        # differences, producing blocks of size 2h.
        x = x.unflatten(-1, (-1, h * 2))
        a = x[..., :h]
        b = x[..., h:]
        x = torch.cat([a + b, a - b], dim=-1).flatten(-2)
        h *= 2
    return x * scale
624
+
625
+
626
def rotate_activation(x: torch.Tensor) -> torch.Tensor:
    """Spread activation magnitude evenly across the last dimension.

    Applies an orthonormal Hadamard transform (scale ``d ** -0.5`` where
    ``d`` is the size of the last dimension).
    """
    dim = x.size(-1)
    return hadamard_transform(x, scale=dim ** -0.5)
630
+
631
+
632
class DeepseekV32Indexer(nn.Module):
    """Sparse attention indexer for DeepSeek V3.2.

    Scores every key position against every query position and returns the
    indices of the top-k highest-scoring key positions, which the attention
    layer then restricts itself to (enabling sparse attention).
    """

    def __init__(self, config: DeepseekV32Config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.n_heads = config.index_n_heads
        self.head_dim = config.index_head_dim
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.index_topk = config.index_topk
        self.q_lora_rank = config.q_lora_rank

        # Query projection: compressed query (q_lora_rank) -> all indexer heads.
        self.wq_b = nn.Linear(self.q_lora_rank, self.n_heads * self.head_dim, bias=False)
        # Single shared key projection from the hidden states.
        self.wk = nn.Linear(self.hidden_size, self.head_dim, bias=False)
        self.k_norm = nn.LayerNorm(self.head_dim)
        # Per-head importance weights derived from the hidden states.
        self.weights_proj = nn.Linear(self.hidden_size, self.n_heads, bias=False)

        self.softmax_scale = self.head_dim ** -0.5

    def forward(
        self,
        hidden_states: torch.Tensor,
        compressed_q: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        position_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Select the top-k key positions for each query position.

        Args:
            hidden_states: [batch, seq_len, hidden_size] input activations.
            compressed_q: [batch, seq_len, q_lora_rank] output of
                q_a_layernorm(q_a_proj(hidden_states)).
            cos, sin: Rotary embedding cos/sin tables.
            position_ids: [batch, seq_len] position indices.
            attention_mask: Optional additive mask [batch, 1, seq_len, seq_len].

        Returns:
            topk_indices: [batch, seq_len, min(index_topk, seq_len)] indices
            of the key positions each query should attend to.
        """
        bsz, q_len, _ = hidden_states.size()

        # --- Indexer queries -------------------------------------------------
        q = self.wq_b(compressed_q).view(bsz, q_len, self.n_heads, self.head_dim)
        # The rope slice comes first; the remainder is position-free.
        q_pe = q[..., :self.qk_rope_head_dim]
        q_nope = q[..., self.qk_rope_head_dim:]

        # The indexer uses the non-interleaved (half-real / half-imag) RoPE
        # layout, matching the reference implementation.
        q_pe = q_pe.transpose(1, 2)  # [bsz, n_heads, q_len, rope_dim]
        q_pe = apply_rotary_emb(q_pe, cos, sin, position_ids, unsqueeze_dim=1, interleaved=False)
        q_pe = q_pe.transpose(1, 2)  # back to [bsz, q_len, n_heads, rope_dim]

        q = torch.cat([q_pe, q_nope], dim=-1)  # [bsz, q_len, n_heads, head_dim]

        # --- Indexer keys (one shared "head") --------------------------------
        k = self.k_norm(self.wk(hidden_states))  # [bsz, q_len, head_dim]
        k_pe = k[..., :self.qk_rope_head_dim]
        k_nope = k[..., self.qk_rope_head_dim:]

        k_pe = k_pe.unsqueeze(1)  # [bsz, 1, q_len, rope_dim]
        k_pe = apply_rotary_emb(k_pe, cos, sin, position_ids, unsqueeze_dim=1, interleaved=False)
        k_pe = k_pe.squeeze(1)  # [bsz, q_len, rope_dim]

        k = torch.cat([k_pe, k_nope], dim=-1)  # [bsz, q_len, head_dim]

        # Hadamard rotation spreads magnitude evenly across dimensions before
        # the dot products (from DeepSeek-V3.2-Exp/inference/model.py).
        q = rotate_activation(q)
        k = rotate_activation(k)

        # Per-head importance weights, computed on an fp32 copy of the input.
        # NOTE(review): assumes weights_proj parameters are fp32 as well so the
        # Linear's dtypes match — confirm against the loading path.
        weights = self.weights_proj(hidden_states.float()) * (self.n_heads ** -0.5)

        # score[b, i, j] = sum_h weights[b, i, h] * (q[b, i, h] . k[b, j]) * scale
        q = q.transpose(1, 2)  # [bsz, n_heads, q_len, head_dim]
        k_expanded = k.unsqueeze(1)  # [bsz, 1, q_len, head_dim]
        index_score = torch.matmul(q, k_expanded.transpose(-1, -2))  # [bsz, n_heads, q_len, q_len]
        index_score = index_score * self.softmax_scale
        # weights: [bsz, q_len, n_heads] -> [bsz, n_heads, q_len, 1] for broadcast.
        weights = weights.permute(0, 2, 1).unsqueeze(-1)
        index_score = (index_score * weights).sum(dim=1)  # [bsz, q_len, q_len]

        if attention_mask is not None:
            # Fold the additive (causal) mask in so masked positions can never
            # win the top-k selection; attention_mask is [bsz, 1, q_len, kv_len].
            index_score = index_score + attention_mask.squeeze(1)

        # Keep at most index_topk key positions per query.
        topk = min(self.index_topk, q_len)
        topk_indices = index_score.topk(topk, dim=-1)[1]  # [bsz, q_len, topk]

        return topk_indices
733
+
734
+
735
  # Copied from transformers.models.llama.modeling_llama.repeat_kv
736
  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
737
  """
 
818
  mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
819
  self.softmax_scale = self.softmax_scale * mscale * mscale
820
 
821
+ # DeepSeek V3.2 Sparse Attention Indexer
822
+ self.indexer = DeepseekV32Indexer(config)
823
+
824
  def _init_rope(self):
825
  if self.config.rope_scaling is None:
826
  self.rotary_emb = DeepseekV32RotaryEmbedding(
 
892
 
893
  if self.q_lora_rank is None:
894
  q = self.q_proj(hidden_states)
895
+ compressed_q = None
896
  else:
897
+ compressed_q = self.q_a_layernorm(self.q_a_proj(hidden_states))
898
+ q = self.q_b_proj(compressed_q)
899
  q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
900
  q_nope, q_pe = torch.split(
901
  q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
 
950
  f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
951
  f" {attn_weights.size()}"
952
  )
 
953
  if attention_mask is not None:
954
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
955
  raise ValueError(
956
  f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
957
  )
958
+
959
+ # DeepSeek V3.2: Apply sparse attention indexer mask (includes causal mask)
960
+ # Matching reference: causal mask is applied only once, via index_mask
961
+ if compressed_q is not None:
962
+ topk_indices = self.indexer(
963
+ hidden_states, compressed_q, cos, sin, position_ids, attention_mask
964
+ )
965
+ # Create sparse index mask: only attend to top-k positions
966
+ index_mask = torch.full(
967
+ (bsz, q_len, kv_seq_len), float("-inf"), device=hidden_states.device
968
+ )
969
+ index_mask.scatter_(-1, topk_indices, 0.0)
970
+ if attention_mask is not None:
971
+ index_mask = index_mask + attention_mask.squeeze(1)
972
+ attn_weights = attn_weights + index_mask.unsqueeze(1)
973
+ elif attention_mask is not None:
974
  attn_weights = attn_weights + attention_mask
975
 
976
  # upcast attention to fp32
 
1045
  if self.q_lora_rank is None:
1046
  q = self.q_proj(hidden_states)
1047
  else:
1048
+ compressed_q = self.q_a_layernorm(self.q_a_proj(hidden_states))
1049
+ q = self.q_b_proj(compressed_q)
1050
  q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
1051
  q_nope, q_pe = torch.split(
1052
  q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1