ByteDance
/

Ouro-2.6B-Thinking

@@ -1,13 +1,17 @@
-from typing import Callable, Optional, Union
 import torch
 from torch import nn
 from transformers.activations import ACT2FN
-from transformers.cache_utils import Cache, DynamicCache
 from transformers.generation import GenerationMixin
 from transformers.integrations import use_kernel_forward_from_hub
-from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_layers import (
     GenericForQuestionAnswering,
@@ -15,7 +19,10 @@ from transformers.modeling_layers import (
     GenericForTokenClassification,
     GradientCheckpointingLayer,
 )
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
@@ -24,6 +31,37 @@ from transformers.utils.generic import check_model_inputs
 from .configuration_ouro import OuroConfig
 class OuroMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -82,10 +120,111 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
     if n_rep == 1:
         return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
@@ -104,8 +243,12 @@ def eager_attention_forward(
         causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
         attn_weights = attn_weights + causal_mask
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
     attn_output = torch.matmul(attn_weights, value_states)
     attn_output = attn_output.transpose(1, 2).contiguous()
@@ -119,16 +262,32 @@ class OuroAttention(nn.Module):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
-        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.scaling = self.head_dim**-0.5
         self.attention_dropout = config.attention_dropout
         self.is_causal = True
-        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
-        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
-        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
-        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
     def forward(
         self,
@@ -148,16 +307,25 @@ class OuroAttention(nn.Module):
         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
         cos, sin = position_embeddings
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
         if past_key_value is not None:
             # sin and cos are specific to RoPE models; cache_position needed for the static cache
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
-            key_states, value_states = past_key_value.update(key_states, value_states, current_ut * self.config.num_hidden_layers + self.layer_idx, cache_kwargs)
         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
         attn_output, attn_weights = attention_interface(
             self,
@@ -206,9 +374,15 @@ class OuroDecoderLayer(GradientCheckpointingLayer):
         self.mlp = OuroMLP(config)
         self.input_layernorm = OuroRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.input_layernorm_2 = OuroRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = OuroRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm_2 = OuroRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.attention_type = config.layer_types[layer_idx]
     def forward(
@@ -219,7 +393,9 @@ class OuroDecoderLayer(GradientCheckpointingLayer):
         past_key_value: Optional[Cache] = None,
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor]:
         residual = hidden_states
@@ -271,7 +447,9 @@ class OuroRotaryEmbedding(nn.Module):
         super().__init__()
         # BC: "rope_type" was originally "type"
         if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
-            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
         else:
             self.rope_type = "default"
         self.max_seq_len_cached = config.max_position_embeddings
@@ -287,12 +465,23 @@ class OuroRotaryEmbedding(nn.Module):
     @torch.no_grad()
     @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
     def forward(self, x, position_ids):
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
         position_ids_expanded = position_ids[:, None, :].float()
-        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
         with torch.autocast(device_type=device_type, enabled=False):  # Force float32
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
             sin = emb.sin() * self.attention_scaling
@@ -307,9 +496,14 @@ class OuroModel(OuroPreTrainedModel):
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
-            [OuroDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self.norm = OuroRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.rotary_emb = OuroRotaryEmbedding(config=config)
@@ -334,18 +528,34 @@ class OuroModel(OuroPreTrainedModel):
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
-        if use_cache and past_key_values is None:
-            past_key_values = DynamicCache()
         if cache_position is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
             cache_position = torch.arange(
-                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
             )
         if position_ids is None:
@@ -368,7 +578,9 @@ class OuroModel(OuroPreTrainedModel):
             }
             # The sliding window alternating layers are not always activated depending on the config
             if self.has_sliding_layers:
-                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
         hidden_states = inputs_embeds
@@ -395,10 +607,14 @@ class OuroModel(OuroPreTrainedModel):
             hidden_states_list.append(hidden_states)
             gate_list.append(self.early_exit_gate(hidden_states))
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=past_key_values if use_cache else None,
-        ), hidden_states_list, gate_list
 @auto_docstring
@@ -412,12 +628,11 @@ class OuroForCausalLM(OuroPreTrainedModel, GenerationMixin):
         self.model = OuroModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         # 分块大小配置
-        self.chunk_size = getattr(config, 'chunk_size', 2)  # 默认分块大小为2
         self.early_exit_step = getattr(config, "early_exit_step", None)
         self.early_exit_threshold = getattr(config, "early_exit_threshold", None)
         # Initialize weights and apply final processing
         self.post_init()
@@ -449,13 +664,13 @@ class OuroForCausalLM(OuroPreTrainedModel, GenerationMixin):
         r"""
         Args:
             use_weighted_exit (`bool`, *optional*, defaults to `False`):
-                Whether to use weighted early exit. If `True`, the logits from all UT steps will be
                 averaged according to the exit probability distribution.
             exit_at_step (`int`, *optional*):
-                Specifies which UT step to exit at. If set, the model will directly use the hidden states
                 from this step to generate logits, ignoring other exit strategies.
             exit_threshold (`float`, *optional*):
-                The cumulative probability threshold for early exit. When the cumulative exit probability
                 reaches this threshold, the model will exit at that step.
         Example:
@@ -471,8 +686,12 @@ class OuroForCausalLM(OuroPreTrainedModel, GenerationMixin):
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
-        exit_at_step = exit_at_step if exit_at_step is not None else self.early_exit_step
-        exit_threshold = exit_threshold if exit_threshold is not None else self.early_exit_threshold
         outputs, hidden_states_list, gate_list = self.model(
             input_ids=input_ids,
@@ -484,14 +703,20 @@ class OuroForCausalLM(OuroPreTrainedModel, GenerationMixin):
             cache_position=cache_position,
             **kwargs,
         )
-        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
         def _select_token_positions(tensor: torch.Tensor) -> torch.Tensor:
             if isinstance(slice_indices, slice):
                 return tensor[:, slice_indices, ...]
             if isinstance(slice_indices, torch.Tensor):
                 return tensor.index_select(1, slice_indices.to(tensor.device))
-            raise TypeError(f"Unsupported index type for logits_to_keep: {type(slice_indices)}")
         stacked_exit_pdf = None
         if gate_list:
@@ -520,8 +745,14 @@ class OuroForCausalLM(OuroPreTrainedModel, GenerationMixin):
             for step_idx, hidden in enumerate(hidden_states_list):
                 step_hidden = _select_token_positions(hidden)
                 step_logits = self.lm_head(step_hidden)
-                weight = token_exit_pdf[..., step_idx].unsqueeze(-1).to(step_logits.dtype)
-                expected_logits = step_logits * weight if expected_logits is None else expected_logits + step_logits * weight
             expected_logits_cache = expected_logits
             return expected_logits_cache
@@ -533,10 +764,17 @@ class OuroForCausalLM(OuroPreTrainedModel, GenerationMixin):
             if logits is None:
                 hidden_states = outputs.last_hidden_state
                 logits = self.lm_head(_select_token_positions(hidden_states))
-            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
         else:
             if stacked_exit_pdf is not None and hidden_states_list:
-                if exit_at_step is not None and 0 <= exit_at_step < len(hidden_states_list):
                     selected_hidden = hidden_states_list[exit_at_step]
                     logits = self.lm_head(_select_token_positions(selected_hidden))
                 elif exit_threshold is not None:
@@ -551,8 +789,14 @@ class OuroForCausalLM(OuroPreTrainedModel, GenerationMixin):
                         never_exceeded = ~threshold_mask.any(dim=2)
                         exit_steps[never_exceeded] = last_step_idx
                     stacked_hidden = torch.stack(hidden_states_list, dim=2)
-                    gather_index = exit_steps.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, 1, stacked_hidden.size(-1))
-                    final_hidden_states = torch.gather(stacked_hidden, 2, gather_index).squeeze(2)
                     logits = self.lm_head(_select_token_positions(final_hidden_states))
                 elif use_weighted_exit:
                     logits = compute_expected_logits()
@@ -572,7 +816,9 @@ class OuroForCausalLM(OuroPreTrainedModel, GenerationMixin):
         return result
-class OuroForSequenceClassification(GenericForSequenceClassification, OuroPreTrainedModel):
     pass
@@ -581,7 +827,9 @@ class OuroForTokenClassification(GenericForTokenClassification, OuroPreTrainedMo
 class OuroForQuestionAnswering(GenericForQuestionAnswering, OuroPreTrainedModel):
-    base_model_prefix = "transformer"  # For BC, where `transformer` was used instead of `model`
 __all__ = [
@@ -591,4 +839,5 @@ __all__ = [
     "OuroForSequenceClassification",
     "OuroForTokenClassification",
     "OuroForQuestionAnswering",
-]

+import logging
+from typing import Any, Callable, Optional, Union
 import torch
 from torch import nn
 from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache
 from transformers.generation import GenerationMixin
 from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import (
+    create_causal_mask,
+    create_sliding_window_causal_mask,
+)
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_layers import (
     GenericForQuestionAnswering,
     GenericForTokenClassification,
     GradientCheckpointingLayer,
 )
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from .configuration_ouro import OuroConfig
+logger = logging.getLogger(__name__)
+def needs_universal_cache(
+    cache: Optional[Cache], max_cache_size: Optional[int]
+) -> bool:
+    """Decide whether ``cache`` must be replaced by a ``UniversalTransformerCache``.
+    Args:
+        cache: The ``past_key_values`` object supplied by the caller, if any.
+        max_cache_size: Number of cache slots the model will address, or ``None``
+            when that number is unknown.
+    Returns:
+        ``True`` when no cache was supplied, or when the supplied ``Cache`` exposes
+        fewer ``layers`` than ``max_cache_size`` and reports no cached tokens.
+        ``False`` in every other non-raising case.
+    Raises:
+        ValueError: If the supplied cache has too few ``layers`` but already
+            reports a positive sequence length, so it cannot be swapped out.
+    """
+    # Nothing supplied: a fresh cache has to be built.
+    if cache is None:
+        return True
+    # Already the right kind, or not a Cache object at all: leave it untouched.
+    if isinstance(cache, UniversalTransformerCache) or not isinstance(cache, Cache):
+        return False
+    # Dynamic caches can extend to any index, so let them be.
+    if getattr(cache, "layer_class_to_replicate", None) is not None:
+        return False
+    # Without a known slot count there is nothing to compare against.
+    if max_cache_size is None:
+        return False
+    if len(getattr(cache, "layers", [])) >= max_cache_size:
+        return False
+    # The cache is too small; replacing it is only possible while it is still empty.
+    # A failing length query is deliberately treated as "empty" (best effort).
+    try:
+        tokens_held = cache.get_seq_length()
+    except Exception:
+        tokens_held = 0
+    if tokens_held > 0:
+        raise ValueError(
+            "The provided cache cannot store all Universal Transformer iterations. Please "
+            "instantiate Ouro.modeling_ouro.UniversalTransformerCache and pass it as past_key_values."
+        )
+    return True
 class OuroMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
     if n_rep == 1:
         return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(
+        batch, num_key_value_heads, n_rep, slen, head_dim
+    )
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+class UniversalTransformerCache(Cache):
+    """Growable key/value cache for Ouro's multi-step Universal Transformer loops.
+    Each slot holds one key tensor and one value tensor. The attention module in
+    this file addresses slots as ``current_ut * num_hidden_layers + layer_idx``,
+    so every (UT step, decoder layer) pair gets its own entry. Slots are created
+    lazily and grown by concatenating new states along dim 2.
+    Args:
+        max_cache_size: Optional upper bound on the number of slots. ``update``
+            raises ``IndexError`` for indices at or above it; ``None`` disables
+            the bound.
+    """
+    def __init__(self, max_cache_size: Optional[int] = None):
+        # We intentionally don't call super().__init__ because the parent assumes static cache sizes.
+        self.key_cache: list[Optional[torch.Tensor]] = []
+        self.value_cache: list[Optional[torch.Tensor]] = []
+        self.layers: list[Any] = []  # attribute expected by HF Cache utilities
+        self._seen_tokens = 0
+        self.max_cache_size = max_cache_size
+    @staticmethod
+    def _check_compatible(incoming: torch.Tensor, stored: torch.Tensor) -> None:
+        """Raise ``ValueError`` unless dims 0, 1 and 3 of the two tensors agree."""
+        if (
+            incoming.shape[0] != stored.shape[0]
+            or incoming.shape[1] != stored.shape[1]
+            or incoming.shape[3] != stored.shape[3]
+        ):
+            raise ValueError(
+                "Cached and incoming key/value tensors must match on batch, head, and head_dim dimensions."
+            )
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[dict] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Append ``key_states``/``value_states`` to slot ``layer_idx``.
+        Args:
+            key_states: New keys; concatenated to the stored keys along dim 2.
+            value_states: New values; concatenated to the stored values along dim 2.
+            layer_idx: Slot index (non-negative, below ``max_cache_size`` if set).
+            cache_kwargs: Accepted for interface compatibility; not used here.
+        Returns:
+            The full key and value tensors now stored in the slot.
+        Raises:
+            ValueError: If ``layer_idx`` is negative, or if either incoming tensor
+                disagrees with its stored counterpart on dims 0, 1 or 3.
+            IndexError: If ``layer_idx`` is at or beyond ``max_cache_size``.
+        """
+        if layer_idx < 0:
+            raise ValueError(f"layer_idx must be non-negative, got {layer_idx}")
+        if self.max_cache_size is not None and layer_idx >= self.max_cache_size:
+            raise IndexError(
+                f"Cache index {layer_idx} exceeds configured max_cache_size={self.max_cache_size}. "
+                "Check total_ut_steps and num_hidden_layers."
+            )
+        # Expand cache storage so the requested index is available.
+        missing = layer_idx + 1 - len(self.key_cache)
+        if missing > 0:
+            self.key_cache.extend([None] * missing)
+            self.value_cache.extend([None] * missing)
+        cached_key = self.key_cache[layer_idx]
+        cached_value = self.value_cache[layer_idx]
+        if cached_key is None or cached_value is None:
+            # First write to this slot: store the incoming tensors as-is.
+            result_key, result_value = key_states, value_states
+        else:
+            # Validate keys AND values (the message names both) before concatenating,
+            # so a bad value tensor fails with the same clear error as a bad key.
+            self._check_compatible(key_states, cached_key)
+            self._check_compatible(value_states, cached_value)
+            result_key = torch.cat([cached_key, key_states], dim=2)
+            result_value = torch.cat([cached_value, value_states], dim=2)
+        self.key_cache[layer_idx] = result_key
+        self.value_cache[layer_idx] = result_value
+        # Mirrors the dim-2 length of the slot that was just written.
+        self._seen_tokens = result_key.shape[2]
+        return result_key, result_value
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Return the dim-2 length stored in slot ``layer_idx`` (0 if absent or empty)."""
+        if layer_idx is None:
+            layer_idx = 0
+        if layer_idx < 0 or len(self.key_cache) <= layer_idx:
+            return 0
+        cached = self.key_cache[layer_idx]
+        if cached is None:
+            return 0
+        return cached.shape[2]
+    def get_mask_sizes(
+        self, cache_position: torch.Tensor, layer_idx: int = 0
+    ) -> tuple[int, int]:
+        """Return ``(kv_length, kv_offset)`` for attention-mask construction.
+        ``kv_length`` is the number of tokens already stored in the slot plus the
+        number of query positions in ``cache_position``; the offset is always 0
+        because this cache never drops earlier tokens.
+        NOTE(review): this class keeps ``self.layers`` empty, and the stock
+        ``Cache.get_mask_sizes`` in recent transformers releases appears to derive
+        the sizes from ``self.layers``; confirm against the pinned transformers
+        version that this override is the intended hook.
+        """
+        query_length = cache_position.shape[0]
+        return self.get_seq_length(layer_idx) + query_length, 0
+    def get_max_length(self) -> Optional[int]:
+        """Return ``None``: the cache has no maximum sequence length."""
+        return None
+    def get_usable_length(
+        self, new_seq_length: int, layer_idx: Optional[int] = 0
+    ) -> int:
+        """Return the stored length of the slot; ``new_seq_length`` is ignored."""
+        return self.get_seq_length(layer_idx)
+    def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
+        """Re-index dim 0 of every populated slot with ``beam_idx``, in place."""
+        for idx, (key_entry, value_entry) in enumerate(
+            zip(self.key_cache, self.value_cache)
+        ):
+            if key_entry is None or value_entry is None:
+                continue
+            # Move the index to each tensor's own device so keys and values that
+            # live on different devices are both handled.
+            self.key_cache[idx] = key_entry.index_select(
+                0, beam_idx.to(key_entry.device)
+            )
+            self.value_cache[idx] = value_entry.index_select(
+                0, beam_idx.to(value_entry.device)
+            )
+    @property
+    def is_compileable(self) -> bool:
+        """Always ``False``."""
+        return False
+    def clear(self) -> None:
+        """Drop every stored tensor and reset the token counter."""
+        logger.debug("Clearing UniversalTransformerCache")
+        self.key_cache = []
+        self.value_cache = []
+        self.layers = []
+        self._seen_tokens = 0
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
         causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
         attn_weights = attn_weights + causal_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
+        query.dtype
+    )
+    attn_weights = nn.functional.dropout(
+        attn_weights, p=dropout, training=module.training
+    )
     attn_output = torch.matmul(attn_weights, value_states)
     attn_output = attn_output.transpose(1, 2).contiguous()
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
+        self.head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        self.num_key_value_groups = (
+            config.num_attention_heads // config.num_key_value_heads
+        )
         self.scaling = self.head_dim**-0.5
         self.attention_dropout = config.attention_dropout
         self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=False
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=False
+        )
+        self.sliding_window = (
+            config.sliding_window
+            if config.layer_types[layer_idx] == "sliding_attention"
+            else None
+        )
     def forward(
         self,
         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
         cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(
+            query_states, key_states, cos, sin
+        )
         if past_key_value is not None:
             # sin and cos are specific to RoPE models; cache_position needed for the static cache
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(
+                key_states,
+                value_states,
+                current_ut * self.config.num_hidden_layers + self.layer_idx,
+                cache_kwargs,
+            )
         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[
+                self.config._attn_implementation
+            ]
         attn_output, attn_weights = attention_interface(
             self,
         self.mlp = OuroMLP(config)
         self.input_layernorm = OuroRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_layernorm_2 = OuroRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = OuroRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm_2 = OuroRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
         self.attention_type = config.layer_types[layer_idx]
     def forward(
         past_key_value: Optional[Cache] = None,
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[
+            tuple[torch.Tensor, torch.Tensor]
+        ] = None,  # necessary, but kept here for BC
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor]:
         residual = hidden_states
         super().__init__()
         # BC: "rope_type" was originally "type"
         if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type")
+            )
         else:
             self.rope_type = "default"
         self.max_seq_len_cached = config.max_position_embeddings
     @torch.no_grad()
     @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
     def forward(self, x, position_ids):
+        inv_freq_expanded = (
+            self.inv_freq[None, :, None]
+            .float()
+            .expand(position_ids.shape[0], -1, 1)
+            .to(x.device)
+        )
         position_ids_expanded = position_ids[:, None, :].float()
+        device_type = (
+            x.device.type
+            if isinstance(x.device.type, str) and x.device.type != "mps"
+            else "cpu"
+        )
         with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (
+                inv_freq_expanded.float() @ position_ids_expanded.float()
+            ).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
             sin = emb.sin() * self.attention_scaling
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size, config.hidden_size, self.padding_idx
+        )
         self.layers = nn.ModuleList(
+            [
+                OuroDecoderLayer(config, layer_idx)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
         )
         self.norm = OuroRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.rotary_emb = OuroRotaryEmbedding(config=config)
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
+        if use_cache is None:
+            use_cache = self.config.use_cache
+        max_cache_size: Optional[int] = None
+        if use_cache:
+            total_ut_steps = getattr(self.config, "total_ut_steps", 1) or 1
+            total_layers = getattr(self.config, "num_hidden_layers", None)
+            if total_layers is not None:
+                max_cache_size = total_layers * total_ut_steps
+            if needs_universal_cache(past_key_values, max_cache_size):
+                past_key_values = UniversalTransformerCache(max_cache_size)
         if cache_position is None:
+            past_seen_tokens = (
+                past_key_values.get_seq_length() if past_key_values is not None else 0
+            )
             cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
             )
         if position_ids is None:
             }
             # The sliding window alternating layers are not always activated depending on the config
             if self.has_sliding_layers:
+                causal_mask_mapping["sliding_attention"] = (
+                    create_sliding_window_causal_mask(**mask_kwargs)
+                )
         hidden_states = inputs_embeds
             hidden_states_list.append(hidden_states)
             gate_list.append(self.early_exit_gate(hidden_states))
+        return (
+            BaseModelOutputWithPast(
+                last_hidden_state=hidden_states,
+                past_key_values=past_key_values if use_cache else None,
+            ),
+            hidden_states_list,
+            gate_list,
+        )
 @auto_docstring
         self.model = OuroModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         # 分块大小配置
+        self.chunk_size = getattr(config, "chunk_size", 2)  # 默认分块大小为2
         self.early_exit_step = getattr(config, "early_exit_step", None)
         self.early_exit_threshold = getattr(config, "early_exit_threshold", None)
         # Initialize weights and apply final processing
         self.post_init()
         r"""
         Args:
             use_weighted_exit (`bool`, *optional*, defaults to `False`):
+                Whether to use weighted early exit. If `True`, the logits from all UT steps will be
                 averaged according to the exit probability distribution.
             exit_at_step (`int`, *optional*):
+                Specifies which UT step to exit at. If set, the model will directly use the hidden states
                 from this step to generate logits, ignoring other exit strategies.
             exit_threshold (`float`, *optional*):
+                The cumulative probability threshold for early exit. When the cumulative exit probability
                 reaches this threshold, the model will exit at that step.
         Example:
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
+        exit_at_step = (
+            exit_at_step if exit_at_step is not None else self.early_exit_step
+        )
+        exit_threshold = (
+            exit_threshold if exit_threshold is not None else self.early_exit_threshold
+        )
         outputs, hidden_states_list, gate_list = self.model(
             input_ids=input_ids,
             cache_position=cache_position,
             **kwargs,
         )
+        slice_indices = (
+            slice(-logits_to_keep, None)
+            if isinstance(logits_to_keep, int)
+            else logits_to_keep
+        )
         def _select_token_positions(tensor: torch.Tensor) -> torch.Tensor:
             if isinstance(slice_indices, slice):
                 return tensor[:, slice_indices, ...]
             if isinstance(slice_indices, torch.Tensor):
                 return tensor.index_select(1, slice_indices.to(tensor.device))
+            raise TypeError(
+                f"Unsupported index type for logits_to_keep: {type(slice_indices)}"
+            )
         stacked_exit_pdf = None
         if gate_list:
             for step_idx, hidden in enumerate(hidden_states_list):
                 step_hidden = _select_token_positions(hidden)
                 step_logits = self.lm_head(step_hidden)
+                weight = (
+                    token_exit_pdf[..., step_idx].unsqueeze(-1).to(step_logits.dtype)
+                )
+                expected_logits = (
+                    step_logits * weight
+                    if expected_logits is None
+                    else expected_logits + step_logits * weight
+                )
             expected_logits_cache = expected_logits
             return expected_logits_cache
             if logits is None:
                 hidden_states = outputs.last_hidden_state
                 logits = self.lm_head(_select_token_positions(hidden_states))
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
         else:
             if stacked_exit_pdf is not None and hidden_states_list:
+                if exit_at_step is not None and 0 <= exit_at_step < len(
+                    hidden_states_list
+                ):
                     selected_hidden = hidden_states_list[exit_at_step]
                     logits = self.lm_head(_select_token_positions(selected_hidden))
                 elif exit_threshold is not None:
                         never_exceeded = ~threshold_mask.any(dim=2)
                         exit_steps[never_exceeded] = last_step_idx
                     stacked_hidden = torch.stack(hidden_states_list, dim=2)
+                    gather_index = (
+                        exit_steps.unsqueeze(-1)
+                        .unsqueeze(-1)
+                        .expand(-1, -1, 1, stacked_hidden.size(-1))
+                    )
+                    final_hidden_states = torch.gather(
+                        stacked_hidden, 2, gather_index
+                    ).squeeze(2)
                     logits = self.lm_head(_select_token_positions(final_hidden_states))
                 elif use_weighted_exit:
                     logits = compute_expected_logits()
         return result
+class OuroForSequenceClassification(
+    GenericForSequenceClassification, OuroPreTrainedModel
+):
     pass
 class OuroForQuestionAnswering(GenericForQuestionAnswering, OuroPreTrainedModel):
+    base_model_prefix = (
+        "transformer"  # For BC, where `transformer` was used instead of `model`
+    )
 __all__ = [
     "OuroForSequenceClassification",
     "OuroForTokenClassification",
     "OuroForQuestionAnswering",
+    "UniversalTransformerCache",
+]