KitsuVp commited on
Commit
d5dfbc7
·
verified ·
1 Parent(s): 87b18c1

Update modeling_neollm.py

Browse files
Files changed (1) hide show
  1. modeling_neollm.py +334 -125
modeling_neollm.py CHANGED
@@ -787,6 +787,9 @@ class LeviathanGenerator(nn.Module):
787
  x_all: [N, M, d_seed], values in [0, 1], all heads stacked.
788
  Returns:
789
  [N, M, d_seed, n_knots] float32.
 
 
 
790
  """
791
  x32 = x_all.float()
792
  x_e = x32.unsqueeze(-1) # [N, M, d_seed, 1]
@@ -801,6 +804,113 @@ class LeviathanGenerator(nn.Module):
801
  torch.where(d < 1.5, 0.5 * (1.5 - d) ** 2, torch.zeros_like(d)),
802
  ) # [N, M, d_seed, n_knots] float32
803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
804
  def _khronos_all_heads(
805
  self,
806
  B_all: torch.Tensor,
@@ -927,52 +1037,39 @@ class LeviathanGenerator(nn.Module):
927
  analysis.z_tilde = z_tilde.detach()
928
  analysis.B_vals = B_vals.detach()
929
 
930
- # ── Per-head generator path (fully vectorized, 6 kernels) ────────
931
- # All 8 heads are processed simultaneously. No Python loop.
932
- # Maximum intermediate tensor [N, M, d_seed, n_knots] appears once.
933
-
934
- # Kernel 1: fused linear projection for all heads
935
- # z @ W^T [N, M*d_seed] → [N, M, d_seed]
936
- z_all = F.linear(z.to(target_dtype), self.head_proj_weight)
937
- z_all = z_all.view(N, self.num_modes, self.d_seed) # [N, M, d_seed]
938
-
939
- if analysis is not None:
940
- analysis.z_all_pre_norm = z_all.detach()
941
-
942
- # Kernel 2: per-head LayerNorm + sigmoid(x/2)
943
- # Manual LN over last dim with independent weight/bias per head.
944
- # Mathematically identical to 8 separate nn.LayerNorm(d_seed).
945
- mean = z_all.mean(dim=-1, keepdim=True)
946
- var = z_all.var(dim=-1, keepdim=True, unbiased=False)
947
- z_all = (z_all - mean) / (var + self.head_norm_eps).sqrt()
948
- # head_norm_weight/bias: [M, d_seed]broadcast over N
949
- z_all = z_all * self.head_norm_weight.unsqueeze(0) \
950
- + self.head_norm_bias.unsqueeze(0)
951
- z_all = torch.sigmoid(z_all / 2.0) # [N, M, d_seed]
952
-
953
- if analysis is not None:
954
- analysis.z_all_post_sigmoid = z_all.detach()
955
-
956
- # Kernel 3: vectorized B-spline basis for all heads
957
- # head_scale [M, d_seed] is used inside _bspline_basis_all_heads
958
- B_all = self._bspline_basis_all_heads(
959
- z_all.clamp(0.0, 1.0)
960
- ) # [N, M, d_seed, n_knots]
961
-
962
- # Kernel 4: vectorized KHRONOS tensor product for all heads
963
- modes_all = self._khronos_all_heads(B_all) # [N, M, krank]
964
-
965
- if analysis is not None:
966
- analysis.modes_all = modes_all.detach()
967
 
968
- # Kernel 5: project all heads to hidden_size and sum
969
- # einsum: token n, head m, krank k → hidden d (summed over m)
970
- # head_out_weight [M, krank, hidden_size]
971
- e = torch.einsum(
972
- "nmk,mkd->nd",
973
- modes_all.to(target_dtype),
974
- self.head_out_weight.to(target_dtype),
975
- ) # [N, hidden_size]
976
 
977
  # No W_res — confirmed absent in the authors' implementation
978
  e = e.reshape(*orig_shape, self.hidden_size)
@@ -1342,22 +1439,22 @@ class GPAS(nn.Module):
1342
 
1343
  class SeeDNorm(nn.Module):
1344
  """
1345
- Self-Rescaled Dynamic Normalization with dual dropout.
1346
  SeeDNorm(x) = [tanh(x·β^T)·α + γ] ⊙ x/RMS(x)
 
 
 
 
1347
  """
1348
 
1349
  def __init__(
1350
  self,
1351
  dim: int,
1352
  eps: float = 1e-6,
1353
- dropout_input: float = 0.01,
1354
- dropout_hidden: float = 0.01,
1355
  ):
1356
  super().__init__()
1357
- self.dim = dim
1358
- self.eps = eps
1359
- self.dropout_input = dropout_input
1360
- self.dropout_hidden = dropout_hidden
1361
 
1362
  self.gamma = nn.Parameter(torch.ones(dim))
1363
  self.beta = nn.Parameter(torch.zeros(dim))
@@ -1371,13 +1468,11 @@ class SeeDNorm(nn.Module):
1371
  x: torch.Tensor,
1372
  analysis: Optional[SeeDNormAnalysis] = None,
1373
  ) -> torch.Tensor:
1374
- x_for_dynamic = F.dropout(x, p=self.dropout_input)
1375
  rescale_factor = torch.tanh(
1376
- torch.sum(x_for_dynamic * self.beta, dim=-1, keepdim=True)
1377
  )
1378
  dynamic_scale = rescale_factor * self.alpha + self.gamma
1379
  x_normalized = self._rms_norm(x.float())
1380
- x_normalized = F.dropout(x_normalized, p=self.dropout_hidden)
1381
  output = (x_normalized * dynamic_scale.float()).type_as(x)
1382
  if analysis is not None:
1383
  analysis.rescale_factor = rescale_factor.detach()
@@ -1387,9 +1482,7 @@ class SeeDNorm(nn.Module):
1387
  return output
1388
 
1389
  def extra_repr(self) -> str:
1390
- return (f"dim={self.dim}, eps={self.eps}, "
1391
- f"dropout_input={self.dropout_input}, "
1392
- f"dropout_hidden={self.dropout_hidden}")
1393
 
1394
 
1395
  # ==================== ROTARY EMBEDDING ====================
@@ -2743,6 +2836,36 @@ class VersatileFFN(nn.Module):
2743
  - Width path load-balancing returns (output, aux_stats) for integration
2744
  with the existing NeoLLMForCausalLM aux-loss accumulation pattern.
2745
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2746
  Reference:
2747
  Nie et al. (2026). "VersatileFFN: Achieving Parameter Efficiency in
2748
  LLMs via Adaptive Wide-and-Deep Reuse." arXiv:2512.14531.
@@ -2920,45 +3043,97 @@ class VersatileFFN(nn.Module):
2920
  depth_stack = torch.stack(depth_outputs, dim=-1) # [B,S,D,L]
2921
  x_depth = (depth_stack * depth_probs.unsqueeze(2)).sum(dim=-1) # [B,S,D]
2922
 
2923
- # ── Width path: Top-K routing over virtual experts ────────────────
2924
- routing_logits = self.expert_gate(x) # [B,S,N]
2925
- topk_w, topk_i = torch.topk(routing_logits, k=self.active_experts, dim=-1)
2926
- topk_w = torch.softmax(topk_w, dim=-1) # [B,S,k]
2927
-
2928
- x_flat = x.reshape(-1, D) # [N,D]
2929
- x_fan_flat = x_fan.reshape(-1, x_fan.shape[-1]) # [N,fan]
2930
- topk_i_f = topk_i.reshape(-1, self.active_experts) # [N,k]
2931
- topk_w_f = topk_w.reshape(-1, self.active_experts) # [N,k]
2932
- N_tok = x_flat.shape[0]
2933
-
2934
- x_moe_flat = torch.zeros_like(x_flat)
2935
-
2936
- for eid in range(self.total_experts):
2937
- mask = (topk_i_f == eid)
2938
- tok_idx, k_idx = torch.where(mask)
2939
- if tok_idx.numel() == 0:
2940
- continue
2941
- w_e = topk_w_f[tok_idx, k_idx].unsqueeze(-1)
2942
- out_e = self._expert_forward(
2943
- x_fan_flat[tok_idx], x_flat[tok_idx], self.expert_idx[eid]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2944
  )
2945
- x_moe_flat.index_add_(
2946
- 0, tok_idx, (out_e * w_e).to(x_moe_flat.dtype)
 
 
 
 
 
2947
  )
2948
 
2949
  x_moe = x_moe_flat.reshape(B, S, D)
2950
 
2951
- # Load-balancing aux stats (same pattern as JTok-M)
 
 
2952
  r_probs_flat = torch.softmax(
2953
- routing_logits.reshape(-1, self.total_experts), dim=-1
2954
- ) # [N_tok, N_experts]
2955
- p_sum = r_probs_flat.sum(dim=0) # [N_experts]
2956
- f_counts = torch.zeros(
2957
- self.total_experts, device=x.device, dtype=x.dtype
2958
- )
2959
- for eid in range(self.total_experts):
2960
- f_counts[eid] = (topk_i_f == eid).float().sum()
2961
- f_sum = f_counts / (N_tok * self.active_experts) # [N_experts]
2962
  aux_stats = (p_sum, f_sum, N_tok)
2963
 
2964
  # ── Difficulty-aware fusion (Eq. 12–13) ──────────────────────────
@@ -2976,16 +3151,27 @@ class VersatileFFN(nn.Module):
2976
  # ═════════════════════ INFERENCE ══════════════════════════════════════
2977
  else:
2978
  loop_choice = depth_logits.argmax(dim=-1) # [B, S]
2979
- max_loop = int(loop_choice.max().item())
2980
 
2981
- # Depth path: early exit only compute needed iterations
 
 
 
 
 
 
 
 
 
 
 
 
2982
  depth_outputs = []
2983
  current_x = x
2984
- for _ in range(max_loop + 1):
2985
  current_x = self._full_forward_step(current_x)
2986
  depth_outputs.append(current_x)
2987
 
2988
- depth_stack = torch.stack(depth_outputs, dim=-1) # [B,S,D,run]
2989
  gather_idx = (
2990
  loop_choice.unsqueeze(-1).unsqueeze(-1).expand(B, S, D, 1)
2991
  )
@@ -2995,40 +3181,63 @@ class VersatileFFN(nn.Module):
2995
  expected_L = (loop_choice + 1).float() # [B, S]
2996
  moe_weight = (self.max_depth - expected_L) / self.max_depth # [B, S]
2997
 
2998
- # Width path: conditional on λ > 0 (Conditional Parallelism)
2999
- active_mask = (moe_weight > 1e-6) # [B, S]
3000
- x_moe = torch.zeros_like(x)
3001
  aux_stats = None
3002
  depth_probs = None
3003
 
3004
- if active_mask.any():
3005
- x_flat_all = x.reshape(-1, D)
3006
- x_fan_flat_all = x_fan.reshape(-1, x_fan.shape[-1])
3007
- active_flat = active_mask.reshape(-1)
3008
- x_active = x_flat_all[active_flat]
3009
- x_fan_active = x_fan_flat_all[active_flat]
3010
-
3011
- r_log = self.expert_gate(x_active) # [Na, N]
3012
- tw, ti = torch.topk(r_log, k=self.active_experts, dim=-1)
3013
- tw = torch.softmax(tw, dim=-1)
3014
-
3015
- x_moe_active = torch.zeros_like(x_active)
3016
- for eid in range(self.total_experts):
3017
- mask_e = (ti == eid)
3018
- tok_idx, k_idx = torch.where(mask_e)
3019
- if tok_idx.numel() == 0:
3020
- continue
3021
- w_e = tw[tok_idx, k_idx].unsqueeze(-1)
3022
- out_e = self._expert_forward(
3023
- x_fan_active[tok_idx], x_active[tok_idx], self.expert_idx[eid]
3024
- )
3025
- x_moe_active.index_add_(
3026
- 0, tok_idx, (out_e * w_e).to(x_moe_active.dtype)
3027
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3028
 
3029
- x_moe_flat = x_moe.reshape(-1, D)
3030
- x_moe_flat[active_flat] = x_moe_active
3031
- x_moe = x_moe_flat.reshape(B, S, D)
3032
 
3033
  output = (
3034
  x_depth * (1.0 - moe_weight.unsqueeze(-1))
 
787
  x_all: [N, M, d_seed], values in [0, 1], all heads stacked.
788
  Returns:
789
  [N, M, d_seed, n_knots] float32.
790
+
791
+ NOTE: Este método se mantiene para compatibilidad con JTok-M y análisis.
792
+ El forward del generator ya NO lo usa — usa _compute_head en su lugar.
793
  """
794
  x32 = x_all.float()
795
  x_e = x32.unsqueeze(-1) # [N, M, d_seed, 1]
 
804
  torch.where(d < 1.5, 0.5 * (1.5 - d) ** 2, torch.zeros_like(d)),
805
  ) # [N, M, d_seed, n_knots] float32
806
 
807
+ def _compute_head(
808
+ self,
809
+ z: torch.Tensor,
810
+ m: int,
811
+ ) -> torch.Tensor:
812
+ """
813
+ Forward completo para el cabezal m del generator.
814
+
815
+ Reemplaza la materialización conjunta [N, M, d_seed, n_knots] del path
816
+ vectorizado. Cada llamada materializa solo [N, d_seed, n_knots] (1 cabezal),
817
+ reduciendo el pico de memoria de O(M·d_seed·n_knots) a O(d_seed·n_knots)
818
+ por cabezal.
819
+
820
+ Pipeline:
821
+ z [N, d_seed]
822
+ → Linear(head_proj_weight[m*d_seed:(m+1)*d_seed]) → [N, d_seed]
823
+ → ManualLayerNorm(weight[m], bias[m]) → [N, d_seed]
824
+ → sigmoid(x/2) → [N, d_seed] (coordenada en [0,1]^d_seed)
825
+ → B-spline KHRONOS con scale=head_scale[m] → [N, d_seed, n_knots]
826
+ → einsum con head_spline[m] → per_dim [N, d_seed, krank]
827
+ → sign-parity product (log-sum-exp) → modes [N, krank]
828
+ → Linear(head_out_weight[m]) → [N, hidden_size]
829
+
830
+ Por qué loop Python sobre M cabezales en lugar de vmap:
831
+ torch.vmap sobre cabezales con parámetros distintos requiere
832
+ functional_call y stack_module_state, lo que complica el acceso
833
+ a buffers (knot_grid, head_norm_eps) desde dentro del transform.
834
+ Un loop Python con M=8 fijo es unrolleado por TorchDynamo en una
835
+ secuencia estática de ops — exactamente como lo hace XLA/Flax en
836
+ la implementación original de Reza. El compilador ve 8 grafos
837
+ idénticos en estructura pero con parámetros distintos, y puede
838
+ fusionarlos u optimizarlos de forma independiente. Con chunk_size=1
839
+ en vmap el comportamiento sería análogo pero con mayor overhead de
840
+ instrumentación.
841
+
842
+ Args:
843
+ z: [N, d_seed] — codebook seed compartido (float del dtype del modelo).
844
+ m: índice del cabezal (0 ≤ m < num_modes), Python int estático.
845
+ Returns:
846
+ [N, hidden_size] — contribución de este cabezal al embedding final.
847
+ """
848
+ d = self.d_seed
849
+ nk = self.num_knots
850
+ kr = self.krank
851
+
852
+ # ── Proyección lineal para el cabezal m ──────────────────────────
853
+ # head_proj_weight [M*d_seed, d_seed] — los pesos del cabezal m
854
+ # son las filas [m*d_seed : (m+1)*d_seed].
855
+ proj_w = self.head_proj_weight[m * d : (m + 1) * d] # [d_seed, d_seed]
856
+ zh = F.linear(z.float(), proj_w) # [N, d_seed]
857
+
858
+ # ── LayerNorm manual por cabezal ──────────────────────────────────
859
+ # Equivalente a nn.LayerNorm(d_seed) con parámetros independientes
860
+ # head_norm_weight[m] y head_norm_bias[m].
861
+ mean = zh.mean(dim=-1, keepdim=True)
862
+ var = zh.var(dim=-1, keepdim=True, unbiased=False)
863
+ zh = (zh - mean) / (var + self.head_norm_eps).sqrt()
864
+ zh = zh * self.head_norm_weight[m] + self.head_norm_bias[m]
865
+
866
+ # ── Sigmoid(x/2) → coordenada latente en [0,1]^d_seed ────────────
867
+ zh = torch.sigmoid(zh / 2.0).clamp(0.0, 1.0) # [N, d_seed]
868
+
869
+ # ── B-spline KHRONOS para este cabezal ────────────────────────────
870
+ # head_scale[m]: [d_seed] — escala por dimensión para este cabezal.
871
+ # Materializa [N, d_seed, n_knots] en lugar de [N, M, d_seed, n_knots].
872
+ sc = self.head_scale[m].float().view(1, -1, 1) # [1, d_seed, 1]
873
+ x_e = zh.unsqueeze(-1) # [N, d_seed, 1]
874
+ grid = self.knot_grid.float().view(1, 1, -1) # [1, 1, n_knots]
875
+ dist = (x_e - grid).abs() * sc # [N, d_seed, n_knots]
876
+ B_m = torch.where(
877
+ dist < 0.5,
878
+ 0.75 - dist ** 2,
879
+ torch.where(dist < 1.5, 0.5 * (1.5 - dist) ** 2, torch.zeros_like(dist)),
880
+ ) # [N, d_seed, n_knots]
881
+
882
+ # ── KHRONOS tensor product para este cabezal ──────────────────────
883
+ # head_spline[m]: [d_seed, n_knots, krank]
884
+ # per_dim[n, d, k] = Σ_g B_m[n, d, g] * head_spline[m, d, g, k]
885
+ # Shape: [N, d_seed, krank] — pico máximo en este cabezal.
886
+ per_dim = torch.einsum(
887
+ "ndg,dgk->ndk",
888
+ B_m,
889
+ self.head_spline[m].float(),
890
+ ) # [N, d_seed, krank]
891
+
892
+ # Sign-parity log-product (KHRONOS): evita underflow multiplicando
893
+ # en log-space y recuperando el signo por paridad de negativos.
894
+ per_dim_abs = per_dim.abs() + 1e-9
895
+ log_mag = torch.log(per_dim_abs).sum(dim=1) # [N, krank]
896
+ num_neg = (per_dim < 0).long().sum(dim=1) # [N, krank]
897
+ prod_sign = 1.0 - 2.0 * (num_neg % 2).float() # [N, krank]
898
+ modes_m = prod_sign * torch.exp(log_mag) # [N, krank]
899
+
900
+ # ── Proyección de salida del cabezal ──────────────────────────────
901
+ # head_out_weight[m]: [krank, hidden_size]
902
+ # NOTA: NO usar F.linear aquí. F.linear(A, W) computa A @ W.T,
903
+ # esperando W con shape [out, in] = [hidden, krank]. Pero
904
+ # head_out_weight está almacenado como [krank, hidden] (igual que
905
+ # el einsum original "nmk,mkd->nd" que contrae sobre k sin transponer).
906
+ # La multiplicación correcta es modes_m @ W directamente:
907
+ # [N, krank] @ [krank, hidden] → [N, hidden]
908
+ out_m = (
909
+ modes_m.to(self.head_out_weight.dtype)
910
+ @ self.head_out_weight[m]
911
+ ) # [N, hidden_size]
912
+ return out_m
913
+
914
  def _khronos_all_heads(
915
  self,
916
  B_all: torch.Tensor,
 
1037
  analysis.z_tilde = z_tilde.detach()
1038
  analysis.B_vals = B_vals.detach()
1039
 
1040
+ # ── Per-head generator path (secuencial, un cabezal a la vez) ──────
1041
+ # ORIGINAL PROBLEM: el path vectorizado anterior procesaba los M
1042
+ # cabezales en paralelo con kernels fusionados:
1043
+ #
1044
+ # _bspline_basis_all_heads [N, M, d_seed, n_knots] ← TENSOR GIGANTE
1045
+ # _khronos_all_heads per_dim [N, M, d_seed, krank] ← AÚN MAYOR
1046
+ #
1047
+ # Con N=B*S=32768, M=8, d_seed=128, n_knots=32, krank=16:
1048
+ # [N,M,d_seed,n_knots] = 32768 × 8 × 128 × 32 × 4 bytes ≈ 512 MB
1049
+ # [N,M,d_seed,krank] = 32768 × 8 × 128 × 16 × 4 bytes ≈ 256 MB
1050
+ # Estos tensores viven simultáneamente en el pool de CUDAGraphs,
1051
+ # causando OOM en el backward cuando se suman las activaciones guardadas
1052
+ # de las 12 capas del decoder.
1053
+ #
1054
+ # SOLUCIÓN (equivalente a la impl. JAX de Reza):
1055
+ # Loop Python sobre M=8 cabezales (count fijo → TorchDynamo unrollea
1056
+ # en 8 secuencias de ops estáticas sin graph breaks).
1057
+ # Cada cabezal materializa como máximo [N, d_seed, krank] ≈ 32 MB.
1058
 + # La suma se acumula in-place; el tensor del cabezal anterior puede
1059
+ # ser liberado por el allocator antes de procesar el siguiente.
1060
+ #
1061
+ # Por qué NO vmap(chunk_size=1):
1062
+ # vmap requiere que la función sea "pura" (sin acceso a self.*).
1063
+ # head_norm_eps, knot_grid y los parámetros indexados [m] se pasan
1064
+ # implícitamente a través del closure. Con vmap habría que
1065
+ # stack_module_state + functional_call, lo que añade overhead de
1066
+ # instrumentación sin beneficio real ya que el loop estático es
1067
+ # igualmente trazable por el compilador y produce el mismo grafo.
 
 
 
 
 
 
 
 
 
1068
 
1069
+ target_dtype = self.codebooks.dtype
1070
+ e = torch.zeros(N, self.hidden_size, device=token_ids.device, dtype=target_dtype)
1071
+ for m in range(self.num_modes):
1072
+ e = e + self._compute_head(z, m)
 
 
 
 
1073
 
1074
  # No W_res — confirmed absent in the authors' implementation
1075
  e = e.reshape(*orig_shape, self.hidden_size)
 
1439
 
1440
  class SeeDNorm(nn.Module):
1441
  """
1442
+ Self-Rescaled Dynamic Normalization.
1443
  SeeDNorm(x) = [tanh(x·β^T)·α + γ] ⊙ x/RMS(x)
1444
+
1445
+ rescale_factor = tanh(x · β) ∈ (-1, 1) escalar por token
1446
+ dynamic_scale = rescale_factor · α + γ ∈ ℝ^dim
1447
+ output = dynamic_scale ⊙ RMSNorm(x)
1448
  """
1449
 
1450
  def __init__(
1451
  self,
1452
  dim: int,
1453
  eps: float = 1e-6,
 
 
1454
  ):
1455
  super().__init__()
1456
+ self.dim = dim
1457
+ self.eps = eps
 
 
1458
 
1459
  self.gamma = nn.Parameter(torch.ones(dim))
1460
  self.beta = nn.Parameter(torch.zeros(dim))
 
1468
  x: torch.Tensor,
1469
  analysis: Optional[SeeDNormAnalysis] = None,
1470
  ) -> torch.Tensor:
 
1471
  rescale_factor = torch.tanh(
1472
+ torch.sum(x * self.beta, dim=-1, keepdim=True)
1473
  )
1474
  dynamic_scale = rescale_factor * self.alpha + self.gamma
1475
  x_normalized = self._rms_norm(x.float())
 
1476
  output = (x_normalized * dynamic_scale.float()).type_as(x)
1477
  if analysis is not None:
1478
  analysis.rescale_factor = rescale_factor.detach()
 
1482
  return output
1483
 
1484
  def extra_repr(self) -> str:
1485
+ return f"dim={self.dim}, eps={self.eps}"
 
 
1486
 
1487
 
1488
  # ==================== ROTARY EMBEDDING ====================
 
2836
  - Width path load-balancing returns (output, aux_stats) for integration
2837
  with the existing NeoLLMForCausalLM aux-loss accumulation pattern.
2838
 
2839
+ Width dispatch (CUDAGraph-compatible sparse routing):
2840
+ El dispatch original del paper (torch.where + index_add_) es sparse y
2841
+ fiel al paper pero produce shapes dependientes de datos → incompatible
2842
+ con CUDAGraphs. La implementación usa argsort como dispatcher estático:
2843
+
2844
+ flat_expert [N·K] → argsort → perm [N·K] (shape siempre igual)
2845
+ sorted_tok [N·K] = flat_tok[perm] (índices de token originales)
2846
+ grouped_tok [E, C] = sorted_tok.view(E, C) (C = N·K // E, constante)
2847
+
2848
+ Propiedades clave:
2849
+ · argsort: output shape = input shape, siempre [N·K]. CUDAGraph ✓
2850
+ · C = N_tok·K // E es un entero Python conocido en compile-time.
2851
+ Con el aux loss manteniendo balance, cada experto recibe ≈ C slots.
2852
+ · scatter_add_ con index [C, D] de shape estático: CUDAGraph ✓
2853
+ (los VALORES del index cambian por batch, no el SHAPE).
2854
+ · FLOPs idénticos al original: cada experto procesa [C, D] = [N·K/E, D]
2855
+ tokens, no todos los N tokens. Con K=2, E=4: C = N/2 por experto.
2856
+
2857
+ Conditional Parallelism (inferencia, Algorithm 2):
2858
+ · Los tokens con λ=0 (argmax → max_depth) igualmente participan en el
2859
+ grouped buffer y su expert forward se computa (shapes estáticos).
2860
+ · Su contribución es cancelada por λ=0 en la fusión:
2861
+ output = x_depth·(1−λ) + x_moe·λ → x_depth si ��=0
2862
+ · Esto pierde el saving de FLOPs de los λ=0 tokens, pero la correctitud
2863
+ matemática es exacta. Tradeoff aceptable vs CUDAGraph-incompatibilidad.
2864
+
2865
+ Discrete Early-Exit (inferencia, Algorithm 2):
2866
+ · Sustituido por always-max_depth + torch.gather con loop_choice.
2867
+ Para max_depth=2 el overhead es ≤ 1 iteración extra por token.
2868
+
2869
  Reference:
2870
  Nie et al. (2026). "VersatileFFN: Achieving Parameter Efficiency in
2871
  LLMs via Adaptive Wide-and-Deep Reuse." arXiv:2512.14531.
 
3043
  depth_stack = torch.stack(depth_outputs, dim=-1) # [B,S,D,L]
3044
  x_depth = (depth_stack * depth_probs.unsqueeze(2)).sum(dim=-1) # [B,S,D]
3045
 
3046
+ # ── Width path: argsort-based sparse dispatch (Eq. 7–8) ──────────
3047
+ # Matemática (paper §3.2):
3048
+ # Y_width = Σ_{k∈TopK} g_k · Y_k,
3049
+ # Y_k = H + W_out^(k) φ(W_proj^(k) LayerNorm(H)) (Eq. 8)
3050
+ # Como Σ_{k∈TopK} g_k = 1 (softmax normalizado sobre TopK):
3051
+ # Y_width = H + Σ_{k∈TopK} g_k · delta_k
3052
+ #
3053
+ # Implementación sparse con shapes estáticos:
3054
+ #
3055
+ # 1. flat_expert [N_tok·K]: índices de experto por token-slot.
3056
+ # argsort → perm [N_tok·K] con shape siempre igual. CUDAGraph ✓
3057
+ #
3058
+ # 2. sorted_tok [N_tok·K] = flat_tok[perm]: tokens ordenados por
3059
+ # experto. Todos los tokens del experto e quedan contiguos.
3060
+ #
3061
+ # 3. view(E, C) con C = N_tok·K // E constante Python → shape
3062
+ # estático [E, C, D] para gather y forward.
3063
+ #
3064
+ # 4. _expert_forward sobre [C, D] por experto — mismos FLOPs que
3065
+ # el original con torch.where: solo C tokens por experto,
3066
+ # no los N_tok completos. Con K=2, E=4: C = N_tok/2.
3067
+ #
3068
+ # 5. scatter_add_: index de shape [C, D] siempre estático.
3069
+ # Los VALORES varían por batch, el SHAPE no. CUDAGraph ✓
3070
+ # Acumula Σ_{k} g_k · Y_k para cada token n mediante
3071
+ # sum sobre los K slots que apuntan a n.
3072
+ K = self.active_experts
3073
+ E = self.total_experts
3074
+ N_tok = B * S
3075
+ C = (N_tok * K) // E # tokens por experto — constante compile-time
3076
+
3077
+ routing_logits = self.expert_gate(x) # [B, S, E]
3078
+ topk_w, topk_i = torch.topk(routing_logits, k=K, dim=-1)
3079
+ topk_w = torch.softmax(topk_w, dim=-1) # [B, S, K]
3080
+
3081
+ x_flat = x.reshape(-1, D) # [N_tok, D]
3082
+ x_fan_flat = x_fan.reshape(-1, x_fan.shape[-1]) # [N_tok, fan_dim]
3083
+
3084
+ # Aplanar: cada token aparece K veces, una por experto seleccionado
3085
+ flat_expert = topk_i.reshape(-1) # [N_tok·K] long
3086
+ flat_tok = (
3087
+ torch.arange(N_tok, device=x.device, dtype=torch.long)
3088
+ .unsqueeze(1).expand(N_tok, K).reshape(-1)
3089
+ ) # [N_tok·K] long
3090
+ flat_w = topk_w.reshape(-1) # [N_tok·K]
3091
+
3092
+ # Ordenar por expert ID: todos los tokens del mismo experto juntos
3093
+ perm = torch.argsort(flat_expert, stable=True) # [N_tok·K] long
3094
+ sorted_tok = flat_tok[perm] # [N_tok·K] long
3095
+ sorted_w = flat_w[perm] # [N_tok·K]
3096
+
3097
+ # Agrupar por experto [E, C] — C conocido en compile-time
3098
+ grouped_tok = sorted_tok.view(E, C) # [E, C] long
3099
+ grouped_w = sorted_w.view(E, C) # [E, C]
3100
+
3101
+ # Gather features del token original para cada slot de experto
3102
+ flat_idx = grouped_tok.reshape(-1) # [E·C] long
3103
+ fan_dim = x_fan_flat.shape[-1]
3104
+ x_grouped = x_flat[flat_idx].view(E, C, D) # [E, C, D]
3105
+ xf_grouped = x_fan_flat[flat_idx].view(E, C, fan_dim) # [E, C, fan_dim]
3106
+
3107
+ # Expert forward + scatter_add_ de vuelta a [N_tok, D]
3108
+ # Loop desenrollado por dynamo (E constante Python) — sin graph breaks
3109
+ x_moe_flat = torch.zeros(N_tok, D, device=x.device, dtype=x.dtype)
3110
+ for eid in range(E):
3111
+ # out_e [C, D] = x_grouped[eid] + delta_e (residual incluido, Eq. 8)
3112
+ out_e = self._expert_forward(
3113
+ xf_grouped[eid], x_grouped[eid], self.expert_idx[eid]
3114
  )
3115
+ w_e = grouped_w[eid].unsqueeze(-1) # [C, 1]
3116
+ tok_idx_e = grouped_tok[eid].unsqueeze(1).expand(C, D) # [C, D] long
3117
+ # Acumula g_k · Y_k en la posición original del token
3118
+ # Cuando eid recorre los K experts de un token n:
3119
+ # x_moe_flat[n] = Σ_k g_k · Y_k = H_n + Σ_k g_k · delta_k
3120
+ x_moe_flat.scatter_add_(
3121
+ 0, tok_idx_e, (out_e * w_e).to(x_moe_flat.dtype)
3122
  )
3123
 
3124
  x_moe = x_moe_flat.reshape(B, S, D)
3125
 
3126
+ # Load-balancing aux stats (Eq. load-balancing loss)
3127
+ # p_sum: probabilidad media por experto (sobre routing_logits completo)
3128
+ # f_sum: fracción real de tokens asignados a cada experto
3129
  r_probs_flat = torch.softmax(
3130
+ routing_logits.reshape(-1, E), dim=-1
3131
+ ) # [N_tok, E]
3132
+ p_sum = r_probs_flat.sum(dim=0) # [E]
3133
+ f_sum = (
3134
+ F.one_hot(flat_expert.long(), E).float().sum(dim=0)
3135
+ / float(N_tok * K)
3136
+ ) # [E]
 
 
3137
  aux_stats = (p_sum, f_sum, N_tok)
3138
 
3139
  # ── Difficulty-aware fusion (Eq. 12–13) ──────────────────────────
 
3151
  # ═════════════════════ INFERENCE ══════════════════════════════════════
3152
  else:
3153
  loop_choice = depth_logits.argmax(dim=-1) # [B, S]
 
3154
 
3155
+ # ── Depth path: siempre max_depth iteraciones (shape estático)
3156
+ # ORIGINAL PROBLEM: el early-exit original usaba
3157
+ # max_loop = int(loop_choice.max().item())
3158
+ # que produce una sincronización CPU-GPU (equivalente a .item())
3159
+ # y hace que el número de iteraciones del loop dependa de datos —
3160
+ # ambas condiciones prohíben la captura de CUDAGraphs.
3161
+ #
3162
+ # SOLUCIÓN: siempre se ejecutan exactamente self.max_depth
3163
+ # iteraciones. depth_stack [B,S,D,max_depth] tiene shape estático.
3164
+ # El gather sobre loop_choice selecciona la salida correcta por
3165
+ # token sin necesidad de conocer cuántas iteraciones se ejecutaron.
3166
+ # La pérdida de FLOPs por iteraciones "extra" es mínima porque
3167
+ # max_depth es pequeño (default 2) y _full_forward_step es ligero.
3168
  depth_outputs = []
3169
  current_x = x
3170
+ for _ in range(self.max_depth):
3171
  current_x = self._full_forward_step(current_x)
3172
  depth_outputs.append(current_x)
3173
 
3174
+ depth_stack = torch.stack(depth_outputs, dim=-1) # [B,S,D,max_depth]
3175
  gather_idx = (
3176
  loop_choice.unsqueeze(-1).unsqueeze(-1).expand(B, S, D, 1)
3177
  )
 
3181
  expected_L = (loop_choice + 1).float() # [B, S]
3182
  moe_weight = (self.max_depth - expected_L) / self.max_depth # [B, S]
3183
 
 
 
 
3184
  aux_stats = None
3185
  depth_probs = None
3186
 
3187
+ # ── Width path: argsort-based sparse dispatch (mismo mecanismo
3188
+ # que entrenamiento, Eq. 7–8 + Conditional Parallelism §A) ───
3189
+ #
3190
+ # Conditional Parallelism (Algorithm 2 del paper):
3191
+ # Si λ=0 para un token → Y = Y_depth, el width path se omite.
3192
+ # Con shapes estáticos no podemos excluir dinámicamente esos tokens
3193
+ # del buffer. En su lugar, los λ=0 tokens participan en el grouped
3194
+ # buffer y su expert forward corre, pero la fusión
3195
+ # output = x_depth·(1−λ) + x_moe·λ
3196
+ # garantiza output = x_depth cuando λ=0, sin ninguna rama condicional.
3197
+ # Los FLOPs del width path para esos tokens son el único overhead.
3198
+ N_tok_inf = B * S
3199
+ K_inf = self.active_experts
3200
+ E_inf = self.total_experts
3201
+ C_inf = (N_tok_inf * K_inf) // E_inf
3202
+
3203
+ x_flat = x.reshape(-1, D)
3204
+ x_fan_flat = x_fan.reshape(-1, x_fan.shape[-1])
3205
+
3206
+ routing_logits = self.expert_gate(x_flat) # [N_tok, E]
3207
+ tw, ti = torch.topk(routing_logits, k=K_inf, dim=-1)
3208
+ tw = torch.softmax(tw, dim=-1) # [N_tok, K]
3209
+
3210
+ flat_expert_i = ti.reshape(-1) # [N_tok·K] long
3211
+ flat_tok_i = (
3212
+ torch.arange(N_tok_inf, device=x.device, dtype=torch.long)
3213
+ .unsqueeze(1).expand(N_tok_inf, K_inf).reshape(-1)
3214
+ ) # [N_tok·K] long
3215
+ flat_w_i = tw.reshape(-1) # [N_tok·K]
3216
+
3217
+ perm_i = torch.argsort(flat_expert_i, stable=True) # [N_tok·K]
3218
+ sorted_tok_i = flat_tok_i[perm_i] # [N_tok·K]
3219
+ sorted_w_i = flat_w_i[perm_i] # [N_tok·K]
3220
+
3221
+ grouped_tok_i = sorted_tok_i.view(E_inf, C_inf) # [E, C]
3222
+ grouped_w_i = sorted_w_i.view(E_inf, C_inf) # [E, C]
3223
+
3224
+ flat_idx_i = grouped_tok_i.reshape(-1) # [E·C]
3225
+ fan_dim_i = x_fan_flat.shape[-1]
3226
+ x_grouped_i = x_flat[flat_idx_i].view(E_inf, C_inf, D) # [E, C, D]
3227
+ xf_grouped_i = x_fan_flat[flat_idx_i].view(E_inf, C_inf, fan_dim_i) # [E, C, fan_dim]
3228
+
3229
+ x_moe_flat_i = torch.zeros(N_tok_inf, D, device=x.device, dtype=x.dtype)
3230
+ for eid in range(E_inf):
3231
+ out_e_i = self._expert_forward(
3232
+ xf_grouped_i[eid], x_grouped_i[eid], self.expert_idx[eid]
3233
+ )
3234
+ w_e_i = grouped_w_i[eid].unsqueeze(-1) # [C, 1]
3235
+ tok_idx_e_i = grouped_tok_i[eid].unsqueeze(1).expand(C_inf, D)
3236
+ x_moe_flat_i.scatter_add_(
3237
+ 0, tok_idx_e_i, (out_e_i * w_e_i).to(x_moe_flat_i.dtype)
3238
+ )
3239
 
3240
+ x_moe = x_moe_flat_i.reshape(B, S, D)
 
 
3241
 
3242
  output = (
3243
  x_depth * (1.0 - moe_weight.unsqueeze(-1))