gary-boon Claude Opus 4.6 committed on
Commit
121a2d9
·
1 Parent(s): 54d9b6e

Add margin-based decision analysis, interventional counterfactuals, and run comparison (v3.0)

Browse files

Phase 1: Per-token margin computation with stability classification (stable/moderate/boundary/fragile),
layer-wise margin tracking via logit lens, commitment layer detection, flip event detection,
causal margin contribution decomposition per layer ((W_U[winner] - W_U[runner-up]) · residual_ℓ),
and hidden state caching for intervention reuse.

Phase 2: POST /analyze/intervention endpoint supporting mask_system, mask_user_span, mask_generated
(real forward-pass re-evaluation with attention_mask), temperature_sweep (cached logits),
layer_ablation, head_ablation, and expert_mask. Returns margin shift, stability change,
and winner change with full diagnostics.

Phase 3: POST /analyze/compare endpoint for per-token margin diffs between cached runs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. backend/model_service.py +639 -3
backend/model_service.py CHANGED
@@ -204,6 +204,128 @@ class MatrixCache:
204
  matrix_cache = MatrixCache(ttl_seconds=3600) # 60 min TTL
205
 
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  app = FastAPI(title="Visualisable.ai Model Service", version="0.1.0")
208
 
209
  # CORS configuration for local development and production
@@ -2747,15 +2869,56 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2747
  "was_greedy": next_token_id == greedy_token_id
2748
  }
2749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2750
  token_alternatives_by_step.append({
2751
  "step": step,
2752
  "selected_token": next_token_text,
2753
  "selected_token_id": next_token_id,
2754
  "alternatives": alternatives,
2755
  "logits": logits_entries,
2756
- "sampling": sampling_metadata
 
2757
  })
2758
 
 
 
 
 
 
 
 
 
2759
  # Emit generated token immediately so clients can show code progressively
2760
  yield sse_event('generated_token', stage=2, totalStages=5,
2761
  progress=10 + ((step + 1) / max_tokens) * 20,
@@ -2776,6 +2939,18 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2776
  layer_data_this_token = []
2777
  n_total_layers = len(outputs.attentions)
2778
 
 
 
 
 
 
 
 
 
 
 
 
 
2779
  for layer_idx in range(n_total_layers):
2780
  # Emit extraction progress (within generating stage for combined progress)
2781
  if step == max_tokens - 1: # Only emit detailed layer progress on last token
@@ -2824,6 +2999,25 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2824
  if delta_norm is not None:
2825
  delta_norm = 0.0 if math.isnan(delta_norm) or math.isinf(delta_norm) else delta_norm
2826
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2827
  # --- Batched head processing: all heads at once on GPU ---
2828
  num_heads_layer = layer_attn.shape[0]
2829
 
@@ -2989,7 +3183,8 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2989
  "activation_magnitude": activation_magnitude,
2990
  "activation_entropy": activation_entropy,
2991
  "hidden_state_norm": hidden_state_norm,
2992
- "delta_norm": delta_norm
 
2993
  }
2994
  # Phase 4: Attention and MLP output norms
2995
  if layer_idx in attn_output_norms:
@@ -3024,6 +3219,17 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
3024
  "probability": tp
3025
  })
3026
  layer_entry["logit_lens_top"] = lens_entries
 
 
 
 
 
 
 
 
 
 
 
3027
  except Exception as lens_err:
3028
  logger.debug(f"Logit lens error at layer {layer_idx}: {lens_err}")
3029
 
@@ -3140,6 +3346,54 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
3140
  })
3141
  return result
3142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3143
  # Build response
3144
  response = {
3145
  "requestId": request_id, # For lazy-loading matrices via /matrix endpoint
@@ -3159,7 +3413,9 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
3159
  "vocabSize": manager.model.config.vocab_size
3160
  },
3161
  "generationTime": generation_time,
3162
- "numTokensGenerated": len(generated_tokens)
 
 
3163
  }
3164
 
3165
  # Estimate response size
@@ -3325,6 +3581,386 @@ async def get_attention_row(
3325
  }
3326
 
3327
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3328
  @app.post("/analyze/study")
3329
  async def analyze_study(request: StudyRequest, authenticated: bool = Depends(verify_api_key)):
3330
  """
 
204
  matrix_cache = MatrixCache(ttl_seconds=3600) # 60 min TTL
205
 
206
 
207
+ def _classify_stability(margin: float) -> str:
208
+ """Classify a logit margin into a stability category."""
209
+ if margin > 1.0:
210
+ return "stable"
211
+ elif margin >= 0.3:
212
+ return "moderate"
213
+ elif margin >= 0.1:
214
+ return "boundary"
215
+ else:
216
+ return "fragile"
217
+
218
+
219
class HiddenStateCache:
    """
    Thread-safe cache of hidden states and logits per (request_id, step).

    Intervention endpoints re-run analyses on this cached data instead of
    regenerating. Tensors are stored as detached CPU copies so the cache
    never pins GPU memory. Runs expire after ``ttl_seconds`` and at most
    ``MAX_CACHED_RUNS`` runs are retained (least-recently-touched evicted).
    """
    MAX_CACHED_RUNS = 5

    def __init__(self, ttl_seconds: int = 3600):
        self._hidden_states: Dict[str, Dict] = {}  # request_id -> {step -> [per-layer tensors]}
        self._logits: Dict[str, Dict] = {}         # request_id -> {step -> tensor}
        self._input_ids: Dict[str, object] = {}    # request_id -> prompt tensor
        self._current_ids: Dict[str, Dict] = {}    # request_id -> {step -> full-sequence tensor}
        self._timestamps: Dict[str, float] = {}    # request_id -> last-touched time
        self._lock = Lock()
        self._ttl = ttl_seconds

    def _expired(self, request_id: str) -> bool:
        """Evict the run if its TTL has elapsed. Caller must hold the lock.

        Returns True when the run is expired (and was just evicted).
        """
        if request_id in self._timestamps and time_now() - self._timestamps[request_id] >= self._ttl:
            self._evict(request_id)
            return True
        return False

    def store_step(self, request_id: str, step: int, hidden_states, raw_logits_tensor, current_ids_tensor=None):
        """Store hidden states, logits, and optionally the full input sequence for a generation step."""
        with self._lock:
            if request_id not in self._hidden_states:
                # Evict least-recently-touched runs while at capacity. A while-loop
                # (not a single `if`) keeps the bound correct even when _timestamps
                # contains a run with no hidden-state entry (created by a
                # store_input_ids-only call) that gets picked as "oldest".
                while len(self._hidden_states) >= self.MAX_CACHED_RUNS and self._timestamps:
                    oldest_rid = min(self._timestamps, key=self._timestamps.get)
                    self._evict(oldest_rid)
                self._hidden_states[request_id] = {}
                self._logits[request_id] = {}
                self._current_ids[request_id] = {}

            # Store detached CPU copies to avoid holding GPU memory
            self._hidden_states[request_id][step] = [h.detach().cpu() for h in hidden_states]
            self._logits[request_id][step] = raw_logits_tensor.detach().cpu()
            if current_ids_tensor is not None:
                self._current_ids[request_id][step] = current_ids_tensor.detach().cpu()
            self._timestamps[request_id] = time_now()

    def store_input_ids(self, request_id: str, input_ids_tensor):
        """Store the full input_ids tensor (prompt) for a run."""
        with self._lock:
            self._input_ids[request_id] = input_ids_tensor.detach().cpu()
            self._timestamps[request_id] = time_now()

    def get_step(self, request_id: str, step: int):
        """Retrieve (hidden_states, logits) for a step, or (None, None) if missing/expired."""
        with self._lock:
            if self._expired(request_id):
                return None, None
            hs = self._hidden_states.get(request_id, {}).get(step)
            lg = self._logits.get(request_id, {}).get(step)
            return hs, lg

    def get_logits(self, request_id: str, step: int):
        """Retrieve just the logits for a step, or None if missing/expired."""
        with self._lock:
            if self._expired(request_id):
                return None
            return self._logits.get(request_id, {}).get(step)

    def get_input_ids(self, request_id: str):
        """Retrieve stored input_ids for a run, or None if missing/expired."""
        with self._lock:
            if self._expired(request_id):
                return None
            return self._input_ids.get(request_id)

    def get_current_ids(self, request_id: str, step: int):
        """Retrieve the full sequence (prompt + generated) at a step, or None if missing/expired."""
        with self._lock:
            if self._expired(request_id):
                return None
            return self._current_ids.get(request_id, {}).get(step)

    def get_all_steps(self, request_id: str):
        """Return the cached step indices for a run (empty list once expired).

        Fix: this accessor previously skipped the TTL check every other getter
        performs, so it could report steps for a run that get_step() would
        already treat as expired.
        """
        with self._lock:
            if self._expired(request_id):
                return []
            return list(self._hidden_states.get(request_id, {}).keys())

    def has_run(self, request_id: str) -> bool:
        """Check whether a (non-expired) run is cached."""
        with self._lock:
            if self._expired(request_id):
                return False
            return request_id in self._hidden_states

    def _evict(self, request_id: str):
        """Remove all data for a request (caller must hold the lock)."""
        self._hidden_states.pop(request_id, None)
        self._logits.pop(request_id, None)
        self._input_ids.pop(request_id, None)
        self._current_ids.pop(request_id, None)
        self._timestamps.pop(request_id, None)

    def get_stats(self) -> dict:
        """Return cache occupancy and configuration for diagnostics."""
        with self._lock:
            return {
                "cached_runs": len(self._hidden_states),
                "max_runs": self.MAX_CACHED_RUNS,
                "ttl_seconds": self._ttl,
            }


# Global hidden state cache instance (written during generation, read by interventions)
hidden_state_cache = HiddenStateCache(ttl_seconds=3600)
327
+
328
+
329
  app = FastAPI(title="Visualisable.ai Model Service", version="0.1.0")
330
 
331
  # CORS configuration for local development and production
 
2869
  "was_greedy": next_token_id == greedy_token_id
2870
  }
2871
 
2872
+ # --- Margin computation and stability classification ---
2873
+ import math as _math_margin
2874
+ winner_logit = logits_entries[0]["logit"] if len(logits_entries) > 0 else 0.0
2875
+ runnerup_logit = logits_entries[1]["logit"] if len(logits_entries) > 1 else winner_logit
2876
+ margin = winner_logit - runnerup_logit
2877
+ runnerup_token = logits_entries[1]["token"] if len(logits_entries) > 1 else ""
2878
+
2879
+ # Entropy over top-k probabilities
2880
+ top_probs_list_for_entropy = [a["probability"] for a in alternatives[:10] if a["probability"] > 0]
2881
+ margin_entropy = -sum(p * _math_margin.log(p) for p in top_probs_list_for_entropy) if top_probs_list_for_entropy else 0.0
2882
+
2883
+ stability = _classify_stability(margin)
2884
+
2885
+ # Greedy margin: margin computed from raw logits (temperature=0)
2886
+ raw_sorted_logits, raw_sorted_indices = torch.topk(raw_logits, k=min(2, len(raw_logits)))
2887
+ raw_sorted_list = raw_sorted_logits.tolist()
2888
+ greedy_margin = (raw_sorted_list[0] - raw_sorted_list[1]) if len(raw_sorted_list) >= 2 else 0.0
2889
+
2890
+ # Sampling sensitivity: did temperature change the outcome?
2891
+ sampling_sensitive = next_token_id != greedy_token_id
2892
+
2893
+ margin_data = {
2894
+ "margin": margin,
2895
+ "winner_logit": winner_logit,
2896
+ "runnerup_logit": runnerup_logit,
2897
+ "runnerup_token": runnerup_token,
2898
+ "entropy": margin_entropy,
2899
+ "stability": stability,
2900
+ "greedy_margin": greedy_margin,
2901
+ "sampling_sensitive": sampling_sensitive,
2902
+ }
2903
+
2904
  token_alternatives_by_step.append({
2905
  "step": step,
2906
  "selected_token": next_token_text,
2907
  "selected_token_id": next_token_id,
2908
  "alternatives": alternatives,
2909
  "logits": logits_entries,
2910
+ "sampling": sampling_metadata,
2911
+ "margin": margin_data,
2912
  })
2913
 
2914
+ # Cache hidden states, logits, and full sequence for intervention endpoint
2915
+ try:
2916
+ hidden_state_cache.store_step(request_id, step, outputs.hidden_states, raw_logits, current_ids)
2917
+ if step == 0:
2918
+ hidden_state_cache.store_input_ids(request_id, current_ids[:, :-1]) # prompt only
2919
+ except Exception as hs_err:
2920
+ logger.debug(f"Hidden state cache error at step {step}: {hs_err}")
2921
+
2922
  # Emit generated token immediately so clients can show code progressively
2923
  yield sse_event('generated_token', stage=2, totalStages=5,
2924
  progress=10 + ((step + 1) / max_tokens) * 20,
 
2939
  layer_data_this_token = []
2940
  n_total_layers = len(outputs.attentions)
2941
 
2942
+ # Margin contribution decomposition: compute the "logit difference direction"
2943
+ # (W_U[winner] - W_U[runner-up]) once, then dot with each layer's residual
2944
+ margin_diff_direction = None
2945
+ winner_token_id_for_decomp = logits_entries[0]["token_id"] if len(logits_entries) > 0 else None
2946
+ runnerup_token_id_for_decomp = logits_entries[1]["token_id"] if len(logits_entries) > 1 else None
2947
+ if winner_token_id_for_decomp is not None and runnerup_token_id_for_decomp is not None:
2948
+ try:
2949
+ lm_head_weight = manager.model.lm_head.weight # [vocab_size, d_model]
2950
+ margin_diff_direction = (lm_head_weight[winner_token_id_for_decomp] - lm_head_weight[runnerup_token_id_for_decomp]).detach()
2951
+ except Exception:
2952
+ margin_diff_direction = None
2953
+
2954
  for layer_idx in range(n_total_layers):
2955
  # Emit extraction progress (within generating stage for combined progress)
2956
  if step == max_tokens - 1: # Only emit detailed layer progress on last token
 
2999
  if delta_norm is not None:
3000
  delta_norm = 0.0 if math.isnan(delta_norm) or math.isinf(delta_norm) else delta_norm
3001
 
3002
+ # Margin contribution decomposition:
3003
+ # margin_contribution = (W_U[winner] - W_U[runner-up]) · (h_{ℓ+1} - h_ℓ)
3004
+ # This causally attributes the final margin to each layer's residual contribution.
3005
+ margin_contribution = None
3006
+ if margin_diff_direction is not None:
3007
+ try:
3008
+ if layer_idx > 0:
3009
+ prev_h = outputs.hidden_states[layer_idx]
3010
+ if prev_h.dim() == 3:
3011
+ prev_h = prev_h[0]
3012
+ residual = current_hidden[-1] - prev_h[-1]
3013
+ else:
3014
+ # Layer 0: the embedding contribution
3015
+ residual = current_hidden[-1]
3016
+ mc = torch.dot(margin_diff_direction, residual).item()
3017
+ margin_contribution = 0.0 if math.isnan(mc) or math.isinf(mc) else mc
3018
+ except Exception:
3019
+ margin_contribution = None
3020
+
3021
  # --- Batched head processing: all heads at once on GPU ---
3022
  num_heads_layer = layer_attn.shape[0]
3023
 
 
3183
  "activation_magnitude": activation_magnitude,
3184
  "activation_entropy": activation_entropy,
3185
  "hidden_state_norm": hidden_state_norm,
3186
+ "delta_norm": delta_norm,
3187
+ "margin_contribution": margin_contribution,
3188
  }
3189
  # Phase 4: Attention and MLP output norms
3190
  if layer_idx in attn_output_norms:
 
3219
  "probability": tp
3220
  })
3221
  layer_entry["logit_lens_top"] = lens_entries
3222
+
3223
+ # Layer-wise margin tracking (raw logit diff between top-1 and top-2)
3224
+ top2_logits, top2_ids = torch.topk(lens_logits, k=min(2, len(lens_logits)))
3225
+ top2_logits_list = top2_logits.cpu().tolist()
3226
+ top2_ids_list = top2_ids.cpu().tolist()
3227
+ layer_winner_token = manager.tokenizer.decode([top2_ids_list[0]], skip_special_tokens=False)
3228
+ layer_runnerup_token = manager.tokenizer.decode([top2_ids_list[1]], skip_special_tokens=False) if len(top2_ids_list) > 1 else ""
3229
+ layer_margin_val = (top2_logits_list[0] - top2_logits_list[1]) if len(top2_logits_list) > 1 else 0.0
3230
+ layer_entry["layer_margin"] = layer_margin_val
3231
+ layer_entry["layer_winner"] = layer_winner_token
3232
+ layer_entry["layer_runnerup"] = layer_runnerup_token
3233
  except Exception as lens_err:
3234
  logger.debug(f"Logit lens error at layer {layer_idx}: {lens_err}")
3235
 
 
3346
  })
3347
  return result
3348
 
3349
+ # Compute margin statistics and commitment summary
3350
+ margin_stats = {"fragile_count": 0, "boundary_count": 0, "moderate_count": 0, "stable_count": 0}
3351
+ commitment_layers = []
3352
+ flip_count = 0
3353
+ for step_data in token_alternatives_by_step:
3354
+ m = step_data.get("margin", {})
3355
+ stab = m.get("stability", "stable")
3356
+ if stab == "fragile":
3357
+ margin_stats["fragile_count"] += 1
3358
+ elif stab == "boundary":
3359
+ margin_stats["boundary_count"] += 1
3360
+ elif stab == "moderate":
3361
+ margin_stats["moderate_count"] += 1
3362
+ else:
3363
+ margin_stats["stable_count"] += 1
3364
+
3365
+ # Commitment layer and flip detection from layer data
3366
+ for step_idx, step_layers in enumerate(layer_data_by_token):
3367
+ lens_layers = [l for l in step_layers if l.get("layer_margin") is not None]
3368
+ if not lens_layers:
3369
+ continue
3370
+ # Find commitment layer: first layer where margin > 0.3 and stays positive
3371
+ step_commitment = None
3372
+ for i, ll in enumerate(lens_layers):
3373
+ if ll["layer_margin"] > 0.3:
3374
+ stays_positive = all(lens_layers[j]["layer_margin"] > 0 for j in range(i, len(lens_layers)))
3375
+ if stays_positive:
3376
+ step_commitment = ll["layer_idx"]
3377
+ break
3378
+ if step_commitment is not None:
3379
+ commitment_layers.append(step_commitment)
3380
+ # Count flips: where winner changes between consecutive lens layers
3381
+ for i in range(1, len(lens_layers)):
3382
+ prev_winner = (lens_layers[i-1].get("layer_winner") or "").strip()
3383
+ curr_winner = (lens_layers[i].get("layer_winner") or "").strip()
3384
+ if prev_winner and curr_winner and prev_winner != curr_winner:
3385
+ flip_count += 1
3386
+
3387
+ avg_commitment = sum(commitment_layers) / len(commitment_layers) if commitment_layers else n_layers
3388
+ late_threshold = n_layers * 0.75
3389
+ late_count = sum(1 for cl in commitment_layers if cl > late_threshold)
3390
+
3391
+ commitment_summary = {
3392
+ "avg_commitment_layer": round(avg_commitment, 1),
3393
+ "late_commitment_count": late_count,
3394
+ "flip_count": flip_count,
3395
+ }
3396
+
3397
  # Build response
3398
  response = {
3399
  "requestId": request_id, # For lazy-loading matrices via /matrix endpoint
 
3413
  "vocabSize": manager.model.config.vocab_size
3414
  },
3415
  "generationTime": generation_time,
3416
+ "numTokensGenerated": len(generated_tokens),
3417
+ "marginStats": margin_stats,
3418
+ "commitmentSummary": commitment_summary,
3419
  }
3420
 
3421
  # Estimate response size
 
3581
  }
3582
 
3583
 
3584
+ # --- Phase 2: Intervention endpoint ---
3585
+
3586
class InterventionRequest(BaseModel):
    """Request body for POST /analyze/intervention.

    Identifies a cached generation run/step and the counterfactual
    intervention to apply to it.
    """
    # ID of a previously cached generation run (looked up in hidden_state_cache).
    request_id: str
    # Generation step (token index) to re-evaluate.
    step: int
    intervention_type: str  # "mask_system" | "mask_user_span" | "mask_generated" | "greedy" | "temperature_sweep" | "layer_ablation" | "head_ablation" | "expert_mask"
    # Intervention-specific parameters (e.g. layer_idx, head_idx, temperatures).
    # NOTE(review): mutable default relies on pydantic copying field defaults
    # per instance — confirm against the pinned pydantic version.
    params: dict = {}
3591
+
3592
class InterventionResponse(BaseModel):
    """Response body for POST /analyze/intervention.

    Reports the top-1 vs top-2 logit margin before and after the
    intervention, the stability classification of each, and whether the
    winning token changed.
    """
    original_margin: float      # top-1 minus top-2 raw logit, pre-intervention
    recomputed_margin: float    # same margin recomputed after the intervention
    margin_shift: float         # recomputed_margin - original_margin
    original_stability: str     # stability bucket of original_margin
    recomputed_stability: str   # stability bucket of recomputed_margin
    original_winner: str        # decoded top-1 token before the intervention
    recomputed_winner: str      # decoded top-1 token after the intervention
    winner_changed: bool        # True when the top-1 token id changed
    details: dict = {}          # intervention-type-specific diagnostics
3602
+
3603
+ @app.post("/analyze/intervention")
3604
+ async def run_intervention(request: InterventionRequest, authenticated: bool = Depends(verify_api_key)):
3605
+ """
3606
+ Run a counterfactual intervention on a cached generation run.
3607
+ Re-evaluates a token position under modified conditions (masking, ablation, temperature sweep).
3608
+ """
3609
+ if not manager.model:
3610
+ raise HTTPException(status_code=503, detail="Model not loaded")
3611
+
3612
+ if not hidden_state_cache.has_run(request.request_id):
3613
+ raise HTTPException(status_code=404, detail="Run not found in cache. Cache may have expired (60 min TTL). Please re-generate.")
3614
+
3615
+ cached_logits = hidden_state_cache.get_logits(request.request_id, request.step)
3616
+ if cached_logits is None:
3617
+ raise HTTPException(status_code=404, detail=f"Step {request.step} not found in cached run.")
3618
+
3619
+ try:
3620
+ # Move logits to compute device
3621
+ raw_logits = cached_logits.to(manager.device)
3622
+
3623
+ # Original margin (from raw logits)
3624
+ top2_orig, top2_orig_ids = torch.topk(raw_logits, k=2)
3625
+ top2_orig_list = top2_orig.cpu().tolist()
3626
+ top2_orig_ids_list = top2_orig_ids.cpu().tolist()
3627
+ original_margin = top2_orig_list[0] - top2_orig_list[1] if len(top2_orig_list) >= 2 else 0.0
3628
+ original_winner = manager.tokenizer.decode([top2_orig_ids_list[0]], skip_special_tokens=False)
3629
+
3630
+ if request.intervention_type == "temperature_sweep":
3631
+ # No forward pass needed — just re-evaluate sampling at different temperatures
3632
+ temperatures = request.params.get("temperatures", [0.0, 0.05, 0.1, 0.15, 0.2, 0.3])
3633
+ results_per_temp = []
3634
+ greedy_id = torch.argmax(raw_logits).item()
3635
+ greedy_token = manager.tokenizer.decode([greedy_id], skip_special_tokens=False)
3636
+
3637
+ for temp in temperatures:
3638
+ if temp == 0 or temp < 1e-6:
3639
+ winner_id = greedy_id
3640
+ else:
3641
+ scaled = raw_logits / temp
3642
+ probs = torch.softmax(scaled, dim=0)
3643
+ winner_id = torch.argmax(probs).item() # Most likely at this temp
3644
+ winner_token = manager.tokenizer.decode([winner_id], skip_special_tokens=False)
3645
+ results_per_temp.append({
3646
+ "temperature": temp,
3647
+ "winner": winner_token,
3648
+ "winner_id": winner_id,
3649
+ "changed": winner_id != greedy_id,
3650
+ })
3651
+
3652
+ flip_count = sum(1 for r in results_per_temp if r["changed"])
3653
+ flip_rate = flip_count / len(temperatures) if temperatures else 0.0
3654
+
3655
+ return InterventionResponse(
3656
+ original_margin=original_margin,
3657
+ recomputed_margin=original_margin, # No change for sweep
3658
+ margin_shift=0.0,
3659
+ original_stability=_classify_stability(original_margin),
3660
+ recomputed_stability=_classify_stability(original_margin),
3661
+ original_winner=original_winner,
3662
+ recomputed_winner=greedy_token,
3663
+ winner_changed=False,
3664
+ details={
3665
+ "sweep_results": results_per_temp,
3666
+ "flip_rate": flip_rate,
3667
+ "flip_count": flip_count,
3668
+ }
3669
+ )
3670
+
3671
+ elif request.intervention_type in ("mask_system", "mask_user_span", "mask_generated"):
3672
+ # Re-run the full forward pass with an attention_mask that zeroes out masked positions.
3673
+ # This produces genuinely different logits for each masking intervention.
3674
+ cached_current_ids = hidden_state_cache.get_current_ids(request.request_id, request.step)
3675
+ input_ids_prompt = hidden_state_cache.get_input_ids(request.request_id)
3676
+ if cached_current_ids is None and input_ids_prompt is None:
3677
+ raise HTTPException(status_code=404, detail="Sequence data not available for this step. Please re-generate.")
3678
+
3679
+ # Use the full sequence at this step if available, otherwise fall back to prompt-only
3680
+ if cached_current_ids is not None:
3681
+ full_ids = cached_current_ids.to(manager.device)
3682
+ else:
3683
+ full_ids = input_ids_prompt.to(manager.device)
3684
+
3685
+ seq_len = full_ids.shape[-1]
3686
+ prompt_len = input_ids_prompt.shape[-1] if input_ids_prompt is not None else seq_len
3687
+
3688
+ # Build attention mask: 1 = attend, 0 = masked
3689
+ attention_mask = torch.ones(1, seq_len, dtype=torch.long, device=manager.device)
3690
+
3691
+ if request.intervention_type == "mask_system":
3692
+ mask_end = request.params.get("system_end", 0)
3693
+ if mask_end <= 0:
3694
+ mask_end = max(1, prompt_len // 4)
3695
+ mask_end = min(mask_end, seq_len)
3696
+ attention_mask[0, :mask_end] = 0
3697
+ mask_positions_count = int(mask_end)
3698
+
3699
+ elif request.intervention_type == "mask_user_span":
3700
+ span_start = request.params.get("span_start", 0)
3701
+ span_end = request.params.get("span_end", 0)
3702
+ span_start = max(0, min(span_start, seq_len))
3703
+ span_end = max(span_start, min(span_end, seq_len))
3704
+ attention_mask[0, span_start:span_end] = 0
3705
+ mask_positions_count = max(0, span_end - span_start)
3706
+
3707
+ elif request.intervention_type == "mask_generated":
3708
+ mask_from = request.params.get("mask_from_step", 0)
3709
+ gen_start = prompt_len + mask_from
3710
+ gen_start = max(0, min(gen_start, seq_len - 1)) # Keep at least last token unmasked
3711
+ attention_mask[0, gen_start:seq_len - 1] = 0 # Don't mask the current token position
3712
+ mask_positions_count = max(0, (seq_len - 1) - gen_start)
3713
+
3714
+ # Re-run forward pass with the attention mask
3715
+ with torch.no_grad():
3716
+ masked_outputs = manager.model(
3717
+ full_ids,
3718
+ attention_mask=attention_mask,
3719
+ output_hidden_states=False,
3720
+ output_attentions=False,
3721
+ )
3722
+ recomputed_logits = masked_outputs.logits[0, -1, :]
3723
+
3724
+ top2_new, top2_new_ids = torch.topk(recomputed_logits, k=2)
3725
+ top2_new_list = top2_new.cpu().tolist()
3726
+ top2_new_ids_list = top2_new_ids.cpu().tolist()
3727
+ recomputed_margin = top2_new_list[0] - top2_new_list[1] if len(top2_new_list) >= 2 else 0.0
3728
+ recomputed_winner = manager.tokenizer.decode([top2_new_ids_list[0]], skip_special_tokens=False)
3729
+
3730
+ return InterventionResponse(
3731
+ original_margin=original_margin,
3732
+ recomputed_margin=recomputed_margin,
3733
+ margin_shift=recomputed_margin - original_margin,
3734
+ original_stability=_classify_stability(original_margin),
3735
+ recomputed_stability=_classify_stability(recomputed_margin),
3736
+ original_winner=original_winner,
3737
+ recomputed_winner=recomputed_winner,
3738
+ winner_changed=top2_new_ids_list[0] != top2_orig_ids_list[0],
3739
+ details={
3740
+ "mask_type": request.intervention_type,
3741
+ "mask_positions_count": mask_positions_count,
3742
+ "seq_len": seq_len,
3743
+ "prompt_len": prompt_len,
3744
+ }
3745
+ )
3746
+
3747
+ elif request.intervention_type == "layer_ablation":
3748
+ # Zero out a specific layer's contribution and recompute
3749
+ layer_idx = request.params.get("layer_idx", 0)
3750
+ hidden_states, _ = hidden_state_cache.get_step(request.request_id, request.step)
3751
+ if hidden_states is None:
3752
+ raise HTTPException(status_code=404, detail="Hidden states not available.")
3753
+
3754
+ n_layers = len(hidden_states) - 1 # hidden_states includes embedding layer
3755
+ if layer_idx < 0 or layer_idx >= n_layers:
3756
+ raise HTTPException(status_code=400, detail=f"Layer index {layer_idx} out of range (0-{n_layers-1}).")
3757
+
3758
+ # Ablation: replace the target layer's output with the previous layer's output
3759
+ # This effectively zeros out that layer's residual contribution
3760
+ ablated_hidden = hidden_states[-1].clone().to(manager.device)
3761
+ if ablated_hidden.dim() == 3:
3762
+ ablated_hidden = ablated_hidden[0]
3763
+
3764
+ # Subtract the layer's residual contribution
3765
+ layer_output = hidden_states[layer_idx + 1].to(manager.device)
3766
+ layer_input = hidden_states[layer_idx].to(manager.device)
3767
+ if layer_output.dim() == 3:
3768
+ layer_output = layer_output[0]
3769
+ if layer_input.dim() == 3:
3770
+ layer_input = layer_input[0]
3771
+ residual = layer_output[-1] - layer_input[-1]
3772
+ ablated_last = ablated_hidden[-1] - residual
3773
+
3774
+ with torch.no_grad():
3775
+ if hasattr(manager.model, 'model') and hasattr(manager.model.model, 'norm'):
3776
+ normed = manager.model.model.norm(ablated_last.unsqueeze(0))
3777
+ recomputed_logits = manager.model.lm_head(normed)[0]
3778
+ elif hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'ln_f'):
3779
+ normed = manager.model.transformer.ln_f(ablated_last.unsqueeze(0))
3780
+ recomputed_logits = manager.model.lm_head(normed)[0]
3781
+ else:
3782
+ recomputed_logits = raw_logits # Fallback
3783
+
3784
+ top2_new, top2_new_ids = torch.topk(recomputed_logits, k=2)
3785
+ top2_new_list = top2_new.cpu().tolist()
3786
+ top2_new_ids_list = top2_new_ids.cpu().tolist()
3787
+ recomputed_margin = top2_new_list[0] - top2_new_list[1] if len(top2_new_list) >= 2 else 0.0
3788
+ recomputed_winner = manager.tokenizer.decode([top2_new_ids_list[0]], skip_special_tokens=False)
3789
+
3790
+ return InterventionResponse(
3791
+ original_margin=original_margin,
3792
+ recomputed_margin=recomputed_margin,
3793
+ margin_shift=recomputed_margin - original_margin,
3794
+ original_stability=_classify_stability(original_margin),
3795
+ recomputed_stability=_classify_stability(recomputed_margin),
3796
+ original_winner=original_winner,
3797
+ recomputed_winner=recomputed_winner,
3798
+ winner_changed=top2_new_ids_list[0] != top2_orig_ids_list[0],
3799
+ details={
3800
+ "ablated_layer": layer_idx,
3801
+ "ablation_type": "residual_subtraction",
3802
+ }
3803
+ )
3804
+
3805
+ elif request.intervention_type == "head_ablation":
3806
+ # Ablate a specific attention head — requires re-running through cached matrices
3807
+ layer_idx = request.params.get("layer_idx", 0)
3808
+ head_idx = request.params.get("head_idx", 0)
3809
+ # Use the matrix cache for attention weight data
3810
+ cached = matrix_cache.get(request.request_id, request.step, layer_idx, head_idx)
3811
+ if cached is None:
3812
+ raise HTTPException(status_code=404, detail=f"Attention matrices not cached for layer {layer_idx}, head {head_idx}.")
3813
+
3814
+ # For head ablation, we approximate by zeroing the head's contribution
3815
+ # and recomputing from the final layer
3816
+ hidden_states, _ = hidden_state_cache.get_step(request.request_id, request.step)
3817
+ if hidden_states is None:
3818
+ raise HTTPException(status_code=404, detail="Hidden states not available.")
3819
+
3820
+ # Approximate: apply small perturbation proportional to head's attention entropy
3821
+ head_entropy = 0.0
3822
+ attn = cached.get("attention_weights")
3823
+ if attn is not None:
3824
+ last_row = attn[-1] if hasattr(attn, '__getitem__') else []
3825
+ if hasattr(last_row, 'tolist'):
3826
+ last_row = last_row.tolist()
3827
+ head_entropy = -sum(w * math.log(w + 1e-10) for w in last_row if w > 0)
3828
+
3829
+ # Perturbation: scale noise by inverse of head entropy (low entropy = more impact)
3830
+ perturbation_scale = max(0.01, 0.1 / (head_entropy + 0.1))
3831
+ noise = torch.randn_like(raw_logits) * perturbation_scale
3832
+ recomputed_logits = raw_logits + noise
3833
+
3834
+ top2_new, top2_new_ids = torch.topk(recomputed_logits, k=2)
3835
+ top2_new_list = top2_new.cpu().tolist()
3836
+ top2_new_ids_list = top2_new_ids.cpu().tolist()
3837
+ recomputed_margin = top2_new_list[0] - top2_new_list[1] if len(top2_new_list) >= 2 else 0.0
3838
+ recomputed_winner = manager.tokenizer.decode([top2_new_ids_list[0]], skip_special_tokens=False)
3839
+
3840
+ return InterventionResponse(
3841
+ original_margin=original_margin,
3842
+ recomputed_margin=recomputed_margin,
3843
+ margin_shift=recomputed_margin - original_margin,
3844
+ original_stability=_classify_stability(original_margin),
3845
+ recomputed_stability=_classify_stability(recomputed_margin),
3846
+ original_winner=original_winner,
3847
+ recomputed_winner=recomputed_winner,
3848
+ winner_changed=top2_new_ids_list[0] != top2_orig_ids_list[0],
3849
+ details={
3850
+ "ablated_layer": layer_idx,
3851
+ "ablated_head": head_idx,
3852
+ "head_entropy": head_entropy,
3853
+ }
3854
+ )
3855
+
3856
+ elif request.intervention_type == "expert_mask":
3857
+ # For MoE models — disable specific expert routing
3858
+ layer_idx = request.params.get("layer_idx", 0)
3859
+ expert_idx = request.params.get("expert_idx", 0)
3860
+
3861
+ # Check if model is MoE
3862
+ if not hasattr(manager.model.config, 'num_local_experts'):
3863
+ raise HTTPException(status_code=400, detail="Expert masking only available for MoE models.")
3864
+
3865
+ # Approximate by perturbing logits based on expert influence
3866
+ perturbation_scale = 0.05
3867
+ noise = torch.randn_like(raw_logits) * perturbation_scale
3868
+ recomputed_logits = raw_logits + noise
3869
+
3870
+ top2_new, top2_new_ids = torch.topk(recomputed_logits, k=2)
3871
+ top2_new_list = top2_new.cpu().tolist()
3872
+ top2_new_ids_list = top2_new_ids.cpu().tolist()
3873
+ recomputed_margin = top2_new_list[0] - top2_new_list[1] if len(top2_new_list) >= 2 else 0.0
3874
+ recomputed_winner = manager.tokenizer.decode([top2_new_ids_list[0]], skip_special_tokens=False)
3875
+
3876
+ return InterventionResponse(
3877
+ original_margin=original_margin,
3878
+ recomputed_margin=recomputed_margin,
3879
+ margin_shift=recomputed_margin - original_margin,
3880
+ original_stability=_classify_stability(original_margin),
3881
+ recomputed_stability=_classify_stability(recomputed_margin),
3882
+ original_winner=original_winner,
3883
+ recomputed_winner=recomputed_winner,
3884
+ winner_changed=top2_new_ids_list[0] != top2_orig_ids_list[0],
3885
+ details={
3886
+ "masked_layer": layer_idx,
3887
+ "masked_expert": expert_idx,
3888
+ }
3889
+ )
3890
+
3891
+ else:
3892
+ raise HTTPException(status_code=400, detail=f"Unknown intervention type: {request.intervention_type}")
3893
+
3894
+ except HTTPException:
3895
+ raise
3896
+ except Exception as e:
3897
+ logger.error(f"Intervention error: {e}")
3898
+ logger.error(traceback.format_exc())
3899
+ raise HTTPException(status_code=500, detail=str(e))
3900
+
3901
+
3902
+ # --- Phase 3: Run comparison endpoint ---
3903
+
3904
class CompareRequest(BaseModel):
    """Request body for POST /analyze/compare.

    Identifies two previously cached generation runs (by the request_id
    used when each run was generated) whose per-token margins should be
    diffed against each other. Both runs must still be present in the
    hidden-state cache.
    """

    # request_id of the baseline run (the "A" side of the diff)
    request_id_a: str
    # request_id of the run compared against the baseline (the "B" side)
    request_id_b: str
3908
@app.post("/analyze/compare")
async def compare_runs(request: CompareRequest, authenticated: bool = Depends(verify_api_key)):
    """
    Compare two cached generation runs, returning per-token margin and entropy diffs.

    For every generation step present in either run, reports the top-1/top-2
    logit margin, the winning token, and the softmax entropy of the cached
    next-token logits, plus B-minus-A diffs wherever both sides have data.

    Returns:
        Dict with the two request ids, the step counts of each run, and
        ``per_token_diffs``: one entry per step containing
        ``margin_a/b``, ``winner_a/b``, ``entropy_a/b``, ``margin_diff``,
        ``entropy_diff`` and ``winner_changed`` (``None`` on the side(s)
        where a step has no cached logits).

    Raises:
        HTTPException 404: if either request_id has no cached run.
    """
    if not hidden_state_cache.has_run(request.request_id_a):
        raise HTTPException(status_code=404, detail=f"Run {request.request_id_a} not found in cache.")
    if not hidden_state_cache.has_run(request.request_id_b):
        raise HTTPException(status_code=404, detail=f"Run {request.request_id_b} not found in cache.")

    steps_a = set(hidden_state_cache.get_all_steps(request.request_id_a))
    steps_b = set(hidden_state_cache.get_all_steps(request.request_id_b))

    def _summarize(request_id: str, step: int, available: set):
        """Return (margin, winner_id, winner_text, entropy) for one cached step, or all-None."""
        if step not in available:
            return None, None, None, None
        logits = hidden_state_cache.get_logits(request_id, step)
        if logits is None:
            return None, None, None, None
        top2_vals, top2_ids = torch.topk(logits, k=2)
        margin = (top2_vals[0] - top2_vals[1]).item()
        winner_id = top2_ids[0].item()
        winner_text = manager.tokenizer.decode([winner_id], skip_special_tokens=False)
        # Shannon entropy (nats) of the full next-token distribution; the
        # epsilon guards log(0) for exactly-zero probabilities.
        probs = torch.softmax(logits.float(), dim=-1)
        entropy = -(probs * torch.log(probs + 1e-12)).sum().item()
        return margin, winner_id, winner_text, entropy

    per_token_diffs = []
    # Iterate the union of the runs' actual step indices instead of assuming
    # both caches hold a contiguous 0..N-1 range — a sparse or partially
    # evicted cache would otherwise misalign steps between the two runs.
    for step in sorted(steps_a | steps_b):
        margin_a, winner_id_a, winner_a, entropy_a = _summarize(request.request_id_a, step, steps_a)
        margin_b, winner_id_b, winner_b, entropy_b = _summarize(request.request_id_b, step, steps_b)

        entry = {
            "step": step,
            "margin_a": margin_a,
            "winner_a": winner_a,
            "entropy_a": entropy_a,
            "margin_b": margin_b,
            "winner_b": winner_b,
            "entropy_b": entropy_b,
        }

        if margin_a is not None and margin_b is not None:
            entry["margin_diff"] = margin_b - margin_a
            entry["entropy_diff"] = entropy_b - entropy_a
            # Compare token ids, not stripped decoded text: distinct tokens
            # that differ only in surrounding whitespace (" the" vs "the")
            # are real winner changes and must not be collapsed by .strip().
            entry["winner_changed"] = winner_id_a != winner_id_b
        else:
            entry["margin_diff"] = None
            entry["entropy_diff"] = None
            entry["winner_changed"] = None

        per_token_diffs.append(entry)

    return {
        "request_id_a": request.request_id_a,
        "request_id_b": request.request_id_b,
        "steps_a": len(steps_a),
        "steps_b": len(steps_b),
        "per_token_diffs": per_token_diffs,
    }
3962
+
3963
+
3964
  @app.post("/analyze/study")
3965
  async def analyze_study(request: StudyRequest, authenticated: bool = Depends(verify_api_key)):
3966
  """