gary-boon Claude Opus 4.6 committed on
Commit
54d9b6e
·
1 Parent(s): d8d197a

Add deep inspection: data-driven pattern classification, attention/MLP tracking, logit lens

Browse files

- Replace hardcoded position-based head/layer pattern classification with
data-driven detection (sink, previous-token, local, induction, positional, semantic)
- Layer patterns computed as confidence-weighted majority vote of head patterns
- Add forward hooks for attention and MLP output norms per layer
- Add logit lens computation at sampled layers (every n_layers//5)
- Compute per-head sink weight, local weight, and induction weight metrics

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. backend/model_service.py +205 -43
backend/model_service.py CHANGED
@@ -2065,26 +2065,51 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
2065
  entropy = 0.0 if math.isnan(entropy) or math.isinf(entropy) else entropy
2066
  avg_entropy = 0.0 if math.isnan(avg_entropy) or math.isinf(avg_entropy) else avg_entropy
2067
 
2068
- # Classify pattern
 
2069
  pattern_type = None
2070
  confidence = 0.0
2071
 
2072
- # Induction pattern: high attention to previous similar tokens
2073
- if step > 0 and max_weight > 0.8:
2074
- pattern_type = "induction"
2075
- confidence = max_weight
2076
- # Positional pattern: attention focused on nearby tokens
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2077
  elif entropy < 1.0:
2078
  pattern_type = "positional"
2079
  confidence = 1.0 - entropy
2080
- # Semantic pattern: broader attention with moderate entropy
2081
- elif 1.0 <= entropy < 2.5:
2082
  pattern_type = "semantic"
2083
- confidence = min(1.0, entropy / 2.5)
2084
- # Previous token pattern: sharp focus on immediate predecessor
2085
- elif max_weight > 0.9 and head_weights[-2].item() > 0.85:
2086
- pattern_type = "previous_token"
2087
- confidence = head_weights[-2].item()
2088
 
2089
  # Sanitize confidence
2090
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
@@ -2129,17 +2154,21 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
2129
  # Sort by max_weight (return all heads, frontend will decide how many to display)
2130
  critical_heads.sort(key=lambda h: h["max_weight"], reverse=True)
2131
 
2132
- # Detect layer-level pattern (percentage-based for any layer count)
 
 
 
 
 
 
2133
  layer_pattern = None
2134
- layer_fraction = (layer_idx + 1) / n_layers # 1-indexed fraction
2135
- if layer_idx == 0:
2136
- layer_pattern = {"type": "positional", "confidence": 0.78}
2137
- elif layer_fraction <= 0.25 and step > 0:
2138
- layer_pattern = {"type": "previous_token", "confidence": 0.65}
2139
- elif layer_fraction <= 0.75:
2140
- layer_pattern = {"type": "induction", "confidence": 0.87}
2141
- else:
2142
- layer_pattern = {"type": "semantic", "confidence": 0.92}
2143
 
2144
  layer_data_this_token.append({
2145
  "layer_idx": layer_idx,
@@ -2506,6 +2535,55 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2506
  except Exception as hook_error:
2507
  logger.warning(f"Could not register QKV hooks: {hook_error}")
2508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2509
  with torch.no_grad():
2510
  current_ids = inputs["input_ids"]
2511
 
@@ -2520,6 +2598,8 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2520
  await asyncio.sleep(0)
2521
 
2522
  qkv_captures.clear()
 
 
2523
 
2524
  # Forward pass with full outputs
2525
  outputs = manager.model(
@@ -2775,13 +2855,37 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2775
  # Previous-token weights for pattern detection: [n_heads]
2776
  all_prev_token_weights = all_last_row[:, -2] if all_last_row.shape[1] >= 2 else torch.zeros(num_heads_layer, device=layer_attn.device)
2777
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2778
  # Single bulk transfer: all head metrics to CPU
2779
- head_metrics_gpu = torch.stack([all_max_weights, all_entropies, all_avg_entropies, all_prev_token_weights]) # [4, n_heads]
 
 
 
2780
  head_metrics_cpu = head_metrics_gpu.cpu().tolist() # one sync point
2781
  max_weights_list = head_metrics_cpu[0]
2782
  entropies_list = head_metrics_cpu[1]
2783
  avg_entropies_list = head_metrics_cpu[2]
2784
  prev_token_list = head_metrics_cpu[3]
 
 
 
2785
 
2786
  # Bulk transfer attention matrices to CPU: one .cpu() for entire layer
2787
  layer_attn_cpu = layer_attn.cpu().float().numpy() # [n_heads, seq_len, seq_len]
@@ -2796,26 +2900,42 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2796
  ent = entropies_list[head_idx]
2797
  avg_ent = avg_entropies_list[head_idx]
2798
  ptw = prev_token_list[head_idx]
 
 
 
2799
 
2800
  # Sanitize
2801
  mw = 0.0 if math.isnan(mw) or math.isinf(mw) else mw
2802
  ent = 0.0 if math.isnan(ent) or math.isinf(ent) else ent
2803
  avg_ent = 0.0 if math.isnan(avg_ent) or math.isinf(avg_ent) else avg_ent
2804
 
 
2805
  pattern_type = None
2806
  confidence = 0.0
2807
- if step > 0 and mw > 0.8:
 
 
 
 
 
 
 
 
 
 
 
 
 
2808
  pattern_type = "induction"
2809
- confidence = mw
 
2810
  elif ent < 1.0:
2811
  pattern_type = "positional"
2812
  confidence = 1.0 - ent
2813
- elif 1.0 <= ent < 2.5:
 
2814
  pattern_type = "semantic"
2815
- confidence = min(1.0, ent / 2.5)
2816
- elif mw > 0.9 and ptw > 0.85:
2817
- pattern_type = "previous_token"
2818
- confidence = ptw
2819
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
2820
 
2821
  attention_matrix = layer_attn_cpu[head_idx]
@@ -2846,18 +2966,23 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2846
 
2847
  critical_heads.sort(key=lambda h: h["max_weight"], reverse=True)
2848
 
 
 
 
 
 
 
 
2849
  layer_pattern = None
2850
- layer_fraction = (layer_idx + 1) / n_layers
2851
- if layer_idx == 0:
2852
- layer_pattern = {"type": "positional", "confidence": 0.78}
2853
- elif layer_fraction <= 0.25 and step > 0:
2854
- layer_pattern = {"type": "previous_token", "confidence": 0.65}
2855
- elif layer_fraction <= 0.75:
2856
- layer_pattern = {"type": "induction", "confidence": 0.87}
2857
- else:
2858
- layer_pattern = {"type": "semantic", "confidence": 0.92}
2859
-
2860
- layer_data_this_token.append({
2861
  "layer_idx": layer_idx,
2862
  "pattern": layer_pattern,
2863
  "critical_heads": critical_heads,
@@ -2865,7 +2990,44 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2865
  "activation_entropy": activation_entropy,
2866
  "hidden_state_norm": hidden_state_norm,
2867
  "delta_norm": delta_norm
2868
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2869
 
2870
  layer_data_by_token.append(layer_data_this_token)
2871
 
 
2065
  entropy = 0.0 if math.isnan(entropy) or math.isinf(entropy) else entropy
2066
  avg_entropy = 0.0 if math.isnan(avg_entropy) or math.isinf(avg_entropy) else avg_entropy
2067
 
2068
+ # Data-driven head pattern classification (priority order)
2069
+ seq_len_hw = head_weights.shape[0]
2070
  pattern_type = None
2071
  confidence = 0.0
2072
 
2073
+ # 1. Attention sink: >50% weight on positions 0-2
2074
+ sink_w = head_weights[:min(3, seq_len_hw)].sum().item()
2075
+ if sink_w > 0.5:
2076
+ pattern_type = "attention_sink"
2077
+ confidence = sink_w
2078
+ # 2. Previous token: sharp focus on immediate predecessor
2079
+ elif max_weight > 0.9 and head_weights[-2].item() > 0.85:
2080
+ pattern_type = "previous_token"
2081
+ confidence = head_weights[-2].item()
2082
+ # 3. Local: >80% weight within 5 positions of query
2083
+ elif seq_len_hw > 5 and head_weights[max(0, seq_len_hw - 5):].sum().item() > 0.8:
2084
+ pattern_type = "local"
2085
+ confidence = head_weights[max(0, seq_len_hw - 5):].sum().item()
2086
+ # 4. Induction: attends to positions following previous occurrences of current token
2087
+ elif step > 0:
2088
+ current_tok = current_ids[0, -1]
2089
+ prev_occ = (current_ids[0, :-1] == current_tok).nonzero(as_tuple=True)[0]
2090
+ if len(prev_occ) > 0:
2091
+ foll = prev_occ + 1
2092
+ foll = foll[foll < seq_len_hw]
2093
+ if len(foll) > 0:
2094
+ ind_w = head_weights[foll].sum().item()
2095
+ if ind_w > 0.3:
2096
+ pattern_type = "induction"
2097
+ confidence = min(1.0, ind_w)
2098
+ if pattern_type is None:
2099
+ if entropy < 1.0:
2100
+ pattern_type = "positional"
2101
+ confidence = 1.0 - entropy
2102
+ elif entropy >= 1.0:
2103
+ pattern_type = "semantic"
2104
+ confidence = min(1.0, 0.5)
2105
+ # 5. Positional: low entropy, focused attention
2106
  elif entropy < 1.0:
2107
  pattern_type = "positional"
2108
  confidence = 1.0 - entropy
2109
+ # 6. Semantic: broad attention (fallback)
2110
+ elif entropy >= 1.0:
2111
  pattern_type = "semantic"
2112
+ confidence = min(1.0, 0.5)
 
 
 
 
2113
 
2114
  # Sanitize confidence
2115
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
 
2154
  # Sort by max_weight (return all heads, frontend will decide how many to display)
2155
  critical_heads.sort(key=lambda h: h["max_weight"], reverse=True)
2156
 
2157
+ # Layer-level pattern: majority vote of head patterns, weighted by confidence
2158
+ pattern_votes = {}
2159
+ for h in critical_heads:
2160
+ if h["pattern"] and h["pattern"]["type"]:
2161
+ pt = h["pattern"]["type"]
2162
+ pc = h["pattern"]["confidence"]
2163
+ pattern_votes[pt] = pattern_votes.get(pt, 0.0) + pc
2164
  layer_pattern = None
2165
+ if pattern_votes:
2166
+ best_type = max(pattern_votes, key=pattern_votes.get)
2167
+ total_conf = sum(pattern_votes.values())
2168
+ layer_pattern = {
2169
+ "type": best_type,
2170
+ "confidence": round(pattern_votes[best_type] / total_conf, 3) if total_conf > 0 else 0.0
2171
+ }
 
 
2172
 
2173
  layer_data_this_token.append({
2174
  "layer_idx": layer_idx,
 
2535
  except Exception as hook_error:
2536
  logger.warning(f"Could not register QKV hooks: {hook_error}")
2537
 
2538
+ # Phase 4: Hooks for attention and MLP output norms
2539
+ attn_output_norms = {}
2540
+ mlp_output_norms = {}
2541
+
2542
+ def make_attn_output_hook(layer_idx):
2543
+ def hook(module, input, output):
2544
+ try:
2545
+ out = output[0] if isinstance(output, tuple) else output
2546
+ if out.dim() == 3:
2547
+ attn_output_norms[layer_idx] = torch.norm(out[0, -1]).item()
2548
+ except Exception:
2549
+ pass
2550
+ return hook
2551
+
2552
+ def make_mlp_output_hook(layer_idx):
2553
+ def hook(module, input, output):
2554
+ try:
2555
+ out = output[0] if isinstance(output, tuple) else output
2556
+ if out.dim() == 3:
2557
+ mlp_output_norms[layer_idx] = torch.norm(out[0, -1]).item()
2558
+ elif out.dim() == 2:
2559
+ mlp_output_norms[layer_idx] = torch.norm(out[-1]).item()
2560
+ except Exception:
2561
+ pass
2562
+ return hook
2563
+
2564
+ try:
2565
+ # CodeGen style
2566
+ if hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
2567
+ for layer_idx, layer in enumerate(manager.model.transformer.h):
2568
+ if hasattr(layer, 'attn'):
2569
+ hook = layer.attn.register_forward_hook(make_attn_output_hook(layer_idx))
2570
+ hooks.append(hook)
2571
+ if hasattr(layer, 'mlp'):
2572
+ hook = layer.mlp.register_forward_hook(make_mlp_output_hook(layer_idx))
2573
+ hooks.append(hook)
2574
+ # Mistral/LLaMA style
2575
+ elif hasattr(manager.model, 'model') and hasattr(manager.model.model, 'layers'):
2576
+ for layer_idx, layer in enumerate(manager.model.model.layers):
2577
+ if hasattr(layer, 'self_attn'):
2578
+ hook = layer.self_attn.register_forward_hook(make_attn_output_hook(layer_idx))
2579
+ hooks.append(hook)
2580
+ if hasattr(layer, 'mlp'):
2581
+ hook = layer.mlp.register_forward_hook(make_mlp_output_hook(layer_idx))
2582
+ hooks.append(hook)
2583
+ logger.info(f"Registered attn/MLP output hooks for contribution tracking")
2584
+ except Exception as hook_error:
2585
+ logger.warning(f"Could not register attn/MLP hooks: {hook_error}")
2586
+
2587
  with torch.no_grad():
2588
  current_ids = inputs["input_ids"]
2589
 
 
2598
  await asyncio.sleep(0)
2599
 
2600
  qkv_captures.clear()
2601
+ attn_output_norms.clear()
2602
+ mlp_output_norms.clear()
2603
 
2604
  # Forward pass with full outputs
2605
  outputs = manager.model(
 
2855
  # Previous-token weights for pattern detection: [n_heads]
2856
  all_prev_token_weights = all_last_row[:, -2] if all_last_row.shape[1] >= 2 else torch.zeros(num_heads_layer, device=layer_attn.device)
2857
 
2858
+ # Attention sink weights: sum of attention on positions 0-2 per head [n_heads]
2859
+ seq_len_attn = all_last_row.shape[1]
2860
+ all_sink_weights = all_last_row[:, :min(3, seq_len_attn)].sum(dim=-1)
2861
+
2862
+ # Local attention weights: sum within 5 positions of query per head [n_heads]
2863
+ all_local_weights = all_last_row[:, max(0, seq_len_attn - 5):].sum(dim=-1) if seq_len_attn > 5 else torch.ones(num_heads_layer, device=layer_attn.device)
2864
+
2865
+ # Induction detection: attention to positions following previous occurrences of current token
2866
+ all_induction_weights = torch.zeros(num_heads_layer, device=layer_attn.device)
2867
+ if step > 0:
2868
+ current_token = current_ids[0, -1]
2869
+ prev_occurrences = (current_ids[0, :-1] == current_token).nonzero(as_tuple=True)[0]
2870
+ if len(prev_occurrences) > 0:
2871
+ following_positions = prev_occurrences + 1
2872
+ following_positions = following_positions[following_positions < seq_len_attn]
2873
+ if len(following_positions) > 0:
2874
+ all_induction_weights = all_last_row[:, following_positions].sum(dim=-1)
2875
+
2876
  # Single bulk transfer: all head metrics to CPU
2877
+ head_metrics_gpu = torch.stack([
2878
+ all_max_weights, all_entropies, all_avg_entropies, all_prev_token_weights,
2879
+ all_sink_weights, all_local_weights, all_induction_weights
2880
+ ]) # [7, n_heads]
2881
  head_metrics_cpu = head_metrics_gpu.cpu().tolist() # one sync point
2882
  max_weights_list = head_metrics_cpu[0]
2883
  entropies_list = head_metrics_cpu[1]
2884
  avg_entropies_list = head_metrics_cpu[2]
2885
  prev_token_list = head_metrics_cpu[3]
2886
+ sink_weights_list = head_metrics_cpu[4]
2887
+ local_weights_list = head_metrics_cpu[5]
2888
+ induction_weights_list = head_metrics_cpu[6]
2889
 
2890
  # Bulk transfer attention matrices to CPU: one .cpu() for entire layer
2891
  layer_attn_cpu = layer_attn.cpu().float().numpy() # [n_heads, seq_len, seq_len]
 
2900
  ent = entropies_list[head_idx]
2901
  avg_ent = avg_entropies_list[head_idx]
2902
  ptw = prev_token_list[head_idx]
2903
+ skw = sink_weights_list[head_idx]
2904
+ lcw = local_weights_list[head_idx]
2905
+ idw = induction_weights_list[head_idx]
2906
 
2907
  # Sanitize
2908
  mw = 0.0 if math.isnan(mw) or math.isinf(mw) else mw
2909
  ent = 0.0 if math.isnan(ent) or math.isinf(ent) else ent
2910
  avg_ent = 0.0 if math.isnan(avg_ent) or math.isinf(avg_ent) else avg_ent
2911
 
2912
+ # Data-driven head pattern classification (priority order)
2913
  pattern_type = None
2914
  confidence = 0.0
2915
+ # 1. Attention sink: >50% weight on positions 0-2
2916
+ if skw > 0.5:
2917
+ pattern_type = "attention_sink"
2918
+ confidence = skw
2919
+ # 2. Previous token: sharp focus on immediate predecessor
2920
+ elif mw > 0.9 and ptw > 0.85:
2921
+ pattern_type = "previous_token"
2922
+ confidence = ptw
2923
+ # 3. Local: >80% weight within 5 positions of query
2924
+ elif seq_len_attn > 5 and lcw > 0.8:
2925
+ pattern_type = "local"
2926
+ confidence = lcw
2927
+ # 4. Induction: attends to positions following previous occurrences of current token
2928
+ elif step > 0 and idw > 0.3:
2929
  pattern_type = "induction"
2930
+ confidence = min(1.0, idw)
2931
+ # 5. Positional: low entropy, focused attention
2932
  elif ent < 1.0:
2933
  pattern_type = "positional"
2934
  confidence = 1.0 - ent
2935
+ # 6. Semantic: broad attention (fallback)
2936
+ elif ent >= 1.0:
2937
  pattern_type = "semantic"
2938
+ confidence = min(1.0, 0.5)
 
 
 
2939
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
2940
 
2941
  attention_matrix = layer_attn_cpu[head_idx]
 
2966
 
2967
  critical_heads.sort(key=lambda h: h["max_weight"], reverse=True)
2968
 
2969
+ # Layer-level pattern: majority vote of head patterns, weighted by confidence
2970
+ pattern_votes = {}
2971
+ for h in critical_heads:
2972
+ if h["pattern"] and h["pattern"]["type"]:
2973
+ pt = h["pattern"]["type"]
2974
+ pc = h["pattern"]["confidence"]
2975
+ pattern_votes[pt] = pattern_votes.get(pt, 0.0) + pc
2976
  layer_pattern = None
2977
+ if pattern_votes:
2978
+ best_type = max(pattern_votes, key=pattern_votes.get)
2979
+ total_conf = sum(pattern_votes.values())
2980
+ layer_pattern = {
2981
+ "type": best_type,
2982
+ "confidence": round(pattern_votes[best_type] / total_conf, 3) if total_conf > 0 else 0.0
2983
+ }
2984
+
2985
+ layer_entry = {
 
 
2986
  "layer_idx": layer_idx,
2987
  "pattern": layer_pattern,
2988
  "critical_heads": critical_heads,
 
2990
  "activation_entropy": activation_entropy,
2991
  "hidden_state_norm": hidden_state_norm,
2992
  "delta_norm": delta_norm
2993
+ }
2994
+ # Phase 4: Attention and MLP output norms
2995
+ if layer_idx in attn_output_norms:
2996
+ layer_entry["attn_output_norm"] = attn_output_norms[layer_idx]
2997
+ if layer_idx in mlp_output_norms:
2998
+ layer_entry["mlp_output_norm"] = mlp_output_norms[layer_idx]
2999
+
3000
+ # Phase 5: Logit lens at sampled layers (every 8th layer)
3001
+ logit_lens_stride = max(1, n_layers // 5)
3002
+ if layer_idx % logit_lens_stride == 0 or layer_idx == n_layers - 1:
3003
+ try:
3004
+ hidden_for_lens = current_hidden[-1].unsqueeze(0) # [1, hidden_dim]
3005
+ # Apply final layer norm then project through lm_head
3006
+ if hasattr(manager.model, 'model') and hasattr(manager.model.model, 'norm'):
3007
+ normed = manager.model.model.norm(hidden_for_lens)
3008
+ lens_logits = manager.model.lm_head(normed)[0] # [vocab_size]
3009
+ elif hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'ln_f'):
3010
+ normed = manager.model.transformer.ln_f(hidden_for_lens)
3011
+ lens_logits = manager.model.lm_head(normed)[0]
3012
+ else:
3013
+ lens_logits = None
3014
+
3015
+ if lens_logits is not None:
3016
+ lens_probs = torch.softmax(lens_logits, dim=-1)
3017
+ top_probs, top_ids = torch.topk(lens_probs, k=5)
3018
+ top_probs_list = top_probs.cpu().tolist()
3019
+ top_ids_list = top_ids.cpu().tolist()
3020
+ lens_entries = []
3021
+ for tp, tid in zip(top_probs_list, top_ids_list):
3022
+ lens_entries.append({
3023
+ "token": manager.tokenizer.decode([tid], skip_special_tokens=False),
3024
+ "probability": tp
3025
+ })
3026
+ layer_entry["logit_lens_top"] = lens_entries
3027
+ except Exception as lens_err:
3028
+ logger.debug(f"Logit lens error at layer {layer_idx}: {lens_err}")
3029
+
3030
+ layer_data_this_token.append(layer_entry)
3031
 
3032
  layer_data_by_token.append(layer_data_this_token)
3033