Spaces:
Paused
Paused
gary-boon Claude Opus 4.6 committed on
Commit ·
d8d197a
1
Parent(s): 9978aec
Change default model to Devstral and optimise attention extraction
Switch the default model fallback from codegen-350m to devstral-small,
matching the instruction-tuned model used for PhD research. Also includes
batched tensor operations for attention extraction (reduces GPU→CPU sync
points from ~4000 to ~40 per token).
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- backend/model_service.py +111 -61
backend/model_service.py
CHANGED
|
@@ -294,7 +294,7 @@ class ModelManager:
|
|
| 294 |
self.trace_buffer: List[TraceData] = []
|
| 295 |
|
| 296 |
# Read configuration from environment variables
|
| 297 |
-
self.model_id = os.environ.get("DEFAULT_MODEL", "
|
| 298 |
self.max_context = int(os.environ.get("MAX_CONTEXT", "8192"))
|
| 299 |
self.batch_size = int(os.environ.get("BATCH_SIZE", "1"))
|
| 300 |
|
|
@@ -2676,107 +2676,158 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
|
|
| 2676 |
"sampling": sampling_metadata
|
| 2677 |
})
|
| 2678 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2679 |
# === STAGE 3: EXTRACTING (per layer within each token) ===
|
|
|
|
|
|
|
| 2680 |
layer_data_this_token = []
|
|
|
|
| 2681 |
|
| 2682 |
-
for layer_idx in range(
|
| 2683 |
# Emit extraction progress (within generating stage for combined progress)
|
| 2684 |
if step == max_tokens - 1: # Only emit detailed layer progress on last token
|
| 2685 |
-
layer_progress = (layer_idx /
|
| 2686 |
-
overall_progress = 30 + (layer_idx /
|
| 2687 |
yield sse_event('extracting', stage=3, totalStages=5, progress=overall_progress,
|
| 2688 |
stageProgress=layer_progress,
|
| 2689 |
-
detail=f'Processing layer {layer_idx + 1}/{
|
| 2690 |
-
metadata={'layerIndex': layer_idx, 'totalLayers':
|
| 2691 |
'headsPerLayer': n_heads, 'stepIndex': step, 'totalSteps': max_tokens})
|
| 2692 |
-
if layer_idx % 5 == 0:
|
| 2693 |
await asyncio.sleep(0)
|
| 2694 |
|
| 2695 |
-
|
|
|
|
| 2696 |
current_hidden = outputs.hidden_states[layer_idx + 1]
|
| 2697 |
if current_hidden.dim() == 3:
|
| 2698 |
current_hidden = current_hidden[0]
|
| 2699 |
|
|
|
|
|
|
|
| 2700 |
if layer_idx > 0:
|
| 2701 |
prev_hidden = outputs.hidden_states[layer_idx]
|
| 2702 |
if prev_hidden.dim() == 3:
|
| 2703 |
prev_hidden = prev_hidden[0]
|
| 2704 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2705 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2706 |
delta_norm = None
|
| 2707 |
|
| 2708 |
-
|
| 2709 |
-
last_token_hidden = current_hidden[-1]
|
| 2710 |
-
activation_entropy = torch.std(last_token_hidden).item()
|
| 2711 |
-
hidden_state_norm = torch.norm(last_token_hidden).item()
|
| 2712 |
-
|
| 2713 |
-
# Sanitize
|
| 2714 |
activation_magnitude = 0.0 if math.isnan(activation_magnitude) or math.isinf(activation_magnitude) else activation_magnitude
|
| 2715 |
activation_entropy = 0.0 if math.isnan(activation_entropy) or math.isinf(activation_entropy) else activation_entropy
|
| 2716 |
hidden_state_norm = 0.0 if math.isnan(hidden_state_norm) or math.isinf(hidden_state_norm) else hidden_state_norm
|
| 2717 |
if delta_norm is not None:
|
| 2718 |
delta_norm = 0.0 if math.isnan(delta_norm) or math.isinf(delta_norm) else delta_norm
|
| 2719 |
|
| 2720 |
-
#
|
| 2721 |
-
|
| 2722 |
-
|
| 2723 |
-
|
| 2724 |
-
|
| 2725 |
-
|
|
|
|
|
|
|
| 2726 |
|
| 2727 |
-
|
| 2728 |
-
|
| 2729 |
-
# This produces values in [0,1] with better spread across heads
|
| 2730 |
-
head_attn = layer_attn[head_idx] # [q_len, k_len]
|
| 2731 |
-
q_len = head_attn.shape[0]
|
| 2732 |
|
| 2733 |
-
|
| 2734 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2735 |
|
| 2736 |
-
|
| 2737 |
-
|
| 2738 |
-
max_entropies = torch.log(positions + 1e-10)
|
| 2739 |
-
normalized_entropies = token_entropies / (max_entropies + 1e-10) # [0, 1] range
|
| 2740 |
|
| 2741 |
-
|
| 2742 |
-
|
| 2743 |
-
avg_entropy = normalized_entropies[start_idx:].mean().item() if start_idx < q_len else normalized_entropies.mean().item()
|
| 2744 |
|
| 2745 |
-
|
| 2746 |
-
|
| 2747 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2748 |
|
| 2749 |
pattern_type = None
|
| 2750 |
confidence = 0.0
|
| 2751 |
-
|
| 2752 |
-
if step > 0 and max_weight > 0.8:
|
| 2753 |
pattern_type = "induction"
|
| 2754 |
-
confidence =
|
| 2755 |
-
elif
|
| 2756 |
pattern_type = "positional"
|
| 2757 |
-
confidence = 1.0 -
|
| 2758 |
-
elif 1.0 <=
|
| 2759 |
pattern_type = "semantic"
|
| 2760 |
-
confidence = min(1.0,
|
| 2761 |
-
elif
|
| 2762 |
pattern_type = "previous_token"
|
| 2763 |
-
confidence =
|
| 2764 |
-
|
| 2765 |
confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
|
| 2766 |
|
| 2767 |
-
|
| 2768 |
-
# ~7x more memory efficient: 4 bytes/float vs 28 bytes/float
|
| 2769 |
-
attention_matrix = layer_attn[head_idx].cpu().float().numpy()
|
| 2770 |
|
| 2771 |
q_matrix = None
|
| 2772 |
k_matrix = None
|
| 2773 |
v_matrix = None
|
| 2774 |
-
if
|
| 2775 |
-
q_matrix =
|
| 2776 |
-
k_matrix =
|
| 2777 |
-
v_matrix =
|
| 2778 |
|
| 2779 |
-
# Store matrices in cache for lazy loading (reduces response size)
|
| 2780 |
matrix_cache.store(request_id, step, layer_idx, head_idx, {
|
| 2781 |
"attention_weights": attention_matrix,
|
| 2782 |
"q_matrix": q_matrix,
|
|
@@ -2784,13 +2835,12 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
|
|
| 2784 |
"v_matrix": v_matrix
|
| 2785 |
})
|
| 2786 |
|
| 2787 |
-
# Return only metadata (matrices fetched on-demand via /matrix endpoint)
|
| 2788 |
critical_heads.append({
|
| 2789 |
"head_idx": head_idx,
|
| 2790 |
-
"entropy":
|
| 2791 |
-
"avg_entropy":
|
| 2792 |
-
"max_weight":
|
| 2793 |
-
"has_matrices": attention_matrix is not None,
|
| 2794 |
"pattern": {"type": pattern_type, "confidence": confidence} if pattern_type else None
|
| 2795 |
})
|
| 2796 |
|
|
|
|
| 294 |
self.trace_buffer: List[TraceData] = []
|
| 295 |
|
| 296 |
# Read configuration from environment variables
|
| 297 |
+
self.model_id = os.environ.get("DEFAULT_MODEL", "devstral-small")
|
| 298 |
self.max_context = int(os.environ.get("MAX_CONTEXT", "8192"))
|
| 299 |
self.batch_size = int(os.environ.get("BATCH_SIZE", "1"))
|
| 300 |
|
|
|
|
| 2676 |
"sampling": sampling_metadata
|
| 2677 |
})
|
| 2678 |
|
| 2679 |
+
# Emit generated token immediately so clients can show code progressively
|
| 2680 |
+
yield sse_event('generated_token', stage=2, totalStages=5,
|
| 2681 |
+
progress=10 + ((step + 1) / max_tokens) * 20,
|
| 2682 |
+
stageProgress=((step + 1) / max_tokens) * 100,
|
| 2683 |
+
detail=f'Generated token {step + 1}/{max_tokens}',
|
| 2684 |
+
metadata={
|
| 2685 |
+
'stepIndex': step,
|
| 2686 |
+
'totalSteps': max_tokens,
|
| 2687 |
+
'token': next_token_text,
|
| 2688 |
+
'tokenId': next_token_id,
|
| 2689 |
+
'generatedTokens': generated_tokens.copy(),
|
| 2690 |
+
})
|
| 2691 |
+
await asyncio.sleep(0)
|
| 2692 |
+
|
| 2693 |
# === STAGE 3: EXTRACTING (per layer within each token) ===
|
| 2694 |
+
# Optimised: batched tensor ops per layer instead of per-head Python loops
|
| 2695 |
+
# Reduces GPU→CPU sync points from ~4000 to ~40 per token
|
| 2696 |
layer_data_this_token = []
|
| 2697 |
+
n_total_layers = len(outputs.attentions)
|
| 2698 |
|
| 2699 |
+
for layer_idx in range(n_total_layers):
|
| 2700 |
# Emit extraction progress (within generating stage for combined progress)
|
| 2701 |
if step == max_tokens - 1: # Only emit detailed layer progress on last token
|
| 2702 |
+
layer_progress = (layer_idx / n_total_layers) * 100
|
| 2703 |
+
overall_progress = 30 + (layer_idx / n_total_layers) * 40 # 30-70%
|
| 2704 |
yield sse_event('extracting', stage=3, totalStages=5, progress=overall_progress,
|
| 2705 |
stageProgress=layer_progress,
|
| 2706 |
+
detail=f'Processing layer {layer_idx + 1}/{n_total_layers}',
|
| 2707 |
+
metadata={'layerIndex': layer_idx, 'totalLayers': n_total_layers,
|
| 2708 |
'headsPerLayer': n_heads, 'stepIndex': step, 'totalSteps': max_tokens})
|
| 2709 |
+
if layer_idx % 5 == 0:
|
| 2710 |
await asyncio.sleep(0)
|
| 2711 |
|
| 2712 |
+
# --- Per-layer: bulk GPU ops then single CPU transfer ---
|
| 2713 |
+
layer_attn = outputs.attentions[layer_idx][0] # [n_heads, seq_len, seq_len]
|
| 2714 |
current_hidden = outputs.hidden_states[layer_idx + 1]
|
| 2715 |
if current_hidden.dim() == 3:
|
| 2716 |
current_hidden = current_hidden[0]
|
| 2717 |
|
| 2718 |
+
# Hidden state metrics — 4 values, one .cpu() call
|
| 2719 |
+
last_token_hidden = current_hidden[-1]
|
| 2720 |
if layer_idx > 0:
|
| 2721 |
prev_hidden = outputs.hidden_states[layer_idx]
|
| 2722 |
if prev_hidden.dim() == 3:
|
| 2723 |
prev_hidden = prev_hidden[0]
|
| 2724 |
+
hidden_metrics = torch.stack([
|
| 2725 |
+
torch.norm(current_hidden - prev_hidden),
|
| 2726 |
+
torch.norm(current_hidden),
|
| 2727 |
+
torch.std(last_token_hidden),
|
| 2728 |
+
torch.norm(last_token_hidden),
|
| 2729 |
+
]).cpu().tolist()
|
| 2730 |
+
delta_norm, activation_magnitude, activation_entropy, hidden_state_norm = hidden_metrics
|
| 2731 |
else:
|
| 2732 |
+
hidden_metrics = torch.stack([
|
| 2733 |
+
torch.norm(current_hidden),
|
| 2734 |
+
torch.std(last_token_hidden),
|
| 2735 |
+
torch.norm(last_token_hidden),
|
| 2736 |
+
]).cpu().tolist()
|
| 2737 |
+
activation_magnitude, activation_entropy, hidden_state_norm = hidden_metrics
|
| 2738 |
delta_norm = None
|
| 2739 |
|
| 2740 |
+
# Sanitize hidden state metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2741 |
activation_magnitude = 0.0 if math.isnan(activation_magnitude) or math.isinf(activation_magnitude) else activation_magnitude
|
| 2742 |
activation_entropy = 0.0 if math.isnan(activation_entropy) or math.isinf(activation_entropy) else activation_entropy
|
| 2743 |
hidden_state_norm = 0.0 if math.isnan(hidden_state_norm) or math.isinf(hidden_state_norm) else hidden_state_norm
|
| 2744 |
if delta_norm is not None:
|
| 2745 |
delta_norm = 0.0 if math.isnan(delta_norm) or math.isinf(delta_norm) else delta_norm
|
| 2746 |
|
| 2747 |
+
# --- Batched head processing: all heads at once on GPU ---
|
| 2748 |
+
num_heads_layer = layer_attn.shape[0]
|
| 2749 |
+
|
| 2750 |
+
# Last-row attention weights for all heads: [n_heads, seq_len]
|
| 2751 |
+
all_last_row = layer_attn[:, -1, :]
|
| 2752 |
+
|
| 2753 |
+
# Max weight per head: [n_heads] — single GPU op
|
| 2754 |
+
all_max_weights = all_last_row.max(dim=-1).values
|
| 2755 |
|
| 2756 |
+
# Entropy of last-row per head: [n_heads] — single GPU op
|
| 2757 |
+
all_entropies = -(all_last_row * torch.log(all_last_row + 1e-10)).sum(dim=-1)
|
|
|
|
|
|
|
|
|
|
| 2758 |
|
| 2759 |
+
# Normalized average entropy per head (latter half of query positions)
|
| 2760 |
+
# layer_attn: [n_heads, q_len, k_len]
|
| 2761 |
+
q_len = layer_attn.shape[1]
|
| 2762 |
+
# Raw entropy per query position per head: [n_heads, q_len]
|
| 2763 |
+
all_token_entropies = -(layer_attn * torch.log(layer_attn + 1e-10)).sum(dim=-1)
|
| 2764 |
+
# Normalize by log(position): [q_len]
|
| 2765 |
+
positions = torch.arange(1, q_len + 1, device=layer_attn.device, dtype=layer_attn.dtype)
|
| 2766 |
+
max_ents = torch.log(positions + 1e-10) # [q_len]
|
| 2767 |
+
all_normalized = all_token_entropies / (max_ents.unsqueeze(0) + 1e-10) # [n_heads, q_len]
|
| 2768 |
+
# Average over latter half: [n_heads]
|
| 2769 |
+
start_idx = q_len // 2
|
| 2770 |
+
if start_idx < q_len:
|
| 2771 |
+
all_avg_entropies = all_normalized[:, start_idx:].mean(dim=-1)
|
| 2772 |
+
else:
|
| 2773 |
+
all_avg_entropies = all_normalized.mean(dim=-1)
|
| 2774 |
+
|
| 2775 |
+
# Previous-token weights for pattern detection: [n_heads]
|
| 2776 |
+
all_prev_token_weights = all_last_row[:, -2] if all_last_row.shape[1] >= 2 else torch.zeros(num_heads_layer, device=layer_attn.device)
|
| 2777 |
+
|
| 2778 |
+
# Single bulk transfer: all head metrics to CPU
|
| 2779 |
+
head_metrics_gpu = torch.stack([all_max_weights, all_entropies, all_avg_entropies, all_prev_token_weights]) # [4, n_heads]
|
| 2780 |
+
head_metrics_cpu = head_metrics_gpu.cpu().tolist() # one sync point
|
| 2781 |
+
max_weights_list = head_metrics_cpu[0]
|
| 2782 |
+
entropies_list = head_metrics_cpu[1]
|
| 2783 |
+
avg_entropies_list = head_metrics_cpu[2]
|
| 2784 |
+
prev_token_list = head_metrics_cpu[3]
|
| 2785 |
|
| 2786 |
+
# Bulk transfer attention matrices to CPU: one .cpu() for entire layer
|
| 2787 |
+
layer_attn_cpu = layer_attn.cpu().float().numpy() # [n_heads, seq_len, seq_len]
|
|
|
|
|
|
|
| 2788 |
|
| 2789 |
+
# QKV matrices (already on CPU from hooks)
|
| 2790 |
+
qkv_layer = qkv_captures.get(layer_idx)
|
|
|
|
| 2791 |
|
| 2792 |
+
# Build per-head metadata from CPU-side data (no more GPU calls)
|
| 2793 |
+
critical_heads = []
|
| 2794 |
+
for head_idx in range(num_heads_layer):
|
| 2795 |
+
mw = max_weights_list[head_idx]
|
| 2796 |
+
ent = entropies_list[head_idx]
|
| 2797 |
+
avg_ent = avg_entropies_list[head_idx]
|
| 2798 |
+
ptw = prev_token_list[head_idx]
|
| 2799 |
+
|
| 2800 |
+
# Sanitize
|
| 2801 |
+
mw = 0.0 if math.isnan(mw) or math.isinf(mw) else mw
|
| 2802 |
+
ent = 0.0 if math.isnan(ent) or math.isinf(ent) else ent
|
| 2803 |
+
avg_ent = 0.0 if math.isnan(avg_ent) or math.isinf(avg_ent) else avg_ent
|
| 2804 |
|
| 2805 |
pattern_type = None
|
| 2806 |
confidence = 0.0
|
| 2807 |
+
if step > 0 and mw > 0.8:
|
|
|
|
| 2808 |
pattern_type = "induction"
|
| 2809 |
+
confidence = mw
|
| 2810 |
+
elif ent < 1.0:
|
| 2811 |
pattern_type = "positional"
|
| 2812 |
+
confidence = 1.0 - ent
|
| 2813 |
+
elif 1.0 <= ent < 2.5:
|
| 2814 |
pattern_type = "semantic"
|
| 2815 |
+
confidence = min(1.0, ent / 2.5)
|
| 2816 |
+
elif mw > 0.9 and ptw > 0.85:
|
| 2817 |
pattern_type = "previous_token"
|
| 2818 |
+
confidence = ptw
|
|
|
|
| 2819 |
confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
|
| 2820 |
|
| 2821 |
+
attention_matrix = layer_attn_cpu[head_idx]
|
|
|
|
|
|
|
| 2822 |
|
| 2823 |
q_matrix = None
|
| 2824 |
k_matrix = None
|
| 2825 |
v_matrix = None
|
| 2826 |
+
if qkv_layer is not None:
|
| 2827 |
+
q_matrix = qkv_layer['q'][:, head_idx, :].float().numpy()
|
| 2828 |
+
k_matrix = qkv_layer['k'][:, head_idx, :].float().numpy()
|
| 2829 |
+
v_matrix = qkv_layer['v'][:, head_idx, :].float().numpy()
|
| 2830 |
|
|
|
|
| 2831 |
matrix_cache.store(request_id, step, layer_idx, head_idx, {
|
| 2832 |
"attention_weights": attention_matrix,
|
| 2833 |
"q_matrix": q_matrix,
|
|
|
|
| 2835 |
"v_matrix": v_matrix
|
| 2836 |
})
|
| 2837 |
|
|
|
|
| 2838 |
critical_heads.append({
|
| 2839 |
"head_idx": head_idx,
|
| 2840 |
+
"entropy": ent,
|
| 2841 |
+
"avg_entropy": avg_ent,
|
| 2842 |
+
"max_weight": mw,
|
| 2843 |
+
"has_matrices": attention_matrix is not None,
|
| 2844 |
"pattern": {"type": pattern_type, "confidence": confidence} if pattern_type else None
|
| 2845 |
})
|
| 2846 |
|