kfoughali committed
Commit 00b4f4f · verified · 1 Parent(s): 11a45c9

Update compression.py

Files changed (1)
  1. compression.py +0 -1052
compression.py CHANGED
@@ -1,1052 +0,0 @@
-"""
-Enhanced SPG compression algorithms with RocketKV-style 450x compression.
-NO ESTIMATIONS - only measured values. FAIL FAST on errors.
-"""
-
-import torch
-import torch.nn.functional as F
-import numpy as np
-from typing import Tuple, Optional, Dict, Any, List
-from dataclasses import replace
-import logging
-
-from config import (
-    CompressionConfig, EnhancedSPGConfig, CompressionType,
-    ResearchConstants
-)
-
-logger = logging.getLogger(__name__)
-
-class EnhancedSlidingPrecisionGradient:
-    """
-    Research-grade Enhanced SPG with RocketKV-style 450x compression capability.
-    NO ESTIMATIONS OR HARDCODED VALUES - all parameters from validated config.
-    """
-
-    def __init__(self, config: EnhancedSPGConfig):
-        self.config = config
-        self.constants = ResearchConstants()
-        self.layer_decay_rates: Optional[List[float]] = None
-        self.compression_stats: List[Dict[str, Any]] = []
-
-        # Progressive compression state
-        self.current_compression_ratio = config.initial_compression_ratio if config.enable_progressive else None
-        self.progressive_step = 0
-        self.quality_history: List[float] = []
-
-        # Adaptive state
-        self.adaptive_enabled = config.enable_adaptive
-        self.decay_adjustment_rate = config.decay_adjustment_rate
-        self.target_perplexity_delta = config.target_perplexity_delta
-
-        # RocketKV-style adaptive decomposition
-        self.use_adaptive_decomposition = config.use_adaptive_decomposition
-        self.use_hybrid_sparse_attention = config.use_hybrid_sparse_attention
-        self.target_compression_ratio = config.target_compression_ratio
-
-        logger.info(f"Enhanced SPG initialized with {config.magnitude_threshold_mode} magnitude thresholds")
-        if self.use_hybrid_sparse_attention:
-            logger.info("RocketKV-style Hybrid Sparse Attention enabled")
-
-    def initialize_layer_decay_rates(self, n_layers: int) -> None:
-        """Initialize per-layer decay rates with validation."""
-        if not self.constants.MIN_LAYERS <= n_layers <= self.constants.MAX_LAYERS:
-            logger.warning(f"n_layers {n_layers} outside typical range [{self.constants.MIN_LAYERS}, {self.constants.MAX_LAYERS}]")
-
-        # Both branches of the former per_layer_decay check assigned the same value,
-        # so initialize every layer at the base rate; adaptive updates differentiate them later
-        self.layer_decay_rates = [self.config.base_decay_rate] * n_layers
-
-        self.n_layers = n_layers
-        logger.info(f"Initialized decay rates for {n_layers} layers")
-
-    def update_decay_rate(self, layer_idx: int, quality_metric: float, target_quality: float) -> None:
-        """Update decay rate for adaptive SPG with proper validation."""
-        if not self.adaptive_enabled or self.layer_decay_rates is None:
-            return
-
-        if not 0 <= layer_idx < len(self.layer_decay_rates):
-            logger.error(f"Invalid layer_idx {layer_idx}, valid range: [0, {len(self.layer_decay_rates)})")
-            return
-
-        # Validate and clamp inputs
-        quality_metric = max(0.1, min(1000.0, float(quality_metric)))
-        target_quality = max(0.1, min(1000.0, float(target_quality)))
-
-        # Compute adjustment
-        quality_delta = quality_metric - target_quality
-
-        if quality_delta > 0:  # Quality worse than target
-            adjustment = -self.decay_adjustment_rate * (quality_delta / target_quality)
-        else:  # Quality better than target
-            adjustment = self.decay_adjustment_rate * (abs(quality_delta) / target_quality)
-
-        # Apply with bounds
-        old_rate = self.layer_decay_rates[layer_idx]
-        new_rate = max(0.8, min(0.99, old_rate + adjustment))
-        self.layer_decay_rates[layer_idx] = new_rate
-
-        logger.debug(f"Adaptive SPG Layer {layer_idx}: quality={quality_metric:.3f}, "
-                     f"target={target_quality:.3f}, decay_rate: {old_rate:.3f} → {new_rate:.3f}")
-
-    def compute_magnitude_importance(self, keys: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
-        """
-        Compute importance scores based on magnitude statistics.
-        This is an EXPLICIT magnitude-based proxy, not an estimation.
-        """
-        try:
-            # Compute L2 norm across head dimension for each token
-            k_norms = keys.norm(dim=-1).mean(dim=1).mean(dim=0)  # [seq_len]
-            v_norms = values.norm(dim=-1).mean(dim=1).mean(dim=0)  # [seq_len]
-
-            # Combine key and value magnitudes (explicit formula)
-            importance_scores = (k_norms + v_norms) / 2.0
-
-            # Normalize to [0, 1] range for consistent thresholding
-            score_min = importance_scores.min()
-            score_max = importance_scores.max()
-
-            if score_max > score_min:
-                importance_scores = (importance_scores - score_min) / (score_max - score_min)
-            else:
-                importance_scores = torch.ones_like(importance_scores)
-
-            logger.debug(f"Computed magnitude importance: min={score_min:.6f}, max={score_max:.6f}")
-            return importance_scores
-
-        except Exception as e:
-            logger.error(f"Error computing magnitude importance: {e}")
-            raise
-
-    def estimate_attention_sparsity(self, keys: torch.Tensor, values: torch.Tensor) -> float:
-        """Estimate attention pattern sparsity for adaptive decomposition. FAIL FAST on error."""
-        try:
-            # Compute approximate attention patterns using key-key similarity
-            k_norm = F.normalize(keys.float(), p=2, dim=-1)
-            attention_approx = torch.matmul(k_norm, k_norm.transpose(-2, -1))
-
-            # Measure sparsity as fraction of near-zero attention weights
-            # Use configurable threshold from constants
-            threshold = self.constants.ATTENTION_SPARSITY_THRESHOLD
-            sparse_fraction = (attention_approx.abs() < threshold).float().mean().item()
-
-            return sparse_fraction
-
-        except Exception as e:
-            # FAIL FAST - NO FALLBACK VALUES
-            logger.error(f"Failed to estimate attention sparsity: {e}")
-            raise RuntimeError(f"Cannot measure attention sparsity: {e}")
-
-    def adaptive_stage_split(self, target_ratio: float, seq_len: int, sparsity: float) -> Tuple[float, float]:
-        """RocketKV-style adaptive compression decomposition with explicit parameters."""
-        # Use explicit formulas from research constants
-        if sparsity > self.constants.SPARSITY_HIGH_THRESHOLD:
-            stage1_power = self.constants.SPARSE_STAGE1_POWER
-        elif sparsity > self.constants.SPARSITY_MEDIUM_THRESHOLD:
-            stage1_power = self.constants.BALANCED_STAGE1_POWER
-        else:
-            stage1_power = self.constants.DENSE_STAGE1_POWER
-
-        stage1_ratio = target_ratio ** stage1_power
-        stage2_ratio = target_ratio / stage1_ratio
-
-        # Bounds checking with explicit limits from config
-        stage1_ratio = max(self.config.stage_compression_min, min(self.config.stage_compression_max, stage1_ratio))
-        stage2_ratio = max(self.config.stage_compression_min, min(self.config.stage_compression_max, stage2_ratio))
-
-        logger.debug(f"Adaptive split: sparsity={sparsity:.3f}, stage1={stage1_ratio:.1f}x, stage2={stage2_ratio:.1f}x")
-        return stage1_ratio, stage2_ratio
-
-    def snapkv_plus_plus(self, keys: torch.Tensor, values: torch.Tensor,
-                         compression_ratio: float) -> Tuple[torch.Tensor, torch.Tensor, List[int]]:
-        """SnapKV++ with GQA support and adaptive pooling - no hardcoded values."""
-        batch_size, n_heads, seq_len, head_dim = keys.shape
-
-        # Adaptive kernel size based on sequence length (from config)
-        kernel_size = self.config.get_adaptive_kernel_size(seq_len)
-
-        # Compute importance scores with adaptive pooling
-        key_norms = keys.norm(dim=-1)  # [batch, heads, seq]
-        value_norms = values.norm(dim=-1)
-        combined_importance = (key_norms + value_norms) / 2.0
-
-        # Multi-head aggregation with adaptive pooling
-        if kernel_size > 1:
-            # Apply 1D pooling along sequence dimension
-            pooled_importance = F.avg_pool1d(
-                combined_importance.mean(dim=1).unsqueeze(1),  # [batch, 1, seq]
-                kernel_size=kernel_size,
-                stride=1,
-                padding=kernel_size // 2
-            ).squeeze(1)  # [batch, seq]
-            # Ensure pooled output matches original sequence length
-            if pooled_importance.shape[-1] != seq_len:
-                pooled_importance = pooled_importance[:, :seq_len]
-        else:
-            pooled_importance = combined_importance.mean(dim=1)
-
-        # Aggregate across batch
-        final_importance = pooled_importance.mean(dim=0)  # [seq]
-
-        # Ensure importance tensor matches sequence length
-        if final_importance.shape[0] != seq_len:
-            final_importance = final_importance[:seq_len]
-
-        # Preserve sink and recent tokens (guard recent_window == 0, where
-        # mask[-0:] would mark the entire sequence as preserved)
-        preserve_mask = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
-        preserve_mask[:min(self.config.sink_tokens, seq_len)] = True
-        recent = min(self.config.recent_window, seq_len)
-        if recent > 0:
-            preserve_mask[-recent:] = True
-
-        # Top-k selection for remaining tokens
-        n_keep = max(self.config.sink_tokens + self.config.recent_window,
-                     int(seq_len / compression_ratio))
-        n_keep = min(n_keep, seq_len)  # Ensure we don't exceed sequence length
-        remaining_slots = n_keep - preserve_mask.sum().item()
-
-        if remaining_slots > 0:
-            masked_importance = final_importance.clone()
-            masked_importance[preserve_mask] = -float('inf')
-
-            available_indices = (~preserve_mask).nonzero(as_tuple=True)[0]
-            if len(available_indices) > 0:
-                k = min(remaining_slots, len(available_indices))
-                if k > 0:
-                    _, relative_top_indices = torch.topk(masked_importance[available_indices], k)
-                    absolute_top_indices = available_indices[relative_top_indices]
-                    preserve_mask[absolute_top_indices] = True
-
-        # Extract retained tokens with bounds checking
-        retained_indices = torch.where(preserve_mask)[0]
-        retained_indices = retained_indices[retained_indices < seq_len]  # Safety check
-
-        keys_compressed = keys[:, :, retained_indices, :]
-        values_compressed = values[:, :, retained_indices, :]
-
-        actual_ratio = seq_len / len(retained_indices) if len(retained_indices) > 0 else float('inf')
-        logger.debug(f"SnapKV++: {seq_len} → {len(retained_indices)} tokens ({actual_ratio:.1f}x)")
-
-        return keys_compressed, values_compressed, retained_indices.tolist()
-
-    def hybrid_sparse_attention(self, keys: torch.Tensor, values: torch.Tensor,
-                                head_budget: int, seq_budget: int) -> Dict[str, Any]:
-        """RocketKV-style Hybrid Sparse Attention for Stage 2 - no hardcoded values."""
-        batch_size, n_heads, seq_len, head_dim = keys.shape
-
-        # 1. Head-wise importance scoring
-        head_importance = (
-            keys.float().pow(2).sum(dim=(-1, -2)).sum(dim=0) +  # Sum over batch, seq, hidden
-            values.float().pow(2).sum(dim=(-1, -2)).sum(dim=0)
-        )  # [n_heads]
-
-        # Select top heads
-        actual_head_budget = min(head_budget, n_heads)
-        _, top_head_indices = torch.topk(head_importance, actual_head_budget)
-
-        compressed_data = {
-            'keys': {},
-            'values': {},
-            'metadata': {
-                'head_selection': top_head_indices.tolist(),
-                'original_shape': keys.shape,
-                'compression_type': 'hybrid_sparse_attention'
-            }
-        }
-
-        # 2. Sequence-wise top-k selection per selected head
-        for head_idx in top_head_indices:
-            head_keys = keys[:, head_idx:head_idx+1, :, :]  # Keep head dimension
-            head_values = values[:, head_idx:head_idx+1, :, :]
-
-            # Compute sequence importance for this head
-            seq_importance = (
-                head_keys.norm(dim=-1).squeeze(1).mean(dim=0) +  # [seq]
-                head_values.norm(dim=-1).squeeze(1).mean(dim=0)
-            ) / 2.0
-
-            # Apply position-based boost (from research constants)
-            position_boost = torch.ones_like(seq_importance)
-            position_boost[:self.config.sink_tokens] *= self.constants.POSITION_BOOST_SINK
-            if self.config.recent_window > 0:  # guard: [-0:] would boost every position
-                position_boost[-self.config.recent_window:] *= self.constants.POSITION_BOOST_RECENT
-            boosted_importance = seq_importance * position_boost
-
-            # Select top tokens for this head
-            actual_seq_budget = min(seq_budget, seq_len)
-            _, top_token_indices = torch.topk(boosted_importance, actual_seq_budget)
-
-            # Store compressed data
-            head_key = f'head_{head_idx.item()}'
-            compressed_data['keys'][head_key] = {
-                'data': head_keys[:, :, top_token_indices, :].clone(),
-                'indices': top_token_indices.tolist()
-            }
-            compressed_data['values'][head_key] = {
-                'data': head_values[:, :, top_token_indices, :].clone(),
-                'indices': top_token_indices.tolist()
-            }
-
-        return compressed_data
-
-    def stage1_permanent_eviction(self, keys: torch.Tensor, values: torch.Tensor,
-                                  layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor, List[int]]:
-        """
-        Stage 1: RocketKV-style permanent eviction with SnapKV++ or magnitude-guided approach.
-        """
-        batch_size, n_heads, seq_len, head_dim = keys.shape
-
-        if self.use_adaptive_decomposition:
-            # Use adaptive compression split
-            sparsity = self.estimate_attention_sparsity(keys, values)  # May raise if fails
-            stage1_ratio, _ = self.adaptive_stage_split(self.target_compression_ratio, seq_len, sparsity)
-        else:
-            stage1_ratio = self.config.stage1_compression_ratio
-
-        # Choose compression method based on configuration
-        if self.config.use_snapkv_plus_plus:
-            return self.snapkv_plus_plus(keys, values, stage1_ratio)
-        else:
-            # Original magnitude-guided approach
-            return self._magnitude_guided_stage1(keys, values, layer_idx, stage1_ratio)
-
-    def _magnitude_guided_stage1(self, keys: torch.Tensor, values: torch.Tensor,
-                                 layer_idx: int, compression_ratio: float) -> Tuple[torch.Tensor, torch.Tensor, List[int]]:
-        """Original magnitude-guided Stage 1 eviction with explicit parameters."""
-        batch_size, n_heads, seq_len, head_dim = keys.shape
-
-        # Calculate retention based on compression ratio
-        retention_ratio = 1.0 / compression_ratio
-        min_retain = self.config.sink_tokens + self.config.recent_window
-        n_retain = max(min_retain, int(seq_len * retention_ratio))
-
-        # Apply layer-specific constraints (from research constants)
-        layer_position = layer_idx / max(getattr(self, 'n_layers', 12) - 1, 1)
-        if layer_position <= 0.5:  # Early layers
-            max_retain = int(seq_len * self.constants.EARLY_LAYER_MAX_RETENTION)
-        else:  # Late layers
-            max_retain = int(seq_len * self.constants.LATE_LAYER_MAX_RETENTION)
-
-        n_retain = min(n_retain, max_retain)
-
-        # Compute magnitude-based importance
-        importance_scores = self.compute_magnitude_importance(keys, values)
-
-        # Quality preservation: boost recent tokens (explicit formula from config)
-        recent_boost = torch.zeros_like(importance_scores)
-        if self.config.recent_window > 0:
-            recent_boost[-self.config.recent_window:] = importance_scores.max() * self.config.recent_boost_factor
-        importance_scores = importance_scores + recent_boost
-
-        # Initialize preservation mask (guard recent_window == 0: mask[-0:] selects everything)
-        preserve_mask = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
-        preserve_mask[:self.config.sink_tokens] = True
-        if self.config.recent_window > 0:
-            preserve_mask[-self.config.recent_window:] = True
-
-        # Select additional tokens based on importance
-        remaining_slots = n_retain - preserve_mask.sum().item()
-        if remaining_slots > 0:
-            masked_importance = importance_scores.clone()
-            masked_importance[preserve_mask] = -float('inf')
-
-            # Use configured threshold (not hardcoded)
-            magnitude_threshold = torch.quantile(
-                importance_scores.float(),
-                self.config.get_magnitude_threshold()
-            )
-
-            below_threshold = masked_importance < magnitude_threshold
-            masked_importance[below_threshold] = -float('inf')
-
-            available = (masked_importance > -float('inf')).sum().item()
-            k = min(remaining_slots, available)
-            if k > 0:
-                _, top_indices = torch.topk(masked_importance, k)
-                preserve_mask[top_indices] = True
-
-        # Extract retained tokens
-        retained_indices = torch.where(preserve_mask)[0]
-        keys_stage1 = keys[:, :, retained_indices, :]
-        values_stage1 = values[:, :, retained_indices, :]
-
-        actual_ratio = seq_len / len(retained_indices) if len(retained_indices) > 0 else float('inf')
-        logger.debug(f"Stage 1 Layer {layer_idx}: {seq_len} → {len(retained_indices)} tokens ({actual_ratio:.1f}x)")
-
-        return keys_stage1, values_stage1, retained_indices.tolist()
-
-    def stage2_multi_dimensional_compression(self, keys: torch.Tensor, values: torch.Tensor,
-                                             layer_idx: int, retained_indices: List[int]) -> Dict[str, Any]:
-        """
-        Stage 2: RocketKV-style Hybrid Sparse Attention compression.
-        Uses dynamic top-k selection with head and sequence reductions.
-        """
-        batch_size, n_heads, seq_len, head_dim = keys.shape
-
-        if self.use_hybrid_sparse_attention:
-            # RocketKV-style compression with adaptive budgets
-            sparsity = self.estimate_attention_sparsity(keys, values)  # May raise if fails
-
-            if self.use_adaptive_decomposition:
-                _, stage2_ratio = self.adaptive_stage_split(
-                    self.target_compression_ratio, seq_len, sparsity
-                )
-            else:
-                stage2_ratio = self.config.stage2_compression_ratio
-
-            # Dynamic budgets based on compression target (from config)
-            head_retention_ratio = self.config.get_head_retention_ratio()
-            head_budget = max(1, int(n_heads * head_retention_ratio))
-            seq_budget = max(self.config.min_tokens_for_stability, int(seq_len / stage2_ratio))
-
-            # Use hybrid sparse attention
-            compressed_data = self.hybrid_sparse_attention(keys, values, head_budget, seq_budget)
-
-            # Add metadata
-            compressed_data['metadata'].update({
-                'stage1_retained_indices': retained_indices,
-                'original_shape_after_stage1': keys.shape,
-                'original_dtype': keys.dtype,
-                'layer_idx': layer_idx,
-                'sparsity_estimate': sparsity,
-                'stage2_compression_ratio': stage2_ratio,
-                'head_budget': head_budget,
-                'seq_budget': seq_budget,
-                'head_retention_ratio': head_retention_ratio
-            })
-
-            return compressed_data
-
-        # Fallback to original multi-dimensional compression
-        return self._original_stage2_compression(keys, values, layer_idx, retained_indices)
-
-    def _original_stage2_compression(self, keys: torch.Tensor, values: torch.Tensor,
-                                     layer_idx: int, retained_indices: List[int]) -> Dict[str, Any]:
-        """Original Stage 2 implementation for comparison."""
-        batch_size, n_heads, seq_len, head_dim = keys.shape
-
-        # Compute importance for remaining tokens
-        importance_scores = self.compute_magnitude_importance(keys, values)
-
-        # Combine with position-based decay (explicit formula)
-        decay_rate = self.layer_decay_rates[layer_idx] if self.layer_decay_rates else self.config.base_decay_rate
-        position_scores = torch.pow(
-            decay_rate,
-            torch.arange(seq_len, device=keys.device).float() / self.config.decay_normalization
-        )
-
-        combined_importance = importance_scores * position_scores
-
-        compressed_data = {
-            'keys': {},
-            'values': {},
-            'metadata': {
-                'stage1_retained_indices': retained_indices,
-                'importance_scores': combined_importance,
-                'original_shape_after_stage1': keys.shape,
-                'original_dtype': keys.dtype,
-                'layer_idx': layer_idx,
-                'magnitude_threshold_mode': self.config.magnitude_threshold_mode,
-                'compression_type': 'original_multi_dimensional'
-            }
-        }
-
-        # Head dimension compression with explicit parameters
-        if self.config.enable_head_compression:
-            n_important_heads = max(1, int(n_heads * self.config.head_compression_ratio))
-
-            # UPDATED: Always reserve top head_fp16_reserve heads at full precision
-            n_reserved_heads = min(getattr(self.config, 'head_fp16_reserve', 2), n_heads)
-            n_important_heads = max(n_reserved_heads, n_important_heads)
-
-            # Compute head importance (explicit calculation)
-            head_importance = (
-                keys.float().pow(2).sum(dim=(-1, -2)).sum(dim=0) +
-                values.float().pow(2).sum(dim=(-1, -2)).sum(dim=0)
-            )
-
-            _, important_head_indices = torch.topk(head_importance, n_important_heads)
-            other_head_indices = torch.tensor(
-                [h for h in range(n_heads) if h not in important_head_indices.tolist()],
-                device=keys.device, dtype=torch.long
-            )
-
-            # Store important heads at full precision
-            compressed_data['keys']['heads_fp16'] = {
-                'data': keys[:, important_head_indices, :, :].clone(),
-                'indices': important_head_indices.tolist()
-            }
-            compressed_data['values']['heads_fp16'] = {
-                'data': values[:, important_head_indices, :, :].clone(),
-                'indices': important_head_indices.tolist()
-            }
-
-            if other_head_indices.numel() == 0:
-                return compressed_data
-
-            seq_keys = keys[:, other_head_indices, :, :]
-            seq_values = values[:, other_head_indices, :, :]
-        else:
-            seq_keys = keys
-            seq_values = values
-
-        # Sequence dimension compression with explicit ratios
-        levels = self.config.precision_levels
-
-        # Explicit top-K selection for FP16
-        keep_fp16 = max(0, int(seq_len * self.config.sequence_compression_ratio))
-        top_fp16 = torch.topk(combined_importance, k=keep_fp16).indices if keep_fp16 > 0 else torch.empty(0, dtype=torch.long, device=keys.device)
-        is_fp16 = torch.zeros(seq_len, dtype=torch.bool, device=keys.device)
-        if keep_fp16 > 0:
-            is_fp16[top_fp16] = True
-
-        # Vectorized token binning. Note: torch.bucketize requires ascending
-        # boundaries, so sort ascending and map the bucket count back to a level
-        # index (levels are ordered by descending threshold, matching the
-        # exclusive binning in _fallback_to_original_spg)
-        thresh = torch.tensor([pl.threshold for pl in levels], device=keys.device)
-        thresh_asc, _ = torch.sort(thresh)
-        level_ids = len(levels) - torch.bucketize(combined_importance, thresh_asc, right=True)
-
-        # Assign tokens to precision levels
-        for i in range(seq_len):
-            if is_fp16[i]:
-                precision_key = 'seq_fp16'
-            else:
-                level_idx = min(level_ids[i].item(), len(levels) - 1)
-                level = levels[level_idx]
-
-                if level.bits is not None:
-                    precision_key = f'seq_{level.bits}bit'
-                else:
-                    precision_key = f'seq_{level.name}'
-
-            if precision_key not in compressed_data['keys']:
-                compressed_data['keys'][precision_key] = {
-                    'indices': [], 'data': None, 'scale': None, 'zero': None
-                }
-                compressed_data['values'][precision_key] = {
-                    'indices': [], 'data': None, 'scale': None, 'zero': None
-                }
-
-            compressed_data['keys'][precision_key]['indices'].append(i)
-            compressed_data['values'][precision_key]['indices'].append(i)
-
-        # Store data with aggressive precision (FP16 for most important tokens)
-        keys_to_delete = []
-        for precision_key in list(compressed_data['keys'].keys()):
-            if not precision_key.startswith('seq_'):
-                continue
-
-            indices = compressed_data['keys'][precision_key]['indices']
-            if not indices:
-                keys_to_delete.append(precision_key)
-                continue
-
-            if precision_key == 'seq_discard':
-                keys_to_delete.append(precision_key)
-                continue
-
-            idx_tensor = torch.tensor(indices, device=keys.device, dtype=torch.long)
-            k_slice = seq_keys.index_select(2, idx_tensor)
-            v_slice = seq_values.index_select(2, idx_tensor)
-
-            # Store with aggressive precision - only FP16 for ultra-selective tokens
-            compressed_data['keys'][precision_key]['data'] = k_slice.clone()
-            compressed_data['values'][precision_key]['data'] = v_slice.clone()
-
-        # Clean up empty keys
-        for pk in keys_to_delete:
-            compressed_data['keys'].pop(pk, None)
-            compressed_data['values'].pop(pk, None)
-
-        return compressed_data
-
-    def compress_with_enhanced_gradient(self, keys: torch.Tensor, values: torch.Tensor,
-                                        layer_idx: int, current_position: int) -> Dict[str, Any]:
-        """
-        Main compression function with explicit two-stage approach.
-        """
-        if not self.config.enable_two_stage:
-            return self._fallback_to_original_spg(keys, values, layer_idx, current_position)
-
-        try:
-            # Record original shape
-            orig_shape_full = keys.shape
-
-            # Stage 1: Permanent eviction
-            keys_stage1, values_stage1, retained_indices = self.stage1_permanent_eviction(
-                keys, values, layer_idx
-            )
-
-            # Stage 2: Multi-dimensional compression
-            compressed_data = self.stage2_multi_dimensional_compression(
-                keys_stage1, values_stage1, layer_idx, retained_indices
-            )
-
-            # Add metadata
-            compressed_data['metadata']['original_full_shape'] = orig_shape_full
-
-            # Progressive compression
-            if self.config.enable_progressive:
-                compressed_data = self._apply_progressive_compression(compressed_data, layer_idx)
-
-            return compressed_data
-
-        except Exception as e:
-            logger.error(f"Error in enhanced compression for layer {layer_idx}: {e}")
-            raise
-
-    def _fallback_to_original_spg(self, keys: torch.Tensor, values: torch.Tensor,
-                                  layer_idx: int, current_position: Optional[int]) -> Dict[str, Any]:
-        """Fallback to original SPG implementation with actual data storage."""
-        batch_size, n_heads, seq_len, head_dim = keys.shape
-
-        # Original position-based precision computation
-        device = keys.device
-
-        decay_rate = self.layer_decay_rates[layer_idx] if self.layer_decay_rates else self.config.base_decay_rate
-
-        positions = torch.arange(seq_len, device=device)
-        if current_position is None or not isinstance(current_position, (int, float)):
-            current_position = seq_len
-        current_position = int(current_position)
-        distances = torch.tensor(current_position, device=device, dtype=positions.dtype) - positions
-
-        precision_scores = torch.pow(decay_rate, distances.float() / self.config.decay_normalization)
-        precision_scores[:self.config.sink_tokens] = 1.0
-
-        recent_mask = distances < self.config.recent_window
-        precision_scores[recent_mask] = torch.maximum(
-            precision_scores[recent_mask],
-            torch.tensor(self.config.recent_min_precision, device=device)
-        )
-
-        # Apply precision levels with actual data storage
-        compressed_data = {
-            'keys': {},
-            'values': {},
-            'metadata': {
-                'precision_scores': precision_scores,
-                'original_shape': keys.shape,
-                'original_dtype': keys.dtype,
-                'layer_idx': layer_idx,
-                'compression_type': 'original_spg'
-            }
-        }
-
-        # Exclusive binning for precision levels
-        levels = self.config.precision_levels
-        for i, score in enumerate(precision_scores):
-            for j, level in enumerate(levels):
-                lo = level.threshold
-                hi = levels[j-1].threshold if j > 0 else float('inf')
-
-                if lo <= score < hi:
-                    if level.bits is not None:
-                        precision_key = f'{level.bits}bit'
-                    else:
-                        precision_key = level.name
-
-                    if precision_key not in compressed_data['keys']:
-                        compressed_data['keys'][precision_key] = {
-                            'indices': [], 'data': None, 'scale': None, 'zero': None
-                        }
-                        compressed_data['values'][precision_key] = {
-                            'indices': [], 'data': None, 'scale': None, 'zero': None
-                        }
-
-                    compressed_data['keys'][precision_key]['indices'].append(i)
-                    compressed_data['values'][precision_key]['indices'].append(i)
-                    break
-
-        # Process data
-        keys_to_delete = []
-        for precision_key in list(compressed_data['keys'].keys()):
-            indices = compressed_data['keys'][precision_key]['indices']
-            if not indices:
-                keys_to_delete.append(precision_key)
-                continue
-
-            if precision_key == 'discard':
-                keys_to_delete.append(precision_key)
-                continue
-
-            level_indices = torch.tensor(indices, device=device, dtype=torch.long)
-            k_slice = keys.index_select(2, level_indices)
-            v_slice = values.index_select(2, level_indices)
-
-            # Store with FP16 precision (simplified for original SPG)
-            compressed_data['keys'][precision_key]['data'] = k_slice.clone()
-            compressed_data['values'][precision_key]['data'] = v_slice.clone()
-
-        # Clean up empty keys
-        for pk in keys_to_delete:
-            compressed_data['keys'].pop(pk, None)
-            compressed_data['values'].pop(pk, None)
-
-        return compressed_data
-
-    def _apply_progressive_compression(self, compressed_data: Dict, layer_idx: int) -> Dict:
-        """Apply progressive compression with relative quality change detection."""
-        if len(self.quality_history) >= self.constants.PROGRESSIVE_QUALITY_WINDOW:
-            recent = float(np.mean(self.quality_history[-self.constants.PROGRESSIVE_RECENT_WINDOW:]))
-            prev = float(np.mean(self.quality_history[-self.constants.PROGRESSIVE_QUALITY_WINDOW:-self.constants.PROGRESSIVE_RECENT_WINDOW]))
-            rel_delta = (recent - prev) / max(prev, 1e-9)
-
-            if rel_delta <= self.config.quality_threshold:
-                old_ratio = self.current_compression_ratio or self.config.initial_compression_ratio
-                new_ratio = min(old_ratio * self.config.progression_factor, self.config.max_compression_ratio)
-
-                if new_ratio > old_ratio:
-                    self.current_compression_ratio = new_ratio
-                    compression_factor = new_ratio / old_ratio
-
-                    # Tighten compression ratios (use configurable minimum from config)
-                    self.config.head_compression_ratio = max(self.config.progressive_min_ratio,
-                                                             self.config.head_compression_ratio / compression_factor)
-                    self.config.sequence_compression_ratio = max(self.config.progressive_min_ratio,
-                                                                 self.config.sequence_compression_ratio / compression_factor)
-
-                    self.progressive_step += 1
-
-                    logger.info(f"Progressive step {self.progressive_step}: rel_delta={rel_delta:.4f}, new_ratio={new_ratio:.1f}x")
-
-        compressed_data['metadata']['progressive_compression_ratio'] = self.current_compression_ratio
-        compressed_data['metadata']['progressive_step'] = self.progressive_step
-
-        return compressed_data
-
-    def decompress(self, compressed_data: Dict) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Decompress enhanced SPG compressed data."""
-        metadata = compressed_data['metadata']
-
-        if metadata.get('compression_type') == 'original_spg':
-            return self._decompress_original_spg(compressed_data)
-
-        return self._decompress_enhanced_spg(compressed_data)
-
-    def _decompress_enhanced_spg(self, compressed_data: Dict) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Decompress enhanced multi-stage compressed data with HSA support."""
-        metadata = compressed_data['metadata']
-
-        # Get device from first available tensor
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        for storage_type in ['keys', 'values']:
-            for key, data in compressed_data[storage_type].items():
-                if isinstance(data, dict) and 'data' in data and isinstance(data['data'], torch.Tensor):
-                    device = data['data'].device
-                    break
-            if device != torch.device('cuda' if torch.cuda.is_available() else 'cpu'):
-                break
-
-        # Handle hybrid sparse attention format
-        if metadata.get('compression_type') == 'hybrid_sparse_attention':
-            return self._decompress_hybrid_sparse_attention(compressed_data)
-
-        # Original enhanced SPG decompression
-        original_shape = metadata['original_shape_after_stage1']
-        original_dtype = metadata['original_dtype']
-
-        keys_full = torch.zeros(original_shape, dtype=original_dtype, device=device)
-        values_full = torch.zeros(original_shape, dtype=original_dtype, device=device)
-
-        # Decompress head dimension data first
-        if 'heads_fp16' in compressed_data['keys']:
-            head_indices = compressed_data['keys']['heads_fp16']['indices']
-            head_idx_tensor = torch.tensor(head_indices, device=device, dtype=torch.long)
-            keys_full[:, head_idx_tensor, :, :] = compressed_data['keys']['heads_fp16']['data']
-            values_full[:, head_idx_tensor, :, :] = compressed_data['values']['heads_fp16']['data']
-
-            if self.config.enable_head_compression:
-                n_heads = original_shape[1]
-                other_head_indices = torch.tensor([h for h in range(n_heads) if h not in head_indices],
-                                                  device=device, dtype=torch.long)
-            else:
-                other_head_indices = head_idx_tensor
-        else:
-            other_head_indices = torch.arange(original_shape[1], device=device, dtype=torch.long)
-
-        # Decompress sequence dimension data
-        for precision_key in [k for k in compressed_data['keys'].keys() if k.startswith('seq_')]:
-            if 'data' not in compressed_data['keys'][precision_key]:
-                continue
-
-            indices = compressed_data['keys'][precision_key]['indices']
-            idx_tensor = torch.tensor(indices, device=device, dtype=torch.long)
-
-            # All data stored as FP16 in this simplified version.
-            # Note: advanced indexing with a tensor returns a copy, so calling
-            # index_copy_ directly on keys_full[:, other_head_indices, ...] would
-            # silently write into a temporary; scatter into the copy, then assign back.
-            k_sub = keys_full[:, other_head_indices, :, :]
-            v_sub = values_full[:, other_head_indices, :, :]
-            k_sub.index_copy_(2, idx_tensor, compressed_data['keys'][precision_key]['data'])
-            v_sub.index_copy_(2, idx_tensor, compressed_data['values'][precision_key]['data'])
-            keys_full[:, other_head_indices, :, :] = k_sub
-            values_full[:, other_head_indices, :, :] = v_sub
-
-        return keys_full, values_full
-
-    def _decompress_hybrid_sparse_attention(self, compressed_data: Dict) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Decompress RocketKV-style hybrid sparse attention data."""
-        metadata = compressed_data['metadata']
-        original_shape = metadata['original_shape']
-
-        # Get device from first available tensor
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        for head_key in compressed_data['keys'].keys():
-            if head_key.startswith('head_'):
-                device = compressed_data['keys'][head_key]['data'].device
-                break
-
-        # Initialize full tensors
-        keys_full = torch.zeros(original_shape, dtype=torch.float16, device=device)
-        values_full = torch.zeros(original_shape, dtype=torch.float16, device=device)
-
-        # Reconstruct selected heads with their tokens
-        for head_key in compressed_data['keys'].keys():
-            if not head_key.startswith('head_'):
-                continue
-
-            head_idx = int(head_key.split('_')[1])
-            head_data_k = compressed_data['keys'][head_key]
-            head_data_v = compressed_data['values'][head_key]
-
-            token_indices = head_data_k['indices']
-
-            # Place data in the correct head and token positions
-            keys_full[:, head_idx:head_idx+1, token_indices, :] = head_data_k['data']
-            values_full[:, head_idx:head_idx+1, token_indices, :] = head_data_v['data']
-
-        return keys_full, values_full
-
-    def _decompress_original_spg(self, compressed_data: Dict) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Decompress original SPG data."""
-        metadata = compressed_data['metadata']
-        original_shape = metadata['original_shape']
-        original_dtype = metadata['original_dtype']
-        device = metadata['precision_scores'].device
-
-        keys_full = torch.zeros(original_shape, dtype=original_dtype, device=device)
-        values_full = torch.zeros(original_shape, dtype=original_dtype, device=device)
-
-        for precision_key in compressed_data['keys']:
-            data_dict = compressed_data['keys'][precision_key]
-            if 'data' in data_dict and 'indices' in data_dict:
-                indices = data_dict['indices']
-                idx_tensor = torch.tensor(indices, device=device, dtype=torch.long)
-
-                # All data stored as original precision
-                keys_full.index_copy_(2, idx_tensor, data_dict['data'])
-                values_full.index_copy_(2, idx_tensor, compressed_data['values'][precision_key]['data'])
-
-        return keys_full, values_full
-
-    def get_memory_footprint(self, compressed_data: Dict[str, Any]) -> int:
-        """
-        Calculate ACTUAL memory usage - NO ESTIMATES.
-        Every byte is accounted for explicitly.
-        """
-        total_bytes = 0
-
-        try:
-            # Count all stored tensors
-            for storage_type in ['keys', 'values']:
-                for key, data in compressed_data[storage_type].items():
-                    if isinstance(data, dict):
-                        # Data tensors
-                        if 'data' in data and isinstance(data['data'], torch.Tensor):
-                            total_bytes += data['data'].nelement() * data['data'].element_size()
-
-                        # Scale/zero tensors
-                        if 'scale' in data and isinstance(data['scale'], torch.Tensor):
-                            total_bytes += data['scale'].nelement() * data['scale'].element_size()
-                        if 'zero' in data and isinstance(data['zero'], torch.Tensor):
-                            total_bytes += data['zero'].nelement() * data['zero'].element_size()
-
-                        # Levels tensor for bit-packed data
-                        if 'levels' in data and isinstance(data['levels'], torch.Tensor):
-                            total_bytes += data['levels'].nelement() * data['levels'].element_size()
-
-                        # Metadata overhead (measured, not estimated)
-                        if 'meta' in data and isinstance(data['meta'], dict):
-                            total_bytes += self.constants.INT2_METADATA_BYTES
-
-                        # Indices (count only once under keys to avoid double counting)
-                        if storage_type == 'keys' and 'indices' in data and data['indices']:
-                            total_bytes += len(data['indices']) * self.constants.INDEX_SIZE_BYTES
-
-            # Metadata overhead
-            total_bytes += self.constants.METADATA_OVERHEAD_BYTES
-
-            logger.debug(f"Measured memory footprint: {total_bytes} bytes ({total_bytes/1024/1024:.2f} MB)")
-            return total_bytes
-
-        except Exception as e:
-            logger.error(f"Error calculating memory footprint: {e}")
-            raise
-
-    def update_quality_feedback(self, layer_idx: int, quality_metric: float):
-        """Update quality feedback for progressive compression."""
-        self.quality_history.append(quality_metric)
-
-        # Keep only recent history
-        if len(self.quality_history) > self.constants.QUALITY_HISTORY_MAX_SIZE:
-            self.quality_history = self.quality_history[-self.constants.QUALITY_HISTORY_MAX_SIZE:]
-
-
-class QuantizedKVCache:
-    """Enhanced quantized KV cache with working multi-stage SPG support."""
-
-    def __init__(self, config: CompressionConfig):
-        self.config = config
-        self.compressed_data = {}
-        self.dtypes = {}
-
-        # Initialize enhanced SPG with RocketKV features
-        if config.compression_type in [CompressionType.SPG, CompressionType.ADAPTIVE_SPG]:
-            spg_config = replace(config.enhanced_spg_config,
-                                 enable_two_stage=False,
-                                 enable_adaptive=(config.compression_type == CompressionType.ADAPTIVE_SPG))
-            self.spg = EnhancedSlidingPrecisionGradient(spg_config)
-        elif config.compression_type in [CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
-            enhanced_config = config.enhanced_spg_config
-            if config.compression_type == CompressionType.PROGRESSIVE_SPG:
-                enhanced_config.enable_progressive = True
-            self.spg = EnhancedSlidingPrecisionGradient(enhanced_config)
-        else:
-            self.spg = None
-
-        self.current_position = 0
-        self.quality_history = []
-        self.n_layers = None
-
-    def compress_and_store(self, layer_idx: int, keys: torch.Tensor, values: torch.Tensor):
-        """Compress and store KV pairs with enhanced SPG support."""
-        key_dtype = keys.dtype
-        value_dtype = values.dtype
-
-        if self.config.compression_type in [CompressionType.SPG, CompressionType.ADAPTIVE_SPG,
-                                            CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
-            if self.spg.layer_decay_rates is None:
-                if self.n_layers is None:
-                    raise ValueError("Model layer count not set - call detect_model_layers first")
-                self.spg.initialize_layer_decay_rates(self.n_layers)
-
-            if self.config.compression_type in [CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
-                compressed_data = self.spg.compress_with_enhanced_gradient(
-                    keys, values, layer_idx, self.current_position
-                )
-            else:
-                compressed_data = self.spg._fallback_to_original_spg(
-                    keys, values, layer_idx, self.current_position
-                )
-
-            self.compressed_data[layer_idx] = compressed_data
-            self.dtypes[layer_idx] = {'keys': key_dtype, 'values': value_dtype}
-        else:
-            # No compression - store original tensors
-            self.compressed_data[layer_idx] = {
-                'keys': {'original': {'data': keys.clone(), 'indices': list(range(keys.shape[2]))}},
-                'values': {'original': {'data': values.clone(), 'indices': list(range(values.shape[2]))}},
-                'metadata': {
-                    'compression_type': 'none',
-                    'original_shape': keys.shape,
-                    'original_dtype': keys.dtype
-                }
-            }
-            self.dtypes[layer_idx] = {'keys': key_dtype, 'values': value_dtype}
-
-    def get_decompressed(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Get decompressed KV pairs with enhanced SPG support."""
-        if self.config.compression_type in [CompressionType.SPG, CompressionType.ADAPTIVE_SPG,
-                                            CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
-            if layer_idx in self.compressed_data:
-                return self.spg.decompress(self.compressed_data[layer_idx])
-            return None, None
-        else:
-            # No compression - return original tensors
-            if layer_idx in self.compressed_data:
-                data = self.compressed_data[layer_idx]
-                return data['keys']['original']['data'], data['values']['original']['data']
-            return None, None
-
-    def get_memory_footprint(self) -> int:
-        """Calculate actual memory usage with enhanced SPG support."""
-        total_bytes = 0
-        constants = ResearchConstants()
-
-        if self.config.compression_type in [CompressionType.SPG, CompressionType.ADAPTIVE_SPG,
-                                            CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
-            for layer_idx in self.compressed_data:
-                total_bytes += self.spg.get_memory_footprint(self.compressed_data[layer_idx])
-        else:
-            # No compression - calculate uncompressed memory
-            for layer_idx in self.compressed_data:
-                data = self.compressed_data[layer_idx]
-                keys_data = data['keys']['original']['data']
-                values_data = data['values']['original']['data']
-                total_bytes += keys_data.nelement() * keys_data.element_size()
-                total_bytes += values_data.nelement() * values_data.element_size()
-                total_bytes += constants.METADATA_OVERHEAD_BYTES
-
-        return total_bytes
-
-    def update_position(self, new_position: int):
-        """Update current generation position."""
-        self.current_position = new_position
-
-    def update_quality_feedback(self, layer_idx: int, quality_metric: float):
-        """Provide quality feedback for adaptive methods."""
-        if self.config.compression_type == CompressionType.ADAPTIVE_SPG and hasattr(self.spg, 'update_decay_rate'):
-            target_quality = self.config.enhanced_spg_config.target_perplexity_delta
-            self.spg.update_decay_rate(layer_idx, quality_metric, target_quality)
-            self.quality_history.append((layer_idx, quality_metric))
-        elif self.config.compression_type in [CompressionType.ENHANCED_SPG, CompressionType.PROGRESSIVE_SPG]:
-            self.spg.update_quality_feedback(layer_idx, quality_metric)
-
-
-def detect_model_layers(model) -> int:
-    """Detect the number of transformer layers with comprehensive validation."""
-    config_attrs = [
-        'num_hidden_layers',
-        'n_layer',
-        'num_layers',
-        'n_layers',
-        'decoder_layers',
-        'n_head_layers',
-    ]
-
-    for attr in config_attrs:
-        if hasattr(model.config, attr):
-            n_layers = getattr(model.config, attr)
-            if isinstance(n_layers, int) and n_layers > 0:
-                logger.info(f"Detected {n_layers} layers from config.{attr}")
-                return n_layers
-
1018
- layer_patterns = [
1019
- 'layer', 'layers', 'h', 'blocks', 'decoder.layers', 'transformer_blocks', 'decoderLayer',
1020
- ]
1021
-
1022
- for module_name, module in model.named_modules():
1023
- for pattern in layer_patterns:
1024
- if pattern in module_name.lower():
1025
- if hasattr(module, '__len__'):
1026
- n_layers = len(module)
1027
- if n_layers > 0:
1028
- logger.info(f"Detected {n_layers} layers by counting {module_name}")
1029
- return n_layers
1030
-
-    decoder_layer_types = [
-        'TransformerBlock', 'DecoderLayer', 'EncoderLayer', 'Block', 'Layer',
-        'GPT2Block', 'LlamaDecoderLayer', 'MistralDecoderLayer', 'OPTDecoderLayer',
-    ]
-
-    layers = []
-    for module in model.modules():
-        module_type = type(module).__name__
-        if any(layer_type in module_type for layer_type in decoder_layer_types):
-            layers.append(module)
-
-    if layers:
-        n_layers = len(set(layers))
-        if n_layers > 0:
-            logger.info(f"Detected {n_layers} layers by module type matching")
-            return n_layers
-
-    # Fail fast if cannot detect layers
-    raise ValueError(
-        f"Could not automatically detect the number of layers for model {type(model).__name__}. "
-        "Please check the model architecture and update the detection logic."
-    )