Update benchmark.py

benchmark.py  +32 -138
CHANGED
@@ -4,7 +4,7 @@ Benchmarking, metrics, and proof generation for Enhanced SPG.
 Supports LongBench, NIAH, RULER, SCBench benchmarks.
 MEASURED VALUES ONLY - no estimations. FAIL FAST on errors.
 ALL BENCHMARKS USE SAME COMPRESSION PIPELINE AS WIKITEXT.
-FIXED:
+FIXED: Generation errors, proper fallback handling.
 """

 import torch
@@ -144,16 +144,12 @@ class BenchmarkMetrics:
                 self.prefill_time_std = float(np.std(self.prefill_times))
                 self.prefill_time_ci = self._bootstrap_ci(self.prefill_times, config)
                 self.prefill_tokens_per_sec = config.prefill_length / self.prefill_time_mean if self.prefill_time_mean > 0 else 0.0
-            else:
-                logger.debug("No prefill time data available")

             if self.prefill_peak_memories:
                 memories_mb = [m / (1024 * 1024) for m in self.prefill_peak_memories]
                 self.prefill_peak_memory_mean_mb = float(np.mean(memories_mb))
                 self.prefill_peak_memory_std_mb = float(np.std(memories_mb))
                 self.prefill_peak_memory_ci_mb = self._bootstrap_ci(memories_mb, config)
-            else:
-                logger.debug("No prefill memory data available")

             if self.decode_times:
                 self.decode_time_per_token_mean_ms = float(np.mean(self.decode_times) * 1000)
@@ -162,8 +158,6 @@
                 self.decode_tokens_per_sec = 1.0 / np.mean(self.decode_times) if self.decode_times else 0.0
                 self.decode_time_p50_ms = float(np.percentile(self.decode_times, 50) * 1000)
                 self.decode_time_p95_ms = float(np.percentile(self.decode_times, 95) * 1000)
-            else:
-                logger.debug("No decode time data available")

             # Calculate end-to-end throughput
             if self.prefill_time_mean > 0 and self.decode_time_per_token_mean_ms > 0:
@@ -174,37 +168,23 @@

             if self.decode_peak_memories:
                 self.decode_peak_memory_mean_mb = float(np.mean(self.decode_peak_memories) / (1024 * 1024))
-            else:
-                logger.debug("No decode memory data available")

             if self.prefill_perplexities:
                 self.prefill_perplexity_mean = float(np.mean(self.prefill_perplexities))
                 self.prefill_perplexity_std = float(np.std(self.prefill_perplexities))
                 self.prefill_perplexity_ci = self._bootstrap_ci(self.prefill_perplexities, config)
-                logger.info(f"Calculated prefill perplexity: mean={self.prefill_perplexity_mean:.2f}, "
-                            f"std={self.prefill_perplexity_std:.2f}, samples={len(self.prefill_perplexities)}")
-            else:
-                logger.warning("No prefill perplexity data available")

             if self.generation_perplexities:
                 self.generation_perplexity_mean = float(np.mean(self.generation_perplexities))
                 self.generation_perplexity_std = float(np.std(self.generation_perplexities))
                 self.generation_perplexity_ci = self._bootstrap_ci(self.generation_perplexities, config)
-                logger.info(f"Calculated generation perplexity: mean={self.generation_perplexity_mean:.2f}, "
-                            f"std={self.generation_perplexity_std:.2f}, samples={len(self.generation_perplexities)}")
-            else:
-                logger.warning("No generation perplexity data available")

             if self.compression_ratios:
                 self.compression_ratio_mean = float(np.mean(self.compression_ratios))
                 self.compression_ratio_std = float(np.std(self.compression_ratios))
-            else:
-                logger.debug("No compression ratio data available")

             if self.kv_cache_memory_samples_mb:
                 self.kv_cache_memory_mb = float(np.mean(self.kv_cache_memory_samples_mb))
-            else:
-                logger.debug("No KV cache memory data available")

         except Exception as e:
             logger.error(f"Error calculating statistics: {e}")
@@ -213,7 +193,6 @@ class BenchmarkMetrics:
     def _bootstrap_ci(self, data: List[float], config: CompressionConfig) -> Tuple[float, float]:
         """Calculate bootstrap confidence interval with reproducible RNG."""
         if not data or len(data) < 2:
-            logger.warning("Insufficient data for confidence interval calculation")
             return (0.0, 0.0)

         try:
@@ -240,11 +219,9 @@ class BenchmarkMetrics:

 def safe_tokenize(tokenizer, text, max_length=512):
     """Safe tokenization with proper padding and truncation."""
-    # Ensure pad_token is set
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token

-    # Tokenize with explicit parameters
     inputs = tokenizer(
         text,
         return_tensors="pt",
@@ -255,12 +232,10 @@ def safe_tokenize(tokenizer, text, max_length=512):
         add_special_tokens=True
     )

-    # Validate outputs
     if inputs.input_ids.shape[1] == 0:
         raise ValueError("Tokenization produced empty sequence")

     if inputs.input_ids.shape[1] > max_length:
-        logger.warning(f"Sequence length {inputs.input_ids.shape[1]} exceeds max {max_length}")
         inputs.input_ids = inputs.input_ids[:, :max_length]
         inputs.attention_mask = inputs.attention_mask[:, :max_length]

@@ -269,41 +244,35 @@ def safe_tokenize(tokenizer, text, max_length=512):

 def validate_model_inputs(model, input_ids, attention_mask):
     """Validate inputs are compatible with model."""
-    # Check sequence length against model's max position embeddings
     if hasattr(model.config, 'max_position_embeddings'):
         max_pos = model.config.max_position_embeddings
         if input_ids.shape[1] > max_pos:
-            logger.warning(f"Input length {input_ids.shape[1]} exceeds model max {max_pos}")
             input_ids = input_ids[:, :max_pos]
             attention_mask = attention_mask[:, :max_pos]

-    # For GPT-2, check n_positions
     if hasattr(model.config, 'n_positions'):
         n_pos = model.config.n_positions
         if input_ids.shape[1] > n_pos:
-            logger.warning(f"Input length {input_ids.shape[1]} exceeds GPT-2 positions {n_pos}")
             input_ids = input_ids[:, :n_pos]
             attention_mask = attention_mask[:, :n_pos]

-    # Ensure input_ids are within vocabulary range
     vocab_size = model.config.vocab_size
     if input_ids.max() >= vocab_size:
+        input_ids = input_ids.clamp(0, vocab_size - 1)
+
+    if input_ids.min() < 0:
         input_ids = input_ids.clamp(0, vocab_size - 1)

     return input_ids, attention_mask


 def safe_generate(model, tokenizer, input_ids, attention_mask, past_key_values=None, max_new_tokens=20):
-    """Safe generation with proper error handling."""
+    """Safe generation with proper error handling - returns generated text."""
     try:
-        # Validate inputs
         input_ids, attention_mask = validate_model_inputs(model, input_ids, attention_mask)

-        # Set generation config
         gen_config = {
             "max_new_tokens": max_new_tokens,
-            "temperature": 0.7,
             "do_sample": False,
             "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
             "eos_token_id": tokenizer.eos_token_id,
@@ -311,20 +280,21 @@ def safe_generate(model, tokenizer, input_ids, attention_mask, past_key_values=N
             "use_cache": True
         }

-        # Add past_key_values if available
         if past_key_values is not None:
             gen_config["past_key_values"] = past_key_values

-        # Generate with error handling
         with torch.no_grad():
             output = model.generate(input_ids, **gen_config)

+        # Decode only the generated part
+        generated_ids = output[:, input_ids.shape[1]:]
+        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        return generated_text

     except Exception as e:
         logger.error(f"Generation failed: {e}")
-        # Return
-        return
+        # Return empty string on failure
+        return ""


 def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
@@ -336,21 +306,17 @@ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
     """
     device = input_ids.device

-    # Validate inputs first
     input_ids, attention_mask = validate_model_inputs(model, input_ids, attention_mask)

-    # Clear GPU cache if requested
     if torch.cuda.is_available() and measure_memory:
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
         torch.cuda.synchronize()

-    # Measure prefill time
     if torch.cuda.is_available():
         torch.cuda.synchronize()
     start_time = time.perf_counter()

-    # Prefill phase with error handling
     try:
         with torch.inference_mode():
             outputs = model(
@@ -363,7 +329,6 @@ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
             logits = outputs.logits
     except Exception as e:
         logger.error(f"Prefill failed: {e}")
-        # Return minimal valid result
         return {
             'past_key_values': None,
             'prefill_time': 0,
@@ -380,22 +345,18 @@ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,

     prefill_time = time.perf_counter() - start_time

-    # Measure peak memory
     prefill_peak_mem = 0
     if torch.cuda.is_available() and measure_memory:
         prefill_peak_mem = _peak_mem_bytes_all_gpus()

-    # Calculate prefill perplexity safely
     prefill_loss = None
     if logits is not None and input_ids.shape[1] > 1:
         try:
-            # Ensure we have valid shapes
             seq_len = min(logits.shape[1], input_ids.shape[1] - 1)
             if seq_len > 0:
                 shift_logits = logits[:, :seq_len, :].contiguous()
                 shift_labels = input_ids[:, 1:seq_len+1].contiguous()

-                # Calculate loss with ignore_index for padding
                 loss = F.cross_entropy(
                     shift_logits.view(-1, shift_logits.size(-1)),
                     shift_labels.view(-1),
@@ -406,30 +367,28 @@ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
         except Exception as e:
             logger.warning(f"Could not calculate prefill loss: {e}")

-    # Compression phase - same as WikiText
     original_cache_size = 0
     compressed_cache_size = 0
     compression_ratio = 1.0

     if past_key_values:
         try:
+            if hasattr(past_key_values, 'to_legacy_cache'):
+                kv_tuple = past_key_values.to_legacy_cache()
+            else:
+                kv_tuple = past_key_values

-            # Calculate original size
             for layer_idx, (keys, values) in enumerate(kv_tuple):
                 if keys is not None and values is not None:
                     original_cache_size += keys.nelement() * keys.element_size()
                     original_cache_size += values.nelement() * values.element_size()

-                    # Apply compression if enabled
                     if config.compression_type != CompressionType.NONE and cache_manager is not None:
                         try:
                             cache_manager.compress_and_store(layer_idx, keys, values)
                         except Exception as e:
                             logger.error(f"Compression failed for layer {layer_idx}: {e}")

-            # Reconstruct compressed cache
             if config.compression_type != CompressionType.NONE and cache_manager is not None:
                 reconstructed_kv = []
                 for layer_idx in range(len(kv_tuple)):
@@ -438,20 +397,16 @@ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
                         if dec_keys is not None and dec_values is not None:
                             reconstructed_kv.append((dec_keys, dec_values))
                         else:
-                            # Use original if decompression fails
-                            logger.warning(f"Decompression returned None for layer {layer_idx}, using original")
                             reconstructed_kv.append(kv_tuple[layer_idx])
                     except Exception as e:
                         logger.error(f"Decompression failed for layer {layer_idx}: {e}")
                         reconstructed_kv.append(kv_tuple[layer_idx])

-                # Convert back to DynamicCache format
                 if hasattr(DynamicCache, 'from_legacy_cache'):
                     past_key_values = DynamicCache.from_legacy_cache(tuple(reconstructed_kv))
                 else:
                     past_key_values = tuple(reconstructed_kv)

-                # Measure compressed size
                 try:
                     compressed_cache_size = cache_manager.get_memory_footprint()
                 except:
@@ -459,8 +414,8 @@ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,
             else:
                 compressed_cache_size = original_cache_size

+            if compressed_cache_size > 0:
+                compression_ratio = original_cache_size / compressed_cache_size

         except Exception as e:
             logger.error(f"Cache processing failed: {e}")
@@ -481,7 +436,6 @@ def apply_compression_pipeline(model, tokenizer, input_ids, attention_mask,

 def create_niah_haystack(context_length: int, needle: str, depth_percent: float) -> str:
     """Create Needle-in-a-Haystack test context - NO HARDCODING."""
-    # Generate haystack text
     haystack_template = "The quick brown fox jumps over the lazy dog. " * 20
     haystack_chunks = []

@@ -490,7 +444,6 @@ def create_niah_haystack(context_length: int, needle: str, depth_percent: float)

     haystack = " ".join(haystack_chunks)[:context_length - len(needle) - 10]

-    # Insert needle at specified depth
     insertion_point = int(len(haystack) * depth_percent / 100)
     haystack_with_needle = (
         haystack[:insertion_point] +
@@ -511,25 +464,19 @@ def evaluate_niah(model, tokenizer, config: CompressionConfig, cache_manager: Op

     prompt = f"{context}\n\nQuestion: What is the secret password?\nAnswer:"

-    # Use safe tokenization
     inputs = safe_tokenize(tokenizer, prompt, max_length=min(config.prefill_length, 1024))
     input_ids = inputs.input_ids.to(model.device)
     attention_mask = inputs.attention_mask.to(model.device)

-    # Apply SAME compression pipeline as WikiText
     compression_result = apply_compression_pipeline(
         model, tokenizer, input_ids, attention_mask, cache_manager, config
     )

-    # Generate with compressed cache using safe generation
     gen_start = time.perf_counter()
+    generated_text = safe_generate(model, tokenizer, input_ids, attention_mask,
+                                   compression_result['past_key_values'], max_new_tokens=20)
     gen_time = time.perf_counter() - gen_start

-    generated_text = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
-    # Check if needle was retrieved
     accuracy = 1.0 if config.niah_needle.split()[-1] in generated_text else 0.0

     logger.info(f"NIAH accuracy: {accuracy}, Generated: {generated_text[:50]}")
@@ -547,10 +494,8 @@ def evaluate_niah(model, tokenizer, config: CompressionConfig, cache_manager: Op

 def evaluate_ruler(model, tokenizer, config: CompressionConfig, cache_manager: Optional[QuantizedKVCache] = None) -> Dict[str, Any]:
     """Evaluate RULER with SAME compression pipeline as WikiText."""
-    seq_len = min(config.ruler_max_seq_length, config.prefill_length, 1024)  # Cap at GPT-2 limit
+    seq_len = min(config.ruler_max_seq_length, config.prefill_length, 1024)

-    # Create a retrieval task with multiple facts
     facts = []
     for i in range(10):
         facts.append(f"Fact {i}: The capital of Country{i} is City{i}.")
@@ -565,20 +510,15 @@ def evaluate_ruler(model, tokenizer, config: CompressionConfig, cache_manager: O
     input_ids = inputs.input_ids.to(model.device)
     attention_mask = inputs.attention_mask.to(model.device)

-    # Apply SAME compression pipeline as WikiText
     compression_result = apply_compression_pipeline(
         model, tokenizer, input_ids, attention_mask, cache_manager, config
     )

-    # Generate with compressed cache
     gen_start = time.perf_counter()
+    generated = safe_generate(model, tokenizer, input_ids, attention_mask,
+                              compression_result['past_key_values'], max_new_tokens=10)
     gen_time = time.perf_counter() - gen_start

-    generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
-    # Check exact match
     expected = f"City{query_idx}"
     exact_match = 1.0 if expected in generated else 0.0

@@ -597,7 +537,6 @@ def evaluate_ruler(model, tokenizer, config: CompressionConfig, cache_manager: O

 def evaluate_scbench(model, tokenizer, config: CompressionConfig, cache_manager: Optional[QuantizedKVCache] = None) -> Dict[str, Any]:
     """Evaluate SCBench with SAME compression pipeline as WikiText."""
-    # Create multi-turn conversation
     conversation = []
     facts = {}

@@ -612,7 +551,6 @@ def evaluate_scbench(model, tokenizer, config: CompressionConfig, cache_manager:
         conversation.append(f"User: {user_msg}")
         conversation.append(f"Assistant: {assistant_msg}")

-    # Query a random fact
     query_key = random.choice(list(facts.keys()))
     conversation.append(f"User: What is {query_key}?")

@@ -622,20 +560,15 @@ def evaluate_scbench(model, tokenizer, config: CompressionConfig, cache_manager:
     input_ids = inputs.input_ids.to(model.device)
     attention_mask = inputs.attention_mask.to(model.device)

-    # Apply SAME compression pipeline as WikiText
     compression_result = apply_compression_pipeline(
         model, tokenizer, input_ids, attention_mask, cache_manager, config
     )

-    # Generate with compressed cache
     gen_start = time.perf_counter()
+    generated = safe_generate(model, tokenizer, input_ids, attention_mask,
+                              compression_result['past_key_values'], max_new_tokens=20)
     gen_time = time.perf_counter() - gen_start

-    generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
-    # Check if correct value is recalled
     expected_value = facts[query_key]
     accuracy = 1.0 if expected_value in generated else 0.0

@@ -658,7 +591,6 @@ def evaluate_longbench_task(model, tokenizer, config: CompressionConfig,
     try:
         dataset = load_dataset("THUDM/LongBench", task, split="test")

-        # Sample evaluation examples
         n_samples = min(config.eval_samples, len(dataset))
         samples = dataset.select(range(n_samples))

@@ -682,21 +614,16 @@ def evaluate_longbench_task(model, tokenizer, config: CompressionConfig,
             input_ids = inputs.input_ids.to(model.device)
             attention_mask = inputs.attention_mask.to(model.device)

-            # Apply SAME compression pipeline as WikiText
             compression_result = apply_compression_pipeline(
                 model, tokenizer, input_ids, attention_mask, cache_manager, config,
-                measure_memory=False
+                measure_memory=False
             )

-            # Generate with compressed cache
             gen_start = time.perf_counter()
+            generated = safe_generate(model, tokenizer, input_ids, attention_mask,
+                                      compression_result['past_key_values'], max_new_tokens=50)
             gen_time = time.perf_counter() - gen_start

-            generated = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
-            # Simple accuracy metric
             score = 1.0 if str(answer).lower() in generated.lower() else 0.0
             scores.append(score)
             compression_ratios.append(compression_result['compression_ratio'])
@@ -705,7 +632,6 @@ def evaluate_longbench_task(model, tokenizer, config: CompressionConfig,
             gen_times.append(gen_time)

         avg_compression = float(np.mean(compression_ratios)) if compression_ratios else 1.0
-        logger.info(f"LongBench {task} avg compression: {avg_compression:.1f}x")

         return {
             'accuracy': float(np.mean(scores)),
@@ -733,15 +659,11 @@ def load_model_and_tokenizer(model_name: str, config: CompressionConfig):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     dtype = torch.float16 if device == "cuda" else torch.float32

-    # FAIL FAST if CUDA required but unavailable
     if config.fail_on_cpu_fallback and device == "cpu":
         raise RuntimeError("CUDA required but unavailable (fail_on_cpu_fallback=True)")

     logger.info(f"Loading model: {model_name}")

-    # Check if model requires authentication
-    model_info = SUPPORTED_MODELS.get(config.model_key, {})
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
         trust_remote_code=True
@@ -750,7 +672,6 @@ def load_model_and_tokenizer(model_name: str, config: CompressionConfig):
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token

-    # Model loading with Flash Attention support
     model_kwargs = {
         "torch_dtype": dtype,
         "device_map": "auto" if device == "cuda" else None,
@@ -758,20 +679,16 @@ def load_model_and_tokenizer(model_name: str, config: CompressionConfig):
         "trust_remote_code": True
     }

-    # Try Flash Attention if requested and available
     if config.use_flash_attention and device == "cuda":
         try:
-            # First try to load with Flash Attention
             model_kwargs["attn_implementation"] = "flash_attention_2"
             model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
             logger.info("Successfully loaded with Flash Attention 2")
         except Exception as e:
-            logger.warning(f"Flash Attention not available, using standard attention: {e}")
+            logger.warning(f"Flash Attention not available: {e}")
             model_kwargs.pop("attn_implementation", None)
             model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
     else:
-        # Load without Flash Attention
         model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

     model.eval()
@@ -784,7 +701,6 @@ def load_real_dataset_samples(config: CompressionConfig, tokenizer) -> List[str]
     logger.info(f"Loading samples for benchmark: {config.benchmark_type}")

     if config.benchmark_type == "wikitext":
-        # Original WikiText loading
         texts = []
         min_tokens = config.prefill_length + config.generation_length

@@ -823,7 +739,6 @@ def load_real_dataset_samples(config: CompressionConfig, tokenizer) -> List[str]
             raise

     elif config.benchmark_type == "longbench":
-        # Load LongBench dataset
         texts = []
         if config.benchmark_subset:
             try:
@@ -839,7 +754,6 @@ def load_real_dataset_samples(config: CompressionConfig, tokenizer) -> List[str]
             raise

     elif config.benchmark_type in ["niah", "ruler", "scbench"]:
-        # These benchmarks generate synthetic data
         texts = ["Synthetic benchmark data"] * config.eval_samples

     else:
@@ -858,7 +772,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
     logger.info(f"Benchmark type: {config.benchmark_type}")
     logger.info(f"Config hash: {config.get_hash()}")

-    # Enable synchronous CUDA for debugging
     if torch.cuda.is_available():
         os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

@@ -876,7 +789,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
         logger.error(f"Failed to detect model layers: {e}")
         raise

-    # Warmup
     device = model.device
     with torch.inference_mode():
         dummy = torch.randint(0, tokenizer.vocab_size, (1, min(config.prefill_length, 128)), device=device)
@@ -899,13 +811,10 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t

         metrics = BenchmarkMetrics()

-        # Run benchmark-specific evaluation with UNIFIED compression
         if config.benchmark_type == "niah":
-            # NIAH evaluation with unified compression
             for depth in BENCHMARK_CONFIGS["niah"]["depths"]:
                 config.niah_depth_percent = depth
                 for idx in range(min(config.eval_samples, 10)):
-                    # Create cache manager for compression types
                     if config.compression_type != CompressionType.NONE:
                         cache_manager = QuantizedKVCache(config)
                         cache_manager.n_layers = n_layers
@@ -918,12 +827,11 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
                     metrics.compression_ratios.append(result['compression_ratio'])
                     metrics.kv_cache_memory_samples_mb.append(result['kv_cache_memory_mb'])
                     metrics.prefill_times.append(result['prefill_time'])
-                    metrics.decode_times.append(result['generation_time'] / 20)
+                    metrics.decode_times.append(result['generation_time'] / 20)

                     if result['prefill_peak_mem'] > 0:
                         metrics.prefill_peak_memories.append(result['prefill_peak_mem'])

-                    # Record per-sample data
                     per_sample_records.append({
                         'benchmark': 'niah',
                         'depth_percent': depth,
@@ -935,7 +843,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
                     })

         elif config.benchmark_type == "ruler":
-            # RULER evaluation with unified compression
             for idx in range(config.eval_samples):
                 if config.compression_type != CompressionType.NONE:
                     cache_manager = QuantizedKVCache(config)
@@ -949,7 +856,7 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
                 metrics.compression_ratios.append(result['compression_ratio'])
                 metrics.kv_cache_memory_samples_mb.append(result['kv_cache_memory_mb'])
                 metrics.prefill_times.append(result['prefill_time'])
-                metrics.decode_times.append(result['generation_time'] / 10)
+                metrics.decode_times.append(result['generation_time'] / 10)

                 if result['prefill_peak_mem'] > 0:
                     metrics.prefill_peak_memories.append(result['prefill_peak_mem'])
@@ -964,7 +871,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
                 })

         elif config.benchmark_type == "scbench":
-            # SCBench evaluation with unified compression
             for idx in range(config.eval_samples):
                 if config.compression_type != CompressionType.NONE:
                     cache_manager = QuantizedKVCache(config)
@@ -978,7 +884,7 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
                 metrics.compression_ratios.append(result['compression_ratio'])
                 metrics.kv_cache_memory_samples_mb.append(result['kv_cache_memory_mb'])
                 metrics.prefill_times.append(result['prefill_time'])
-                metrics.decode_times.append(result['generation_time'] / 20)
+                metrics.decode_times.append(result['generation_time'] / 20)

                 if result['prefill_peak_mem'] > 0:
                     metrics.prefill_peak_memories.append(result['prefill_peak_mem'])
@@ -993,7 +899,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
                 })

         elif config.benchmark_type == "longbench":
-            # LongBench evaluation with unified compression
             if config.benchmark_subset:
                 if config.compression_type != CompressionType.NONE:
                     cache_manager = QuantizedKVCache(config)
@@ -1010,7 +915,7 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
                 metrics.prefill_times.append(result['prefill_time'])

                 if result['generation_time'] > 0:
-                    metrics.decode_times.append(result['generation_time'] / 50)
+                    metrics.decode_times.append(result['generation_time'] / 50)

                 per_sample_records.append({
                     'benchmark': 'longbench',
@@ -1022,7 +927,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
                 })

         else:
-            # Standard WikiText perplexity evaluation with existing compression
             for idx in range(config.eval_samples):
                 logger.info(f"Sample {idx+1}/{config.eval_samples}")

@@ -1036,12 +940,10 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
                 else:
                     cache_manager = None

-                # Use safe tokenization
                 inputs = safe_tokenize(tokenizer, text, max_length=min(config.prefill_length, 1024))
                 input_ids = inputs.input_ids.to(device)
                 attention_mask = inputs.attention_mask.to(device)

-                # Apply unified compression pipeline
                 compression_result = apply_compression_pipeline(
                     model, tokenizer, input_ids, attention_mask, cache_manager, config
                 )
@@ -1057,7 +959,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
                     prefill_perplexity = np.exp(compression_result['prefill_loss'])
                     metrics.prefill_perplexities.append(min(prefill_perplexity, 1000))

-                # Generation phase with timing
                 generated_ids = input_ids.clone()
                 decode_times = []
                 generation_losses = []
@@ -1110,7 +1011,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
         metrics.calculate_statistics(config)
         all_metrics.append(metrics)

-    # Aggregate results across seeds
     final_metrics = BenchmarkMetrics()
     for m in all_metrics:
         final_metrics.prefill_times.extend(m.prefill_times)
@@ -1128,7 +1028,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t

     final_metrics.calculate_statistics(config)

-    # Summary
     end_time = datetime.now().isoformat()
     summary = {
         'compression_type': config.compression_type.value,
@@ -1142,7 +1041,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
         'end_time': end_time
     }

-    # Add benchmark-specific metrics
     if config.benchmark_type == "niah" and final_metrics.niah_retrieval_accuracy:
         summary['niah_accuracy'] = float(np.mean(final_metrics.niah_retrieval_accuracy))
     elif config.benchmark_type == "ruler" and final_metrics.ruler_exact_match:
@@ -1155,7 +1053,6 @@ def run_research_benchmark(model_name: str, config: CompressionConfig, dataset_t
         summary['prefill_perplexity'] = final_metrics.prefill_perplexity_mean
         summary['generation_perplexity'] = final_metrics.generation_perplexity_mean

-    # Always add timing and memory metrics
     summary['prefill_time_ms'] = final_metrics.prefill_time_mean * 1000
     summary['decode_time_ms'] = final_metrics.decode_time_per_token_mean_ms
     summary['throughput_tokens_sec'] = final_metrics.decode_tokens_per_sec
@@ -1253,7 +1150,6 @@ def verify_proof_bundle(bundle_root: str, config: CompressionConfig, proving: Pr
     recomputed = {}
     failures = []

-    # Verify based on benchmark type
     if config.benchmark_type == "niah":
         if "niah_accuracy" in summary:
             recomputed["niah_accuracy"] = mean_of("accuracy")
@@ -1267,13 +1163,11 @@ def verify_proof_bundle(bundle_root: str, config: CompressionConfig, proving: Pr
         if "longbench_accuracy" in summary:
             recomputed["longbench_accuracy"] = mean_of("accuracy")
     elif config.benchmark_type == "wikitext":
-        # WikiText benchmark metrics
         if "prefill_perplexity" in summary:
             recomputed["prefill_perplexity"] = mean_of("prefill_perplexity")
         if "generation_perplexity" in summary:
             recomputed["generation_perplexity"] = mean_of("generation_perplexity")

-    # Always verify compression metrics
     recomputed["compression_ratio"] = mean_of("compression_ratio")
     recomputed["kv_cache_memory_mb"] = mean_of("kv_cache_memory_mb")

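Note (illustrative, not part of the committed file): the revised `safe_generate` now returns only the newly generated text, and an empty string on failure, which is what the benchmark evaluators score against with substring checks. A minimal standalone sketch of that contract, assuming a stock `gpt2` checkpoint purely for illustration:

```python
# Sketch of the safe_generate contract: decode only the continuation,
# return "" on any generation failure so accuracy checks stay well defined.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def generate_text(model, tokenizer, prompt, max_new_tokens=20):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    inputs = tokenizer(prompt, return_tensors="pt")
    try:
        with torch.no_grad():
            output = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
            )
        # Mirror output[:, input_ids.shape[1]:] from the patched helper.
        return tokenizer.decode(output[0][inputs.input_ids.shape[1]:],
                                skip_special_tokens=True)
    except Exception:
        return ""  # a failed generation is simply scored as a miss


if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("gpt2")
    lm = AutoModelForCausalLM.from_pretrained("gpt2")
    print(repr(generate_text(lm, tok, "The secret password is alpha. The secret password is")))
```

Returning "" rather than None keeps the `in generated` checks in evaluate_niah, evaluate_ruler, and evaluate_scbench valid even when generation raises.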
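For reference, the cache bookkeeping that `apply_compression_pipeline` now guards (computing `compression_ratio = original / compressed` only when the compressed size is positive) reduces to byte counting over the legacy KV tuple. A self-contained sketch with made-up tensor shapes and a stand-in value for `cache_manager.get_memory_footprint()`:

```python
# Sketch of the KV-cache size accounting; shapes and the 4x stand-in
# compression factor are arbitrary assumptions for illustration.
import torch


def cache_nbytes(kv_tuple):
    """Total bytes held by a legacy-format cache: tuple of (keys, values) per layer."""
    total = 0
    for keys, values in kv_tuple:
        if keys is not None and values is not None:
            total += keys.nelement() * keys.element_size()
            total += values.nelement() * values.element_size()
    return total


# Two dummy fp16 layers (batch=1, heads=12, seq=128, head_dim=64).
kv = tuple(
    (torch.zeros(1, 12, 128, 64, dtype=torch.float16),
     torch.zeros(1, 12, 128, 64, dtype=torch.float16))
    for _ in range(2)
)
original = cache_nbytes(kv)
compressed = original // 4  # stand-in for the cache manager's reported footprint
ratio = original / compressed if compressed > 0 else 1.0
print(f"original={original} B, compressed={compressed} B, ratio={ratio:.1f}x")
```

The `compressed > 0` guard is the same check the patch adds before dividing, so a failed or empty compression step degrades to a reported ratio of 1.0 instead of raising.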