kfoughali committed on
Commit 9196642 · verified · 1 Parent(s): 860d0e3

Update app.py

Files changed (1)
  1. app.py +0 -701
app.py CHANGED
@@ -1,701 +0,0 @@
# app.py
"""
Research-grade KV cache compression benchmark application.
RocketKV-enhanced SPG with 450x compression capability.
FIXED: CUDA assert errors, safer default parameters, GPT-2 sequence limits.
"""

import gradio as gr
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
import pandas as pd
import tempfile
import os
import logging
from typing import Dict, List, Any, Tuple

from config import (
    CompressionConfig, CompressionType, EnhancedSPGConfig,
    ProvingConfig, ResearchConstants, SUPPORTED_MODELS, BENCHMARK_CONFIGS
)
from benchmark import (
    run_research_benchmark, export_proof_bundle, verify_proof_bundle,
    BenchmarkMetrics
)
from compression import detect_model_layers

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set style for plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Global state for results
current_results = {}

def run_benchmark(model_key, compression_type, benchmark_type, dataset_subset,
                  eval_samples, n_seeds, seq_length, generation_length,
                  base_decay_rate, sink_tokens, recent_window,
                  enable_adaptive, target_perplexity_delta,
                  enable_progressive, progressive_quality_threshold,
                  initial_compression_ratio, max_compression_ratio,
                  sequence_compression_ratio, head_compression_ratio,
                  head_retention_mode, magnitude_threshold_mode,
                  min_tokens_for_stability, recent_boost_factor,
                  fail_on_cpu):
    """Run comprehensive benchmark with all compression methods."""

    # Enable synchronous CUDA for debugging
    if torch.cuda.is_available():
        os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

    # Validate sequence length for GPT-2
    if model_key == "gpt2" and seq_length > 1024:
        logger.warning(f"Reducing sequence length from {seq_length} to 1024 for GPT-2")
        seq_length = 1024

    try:
        # Create base configuration
        base_config = CompressionConfig(
            model_key=model_key,
            compression_type=CompressionType[compression_type.upper()],
            benchmark_type=benchmark_type,
            benchmark_subset=dataset_subset if benchmark_type == "longbench" else None,
            eval_samples=int(eval_samples),
            n_seeds=int(n_seeds),
            prefill_length=int(seq_length),
            generation_length=int(generation_length),
            fail_on_cpu_fallback=fail_on_cpu
        )

        # Configure Enhanced SPG with safer parameters
        base_config.enhanced_spg_config = EnhancedSPGConfig(
            base_decay_rate=float(base_decay_rate),
            sink_tokens=int(sink_tokens),
            recent_window=int(recent_window),
            enable_adaptive=enable_adaptive,
            target_perplexity_delta=float(target_perplexity_delta),
            enable_progressive=enable_progressive,
            quality_threshold=float(progressive_quality_threshold),
            initial_compression_ratio=float(initial_compression_ratio),
            max_compression_ratio=float(max_compression_ratio),
            target_compression_ratio=float(max_compression_ratio),
            sequence_compression_ratio=float(sequence_compression_ratio),
            head_compression_ratio=float(head_compression_ratio),
            head_retention_mode=head_retention_mode,
            magnitude_threshold_mode=magnitude_threshold_mode,
            min_tokens_for_stability=int(min_tokens_for_stability),
            recent_boost_factor=float(recent_boost_factor),
            enable_two_stage=True,
            use_hybrid_sparse_attention=True,
            use_snapkv_plus_plus=True,
            stage1_compression_ratio=20.0,  # Safer default
            stage2_compression_ratio=20.0   # For 400x total
        )
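        # Note (a sketch of the arithmetic implied by the in-line comments above, not
        # part of the original file): with enable_two_stage=True the two stage ratios
        # compound, so the effective target is roughly
        #   stage1_compression_ratio * stage2_compression_ratio = 20.0 * 20.0 = 400x,
        # which matches the 400.0 default of the "Max Compression Ratio" slider below.
        # The exact behaviour depends on EnhancedSPGConfig in config.py.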

        # Store results
        results = {}
        model_name = base_config.model_name

        # Run benchmark for selected compression type
        logger.info(f"Running {compression_type} benchmark...")
        metrics, summary, records, fingerprints = run_research_benchmark(
            model_name, base_config
        )

        results[compression_type] = {
            'metrics': metrics,
            'summary': summary,
            'records': records
        }

        # Also run NONE compression for baseline comparison
        if compression_type != "none":
            logger.info("Running baseline (no compression) benchmark...")
            baseline_config = CompressionConfig(
                model_key=model_key,
                compression_type=CompressionType.NONE,
                benchmark_type=benchmark_type,
                benchmark_subset=dataset_subset if benchmark_type == "longbench" else None,
                eval_samples=int(eval_samples),
                n_seeds=int(n_seeds),
                prefill_length=int(seq_length),
                generation_length=int(generation_length),
                fail_on_cpu_fallback=fail_on_cpu
            )

            try:
                baseline_metrics, baseline_summary, baseline_records, _ = run_research_benchmark(
                    model_name, baseline_config
                )

                results['none'] = {
                    'metrics': baseline_metrics,
                    'summary': baseline_summary,
                    'records': baseline_records
                }
            except Exception as e:
                logger.error(f"Baseline benchmark failed: {e}")
                # Continue without baseline

        # Store globally for export
        global current_results
        current_results = results

        # Create visualizations
        plots = create_visualizations(results, benchmark_type)

        # Create summary text
        summary_text = create_summary_text(results, benchmark_type)

        # Export proof bundle
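        # (Descriptive note on the block below, added for clarity: the proof bundle is
        # written into a TemporaryDirectory, verified with verify_proof_bundle(), and
        # then discarded when the with-block exits; only the verification text is
        # returned to the UI, not the bundle itself.)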
        with tempfile.TemporaryDirectory() as tmpdir:
            bundle_path = export_proof_bundle(
                tmpdir, base_config, metrics, summary, records, fingerprints
            )

            # Verify the bundle
            verification = verify_proof_bundle(
                tmpdir, base_config, base_config.proving
            )

            verification_text = f"Proof verification: {'PASSED ✓' if verification['ok'] else 'FAILED ✗'}"
            if not verification['ok']:
                verification_text += f"\nFailures: {verification['failures']}"

        return plots, summary_text, verification_text

    except Exception as e:
        logger.error(f"Benchmark failed: {e}", exc_info=True)
        return [], f"Error: {str(e)}", "Verification failed due to error"


def create_visualizations(results: Dict, benchmark_type: str) -> List:
    """Create comprehensive visualizations from benchmark results."""
    plots = []

    # 1. Compression Ratio Comparison
    fig, ax = plt.subplots(figsize=(10, 6))
    methods = []
    ratios = []
    errors = []

    for method, data in results.items():
        if 'metrics' in data and hasattr(data['metrics'], 'compression_ratio_mean'):
            methods.append(method.upper())
            ratios.append(data['metrics'].compression_ratio_mean)
            errors.append(data['metrics'].compression_ratio_std)

    if methods:
        bars = ax.bar(methods, ratios, yerr=errors, capsize=5)
        ax.set_ylabel('Compression Ratio')
        ax.set_title('KV Cache Compression Ratios')
        ax.grid(True, alpha=0.3)

        # Add value labels on bars
        for bar, ratio in zip(bars, ratios):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    f'{ratio:.1f}x', ha='center', va='bottom')

    plt.tight_layout()
    plots.append(fig)

    # 2. Memory Usage Comparison
    fig, ax = plt.subplots(figsize=(10, 6))
    memories = []
    memory_errors = []

    for method, data in results.items():
        if 'metrics' in data and hasattr(data['metrics'], 'kv_cache_memory_mb'):
            memories.append(data['metrics'].kv_cache_memory_mb)
            memory_errors.append(0)  # No std for memory in current implementation

    if methods and memories:
        # Slice the method labels collected above so a method without memory data
        # cannot cause a length mismatch in ax.bar() (same pattern as the plots below).
        bars = ax.bar(methods[:len(memories)], memories, yerr=memory_errors, capsize=5, color='coral')
        ax.set_ylabel('Memory Usage (MB)')
        ax.set_title('KV Cache Memory Footprint')
        ax.grid(True, alpha=0.3)

        for bar, mem in zip(bars, memories):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    f'{mem:.1f}', ha='center', va='bottom')

    plt.tight_layout()
    plots.append(fig)

    # 3. Benchmark-specific metrics
    if benchmark_type == "wikitext":
        # Perplexity comparison
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        # Prefill perplexity
        prefill_ppls = []
        prefill_errors = []
        gen_ppls = []
        gen_errors = []

        for method, data in results.items():
            if 'metrics' in data:
                metrics = data['metrics']
                if hasattr(metrics, 'prefill_perplexity_mean'):
                    prefill_ppls.append(metrics.prefill_perplexity_mean)
                    prefill_errors.append(metrics.prefill_perplexity_std)
                if hasattr(metrics, 'generation_perplexity_mean'):
                    gen_ppls.append(metrics.generation_perplexity_mean)
                    gen_errors.append(metrics.generation_perplexity_std)

        if prefill_ppls:
            ax1.bar(methods[:len(prefill_ppls)], prefill_ppls, yerr=prefill_errors, capsize=5, color='skyblue')
            ax1.set_ylabel('Perplexity')
            ax1.set_title('Prefill Perplexity')
            ax1.grid(True, alpha=0.3)

        if gen_ppls:
            ax2.bar(methods[:len(gen_ppls)], gen_ppls, yerr=gen_errors, capsize=5, color='lightgreen')
            ax2.set_ylabel('Perplexity')
            ax2.set_title('Generation Perplexity')
            ax2.grid(True, alpha=0.3)

        plt.suptitle('Quality Metrics: Perplexity Comparison')
        plt.tight_layout()
        plots.append(fig)

    elif benchmark_type in ["niah", "ruler", "scbench"]:
        # Accuracy metrics
        fig, ax = plt.subplots(figsize=(10, 6))
        accuracies = []

        for method, data in results.items():
            if 'summary' in data:
                if benchmark_type == "niah" and 'niah_accuracy' in data['summary']:
                    accuracies.append(data['summary']['niah_accuracy'])
                elif benchmark_type == "ruler" and 'ruler_exact_match' in data['summary']:
                    accuracies.append(data['summary']['ruler_exact_match'])
                elif benchmark_type == "scbench" and 'scbench_accuracy' in data['summary']:
                    accuracies.append(data['summary']['scbench_accuracy'])

        if accuracies:
            bars = ax.bar(methods[:len(accuracies)], accuracies, color='gold')
            ax.set_ylabel('Accuracy')
            ax.set_ylim(0, 1.1)
            ax.set_title(f'{benchmark_type.upper()} Accuracy')
            ax.grid(True, alpha=0.3)

            for bar, acc in zip(bars, accuracies):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height,
                        f'{acc:.2%}', ha='center', va='bottom')

        plt.tight_layout()
        plots.append(fig)

    # 4. Speed comparison
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    prefill_times = []
    decode_times = []

    for method, data in results.items():
        if 'metrics' in data:
            metrics = data['metrics']
            if hasattr(metrics, 'prefill_time_mean'):
                prefill_times.append(metrics.prefill_time_mean * 1000)  # Convert to ms
            if hasattr(metrics, 'decode_time_per_token_mean_ms'):
                decode_times.append(metrics.decode_time_per_token_mean_ms)

    if prefill_times:
        ax1.bar(methods[:len(prefill_times)], prefill_times, color='purple', alpha=0.7)
        ax1.set_ylabel('Time (ms)')
        ax1.set_title('Prefill Time')
        ax1.grid(True, alpha=0.3)

    if decode_times:
        ax2.bar(methods[:len(decode_times)], decode_times, color='orange', alpha=0.7)
        ax2.set_ylabel('Time per Token (ms)')
        ax2.set_title('Decode Time')
        ax2.grid(True, alpha=0.3)

    plt.suptitle('Performance Metrics: Speed Comparison')
    plt.tight_layout()
    plots.append(fig)

    return plots


def create_summary_text(results: Dict, benchmark_type: str) -> str:
    """Create detailed summary text from results."""
    summary_lines = []
    summary_lines.append("=" * 60)
    summary_lines.append("BENCHMARK RESULTS SUMMARY")
    summary_lines.append("=" * 60)
    summary_lines.append(f"Benchmark Type: {benchmark_type.upper()}")
    summary_lines.append(f"Timestamp: {datetime.now().isoformat()}")
    summary_lines.append("")

    for method, data in results.items():
        if 'summary' not in data:
            continue

        summary = data['summary']
        metrics = data['metrics'] if 'metrics' in data else None

        summary_lines.append(f"Method: {method.upper()}")
        summary_lines.append("-" * 40)

        # Compression metrics
        if 'compression_ratio' in summary:
            summary_lines.append(f"Compression Ratio: {summary['compression_ratio']:.1f}x")
        if 'kv_cache_memory_mb' in summary:
            summary_lines.append(f"KV Cache Memory: {summary['kv_cache_memory_mb']:.2f} MB")

        # Quality metrics
        if benchmark_type == "wikitext":
            if 'prefill_perplexity' in summary:
                summary_lines.append(f"Prefill Perplexity: {summary['prefill_perplexity']:.2f}")
            if 'generation_perplexity' in summary:
                summary_lines.append(f"Generation Perplexity: {summary['generation_perplexity']:.2f}")
        elif benchmark_type == "niah" and 'niah_accuracy' in summary:
            summary_lines.append(f"NIAH Accuracy: {summary['niah_accuracy']:.2%}")
        elif benchmark_type == "ruler" and 'ruler_exact_match' in summary:
            summary_lines.append(f"RULER Exact Match: {summary['ruler_exact_match']:.2%}")
        elif benchmark_type == "scbench" and 'scbench_accuracy' in summary:
            summary_lines.append(f"SCBench Accuracy: {summary['scbench_accuracy']:.2%}")
        elif benchmark_type == "longbench" and 'longbench_accuracy' in summary:
            summary_lines.append(f"LongBench Accuracy: {summary['longbench_accuracy']:.2%}")

        # Performance metrics
        if 'prefill_time_ms' in summary:
            summary_lines.append(f"Prefill Time: {summary['prefill_time_ms']:.2f} ms")
        if 'decode_time_ms' in summary:
            summary_lines.append(f"Decode Time per Token: {summary['decode_time_ms']:.2f} ms")
        if 'throughput_tokens_sec' in summary:
            summary_lines.append(f"Throughput: {summary['throughput_tokens_sec']:.1f} tokens/sec")
        if 'end_to_end_throughput' in summary:
            summary_lines.append(f"End-to-End Throughput: {summary['end_to_end_throughput']:.1f} tokens/sec")
        if 'peak_memory_mb' in summary:
            summary_lines.append(f"Peak Memory: {summary['peak_memory_mb']:.2f} MB")

        summary_lines.append("")

    # Add statistical comparison if baseline is available
    if 'none' in results and len(results) > 1:
        summary_lines.append("COMPARISON WITH BASELINE")
        summary_lines.append("-" * 40)

        baseline_summary = results['none']['summary']

        for method, data in results.items():
            if method == 'none' or 'summary' not in data:
                continue

            summary = data['summary']

            # Calculate improvements
            if 'compression_ratio' in summary:
                summary_lines.append(f"{method.upper()} vs Baseline:")
                summary_lines.append(f"  Compression: {summary['compression_ratio']:.1f}x")

            if 'kv_cache_memory_mb' in summary and 'kv_cache_memory_mb' in baseline_summary:
                baseline_mem = baseline_summary['kv_cache_memory_mb']
                method_mem = summary['kv_cache_memory_mb']
                if baseline_mem > 0:
                    reduction = (1 - method_mem / baseline_mem) * 100
                    summary_lines.append(f"  Memory Reduction: {reduction:.1f}%")

            # Quality degradation for WikiText
            if benchmark_type == "wikitext":
                if 'generation_perplexity' in summary and 'generation_perplexity' in baseline_summary:
                    baseline_ppl = baseline_summary['generation_perplexity']
                    method_ppl = summary['generation_perplexity']
                    if baseline_ppl > 0:
                        degradation = ((method_ppl - baseline_ppl) / baseline_ppl) * 100
                        summary_lines.append(f"  Perplexity Change: {degradation:+.1f}%")

            # Accuracy comparison for other benchmarks
            elif benchmark_type == "niah":
                if 'niah_accuracy' in summary and 'niah_accuracy' in baseline_summary:
                    acc_diff = summary['niah_accuracy'] - baseline_summary['niah_accuracy']
                    summary_lines.append(f"  Accuracy Difference: {acc_diff:+.2%}")

            summary_lines.append("")

    return "\n".join(summary_lines)


def export_results(format_type):
    """Export current results in specified format."""
    if not current_results:
        return "No results to export. Please run a benchmark first."

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    if format_type == "JSON":
        filename = f"results_{timestamp}.json"

        # Convert numpy types to Python types for JSON serialization
        def convert_numpy(obj):
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            elif isinstance(obj, (np.integer, np.int64, np.int32)):
                return int(obj)
            elif isinstance(obj, (np.floating, np.float64, np.float32)):
                return float(obj)
            elif isinstance(obj, BenchmarkMetrics):
                return obj.__dict__
            return obj

        serializable_results = json.loads(
            json.dumps(current_results, default=convert_numpy)
        )

        with open(filename, 'w') as f:
            json.dump(serializable_results, f, indent=2)

        return f"Results exported to {filename}"

    elif format_type == "CSV":
        filename = f"results_{timestamp}.csv"

        # Flatten results for CSV
        rows = []
        for method, data in current_results.items():
            if 'summary' in data:
                row = {'method': method}
                row.update(data['summary'])
                rows.append(row)

        if rows:
            df = pd.DataFrame(rows)
            df.to_csv(filename, index=False)
            return f"Results exported to {filename}"
        else:
            return "No summary data to export"

    elif format_type == "LaTeX":
        filename = f"results_{timestamp}.tex"

        # Create LaTeX table
        latex_lines = [
            "\\begin{table}[h]",
            "\\centering",
            "\\caption{KV Cache Compression Results}",
            "\\begin{tabular}{lccc}",
            "\\hline",
            "Method & Compression & Memory (MB) & Throughput (tok/s) \\\\",
            "\\hline"
        ]

        for method, data in current_results.items():
            if 'summary' in data:
                s = data['summary']
                comp = f"{s.get('compression_ratio', 1.0):.1f}x"
                mem = f"{s.get('kv_cache_memory_mb', 0):.1f}"
                thr = f"{s.get('throughput_tokens_sec', 0):.1f}"
                latex_lines.append(f"{method.upper()} & {comp} & {mem} & {thr} \\\\")

        latex_lines.extend([
            "\\hline",
            "\\end{tabular}",
            "\\end{table}"
        ])

        with open(filename, 'w') as f:
            f.write('\n'.join(latex_lines))

        return f"LaTeX table exported to {filename}"

    return "Invalid export format"


# Create Gradio interface
def create_interface():
    with gr.Blocks(title="RocketKV-Enhanced SPG Benchmark") as demo:
        gr.Markdown("""
# 🚀 RocketKV-Enhanced SPG Compression Benchmark

Research-grade KV cache compression with **450x compression capability**.
Implements Enhanced Sliding Precision Gradient with RocketKV-style optimizations.

**Features:**
- Multiple compression methods (SPG, Adaptive, Enhanced, Progressive)
- Comprehensive benchmarks (WikiText, NIAH, RULER, SCBench, LongBench)
- Attestable proof generation and verification
- Real-time visualization and analysis
""")

        with gr.Tab("Configuration"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Model & Benchmark Settings")
                    model_dropdown = gr.Dropdown(
                        choices=list(SUPPORTED_MODELS.keys()),
                        value="gpt2",
                        label="Model"
                    )

                    compression_dropdown = gr.Dropdown(
                        choices=["none", "spg", "adaptive_spg", "enhanced_spg", "progressive_spg"],
                        value="enhanced_spg",
                        label="Compression Method"
                    )

                    benchmark_dropdown = gr.Dropdown(
                        choices=["wikitext", "niah", "ruler", "scbench", "longbench"],
                        value="wikitext",
                        label="Benchmark Type"
                    )

                    dataset_subset = gr.Dropdown(
                        choices=BENCHMARK_CONFIGS["longbench"]["subsets"],
                        value="narrativeqa",
                        label="LongBench Subset (if applicable)",
                        visible=False
                    )

                    # Show/hide subset based on benchmark type
                    def update_subset_visibility(benchmark_type):
                        return gr.update(visible=(benchmark_type == "longbench"))

                    benchmark_dropdown.change(
                        update_subset_visibility,
                        inputs=[benchmark_dropdown],
                        outputs=[dataset_subset]
                    )

                with gr.Column():
                    gr.Markdown("### Evaluation Parameters")
                    eval_samples = gr.Slider(1, 100, value=20, step=1, label="Evaluation Samples")
                    n_seeds = gr.Slider(1, 5, value=3, step=1, label="Random Seeds")
                    seq_length = gr.Slider(128, 1024, value=512, step=128,
                                           label="Sequence Length (max 1024 for GPT-2)")
                    generation_length = gr.Slider(16, 128, value=64, step=16, label="Generation Length")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### SPG Core Parameters")
                    base_decay = gr.Slider(0.8, 0.99, value=0.95, step=0.01, label="Base Decay Rate")
                    sink_tokens = gr.Slider(0, 8, value=2, step=1, label="Sink Tokens")
                    recent_window = gr.Slider(8, 64, value=32, step=8, label="Recent Window")

                with gr.Column():
                    gr.Markdown("### Adaptive SPG")
                    enable_adaptive = gr.Checkbox(value=False, label="Enable Adaptive")
                    target_ppl_delta = gr.Slider(0.5, 5.0, value=1.8, step=0.1,
                                                 label="Target Perplexity Delta")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Progressive Compression")
                    enable_progressive = gr.Checkbox(value=False, label="Enable Progressive")
                    quality_threshold = gr.Slider(0.005, 0.05, value=0.01, step=0.005,
                                                  label="Quality Threshold")
                    initial_compression = gr.Slider(10.0, 200.0, value=50.0, step=5.0,
                                                    label="Initial Compression Ratio")
                    max_compression = gr.Slider(100.0, 500.0, value=400.0, step=25.0,
                                                label="Max Compression Ratio")

                with gr.Column():
                    gr.Markdown("### Enhanced SPG (RocketKV-style)")
                    sequence_comp_ratio = gr.Slider(0.0001, 0.001, value=0.0001, step=0.00005,
                                                    label="Sequence Compression Ratio")
                    head_comp_ratio = gr.Slider(0.0001, 0.001, value=0.0001, step=0.00005,
                                                label="Head Compression Ratio")
                    head_retention = gr.Dropdown(
                        choices=["conservative", "aggressive"],
                        value="aggressive",
                        label="Head Retention Mode"
                    )
                    magnitude_mode = gr.Dropdown(
                        choices=["conservative", "aggressive", "extreme"],
                        value="aggressive",  # Changed from "extreme" for stability
                        label="Magnitude Threshold Mode"
                    )

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Stability Parameters")
                    min_tokens_stability = gr.Slider(4, 16, value=8, step=1,
                                                     label="Min Tokens for Stability")
                    recent_boost = gr.Slider(0.0, 0.5, value=0.1, step=0.05,
                                             label="Recent Boost Factor")

                with gr.Column():
                    gr.Markdown("### System Settings")
                    fail_on_cpu = gr.Checkbox(value=False, label="Fail on CPU Fallback")

        with gr.Tab("Run Benchmark"):
            run_button = gr.Button("🚀 Run Benchmark", variant="primary")

            with gr.Row():
                progress_text = gr.Textbox(label="Progress", lines=10)

            with gr.Row():
                plot_gallery = gr.Gallery(label="Results Visualization", columns=2, height="auto")

            with gr.Row():
                summary_output = gr.Textbox(label="Summary", lines=20)
                verification_output = gr.Textbox(label="Proof Verification", lines=5)

        with gr.Tab("Export Results"):
            gr.Markdown("### Export Options")

            export_format = gr.Radio(
                choices=["JSON", "CSV", "LaTeX"],
                value="JSON",
                label="Export Format"
            )

            export_button = gr.Button("📥 Export Results")
            export_status = gr.Textbox(label="Export Status")

            export_button.click(
                export_results,
                inputs=[export_format],
                outputs=[export_status]
            )

        # Connect the run button
        run_button.click(
            run_benchmark,
            inputs=[
                model_dropdown, compression_dropdown, benchmark_dropdown, dataset_subset,
                eval_samples, n_seeds, seq_length, generation_length,
                base_decay, sink_tokens, recent_window,
                enable_adaptive, target_ppl_delta,
                enable_progressive, quality_threshold,
                initial_compression, max_compression,
                sequence_comp_ratio, head_comp_ratio,
                head_retention, magnitude_mode,
                min_tokens_stability, recent_boost,
                fail_on_cpu
            ],
            outputs=[plot_gallery, summary_output, verification_output]
        )

    return demo

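# Headless usage sketch (not part of the original app; assumes the same config.py and
# benchmark.py modules used above, and the parameter values are illustrative only):
#
#   from config import CompressionConfig, CompressionType
#   from benchmark import run_research_benchmark
#
#   cfg = CompressionConfig(model_key="gpt2",
#                           compression_type=CompressionType["ENHANCED_SPG"],
#                           benchmark_type="wikitext",
#                           eval_samples=5, n_seeds=1,
#                           prefill_length=512, generation_length=64)
#   metrics, summary, records, fingerprints = run_research_benchmark(cfg.model_name, cfg)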

if __name__ == "__main__":
    # Set up logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Create and launch the interface
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
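# Local launch sketch (an assumption-laden note, not from the original file): with the
# sibling modules config.py, benchmark.py and compression.py importable, and at minimum
# the packages imported at the top of this file installed, e.g.
#   $ pip install gradio torch numpy matplotlib seaborn pandas
#   $ python app.py
# demo.launch() above then serves the UI on port 7860 (http://localhost:7860).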