Mandark-droid committed on
Commit
24b4390
·
1 Parent(s): 1fc3adb

Add leaderboard components and enhanced data loader

Browse files

- Add HTML table generator with styled leaderboard display
- Add metric display components (badges, bars, formatters)
- Enhance data loader to support both JSON and HuggingFace sources
- Add sample data for local development and testing
- Implement automatic fallback between data sources

components/__init__.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Components package for TraceMind UI
3
+ Contains reusable visual components
4
+ """
5
+
6
+ from .metric_displays import (
7
+ get_rank_badge,
8
+ get_success_rate_bar,
9
+ get_gpu_utilization_bar,
10
+ get_provider_badge,
11
+ get_agent_type_badge,
12
+ get_hardware_badge,
13
+ format_cost,
14
+ format_duration,
15
+ get_tooltip_icon
16
+ )
17
+
18
+ from .leaderboard_table import (
19
+ generate_leaderboard_html,
20
+ generate_empty_state_html,
21
+ generate_filter_summary_html
22
+ )
23
+
24
+ from .thought_graph import create_thought_graph
25
+
26
+ from .analytics_charts import (
27
+ create_performance_heatmap,
28
+ create_speed_accuracy_scatter,
29
+ create_cost_efficiency_scatter,
30
+ create_comparison_radar
31
+ )
32
+
33
+ from .report_cards import (
34
+ generate_leaderboard_summary_card,
35
+ generate_run_report_card,
36
+ download_card_as_png_js
37
+ )
38
+
39
+ __all__ = [
40
+ 'get_rank_badge',
41
+ 'get_success_rate_bar',
42
+ 'get_gpu_utilization_bar',
43
+ 'get_provider_badge',
44
+ 'get_agent_type_badge',
45
+ 'get_hardware_badge',
46
+ 'format_cost',
47
+ 'format_duration',
48
+ 'get_tooltip_icon',
49
+ 'generate_leaderboard_html',
50
+ 'generate_empty_state_html',
51
+ 'generate_filter_summary_html',
52
+ 'create_thought_graph',
53
+ 'create_performance_heatmap',
54
+ 'create_speed_accuracy_scatter',
55
+ 'create_cost_efficiency_scatter',
56
+ 'create_comparison_radar',
57
+ 'generate_leaderboard_summary_card',
58
+ 'generate_run_report_card',
59
+ 'download_card_as_png_js'
60
+ ]
components/leaderboard_table.py ADDED
@@ -0,0 +1,582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Leaderboard HTML Table Generator
3
+ Creates styled HTML tables for the leaderboard view
4
+ """
5
+
6
+ import pandas as pd
7
+ from typing import Optional
8
+ from .metric_displays import (
9
+ get_rank_badge,
10
+ get_success_rate_bar,
11
+ get_gpu_utilization_bar,
12
+ get_provider_badge,
13
+ get_agent_type_badge,
14
+ get_hardware_badge,
15
+ format_cost,
16
+ format_duration,
17
+ get_tooltip_icon
18
+ )
19
+
20
+
21
def generate_leaderboard_html(
    df: pd.DataFrame,
    sort_by: str = "success_rate",
    ascending: bool = False
) -> str:
    """
    Generate styled HTML table for leaderboard

    Args:
        df: Leaderboard DataFrame
        sort_by: Column to sort by (silently falls back to the
            DataFrame's natural order if the column is absent,
            instead of raising KeyError)
        ascending: Sort order (False = descending)

    Returns:
        HTML string with complete styled table

    Expected DataFrame columns:
        - model (str): Model name
        - agent_type (str): tool, code, or both
        - provider (str): litellm or transformers
        - success_rate (float): 0-100
        - total_tests (int): Number of tests
        - avg_duration_ms (float): Average duration
        - total_cost_usd (float): Total cost
        - co2_emissions_g (float): CO2 emissions
        - gpu_utilization_avg (float, optional): GPU utilization %
        - submitted_by (str): Username
    """
    # Local imports: `escape` guards user-supplied text before it is
    # interpolated into HTML; `datetime` was previously imported inside
    # the per-row loop — hoisted here.
    from html import escape
    from datetime import datetime

    # Sort dataframe; tolerate an unknown sort column rather than raising.
    if sort_by in df.columns:
        df_sorted = df.sort_values(by=sort_by, ascending=ascending).reset_index(drop=True)
    else:
        df_sorted = df.reset_index(drop=True)

    # Start HTML with embedded CSS
    html = """
    <style>
    /* Leaderboard Table Styles */
    .tm-leaderboard-container {
        background: #F8FAFC; /* Light background for better readability */
        border-radius: 16px;
        overflow-x: auto; /* Enable horizontal scrolling */
        overflow-y: visible;
        border: 1px solid rgba(203, 213, 225, 0.8);
        margin: 20px 0;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        max-width: 100%;
    }

    /* Custom scrollbar styling */
    .tm-leaderboard-container::-webkit-scrollbar {
        height: 8px;
    }

    .tm-leaderboard-container::-webkit-scrollbar-track {
        background: #E2E8F0;
        border-radius: 4px;
    }

    .tm-leaderboard-container::-webkit-scrollbar-thumb {
        background: #94A3B8;
        border-radius: 4px;
    }

    .tm-leaderboard-container::-webkit-scrollbar-thumb:hover {
        background: #64748B;
    }

    .tm-leaderboard-table {
        width: 100%;
        min-width: 1650px; /* Reduced from 1800px after combining columns */
        border-collapse: collapse;
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
        background: #FFFFFF; /* White background */
        color: #0F172A; /* Dark text for maximum contrast */
    }

    .tm-leaderboard-table thead {
        background: linear-gradient(135deg, #6366F1 0%, #4F46E5 100%); /* Vibrant indigo gradient */
        position: sticky;
        top: 0;
        z-index: 10;
        backdrop-filter: blur(10px);
    }

    .tm-leaderboard-table th {
        padding: 16px 12px;
        text-align: left;
        font-weight: 600;
        color: #FFFFFF; /* Pure white for headers - good contrast */
        border-bottom: 2px solid #4338CA;
        font-size: 12px;
        text-transform: uppercase;
        letter-spacing: 0.05em;
        white-space: nowrap;
    }

    .tm-leaderboard-table td {
        padding: 14px 12px;
        border-bottom: 1px solid rgba(226, 232, 240, 0.8);
        color: #1E293B; /* Dark text for cells */
        font-size: 14px;
        vertical-align: middle;
    }

    .tm-leaderboard-table tbody tr {
        transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
        cursor: pointer;
    }

    .tm-leaderboard-table tbody tr:hover {
        background: rgba(99, 102, 241, 0.08) !important;
        box-shadow: 0 0 15px rgba(99, 102, 241, 0.15),
                    inset 0 0 15px rgba(99, 102, 241, 0.05);
        transform: scale(1.002);
    }

    .tm-leaderboard-table tbody tr:nth-child(even) {
        background: rgba(241, 245, 249, 0.6); /* Light stripe */
    }

    .tm-model-name {
        font-weight: 600;
        color: #000000 !important; /* Pure black - readable in all themes */
        font-size: 15px;
        transition: color 0.2s ease;
    }

    .tm-leaderboard-table tr:hover .tm-model-name {
        color: #4F46E5 !important; /* Indigo on hover */
    }

    .tm-numeric-cell {
        font-family: 'Monaco', 'Menlo', monospace;
        font-size: 13px;
        text-align: center;
        color: #000000 !important; /* Pure black for numbers */
    }

    .tm-badge-cell {
        text-align: center;
    }

    .tm-run-id {
        font-family: 'Monaco', 'Menlo', monospace;
        font-size: 12px;
        color: #000000 !important; /* Pure black - readable in all themes */
        cursor: pointer;
        text-decoration: none;
        font-weight: 500;
        transition: all 0.2s ease;
    }

    .tm-run-id:hover {
        color: #4F46E5 !important; /* Indigo on hover */
        text-decoration: underline;
    }

    .tm-text-cell {
        color: #000000 !important; /* Pure black for all text */
        font-size: 0.9em;
    }

    /* Responsive Design */
    @media (max-width: 1024px) {
        .tm-leaderboard-table th,
        .tm-leaderboard-table td {
            padding: 10px 8px;
            font-size: 12px;
        }

        /* Hide less important columns on smaller screens */
        .tm-hide-mobile {
            display: none !important;
        }
    }

    @media (max-width: 768px) {
        .tm-leaderboard-table th:nth-child(n+7),
        .tm-leaderboard-table td:nth-child(n+7) {
            display: none !important;
        }

        .tm-model-name {
            font-size: 13px;
        }
    }

    @media (max-width: 480px) {
        /* Ultra-compact: Show only rank, model, and success rate */
        .tm-leaderboard-table th:nth-child(n+4),
        .tm-leaderboard-table td:nth-child(n+4) {
            display: none !important;
        }

        .tm-leaderboard-table th:nth-child(3),
        .tm-leaderboard-table td:nth-child(3) {
            display: table-cell !important;
        }
    }
    </style>

    <div class="tm-leaderboard-container">
        <table class="tm-leaderboard-table">
            <thead>
                <tr>
                    <th style="width: 60px;">Rank</th>
                    <th style="width: 110px;" title="Click to view detailed run information">Run ID</th>
                    <th style="min-width: 160px;">Model</th>
                    <th style="width: 80px;">Type</th>
                    <th style="width: 90px;">Provider</th>
                    <th style="width: 85px;" title="Hardware used for evaluation: GPU or CPU">Hardware</th>
                    <th style="width: 150px;" title="Percentage of test cases that passed (0-100%). Higher is better.">
                        Success Rate
                    </th>
                    <th style="width: 140px;" class="tm-numeric-cell" title="Tests: Total / Pass / Fail">
                        Tests (P/F)
                    </th>
                    <th style="width: 70px;" class="tm-numeric-cell" title="Average number of steps per test case.">
                        Steps
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell" title="Average time per test case. Lower is better.">
                        Duration
                    </th>
                    <th style="width: 90px;" class="tm-numeric-cell" title="Total tokens used across all tests.">
                        Tokens
                    </th>
                    <th style="width: 90px;" class="tm-numeric-cell" title="Total API + power costs in USD. Lower is better.">
                        Cost
                    </th>
                    <th style="width: 80px;" class="tm-numeric-cell tm-hide-mobile" title="Carbon footprint in grams of CO2 equivalent.">
                        CO2
                    </th>
                    <th style="width: 100px;" class="tm-hide-mobile" title="Average GPU usage during evaluation (0-100%). Only for GPU jobs.">
                        GPU Util
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="GPU memory usage (avg/max in MiB). Only for GPU jobs.">
                        GPU Mem
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="GPU temperature (avg/max in Celsius). Only for GPU jobs.">
                        GPU Temp
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="Average GPU power consumption in Watts. Only for GPU jobs.">
                        GPU Power
                    </th>
                    <th style="width: 140px;" class="tm-hide-mobile">Timestamp</th>
                    <th style="width: 110px;" class="tm-hide-mobile">Submitted By</th>
                </tr>
            </thead>
            <tbody>
    """

    # Generate table rows
    for idx, row in df_sorted.iterrows():
        rank = idx + 1

        # Get values with safe defaults
        model = row.get('model', 'Unknown')
        agent_type = row.get('agent_type', 'unknown')
        provider = row.get('provider', 'unknown')
        success_rate = row.get('success_rate', 0.0)
        total_tests = row.get('total_tests', 0)
        successful_tests = row.get('successful_tests', 0)
        failed_tests = row.get('failed_tests', 0)
        avg_steps = row.get('avg_steps', 0.0)
        avg_duration_ms = row.get('avg_duration_ms', 0.0)
        total_tokens = row.get('total_tokens', 0)
        total_cost_usd = row.get('total_cost_usd', 0.0)
        co2_emissions_g = row.get('co2_emissions_g', 0.0)
        gpu_utilization_avg = row.get('gpu_utilization_avg', None)
        gpu_memory_avg_mib = row.get('gpu_memory_avg_mib', None)
        gpu_memory_max_mib = row.get('gpu_memory_max_mib', None)
        gpu_temperature_avg = row.get('gpu_temperature_avg', None)
        gpu_temperature_max = row.get('gpu_temperature_max', None)
        gpu_power_avg_w = row.get('gpu_power_avg_w', None)
        timestamp = row.get('timestamp', '')
        submitted_by = row.get('submitted_by', 'Unknown')

        # Check if GPU job
        has_gpu = pd.notna(gpu_utilization_avg) and gpu_utilization_avg > 0

        # Format GPU utilization
        if has_gpu:
            gpu_display = get_gpu_utilization_bar(gpu_utilization_avg)
        else:
            gpu_display = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

        # Format CO2
        if pd.notna(co2_emissions_g) and co2_emissions_g > 0:
            co2_display = f'<span style="font-family: monospace; font-size: 0.9em; color: #334155;">{co2_emissions_g:.2f}g</span>'
        else:
            co2_display = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

        # Format GPU Memory
        if pd.notna(gpu_memory_avg_mib) and pd.notna(gpu_memory_max_mib):
            gpu_mem_display = f'<span style="font-family: monospace; font-size: 0.85em; color: #334155;">{gpu_memory_avg_mib:.0f}/{gpu_memory_max_mib:.0f}</span>'
        else:
            gpu_mem_display = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

        # Format GPU Temperature
        if pd.notna(gpu_temperature_avg) and pd.notna(gpu_temperature_max):
            gpu_temp_display = f'<span style="font-family: monospace; font-size: 0.85em; color: #334155;">{gpu_temperature_avg:.0f}/{gpu_temperature_max:.0f}°C</span>'
        else:
            gpu_temp_display = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

        # Format GPU Power
        if pd.notna(gpu_power_avg_w):
            gpu_power_display = f'<span style="font-family: monospace; font-size: 0.85em; color: #334155;">{gpu_power_avg_w:.1f}W</span>'
        else:
            gpu_power_display = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

        # Format timestamp (handle both string and Timestamp objects).
        if pd.notna(timestamp):
            try:
                if isinstance(timestamp, pd.Timestamp):
                    timestamp_display = timestamp.strftime('%Y-%m-%d %H:%M')
                else:
                    dt = datetime.fromisoformat(str(timestamp).replace('Z', '+00:00'))
                    timestamp_display = dt.strftime('%Y-%m-%d %H:%M')
            except Exception:
                # Unparseable value: fall back to a truncated, escaped raw string.
                timestamp_display = escape(str(timestamp)[:16]) if timestamp else 'N/A'
        else:
            timestamp_display = 'N/A'

        # Format Run ID (show first 8 characters). str() guards against
        # non-string values (e.g. NaN floats), which previously crashed len().
        run_id = str(row.get('run_id', 'N/A'))
        run_id_short = run_id[:8] + '...' if len(run_id) > 8 else run_id

        # Escape user-controlled text so it cannot inject markup or break
        # out of the title/data attributes below.
        run_id_attr = escape(run_id)
        run_id_short = escape(run_id_short)
        model_display = escape(str(model))
        submitted_display = escape(str(submitted_by))

        html += f"""
                <tr data-run-id="{run_id_attr}" data-rank="{rank}" class="tm-clickable-row">
                    <td>{get_rank_badge(rank)}</td>
                    <td class="tm-run-id" title="{run_id_attr}">{run_id_short}</td>
                    <td class="tm-model-name">{model_display}</td>
                    <td class="tm-badge-cell">{get_agent_type_badge(agent_type)}</td>
                    <td class="tm-badge-cell">{get_provider_badge(provider)}</td>
                    <td class="tm-badge-cell">{get_hardware_badge(has_gpu)}</td>
                    <td>{get_success_rate_bar(success_rate)}</td>
                    <td class="tm-numeric-cell">
                        <strong>{total_tests}</strong>
                        <span style="color: #CBD5E1; margin: 0 4px;">/</span>
                        <span style="color: #10B981; font-weight: 600;">{successful_tests}</span>
                        <span style="color: #CBD5E1; margin: 0 4px;">/</span>
                        <span style="color: #EF4444; font-weight: 600;">{failed_tests}</span>
                    </td>
                    <td class="tm-numeric-cell">{avg_steps:.1f}</td>
                    <td class="tm-numeric-cell">{format_duration(avg_duration_ms)}</td>
                    <td class="tm-numeric-cell">{total_tokens:,}</td>
                    <td class="tm-numeric-cell">{format_cost(total_cost_usd)}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{co2_display}</td>
                    <td class="tm-hide-mobile">{gpu_display}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{gpu_mem_display}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{gpu_temp_display}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{gpu_power_display}</td>
                    <td class="tm-hide-mobile tm-text-cell">{timestamp_display}</td>
                    <td class="tm-hide-mobile tm-text-cell">
                        {submitted_display}
                    </td>
                </tr>
        """

    html += """
            </tbody>
        </table>
    </div>

    <script>
    // Add click handler for Run ID cells - runs on each table render
    (function() {
        // Function to attach handlers
        function attachRowClickHandlers() {
            const cells = document.querySelectorAll('.tm-run-id');
            console.log('Found', cells.length, 'Run ID cells');

            cells.forEach(function(cell) {
                // Remove existing listener to avoid duplicates
                cell.replaceWith(cell.cloneNode(true));
            });

            // Re-select after cloning
            document.querySelectorAll('.tm-run-id').forEach(function(cell) {
                cell.addEventListener('click', function(e) {
                    e.stopPropagation();
                    const row = this.closest('tr');
                    const rowIndex = Array.from(row.parentNode.children).indexOf(row);

                    console.log('Run ID clicked, row index:', rowIndex);

                    // Try multiple ways to find the textbox
                    let textbox = null;

                    // Method 1: By elem_id
                    const container1 = document.getElementById('selected_row_index');
                    if (container1) {
                        textbox = container1.querySelector('textarea, input[type="text"]');
                        console.log('Method 1 (elem_id):', textbox ? 'Found' : 'Not found');
                    }

                    // Method 2: By data-testid
                    if (!textbox) {
                        const containers = document.querySelectorAll('[data-testid="textbox"]');
                        console.log('Method 2: Found', containers.length, 'textbox containers');
                        for (let container of containers) {
                            const input = container.querySelector('textarea, input[type="text"]');
                            if (input && !container.closest('.label-wrap')) {
                                textbox = input;
                                console.log('Method 2: Using hidden textbox');
                                break;
                            }
                        }
                    }

                    if (textbox) {
                        // Set the row index
                        textbox.value = rowIndex.toString();

                        // Trigger multiple events to ensure Gradio picks it up
                        textbox.dispatchEvent(new Event('input', { bubbles: true }));
                        textbox.dispatchEvent(new Event('change', { bubbles: true }));
                        textbox.dispatchEvent(new Event('blur', { bubbles: true }));

                        // Also try triggering on the container
                        const container = textbox.closest('[data-testid="textbox"]');
                        if (container) {
                            container.dispatchEvent(new Event('input', { bubbles: true }));
                        }

                        console.log('Textbox updated to:', rowIndex);
                    } else {
                        console.error('Could not find hidden textbox!');
                    }
                });
            });
        }

        // Attach immediately
        attachRowClickHandlers();

        // Also attach after a short delay (in case table loads async)
        setTimeout(attachRowClickHandlers, 500);
        setTimeout(attachRowClickHandlers, 1000);
        setTimeout(attachRowClickHandlers, 2000);
    })();
    </script>
    """

    return html
+
468
+
469
def generate_empty_state_html() -> str:
    """
    Build the placeholder markup shown when the leaderboard has no rows.

    Returns:
        HTML string for the empty-state panel.
    """
    # Static markup: an icon, a headline, a hint, and a call-to-action button.
    empty_state = """
    <div style="
        text-align: center;
        padding: 60px 20px;
        background: var(--tm-bg-card, #1E293B);
        border-radius: 16px;
        border: 2px dashed var(--tm-border-default, rgba(148, 163, 184, 0.2));
        margin: 20px 0;
    ">
        <div style="font-size: 48px; margin-bottom: 16px;">📊</div>
        <h3 style="
            color: var(--tm-text-primary, #F1F5F9);
            margin: 0 0 12px 0;
            font-size: 1.5rem;
        ">
            No Evaluation Results Yet
        </h3>
        <p style="
            color: var(--tm-text-secondary, #94A3B8);
            margin: 0 0 24px 0;
            font-size: 1rem;
        ">
            Run your first evaluation to see results appear here.
        </p>
        <button style="
            padding: 12px 24px;
            background: var(--tm-primary, #4F46E5);
            color: white;
            border: none;
            border-radius: 8px;
            font-weight: 600;
            cursor: pointer;
            font-size: 1rem;
        ">
            Start New Evaluation
        </button>
    </div>
    """
    return empty_state
514
+
515
+
516
def generate_filter_summary_html(
    total_runs: int,
    filtered_runs: int,
    active_filters: dict
) -> str:
    """
    Render a small banner summarising the currently active filters.

    Args:
        total_runs: Total number of runs available.
        filtered_runs: Number of runs remaining after filtering.
        active_filters: Mapping of filter name -> selected value;
            entries that are falsy or equal to "All" are ignored.

    Returns:
        HTML string with the filter summary.
    """
    # No filtering in effect: a single "showing everything" line suffices.
    if filtered_runs == total_runs:
        return f"""
        <div style="
            padding: 12px 16px;
            background: var(--tm-bg-secondary, #334155);
            border-radius: 8px;
            margin-bottom: 16px;
            color: var(--tm-text-secondary, #94A3B8);
            font-size: 0.9em;
        ">
            Showing all <strong style="color: var(--tm-text-primary, #F1F5F9);">{total_runs}</strong> evaluation runs
        </div>
        """

    # One chip per non-default filter value.
    chip_template = """
            <span style="
                display: inline-flex;
                align-items: center;
                padding: 4px 10px;
                background: var(--tm-primary, #4F46E5);
                color: white;
                border-radius: 6px;
                font-size: 0.85em;
                margin-right: 8px;
                font-weight: 500;
            ">
                {key}: {value}
            </span>
        """
    filters_html = "".join(
        chip_template.format(key=name, value=choice)
        for name, choice in active_filters.items()
        if choice and choice != "All"
    )

    return f"""
    <div style="
        padding: 12px 16px;
        background: var(--tm-bg-secondary, #334155);
        border-radius: 8px;
        margin-bottom: 16px;
        color: var(--tm-text-secondary, #94A3B8);
        font-size: 0.9em;
    ">
        <div style="margin-bottom: 8px;">
            Showing <strong style="color: var(--tm-text-primary, #F1F5F9);">{filtered_runs}</strong> of
            <strong style="color: var(--tm-text-primary, #F1F5F9);">{total_runs}</strong> runs
        </div>
        {filters_html}
    </div>
    """
components/metric_displays.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Metric Display Components
3
+ Reusable HTML generators for badges, progress bars, and visual metrics
4
+ """
5
+
6
def get_rank_badge(rank: int) -> str:
    """
    Render a rank indicator; the top three positions get medal badges.

    Args:
        rank: Position in leaderboard (1-indexed)

    Returns:
        HTML string for rank badge

    Examples:
        >>> get_rank_badge(1)
        '<span ...>🥇 1st</span>'
    """
    # (label, background gradient, text colour, drop shadow) per medal rank.
    medals = {
        1: ("🥇 1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000", "0 2px 8px rgba(255, 215, 0, 0.4)"),
        2: ("🥈 2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff", "0 2px 8px rgba(156, 163, 175, 0.4)"),
        3: ("🥉 3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff", "0 2px 8px rgba(205, 127, 50, 0.4)"),
    }

    medal = medals.get(rank)
    if medal is None:
        # Ranks 4+ are plain muted numbers.
        return f"""
        <span style="
            display: inline-flex;
            align-items: center;
            justify-content: center;
            min-width: 32px;
            color: var(--tm-text-muted, #64748B);
            font-weight: 500;
            font-size: 0.95em;
        ">
            {rank}
        </span>
        """

    label, gradient, text_color, shadow = medal
    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        justify-content: center;
        min-width: 60px;
        padding: 6px 12px;
        background: {gradient};
        color: {text_color};
        border-radius: 8px;
        font-weight: 700;
        font-size: 0.9em;
        box-shadow: {shadow};
        letter-spacing: 0.5px;
    ">
        {label}
    </span>
    """
60
+
61
+
62
def get_success_rate_bar(success_rate: float) -> str:
    """
    Render a success-rate progress bar whose colour reflects performance.

    Args:
        success_rate: Success percentage (0-100)

    Returns:
        HTML string with progress bar and numeric value

    Color Logic:
        - < 50%: Red → Orange (danger)
        - 50-79%: Orange → Yellow (warning)
        - 80-100%: Green → Cyan (success)
    """
    # The fill width is clamped to [0, 100]; the printed percentage is not.
    fill = min(max(success_rate, 0), 100)

    # Pick the gradient band by threshold.
    if success_rate < 50:
        fill_gradient, glow = "linear-gradient(90deg, #EF4444, #F59E0B)", "#EF4444"
    elif success_rate < 80:
        fill_gradient, glow = "linear-gradient(90deg, #F59E0B, #FBBF24)", "#F59E0B"
    else:
        fill_gradient, glow = "linear-gradient(90deg, #10B981, #06B6D4)", "#10B981"

    return f"""
    <div style="display: flex; align-items: center; gap: 10px; width: 100%;">
        <div style="
            flex: 1;
            height: 8px;
            background: rgba(148, 163, 184, 0.15);
            border-radius: 4px;
            overflow: hidden;
            max-width: 160px;
            position: relative;
        ">
            <div style="
                width: {fill}%;
                height: 100%;
                background: {fill_gradient};
                border-radius: 4px;
                transition: width 0.5s cubic-bezier(0.4, 0, 0.2, 1);
                box-shadow: 0 0 8px {glow}40;
            "></div>
        </div>
        <span style="
            font-family: 'Monaco', 'Menlo', monospace;
            font-weight: 600;
            color: var(--tm-text-primary, #000000);
            min-width: 55px;
            font-size: 0.9em;
        ">{success_rate:.1f}%</span>
    </div>
    """
119
+
120
+
121
def get_gpu_utilization_bar(utilization: float) -> str:
    """
    Render a GPU utilization progress bar.

    Args:
        utilization: GPU utilization percentage (0-100)

    Returns:
        HTML string with progress bar

    Color Logic:
        - < 30%: Low utilization (yellow/amber)
        - 30-70%: Medium utilization (orange)
        - > 70%: High utilization (red/orange) - good efficiency!
    """
    # Clamp only the bar width, not the printed percentage.
    fill = min(max(utilization, 0), 100)

    # Warmer colours for higher (more efficient) utilization.
    if utilization < 30:
        fill_gradient = "linear-gradient(90deg, #FBBF24, #F59E0B)"
    elif utilization < 70:
        fill_gradient = "linear-gradient(90deg, #F59E0B, #FB923C)"
    else:
        fill_gradient = "linear-gradient(90deg, #FB923C, #F97316)"

    return f"""
    <div style="display: flex; align-items: center; gap: 8px;">
        <div style="
            flex: 1;
            height: 6px;
            background: rgba(148, 163, 184, 0.15);
            border-radius: 3px;
            max-width: 100px;
        ">
            <div style="
                width: {fill}%;
                height: 100%;
                background: {fill_gradient};
                border-radius: 3px;
                transition: width 0.4s ease;
            "></div>
        </div>
        <span style="
            font-family: monospace;
            font-size: 0.85em;
            color: var(--tm-text-secondary, #000000);
            min-width: 45px;
        ">{utilization:.1f}%</span>
    </div>
    """
171
+
172
+
173
def get_provider_badge(provider: str) -> str:
    """
    Render a coloured badge for the inference provider.

    Args:
        provider: Provider name (litellm, transformers, etc.)

    Returns:
        HTML string for colored badge

    Colors:
        - litellm: Blue (API providers)
        - transformers: Green (GPU/local models)
    """
    # Known providers get brand-ish colours; everything else falls back to gray.
    palette = {
        "litellm": "#3B82F6",
        "transformers": "#10B981",
        "openai": "#10A37F",
        "anthropic": "#D97757",
    }
    background = palette.get(provider.lower(), "#6B7280")

    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        padding: 4px 10px;
        background: {background};
        color: white;
        border-radius: 5px;
        font-size: 0.75em;
        font-weight: 600;
        text-transform: uppercase;
        letter-spacing: 0.5px;
        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
    ">
        {provider.upper()}
    </span>
    """
213
+
214
+
215
def get_agent_type_badge(agent_type: str) -> str:
    """
    Render a coloured badge for the agent type.

    Args:
        agent_type: Agent type (tool, code, both)

    Returns:
        HTML string for colored badge

    Colors:
        - tool: Purple
        - code: Amber/Orange
        - both: Cyan
    """
    # Known agent types map to fixed colours; unknown values fall back to gray.
    palette = {
        "tool": "#8B5CF6",
        "code": "#F59E0B",
        "both": "#06B6D4",
    }
    background = palette.get(agent_type.lower(), "#6B7280")

    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        padding: 4px 10px;
        background: {background};
        color: white;
        border-radius: 5px;
        font-size: 0.75em;
        font-weight: 600;
        text-transform: uppercase;
        letter-spacing: 0.5px;
        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
    ">
        {agent_type.upper()}
    </span>
    """
255
+
256
+
257
def get_hardware_badge(has_gpu: bool) -> str:
    """
    Render a GPU/CPU hardware badge.

    Args:
        has_gpu: Whether job used GPU

    Returns:
        HTML string for badge
    """
    # GPU badge uses a warm gradient; CPU is a flat gray.
    if has_gpu:
        background = "linear-gradient(135deg, #F59E0B, #EF4444)"
        label = "🖥️ GPU"
    else:
        background = "#6B7280"
        label = "💻 CPU"

    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        gap: 4px;
        padding: 4px 10px;
        background: {background};
        color: white;
        border-radius: 5px;
        font-size: 0.75em;
        font-weight: 600;
        letter-spacing: 0.5px;
        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
    ">
        {label}
    </span>
    """
303
+
304
+
305
def format_cost(cost_usd: float) -> str:
    """
    Render a dollar amount with a colour that reflects its magnitude.

    Args:
        cost_usd: Cost in USD

    Returns:
        HTML string with formatted cost
    """
    # Green under a cent, amber under five cents, red otherwise.
    if cost_usd < 0.01:
        tone = "#10B981"
    elif cost_usd < 0.05:
        tone = "#F59E0B"
    else:
        tone = "#EF4444"

    return f"""
    <span style="
        font-family: monospace;
        font-weight: 600;
        color: {tone};
        font-size: 0.9em;
    ">
        ${cost_usd:.4f}
    </span>
    """
333
+
334
+
335
def format_duration(duration_ms: float) -> str:
    """
    Render a duration with a magnitude-appropriate unit and colour.

    Args:
        duration_ms: Duration in milliseconds

    Returns:
        HTML string with formatted duration
    """
    # Under 1s stays in ms (green); 1-10s in seconds (amber);
    # 10s+ also in seconds but flagged red as slow.
    if duration_ms < 1000:
        value, unit, tone = duration_ms, "ms", "#10B981"
    elif duration_ms < 10000:
        value, unit, tone = duration_ms / 1000, "s", "#F59E0B"
    else:
        value, unit, tone = duration_ms / 1000, "s", "#EF4444"

    return f"""
    <span style="
        font-family: monospace;
        color: {tone};
        font-weight: 500;
        font-size: 0.9em;
    ">
        {value:.1f}{unit}
    </span>
    """
368
+
369
+
370
def get_tooltip_icon(tooltip_text: str) -> str:
    """
    Generate info icon with tooltip

    Args:
        tooltip_text: Text to show in tooltip. Escaped before being
            placed in the title attribute, so quotes or angle brackets
            in the text can no longer break out of the markup.

    Returns:
        HTML string for icon with tooltip
    """
    # Local import keeps the module's top-level dependencies unchanged.
    from html import escape

    # Previously the raw text was interpolated directly into title="...";
    # a double quote in the text terminated the attribute (HTML injection).
    safe_text = escape(tooltip_text)
    return f"""
    <span title="{safe_text}" style="
        color: var(--tm-secondary, #06B6D4);
        cursor: help;
        font-size: 0.9em;
        margin-left: 4px;
    ">ⓘ</span>
    """
data_loader.py CHANGED
@@ -1,255 +1,430 @@
1
  """
2
- Data Loader for TraceMind-AI
3
- Loads real data from HuggingFace datasets (not mock data)
4
  """
5
 
6
  import os
7
- from typing import Optional, Dict, Any, List
 
 
8
  import pandas as pd
9
  from datasets import load_dataset
10
- from dotenv import load_dotenv
 
11
 
12
- # Load environment variables
13
- load_dotenv()
14
 
 
15
 
16
- class TraceMindDataLoader:
17
- """Loads evaluation data from HuggingFace datasets"""
 
 
 
 
 
 
 
 
 
18
 
19
  def __init__(
20
  self,
21
- leaderboard_repo: Optional[str] = None,
 
 
22
  hf_token: Optional[str] = None
23
  ):
24
- """
25
- Initialize data loader
 
 
26
 
27
- Args:
28
- leaderboard_repo: HuggingFace dataset repo for leaderboard
29
- hf_token: HuggingFace API token for private datasets
30
- """
31
- self.leaderboard_repo = leaderboard_repo or os.getenv(
32
- 'LEADERBOARD_REPO',
33
- 'kshitijthakkar/smoltrace-leaderboard'
34
- )
35
- self.hf_token = hf_token or os.getenv('HF_TOKEN')
36
-
37
- # Cache for loaded datasets
38
- self._leaderboard_df: Optional[pd.DataFrame] = None
39
- self._results_cache: Dict[str, pd.DataFrame] = {}
40
- self._traces_cache: Dict[str, List[Dict]] = {}
41
- self._metrics_cache: Dict[str, Dict] = {}
42
-
43
- def load_leaderboard(self, force_refresh: bool = False) -> pd.DataFrame:
44
- """
45
- Load leaderboard dataset from HuggingFace
46
 
47
- Args:
48
- force_refresh: Force reload from HF (ignore cache)
 
49
 
50
  Returns:
51
  DataFrame with leaderboard data
52
  """
53
- if self._leaderboard_df is not None and not force_refresh:
54
- return self._leaderboard_df
55
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  try:
57
- print(f"📊 Loading leaderboard from {self.leaderboard_repo}...")
 
 
 
 
 
 
58
 
59
- # Load dataset from HuggingFace
60
- dataset = load_dataset(
61
- self.leaderboard_repo,
62
- split='train',
63
- token=self.hf_token
64
- )
65
 
66
- # Convert to DataFrame
67
- self._leaderboard_df = pd.DataFrame(dataset)
68
 
69
- print(f"✅ Loaded {len(self._leaderboard_df)} evaluation runs")
70
- return self._leaderboard_df
71
 
72
- except Exception as e:
73
- print(f" Error loading leaderboard: {e}")
74
- # Return empty DataFrame with expected columns
75
- return pd.DataFrame(columns=[
76
- 'run_id', 'model', 'agent_type', 'provider',
77
- 'success_rate', 'total_tests', 'successful_tests', 'failed_tests',
78
- 'avg_steps', 'avg_duration_ms', 'total_duration_ms',
79
- 'total_tokens', 'avg_tokens_per_test', 'total_cost_usd', 'avg_cost_per_test_usd',
80
- 'co2_emissions_g', 'gpu_utilization_avg', 'gpu_memory_max_mib',
81
- 'results_dataset', 'traces_dataset', 'metrics_dataset',
82
- 'timestamp', 'submitted_by', 'hf_job_id', 'job_type',
83
- 'dataset_used', 'smoltrace_version'
84
- ])
85
-
86
- def load_results(self, results_repo: str, force_refresh: bool = False) -> pd.DataFrame:
87
  """
88
  Load results dataset for a specific run
89
 
90
  Args:
91
- results_repo: HuggingFace dataset repo for results (e.g., 'user/agent-results-gpt4')
92
- force_refresh: Force reload from HF
93
 
94
  Returns:
95
  DataFrame with test case results
96
  """
97
- if results_repo in self._results_cache and not force_refresh:
98
- return self._results_cache[results_repo]
99
-
100
- try:
101
- print(f"📊 Loading results from {results_repo}...")
102
-
103
- dataset = load_dataset(
104
- results_repo,
105
- split='train',
106
- token=self.hf_token
107
- )
108
-
109
- df = pd.DataFrame(dataset)
110
- self._results_cache[results_repo] = df
111
-
112
- print(f"✅ Loaded {len(df)} test cases")
113
- return df
114
-
115
- except Exception as e:
116
- print(f"❌ Error loading results: {e}")
117
- return pd.DataFrame(columns=[
118
- 'run_id', 'task_id', 'test_index',
119
- 'prompt', 'expected_tool', 'difficulty', 'category',
120
- 'success', 'response', 'tool_called', 'tool_correct',
121
- 'expected_keywords', 'keywords_matched',
122
- 'execution_time_ms', 'total_tokens', 'prompt_tokens', 'completion_tokens', 'cost_usd',
123
- 'trace_id', 'start_time', 'end_time', 'start_time_unix_nano', 'end_time_unix_nano',
124
- 'error', 'error_type'
125
- ])
126
-
127
- def load_traces(self, traces_repo: str, force_refresh: bool = False) -> List[Dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  """
129
  Load traces dataset for a specific run
130
 
131
  Args:
132
- traces_repo: HuggingFace dataset repo for traces
133
- force_refresh: Force reload from HF
134
-
135
- Returns:
136
- List of trace dictionaries (OpenTelemetry format)
137
- """
138
- if traces_repo in self._traces_cache and not force_refresh:
139
- return self._traces_cache[traces_repo]
140
-
141
- try:
142
- print(f"🔍 Loading traces from {traces_repo}...")
143
-
144
- dataset = load_dataset(
145
- traces_repo,
146
- split='train',
147
- token=self.hf_token
148
- )
149
-
150
- # Convert to list of dicts
151
- traces = [dict(item) for item in dataset]
152
- self._traces_cache[traces_repo] = traces
153
-
154
- print(f"✅ Loaded {len(traces)} traces")
155
- return traces
156
-
157
- except Exception as e:
158
- print(f"❌ Error loading traces: {e}")
159
- return []
160
-
161
- def load_metrics(self, metrics_repo: str, force_refresh: bool = False) -> Dict[str, Any]:
162
- """
163
- Load GPU metrics dataset for a specific run
164
-
165
- Args:
166
- metrics_repo: HuggingFace dataset repo for metrics
167
- force_refresh: Force reload from HF
168
 
169
  Returns:
170
- Metrics data (OpenTelemetry metrics format)
171
  """
172
- if metrics_repo in self._metrics_cache and not force_refresh:
173
- return self._metrics_cache[metrics_repo]
174
-
175
- try:
176
- print(f"📈 Loading metrics from {metrics_repo}...")
177
-
178
- dataset = load_dataset(
179
- metrics_repo,
180
- split='train',
181
- token=self.hf_token
182
- )
183
-
184
- # Assume metrics dataset has one row with all metrics
185
- if len(dataset) > 0:
186
- metrics = dict(dataset[0])
187
- self._metrics_cache[metrics_repo] = metrics
188
- print(f"✅ Loaded metrics data")
189
- return metrics
190
- else:
191
- print(f"⚠️ No metrics data found")
192
- return {}
193
-
194
- except Exception as e:
195
- print(f" Error loading metrics: {e}")
196
- return {}
197
-
198
- def get_run_by_id(self, run_id: str) -> Optional[Dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  """
200
- Get a specific run from the leaderboard by run_id
201
 
202
  Args:
203
- run_id: Run ID to fetch
204
 
205
  Returns:
206
- Run data as dict, or None if not found
207
  """
208
- leaderboard_df = self.load_leaderboard()
209
 
210
- run_rows = leaderboard_df[leaderboard_df['run_id'] == run_id]
 
211
 
212
- if len(run_rows) > 0:
213
- return run_rows.iloc[0].to_dict()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  else:
215
- return None
216
-
217
- def get_trace_by_id(self, traces_repo: str, trace_id: str) -> Optional[Dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  """
219
- Get a specific trace by trace_id
220
 
221
  Args:
222
- traces_repo: HuggingFace dataset repo for traces
223
- trace_id: Trace ID to fetch
224
 
225
  Returns:
226
- Trace data as dict, or None if not found
227
  """
228
- traces = self.load_traces(traces_repo)
229
 
230
  for trace in traces:
231
- if trace.get('trace_id') == trace_id or trace.get('traceId') == trace_id:
 
 
 
 
 
 
 
 
232
  return trace
233
 
234
  return None
235
 
236
- def clear_cache(self):
237
- """Clear all cached data"""
238
- self._leaderboard_df = None
239
- self._results_cache.clear()
240
- self._traces_cache.clear()
241
- self._metrics_cache.clear()
242
- print("🧹 Cache cleared")
 
 
 
243
 
244
 
245
- def create_data_loader_from_env() -> TraceMindDataLoader:
246
  """
247
- Create a data loader using environment variables
248
 
249
  Returns:
250
- TraceMindDataLoader instance
251
  """
252
- return TraceMindDataLoader(
253
- leaderboard_repo=os.getenv('LEADERBOARD_REPO'),
254
- hf_token=os.getenv('HF_TOKEN')
 
 
 
 
255
  )
 
1
  """
2
+ Data Loader for MockTraceMind
3
+ Supports loading from both JSON files and HuggingFace datasets
4
  """
5
 
6
  import os
7
+ import json
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Any, Literal
10
  import pandas as pd
11
  from datasets import load_dataset
12
+ from huggingface_hub import HfApi
13
+ import gradio as gr
14
 
 
 
15
 
16
+ DataSource = Literal["json", "huggingface", "both"]
17
 
18
+
19
+ class DataLoader:
20
+ """
21
+ Unified data loader for MockTraceMind
22
+
23
+ Supports:
24
+ - Local JSON files
25
+ - HuggingFace datasets
26
+ - Automatic fallback between sources
27
+ - Caching for performance
28
+ """
29
 
30
  def __init__(
31
  self,
32
+ data_source: DataSource = "both",
33
+ json_data_path: Optional[str] = None,
34
+ leaderboard_dataset: Optional[str] = None,
35
  hf_token: Optional[str] = None
36
  ):
37
+ self.data_source = data_source
38
+ self.json_data_path = Path(json_data_path or os.getenv("JSON_DATA_PATH", "./sample_data"))
39
+ self.leaderboard_dataset = leaderboard_dataset or os.getenv("LEADERBOARD_DATASET", "huggingface/smolagents-leaderboard")
40
+ self.hf_token = hf_token or os.getenv("HF_TOKEN")
41
 
42
+ # Cache
43
+ self._cache: Dict[str, Any] = {}
44
+ self.hf_api = HfApi(token=self.hf_token) if self.hf_token else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ def load_leaderboard(self) -> pd.DataFrame:
47
+ """
48
+ Load leaderboard dataset
49
 
50
  Returns:
51
  DataFrame with leaderboard data
52
  """
53
+ cache_key = "leaderboard"
54
+
55
+ if cache_key in self._cache:
56
+ return self._cache[cache_key]
57
+
58
+ # Try HuggingFace first
59
+ if self.data_source in ["huggingface", "both"]:
60
+ try:
61
+ df = self._load_leaderboard_from_hf()
62
+ self._cache[cache_key] = df
63
+ return df
64
+ except Exception as e:
65
+ print(f"Failed to load from HuggingFace: {e}")
66
+ if self.data_source == "huggingface":
67
+ raise
68
+
69
+ # Fallback to JSON
70
+ if self.data_source in ["json", "both"]:
71
+ try:
72
+ df = self._load_leaderboard_from_json()
73
+ self._cache[cache_key] = df
74
+ return df
75
+ except Exception as e:
76
+ print(f"Failed to load from JSON: {e}")
77
+ raise
78
+
79
+ raise ValueError("No valid data source available")
80
+
81
+ def _load_leaderboard_from_hf(self) -> pd.DataFrame:
82
+ """Load leaderboard from HuggingFace dataset"""
83
  try:
84
+ ds = load_dataset(self.leaderboard_dataset, split="train", token=self.hf_token)
85
+ df = ds.to_pandas()
86
+ print(f"[OK] Loaded leaderboard from HuggingFace: {len(df)} rows")
87
+ return df
88
+ except Exception as e:
89
+ print(f"[ERROR] Loading from HuggingFace: {e}")
90
+ raise
91
 
92
+ def _load_leaderboard_from_json(self) -> pd.DataFrame:
93
+ """Load leaderboard from local JSON file"""
94
+ json_path = self.json_data_path / "leaderboard.json"
 
 
 
95
 
96
+ if not json_path.exists():
97
+ raise FileNotFoundError(f"Leaderboard JSON not found: {json_path}")
98
 
99
+ with open(json_path, "r") as f:
100
+ data = json.load(f)
101
 
102
+ df = pd.DataFrame(data)
103
+ print(f"[OK] Loaded leaderboard from JSON: {len(df)} rows")
104
+ return df
105
+
106
+ def load_results(self, results_dataset: str) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
107
  """
108
  Load results dataset for a specific run
109
 
110
  Args:
111
+ results_dataset: Dataset reference (e.g., "user/agent-results-gpt4")
 
112
 
113
  Returns:
114
  DataFrame with test case results
115
  """
116
+ cache_key = f"results_{results_dataset}"
117
+
118
+ if cache_key in self._cache:
119
+ return self._cache[cache_key]
120
+
121
+ # Try HuggingFace first
122
+ if self.data_source in ["huggingface", "both"]:
123
+ try:
124
+ df = self._load_results_from_hf(results_dataset)
125
+ self._cache[cache_key] = df
126
+ return df
127
+ except Exception as e:
128
+ print(f"Failed to load results from HuggingFace: {e}")
129
+ if self.data_source == "huggingface":
130
+ raise
131
+
132
+ # Fallback to JSON
133
+ if self.data_source in ["json", "both"]:
134
+ try:
135
+ df = self._load_results_from_json(results_dataset)
136
+ self._cache[cache_key] = df
137
+ return df
138
+ except Exception as e:
139
+ print(f"Failed to load results from JSON: {e}")
140
+ raise
141
+
142
+ raise ValueError("No valid data source available")
143
+
144
+ def _load_results_from_hf(self, dataset_id: str) -> pd.DataFrame:
145
+ """Load results from HuggingFace dataset"""
146
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token)
147
+ df = ds.to_pandas()
148
+ print(f"[OK] Loaded results from HuggingFace: {len(df)} rows")
149
+ return df
150
+
151
+ def _load_results_from_json(self, dataset_id: str) -> pd.DataFrame:
152
+ """Load results from local JSON file"""
153
+ # Extract filename from dataset ID (e.g., "user/agent-results-gpt4" -> "results_gpt4.json")
154
+ filename = dataset_id.split("/")[-1].replace("agent-", "") + ".json"
155
+ json_path = self.json_data_path / filename
156
+
157
+ if not json_path.exists():
158
+ raise FileNotFoundError(f"Results JSON not found: {json_path}")
159
+
160
+ with open(json_path, "r") as f:
161
+ data = json.load(f)
162
+
163
+ df = pd.DataFrame(data)
164
+ print(f"[OK] Loaded results from JSON: {len(df)} rows")
165
+ return df
166
+
167
+ def load_traces(self, traces_dataset: str) -> List[Dict[str, Any]]:
168
  """
169
  Load traces dataset for a specific run
170
 
171
  Args:
172
+ traces_dataset: Dataset reference (e.g., "user/agent-traces-gpt4")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  Returns:
175
+ List of trace objects (OpenTelemetry format)
176
  """
177
+ cache_key = f"traces_{traces_dataset}"
178
+
179
+ if cache_key in self._cache:
180
+ return self._cache[cache_key]
181
+
182
+ # Try HuggingFace first
183
+ if self.data_source in ["huggingface", "both"]:
184
+ try:
185
+ traces = self._load_traces_from_hf(traces_dataset)
186
+ self._cache[cache_key] = traces
187
+ return traces
188
+ except Exception as e:
189
+ print(f"Failed to load traces from HuggingFace: {e}")
190
+ if self.data_source == "huggingface":
191
+ raise
192
+
193
+ # Fallback to JSON
194
+ if self.data_source in ["json", "both"]:
195
+ try:
196
+ traces = self._load_traces_from_json(traces_dataset)
197
+ self._cache[cache_key] = traces
198
+ return traces
199
+ except Exception as e:
200
+ print(f"Failed to load traces from JSON: {e}")
201
+ raise
202
+
203
+ raise ValueError("No valid data source available")
204
+
205
+ def _load_traces_from_hf(self, dataset_id: str) -> List[Dict[str, Any]]:
206
+ """Load traces from HuggingFace dataset"""
207
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token)
208
+ traces = ds.to_pandas().to_dict("records")
209
+ print(f"[OK] Loaded traces from HuggingFace: {len(traces)} traces")
210
+ return traces
211
+
212
+ def _load_traces_from_json(self, dataset_id: str) -> List[Dict[str, Any]]:
213
+ """Load traces from local JSON file"""
214
+ filename = dataset_id.split("/")[-1].replace("agent-", "") + ".json"
215
+ json_path = self.json_data_path / filename
216
+
217
+ if not json_path.exists():
218
+ raise FileNotFoundError(f"Traces JSON not found: {json_path}")
219
+
220
+ with open(json_path, "r") as f:
221
+ data = json.load(f)
222
+
223
+ print(f"[OK] Loaded traces from JSON: {len(data)} traces")
224
+ return data
225
+
226
+ def load_metrics(self, metrics_dataset: str) -> pd.DataFrame:
227
  """
228
+ Load metrics dataset for a specific run (GPU metrics)
229
 
230
  Args:
231
+ metrics_dataset: Dataset reference (e.g., "user/agent-metrics-gpt4")
232
 
233
  Returns:
234
+ DataFrame with GPU metrics in flat format (columns: timestamp, gpu_utilization_percent, etc.)
235
  """
236
+ cache_key = f"metrics_{metrics_dataset}"
237
 
238
+ if cache_key in self._cache:
239
+ return self._cache[cache_key]
240
 
241
+ # Try HuggingFace first
242
+ if self.data_source in ["huggingface", "both"]:
243
+ try:
244
+ metrics = self._load_metrics_from_hf(metrics_dataset)
245
+ self._cache[cache_key] = metrics
246
+ return metrics
247
+ except Exception as e:
248
+ print(f"Failed to load metrics from HuggingFace: {e}")
249
+ if self.data_source == "huggingface":
250
+ raise
251
+
252
+ # Fallback to JSON
253
+ if self.data_source in ["json", "both"]:
254
+ try:
255
+ metrics = self._load_metrics_from_json(metrics_dataset)
256
+ self._cache[cache_key] = metrics
257
+ return metrics
258
+ except Exception as e:
259
+ print(f"Failed to load metrics from JSON: {e}")
260
+ # Metrics might not exist for API models, don't raise
261
+ print("⚠️ No metrics available (expected for API models)")
262
+ return pd.DataFrame()
263
+
264
+ return pd.DataFrame()
265
+
266
+ def _load_metrics_from_hf(self, dataset_id: str) -> pd.DataFrame:
267
+ """Load metrics from HuggingFace dataset (flat format)"""
268
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token)
269
+ df = ds.to_pandas()
270
+
271
+ # Convert timestamp strings to datetime if needed
272
+ if 'timestamp' in df.columns:
273
+ df['timestamp'] = pd.to_datetime(df['timestamp'])
274
+
275
+ print(f"[OK] Loaded metrics from HuggingFace: {len(df)} rows")
276
+ print(f" Columns: {list(df.columns)}")
277
+ return df
278
+
279
+ def _load_metrics_from_json(self, dataset_id: str) -> pd.DataFrame:
280
+ """Load metrics from local JSON file"""
281
+ filename = dataset_id.split("/")[-1].replace("agent-", "") + ".json"
282
+ json_path = self.json_data_path / filename
283
+
284
+ if not json_path.exists():
285
+ # Metrics might not exist for API models
286
+ return pd.DataFrame()
287
+
288
+ with open(json_path, "r") as f:
289
+ data = json.load(f)
290
+
291
+ # Check if it's OpenTelemetry format (nested) or flat format
292
+ if isinstance(data, dict) and 'resourceMetrics' in data:
293
+ # Legacy OpenTelemetry format - convert to flat format
294
+ df = self._convert_otel_to_flat(data)
295
+ elif isinstance(data, list):
296
+ df = pd.DataFrame(data)
297
  else:
298
+ df = pd.DataFrame()
299
+
300
+ # Convert timestamp strings to datetime if needed
301
+ if 'timestamp' in df.columns and not df.empty:
302
+ df['timestamp'] = pd.to_datetime(df['timestamp'])
303
+
304
+ print(f"[OK] Loaded metrics from JSON: {len(df)} rows")
305
+ return df
306
+
307
+ def _convert_otel_to_flat(self, otel_data: Dict[str, Any]) -> pd.DataFrame:
308
+ """Convert OpenTelemetry resourceMetrics format to flat DataFrame"""
309
+ rows = []
310
+
311
+ for resource_metric in otel_data.get('resourceMetrics', []):
312
+ for scope_metric in resource_metric.get('scopeMetrics', []):
313
+ for metric in scope_metric.get('metrics', []):
314
+ metric_name = metric.get('name', '')
315
+
316
+ # Handle gauge metrics
317
+ if 'gauge' in metric:
318
+ for data_point in metric['gauge'].get('dataPoints', []):
319
+ row = self._extract_data_point(metric_name, data_point, metric.get('unit', ''))
320
+ if row:
321
+ rows.append(row)
322
+
323
+ # Handle sum metrics (like CO2)
324
+ elif 'sum' in metric:
325
+ for data_point in metric['sum'].get('dataPoints', []):
326
+ row = self._extract_data_point(metric_name, data_point, metric.get('unit', ''))
327
+ if row:
328
+ rows.append(row)
329
+
330
+ return pd.DataFrame(rows)
331
+
332
+ def _extract_data_point(self, metric_name: str, data_point: Dict, unit: str) -> Optional[Dict[str, Any]]:
333
+ """Extract a single data point from OpenTelemetry format to flat row"""
334
+ # Get GPU attributes
335
+ gpu_id = None
336
+ gpu_name = None
337
+ for attr in data_point.get('attributes', []):
338
+ if attr.get('key') == 'gpu_id':
339
+ gpu_id = attr.get('value', {}).get('stringValue', '')
340
+ elif attr.get('key') == 'gpu_name':
341
+ gpu_name = attr.get('value', {}).get('stringValue', '')
342
+
343
+ # Get value
344
+ value = None
345
+ if 'asInt' in data_point and data_point['asInt'] is not None:
346
+ value = int(data_point['asInt'])
347
+ elif 'asDouble' in data_point and data_point['asDouble'] is not None:
348
+ value = float(data_point['asDouble'])
349
+
350
+ # Get timestamp
351
+ timestamp_nano = data_point.get('timeUnixNano', '')
352
+ if timestamp_nano:
353
+ timestamp_sec = int(timestamp_nano) / 1e9
354
+ timestamp = pd.to_datetime(timestamp_sec, unit='s')
355
+ else:
356
+ timestamp = None
357
+
358
+ # Map metric names to column names
359
+ metric_col_map = {
360
+ 'gen_ai.gpu.utilization': 'gpu_utilization_percent',
361
+ 'gen_ai.gpu.memory.used': 'gpu_memory_used_mib',
362
+ 'gen_ai.gpu.temperature': 'gpu_temperature_celsius',
363
+ 'gen_ai.gpu.power': 'gpu_power_watts',
364
+ 'gen_ai.co2.emissions': 'co2_emissions_gco2e'
365
+ }
366
+
367
+ return {
368
+ 'timestamp': timestamp,
369
+ 'timestamp_unix_nano': timestamp_nano,
370
+ 'gpu_id': gpu_id,
371
+ 'gpu_name': gpu_name,
372
+ 'metric_name': metric_name,
373
+ 'value': value,
374
+ 'unit': unit
375
+ }
376
+
377
+ def get_trace_by_id(self, traces_dataset: str, trace_id: str) -> Optional[Dict[str, Any]]:
378
  """
379
+ Get a specific trace by ID
380
 
381
  Args:
382
+ traces_dataset: Dataset reference
383
+ trace_id: Trace ID to find
384
 
385
  Returns:
386
+ Trace object or None if not found
387
  """
388
+ traces = self.load_traces(traces_dataset)
389
 
390
  for trace in traces:
391
+ if trace.get("trace_id") == trace_id or trace.get("traceId") == trace_id:
392
+ # Ensure spans is a proper list (not numpy array or pandas Series)
393
+ if "spans" in trace:
394
+ spans = trace["spans"]
395
+ if hasattr(spans, 'tolist'):
396
+ trace["spans"] = spans.tolist()
397
+ elif not isinstance(spans, list):
398
+ trace["spans"] = list(spans) if spans is not None else []
399
+
400
  return trace
401
 
402
  return None
403
 
404
+ def clear_cache(self) -> None:
405
+ """Clear the internal cache"""
406
+ self._cache.clear()
407
+ print("[OK] Cache cleared")
408
+
409
+ def refresh_leaderboard(self) -> pd.DataFrame:
410
+ """Refresh leaderboard data (clear cache and reload)"""
411
+ if "leaderboard" in self._cache:
412
+ del self._cache["leaderboard"]
413
+ return self.load_leaderboard()
414
 
415
 
416
def create_data_loader_from_env() -> DataLoader:
    """
    Create a DataLoader instance configured entirely from environment variables.

    Reads DATA_SOURCE (default "both"), JSON_DATA_PATH, LEADERBOARD_DATASET,
    and HF_TOKEN.

    Returns:
        Configured DataLoader instance
    """
    return DataLoader(
        data_source=os.getenv("DATA_SOURCE", "both"),
        json_data_path=os.getenv("JSON_DATA_PATH"),
        leaderboard_dataset=os.getenv("LEADERBOARD_DATASET"),
        hf_token=os.getenv("HF_TOKEN"),
    )
sample_data/generate_sample_metrics.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate sample metrics data in OpenTelemetry resourceMetrics format.
3
+ This simulates what SMOLTRACE would produce for GPU and API evaluation runs.
4
+ """
5
+
6
+ import json
7
+ import time
8
+ from datetime import datetime, timedelta
9
+ from pathlib import Path
10
+
11
+
12
def _gpu_data_point(time_unix_nano: str, value_key: str, value, with_gpu_name: bool = True):
    """Build one OpenTelemetry dataPoint for GPU 0 (optionally tagged with the GPU name)."""
    attributes = [{"key": "gpu_id", "value": {"stringValue": "0"}}]
    if with_gpu_name:
        attributes.append({"key": "gpu_name", "value": {"stringValue": "NVIDIA H200"}})
    return {"attributes": attributes, "timeUnixNano": time_unix_nano, value_key: value}


def generate_gpu_sample_metrics(
    run_id: str = "run_002_llama31",
    duration_seconds: int = 120,
    interval_seconds: int = 10
):
    """
    Generate sample GPU metrics data for a GPU model run.

    Args:
        run_id: Run identifier
        duration_seconds: Total duration of simulated run
        interval_seconds: Interval between data points

    Returns:
        Dict in OpenTelemetry resourceMetrics format with five metrics
        (utilization, memory, temperature, power as gauges; CO2 as a
        monotonic cumulative sum), each with duration/interval data points.
    """
    start_time = datetime.now()
    num_points = duration_seconds // interval_seconds

    utilization_points = []
    memory_points = []
    temperature_points = []
    power_points = []
    co2_points = []

    cumulative_co2 = 0.0

    for i in range(num_points):
        timestamp = start_time + timedelta(seconds=i * interval_seconds)
        time_unix_nano = str(int(timestamp.timestamp() * 1e9))

        # Simulate realistic GPU metrics with some variation
        # Pattern: Higher utilization during inference, lower during idle
        utilization = 45 + (i % 5) * 10 + (i % 2) * 5  # 45-70%
        memory = 4096 + i * 100  # Gradually increasing memory usage
        temperature = 70 + (i % 6) * 2  # 70-80°C
        power = 250 + (i % 7) * 30  # 250-400W

        # Cumulative CO2 (monotonic increasing)
        # Rough estimate: power (W) * time (h) * carbon intensity (g/kWh)
        cumulative_co2 += (power / 1000.0) * (interval_seconds / 3600.0) * 400  # 400g/kWh assumed

        # Integers are encoded as strings under asInt, floats under asDouble.
        utilization_points.append(_gpu_data_point(time_unix_nano, "asInt", str(utilization)))
        memory_points.append(_gpu_data_point(time_unix_nano, "asDouble", float(memory)))
        temperature_points.append(_gpu_data_point(time_unix_nano, "asInt", str(temperature)))
        power_points.append(_gpu_data_point(time_unix_nano, "asDouble", float(power)))
        co2_points.append(_gpu_data_point(time_unix_nano, "asDouble", cumulative_co2, with_gpu_name=False))

    # Construct resourceMetrics structure (OpenTelemetry format)
    return {
        "run_id": run_id,
        "resourceMetrics": [{
            "resource": {
                "attributes": [
                    {"key": "telemetry.sdk.language", "value": {"stringValue": "python"}},
                    {"key": "telemetry.sdk.name", "value": {"stringValue": "opentelemetry"}},
                    {"key": "telemetry.sdk.version", "value": {"stringValue": "1.37.0"}},
                    {"key": "service.name", "value": {"stringValue": "smoltrace-eval"}},
                    {"key": "run.id", "value": {"stringValue": run_id}}
                ]
            },
            "scopeMetrics": [{
                "scope": {"name": "genai.gpu", "version": None},
                "metrics": [
                    {
                        "name": "gen_ai.gpu.utilization",
                        "description": "GPU utilization percentage",
                        "unit": "%",
                        "gauge": {"dataPoints": utilization_points}
                    },
                    {
                        "name": "gen_ai.gpu.memory.used",
                        "description": "GPU memory used in MiB",
                        "unit": "MiB",
                        "gauge": {"dataPoints": memory_points}
                    },
                    {
                        "name": "gen_ai.gpu.temperature",
                        "description": "GPU temperature in Celsius",
                        "unit": "Cel",
                        "gauge": {"dataPoints": temperature_points}
                    },
                    {
                        "name": "gen_ai.gpu.power",
                        "description": "GPU power consumption in Watts",
                        "unit": "W",
                        "gauge": {"dataPoints": power_points}
                    },
                    {
                        "name": "gen_ai.co2.emissions",
                        "description": "Cumulative CO2 equivalent emissions in grams",
                        "unit": "gCO2e",
                        "sum": {
                            "dataPoints": co2_points,
                            "aggregationTemporality": 2,  # CUMULATIVE
                            "isMonotonic": True
                        }
                    }
                ]
            }]
        }]
    }
157
+
158
+
159
def generate_api_sample_metrics(run_id: str = "run_001_gpt4"):
    """
    Generate minimal sample metrics for an API model run (no GPU).

    Args:
        run_id: Run identifier

    Returns:
        Dict with empty resourceMetrics (API models don't have GPU)
    """
    return {"run_id": run_id, "resourceMetrics": []}
173
+
174
+
175
+ if __name__ == "__main__":
176
+ # Create output directory
177
+ output_dir = Path(__file__).parent
178
+ output_dir.mkdir(parents=True, exist_ok=True)
179
+
180
+ print("Generating sample metrics data...")
181
+
182
+ # Generate GPU model metrics (Llama 3.1 on H200)
183
+ gpu_metrics = generate_gpu_sample_metrics(
184
+ run_id="run_002_llama31",
185
+ duration_seconds=120,
186
+ interval_seconds=10
187
+ )
188
+
189
+ output_file = output_dir / "metrics_llama31.json"
190
+ with open(output_file, "w") as f:
191
+ json.dump(gpu_metrics, f, indent=2)
192
+ print(f"[OK] Generated GPU metrics: {output_file}")
193
+ print(f" - {len(gpu_metrics['resourceMetrics'][0]['scopeMetrics'][0]['metrics'])} metric types")
194
+ print(f" - {len(gpu_metrics['resourceMetrics'][0]['scopeMetrics'][0]['metrics'][0]['gauge']['dataPoints'])} data points per metric")
195
+
196
+ # Generate API model metrics (GPT-4 - no GPU)
197
+ api_metrics = generate_api_sample_metrics(run_id="run_001_gpt4")
198
+
199
+ output_file = output_dir / "metrics_gpt4.json"
200
+ with open(output_file, "w") as f:
201
+ json.dump(api_metrics, f, indent=2)
202
+ print(f"[OK] Generated API metrics: {output_file}")
203
+ print(f" - Empty resourceMetrics (API model has no GPU)")
204
+
205
+ print("\n[SUCCESS] Sample metrics data generation complete!")
206
+ print("\nYou can now test the visualization with:")
207
+ print(" python gpu_metrics_with_time_series.py")
sample_data/leaderboard.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "run_id": "run_001_gpt4",
4
+ "model": "openai/gpt-4",
5
+ "agent_type": "both",
6
+ "provider": "litellm",
7
+ "success_rate": 95.8,
8
+ "total_tests": 100,
9
+ "successful_tests": 96,
10
+ "failed_tests": 4,
11
+ "avg_steps": 2.5,
12
+ "avg_duration_ms": 3200.0,
13
+ "total_duration_ms": 320000.0,
14
+ "total_tokens": 15000,
15
+ "avg_tokens_per_test": 150,
16
+ "total_cost_usd": 0.05,
17
+ "avg_cost_per_test_usd": 0.0005,
18
+ "co2_emissions_g": 0.22,
19
+ "gpu_utilization_avg": null,
20
+ "gpu_memory_max_mib": null,
21
+ "results_dataset": "test/results_gpt4",
22
+ "traces_dataset": "test/traces_gpt4",
23
+ "metrics_dataset": "test/metrics_gpt4",
24
+ "timestamp": "2025-01-16T14:23:00Z",
25
+ "submitted_by": "test_user",
26
+ "hf_job_id": "job_12345",
27
+ "job_type": "cpu",
28
+ "dataset_used": "huggingface/smolagents/tasks",
29
+ "smoltrace_version": "0.1.0"
30
+ },
31
+ {
32
+ "run_id": "run_002_llama31",
33
+ "model": "meta-llama/Llama-3.1-8B",
34
+ "agent_type": "both",
35
+ "provider": "transformers",
36
+ "success_rate": 93.4,
37
+ "total_tests": 100,
38
+ "successful_tests": 93,
39
+ "failed_tests": 7,
40
+ "avg_steps": 2.8,
41
+ "avg_duration_ms": 2100.0,
42
+ "total_duration_ms": 210000.0,
43
+ "total_tokens": 12500,
44
+ "avg_tokens_per_test": 125,
45
+ "total_cost_usd": 0.002,
46
+ "avg_cost_per_test_usd": 0.00002,
47
+ "co2_emissions_g": 1.45,
48
+ "gpu_utilization_avg": 67.5,
49
+ "gpu_memory_max_mib": 512.34,
50
+ "results_dataset": "test/results_llama31",
51
+ "traces_dataset": "test/traces_llama31",
52
+ "metrics_dataset": "test/metrics_llama31",
53
+ "timestamp": "2025-01-16T15:10:00Z",
54
+ "submitted_by": "test_user",
55
+ "hf_job_id": "job_12346",
56
+ "job_type": "gpu_h200",
57
+ "dataset_used": "huggingface/smolagents/tasks",
58
+ "smoltrace_version": "0.1.0"
59
+ },
60
+ {
61
+ "run_id": "run_003_claude",
62
+ "model": "anthropic/claude-3-haiku",
63
+ "agent_type": "tool",
64
+ "provider": "litellm",
65
+ "success_rate": 92.1,
66
+ "total_tests": 100,
67
+ "successful_tests": 92,
68
+ "failed_tests": 8,
69
+ "avg_steps": 2.2,
70
+ "avg_duration_ms": 2800.0,
71
+ "total_duration_ms": 280000.0,
72
+ "total_tokens": 11200,
73
+ "avg_tokens_per_test": 112,
74
+ "total_cost_usd": 0.012,
75
+ "avg_cost_per_test_usd": 0.00012,
76
+ "co2_emissions_g": 0.15,
77
+ "gpu_utilization_avg": null,
78
+ "gpu_memory_max_mib": null,
79
+ "results_dataset": "test/results_claude",
80
+ "traces_dataset": "test/traces_claude",
81
+ "metrics_dataset": "test/metrics_claude",
82
+ "timestamp": "2025-01-16T16:45:00Z",
83
+ "submitted_by": "test_user",
84
+ "hf_job_id": "job_12347",
85
+ "job_type": "cpu",
86
+ "dataset_used": "huggingface/smolagents/tasks",
87
+ "smoltrace_version": "0.1.0"
88
+ }
89
+ ]
sample_data/metrics_gpt4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ []
sample_data/metrics_llama31.json ADDED
@@ -0,0 +1,1106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_id": "run_002_llama31",
3
+ "resourceMetrics": [
4
+ {
5
+ "resource": {
6
+ "attributes": [
7
+ {
8
+ "key": "telemetry.sdk.language",
9
+ "value": {
10
+ "stringValue": "python"
11
+ }
12
+ },
13
+ {
14
+ "key": "telemetry.sdk.name",
15
+ "value": {
16
+ "stringValue": "opentelemetry"
17
+ }
18
+ },
19
+ {
20
+ "key": "telemetry.sdk.version",
21
+ "value": {
22
+ "stringValue": "1.37.0"
23
+ }
24
+ },
25
+ {
26
+ "key": "service.name",
27
+ "value": {
28
+ "stringValue": "smoltrace-eval"
29
+ }
30
+ },
31
+ {
32
+ "key": "run.id",
33
+ "value": {
34
+ "stringValue": "run_002_llama31"
35
+ }
36
+ }
37
+ ]
38
+ },
39
+ "scopeMetrics": [
40
+ {
41
+ "scope": {
42
+ "name": "genai.gpu",
43
+ "version": null
44
+ },
45
+ "metrics": [
46
+ {
47
+ "name": "gen_ai.gpu.utilization",
48
+ "description": "GPU utilization percentage",
49
+ "unit": "%",
50
+ "gauge": {
51
+ "dataPoints": [
52
+ {
53
+ "attributes": [
54
+ {
55
+ "key": "gpu_id",
56
+ "value": {
57
+ "stringValue": "0"
58
+ }
59
+ },
60
+ {
61
+ "key": "gpu_name",
62
+ "value": {
63
+ "stringValue": "NVIDIA H200"
64
+ }
65
+ }
66
+ ],
67
+ "timeUnixNano": "1761242554199441920",
68
+ "asInt": "45"
69
+ },
70
+ {
71
+ "attributes": [
72
+ {
73
+ "key": "gpu_id",
74
+ "value": {
75
+ "stringValue": "0"
76
+ }
77
+ },
78
+ {
79
+ "key": "gpu_name",
80
+ "value": {
81
+ "stringValue": "NVIDIA H200"
82
+ }
83
+ }
84
+ ],
85
+ "timeUnixNano": "1761242564199441920",
86
+ "asInt": "60"
87
+ },
88
+ {
89
+ "attributes": [
90
+ {
91
+ "key": "gpu_id",
92
+ "value": {
93
+ "stringValue": "0"
94
+ }
95
+ },
96
+ {
97
+ "key": "gpu_name",
98
+ "value": {
99
+ "stringValue": "NVIDIA H200"
100
+ }
101
+ }
102
+ ],
103
+ "timeUnixNano": "1761242574199441920",
104
+ "asInt": "65"
105
+ },
106
+ {
107
+ "attributes": [
108
+ {
109
+ "key": "gpu_id",
110
+ "value": {
111
+ "stringValue": "0"
112
+ }
113
+ },
114
+ {
115
+ "key": "gpu_name",
116
+ "value": {
117
+ "stringValue": "NVIDIA H200"
118
+ }
119
+ }
120
+ ],
121
+ "timeUnixNano": "1761242584199441920",
122
+ "asInt": "80"
123
+ },
124
+ {
125
+ "attributes": [
126
+ {
127
+ "key": "gpu_id",
128
+ "value": {
129
+ "stringValue": "0"
130
+ }
131
+ },
132
+ {
133
+ "key": "gpu_name",
134
+ "value": {
135
+ "stringValue": "NVIDIA H200"
136
+ }
137
+ }
138
+ ],
139
+ "timeUnixNano": "1761242594199441920",
140
+ "asInt": "85"
141
+ },
142
+ {
143
+ "attributes": [
144
+ {
145
+ "key": "gpu_id",
146
+ "value": {
147
+ "stringValue": "0"
148
+ }
149
+ },
150
+ {
151
+ "key": "gpu_name",
152
+ "value": {
153
+ "stringValue": "NVIDIA H200"
154
+ }
155
+ }
156
+ ],
157
+ "timeUnixNano": "1761242604199441920",
158
+ "asInt": "50"
159
+ },
160
+ {
161
+ "attributes": [
162
+ {
163
+ "key": "gpu_id",
164
+ "value": {
165
+ "stringValue": "0"
166
+ }
167
+ },
168
+ {
169
+ "key": "gpu_name",
170
+ "value": {
171
+ "stringValue": "NVIDIA H200"
172
+ }
173
+ }
174
+ ],
175
+ "timeUnixNano": "1761242614199441920",
176
+ "asInt": "55"
177
+ },
178
+ {
179
+ "attributes": [
180
+ {
181
+ "key": "gpu_id",
182
+ "value": {
183
+ "stringValue": "0"
184
+ }
185
+ },
186
+ {
187
+ "key": "gpu_name",
188
+ "value": {
189
+ "stringValue": "NVIDIA H200"
190
+ }
191
+ }
192
+ ],
193
+ "timeUnixNano": "1761242624199441920",
194
+ "asInt": "70"
195
+ },
196
+ {
197
+ "attributes": [
198
+ {
199
+ "key": "gpu_id",
200
+ "value": {
201
+ "stringValue": "0"
202
+ }
203
+ },
204
+ {
205
+ "key": "gpu_name",
206
+ "value": {
207
+ "stringValue": "NVIDIA H200"
208
+ }
209
+ }
210
+ ],
211
+ "timeUnixNano": "1761242634199441920",
212
+ "asInt": "75"
213
+ },
214
+ {
215
+ "attributes": [
216
+ {
217
+ "key": "gpu_id",
218
+ "value": {
219
+ "stringValue": "0"
220
+ }
221
+ },
222
+ {
223
+ "key": "gpu_name",
224
+ "value": {
225
+ "stringValue": "NVIDIA H200"
226
+ }
227
+ }
228
+ ],
229
+ "timeUnixNano": "1761242644199441920",
230
+ "asInt": "90"
231
+ },
232
+ {
233
+ "attributes": [
234
+ {
235
+ "key": "gpu_id",
236
+ "value": {
237
+ "stringValue": "0"
238
+ }
239
+ },
240
+ {
241
+ "key": "gpu_name",
242
+ "value": {
243
+ "stringValue": "NVIDIA H200"
244
+ }
245
+ }
246
+ ],
247
+ "timeUnixNano": "1761242654199441920",
248
+ "asInt": "45"
249
+ },
250
+ {
251
+ "attributes": [
252
+ {
253
+ "key": "gpu_id",
254
+ "value": {
255
+ "stringValue": "0"
256
+ }
257
+ },
258
+ {
259
+ "key": "gpu_name",
260
+ "value": {
261
+ "stringValue": "NVIDIA H200"
262
+ }
263
+ }
264
+ ],
265
+ "timeUnixNano": "1761242664199441920",
266
+ "asInt": "60"
267
+ }
268
+ ]
269
+ }
270
+ },
271
+ {
272
+ "name": "gen_ai.gpu.memory.used",
273
+ "description": "GPU memory used in MiB",
274
+ "unit": "MiB",
275
+ "gauge": {
276
+ "dataPoints": [
277
+ {
278
+ "attributes": [
279
+ {
280
+ "key": "gpu_id",
281
+ "value": {
282
+ "stringValue": "0"
283
+ }
284
+ },
285
+ {
286
+ "key": "gpu_name",
287
+ "value": {
288
+ "stringValue": "NVIDIA H200"
289
+ }
290
+ }
291
+ ],
292
+ "timeUnixNano": "1761242554199441920",
293
+ "asDouble": 4096.0
294
+ },
295
+ {
296
+ "attributes": [
297
+ {
298
+ "key": "gpu_id",
299
+ "value": {
300
+ "stringValue": "0"
301
+ }
302
+ },
303
+ {
304
+ "key": "gpu_name",
305
+ "value": {
306
+ "stringValue": "NVIDIA H200"
307
+ }
308
+ }
309
+ ],
310
+ "timeUnixNano": "1761242564199441920",
311
+ "asDouble": 4196.0
312
+ },
313
+ {
314
+ "attributes": [
315
+ {
316
+ "key": "gpu_id",
317
+ "value": {
318
+ "stringValue": "0"
319
+ }
320
+ },
321
+ {
322
+ "key": "gpu_name",
323
+ "value": {
324
+ "stringValue": "NVIDIA H200"
325
+ }
326
+ }
327
+ ],
328
+ "timeUnixNano": "1761242574199441920",
329
+ "asDouble": 4296.0
330
+ },
331
+ {
332
+ "attributes": [
333
+ {
334
+ "key": "gpu_id",
335
+ "value": {
336
+ "stringValue": "0"
337
+ }
338
+ },
339
+ {
340
+ "key": "gpu_name",
341
+ "value": {
342
+ "stringValue": "NVIDIA H200"
343
+ }
344
+ }
345
+ ],
346
+ "timeUnixNano": "1761242584199441920",
347
+ "asDouble": 4396.0
348
+ },
349
+ {
350
+ "attributes": [
351
+ {
352
+ "key": "gpu_id",
353
+ "value": {
354
+ "stringValue": "0"
355
+ }
356
+ },
357
+ {
358
+ "key": "gpu_name",
359
+ "value": {
360
+ "stringValue": "NVIDIA H200"
361
+ }
362
+ }
363
+ ],
364
+ "timeUnixNano": "1761242594199441920",
365
+ "asDouble": 4496.0
366
+ },
367
+ {
368
+ "attributes": [
369
+ {
370
+ "key": "gpu_id",
371
+ "value": {
372
+ "stringValue": "0"
373
+ }
374
+ },
375
+ {
376
+ "key": "gpu_name",
377
+ "value": {
378
+ "stringValue": "NVIDIA H200"
379
+ }
380
+ }
381
+ ],
382
+ "timeUnixNano": "1761242604199441920",
383
+ "asDouble": 4596.0
384
+ },
385
+ {
386
+ "attributes": [
387
+ {
388
+ "key": "gpu_id",
389
+ "value": {
390
+ "stringValue": "0"
391
+ }
392
+ },
393
+ {
394
+ "key": "gpu_name",
395
+ "value": {
396
+ "stringValue": "NVIDIA H200"
397
+ }
398
+ }
399
+ ],
400
+ "timeUnixNano": "1761242614199441920",
401
+ "asDouble": 4696.0
402
+ },
403
+ {
404
+ "attributes": [
405
+ {
406
+ "key": "gpu_id",
407
+ "value": {
408
+ "stringValue": "0"
409
+ }
410
+ },
411
+ {
412
+ "key": "gpu_name",
413
+ "value": {
414
+ "stringValue": "NVIDIA H200"
415
+ }
416
+ }
417
+ ],
418
+ "timeUnixNano": "1761242624199441920",
419
+ "asDouble": 4796.0
420
+ },
421
+ {
422
+ "attributes": [
423
+ {
424
+ "key": "gpu_id",
425
+ "value": {
426
+ "stringValue": "0"
427
+ }
428
+ },
429
+ {
430
+ "key": "gpu_name",
431
+ "value": {
432
+ "stringValue": "NVIDIA H200"
433
+ }
434
+ }
435
+ ],
436
+ "timeUnixNano": "1761242634199441920",
437
+ "asDouble": 4896.0
438
+ },
439
+ {
440
+ "attributes": [
441
+ {
442
+ "key": "gpu_id",
443
+ "value": {
444
+ "stringValue": "0"
445
+ }
446
+ },
447
+ {
448
+ "key": "gpu_name",
449
+ "value": {
450
+ "stringValue": "NVIDIA H200"
451
+ }
452
+ }
453
+ ],
454
+ "timeUnixNano": "1761242644199441920",
455
+ "asDouble": 4996.0
456
+ },
457
+ {
458
+ "attributes": [
459
+ {
460
+ "key": "gpu_id",
461
+ "value": {
462
+ "stringValue": "0"
463
+ }
464
+ },
465
+ {
466
+ "key": "gpu_name",
467
+ "value": {
468
+ "stringValue": "NVIDIA H200"
469
+ }
470
+ }
471
+ ],
472
+ "timeUnixNano": "1761242654199441920",
473
+ "asDouble": 5096.0
474
+ },
475
+ {
476
+ "attributes": [
477
+ {
478
+ "key": "gpu_id",
479
+ "value": {
480
+ "stringValue": "0"
481
+ }
482
+ },
483
+ {
484
+ "key": "gpu_name",
485
+ "value": {
486
+ "stringValue": "NVIDIA H200"
487
+ }
488
+ }
489
+ ],
490
+ "timeUnixNano": "1761242664199441920",
491
+ "asDouble": 5196.0
492
+ }
493
+ ]
494
+ }
495
+ },
496
+ {
497
+ "name": "gen_ai.gpu.temperature",
498
+ "description": "GPU temperature in Celsius",
499
+ "unit": "Cel",
500
+ "gauge": {
501
+ "dataPoints": [
502
+ {
503
+ "attributes": [
504
+ {
505
+ "key": "gpu_id",
506
+ "value": {
507
+ "stringValue": "0"
508
+ }
509
+ },
510
+ {
511
+ "key": "gpu_name",
512
+ "value": {
513
+ "stringValue": "NVIDIA H200"
514
+ }
515
+ }
516
+ ],
517
+ "timeUnixNano": "1761242554199441920",
518
+ "asInt": "70"
519
+ },
520
+ {
521
+ "attributes": [
522
+ {
523
+ "key": "gpu_id",
524
+ "value": {
525
+ "stringValue": "0"
526
+ }
527
+ },
528
+ {
529
+ "key": "gpu_name",
530
+ "value": {
531
+ "stringValue": "NVIDIA H200"
532
+ }
533
+ }
534
+ ],
535
+ "timeUnixNano": "1761242564199441920",
536
+ "asInt": "72"
537
+ },
538
+ {
539
+ "attributes": [
540
+ {
541
+ "key": "gpu_id",
542
+ "value": {
543
+ "stringValue": "0"
544
+ }
545
+ },
546
+ {
547
+ "key": "gpu_name",
548
+ "value": {
549
+ "stringValue": "NVIDIA H200"
550
+ }
551
+ }
552
+ ],
553
+ "timeUnixNano": "1761242574199441920",
554
+ "asInt": "74"
555
+ },
556
+ {
557
+ "attributes": [
558
+ {
559
+ "key": "gpu_id",
560
+ "value": {
561
+ "stringValue": "0"
562
+ }
563
+ },
564
+ {
565
+ "key": "gpu_name",
566
+ "value": {
567
+ "stringValue": "NVIDIA H200"
568
+ }
569
+ }
570
+ ],
571
+ "timeUnixNano": "1761242584199441920",
572
+ "asInt": "76"
573
+ },
574
+ {
575
+ "attributes": [
576
+ {
577
+ "key": "gpu_id",
578
+ "value": {
579
+ "stringValue": "0"
580
+ }
581
+ },
582
+ {
583
+ "key": "gpu_name",
584
+ "value": {
585
+ "stringValue": "NVIDIA H200"
586
+ }
587
+ }
588
+ ],
589
+ "timeUnixNano": "1761242594199441920",
590
+ "asInt": "78"
591
+ },
592
+ {
593
+ "attributes": [
594
+ {
595
+ "key": "gpu_id",
596
+ "value": {
597
+ "stringValue": "0"
598
+ }
599
+ },
600
+ {
601
+ "key": "gpu_name",
602
+ "value": {
603
+ "stringValue": "NVIDIA H200"
604
+ }
605
+ }
606
+ ],
607
+ "timeUnixNano": "1761242604199441920",
608
+ "asInt": "80"
609
+ },
610
+ {
611
+ "attributes": [
612
+ {
613
+ "key": "gpu_id",
614
+ "value": {
615
+ "stringValue": "0"
616
+ }
617
+ },
618
+ {
619
+ "key": "gpu_name",
620
+ "value": {
621
+ "stringValue": "NVIDIA H200"
622
+ }
623
+ }
624
+ ],
625
+ "timeUnixNano": "1761242614199441920",
626
+ "asInt": "70"
627
+ },
628
+ {
629
+ "attributes": [
630
+ {
631
+ "key": "gpu_id",
632
+ "value": {
633
+ "stringValue": "0"
634
+ }
635
+ },
636
+ {
637
+ "key": "gpu_name",
638
+ "value": {
639
+ "stringValue": "NVIDIA H200"
640
+ }
641
+ }
642
+ ],
643
+ "timeUnixNano": "1761242624199441920",
644
+ "asInt": "72"
645
+ },
646
+ {
647
+ "attributes": [
648
+ {
649
+ "key": "gpu_id",
650
+ "value": {
651
+ "stringValue": "0"
652
+ }
653
+ },
654
+ {
655
+ "key": "gpu_name",
656
+ "value": {
657
+ "stringValue": "NVIDIA H200"
658
+ }
659
+ }
660
+ ],
661
+ "timeUnixNano": "1761242634199441920",
662
+ "asInt": "74"
663
+ },
664
+ {
665
+ "attributes": [
666
+ {
667
+ "key": "gpu_id",
668
+ "value": {
669
+ "stringValue": "0"
670
+ }
671
+ },
672
+ {
673
+ "key": "gpu_name",
674
+ "value": {
675
+ "stringValue": "NVIDIA H200"
676
+ }
677
+ }
678
+ ],
679
+ "timeUnixNano": "1761242644199441920",
680
+ "asInt": "76"
681
+ },
682
+ {
683
+ "attributes": [
684
+ {
685
+ "key": "gpu_id",
686
+ "value": {
687
+ "stringValue": "0"
688
+ }
689
+ },
690
+ {
691
+ "key": "gpu_name",
692
+ "value": {
693
+ "stringValue": "NVIDIA H200"
694
+ }
695
+ }
696
+ ],
697
+ "timeUnixNano": "1761242654199441920",
698
+ "asInt": "78"
699
+ },
700
+ {
701
+ "attributes": [
702
+ {
703
+ "key": "gpu_id",
704
+ "value": {
705
+ "stringValue": "0"
706
+ }
707
+ },
708
+ {
709
+ "key": "gpu_name",
710
+ "value": {
711
+ "stringValue": "NVIDIA H200"
712
+ }
713
+ }
714
+ ],
715
+ "timeUnixNano": "1761242664199441920",
716
+ "asInt": "80"
717
+ }
718
+ ]
719
+ }
720
+ },
721
+ {
722
+ "name": "gen_ai.gpu.power",
723
+ "description": "GPU power consumption in Watts",
724
+ "unit": "W",
725
+ "gauge": {
726
+ "dataPoints": [
727
+ {
728
+ "attributes": [
729
+ {
730
+ "key": "gpu_id",
731
+ "value": {
732
+ "stringValue": "0"
733
+ }
734
+ },
735
+ {
736
+ "key": "gpu_name",
737
+ "value": {
738
+ "stringValue": "NVIDIA H200"
739
+ }
740
+ }
741
+ ],
742
+ "timeUnixNano": "1761242554199441920",
743
+ "asDouble": 250.0
744
+ },
745
+ {
746
+ "attributes": [
747
+ {
748
+ "key": "gpu_id",
749
+ "value": {
750
+ "stringValue": "0"
751
+ }
752
+ },
753
+ {
754
+ "key": "gpu_name",
755
+ "value": {
756
+ "stringValue": "NVIDIA H200"
757
+ }
758
+ }
759
+ ],
760
+ "timeUnixNano": "1761242564199441920",
761
+ "asDouble": 280.0
762
+ },
763
+ {
764
+ "attributes": [
765
+ {
766
+ "key": "gpu_id",
767
+ "value": {
768
+ "stringValue": "0"
769
+ }
770
+ },
771
+ {
772
+ "key": "gpu_name",
773
+ "value": {
774
+ "stringValue": "NVIDIA H200"
775
+ }
776
+ }
777
+ ],
778
+ "timeUnixNano": "1761242574199441920",
779
+ "asDouble": 310.0
780
+ },
781
+ {
782
+ "attributes": [
783
+ {
784
+ "key": "gpu_id",
785
+ "value": {
786
+ "stringValue": "0"
787
+ }
788
+ },
789
+ {
790
+ "key": "gpu_name",
791
+ "value": {
792
+ "stringValue": "NVIDIA H200"
793
+ }
794
+ }
795
+ ],
796
+ "timeUnixNano": "1761242584199441920",
797
+ "asDouble": 340.0
798
+ },
799
+ {
800
+ "attributes": [
801
+ {
802
+ "key": "gpu_id",
803
+ "value": {
804
+ "stringValue": "0"
805
+ }
806
+ },
807
+ {
808
+ "key": "gpu_name",
809
+ "value": {
810
+ "stringValue": "NVIDIA H200"
811
+ }
812
+ }
813
+ ],
814
+ "timeUnixNano": "1761242594199441920",
815
+ "asDouble": 370.0
816
+ },
817
+ {
818
+ "attributes": [
819
+ {
820
+ "key": "gpu_id",
821
+ "value": {
822
+ "stringValue": "0"
823
+ }
824
+ },
825
+ {
826
+ "key": "gpu_name",
827
+ "value": {
828
+ "stringValue": "NVIDIA H200"
829
+ }
830
+ }
831
+ ],
832
+ "timeUnixNano": "1761242604199441920",
833
+ "asDouble": 400.0
834
+ },
835
+ {
836
+ "attributes": [
837
+ {
838
+ "key": "gpu_id",
839
+ "value": {
840
+ "stringValue": "0"
841
+ }
842
+ },
843
+ {
844
+ "key": "gpu_name",
845
+ "value": {
846
+ "stringValue": "NVIDIA H200"
847
+ }
848
+ }
849
+ ],
850
+ "timeUnixNano": "1761242614199441920",
851
+ "asDouble": 430.0
852
+ },
853
+ {
854
+ "attributes": [
855
+ {
856
+ "key": "gpu_id",
857
+ "value": {
858
+ "stringValue": "0"
859
+ }
860
+ },
861
+ {
862
+ "key": "gpu_name",
863
+ "value": {
864
+ "stringValue": "NVIDIA H200"
865
+ }
866
+ }
867
+ ],
868
+ "timeUnixNano": "1761242624199441920",
869
+ "asDouble": 250.0
870
+ },
871
+ {
872
+ "attributes": [
873
+ {
874
+ "key": "gpu_id",
875
+ "value": {
876
+ "stringValue": "0"
877
+ }
878
+ },
879
+ {
880
+ "key": "gpu_name",
881
+ "value": {
882
+ "stringValue": "NVIDIA H200"
883
+ }
884
+ }
885
+ ],
886
+ "timeUnixNano": "1761242634199441920",
887
+ "asDouble": 280.0
888
+ },
889
+ {
890
+ "attributes": [
891
+ {
892
+ "key": "gpu_id",
893
+ "value": {
894
+ "stringValue": "0"
895
+ }
896
+ },
897
+ {
898
+ "key": "gpu_name",
899
+ "value": {
900
+ "stringValue": "NVIDIA H200"
901
+ }
902
+ }
903
+ ],
904
+ "timeUnixNano": "1761242644199441920",
905
+ "asDouble": 310.0
906
+ },
907
+ {
908
+ "attributes": [
909
+ {
910
+ "key": "gpu_id",
911
+ "value": {
912
+ "stringValue": "0"
913
+ }
914
+ },
915
+ {
916
+ "key": "gpu_name",
917
+ "value": {
918
+ "stringValue": "NVIDIA H200"
919
+ }
920
+ }
921
+ ],
922
+ "timeUnixNano": "1761242654199441920",
923
+ "asDouble": 340.0
924
+ },
925
+ {
926
+ "attributes": [
927
+ {
928
+ "key": "gpu_id",
929
+ "value": {
930
+ "stringValue": "0"
931
+ }
932
+ },
933
+ {
934
+ "key": "gpu_name",
935
+ "value": {
936
+ "stringValue": "NVIDIA H200"
937
+ }
938
+ }
939
+ ],
940
+ "timeUnixNano": "1761242664199441920",
941
+ "asDouble": 370.0
942
+ }
943
+ ]
944
+ }
945
+ },
946
+ {
947
+ "name": "gen_ai.co2.emissions",
948
+ "description": "Cumulative CO2 equivalent emissions in grams",
949
+ "unit": "gCO2e",
950
+ "sum": {
951
+ "dataPoints": [
952
+ {
953
+ "attributes": [
954
+ {
955
+ "key": "gpu_id",
956
+ "value": {
957
+ "stringValue": "0"
958
+ }
959
+ }
960
+ ],
961
+ "timeUnixNano": "1761242554199441920",
962
+ "asDouble": 0.2777777777777778
963
+ },
964
+ {
965
+ "attributes": [
966
+ {
967
+ "key": "gpu_id",
968
+ "value": {
969
+ "stringValue": "0"
970
+ }
971
+ }
972
+ ],
973
+ "timeUnixNano": "1761242564199441920",
974
+ "asDouble": 0.5888888888888889
975
+ },
976
+ {
977
+ "attributes": [
978
+ {
979
+ "key": "gpu_id",
980
+ "value": {
981
+ "stringValue": "0"
982
+ }
983
+ }
984
+ ],
985
+ "timeUnixNano": "1761242574199441920",
986
+ "asDouble": 0.9333333333333333
987
+ },
988
+ {
989
+ "attributes": [
990
+ {
991
+ "key": "gpu_id",
992
+ "value": {
993
+ "stringValue": "0"
994
+ }
995
+ }
996
+ ],
997
+ "timeUnixNano": "1761242584199441920",
998
+ "asDouble": 1.3111111111111111
999
+ },
1000
+ {
1001
+ "attributes": [
1002
+ {
1003
+ "key": "gpu_id",
1004
+ "value": {
1005
+ "stringValue": "0"
1006
+ }
1007
+ }
1008
+ ],
1009
+ "timeUnixNano": "1761242594199441920",
1010
+ "asDouble": 1.7222222222222223
1011
+ },
1012
+ {
1013
+ "attributes": [
1014
+ {
1015
+ "key": "gpu_id",
1016
+ "value": {
1017
+ "stringValue": "0"
1018
+ }
1019
+ }
1020
+ ],
1021
+ "timeUnixNano": "1761242604199441920",
1022
+ "asDouble": 2.166666666666667
1023
+ },
1024
+ {
1025
+ "attributes": [
1026
+ {
1027
+ "key": "gpu_id",
1028
+ "value": {
1029
+ "stringValue": "0"
1030
+ }
1031
+ }
1032
+ ],
1033
+ "timeUnixNano": "1761242614199441920",
1034
+ "asDouble": 2.644444444444445
1035
+ },
1036
+ {
1037
+ "attributes": [
1038
+ {
1039
+ "key": "gpu_id",
1040
+ "value": {
1041
+ "stringValue": "0"
1042
+ }
1043
+ }
1044
+ ],
1045
+ "timeUnixNano": "1761242624199441920",
1046
+ "asDouble": 2.9222222222222225
1047
+ },
1048
+ {
1049
+ "attributes": [
1050
+ {
1051
+ "key": "gpu_id",
1052
+ "value": {
1053
+ "stringValue": "0"
1054
+ }
1055
+ }
1056
+ ],
1057
+ "timeUnixNano": "1761242634199441920",
1058
+ "asDouble": 3.2333333333333334
1059
+ },
1060
+ {
1061
+ "attributes": [
1062
+ {
1063
+ "key": "gpu_id",
1064
+ "value": {
1065
+ "stringValue": "0"
1066
+ }
1067
+ }
1068
+ ],
1069
+ "timeUnixNano": "1761242644199441920",
1070
+ "asDouble": 3.577777777777778
1071
+ },
1072
+ {
1073
+ "attributes": [
1074
+ {
1075
+ "key": "gpu_id",
1076
+ "value": {
1077
+ "stringValue": "0"
1078
+ }
1079
+ }
1080
+ ],
1081
+ "timeUnixNano": "1761242654199441920",
1082
+ "asDouble": 3.9555555555555557
1083
+ },
1084
+ {
1085
+ "attributes": [
1086
+ {
1087
+ "key": "gpu_id",
1088
+ "value": {
1089
+ "stringValue": "0"
1090
+ }
1091
+ }
1092
+ ],
1093
+ "timeUnixNano": "1761242664199441920",
1094
+ "asDouble": 4.366666666666667
1095
+ }
1096
+ ],
1097
+ "aggregationTemporality": 2,
1098
+ "isMonotonic": true
1099
+ }
1100
+ }
1101
+ ]
1102
+ }
1103
+ ]
1104
+ }
1105
+ ]
1106
+ }
sample_data/results_gpt4.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "run_id": "run_001_gpt4",
4
+ "task_id": "task_001",
5
+ "test_index": 0,
6
+ "prompt": "What's the weather in Tokyo?",
7
+ "expected_tool": "get_weather",
8
+ "difficulty": "easy",
9
+ "category": "tool_usage",
10
+ "success": true,
11
+ "response": "The weather in Tokyo is 18°C and clear.",
12
+ "tool_called": "get_weather",
13
+ "tool_correct": true,
14
+ "expected_keywords": ["18°C", "clear"],
15
+ "keywords_matched": ["18°C", "clear"],
16
+ "execution_time_ms": 2450.0,
17
+ "total_tokens": 234,
18
+ "prompt_tokens": 78,
19
+ "completion_tokens": 156,
20
+ "cost_usd": 0.0012,
21
+ "trace_id": "trace_abc123",
22
+ "start_time": "2025-01-16T14:23:01Z",
23
+ "end_time": "2025-01-16T14:23:03.450Z",
24
+ "start_time_unix_nano": "1760947217774556600",
25
+ "end_time_unix_nano": "1760947220224556600",
26
+ "error": null,
27
+ "error_type": null
28
+ },
29
+ {
30
+ "run_id": "run_001_gpt4",
31
+ "task_id": "task_002",
32
+ "test_index": 1,
33
+ "prompt": "Search for recent news about AI",
34
+ "expected_tool": "web_search",
35
+ "difficulty": "medium",
36
+ "category": "information_retrieval",
37
+ "success": true,
38
+ "response": "Here are the latest AI news headlines: 1) New breakthrough in LLMs...",
39
+ "tool_called": "web_search",
40
+ "tool_correct": true,
41
+ "expected_keywords": ["AI", "news"],
42
+ "keywords_matched": ["AI"],
43
+ "execution_time_ms": 3800.0,
44
+ "total_tokens": 456,
45
+ "prompt_tokens": 120,
46
+ "completion_tokens": 336,
47
+ "cost_usd": 0.0018,
48
+ "trace_id": "trace_def456",
49
+ "start_time": "2025-01-16T14:23:05Z",
50
+ "end_time": "2025-01-16T14:23:08.800Z",
51
+ "start_time_unix_nano": "1760947221000000000",
52
+ "end_time_unix_nano": "1760947224800000000",
53
+ "error": null,
54
+ "error_type": null
55
+ },
56
+ {
57
+ "run_id": "run_001_gpt4",
58
+ "task_id": "task_003",
59
+ "test_index": 2,
60
+ "prompt": "Calculate 234 * 567",
61
+ "expected_tool": "calculator",
62
+ "difficulty": "easy",
63
+ "category": "tool_usage",
64
+ "success": true,
65
+ "response": "The result of 234 * 567 is 132678",
66
+ "tool_called": "calculator",
67
+ "tool_correct": true,
68
+ "expected_keywords": ["132678"],
69
+ "keywords_matched": ["132678"],
70
+ "execution_time_ms": 1200.0,
71
+ "total_tokens": 89,
72
+ "prompt_tokens": 45,
73
+ "completion_tokens": 44,
74
+ "cost_usd": 0.0004,
75
+ "trace_id": "trace_ghi789",
76
+ "start_time": "2025-01-16T14:23:10Z",
77
+ "end_time": "2025-01-16T14:23:11.200Z",
78
+ "start_time_unix_nano": "1760947226000000000",
79
+ "end_time_unix_nano": "1760947227200000000",
80
+ "error": null,
81
+ "error_type": null
82
+ },
83
+ {
84
+ "run_id": "run_001_gpt4",
85
+ "task_id": "task_004",
86
+ "test_index": 3,
87
+ "prompt": "Send an email to john@example.com with subject 'Meeting' and body 'Let's meet tomorrow'",
88
+ "expected_tool": "send_email",
89
+ "difficulty": "hard",
90
+ "category": "multi_step",
91
+ "success": false,
92
+ "response": "I apologize, I don't have access to an email sending function.",
93
+ "tool_called": null,
94
+ "tool_correct": false,
95
+ "expected_keywords": ["email", "sent"],
96
+ "keywords_matched": [],
97
+ "execution_time_ms": 1800.0,
98
+ "total_tokens": 123,
99
+ "prompt_tokens": 67,
100
+ "completion_tokens": 56,
101
+ "cost_usd": 0.0006,
102
+ "trace_id": "trace_jkl012",
103
+ "start_time": "2025-01-16T14:23:13Z",
104
+ "end_time": "2025-01-16T14:23:14.800Z",
105
+ "start_time_unix_nano": "1760947229000000000",
106
+ "end_time_unix_nano": "1760947230800000000",
107
+ "error": "Tool not found: send_email",
108
+ "error_type": "tool_not_found"
109
+ },
110
+ {
111
+ "run_id": "run_001_gpt4",
112
+ "task_id": "task_005",
113
+ "test_index": 4,
114
+ "prompt": "What is 2+2?",
115
+ "expected_tool": "calculator",
116
+ "difficulty": "easy",
117
+ "category": "reasoning",
118
+ "success": true,
119
+ "response": "2+2 equals 4",
120
+ "tool_called": "calculator",
121
+ "tool_correct": true,
122
+ "expected_keywords": ["4"],
123
+ "keywords_matched": ["4"],
124
+ "execution_time_ms": 900.0,
125
+ "total_tokens": 67,
126
+ "prompt_tokens": 34,
127
+ "completion_tokens": 33,
128
+ "cost_usd": 0.0003,
129
+ "trace_id": "trace_mno345",
130
+ "start_time": "2025-01-16T14:23:16Z",
131
+ "end_time": "2025-01-16T14:23:16.900Z",
132
+ "start_time_unix_nano": "1760947232000000000",
133
+ "end_time_unix_nano": "1760947232900000000",
134
+ "error": null,
135
+ "error_type": null
136
+ }
137
+ ]
sample_data/traces_gpt4.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "trace_id": "trace_abc123",
4
+ "run_id": "run_001_gpt4",
5
+ "traceId": "trace_abc123",
6
+ "spans": [
7
+ {
8
+ "spanId": "span_001",
9
+ "parentSpanId": null,
10
+ "name": "Agent Execution",
11
+ "kind": "INTERNAL",
12
+ "startTime": 1760947217774556600,
13
+ "endTime": 1760947220224556600,
14
+ "attributes": {
15
+ "agent.type": "both",
16
+ "agent.name": "ToolCallingAgent",
17
+ "gen_ai.system": "openai",
18
+ "gen_ai.request.model": "gpt-4"
19
+ },
20
+ "status": {"code": "OK"}
21
+ },
22
+ {
23
+ "spanId": "span_002",
24
+ "parentSpanId": "span_001",
25
+ "name": "LLM Call - Reasoning",
26
+ "kind": "CLIENT",
27
+ "startTime": 1760947217774556600,
28
+ "endTime": 1760947218974556600,
29
+ "attributes": {
30
+ "gen_ai.system": "openai",
31
+ "gen_ai.request.model": "gpt-4",
32
+ "gen_ai.operation.name": "chat",
33
+ "gen_ai.usage.prompt_tokens": 78,
34
+ "gen_ai.usage.completion_tokens": 45,
35
+ "gen_ai.usage.total_tokens": 123,
36
+ "gen_ai.usage.cost.total": 0.0006,
37
+ "gen_ai.response.finish_reasons": ["stop"]
38
+ },
39
+ "status": {"code": "OK"}
40
+ },
41
+ {
42
+ "spanId": "span_003",
43
+ "parentSpanId": "span_001",
44
+ "name": "Tool Call - get_weather",
45
+ "kind": "CLIENT",
46
+ "startTime": 1760947219000556600,
47
+ "endTime": 1760947219890556600,
48
+ "attributes": {
49
+ "tool.name": "get_weather",
50
+ "tool.input": "{\"location\": \"Tokyo\"}",
51
+ "tool.output": "{\"temp\": \"18°C\", \"condition\": \"clear\"}",
52
+ "tool.latency_ms": 890
53
+ },
54
+ "status": {"code": "OK"}
55
+ },
56
+ {
57
+ "spanId": "span_004",
58
+ "parentSpanId": "span_001",
59
+ "name": "LLM Call - Final Response",
60
+ "kind": "CLIENT",
61
+ "startTime": 1760947219900556600,
62
+ "endTime": 1760947220224556600,
63
+ "attributes": {
64
+ "gen_ai.system": "openai",
65
+ "gen_ai.request.model": "gpt-4",
66
+ "gen_ai.usage.prompt_tokens": 145,
67
+ "gen_ai.usage.completion_tokens": 111,
68
+ "gen_ai.usage.cost.total": 0.0006
69
+ },
70
+ "status": {"code": "OK"}
71
+ }
72
+ ]
73
+ },
74
+ {
75
+ "trace_id": "trace_def456",
76
+ "run_id": "run_001_gpt4",
77
+ "traceId": "trace_def456",
78
+ "spans": [
79
+ {
80
+ "spanId": "span_005",
81
+ "parentSpanId": null,
82
+ "name": "Agent Execution",
83
+ "kind": "INTERNAL",
84
+ "startTime": 1760947221000000000,
85
+ "endTime": 1760947224800000000,
86
+ "attributes": {
87
+ "agent.type": "both",
88
+ "agent.name": "ToolCallingAgent",
89
+ "gen_ai.system": "openai",
90
+ "gen_ai.request.model": "gpt-4"
91
+ },
92
+ "status": {"code": "OK"}
93
+ },
94
+ {
95
+ "spanId": "span_006",
96
+ "parentSpanId": "span_005",
97
+ "name": "LLM Call - Reasoning",
98
+ "kind": "CLIENT",
99
+ "startTime": 1760947221000000000,
100
+ "endTime": 1760947222200000000,
101
+ "attributes": {
102
+ "gen_ai.system": "openai",
103
+ "gen_ai.request.model": "gpt-4",
104
+ "gen_ai.operation.name": "chat",
105
+ "gen_ai.usage.prompt_tokens": 120,
106
+ "gen_ai.usage.completion_tokens": 67,
107
+ "gen_ai.usage.total_tokens": 187
108
+ },
109
+ "status": {"code": "OK"}
110
+ },
111
+ {
112
+ "spanId": "span_007",
113
+ "parentSpanId": "span_005",
114
+ "name": "Tool Call - web_search",
115
+ "kind": "CLIENT",
116
+ "startTime": 1760947222300000000,
117
+ "endTime": 1760947224000000000,
118
+ "attributes": {
119
+ "tool.name": "web_search",
120
+ "tool.input": "{\"query\": \"recent AI news\"}",
121
+ "tool.output": "{\"results\": [...]}",
122
+ "tool.latency_ms": 1700
123
+ },
124
+ "status": {"code": "OK"}
125
+ },
126
+ {
127
+ "spanId": "span_008",
128
+ "parentSpanId": "span_005",
129
+ "name": "LLM Call - Final Response",
130
+ "kind": "CLIENT",
131
+ "startTime": 1760947224100000000,
132
+ "endTime": 1760947224800000000,
133
+ "attributes": {
134
+ "gen_ai.system": "openai",
135
+ "gen_ai.request.model": "gpt-4",
136
+ "gen_ai.usage.prompt_tokens": 189,
137
+ "gen_ai.usage.completion_tokens": 269
138
+ },
139
+ "status": {"code": "OK"}
140
+ }
141
+ ]
142
+ }
143
+ ]