Spaces:
Running
Running
Mandark-droid
committed on
Commit
·
24b4390
1
Parent(s):
1fc3adb
Add leaderboard components and enhanced data loader
Browse files
- Add HTML table generator with styled leaderboard display
- Add metric display components (badges, bars, formatters)
- Enhance data loader to support both JSON and HuggingFace sources
- Add sample data for local development and testing
- Implement automatic fallback between data sources
- components/__init__.py +60 -0
- components/leaderboard_table.py +582 -0
- components/metric_displays.py +387 -0
- data_loader.py +362 -187
- sample_data/generate_sample_metrics.py +207 -0
- sample_data/leaderboard.json +89 -0
- sample_data/metrics_gpt4.json +1 -0
- sample_data/metrics_llama31.json +1106 -0
- sample_data/results_gpt4.json +137 -0
- sample_data/traces_gpt4.json +143 -0
components/__init__.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Components package for TraceMind UI
|
| 3 |
+
Contains reusable visual components
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .metric_displays import (
|
| 7 |
+
get_rank_badge,
|
| 8 |
+
get_success_rate_bar,
|
| 9 |
+
get_gpu_utilization_bar,
|
| 10 |
+
get_provider_badge,
|
| 11 |
+
get_agent_type_badge,
|
| 12 |
+
get_hardware_badge,
|
| 13 |
+
format_cost,
|
| 14 |
+
format_duration,
|
| 15 |
+
get_tooltip_icon
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
from .leaderboard_table import (
|
| 19 |
+
generate_leaderboard_html,
|
| 20 |
+
generate_empty_state_html,
|
| 21 |
+
generate_filter_summary_html
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
from .thought_graph import create_thought_graph
|
| 25 |
+
|
| 26 |
+
from .analytics_charts import (
|
| 27 |
+
create_performance_heatmap,
|
| 28 |
+
create_speed_accuracy_scatter,
|
| 29 |
+
create_cost_efficiency_scatter,
|
| 30 |
+
create_comparison_radar
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
from .report_cards import (
|
| 34 |
+
generate_leaderboard_summary_card,
|
| 35 |
+
generate_run_report_card,
|
| 36 |
+
download_card_as_png_js
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
__all__ = [
|
| 40 |
+
'get_rank_badge',
|
| 41 |
+
'get_success_rate_bar',
|
| 42 |
+
'get_gpu_utilization_bar',
|
| 43 |
+
'get_provider_badge',
|
| 44 |
+
'get_agent_type_badge',
|
| 45 |
+
'get_hardware_badge',
|
| 46 |
+
'format_cost',
|
| 47 |
+
'format_duration',
|
| 48 |
+
'get_tooltip_icon',
|
| 49 |
+
'generate_leaderboard_html',
|
| 50 |
+
'generate_empty_state_html',
|
| 51 |
+
'generate_filter_summary_html',
|
| 52 |
+
'create_thought_graph',
|
| 53 |
+
'create_performance_heatmap',
|
| 54 |
+
'create_speed_accuracy_scatter',
|
| 55 |
+
'create_cost_efficiency_scatter',
|
| 56 |
+
'create_comparison_radar',
|
| 57 |
+
'generate_leaderboard_summary_card',
|
| 58 |
+
'generate_run_report_card',
|
| 59 |
+
'download_card_as_png_js'
|
| 60 |
+
]
|
components/leaderboard_table.py
ADDED
|
@@ -0,0 +1,582 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Leaderboard HTML Table Generator
|
| 3 |
+
Creates styled HTML tables for the leaderboard view
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from typing import Optional
|
| 8 |
+
from .metric_displays import (
|
| 9 |
+
get_rank_badge,
|
| 10 |
+
get_success_rate_bar,
|
| 11 |
+
get_gpu_utilization_bar,
|
| 12 |
+
get_provider_badge,
|
| 13 |
+
get_agent_type_badge,
|
| 14 |
+
get_hardware_badge,
|
| 15 |
+
format_cost,
|
| 16 |
+
format_duration,
|
| 17 |
+
get_tooltip_icon
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def generate_leaderboard_html(
    df: pd.DataFrame,
    sort_by: str = "success_rate",
    ascending: bool = False
) -> str:
    """
    Generate styled HTML table for leaderboard

    Args:
        df: Leaderboard DataFrame
        sort_by: Column to sort by
        ascending: Sort order (False = descending)

    Returns:
        HTML string with complete styled table

    Expected DataFrame columns:
        - model (str): Model name
        - agent_type (str): tool, code, or both
        - provider (str): litellm or transformers
        - success_rate (float): 0-100
        - total_tests (int): Number of tests
        - avg_duration_ms (float): Average duration
        - total_cost_usd (float): Total cost
        - co2_emissions_g (float): CO2 emissions
        - gpu_utilization_avg (float, optional): GPU utilization %
        - submitted_by (str): Username
    """
    # Hoisted out of the per-row loop: the import is cheap after the first
    # time, but it does not belong inside a hot loop at all.
    from datetime import datetime

    # Shared placeholder markup for metrics that are absent (e.g. CPU-only runs).
    na_span = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

    # Sort dataframe; reset_index so idx below is the display rank - 1.
    df_sorted = df.sort_values(by=sort_by, ascending=ascending).reset_index(drop=True)

    # Start HTML with embedded CSS
    html = """
    <style>
    /* Leaderboard Table Styles */
    .tm-leaderboard-container {
        background: #F8FAFC; /* Light background for better readability */
        border-radius: 16px;
        overflow-x: auto; /* Enable horizontal scrolling */
        overflow-y: visible;
        border: 1px solid rgba(203, 213, 225, 0.8);
        margin: 20px 0;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        max-width: 100%;
    }

    /* Custom scrollbar styling */
    .tm-leaderboard-container::-webkit-scrollbar {
        height: 8px;
    }

    .tm-leaderboard-container::-webkit-scrollbar-track {
        background: #E2E8F0;
        border-radius: 4px;
    }

    .tm-leaderboard-container::-webkit-scrollbar-thumb {
        background: #94A3B8;
        border-radius: 4px;
    }

    .tm-leaderboard-container::-webkit-scrollbar-thumb:hover {
        background: #64748B;
    }

    .tm-leaderboard-table {
        width: 100%;
        min-width: 1650px; /* Reduced from 1800px after combining columns */
        border-collapse: collapse;
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
        background: #FFFFFF; /* White background */
        color: #0F172A; /* Dark text for maximum contrast */
    }

    .tm-leaderboard-table thead {
        background: linear-gradient(135deg, #6366F1 0%, #4F46E5 100%); /* Vibrant indigo gradient */
        position: sticky;
        top: 0;
        z-index: 10;
        backdrop-filter: blur(10px);
    }

    .tm-leaderboard-table th {
        padding: 16px 12px;
        text-align: left;
        font-weight: 600;
        color: #FFFFFF; /* Pure white for headers - good contrast */
        border-bottom: 2px solid #4338CA;
        font-size: 12px;
        text-transform: uppercase;
        letter-spacing: 0.05em;
        white-space: nowrap;
    }

    .tm-leaderboard-table td {
        padding: 14px 12px;
        border-bottom: 1px solid rgba(226, 232, 240, 0.8);
        color: #1E293B; /* Dark text for cells */
        font-size: 14px;
        vertical-align: middle;
    }

    .tm-leaderboard-table tbody tr {
        transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
        cursor: pointer;
    }

    .tm-leaderboard-table tbody tr:hover {
        background: rgba(99, 102, 241, 0.08) !important;
        box-shadow: 0 0 15px rgba(99, 102, 241, 0.15),
                    inset 0 0 15px rgba(99, 102, 241, 0.05);
        transform: scale(1.002);
    }

    .tm-leaderboard-table tbody tr:nth-child(even) {
        background: rgba(241, 245, 249, 0.6); /* Light stripe */
    }

    .tm-model-name {
        font-weight: 600;
        color: #000000 !important; /* Pure black - readable in all themes */
        font-size: 15px;
        transition: color 0.2s ease;
    }

    .tm-leaderboard-table tr:hover .tm-model-name {
        color: #4F46E5 !important; /* Indigo on hover */
    }

    .tm-numeric-cell {
        font-family: 'Monaco', 'Menlo', monospace;
        font-size: 13px;
        text-align: center;
        color: #000000 !important; /* Pure black for numbers */
    }

    .tm-badge-cell {
        text-align: center;
    }

    .tm-run-id {
        font-family: 'Monaco', 'Menlo', monospace;
        font-size: 12px;
        color: #000000 !important; /* Pure black - readable in all themes */
        cursor: pointer;
        text-decoration: none;
        font-weight: 500;
        transition: all 0.2s ease;
    }

    .tm-run-id:hover {
        color: #4F46E5 !important; /* Indigo on hover */
        text-decoration: underline;
    }

    .tm-text-cell {
        color: #000000 !important; /* Pure black for all text */
        font-size: 0.9em;
    }

    /* Responsive Design */
    @media (max-width: 1024px) {
        .tm-leaderboard-table th,
        .tm-leaderboard-table td {
            padding: 10px 8px;
            font-size: 12px;
        }

        /* Hide less important columns on smaller screens */
        .tm-hide-mobile {
            display: none !important;
        }
    }

    @media (max-width: 768px) {
        .tm-leaderboard-table th:nth-child(n+7),
        .tm-leaderboard-table td:nth-child(n+7) {
            display: none !important;
        }

        .tm-model-name {
            font-size: 13px;
        }
    }

    @media (max-width: 480px) {
        /* Ultra-compact: Show only rank, model, and success rate */
        .tm-leaderboard-table th:nth-child(n+4),
        .tm-leaderboard-table td:nth-child(n+4) {
            display: none !important;
        }

        .tm-leaderboard-table th:nth-child(3),
        .tm-leaderboard-table td:nth-child(3) {
            display: table-cell !important;
        }
    }
    </style>

    <div class="tm-leaderboard-container">
        <table class="tm-leaderboard-table">
            <thead>
                <tr>
                    <th style="width: 60px;">Rank</th>
                    <th style="width: 110px;" title="Click to view detailed run information">Run ID</th>
                    <th style="min-width: 160px;">Model</th>
                    <th style="width: 80px;">Type</th>
                    <th style="width: 90px;">Provider</th>
                    <th style="width: 85px;" title="Hardware used for evaluation: GPU or CPU">Hardware</th>
                    <th style="width: 150px;" title="Percentage of test cases that passed (0-100%). Higher is better.">
                        Success Rate
                    </th>
                    <th style="width: 140px;" class="tm-numeric-cell" title="Tests: Total / Pass / Fail">
                        Tests (P/F)
                    </th>
                    <th style="width: 70px;" class="tm-numeric-cell" title="Average number of steps per test case.">
                        Steps
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell" title="Average time per test case. Lower is better.">
                        Duration
                    </th>
                    <th style="width: 90px;" class="tm-numeric-cell" title="Total tokens used across all tests.">
                        Tokens
                    </th>
                    <th style="width: 90px;" class="tm-numeric-cell" title="Total API + power costs in USD. Lower is better.">
                        Cost
                    </th>
                    <th style="width: 80px;" class="tm-numeric-cell tm-hide-mobile" title="Carbon footprint in grams of CO2 equivalent.">
                        CO2
                    </th>
                    <th style="width: 100px;" class="tm-hide-mobile" title="Average GPU usage during evaluation (0-100%). Only for GPU jobs.">
                        GPU Util
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="GPU memory usage (avg/max in MiB). Only for GPU jobs.">
                        GPU Mem
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="GPU temperature (avg/max in Celsius). Only for GPU jobs.">
                        GPU Temp
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="Average GPU power consumption in Watts. Only for GPU jobs.">
                        GPU Power
                    </th>
                    <th style="width: 140px;" class="tm-hide-mobile">Timestamp</th>
                    <th style="width: 110px;" class="tm-hide-mobile">Submitted By</th>
                </tr>
            </thead>
            <tbody>
    """

    # Generate table rows
    for idx, row in df_sorted.iterrows():
        rank = idx + 1

        # Get values with safe defaults
        # NOTE(review): model / submitted_by are interpolated into HTML
        # unescaped; values are assumed trusted — confirm, or html.escape them.
        model = row.get('model', 'Unknown')
        agent_type = row.get('agent_type', 'unknown')
        provider = row.get('provider', 'unknown')
        success_rate = row.get('success_rate', 0.0)
        total_tests = row.get('total_tests', 0)
        successful_tests = row.get('successful_tests', 0)
        failed_tests = row.get('failed_tests', 0)
        avg_steps = row.get('avg_steps', 0.0)
        avg_duration_ms = row.get('avg_duration_ms', 0.0)
        total_tokens = row.get('total_tokens', 0)
        total_cost_usd = row.get('total_cost_usd', 0.0)
        co2_emissions_g = row.get('co2_emissions_g', 0.0)
        gpu_utilization_avg = row.get('gpu_utilization_avg', None)
        gpu_memory_avg_mib = row.get('gpu_memory_avg_mib', None)
        gpu_memory_max_mib = row.get('gpu_memory_max_mib', None)
        gpu_temperature_avg = row.get('gpu_temperature_avg', None)
        gpu_temperature_max = row.get('gpu_temperature_max', None)
        gpu_power_avg_w = row.get('gpu_power_avg_w', None)
        timestamp = row.get('timestamp', '')
        submitted_by = row.get('submitted_by', 'Unknown')

        # Check if GPU job
        has_gpu = pd.notna(gpu_utilization_avg) and gpu_utilization_avg > 0

        # Format GPU utilization
        if has_gpu:
            gpu_display = get_gpu_utilization_bar(gpu_utilization_avg)
        else:
            gpu_display = na_span

        # Format CO2
        if pd.notna(co2_emissions_g) and co2_emissions_g > 0:
            co2_display = f'<span style="font-family: monospace; font-size: 0.9em; color: #334155;">{co2_emissions_g:.2f}g</span>'
        else:
            co2_display = na_span

        # Format GPU Memory
        if pd.notna(gpu_memory_avg_mib) and pd.notna(gpu_memory_max_mib):
            gpu_mem_display = f'<span style="font-family: monospace; font-size: 0.85em; color: #334155;">{gpu_memory_avg_mib:.0f}/{gpu_memory_max_mib:.0f}</span>'
        else:
            gpu_mem_display = na_span

        # Format GPU Temperature
        if pd.notna(gpu_temperature_avg) and pd.notna(gpu_temperature_max):
            gpu_temp_display = f'<span style="font-family: monospace; font-size: 0.85em; color: #334155;">{gpu_temperature_avg:.0f}/{gpu_temperature_max:.0f}°C</span>'
        else:
            gpu_temp_display = na_span

        # Format GPU Power
        if pd.notna(gpu_power_avg_w):
            gpu_power_display = f'<span style="font-family: monospace; font-size: 0.85em; color: #334155;">{gpu_power_avg_w:.1f}W</span>'
        else:
            gpu_power_display = na_span

        # Format timestamp: accept pandas Timestamps or ISO-8601 strings
        # (including trailing 'Z'); fall back to a raw prefix on parse failure.
        if pd.notna(timestamp):
            try:
                # Handle both string and Timestamp objects
                if isinstance(timestamp, pd.Timestamp):
                    timestamp_display = timestamp.strftime('%Y-%m-%d %H:%M')
                else:
                    dt = datetime.fromisoformat(str(timestamp).replace('Z', '+00:00'))
                    timestamp_display = dt.strftime('%Y-%m-%d %H:%M')
            except Exception:
                timestamp_display = str(timestamp)[:16] if timestamp else 'N/A'
        else:
            timestamp_display = 'N/A'

        # Format Run ID (show first 8 characters). Coerce to str first so a
        # missing/NaN run_id (a float under pandas) cannot crash len()/slicing.
        run_id = row.get('run_id', 'N/A')
        run_id = str(run_id) if pd.notna(run_id) else 'N/A'
        run_id_short = run_id[:8] + '...' if len(run_id) > 8 else run_id

        html += f"""
                <tr data-run-id="{run_id}" data-rank="{rank}" class="tm-clickable-row">
                    <td>{get_rank_badge(rank)}</td>
                    <td class="tm-run-id" title="{run_id}">{run_id_short}</td>
                    <td class="tm-model-name">{model}</td>
                    <td class="tm-badge-cell">{get_agent_type_badge(agent_type)}</td>
                    <td class="tm-badge-cell">{get_provider_badge(provider)}</td>
                    <td class="tm-badge-cell">{get_hardware_badge(has_gpu)}</td>
                    <td>{get_success_rate_bar(success_rate)}</td>
                    <td class="tm-numeric-cell">
                        <strong>{total_tests}</strong>
                        <span style="color: #CBD5E1; margin: 0 4px;">/</span>
                        <span style="color: #10B981; font-weight: 600;">{successful_tests}</span>
                        <span style="color: #CBD5E1; margin: 0 4px;">/</span>
                        <span style="color: #EF4444; font-weight: 600;">{failed_tests}</span>
                    </td>
                    <td class="tm-numeric-cell">{avg_steps:.1f}</td>
                    <td class="tm-numeric-cell">{format_duration(avg_duration_ms)}</td>
                    <td class="tm-numeric-cell">{total_tokens:,}</td>
                    <td class="tm-numeric-cell">{format_cost(total_cost_usd)}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{co2_display}</td>
                    <td class="tm-hide-mobile">{gpu_display}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{gpu_mem_display}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{gpu_temp_display}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{gpu_power_display}</td>
                    <td class="tm-hide-mobile tm-text-cell">{timestamp_display}</td>
                    <td class="tm-hide-mobile tm-text-cell">
                        {submitted_by}
                    </td>
                </tr>
        """

    html += """
            </tbody>
        </table>
    </div>

    <script>
    // Add click handler for Run ID cells - runs on each table render
    (function() {
        // Function to attach handlers
        function attachRowClickHandlers() {
            const cells = document.querySelectorAll('.tm-run-id');
            console.log('Found', cells.length, 'Run ID cells');

            cells.forEach(function(cell) {
                // Remove existing listener to avoid duplicates
                cell.replaceWith(cell.cloneNode(true));
            });

            // Re-select after cloning
            document.querySelectorAll('.tm-run-id').forEach(function(cell) {
                cell.addEventListener('click', function(e) {
                    e.stopPropagation();
                    const row = this.closest('tr');
                    const rowIndex = Array.from(row.parentNode.children).indexOf(row);

                    console.log('Run ID clicked, row index:', rowIndex);

                    // Try multiple ways to find the textbox
                    let textbox = null;

                    // Method 1: By elem_id
                    const container1 = document.getElementById('selected_row_index');
                    if (container1) {
                        textbox = container1.querySelector('textarea, input[type="text"]');
                        console.log('Method 1 (elem_id):', textbox ? 'Found' : 'Not found');
                    }

                    // Method 2: By data-testid
                    if (!textbox) {
                        const containers = document.querySelectorAll('[data-testid="textbox"]');
                        console.log('Method 2: Found', containers.length, 'textbox containers');
                        for (let container of containers) {
                            const input = container.querySelector('textarea, input[type="text"]');
                            if (input && !container.closest('.label-wrap')) {
                                textbox = input;
                                console.log('Method 2: Using hidden textbox');
                                break;
                            }
                        }
                    }

                    if (textbox) {
                        // Set the row index
                        textbox.value = rowIndex.toString();

                        // Trigger multiple events to ensure Gradio picks it up
                        textbox.dispatchEvent(new Event('input', { bubbles: true }));
                        textbox.dispatchEvent(new Event('change', { bubbles: true }));
                        textbox.dispatchEvent(new Event('blur', { bubbles: true }));

                        // Also try triggering on the container
                        const container = textbox.closest('[data-testid="textbox"]');
                        if (container) {
                            container.dispatchEvent(new Event('input', { bubbles: true }));
                        }

                        console.log('Textbox updated to:', rowIndex);
                    } else {
                        console.error('Could not find hidden textbox!');
                    }
                });
            });
        }

        // Attach immediately
        attachRowClickHandlers();

        // Also attach after a short delay (in case table loads async)
        setTimeout(attachRowClickHandlers, 500);
        setTimeout(attachRowClickHandlers, 1000);
        setTimeout(attachRowClickHandlers, 2000);
    })();
    </script>
    """

    return html
|
| 468 |
+
|
| 469 |
+
def generate_empty_state_html() -> str:
    """
    Generate HTML for empty leaderboard state

    Returns:
        HTML string for empty state
    """
    # Static markup only: a dashed placeholder card with icon, headline,
    # hint text, and a call-to-action button. No inputs, no state.
    empty_state = """
    <div style="
        text-align: center;
        padding: 60px 20px;
        background: var(--tm-bg-card, #1E293B);
        border-radius: 16px;
        border: 2px dashed var(--tm-border-default, rgba(148, 163, 184, 0.2));
        margin: 20px 0;
    ">
        <div style="font-size: 48px; margin-bottom: 16px;">📊</div>
        <h3 style="
            color: var(--tm-text-primary, #F1F5F9);
            margin: 0 0 12px 0;
            font-size: 1.5rem;
        ">
            No Evaluation Results Yet
        </h3>
        <p style="
            color: var(--tm-text-secondary, #94A3B8);
            margin: 0 0 24px 0;
            font-size: 1rem;
        ">
            Run your first evaluation to see results appear here.
        </p>
        <button style="
            padding: 12px 24px;
            background: var(--tm-primary, #4F46E5);
            color: white;
            border: none;
            border-radius: 8px;
            font-weight: 600;
            cursor: pointer;
            font-size: 1rem;
        ">
            Start New Evaluation
        </button>
    </div>
    """
    return empty_state
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
def generate_filter_summary_html(
    total_runs: int,
    filtered_runs: int,
    active_filters: dict
) -> str:
    """
    Generate summary of active filters

    Args:
        total_runs: Total number of runs
        filtered_runs: Number of runs after filtering
        active_filters: Dict of active filter values

    Returns:
        HTML string with filter summary
    """
    # Fast path: nothing was filtered out, so just report the total count.
    if filtered_runs == total_runs:
        return f"""
        <div style="
            padding: 12px 16px;
            background: var(--tm-bg-secondary, #334155);
            border-radius: 8px;
            margin-bottom: 16px;
            color: var(--tm-text-secondary, #94A3B8);
            font-size: 0.9em;
        ">
            Showing all <strong style="color: var(--tm-text-primary, #F1F5F9);">{total_runs}</strong> evaluation runs
        </div>
        """

    # One chip per active filter; empty values and the "All" sentinel mean
    # "not filtering on this key" and are skipped.
    filter_chips = [
        f"""
        <span style="
            display: inline-flex;
            align-items: center;
            padding: 4px 10px;
            background: var(--tm-primary, #4F46E5);
            color: white;
            border-radius: 6px;
            font-size: 0.85em;
            margin-right: 8px;
            font-weight: 500;
        ">
            {key}: {value}
        </span>
        """
        for key, value in active_filters.items()
        if value and value != "All"
    ]

    # "".join of an empty list is already "", so no conditional is needed.
    filters_html = "".join(filter_chips)

    return f"""
    <div style="
        padding: 12px 16px;
        background: var(--tm-bg-secondary, #334155);
        border-radius: 8px;
        margin-bottom: 16px;
        color: var(--tm-text-secondary, #94A3B8);
        font-size: 0.9em;
    ">
        <div style="margin-bottom: 8px;">
            Showing <strong style="color: var(--tm-text-primary, #F1F5F9);">{filtered_runs}</strong> of
            <strong style="color: var(--tm-text-primary, #F1F5F9);">{total_runs}</strong> runs
        </div>
        {filters_html}
    </div>
    """
|
components/metric_displays.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Metric Display Components
|
| 3 |
+
Reusable HTML generators for badges, progress bars, and visual metrics
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
def get_rank_badge(rank: int) -> str:
    """
    Render the HTML badge for a leaderboard rank.

    Ranks 1-3 get medal-styled gradient badges; every other rank is shown
    as a plain muted number.

    Args:
        rank: Position in leaderboard (1-indexed)

    Returns:
        HTML string for the rank badge
    """
    # (label, background gradient, text color, box shadow) per medal rank
    medal_styles = {
        1: ("🥇 1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000", "0 2px 8px rgba(255, 215, 0, 0.4)"),
        2: ("🥈 2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff", "0 2px 8px rgba(156, 163, 175, 0.4)"),
        3: ("🥉 3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff", "0 2px 8px rgba(205, 127, 50, 0.4)"),
    }

    style = medal_styles.get(rank)
    if style is None:
        # Non-medal rank: plain numeric display
        return f"""
        <span style="
            display: inline-flex;
            align-items: center;
            justify-content: center;
            min-width: 32px;
            color: var(--tm-text-muted, #64748B);
            font-weight: 500;
            font-size: 0.95em;
        ">
            {rank}
        </span>
        """

    label, gradient, text_color, shadow = style
    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        justify-content: center;
        min-width: 60px;
        padding: 6px 12px;
        background: {gradient};
        color: {text_color};
        border-radius: 8px;
        font-weight: 700;
        font-size: 0.9em;
        box-shadow: {shadow};
        letter-spacing: 0.5px;
    ">
        {label}
    </span>
    """
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def get_success_rate_bar(success_rate: float) -> str:
    """
    Render a success-rate progress bar with a performance-coded gradient.

    Args:
        success_rate: Success percentage (0-100)

    Returns:
        HTML string containing the bar plus the numeric value

    Color bands: >=80% green→cyan, 50-79% orange→yellow, <50% red→orange.
    """
    # The fill width is clamped to 0-100; the printed number shows the raw value
    fill = min(max(success_rate, 0), 100)

    if success_rate >= 80:
        gradient, glow = "linear-gradient(90deg, #10B981, #06B6D4)", "#10B981"   # Green → Cyan
    elif success_rate >= 50:
        gradient, glow = "linear-gradient(90deg, #F59E0B, #FBBF24)", "#F59E0B"   # Orange → Yellow
    else:
        gradient, glow = "linear-gradient(90deg, #EF4444, #F59E0B)", "#EF4444"   # Red → Orange

    return f"""
    <div style="display: flex; align-items: center; gap: 10px; width: 100%;">
        <div style="
            flex: 1;
            height: 8px;
            background: rgba(148, 163, 184, 0.15);
            border-radius: 4px;
            overflow: hidden;
            max-width: 160px;
            position: relative;
        ">
            <div style="
                width: {fill}%;
                height: 100%;
                background: {gradient};
                border-radius: 4px;
                transition: width 0.5s cubic-bezier(0.4, 0, 0.2, 1);
                box-shadow: 0 0 8px {glow}40;
            "></div>
        </div>
        <span style="
            font-family: 'Monaco', 'Menlo', monospace;
            font-weight: 600;
            color: var(--tm-text-primary, #000000);
            min-width: 55px;
            font-size: 0.9em;
        ">{success_rate:.1f}%</span>
    </div>
    """
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def get_gpu_utilization_bar(utilization: float) -> str:
    """
    Render a GPU-utilization progress bar.

    Args:
        utilization: GPU utilization percentage (0-100)

    Returns:
        HTML string with the bar and the numeric value

    Color bands: <30% yellow→amber, 30-69% amber→orange, >=70% orange→deep
    orange (high utilization indicates good hardware efficiency).
    """
    fill = min(max(utilization, 0), 100)

    # Higher utilization = warmer colors (better hardware efficiency)
    if utilization >= 70:
        gradient = "linear-gradient(90deg, #FB923C, #F97316)"   # Orange → Deep Orange
    elif utilization >= 30:
        gradient = "linear-gradient(90deg, #F59E0B, #FB923C)"   # Amber → Orange
    else:
        gradient = "linear-gradient(90deg, #FBBF24, #F59E0B)"   # Yellow → Amber

    return f"""
    <div style="display: flex; align-items: center; gap: 8px;">
        <div style="
            flex: 1;
            height: 6px;
            background: rgba(148, 163, 184, 0.15);
            border-radius: 3px;
            max-width: 100px;
        ">
            <div style="
                width: {fill}%;
                height: 100%;
                background: {gradient};
                border-radius: 3px;
                transition: width 0.4s ease;
            "></div>
        </div>
        <span style="
            font-family: monospace;
            font-size: 0.85em;
            color: var(--tm-text-secondary, #000000);
            min-width: 45px;
        ">{utilization:.1f}%</span>
    </div>
    """
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def get_provider_badge(provider: str) -> str:
    """
    Render a colored badge for a model provider.

    Args:
        provider: Provider name (litellm, transformers, openai, anthropic, ...)

    Returns:
        HTML string for the badge (unknown providers get a neutral gray)
    """
    # Brand-ish colors per known provider
    palette = {
        "litellm": "#3B82F6",        # Blue - API providers
        "transformers": "#10B981",   # Green - GPU/local
        "openai": "#10A37F",         # OpenAI green
        "anthropic": "#D97757",      # Anthropic orange
    }
    background = palette.get(provider.lower(), "#6B7280")  # Default gray

    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        padding: 4px 10px;
        background: {background};
        color: white;
        border-radius: 5px;
        font-size: 0.75em;
        font-weight: 600;
        text-transform: uppercase;
        letter-spacing: 0.5px;
        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
    ">
        {provider.upper()}
    </span>
    """
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def get_agent_type_badge(agent_type: str) -> str:
    """
    Render a colored badge for the agent type.

    Args:
        agent_type: Agent type ("tool", "code", or "both")

    Returns:
        HTML string for the badge (unknown types get a neutral gray)
    """
    # tool=purple, code=amber, both=cyan
    palette = {
        "tool": "#8B5CF6",
        "code": "#F59E0B",
        "both": "#06B6D4",
    }
    background = palette.get(agent_type.lower(), "#6B7280")

    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        padding: 4px 10px;
        background: {background};
        color: white;
        border-radius: 5px;
        font-size: 0.75em;
        font-weight: 600;
        text-transform: uppercase;
        letter-spacing: 0.5px;
        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
    ">
        {agent_type.upper()}
    </span>
    """
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def get_hardware_badge(has_gpu: bool) -> str:
    """
    Render a badge indicating the hardware a job ran on.

    Args:
        has_gpu: Whether the job used a GPU

    Returns:
        HTML string: warm gradient "GPU" badge or gray "CPU" badge
    """
    # Only the label and background differ between the two variants
    if has_gpu:
        label = "🖥️ GPU"
        background = "linear-gradient(135deg, #F59E0B, #EF4444)"
    else:
        label = "💻 CPU"
        background = "#6B7280"

    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        gap: 4px;
        padding: 4px 10px;
        background: {background};
        color: white;
        border-radius: 5px;
        font-size: 0.75em;
        font-weight: 600;
        letter-spacing: 0.5px;
        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
    ">
        {label}
    </span>
    """
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def format_cost(cost_usd: float) -> str:
    """
    Format a USD cost with magnitude-based color coding.

    Args:
        cost_usd: Cost in USD

    Returns:
        HTML string: green under $0.01, amber under $0.05, red otherwise
    """
    # Ordered thresholds: first match wins; red is the fallback
    bands = ((0.01, "#10B981"), (0.05, "#F59E0B"))
    color = "#EF4444"
    for upper_bound, band_color in bands:
        if cost_usd < upper_bound:
            color = band_color
            break

    return f"""
    <span style="
        font-family: monospace;
        font-weight: 600;
        color: {color};
        font-size: 0.9em;
    ">
        ${cost_usd:.4f}
    </span>
    """
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def format_duration(duration_ms: float) -> str:
    """
    Format a duration with an appropriate unit and speed-coded color.

    Args:
        duration_ms: Duration in milliseconds

    Returns:
        HTML string: <1s shown in ms (green), 1-10s in seconds (amber),
        >=10s in seconds (red)
    """
    if duration_ms < 1000:
        # Sub-second: keep milliseconds, green = fast
        value, unit, color = duration_ms, "ms", "#10B981"
    else:
        # At or above one second: convert to seconds once,
        # then pick amber (moderate) vs red (slow) by the 10s threshold
        value, unit = duration_ms / 1000, "s"
        color = "#F59E0B" if duration_ms < 10000 else "#EF4444"

    return f"""
    <span style="
        font-family: monospace;
        color: {color};
        font-weight: 500;
        font-size: 0.9em;
    ">
        {value:.1f}{unit}
    </span>
    """
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
def get_tooltip_icon(tooltip_text: str) -> str:
    """
    Generate an info icon with a native browser tooltip.

    Args:
        tooltip_text: Text to show in the tooltip. The text is HTML-escaped
            so quotes or markup cannot break out of the title attribute.

    Returns:
        HTML string for the icon with the tooltip
    """
    import html

    # Escape quotes and angle brackets: the text lands inside a double-quoted
    # HTML attribute, so an unescaped '"' would truncate the tooltip and let
    # the remainder of the string be injected as live markup.
    safe_text = html.escape(tooltip_text, quote=True)

    return f"""
    <span title="{safe_text}" style="
        color: var(--tm-secondary, #06B6D4);
        cursor: help;
        font-size: 0.9em;
        margin-left: 4px;
    ">ⓘ</span>
    """
|
data_loader.py
CHANGED
|
@@ -1,255 +1,430 @@
|
|
| 1 |
"""
|
| 2 |
-
Data Loader for
|
| 3 |
-
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
import pandas as pd
|
| 9 |
from datasets import load_dataset
|
| 10 |
-
from
|
|
|
|
| 11 |
|
| 12 |
-
# Load environment variables
|
| 13 |
-
load_dotenv()
|
| 14 |
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def __init__(
|
| 20 |
self,
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
hf_token: Optional[str] = None
|
| 23 |
):
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
"""
|
| 31 |
-
self.leaderboard_repo = leaderboard_repo or os.getenv(
|
| 32 |
-
'LEADERBOARD_REPO',
|
| 33 |
-
'kshitijthakkar/smoltrace-leaderboard'
|
| 34 |
-
)
|
| 35 |
-
self.hf_token = hf_token or os.getenv('HF_TOKEN')
|
| 36 |
-
|
| 37 |
-
# Cache for loaded datasets
|
| 38 |
-
self._leaderboard_df: Optional[pd.DataFrame] = None
|
| 39 |
-
self._results_cache: Dict[str, pd.DataFrame] = {}
|
| 40 |
-
self._traces_cache: Dict[str, List[Dict]] = {}
|
| 41 |
-
self._metrics_cache: Dict[str, Dict] = {}
|
| 42 |
-
|
| 43 |
-
def load_leaderboard(self, force_refresh: bool = False) -> pd.DataFrame:
|
| 44 |
-
"""
|
| 45 |
-
Load leaderboard dataset from HuggingFace
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
|
|
|
| 49 |
|
| 50 |
Returns:
|
| 51 |
DataFrame with leaderboard data
|
| 52 |
"""
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
try:
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
split='train',
|
| 63 |
-
token=self.hf_token
|
| 64 |
-
)
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
'success_rate', 'total_tests', 'successful_tests', 'failed_tests',
|
| 78 |
-
'avg_steps', 'avg_duration_ms', 'total_duration_ms',
|
| 79 |
-
'total_tokens', 'avg_tokens_per_test', 'total_cost_usd', 'avg_cost_per_test_usd',
|
| 80 |
-
'co2_emissions_g', 'gpu_utilization_avg', 'gpu_memory_max_mib',
|
| 81 |
-
'results_dataset', 'traces_dataset', 'metrics_dataset',
|
| 82 |
-
'timestamp', 'submitted_by', 'hf_job_id', 'job_type',
|
| 83 |
-
'dataset_used', 'smoltrace_version'
|
| 84 |
-
])
|
| 85 |
-
|
| 86 |
-
def load_results(self, results_repo: str, force_refresh: bool = False) -> pd.DataFrame:
|
| 87 |
"""
|
| 88 |
Load results dataset for a specific run
|
| 89 |
|
| 90 |
Args:
|
| 91 |
-
|
| 92 |
-
force_refresh: Force reload from HF
|
| 93 |
|
| 94 |
Returns:
|
| 95 |
DataFrame with test case results
|
| 96 |
"""
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
"""
|
| 129 |
Load traces dataset for a specific run
|
| 130 |
|
| 131 |
Args:
|
| 132 |
-
|
| 133 |
-
force_refresh: Force reload from HF
|
| 134 |
-
|
| 135 |
-
Returns:
|
| 136 |
-
List of trace dictionaries (OpenTelemetry format)
|
| 137 |
-
"""
|
| 138 |
-
if traces_repo in self._traces_cache and not force_refresh:
|
| 139 |
-
return self._traces_cache[traces_repo]
|
| 140 |
-
|
| 141 |
-
try:
|
| 142 |
-
print(f"🔍 Loading traces from {traces_repo}...")
|
| 143 |
-
|
| 144 |
-
dataset = load_dataset(
|
| 145 |
-
traces_repo,
|
| 146 |
-
split='train',
|
| 147 |
-
token=self.hf_token
|
| 148 |
-
)
|
| 149 |
-
|
| 150 |
-
# Convert to list of dicts
|
| 151 |
-
traces = [dict(item) for item in dataset]
|
| 152 |
-
self._traces_cache[traces_repo] = traces
|
| 153 |
-
|
| 154 |
-
print(f"✅ Loaded {len(traces)} traces")
|
| 155 |
-
return traces
|
| 156 |
-
|
| 157 |
-
except Exception as e:
|
| 158 |
-
print(f"❌ Error loading traces: {e}")
|
| 159 |
-
return []
|
| 160 |
-
|
| 161 |
-
def load_metrics(self, metrics_repo: str, force_refresh: bool = False) -> Dict[str, Any]:
|
| 162 |
-
"""
|
| 163 |
-
Load GPU metrics dataset for a specific run
|
| 164 |
-
|
| 165 |
-
Args:
|
| 166 |
-
metrics_repo: HuggingFace dataset repo for metrics
|
| 167 |
-
force_refresh: Force reload from HF
|
| 168 |
|
| 169 |
Returns:
|
| 170 |
-
|
| 171 |
"""
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
"""
|
| 200 |
-
|
| 201 |
|
| 202 |
Args:
|
| 203 |
-
|
| 204 |
|
| 205 |
Returns:
|
| 206 |
-
|
| 207 |
"""
|
| 208 |
-
|
| 209 |
|
| 210 |
-
|
|
|
|
| 211 |
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
else:
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
"""
|
| 219 |
-
Get a specific trace by
|
| 220 |
|
| 221 |
Args:
|
| 222 |
-
|
| 223 |
-
trace_id: Trace ID to
|
| 224 |
|
| 225 |
Returns:
|
| 226 |
-
Trace
|
| 227 |
"""
|
| 228 |
-
traces = self.load_traces(
|
| 229 |
|
| 230 |
for trace in traces:
|
| 231 |
-
if trace.get(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
return trace
|
| 233 |
|
| 234 |
return None
|
| 235 |
|
| 236 |
-
def clear_cache(self):
|
| 237 |
-
"""Clear
|
| 238 |
-
self.
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
| 243 |
|
| 244 |
|
| 245 |
-
def create_data_loader_from_env() ->
|
| 246 |
"""
|
| 247 |
-
Create
|
| 248 |
|
| 249 |
Returns:
|
| 250 |
-
|
| 251 |
"""
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
)
|
|
|
|
| 1 |
"""
|
| 2 |
+
Data Loader for MockTraceMind
|
| 3 |
+
Supports loading from both JSON files and HuggingFace datasets
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
| 7 |
+
import json
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Dict, List, Optional, Any, Literal
|
| 10 |
import pandas as pd
|
| 11 |
from datasets import load_dataset
|
| 12 |
+
from huggingface_hub import HfApi
|
| 13 |
+
import gradio as gr
|
| 14 |
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
DataSource = Literal["json", "huggingface", "both"]
|
| 17 |
|
| 18 |
+
|
| 19 |
+
class DataLoader:
|
| 20 |
+
"""
|
| 21 |
+
Unified data loader for MockTraceMind
|
| 22 |
+
|
| 23 |
+
Supports:
|
| 24 |
+
- Local JSON files
|
| 25 |
+
- HuggingFace datasets
|
| 26 |
+
- Automatic fallback between sources
|
| 27 |
+
- Caching for performance
|
| 28 |
+
"""
|
| 29 |
|
| 30 |
    def __init__(
        self,
        data_source: DataSource = "both",
        json_data_path: Optional[str] = None,
        leaderboard_dataset: Optional[str] = None,
        hf_token: Optional[str] = None
    ):
        """
        Initialize the loader.

        Args:
            data_source: Which backend(s) to use: "json", "huggingface",
                or "both" (try HuggingFace first, fall back to local JSON).
            json_data_path: Directory holding local JSON fixtures.
                Defaults to $JSON_DATA_PATH or ./sample_data.
            leaderboard_dataset: HuggingFace dataset id for the leaderboard.
                Defaults to $LEADERBOARD_DATASET.
            hf_token: HuggingFace API token. Defaults to $HF_TOKEN.
        """
        self.data_source = data_source
        self.json_data_path = Path(json_data_path or os.getenv("JSON_DATA_PATH", "./sample_data"))
        self.leaderboard_dataset = leaderboard_dataset or os.getenv("LEADERBOARD_DATASET", "huggingface/smolagents-leaderboard")
        self.hf_token = hf_token or os.getenv("HF_TOKEN")

        # In-memory cache keyed by "<kind>_<dataset_id>" (see load_* methods)
        self._cache: Dict[str, Any] = {}
        # HfApi client only when a token is configured
        # NOTE(review): hf_api is not used in the methods visible here — confirm callers
        self.hf_api = HfApi(token=self.hf_token) if self.hf_token else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
+
    def load_leaderboard(self) -> pd.DataFrame:
        """
        Load the leaderboard dataset.

        Tries HuggingFace first (when data_source allows), then falls back
        to the local JSON file. The result is cached in memory.

        Returns:
            DataFrame with leaderboard data

        Raises:
            Exception: from the HF loader when data_source == "huggingface",
                or from the JSON loader when the fallback also fails.
            ValueError: if data_source permits neither backend.
        """
        cache_key = "leaderboard"

        if cache_key in self._cache:
            return self._cache[cache_key]

        # Try HuggingFace first
        if self.data_source in ["huggingface", "both"]:
            try:
                df = self._load_leaderboard_from_hf()
                self._cache[cache_key] = df
                return df
            except Exception as e:
                print(f"Failed to load from HuggingFace: {e}")
                if self.data_source == "huggingface":
                    # HF-only mode: no fallback available, surface the error
                    raise

        # Fallback to JSON
        if self.data_source in ["json", "both"]:
            try:
                df = self._load_leaderboard_from_json()
                self._cache[cache_key] = df
                return df
            except Exception as e:
                print(f"Failed to load from JSON: {e}")
                raise

        raise ValueError("No valid data source available")
|
| 80 |
+
|
| 81 |
+
    def _load_leaderboard_from_hf(self) -> pd.DataFrame:
        """Load the leaderboard from the configured HuggingFace dataset (train split)."""
        try:
            ds = load_dataset(self.leaderboard_dataset, split="train", token=self.hf_token)
            df = ds.to_pandas()
            print(f"[OK] Loaded leaderboard from HuggingFace: {len(df)} rows")
            return df
        except Exception as e:
            # Log and re-raise so the caller can decide whether to fall back
            print(f"[ERROR] Loading from HuggingFace: {e}")
            raise
|
| 91 |
|
| 92 |
+
    def _load_leaderboard_from_json(self) -> pd.DataFrame:
        """
        Load the leaderboard from <json_data_path>/leaderboard.json.

        Raises:
            FileNotFoundError: if the file does not exist.
        """
        json_path = self.json_data_path / "leaderboard.json"

        if not json_path.exists():
            raise FileNotFoundError(f"Leaderboard JSON not found: {json_path}")

        with open(json_path, "r") as f:
            data = json.load(f)

        # Expects a JSON array of row objects
        df = pd.DataFrame(data)
        print(f"[OK] Loaded leaderboard from JSON: {len(df)} rows")
        return df
|
| 105 |
+
|
| 106 |
+
    def load_results(self, results_dataset: str) -> pd.DataFrame:
        """
        Load results dataset for a specific run.

        Same HF-then-JSON fallback and caching strategy as load_leaderboard.

        Args:
            results_dataset: Dataset reference (e.g., "user/agent-results-gpt4")

        Returns:
            DataFrame with test case results
        """
        cache_key = f"results_{results_dataset}"

        if cache_key in self._cache:
            return self._cache[cache_key]

        # Try HuggingFace first
        if self.data_source in ["huggingface", "both"]:
            try:
                df = self._load_results_from_hf(results_dataset)
                self._cache[cache_key] = df
                return df
            except Exception as e:
                print(f"Failed to load results from HuggingFace: {e}")
                if self.data_source == "huggingface":
                    # HF-only mode: surface the error
                    raise

        # Fallback to JSON
        if self.data_source in ["json", "both"]:
            try:
                df = self._load_results_from_json(results_dataset)
                self._cache[cache_key] = df
                return df
            except Exception as e:
                print(f"Failed to load results from JSON: {e}")
                raise

        raise ValueError("No valid data source available")
|
| 143 |
+
|
| 144 |
+
    def _load_results_from_hf(self, dataset_id: str) -> pd.DataFrame:
        """Load results from a HuggingFace dataset (train split) as a DataFrame."""
        ds = load_dataset(dataset_id, split="train", token=self.hf_token)
        df = ds.to_pandas()
        print(f"[OK] Loaded results from HuggingFace: {len(df)} rows")
        return df
|
| 150 |
+
|
| 151 |
+
    def _load_results_from_json(self, dataset_id: str) -> pd.DataFrame:
        """
        Load results from a local JSON file derived from the dataset id.

        Raises:
            FileNotFoundError: if the derived file does not exist.
        """
        # Derive the filename from the dataset id by dropping the "agent-" prefix.
        # NOTE(review): "user/agent-results-gpt4" yields "results-gpt4.json" (hyphen),
        # but the sample files in this commit use underscores ("results_gpt4.json") —
        # confirm the naming convention used by real leaderboard entries.
        filename = dataset_id.split("/")[-1].replace("agent-", "") + ".json"
        json_path = self.json_data_path / filename

        if not json_path.exists():
            raise FileNotFoundError(f"Results JSON not found: {json_path}")

        with open(json_path, "r") as f:
            data = json.load(f)

        df = pd.DataFrame(data)
        print(f"[OK] Loaded results from JSON: {len(df)} rows")
        return df
|
| 166 |
+
|
| 167 |
+
    def load_traces(self, traces_dataset: str) -> List[Dict[str, Any]]:
        """
        Load traces dataset for a specific run.

        Same HF-then-JSON fallback and caching strategy as load_leaderboard.

        Args:
            traces_dataset: Dataset reference (e.g., "user/agent-traces-gpt4")

        Returns:
            List of trace objects (OpenTelemetry format)
        """
        cache_key = f"traces_{traces_dataset}"

        if cache_key in self._cache:
            return self._cache[cache_key]

        # Try HuggingFace first
        if self.data_source in ["huggingface", "both"]:
            try:
                traces = self._load_traces_from_hf(traces_dataset)
                self._cache[cache_key] = traces
                return traces
            except Exception as e:
                print(f"Failed to load traces from HuggingFace: {e}")
                if self.data_source == "huggingface":
                    # HF-only mode: surface the error
                    raise

        # Fallback to JSON
        if self.data_source in ["json", "both"]:
            try:
                traces = self._load_traces_from_json(traces_dataset)
                self._cache[cache_key] = traces
                return traces
            except Exception as e:
                print(f"Failed to load traces from JSON: {e}")
                raise

        raise ValueError("No valid data source available")
|
| 204 |
+
|
| 205 |
+
    def _load_traces_from_hf(self, dataset_id: str) -> List[Dict[str, Any]]:
        """Load traces from a HuggingFace dataset as a list of row dicts."""
        ds = load_dataset(dataset_id, split="train", token=self.hf_token)
        # Round-trip through pandas to get plain dicts per row
        traces = ds.to_pandas().to_dict("records")
        print(f"[OK] Loaded traces from HuggingFace: {len(traces)} traces")
        return traces
|
| 211 |
+
|
| 212 |
+
    def _load_traces_from_json(self, dataset_id: str) -> List[Dict[str, Any]]:
        """
        Load traces from a local JSON file derived from the dataset id.

        Raises:
            FileNotFoundError: if the derived file does not exist.
        """
        # Same hyphen-vs-underscore filename caveat as _load_results_from_json
        filename = dataset_id.split("/")[-1].replace("agent-", "") + ".json"
        json_path = self.json_data_path / filename

        if not json_path.exists():
            raise FileNotFoundError(f"Traces JSON not found: {json_path}")

        with open(json_path, "r") as f:
            data = json.load(f)

        # Expects a JSON array of trace objects; returned as-is
        print(f"[OK] Loaded traces from JSON: {len(data)} traces")
        return data
|
| 225 |
+
|
| 226 |
+
    def load_metrics(self, metrics_dataset: str) -> pd.DataFrame:
        """
        Load metrics dataset for a specific run (GPU metrics).

        Unlike the other loaders, a missing metrics dataset is NOT an error:
        API-hosted models have no GPU metrics, so failures degrade to an
        empty DataFrame instead of raising (except in HF-only mode).

        Args:
            metrics_dataset: Dataset reference (e.g., "user/agent-metrics-gpt4")

        Returns:
            DataFrame with GPU metrics in flat format
            (columns: timestamp, gpu_id, metric_name, value, ...),
            or an empty DataFrame when no metrics are available
        """
        cache_key = f"metrics_{metrics_dataset}"

        if cache_key in self._cache:
            return self._cache[cache_key]

        # Try HuggingFace first
        if self.data_source in ["huggingface", "both"]:
            try:
                metrics = self._load_metrics_from_hf(metrics_dataset)
                self._cache[cache_key] = metrics
                return metrics
            except Exception as e:
                print(f"Failed to load metrics from HuggingFace: {e}")
                if self.data_source == "huggingface":
                    raise

        # Fallback to JSON
        if self.data_source in ["json", "both"]:
            try:
                metrics = self._load_metrics_from_json(metrics_dataset)
                self._cache[cache_key] = metrics
                return metrics
            except Exception as e:
                print(f"Failed to load metrics from JSON: {e}")
                # Metrics might not exist for API models, don't raise
                print("⚠️ No metrics available (expected for API models)")
                return pd.DataFrame()

        # Reached only when data_source permits neither backend
        return pd.DataFrame()
|
| 265 |
+
|
| 266 |
+
    def _load_metrics_from_hf(self, dataset_id: str) -> pd.DataFrame:
        """Load metrics from a HuggingFace dataset (already in flat format)."""
        ds = load_dataset(dataset_id, split="train", token=self.hf_token)
        df = ds.to_pandas()

        # Convert timestamp strings to datetime if needed
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])

        print(f"[OK] Loaded metrics from HuggingFace: {len(df)} rows")
        print(f"   Columns: {list(df.columns)}")
        return df
|
| 278 |
+
|
| 279 |
+
    def _load_metrics_from_json(self, dataset_id: str) -> pd.DataFrame:
        """
        Load metrics from a local JSON file derived from the dataset id.

        Accepts either the legacy nested OpenTelemetry format (a dict with
        'resourceMetrics') or an already-flat list of rows. A missing file
        yields an empty DataFrame (metrics are optional for API models).
        """
        # Same hyphen-vs-underscore filename caveat as _load_results_from_json
        filename = dataset_id.split("/")[-1].replace("agent-", "") + ".json"
        json_path = self.json_data_path / filename

        if not json_path.exists():
            # Metrics might not exist for API models
            return pd.DataFrame()

        with open(json_path, "r") as f:
            data = json.load(f)

        # Check if it's OpenTelemetry format (nested) or flat format
        if isinstance(data, dict) and 'resourceMetrics' in data:
            # Legacy OpenTelemetry format - convert to flat format
            df = self._convert_otel_to_flat(data)
        elif isinstance(data, list):
            df = pd.DataFrame(data)
        else:
            # Unrecognized shape: treat as no metrics
            df = pd.DataFrame()

        # Convert timestamp strings to datetime if needed
        if 'timestamp' in df.columns and not df.empty:
            df['timestamp'] = pd.to_datetime(df['timestamp'])

        print(f"[OK] Loaded metrics from JSON: {len(df)} rows")
        return df
|
| 306 |
+
|
| 307 |
+
    def _convert_otel_to_flat(self, otel_data: Dict[str, Any]) -> pd.DataFrame:
        """
        Convert OpenTelemetry resourceMetrics JSON into a flat DataFrame.

        Walks resourceMetrics -> scopeMetrics -> metrics, flattening each
        gauge/sum data point into one row via _extract_data_point. Other
        OTel metric kinds (histogram, summary) are silently skipped.
        """
        rows = []

        for resource_metric in otel_data.get('resourceMetrics', []):
            for scope_metric in resource_metric.get('scopeMetrics', []):
                for metric in scope_metric.get('metrics', []):
                    metric_name = metric.get('name', '')

                    # Handle gauge metrics
                    if 'gauge' in metric:
                        for data_point in metric['gauge'].get('dataPoints', []):
                            row = self._extract_data_point(metric_name, data_point, metric.get('unit', ''))
                            if row:
                                rows.append(row)

                    # Handle sum metrics (like CO2)
                    elif 'sum' in metric:
                        for data_point in metric['sum'].get('dataPoints', []):
                            row = self._extract_data_point(metric_name, data_point, metric.get('unit', ''))
                            if row:
                                rows.append(row)

        return pd.DataFrame(rows)
|
| 331 |
+
|
| 332 |
+
def _extract_data_point(self, metric_name: str, data_point: Dict, unit: str) -> Optional[Dict[str, Any]]:
|
| 333 |
+
"""Extract a single data point from OpenTelemetry format to flat row"""
|
| 334 |
+
# Get GPU attributes
|
| 335 |
+
gpu_id = None
|
| 336 |
+
gpu_name = None
|
| 337 |
+
for attr in data_point.get('attributes', []):
|
| 338 |
+
if attr.get('key') == 'gpu_id':
|
| 339 |
+
gpu_id = attr.get('value', {}).get('stringValue', '')
|
| 340 |
+
elif attr.get('key') == 'gpu_name':
|
| 341 |
+
gpu_name = attr.get('value', {}).get('stringValue', '')
|
| 342 |
+
|
| 343 |
+
# Get value
|
| 344 |
+
value = None
|
| 345 |
+
if 'asInt' in data_point and data_point['asInt'] is not None:
|
| 346 |
+
value = int(data_point['asInt'])
|
| 347 |
+
elif 'asDouble' in data_point and data_point['asDouble'] is not None:
|
| 348 |
+
value = float(data_point['asDouble'])
|
| 349 |
+
|
| 350 |
+
# Get timestamp
|
| 351 |
+
timestamp_nano = data_point.get('timeUnixNano', '')
|
| 352 |
+
if timestamp_nano:
|
| 353 |
+
timestamp_sec = int(timestamp_nano) / 1e9
|
| 354 |
+
timestamp = pd.to_datetime(timestamp_sec, unit='s')
|
| 355 |
+
else:
|
| 356 |
+
timestamp = None
|
| 357 |
+
|
| 358 |
+
# Map metric names to column names
|
| 359 |
+
metric_col_map = {
|
| 360 |
+
'gen_ai.gpu.utilization': 'gpu_utilization_percent',
|
| 361 |
+
'gen_ai.gpu.memory.used': 'gpu_memory_used_mib',
|
| 362 |
+
'gen_ai.gpu.temperature': 'gpu_temperature_celsius',
|
| 363 |
+
'gen_ai.gpu.power': 'gpu_power_watts',
|
| 364 |
+
'gen_ai.co2.emissions': 'co2_emissions_gco2e'
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
return {
|
| 368 |
+
'timestamp': timestamp,
|
| 369 |
+
'timestamp_unix_nano': timestamp_nano,
|
| 370 |
+
'gpu_id': gpu_id,
|
| 371 |
+
'gpu_name': gpu_name,
|
| 372 |
+
'metric_name': metric_name,
|
| 373 |
+
'value': value,
|
| 374 |
+
'unit': unit
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
def get_trace_by_id(self, traces_dataset: str, trace_id: str) -> Optional[Dict[str, Any]]:
|
| 378 |
"""
|
| 379 |
+
Get a specific trace by ID
|
| 380 |
|
| 381 |
Args:
|
| 382 |
+
traces_dataset: Dataset reference
|
| 383 |
+
trace_id: Trace ID to find
|
| 384 |
|
| 385 |
Returns:
|
| 386 |
+
Trace object or None if not found
|
| 387 |
"""
|
| 388 |
+
traces = self.load_traces(traces_dataset)
|
| 389 |
|
| 390 |
for trace in traces:
|
| 391 |
+
if trace.get("trace_id") == trace_id or trace.get("traceId") == trace_id:
|
| 392 |
+
# Ensure spans is a proper list (not numpy array or pandas Series)
|
| 393 |
+
if "spans" in trace:
|
| 394 |
+
spans = trace["spans"]
|
| 395 |
+
if hasattr(spans, 'tolist'):
|
| 396 |
+
trace["spans"] = spans.tolist()
|
| 397 |
+
elif not isinstance(spans, list):
|
| 398 |
+
trace["spans"] = list(spans) if spans is not None else []
|
| 399 |
+
|
| 400 |
return trace
|
| 401 |
|
| 402 |
return None
|
| 403 |
|
| 404 |
+
    def clear_cache(self) -> None:
        """Drop all cached datasets so subsequent loads re-fetch from source."""
        self._cache.clear()
        print("[OK] Cache cleared")
|
| 408 |
+
|
| 409 |
+
def refresh_leaderboard(self) -> pd.DataFrame:
|
| 410 |
+
"""Refresh leaderboard data (clear cache and reload)"""
|
| 411 |
+
if "leaderboard" in self._cache:
|
| 412 |
+
del self._cache["leaderboard"]
|
| 413 |
+
return self.load_leaderboard()
|
| 414 |
|
| 415 |
|
| 416 |
+
def create_data_loader_from_env() -> DataLoader:
    """
    Create DataLoader instance from environment variables

    Returns:
        Configured DataLoader instance
    """
    env = os.getenv
    # DATA_SOURCE defaults to "both" (JSON + HuggingFace with fallback);
    # the remaining settings default to None when unset.
    return DataLoader(
        data_source=env("DATA_SOURCE", "both"),
        json_data_path=env("JSON_DATA_PATH"),
        leaderboard_dataset=env("LEADERBOARD_DATASET"),
        hf_token=env("HF_TOKEN"),
    )
|
sample_data/generate_sample_metrics.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generate sample metrics data in OpenTelemetry resourceMetrics format.
|
| 3 |
+
This simulates what SMOLTRACE would produce for GPU and API evaluation runs.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import time
|
| 8 |
+
from datetime import datetime, timedelta
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _gpu_data_point(time_unix_nano, value_field, value, include_gpu_name=True):
    """Build one OTel dataPoint dict tagged with the simulated GPU's identity.

    Args:
        time_unix_nano: Timestamp as a nanoseconds-since-epoch string.
        value_field: Either "asInt" (value must be a str) or "asDouble" (float).
        value: The metric value, already converted to the field's type.
        include_gpu_name: CO2 points carry only gpu_id, not gpu_name.
    """
    attributes = [{"key": "gpu_id", "value": {"stringValue": "0"}}]
    if include_gpu_name:
        attributes.append({"key": "gpu_name", "value": {"stringValue": "NVIDIA H200"}})
    return {
        "attributes": attributes,
        "timeUnixNano": time_unix_nano,
        value_field: value,
    }


def generate_gpu_sample_metrics(
    run_id: str = "run_002_llama31",
    duration_seconds: int = 120,
    interval_seconds: int = 10
):
    """
    Generate sample GPU metrics data for a GPU model run.

    Args:
        run_id: Run identifier
        duration_seconds: Total duration of simulated run
        interval_seconds: Interval between data points

    Returns:
        Dict in OpenTelemetry resourceMetrics format
    """
    start_time = datetime.now()
    num_points = duration_seconds // interval_seconds

    # Time-series buffers, one per metric type.
    utilization_points = []
    memory_points = []
    temperature_points = []
    power_points = []
    co2_points = []

    cumulative_co2 = 0.0

    for i in range(num_points):
        timestamp = start_time + timedelta(seconds=i * interval_seconds)
        time_unix_nano = str(int(timestamp.timestamp() * 1e9))

        # Simulate realistic GPU metrics with some deterministic variation:
        # higher utilization during inference, lower during idle.
        utilization = 45 + (i % 5) * 10 + (i % 2) * 5  # 45-70%
        memory = 4096 + i * 100                        # gradually increasing MiB
        temperature = 70 + (i % 6) * 2                 # 70-80 deg C
        power = 250 + (i % 7) * 30                     # 250-400 W

        # Cumulative CO2 (monotonic increasing):
        # power (W) * time (h) * carbon intensity (assumed 400 g/kWh).
        delta_co2 = (power / 1000.0) * (interval_seconds / 3600.0) * 400
        cumulative_co2 += delta_co2

        # OTel encodes gauge ints as strings ("asInt") and floats as numbers.
        utilization_points.append(_gpu_data_point(time_unix_nano, "asInt", str(utilization)))
        memory_points.append(_gpu_data_point(time_unix_nano, "asDouble", float(memory)))
        temperature_points.append(_gpu_data_point(time_unix_nano, "asInt", str(temperature)))
        power_points.append(_gpu_data_point(time_unix_nano, "asDouble", float(power)))
        co2_points.append(_gpu_data_point(time_unix_nano, "asDouble", cumulative_co2,
                                          include_gpu_name=False))

    # Construct resourceMetrics structure (OpenTelemetry format).
    metrics_data = {
        "run_id": run_id,
        "resourceMetrics": [{
            "resource": {
                "attributes": [
                    {"key": "telemetry.sdk.language", "value": {"stringValue": "python"}},
                    {"key": "telemetry.sdk.name", "value": {"stringValue": "opentelemetry"}},
                    {"key": "telemetry.sdk.version", "value": {"stringValue": "1.37.0"}},
                    {"key": "service.name", "value": {"stringValue": "smoltrace-eval"}},
                    {"key": "run.id", "value": {"stringValue": run_id}}
                ]
            },
            "scopeMetrics": [{
                "scope": {"name": "genai.gpu", "version": None},
                "metrics": [
                    {
                        "name": "gen_ai.gpu.utilization",
                        "description": "GPU utilization percentage",
                        "unit": "%",
                        "gauge": {"dataPoints": utilization_points}
                    },
                    {
                        "name": "gen_ai.gpu.memory.used",
                        "description": "GPU memory used in MiB",
                        "unit": "MiB",
                        "gauge": {"dataPoints": memory_points}
                    },
                    {
                        "name": "gen_ai.gpu.temperature",
                        "description": "GPU temperature in Celsius",
                        "unit": "Cel",
                        "gauge": {"dataPoints": temperature_points}
                    },
                    {
                        "name": "gen_ai.gpu.power",
                        "description": "GPU power consumption in Watts",
                        "unit": "W",
                        "gauge": {"dataPoints": power_points}
                    },
                    {
                        "name": "gen_ai.co2.emissions",
                        "description": "Cumulative CO2 equivalent emissions in grams",
                        "unit": "gCO2e",
                        "sum": {
                            "dataPoints": co2_points,
                            "aggregationTemporality": 2,  # CUMULATIVE
                            "isMonotonic": True
                        }
                    }
                ]
            }]
        }]
    }

    return metrics_data
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def generate_api_sample_metrics(run_id: str = "run_001_gpt4"):
    """
    Generate minimal sample metrics for an API model run (no GPU).

    Args:
        run_id: Run identifier

    Returns:
        Dict with empty resourceMetrics (API models don't have GPU)
    """
    # API-hosted models expose no GPU telemetry, so the payload carries only
    # the run identifier and an empty resourceMetrics list.
    return dict(run_id=run_id, resourceMetrics=[])
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
if __name__ == "__main__":
    # Script entry point: write the two sample metrics JSON files next to
    # this script, for local testing of the visualization pipeline.
    # Create output directory
    output_dir = Path(__file__).parent
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Generating sample metrics data...")

    # Generate GPU model metrics (Llama 3.1 on H200)
    gpu_metrics = generate_gpu_sample_metrics(
        run_id="run_002_llama31",
        duration_seconds=120,
        interval_seconds=10
    )

    output_file = output_dir / "metrics_llama31.json"
    with open(output_file, "w") as f:
        json.dump(gpu_metrics, f, indent=2)
    print(f"[OK] Generated GPU metrics: {output_file}")
    print(f"  - {len(gpu_metrics['resourceMetrics'][0]['scopeMetrics'][0]['metrics'])} metric types")
    print(f"  - {len(gpu_metrics['resourceMetrics'][0]['scopeMetrics'][0]['metrics'][0]['gauge']['dataPoints'])} data points per metric")

    # Generate API model metrics (GPT-4 - no GPU)
    api_metrics = generate_api_sample_metrics(run_id="run_001_gpt4")

    output_file = output_dir / "metrics_gpt4.json"
    with open(output_file, "w") as f:
        json.dump(api_metrics, f, indent=2)
    print(f"[OK] Generated API metrics: {output_file}")
    print(f"  - Empty resourceMetrics (API model has no GPU)")

    print("\n[SUCCESS] Sample metrics data generation complete!")
    print("\nYou can now test the visualization with:")
    print("  python gpu_metrics_with_time_series.py")
|
sample_data/leaderboard.json
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"run_id": "run_001_gpt4",
|
| 4 |
+
"model": "openai/gpt-4",
|
| 5 |
+
"agent_type": "both",
|
| 6 |
+
"provider": "litellm",
|
| 7 |
+
"success_rate": 95.8,
|
| 8 |
+
"total_tests": 100,
|
| 9 |
+
"successful_tests": 96,
|
| 10 |
+
"failed_tests": 4,
|
| 11 |
+
"avg_steps": 2.5,
|
| 12 |
+
"avg_duration_ms": 3200.0,
|
| 13 |
+
"total_duration_ms": 320000.0,
|
| 14 |
+
"total_tokens": 15000,
|
| 15 |
+
"avg_tokens_per_test": 150,
|
| 16 |
+
"total_cost_usd": 0.05,
|
| 17 |
+
"avg_cost_per_test_usd": 0.0005,
|
| 18 |
+
"co2_emissions_g": 0.22,
|
| 19 |
+
"gpu_utilization_avg": null,
|
| 20 |
+
"gpu_memory_max_mib": null,
|
| 21 |
+
"results_dataset": "test/results_gpt4",
|
| 22 |
+
"traces_dataset": "test/traces_gpt4",
|
| 23 |
+
"metrics_dataset": "test/metrics_gpt4",
|
| 24 |
+
"timestamp": "2025-01-16T14:23:00Z",
|
| 25 |
+
"submitted_by": "test_user",
|
| 26 |
+
"hf_job_id": "job_12345",
|
| 27 |
+
"job_type": "cpu",
|
| 28 |
+
"dataset_used": "huggingface/smolagents/tasks",
|
| 29 |
+
"smoltrace_version": "0.1.0"
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"run_id": "run_002_llama31",
|
| 33 |
+
"model": "meta-llama/Llama-3.1-8B",
|
| 34 |
+
"agent_type": "both",
|
| 35 |
+
"provider": "transformers",
|
| 36 |
+
"success_rate": 93.4,
|
| 37 |
+
"total_tests": 100,
|
| 38 |
+
"successful_tests": 93,
|
| 39 |
+
"failed_tests": 7,
|
| 40 |
+
"avg_steps": 2.8,
|
| 41 |
+
"avg_duration_ms": 2100.0,
|
| 42 |
+
"total_duration_ms": 210000.0,
|
| 43 |
+
"total_tokens": 12500,
|
| 44 |
+
"avg_tokens_per_test": 125,
|
| 45 |
+
"total_cost_usd": 0.002,
|
| 46 |
+
"avg_cost_per_test_usd": 0.00002,
|
| 47 |
+
"co2_emissions_g": 1.45,
|
| 48 |
+
"gpu_utilization_avg": 67.5,
|
| 49 |
+
"gpu_memory_max_mib": 512.34,
|
| 50 |
+
"results_dataset": "test/results_llama31",
|
| 51 |
+
"traces_dataset": "test/traces_llama31",
|
| 52 |
+
"metrics_dataset": "test/metrics_llama31",
|
| 53 |
+
"timestamp": "2025-01-16T15:10:00Z",
|
| 54 |
+
"submitted_by": "test_user",
|
| 55 |
+
"hf_job_id": "job_12346",
|
| 56 |
+
"job_type": "gpu_h200",
|
| 57 |
+
"dataset_used": "huggingface/smolagents/tasks",
|
| 58 |
+
"smoltrace_version": "0.1.0"
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"run_id": "run_003_claude",
|
| 62 |
+
"model": "anthropic/claude-3-haiku",
|
| 63 |
+
"agent_type": "tool",
|
| 64 |
+
"provider": "litellm",
|
| 65 |
+
"success_rate": 92.1,
|
| 66 |
+
"total_tests": 100,
|
| 67 |
+
"successful_tests": 92,
|
| 68 |
+
"failed_tests": 8,
|
| 69 |
+
"avg_steps": 2.2,
|
| 70 |
+
"avg_duration_ms": 2800.0,
|
| 71 |
+
"total_duration_ms": 280000.0,
|
| 72 |
+
"total_tokens": 11200,
|
| 73 |
+
"avg_tokens_per_test": 112,
|
| 74 |
+
"total_cost_usd": 0.012,
|
| 75 |
+
"avg_cost_per_test_usd": 0.00012,
|
| 76 |
+
"co2_emissions_g": 0.15,
|
| 77 |
+
"gpu_utilization_avg": null,
|
| 78 |
+
"gpu_memory_max_mib": null,
|
| 79 |
+
"results_dataset": "test/results_claude",
|
| 80 |
+
"traces_dataset": "test/traces_claude",
|
| 81 |
+
"metrics_dataset": "test/metrics_claude",
|
| 82 |
+
"timestamp": "2025-01-16T16:45:00Z",
|
| 83 |
+
"submitted_by": "test_user",
|
| 84 |
+
"hf_job_id": "job_12347",
|
| 85 |
+
"job_type": "cpu",
|
| 86 |
+
"dataset_used": "huggingface/smolagents/tasks",
|
| 87 |
+
"smoltrace_version": "0.1.0"
|
| 88 |
+
}
|
| 89 |
+
]
|
sample_data/metrics_gpt4.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[]
|
sample_data/metrics_llama31.json
ADDED
|
@@ -0,0 +1,1106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"run_id": "run_002_llama31",
|
| 3 |
+
"resourceMetrics": [
|
| 4 |
+
{
|
| 5 |
+
"resource": {
|
| 6 |
+
"attributes": [
|
| 7 |
+
{
|
| 8 |
+
"key": "telemetry.sdk.language",
|
| 9 |
+
"value": {
|
| 10 |
+
"stringValue": "python"
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"key": "telemetry.sdk.name",
|
| 15 |
+
"value": {
|
| 16 |
+
"stringValue": "opentelemetry"
|
| 17 |
+
}
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"key": "telemetry.sdk.version",
|
| 21 |
+
"value": {
|
| 22 |
+
"stringValue": "1.37.0"
|
| 23 |
+
}
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"key": "service.name",
|
| 27 |
+
"value": {
|
| 28 |
+
"stringValue": "smoltrace-eval"
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"key": "run.id",
|
| 33 |
+
"value": {
|
| 34 |
+
"stringValue": "run_002_llama31"
|
| 35 |
+
}
|
| 36 |
+
}
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
"scopeMetrics": [
|
| 40 |
+
{
|
| 41 |
+
"scope": {
|
| 42 |
+
"name": "genai.gpu",
|
| 43 |
+
"version": null
|
| 44 |
+
},
|
| 45 |
+
"metrics": [
|
| 46 |
+
{
|
| 47 |
+
"name": "gen_ai.gpu.utilization",
|
| 48 |
+
"description": "GPU utilization percentage",
|
| 49 |
+
"unit": "%",
|
| 50 |
+
"gauge": {
|
| 51 |
+
"dataPoints": [
|
| 52 |
+
{
|
| 53 |
+
"attributes": [
|
| 54 |
+
{
|
| 55 |
+
"key": "gpu_id",
|
| 56 |
+
"value": {
|
| 57 |
+
"stringValue": "0"
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"key": "gpu_name",
|
| 62 |
+
"value": {
|
| 63 |
+
"stringValue": "NVIDIA H200"
|
| 64 |
+
}
|
| 65 |
+
}
|
| 66 |
+
],
|
| 67 |
+
"timeUnixNano": "1761242554199441920",
|
| 68 |
+
"asInt": "45"
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"attributes": [
|
| 72 |
+
{
|
| 73 |
+
"key": "gpu_id",
|
| 74 |
+
"value": {
|
| 75 |
+
"stringValue": "0"
|
| 76 |
+
}
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"key": "gpu_name",
|
| 80 |
+
"value": {
|
| 81 |
+
"stringValue": "NVIDIA H200"
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
],
|
| 85 |
+
"timeUnixNano": "1761242564199441920",
|
| 86 |
+
"asInt": "60"
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"attributes": [
|
| 90 |
+
{
|
| 91 |
+
"key": "gpu_id",
|
| 92 |
+
"value": {
|
| 93 |
+
"stringValue": "0"
|
| 94 |
+
}
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"key": "gpu_name",
|
| 98 |
+
"value": {
|
| 99 |
+
"stringValue": "NVIDIA H200"
|
| 100 |
+
}
|
| 101 |
+
}
|
| 102 |
+
],
|
| 103 |
+
"timeUnixNano": "1761242574199441920",
|
| 104 |
+
"asInt": "65"
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"attributes": [
|
| 108 |
+
{
|
| 109 |
+
"key": "gpu_id",
|
| 110 |
+
"value": {
|
| 111 |
+
"stringValue": "0"
|
| 112 |
+
}
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"key": "gpu_name",
|
| 116 |
+
"value": {
|
| 117 |
+
"stringValue": "NVIDIA H200"
|
| 118 |
+
}
|
| 119 |
+
}
|
| 120 |
+
],
|
| 121 |
+
"timeUnixNano": "1761242584199441920",
|
| 122 |
+
"asInt": "80"
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"attributes": [
|
| 126 |
+
{
|
| 127 |
+
"key": "gpu_id",
|
| 128 |
+
"value": {
|
| 129 |
+
"stringValue": "0"
|
| 130 |
+
}
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"key": "gpu_name",
|
| 134 |
+
"value": {
|
| 135 |
+
"stringValue": "NVIDIA H200"
|
| 136 |
+
}
|
| 137 |
+
}
|
| 138 |
+
],
|
| 139 |
+
"timeUnixNano": "1761242594199441920",
|
| 140 |
+
"asInt": "85"
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"attributes": [
|
| 144 |
+
{
|
| 145 |
+
"key": "gpu_id",
|
| 146 |
+
"value": {
|
| 147 |
+
"stringValue": "0"
|
| 148 |
+
}
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"key": "gpu_name",
|
| 152 |
+
"value": {
|
| 153 |
+
"stringValue": "NVIDIA H200"
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
+
],
|
| 157 |
+
"timeUnixNano": "1761242604199441920",
|
| 158 |
+
"asInt": "50"
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"attributes": [
|
| 162 |
+
{
|
| 163 |
+
"key": "gpu_id",
|
| 164 |
+
"value": {
|
| 165 |
+
"stringValue": "0"
|
| 166 |
+
}
|
| 167 |
+
},
|
| 168 |
+
{
|
| 169 |
+
"key": "gpu_name",
|
| 170 |
+
"value": {
|
| 171 |
+
"stringValue": "NVIDIA H200"
|
| 172 |
+
}
|
| 173 |
+
}
|
| 174 |
+
],
|
| 175 |
+
"timeUnixNano": "1761242614199441920",
|
| 176 |
+
"asInt": "55"
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"attributes": [
|
| 180 |
+
{
|
| 181 |
+
"key": "gpu_id",
|
| 182 |
+
"value": {
|
| 183 |
+
"stringValue": "0"
|
| 184 |
+
}
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"key": "gpu_name",
|
| 188 |
+
"value": {
|
| 189 |
+
"stringValue": "NVIDIA H200"
|
| 190 |
+
}
|
| 191 |
+
}
|
| 192 |
+
],
|
| 193 |
+
"timeUnixNano": "1761242624199441920",
|
| 194 |
+
"asInt": "70"
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"attributes": [
|
| 198 |
+
{
|
| 199 |
+
"key": "gpu_id",
|
| 200 |
+
"value": {
|
| 201 |
+
"stringValue": "0"
|
| 202 |
+
}
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"key": "gpu_name",
|
| 206 |
+
"value": {
|
| 207 |
+
"stringValue": "NVIDIA H200"
|
| 208 |
+
}
|
| 209 |
+
}
|
| 210 |
+
],
|
| 211 |
+
"timeUnixNano": "1761242634199441920",
|
| 212 |
+
"asInt": "75"
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"attributes": [
|
| 216 |
+
{
|
| 217 |
+
"key": "gpu_id",
|
| 218 |
+
"value": {
|
| 219 |
+
"stringValue": "0"
|
| 220 |
+
}
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"key": "gpu_name",
|
| 224 |
+
"value": {
|
| 225 |
+
"stringValue": "NVIDIA H200"
|
| 226 |
+
}
|
| 227 |
+
}
|
| 228 |
+
],
|
| 229 |
+
"timeUnixNano": "1761242644199441920",
|
| 230 |
+
"asInt": "90"
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"attributes": [
|
| 234 |
+
{
|
| 235 |
+
"key": "gpu_id",
|
| 236 |
+
"value": {
|
| 237 |
+
"stringValue": "0"
|
| 238 |
+
}
|
| 239 |
+
},
|
| 240 |
+
{
|
| 241 |
+
"key": "gpu_name",
|
| 242 |
+
"value": {
|
| 243 |
+
"stringValue": "NVIDIA H200"
|
| 244 |
+
}
|
| 245 |
+
}
|
| 246 |
+
],
|
| 247 |
+
"timeUnixNano": "1761242654199441920",
|
| 248 |
+
"asInt": "45"
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"attributes": [
|
| 252 |
+
{
|
| 253 |
+
"key": "gpu_id",
|
| 254 |
+
"value": {
|
| 255 |
+
"stringValue": "0"
|
| 256 |
+
}
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"key": "gpu_name",
|
| 260 |
+
"value": {
|
| 261 |
+
"stringValue": "NVIDIA H200"
|
| 262 |
+
}
|
| 263 |
+
}
|
| 264 |
+
],
|
| 265 |
+
"timeUnixNano": "1761242664199441920",
|
| 266 |
+
"asInt": "60"
|
| 267 |
+
}
|
| 268 |
+
]
|
| 269 |
+
}
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"name": "gen_ai.gpu.memory.used",
|
| 273 |
+
"description": "GPU memory used in MiB",
|
| 274 |
+
"unit": "MiB",
|
| 275 |
+
"gauge": {
|
| 276 |
+
"dataPoints": [
|
| 277 |
+
{
|
| 278 |
+
"attributes": [
|
| 279 |
+
{
|
| 280 |
+
"key": "gpu_id",
|
| 281 |
+
"value": {
|
| 282 |
+
"stringValue": "0"
|
| 283 |
+
}
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"key": "gpu_name",
|
| 287 |
+
"value": {
|
| 288 |
+
"stringValue": "NVIDIA H200"
|
| 289 |
+
}
|
| 290 |
+
}
|
| 291 |
+
],
|
| 292 |
+
"timeUnixNano": "1761242554199441920",
|
| 293 |
+
"asDouble": 4096.0
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"attributes": [
|
| 297 |
+
{
|
| 298 |
+
"key": "gpu_id",
|
| 299 |
+
"value": {
|
| 300 |
+
"stringValue": "0"
|
| 301 |
+
}
|
| 302 |
+
},
|
| 303 |
+
{
|
| 304 |
+
"key": "gpu_name",
|
| 305 |
+
"value": {
|
| 306 |
+
"stringValue": "NVIDIA H200"
|
| 307 |
+
}
|
| 308 |
+
}
|
| 309 |
+
],
|
| 310 |
+
"timeUnixNano": "1761242564199441920",
|
| 311 |
+
"asDouble": 4196.0
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"attributes": [
|
| 315 |
+
{
|
| 316 |
+
"key": "gpu_id",
|
| 317 |
+
"value": {
|
| 318 |
+
"stringValue": "0"
|
| 319 |
+
}
|
| 320 |
+
},
|
| 321 |
+
{
|
| 322 |
+
"key": "gpu_name",
|
| 323 |
+
"value": {
|
| 324 |
+
"stringValue": "NVIDIA H200"
|
| 325 |
+
}
|
| 326 |
+
}
|
| 327 |
+
],
|
| 328 |
+
"timeUnixNano": "1761242574199441920",
|
| 329 |
+
"asDouble": 4296.0
|
| 330 |
+
},
|
| 331 |
+
{
|
| 332 |
+
"attributes": [
|
| 333 |
+
{
|
| 334 |
+
"key": "gpu_id",
|
| 335 |
+
"value": {
|
| 336 |
+
"stringValue": "0"
|
| 337 |
+
}
|
| 338 |
+
},
|
| 339 |
+
{
|
| 340 |
+
"key": "gpu_name",
|
| 341 |
+
"value": {
|
| 342 |
+
"stringValue": "NVIDIA H200"
|
| 343 |
+
}
|
| 344 |
+
}
|
| 345 |
+
],
|
| 346 |
+
"timeUnixNano": "1761242584199441920",
|
| 347 |
+
"asDouble": 4396.0
|
| 348 |
+
},
|
| 349 |
+
{
|
| 350 |
+
"attributes": [
|
| 351 |
+
{
|
| 352 |
+
"key": "gpu_id",
|
| 353 |
+
"value": {
|
| 354 |
+
"stringValue": "0"
|
| 355 |
+
}
|
| 356 |
+
},
|
| 357 |
+
{
|
| 358 |
+
"key": "gpu_name",
|
| 359 |
+
"value": {
|
| 360 |
+
"stringValue": "NVIDIA H200"
|
| 361 |
+
}
|
| 362 |
+
}
|
| 363 |
+
],
|
| 364 |
+
"timeUnixNano": "1761242594199441920",
|
| 365 |
+
"asDouble": 4496.0
|
| 366 |
+
},
|
| 367 |
+
{
|
| 368 |
+
"attributes": [
|
| 369 |
+
{
|
| 370 |
+
"key": "gpu_id",
|
| 371 |
+
"value": {
|
| 372 |
+
"stringValue": "0"
|
| 373 |
+
}
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
"key": "gpu_name",
|
| 377 |
+
"value": {
|
| 378 |
+
"stringValue": "NVIDIA H200"
|
| 379 |
+
}
|
| 380 |
+
}
|
| 381 |
+
],
|
| 382 |
+
"timeUnixNano": "1761242604199441920",
|
| 383 |
+
"asDouble": 4596.0
|
| 384 |
+
},
|
| 385 |
+
{
|
| 386 |
+
"attributes": [
|
| 387 |
+
{
|
| 388 |
+
"key": "gpu_id",
|
| 389 |
+
"value": {
|
| 390 |
+
"stringValue": "0"
|
| 391 |
+
}
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"key": "gpu_name",
|
| 395 |
+
"value": {
|
| 396 |
+
"stringValue": "NVIDIA H200"
|
| 397 |
+
}
|
| 398 |
+
}
|
| 399 |
+
],
|
| 400 |
+
"timeUnixNano": "1761242614199441920",
|
| 401 |
+
"asDouble": 4696.0
|
| 402 |
+
},
|
| 403 |
+
{
|
| 404 |
+
"attributes": [
|
| 405 |
+
{
|
| 406 |
+
"key": "gpu_id",
|
| 407 |
+
"value": {
|
| 408 |
+
"stringValue": "0"
|
| 409 |
+
}
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"key": "gpu_name",
|
| 413 |
+
"value": {
|
| 414 |
+
"stringValue": "NVIDIA H200"
|
| 415 |
+
}
|
| 416 |
+
}
|
| 417 |
+
],
|
| 418 |
+
"timeUnixNano": "1761242624199441920",
|
| 419 |
+
"asDouble": 4796.0
|
| 420 |
+
},
|
| 421 |
+
{
|
| 422 |
+
"attributes": [
|
| 423 |
+
{
|
| 424 |
+
"key": "gpu_id",
|
| 425 |
+
"value": {
|
| 426 |
+
"stringValue": "0"
|
| 427 |
+
}
|
| 428 |
+
},
|
| 429 |
+
{
|
| 430 |
+
"key": "gpu_name",
|
| 431 |
+
"value": {
|
| 432 |
+
"stringValue": "NVIDIA H200"
|
| 433 |
+
}
|
| 434 |
+
}
|
| 435 |
+
],
|
| 436 |
+
"timeUnixNano": "1761242634199441920",
|
| 437 |
+
"asDouble": 4896.0
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"attributes": [
|
| 441 |
+
{
|
| 442 |
+
"key": "gpu_id",
|
| 443 |
+
"value": {
|
| 444 |
+
"stringValue": "0"
|
| 445 |
+
}
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"key": "gpu_name",
|
| 449 |
+
"value": {
|
| 450 |
+
"stringValue": "NVIDIA H200"
|
| 451 |
+
}
|
| 452 |
+
}
|
| 453 |
+
],
|
| 454 |
+
"timeUnixNano": "1761242644199441920",
|
| 455 |
+
"asDouble": 4996.0
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"attributes": [
|
| 459 |
+
{
|
| 460 |
+
"key": "gpu_id",
|
| 461 |
+
"value": {
|
| 462 |
+
"stringValue": "0"
|
| 463 |
+
}
|
| 464 |
+
},
|
| 465 |
+
{
|
| 466 |
+
"key": "gpu_name",
|
| 467 |
+
"value": {
|
| 468 |
+
"stringValue": "NVIDIA H200"
|
| 469 |
+
}
|
| 470 |
+
}
|
| 471 |
+
],
|
| 472 |
+
"timeUnixNano": "1761242654199441920",
|
| 473 |
+
"asDouble": 5096.0
|
| 474 |
+
},
|
| 475 |
+
{
|
| 476 |
+
"attributes": [
|
| 477 |
+
{
|
| 478 |
+
"key": "gpu_id",
|
| 479 |
+
"value": {
|
| 480 |
+
"stringValue": "0"
|
| 481 |
+
}
|
| 482 |
+
},
|
| 483 |
+
{
|
| 484 |
+
"key": "gpu_name",
|
| 485 |
+
"value": {
|
| 486 |
+
"stringValue": "NVIDIA H200"
|
| 487 |
+
}
|
| 488 |
+
}
|
| 489 |
+
],
|
| 490 |
+
"timeUnixNano": "1761242664199441920",
|
| 491 |
+
"asDouble": 5196.0
|
| 492 |
+
}
|
| 493 |
+
]
|
| 494 |
+
}
|
| 495 |
+
},
|
| 496 |
+
{
|
| 497 |
+
"name": "gen_ai.gpu.temperature",
|
| 498 |
+
"description": "GPU temperature in Celsius",
|
| 499 |
+
"unit": "Cel",
|
| 500 |
+
"gauge": {
|
| 501 |
+
"dataPoints": [
|
| 502 |
+
{
|
| 503 |
+
"attributes": [
|
| 504 |
+
{
|
| 505 |
+
"key": "gpu_id",
|
| 506 |
+
"value": {
|
| 507 |
+
"stringValue": "0"
|
| 508 |
+
}
|
| 509 |
+
},
|
| 510 |
+
{
|
| 511 |
+
"key": "gpu_name",
|
| 512 |
+
"value": {
|
| 513 |
+
"stringValue": "NVIDIA H200"
|
| 514 |
+
}
|
| 515 |
+
}
|
| 516 |
+
],
|
| 517 |
+
"timeUnixNano": "1761242554199441920",
|
| 518 |
+
"asInt": "70"
|
| 519 |
+
},
|
| 520 |
+
{
|
| 521 |
+
"attributes": [
|
| 522 |
+
{
|
| 523 |
+
"key": "gpu_id",
|
| 524 |
+
"value": {
|
| 525 |
+
"stringValue": "0"
|
| 526 |
+
}
|
| 527 |
+
},
|
| 528 |
+
{
|
| 529 |
+
"key": "gpu_name",
|
| 530 |
+
"value": {
|
| 531 |
+
"stringValue": "NVIDIA H200"
|
| 532 |
+
}
|
| 533 |
+
}
|
| 534 |
+
],
|
| 535 |
+
"timeUnixNano": "1761242564199441920",
|
| 536 |
+
"asInt": "72"
|
| 537 |
+
},
|
| 538 |
+
{
|
| 539 |
+
"attributes": [
|
| 540 |
+
{
|
| 541 |
+
"key": "gpu_id",
|
| 542 |
+
"value": {
|
| 543 |
+
"stringValue": "0"
|
| 544 |
+
}
|
| 545 |
+
},
|
| 546 |
+
{
|
| 547 |
+
"key": "gpu_name",
|
| 548 |
+
"value": {
|
| 549 |
+
"stringValue": "NVIDIA H200"
|
| 550 |
+
}
|
| 551 |
+
}
|
| 552 |
+
],
|
| 553 |
+
"timeUnixNano": "1761242574199441920",
|
| 554 |
+
"asInt": "74"
|
| 555 |
+
},
|
| 556 |
+
{
|
| 557 |
+
"attributes": [
|
| 558 |
+
{
|
| 559 |
+
"key": "gpu_id",
|
| 560 |
+
"value": {
|
| 561 |
+
"stringValue": "0"
|
| 562 |
+
}
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"key": "gpu_name",
|
| 566 |
+
"value": {
|
| 567 |
+
"stringValue": "NVIDIA H200"
|
| 568 |
+
}
|
| 569 |
+
}
|
| 570 |
+
],
|
| 571 |
+
"timeUnixNano": "1761242584199441920",
|
| 572 |
+
"asInt": "76"
|
| 573 |
+
},
|
| 574 |
+
{
|
| 575 |
+
"attributes": [
|
| 576 |
+
{
|
| 577 |
+
"key": "gpu_id",
|
| 578 |
+
"value": {
|
| 579 |
+
"stringValue": "0"
|
| 580 |
+
}
|
| 581 |
+
},
|
| 582 |
+
{
|
| 583 |
+
"key": "gpu_name",
|
| 584 |
+
"value": {
|
| 585 |
+
"stringValue": "NVIDIA H200"
|
| 586 |
+
}
|
| 587 |
+
}
|
| 588 |
+
],
|
| 589 |
+
"timeUnixNano": "1761242594199441920",
|
| 590 |
+
"asInt": "78"
|
| 591 |
+
},
|
| 592 |
+
{
|
| 593 |
+
"attributes": [
|
| 594 |
+
{
|
| 595 |
+
"key": "gpu_id",
|
| 596 |
+
"value": {
|
| 597 |
+
"stringValue": "0"
|
| 598 |
+
}
|
| 599 |
+
},
|
| 600 |
+
{
|
| 601 |
+
"key": "gpu_name",
|
| 602 |
+
"value": {
|
| 603 |
+
"stringValue": "NVIDIA H200"
|
| 604 |
+
}
|
| 605 |
+
}
|
| 606 |
+
],
|
| 607 |
+
"timeUnixNano": "1761242604199441920",
|
| 608 |
+
"asInt": "80"
|
| 609 |
+
},
|
| 610 |
+
{
|
| 611 |
+
"attributes": [
|
| 612 |
+
{
|
| 613 |
+
"key": "gpu_id",
|
| 614 |
+
"value": {
|
| 615 |
+
"stringValue": "0"
|
| 616 |
+
}
|
| 617 |
+
},
|
| 618 |
+
{
|
| 619 |
+
"key": "gpu_name",
|
| 620 |
+
"value": {
|
| 621 |
+
"stringValue": "NVIDIA H200"
|
| 622 |
+
}
|
| 623 |
+
}
|
| 624 |
+
],
|
| 625 |
+
"timeUnixNano": "1761242614199441920",
|
| 626 |
+
"asInt": "70"
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"attributes": [
|
| 630 |
+
{
|
| 631 |
+
"key": "gpu_id",
|
| 632 |
+
"value": {
|
| 633 |
+
"stringValue": "0"
|
| 634 |
+
}
|
| 635 |
+
},
|
| 636 |
+
{
|
| 637 |
+
"key": "gpu_name",
|
| 638 |
+
"value": {
|
| 639 |
+
"stringValue": "NVIDIA H200"
|
| 640 |
+
}
|
| 641 |
+
}
|
| 642 |
+
],
|
| 643 |
+
"timeUnixNano": "1761242624199441920",
|
| 644 |
+
"asInt": "72"
|
| 645 |
+
},
|
| 646 |
+
{
|
| 647 |
+
"attributes": [
|
| 648 |
+
{
|
| 649 |
+
"key": "gpu_id",
|
| 650 |
+
"value": {
|
| 651 |
+
"stringValue": "0"
|
| 652 |
+
}
|
| 653 |
+
},
|
| 654 |
+
{
|
| 655 |
+
"key": "gpu_name",
|
| 656 |
+
"value": {
|
| 657 |
+
"stringValue": "NVIDIA H200"
|
| 658 |
+
}
|
| 659 |
+
}
|
| 660 |
+
],
|
| 661 |
+
"timeUnixNano": "1761242634199441920",
|
| 662 |
+
"asInt": "74"
|
| 663 |
+
},
|
| 664 |
+
{
|
| 665 |
+
"attributes": [
|
| 666 |
+
{
|
| 667 |
+
"key": "gpu_id",
|
| 668 |
+
"value": {
|
| 669 |
+
"stringValue": "0"
|
| 670 |
+
}
|
| 671 |
+
},
|
| 672 |
+
{
|
| 673 |
+
"key": "gpu_name",
|
| 674 |
+
"value": {
|
| 675 |
+
"stringValue": "NVIDIA H200"
|
| 676 |
+
}
|
| 677 |
+
}
|
| 678 |
+
],
|
| 679 |
+
"timeUnixNano": "1761242644199441920",
|
| 680 |
+
"asInt": "76"
|
| 681 |
+
},
|
| 682 |
+
{
|
| 683 |
+
"attributes": [
|
| 684 |
+
{
|
| 685 |
+
"key": "gpu_id",
|
| 686 |
+
"value": {
|
| 687 |
+
"stringValue": "0"
|
| 688 |
+
}
|
| 689 |
+
},
|
| 690 |
+
{
|
| 691 |
+
"key": "gpu_name",
|
| 692 |
+
"value": {
|
| 693 |
+
"stringValue": "NVIDIA H200"
|
| 694 |
+
}
|
| 695 |
+
}
|
| 696 |
+
],
|
| 697 |
+
"timeUnixNano": "1761242654199441920",
|
| 698 |
+
"asInt": "78"
|
| 699 |
+
},
|
| 700 |
+
{
|
| 701 |
+
"attributes": [
|
| 702 |
+
{
|
| 703 |
+
"key": "gpu_id",
|
| 704 |
+
"value": {
|
| 705 |
+
"stringValue": "0"
|
| 706 |
+
}
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"key": "gpu_name",
|
| 710 |
+
"value": {
|
| 711 |
+
"stringValue": "NVIDIA H200"
|
| 712 |
+
}
|
| 713 |
+
}
|
| 714 |
+
],
|
| 715 |
+
"timeUnixNano": "1761242664199441920",
|
| 716 |
+
"asInt": "80"
|
| 717 |
+
}
|
| 718 |
+
]
|
| 719 |
+
}
|
| 720 |
+
},
|
| 721 |
+
{
|
| 722 |
+
"name": "gen_ai.gpu.power",
|
| 723 |
+
"description": "GPU power consumption in Watts",
|
| 724 |
+
"unit": "W",
|
| 725 |
+
"gauge": {
|
| 726 |
+
"dataPoints": [
|
| 727 |
+
{
|
| 728 |
+
"attributes": [
|
| 729 |
+
{
|
| 730 |
+
"key": "gpu_id",
|
| 731 |
+
"value": {
|
| 732 |
+
"stringValue": "0"
|
| 733 |
+
}
|
| 734 |
+
},
|
| 735 |
+
{
|
| 736 |
+
"key": "gpu_name",
|
| 737 |
+
"value": {
|
| 738 |
+
"stringValue": "NVIDIA H200"
|
| 739 |
+
}
|
| 740 |
+
}
|
| 741 |
+
],
|
| 742 |
+
"timeUnixNano": "1761242554199441920",
|
| 743 |
+
"asDouble": 250.0
|
| 744 |
+
},
|
| 745 |
+
{
|
| 746 |
+
"attributes": [
|
| 747 |
+
{
|
| 748 |
+
"key": "gpu_id",
|
| 749 |
+
"value": {
|
| 750 |
+
"stringValue": "0"
|
| 751 |
+
}
|
| 752 |
+
},
|
| 753 |
+
{
|
| 754 |
+
"key": "gpu_name",
|
| 755 |
+
"value": {
|
| 756 |
+
"stringValue": "NVIDIA H200"
|
| 757 |
+
}
|
| 758 |
+
}
|
| 759 |
+
],
|
| 760 |
+
"timeUnixNano": "1761242564199441920",
|
| 761 |
+
"asDouble": 280.0
|
| 762 |
+
},
|
| 763 |
+
{
|
| 764 |
+
"attributes": [
|
| 765 |
+
{
|
| 766 |
+
"key": "gpu_id",
|
| 767 |
+
"value": {
|
| 768 |
+
"stringValue": "0"
|
| 769 |
+
}
|
| 770 |
+
},
|
| 771 |
+
{
|
| 772 |
+
"key": "gpu_name",
|
| 773 |
+
"value": {
|
| 774 |
+
"stringValue": "NVIDIA H200"
|
| 775 |
+
}
|
| 776 |
+
}
|
| 777 |
+
],
|
| 778 |
+
"timeUnixNano": "1761242574199441920",
|
| 779 |
+
"asDouble": 310.0
|
| 780 |
+
},
|
| 781 |
+
{
|
| 782 |
+
"attributes": [
|
| 783 |
+
{
|
| 784 |
+
"key": "gpu_id",
|
| 785 |
+
"value": {
|
| 786 |
+
"stringValue": "0"
|
| 787 |
+
}
|
| 788 |
+
},
|
| 789 |
+
{
|
| 790 |
+
"key": "gpu_name",
|
| 791 |
+
"value": {
|
| 792 |
+
"stringValue": "NVIDIA H200"
|
| 793 |
+
}
|
| 794 |
+
}
|
| 795 |
+
],
|
| 796 |
+
"timeUnixNano": "1761242584199441920",
|
| 797 |
+
"asDouble": 340.0
|
| 798 |
+
},
|
| 799 |
+
{
|
| 800 |
+
"attributes": [
|
| 801 |
+
{
|
| 802 |
+
"key": "gpu_id",
|
| 803 |
+
"value": {
|
| 804 |
+
"stringValue": "0"
|
| 805 |
+
}
|
| 806 |
+
},
|
| 807 |
+
{
|
| 808 |
+
"key": "gpu_name",
|
| 809 |
+
"value": {
|
| 810 |
+
"stringValue": "NVIDIA H200"
|
| 811 |
+
}
|
| 812 |
+
}
|
| 813 |
+
],
|
| 814 |
+
"timeUnixNano": "1761242594199441920",
|
| 815 |
+
"asDouble": 370.0
|
| 816 |
+
},
|
| 817 |
+
{
|
| 818 |
+
"attributes": [
|
| 819 |
+
{
|
| 820 |
+
"key": "gpu_id",
|
| 821 |
+
"value": {
|
| 822 |
+
"stringValue": "0"
|
| 823 |
+
}
|
| 824 |
+
},
|
| 825 |
+
{
|
| 826 |
+
"key": "gpu_name",
|
| 827 |
+
"value": {
|
| 828 |
+
"stringValue": "NVIDIA H200"
|
| 829 |
+
}
|
| 830 |
+
}
|
| 831 |
+
],
|
| 832 |
+
"timeUnixNano": "1761242604199441920",
|
| 833 |
+
"asDouble": 400.0
|
| 834 |
+
},
|
| 835 |
+
{
|
| 836 |
+
"attributes": [
|
| 837 |
+
{
|
| 838 |
+
"key": "gpu_id",
|
| 839 |
+
"value": {
|
| 840 |
+
"stringValue": "0"
|
| 841 |
+
}
|
| 842 |
+
},
|
| 843 |
+
{
|
| 844 |
+
"key": "gpu_name",
|
| 845 |
+
"value": {
|
| 846 |
+
"stringValue": "NVIDIA H200"
|
| 847 |
+
}
|
| 848 |
+
}
|
| 849 |
+
],
|
| 850 |
+
"timeUnixNano": "1761242614199441920",
|
| 851 |
+
"asDouble": 430.0
|
| 852 |
+
},
|
| 853 |
+
{
|
| 854 |
+
"attributes": [
|
| 855 |
+
{
|
| 856 |
+
"key": "gpu_id",
|
| 857 |
+
"value": {
|
| 858 |
+
"stringValue": "0"
|
| 859 |
+
}
|
| 860 |
+
},
|
| 861 |
+
{
|
| 862 |
+
"key": "gpu_name",
|
| 863 |
+
"value": {
|
| 864 |
+
"stringValue": "NVIDIA H200"
|
| 865 |
+
}
|
| 866 |
+
}
|
| 867 |
+
],
|
| 868 |
+
"timeUnixNano": "1761242624199441920",
|
| 869 |
+
"asDouble": 250.0
|
| 870 |
+
},
|
| 871 |
+
{
|
| 872 |
+
"attributes": [
|
| 873 |
+
{
|
| 874 |
+
"key": "gpu_id",
|
| 875 |
+
"value": {
|
| 876 |
+
"stringValue": "0"
|
| 877 |
+
}
|
| 878 |
+
},
|
| 879 |
+
{
|
| 880 |
+
"key": "gpu_name",
|
| 881 |
+
"value": {
|
| 882 |
+
"stringValue": "NVIDIA H200"
|
| 883 |
+
}
|
| 884 |
+
}
|
| 885 |
+
],
|
| 886 |
+
"timeUnixNano": "1761242634199441920",
|
| 887 |
+
"asDouble": 280.0
|
| 888 |
+
},
|
| 889 |
+
{
|
| 890 |
+
"attributes": [
|
| 891 |
+
{
|
| 892 |
+
"key": "gpu_id",
|
| 893 |
+
"value": {
|
| 894 |
+
"stringValue": "0"
|
| 895 |
+
}
|
| 896 |
+
},
|
| 897 |
+
{
|
| 898 |
+
"key": "gpu_name",
|
| 899 |
+
"value": {
|
| 900 |
+
"stringValue": "NVIDIA H200"
|
| 901 |
+
}
|
| 902 |
+
}
|
| 903 |
+
],
|
| 904 |
+
"timeUnixNano": "1761242644199441920",
|
| 905 |
+
"asDouble": 310.0
|
| 906 |
+
},
|
| 907 |
+
{
|
| 908 |
+
"attributes": [
|
| 909 |
+
{
|
| 910 |
+
"key": "gpu_id",
|
| 911 |
+
"value": {
|
| 912 |
+
"stringValue": "0"
|
| 913 |
+
}
|
| 914 |
+
},
|
| 915 |
+
{
|
| 916 |
+
"key": "gpu_name",
|
| 917 |
+
"value": {
|
| 918 |
+
"stringValue": "NVIDIA H200"
|
| 919 |
+
}
|
| 920 |
+
}
|
| 921 |
+
],
|
| 922 |
+
"timeUnixNano": "1761242654199441920",
|
| 923 |
+
"asDouble": 340.0
|
| 924 |
+
},
|
| 925 |
+
{
|
| 926 |
+
"attributes": [
|
| 927 |
+
{
|
| 928 |
+
"key": "gpu_id",
|
| 929 |
+
"value": {
|
| 930 |
+
"stringValue": "0"
|
| 931 |
+
}
|
| 932 |
+
},
|
| 933 |
+
{
|
| 934 |
+
"key": "gpu_name",
|
| 935 |
+
"value": {
|
| 936 |
+
"stringValue": "NVIDIA H200"
|
| 937 |
+
}
|
| 938 |
+
}
|
| 939 |
+
],
|
| 940 |
+
"timeUnixNano": "1761242664199441920",
|
| 941 |
+
"asDouble": 370.0
|
| 942 |
+
}
|
| 943 |
+
]
|
| 944 |
+
}
|
| 945 |
+
},
|
| 946 |
+
{
|
| 947 |
+
"name": "gen_ai.co2.emissions",
|
| 948 |
+
"description": "Cumulative CO2 equivalent emissions in grams",
|
| 949 |
+
"unit": "gCO2e",
|
| 950 |
+
"sum": {
|
| 951 |
+
"dataPoints": [
|
| 952 |
+
{
|
| 953 |
+
"attributes": [
|
| 954 |
+
{
|
| 955 |
+
"key": "gpu_id",
|
| 956 |
+
"value": {
|
| 957 |
+
"stringValue": "0"
|
| 958 |
+
}
|
| 959 |
+
}
|
| 960 |
+
],
|
| 961 |
+
"timeUnixNano": "1761242554199441920",
|
| 962 |
+
"asDouble": 0.2777777777777778
|
| 963 |
+
},
|
| 964 |
+
{
|
| 965 |
+
"attributes": [
|
| 966 |
+
{
|
| 967 |
+
"key": "gpu_id",
|
| 968 |
+
"value": {
|
| 969 |
+
"stringValue": "0"
|
| 970 |
+
}
|
| 971 |
+
}
|
| 972 |
+
],
|
| 973 |
+
"timeUnixNano": "1761242564199441920",
|
| 974 |
+
"asDouble": 0.5888888888888889
|
| 975 |
+
},
|
| 976 |
+
{
|
| 977 |
+
"attributes": [
|
| 978 |
+
{
|
| 979 |
+
"key": "gpu_id",
|
| 980 |
+
"value": {
|
| 981 |
+
"stringValue": "0"
|
| 982 |
+
}
|
| 983 |
+
}
|
| 984 |
+
],
|
| 985 |
+
"timeUnixNano": "1761242574199441920",
|
| 986 |
+
"asDouble": 0.9333333333333333
|
| 987 |
+
},
|
| 988 |
+
{
|
| 989 |
+
"attributes": [
|
| 990 |
+
{
|
| 991 |
+
"key": "gpu_id",
|
| 992 |
+
"value": {
|
| 993 |
+
"stringValue": "0"
|
| 994 |
+
}
|
| 995 |
+
}
|
| 996 |
+
],
|
| 997 |
+
"timeUnixNano": "1761242584199441920",
|
| 998 |
+
"asDouble": 1.3111111111111111
|
| 999 |
+
},
|
| 1000 |
+
{
|
| 1001 |
+
"attributes": [
|
| 1002 |
+
{
|
| 1003 |
+
"key": "gpu_id",
|
| 1004 |
+
"value": {
|
| 1005 |
+
"stringValue": "0"
|
| 1006 |
+
}
|
| 1007 |
+
}
|
| 1008 |
+
],
|
| 1009 |
+
"timeUnixNano": "1761242594199441920",
|
| 1010 |
+
"asDouble": 1.7222222222222223
|
| 1011 |
+
},
|
| 1012 |
+
{
|
| 1013 |
+
"attributes": [
|
| 1014 |
+
{
|
| 1015 |
+
"key": "gpu_id",
|
| 1016 |
+
"value": {
|
| 1017 |
+
"stringValue": "0"
|
| 1018 |
+
}
|
| 1019 |
+
}
|
| 1020 |
+
],
|
| 1021 |
+
"timeUnixNano": "1761242604199441920",
|
| 1022 |
+
"asDouble": 2.166666666666667
|
| 1023 |
+
},
|
| 1024 |
+
{
|
| 1025 |
+
"attributes": [
|
| 1026 |
+
{
|
| 1027 |
+
"key": "gpu_id",
|
| 1028 |
+
"value": {
|
| 1029 |
+
"stringValue": "0"
|
| 1030 |
+
}
|
| 1031 |
+
}
|
| 1032 |
+
],
|
| 1033 |
+
"timeUnixNano": "1761242614199441920",
|
| 1034 |
+
"asDouble": 2.644444444444445
|
| 1035 |
+
},
|
| 1036 |
+
{
|
| 1037 |
+
"attributes": [
|
| 1038 |
+
{
|
| 1039 |
+
"key": "gpu_id",
|
| 1040 |
+
"value": {
|
| 1041 |
+
"stringValue": "0"
|
| 1042 |
+
}
|
| 1043 |
+
}
|
| 1044 |
+
],
|
| 1045 |
+
"timeUnixNano": "1761242624199441920",
|
| 1046 |
+
"asDouble": 2.9222222222222225
|
| 1047 |
+
},
|
| 1048 |
+
{
|
| 1049 |
+
"attributes": [
|
| 1050 |
+
{
|
| 1051 |
+
"key": "gpu_id",
|
| 1052 |
+
"value": {
|
| 1053 |
+
"stringValue": "0"
|
| 1054 |
+
}
|
| 1055 |
+
}
|
| 1056 |
+
],
|
| 1057 |
+
"timeUnixNano": "1761242634199441920",
|
| 1058 |
+
"asDouble": 3.2333333333333334
|
| 1059 |
+
},
|
| 1060 |
+
{
|
| 1061 |
+
"attributes": [
|
| 1062 |
+
{
|
| 1063 |
+
"key": "gpu_id",
|
| 1064 |
+
"value": {
|
| 1065 |
+
"stringValue": "0"
|
| 1066 |
+
}
|
| 1067 |
+
}
|
| 1068 |
+
],
|
| 1069 |
+
"timeUnixNano": "1761242644199441920",
|
| 1070 |
+
"asDouble": 3.577777777777778
|
| 1071 |
+
},
|
| 1072 |
+
{
|
| 1073 |
+
"attributes": [
|
| 1074 |
+
{
|
| 1075 |
+
"key": "gpu_id",
|
| 1076 |
+
"value": {
|
| 1077 |
+
"stringValue": "0"
|
| 1078 |
+
}
|
| 1079 |
+
}
|
| 1080 |
+
],
|
| 1081 |
+
"timeUnixNano": "1761242654199441920",
|
| 1082 |
+
"asDouble": 3.9555555555555557
|
| 1083 |
+
},
|
| 1084 |
+
{
|
| 1085 |
+
"attributes": [
|
| 1086 |
+
{
|
| 1087 |
+
"key": "gpu_id",
|
| 1088 |
+
"value": {
|
| 1089 |
+
"stringValue": "0"
|
| 1090 |
+
}
|
| 1091 |
+
}
|
| 1092 |
+
],
|
| 1093 |
+
"timeUnixNano": "1761242664199441920",
|
| 1094 |
+
"asDouble": 4.366666666666667
|
| 1095 |
+
}
|
| 1096 |
+
],
|
| 1097 |
+
"aggregationTemporality": 2,
|
| 1098 |
+
"isMonotonic": true
|
| 1099 |
+
}
|
| 1100 |
+
}
|
| 1101 |
+
]
|
| 1102 |
+
}
|
| 1103 |
+
]
|
| 1104 |
+
}
|
| 1105 |
+
]
|
| 1106 |
+
}
|
sample_data/results_gpt4.json
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"run_id": "run_001_gpt4",
|
| 4 |
+
"task_id": "task_001",
|
| 5 |
+
"test_index": 0,
|
| 6 |
+
"prompt": "What's the weather in Tokyo?",
|
| 7 |
+
"expected_tool": "get_weather",
|
| 8 |
+
"difficulty": "easy",
|
| 9 |
+
"category": "tool_usage",
|
| 10 |
+
"success": true,
|
| 11 |
+
"response": "The weather in Tokyo is 18°C and clear.",
|
| 12 |
+
"tool_called": "get_weather",
|
| 13 |
+
"tool_correct": true,
|
| 14 |
+
"expected_keywords": ["18°C", "clear"],
|
| 15 |
+
"keywords_matched": ["18°C", "clear"],
|
| 16 |
+
"execution_time_ms": 2450.0,
|
| 17 |
+
"total_tokens": 234,
|
| 18 |
+
"prompt_tokens": 78,
|
| 19 |
+
"completion_tokens": 156,
|
| 20 |
+
"cost_usd": 0.0012,
|
| 21 |
+
"trace_id": "trace_abc123",
|
| 22 |
+
"start_time": "2025-01-16T14:23:01Z",
|
| 23 |
+
"end_time": "2025-01-16T14:23:03.450Z",
|
| 24 |
+
"start_time_unix_nano": "1760947217774556600",
|
| 25 |
+
"end_time_unix_nano": "1760947220224556600",
|
| 26 |
+
"error": null,
|
| 27 |
+
"error_type": null
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"run_id": "run_001_gpt4",
|
| 31 |
+
"task_id": "task_002",
|
| 32 |
+
"test_index": 1,
|
| 33 |
+
"prompt": "Search for recent news about AI",
|
| 34 |
+
"expected_tool": "web_search",
|
| 35 |
+
"difficulty": "medium",
|
| 36 |
+
"category": "information_retrieval",
|
| 37 |
+
"success": true,
|
| 38 |
+
"response": "Here are the latest AI news headlines: 1) New breakthrough in LLMs...",
|
| 39 |
+
"tool_called": "web_search",
|
| 40 |
+
"tool_correct": true,
|
| 41 |
+
"expected_keywords": ["AI", "news"],
|
| 42 |
+
"keywords_matched": ["AI"],
|
| 43 |
+
"execution_time_ms": 3800.0,
|
| 44 |
+
"total_tokens": 456,
|
| 45 |
+
"prompt_tokens": 120,
|
| 46 |
+
"completion_tokens": 336,
|
| 47 |
+
"cost_usd": 0.0018,
|
| 48 |
+
"trace_id": "trace_def456",
|
| 49 |
+
"start_time": "2025-01-16T14:23:05Z",
|
| 50 |
+
"end_time": "2025-01-16T14:23:08.800Z",
|
| 51 |
+
"start_time_unix_nano": "1760947221000000000",
|
| 52 |
+
"end_time_unix_nano": "1760947224800000000",
|
| 53 |
+
"error": null,
|
| 54 |
+
"error_type": null
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"run_id": "run_001_gpt4",
|
| 58 |
+
"task_id": "task_003",
|
| 59 |
+
"test_index": 2,
|
| 60 |
+
"prompt": "Calculate 234 * 567",
|
| 61 |
+
"expected_tool": "calculator",
|
| 62 |
+
"difficulty": "easy",
|
| 63 |
+
"category": "tool_usage",
|
| 64 |
+
"success": true,
|
| 65 |
+
"response": "The result of 234 * 567 is 132678",
|
| 66 |
+
"tool_called": "calculator",
|
| 67 |
+
"tool_correct": true,
|
| 68 |
+
"expected_keywords": ["132678"],
|
| 69 |
+
"keywords_matched": ["132678"],
|
| 70 |
+
"execution_time_ms": 1200.0,
|
| 71 |
+
"total_tokens": 89,
|
| 72 |
+
"prompt_tokens": 45,
|
| 73 |
+
"completion_tokens": 44,
|
| 74 |
+
"cost_usd": 0.0004,
|
| 75 |
+
"trace_id": "trace_ghi789",
|
| 76 |
+
"start_time": "2025-01-16T14:23:10Z",
|
| 77 |
+
"end_time": "2025-01-16T14:23:11.200Z",
|
| 78 |
+
"start_time_unix_nano": "1760947226000000000",
|
| 79 |
+
"end_time_unix_nano": "1760947227200000000",
|
| 80 |
+
"error": null,
|
| 81 |
+
"error_type": null
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"run_id": "run_001_gpt4",
|
| 85 |
+
"task_id": "task_004",
|
| 86 |
+
"test_index": 3,
|
| 87 |
+
"prompt": "Send an email to john@example.com with subject 'Meeting' and body 'Let's meet tomorrow'",
|
| 88 |
+
"expected_tool": "send_email",
|
| 89 |
+
"difficulty": "hard",
|
| 90 |
+
"category": "multi_step",
|
| 91 |
+
"success": false,
|
| 92 |
+
"response": "I apologize, I don't have access to an email sending function.",
|
| 93 |
+
"tool_called": null,
|
| 94 |
+
"tool_correct": false,
|
| 95 |
+
"expected_keywords": ["email", "sent"],
|
| 96 |
+
"keywords_matched": [],
|
| 97 |
+
"execution_time_ms": 1800.0,
|
| 98 |
+
"total_tokens": 123,
|
| 99 |
+
"prompt_tokens": 67,
|
| 100 |
+
"completion_tokens": 56,
|
| 101 |
+
"cost_usd": 0.0006,
|
| 102 |
+
"trace_id": "trace_jkl012",
|
| 103 |
+
"start_time": "2025-01-16T14:23:13Z",
|
| 104 |
+
"end_time": "2025-01-16T14:23:14.800Z",
|
| 105 |
+
"start_time_unix_nano": "1760947229000000000",
|
| 106 |
+
"end_time_unix_nano": "1760947230800000000",
|
| 107 |
+
"error": "Tool not found: send_email",
|
| 108 |
+
"error_type": "tool_not_found"
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"run_id": "run_001_gpt4",
|
| 112 |
+
"task_id": "task_005",
|
| 113 |
+
"test_index": 4,
|
| 114 |
+
"prompt": "What is 2+2?",
|
| 115 |
+
"expected_tool": "calculator",
|
| 116 |
+
"difficulty": "easy",
|
| 117 |
+
"category": "reasoning",
|
| 118 |
+
"success": true,
|
| 119 |
+
"response": "2+2 equals 4",
|
| 120 |
+
"tool_called": "calculator",
|
| 121 |
+
"tool_correct": true,
|
| 122 |
+
"expected_keywords": ["4"],
|
| 123 |
+
"keywords_matched": ["4"],
|
| 124 |
+
"execution_time_ms": 900.0,
|
| 125 |
+
"total_tokens": 67,
|
| 126 |
+
"prompt_tokens": 34,
|
| 127 |
+
"completion_tokens": 33,
|
| 128 |
+
"cost_usd": 0.0003,
|
| 129 |
+
"trace_id": "trace_mno345",
|
| 130 |
+
"start_time": "2025-01-16T14:23:16Z",
|
| 131 |
+
"end_time": "2025-01-16T14:23:16.900Z",
|
| 132 |
+
"start_time_unix_nano": "1760947232000000000",
|
| 133 |
+
"end_time_unix_nano": "1760947232900000000",
|
| 134 |
+
"error": null,
|
| 135 |
+
"error_type": null
|
| 136 |
+
}
|
| 137 |
+
]
|
sample_data/traces_gpt4.json
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"trace_id": "trace_abc123",
|
| 4 |
+
"run_id": "run_001_gpt4",
|
| 5 |
+
"traceId": "trace_abc123",
|
| 6 |
+
"spans": [
|
| 7 |
+
{
|
| 8 |
+
"spanId": "span_001",
|
| 9 |
+
"parentSpanId": null,
|
| 10 |
+
"name": "Agent Execution",
|
| 11 |
+
"kind": "INTERNAL",
|
| 12 |
+
"startTime": 1760947217774556600,
|
| 13 |
+
"endTime": 1760947220224556600,
|
| 14 |
+
"attributes": {
|
| 15 |
+
"agent.type": "both",
|
| 16 |
+
"agent.name": "ToolCallingAgent",
|
| 17 |
+
"gen_ai.system": "openai",
|
| 18 |
+
"gen_ai.request.model": "gpt-4"
|
| 19 |
+
},
|
| 20 |
+
"status": {"code": "OK"}
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"spanId": "span_002",
|
| 24 |
+
"parentSpanId": "span_001",
|
| 25 |
+
"name": "LLM Call - Reasoning",
|
| 26 |
+
"kind": "CLIENT",
|
| 27 |
+
"startTime": 1760947217774556600,
|
| 28 |
+
"endTime": 1760947218974556600,
|
| 29 |
+
"attributes": {
|
| 30 |
+
"gen_ai.system": "openai",
|
| 31 |
+
"gen_ai.request.model": "gpt-4",
|
| 32 |
+
"gen_ai.operation.name": "chat",
|
| 33 |
+
"gen_ai.usage.prompt_tokens": 78,
|
| 34 |
+
"gen_ai.usage.completion_tokens": 45,
|
| 35 |
+
"gen_ai.usage.total_tokens": 123,
|
| 36 |
+
"gen_ai.usage.cost.total": 0.0006,
|
| 37 |
+
"gen_ai.response.finish_reasons": ["stop"]
|
| 38 |
+
},
|
| 39 |
+
"status": {"code": "OK"}
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"spanId": "span_003",
|
| 43 |
+
"parentSpanId": "span_001",
|
| 44 |
+
"name": "Tool Call - get_weather",
|
| 45 |
+
"kind": "CLIENT",
|
| 46 |
+
"startTime": 1760947219000556600,
|
| 47 |
+
"endTime": 1760947219890556600,
|
| 48 |
+
"attributes": {
|
| 49 |
+
"tool.name": "get_weather",
|
| 50 |
+
"tool.input": "{\"location\": \"Tokyo\"}",
|
| 51 |
+
"tool.output": "{\"temp\": \"18°C\", \"condition\": \"clear\"}",
|
| 52 |
+
"tool.latency_ms": 890
|
| 53 |
+
},
|
| 54 |
+
"status": {"code": "OK"}
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"spanId": "span_004",
|
| 58 |
+
"parentSpanId": "span_001",
|
| 59 |
+
"name": "LLM Call - Final Response",
|
| 60 |
+
"kind": "CLIENT",
|
| 61 |
+
"startTime": 1760947219900556600,
|
| 62 |
+
"endTime": 1760947220224556600,
|
| 63 |
+
"attributes": {
|
| 64 |
+
"gen_ai.system": "openai",
|
| 65 |
+
"gen_ai.request.model": "gpt-4",
|
| 66 |
+
"gen_ai.usage.prompt_tokens": 145,
|
| 67 |
+
"gen_ai.usage.completion_tokens": 111,
|
| 68 |
+
"gen_ai.usage.cost.total": 0.0006
|
| 69 |
+
},
|
| 70 |
+
"status": {"code": "OK"}
|
| 71 |
+
}
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"trace_id": "trace_def456",
|
| 76 |
+
"run_id": "run_001_gpt4",
|
| 77 |
+
"traceId": "trace_def456",
|
| 78 |
+
"spans": [
|
| 79 |
+
{
|
| 80 |
+
"spanId": "span_005",
|
| 81 |
+
"parentSpanId": null,
|
| 82 |
+
"name": "Agent Execution",
|
| 83 |
+
"kind": "INTERNAL",
|
| 84 |
+
"startTime": 1760947221000000000,
|
| 85 |
+
"endTime": 1760947224800000000,
|
| 86 |
+
"attributes": {
|
| 87 |
+
"agent.type": "both",
|
| 88 |
+
"agent.name": "ToolCallingAgent",
|
| 89 |
+
"gen_ai.system": "openai",
|
| 90 |
+
"gen_ai.request.model": "gpt-4"
|
| 91 |
+
},
|
| 92 |
+
"status": {"code": "OK"}
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"spanId": "span_006",
|
| 96 |
+
"parentSpanId": "span_005",
|
| 97 |
+
"name": "LLM Call - Reasoning",
|
| 98 |
+
"kind": "CLIENT",
|
| 99 |
+
"startTime": 1760947221000000000,
|
| 100 |
+
"endTime": 1760947222200000000,
|
| 101 |
+
"attributes": {
|
| 102 |
+
"gen_ai.system": "openai",
|
| 103 |
+
"gen_ai.request.model": "gpt-4",
|
| 104 |
+
"gen_ai.operation.name": "chat",
|
| 105 |
+
"gen_ai.usage.prompt_tokens": 120,
|
| 106 |
+
"gen_ai.usage.completion_tokens": 67,
|
| 107 |
+
"gen_ai.usage.total_tokens": 187
|
| 108 |
+
},
|
| 109 |
+
"status": {"code": "OK"}
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"spanId": "span_007",
|
| 113 |
+
"parentSpanId": "span_005",
|
| 114 |
+
"name": "Tool Call - web_search",
|
| 115 |
+
"kind": "CLIENT",
|
| 116 |
+
"startTime": 1760947222300000000,
|
| 117 |
+
"endTime": 1760947224000000000,
|
| 118 |
+
"attributes": {
|
| 119 |
+
"tool.name": "web_search",
|
| 120 |
+
"tool.input": "{\"query\": \"recent AI news\"}",
|
| 121 |
+
"tool.output": "{\"results\": [...]}",
|
| 122 |
+
"tool.latency_ms": 1700
|
| 123 |
+
},
|
| 124 |
+
"status": {"code": "OK"}
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"spanId": "span_008",
|
| 128 |
+
"parentSpanId": "span_005",
|
| 129 |
+
"name": "LLM Call - Final Response",
|
| 130 |
+
"kind": "CLIENT",
|
| 131 |
+
"startTime": 1760947224100000000,
|
| 132 |
+
"endTime": 1760947224800000000,
|
| 133 |
+
"attributes": {
|
| 134 |
+
"gen_ai.system": "openai",
|
| 135 |
+
"gen_ai.request.model": "gpt-4",
|
| 136 |
+
"gen_ai.usage.prompt_tokens": 189,
|
| 137 |
+
"gen_ai.usage.completion_tokens": 269
|
| 138 |
+
},
|
| 139 |
+
"status": {"code": "OK"}
|
| 140 |
+
}
|
| 141 |
+
]
|
| 142 |
+
}
|
| 143 |
+
]
|