Mandark-droid committed on
Commit
24b4390
·
1 Parent(s): 1fc3adb

Add leaderboard components and enhanced data loader

Browse files

- Add HTML table generator with styled leaderboard display
- Add metric display components (badges, bars, formatters)
- Enhance data loader to support both JSON and HuggingFace sources
- Add sample data for local development and testing
- Implement automatic fallback between data sources

components/__init__.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Components package for TraceMind UI
3
+ Contains reusable visual components
4
+ """
5
+
6
+ from .metric_displays import (
7
+ get_rank_badge,
8
+ get_success_rate_bar,
9
+ get_gpu_utilization_bar,
10
+ get_provider_badge,
11
+ get_agent_type_badge,
12
+ get_hardware_badge,
13
+ format_cost,
14
+ format_duration,
15
+ get_tooltip_icon
16
+ )
17
+
18
+ from .leaderboard_table import (
19
+ generate_leaderboard_html,
20
+ generate_empty_state_html,
21
+ generate_filter_summary_html
22
+ )
23
+
24
+ from .thought_graph import create_thought_graph
25
+
26
+ from .analytics_charts import (
27
+ create_performance_heatmap,
28
+ create_speed_accuracy_scatter,
29
+ create_cost_efficiency_scatter,
30
+ create_comparison_radar
31
+ )
32
+
33
+ from .report_cards import (
34
+ generate_leaderboard_summary_card,
35
+ generate_run_report_card,
36
+ download_card_as_png_js
37
+ )
38
+
39
+ __all__ = [
40
+ 'get_rank_badge',
41
+ 'get_success_rate_bar',
42
+ 'get_gpu_utilization_bar',
43
+ 'get_provider_badge',
44
+ 'get_agent_type_badge',
45
+ 'get_hardware_badge',
46
+ 'format_cost',
47
+ 'format_duration',
48
+ 'get_tooltip_icon',
49
+ 'generate_leaderboard_html',
50
+ 'generate_empty_state_html',
51
+ 'generate_filter_summary_html',
52
+ 'create_thought_graph',
53
+ 'create_performance_heatmap',
54
+ 'create_speed_accuracy_scatter',
55
+ 'create_cost_efficiency_scatter',
56
+ 'create_comparison_radar',
57
+ 'generate_leaderboard_summary_card',
58
+ 'generate_run_report_card',
59
+ 'download_card_as_png_js'
60
+ ]
components/leaderboard_table.py ADDED
@@ -0,0 +1,582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Leaderboard HTML Table Generator
3
+ Creates styled HTML tables for the leaderboard view
4
+ """
5
+
6
+ import pandas as pd
7
+ from typing import Optional
8
+ from .metric_displays import (
9
+ get_rank_badge,
10
+ get_success_rate_bar,
11
+ get_gpu_utilization_bar,
12
+ get_provider_badge,
13
+ get_agent_type_badge,
14
+ get_hardware_badge,
15
+ format_cost,
16
+ format_duration,
17
+ get_tooltip_icon
18
+ )
19
+
20
+
21
def generate_leaderboard_html(
    df: pd.DataFrame,
    sort_by: str = "success_rate",
    ascending: bool = False
) -> str:
    """
    Generate styled HTML table for leaderboard

    Args:
        df: Leaderboard DataFrame
        sort_by: Column to sort by (silently falls back to the
            DataFrame's natural order if the column is absent,
            instead of raising KeyError)
        ascending: Sort order (False = descending)

    Returns:
        HTML string with complete styled table

    Expected DataFrame columns:
        - model (str): Model name
        - agent_type (str): tool, code, or both
        - provider (str): litellm or transformers
        - success_rate (float): 0-100
        - total_tests (int): Number of tests
        - avg_duration_ms (float): Average duration
        - total_cost_usd (float): Total cost
        - co2_emissions_g (float): CO2 emissions
        - gpu_utilization_avg (float, optional): GPU utilization %
        - submitted_by (str): Username
    """
    # Local imports: `escape` guards user-supplied text before it is
    # interpolated into HTML; `datetime` was previously imported inside
    # the per-row loop — hoisted here.
    from html import escape
    from datetime import datetime

    # Sort dataframe; tolerate an unknown sort column rather than raising.
    if sort_by in df.columns:
        df_sorted = df.sort_values(by=sort_by, ascending=ascending).reset_index(drop=True)
    else:
        df_sorted = df.reset_index(drop=True)

    # Start HTML with embedded CSS
    html = """
    <style>
    /* Leaderboard Table Styles */
    .tm-leaderboard-container {
        background: #F8FAFC; /* Light background for better readability */
        border-radius: 16px;
        overflow-x: auto; /* Enable horizontal scrolling */
        overflow-y: visible;
        border: 1px solid rgba(203, 213, 225, 0.8);
        margin: 20px 0;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        max-width: 100%;
    }

    /* Custom scrollbar styling */
    .tm-leaderboard-container::-webkit-scrollbar {
        height: 8px;
    }

    .tm-leaderboard-container::-webkit-scrollbar-track {
        background: #E2E8F0;
        border-radius: 4px;
    }

    .tm-leaderboard-container::-webkit-scrollbar-thumb {
        background: #94A3B8;
        border-radius: 4px;
    }

    .tm-leaderboard-container::-webkit-scrollbar-thumb:hover {
        background: #64748B;
    }

    .tm-leaderboard-table {
        width: 100%;
        min-width: 1650px; /* Reduced from 1800px after combining columns */
        border-collapse: collapse;
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
        background: #FFFFFF; /* White background */
        color: #0F172A; /* Dark text for maximum contrast */
    }

    .tm-leaderboard-table thead {
        background: linear-gradient(135deg, #6366F1 0%, #4F46E5 100%); /* Vibrant indigo gradient */
        position: sticky;
        top: 0;
        z-index: 10;
        backdrop-filter: blur(10px);
    }

    .tm-leaderboard-table th {
        padding: 16px 12px;
        text-align: left;
        font-weight: 600;
        color: #FFFFFF; /* Pure white for headers - good contrast */
        border-bottom: 2px solid #4338CA;
        font-size: 12px;
        text-transform: uppercase;
        letter-spacing: 0.05em;
        white-space: nowrap;
    }

    .tm-leaderboard-table td {
        padding: 14px 12px;
        border-bottom: 1px solid rgba(226, 232, 240, 0.8);
        color: #1E293B; /* Dark text for cells */
        font-size: 14px;
        vertical-align: middle;
    }

    .tm-leaderboard-table tbody tr {
        transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
        cursor: pointer;
    }

    .tm-leaderboard-table tbody tr:hover {
        background: rgba(99, 102, 241, 0.08) !important;
        box-shadow: 0 0 15px rgba(99, 102, 241, 0.15),
                    inset 0 0 15px rgba(99, 102, 241, 0.05);
        transform: scale(1.002);
    }

    .tm-leaderboard-table tbody tr:nth-child(even) {
        background: rgba(241, 245, 249, 0.6); /* Light stripe */
    }

    .tm-model-name {
        font-weight: 600;
        color: #000000 !important; /* Pure black - readable in all themes */
        font-size: 15px;
        transition: color 0.2s ease;
    }

    .tm-leaderboard-table tr:hover .tm-model-name {
        color: #4F46E5 !important; /* Indigo on hover */
    }

    .tm-numeric-cell {
        font-family: 'Monaco', 'Menlo', monospace;
        font-size: 13px;
        text-align: center;
        color: #000000 !important; /* Pure black for numbers */
    }

    .tm-badge-cell {
        text-align: center;
    }

    .tm-run-id {
        font-family: 'Monaco', 'Menlo', monospace;
        font-size: 12px;
        color: #000000 !important; /* Pure black - readable in all themes */
        cursor: pointer;
        text-decoration: none;
        font-weight: 500;
        transition: all 0.2s ease;
    }

    .tm-run-id:hover {
        color: #4F46E5 !important; /* Indigo on hover */
        text-decoration: underline;
    }

    .tm-text-cell {
        color: #000000 !important; /* Pure black for all text */
        font-size: 0.9em;
    }

    /* Responsive Design */
    @media (max-width: 1024px) {
        .tm-leaderboard-table th,
        .tm-leaderboard-table td {
            padding: 10px 8px;
            font-size: 12px;
        }

        /* Hide less important columns on smaller screens */
        .tm-hide-mobile {
            display: none !important;
        }
    }

    @media (max-width: 768px) {
        .tm-leaderboard-table th:nth-child(n+7),
        .tm-leaderboard-table td:nth-child(n+7) {
            display: none !important;
        }

        .tm-model-name {
            font-size: 13px;
        }
    }

    @media (max-width: 480px) {
        /* Ultra-compact: Show only rank, model, and success rate */
        .tm-leaderboard-table th:nth-child(n+4),
        .tm-leaderboard-table td:nth-child(n+4) {
            display: none !important;
        }

        .tm-leaderboard-table th:nth-child(3),
        .tm-leaderboard-table td:nth-child(3) {
            display: table-cell !important;
        }
    }
    </style>

    <div class="tm-leaderboard-container">
        <table class="tm-leaderboard-table">
            <thead>
                <tr>
                    <th style="width: 60px;">Rank</th>
                    <th style="width: 110px;" title="Click to view detailed run information">Run ID</th>
                    <th style="min-width: 160px;">Model</th>
                    <th style="width: 80px;">Type</th>
                    <th style="width: 90px;">Provider</th>
                    <th style="width: 85px;" title="Hardware used for evaluation: GPU or CPU">Hardware</th>
                    <th style="width: 150px;" title="Percentage of test cases that passed (0-100%). Higher is better.">
                        Success Rate
                    </th>
                    <th style="width: 140px;" class="tm-numeric-cell" title="Tests: Total / Pass / Fail">
                        Tests (P/F)
                    </th>
                    <th style="width: 70px;" class="tm-numeric-cell" title="Average number of steps per test case.">
                        Steps
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell" title="Average time per test case. Lower is better.">
                        Duration
                    </th>
                    <th style="width: 90px;" class="tm-numeric-cell" title="Total tokens used across all tests.">
                        Tokens
                    </th>
                    <th style="width: 90px;" class="tm-numeric-cell" title="Total API + power costs in USD. Lower is better.">
                        Cost
                    </th>
                    <th style="width: 80px;" class="tm-numeric-cell tm-hide-mobile" title="Carbon footprint in grams of CO2 equivalent.">
                        CO2
                    </th>
                    <th style="width: 100px;" class="tm-hide-mobile" title="Average GPU usage during evaluation (0-100%). Only for GPU jobs.">
                        GPU Util
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="GPU memory usage (avg/max in MiB). Only for GPU jobs.">
                        GPU Mem
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="GPU temperature (avg/max in Celsius). Only for GPU jobs.">
                        GPU Temp
                    </th>
                    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="Average GPU power consumption in Watts. Only for GPU jobs.">
                        GPU Power
                    </th>
                    <th style="width: 140px;" class="tm-hide-mobile">Timestamp</th>
                    <th style="width: 110px;" class="tm-hide-mobile">Submitted By</th>
                </tr>
            </thead>
            <tbody>
    """

    # Generate table rows
    for idx, row in df_sorted.iterrows():
        rank = idx + 1

        # Get values with safe defaults
        model = row.get('model', 'Unknown')
        agent_type = row.get('agent_type', 'unknown')
        provider = row.get('provider', 'unknown')
        success_rate = row.get('success_rate', 0.0)
        total_tests = row.get('total_tests', 0)
        successful_tests = row.get('successful_tests', 0)
        failed_tests = row.get('failed_tests', 0)
        avg_steps = row.get('avg_steps', 0.0)
        avg_duration_ms = row.get('avg_duration_ms', 0.0)
        total_tokens = row.get('total_tokens', 0)
        total_cost_usd = row.get('total_cost_usd', 0.0)
        co2_emissions_g = row.get('co2_emissions_g', 0.0)
        gpu_utilization_avg = row.get('gpu_utilization_avg', None)
        gpu_memory_avg_mib = row.get('gpu_memory_avg_mib', None)
        gpu_memory_max_mib = row.get('gpu_memory_max_mib', None)
        gpu_temperature_avg = row.get('gpu_temperature_avg', None)
        gpu_temperature_max = row.get('gpu_temperature_max', None)
        gpu_power_avg_w = row.get('gpu_power_avg_w', None)
        timestamp = row.get('timestamp', '')
        submitted_by = row.get('submitted_by', 'Unknown')

        # Check if GPU job
        has_gpu = pd.notna(gpu_utilization_avg) and gpu_utilization_avg > 0

        # Format GPU utilization
        if has_gpu:
            gpu_display = get_gpu_utilization_bar(gpu_utilization_avg)
        else:
            gpu_display = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

        # Format CO2
        if pd.notna(co2_emissions_g) and co2_emissions_g > 0:
            co2_display = f'<span style="font-family: monospace; font-size: 0.9em; color: #334155;">{co2_emissions_g:.2f}g</span>'
        else:
            co2_display = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

        # Format GPU Memory
        if pd.notna(gpu_memory_avg_mib) and pd.notna(gpu_memory_max_mib):
            gpu_mem_display = f'<span style="font-family: monospace; font-size: 0.85em; color: #334155;">{gpu_memory_avg_mib:.0f}/{gpu_memory_max_mib:.0f}</span>'
        else:
            gpu_mem_display = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

        # Format GPU Temperature
        if pd.notna(gpu_temperature_avg) and pd.notna(gpu_temperature_max):
            gpu_temp_display = f'<span style="font-family: monospace; font-size: 0.85em; color: #334155;">{gpu_temperature_avg:.0f}/{gpu_temperature_max:.0f}°C</span>'
        else:
            gpu_temp_display = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

        # Format GPU Power
        if pd.notna(gpu_power_avg_w):
            gpu_power_display = f'<span style="font-family: monospace; font-size: 0.85em; color: #334155;">{gpu_power_avg_w:.1f}W</span>'
        else:
            gpu_power_display = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'

        # Format timestamp (handle both string and Timestamp objects).
        if pd.notna(timestamp):
            try:
                if isinstance(timestamp, pd.Timestamp):
                    timestamp_display = timestamp.strftime('%Y-%m-%d %H:%M')
                else:
                    dt = datetime.fromisoformat(str(timestamp).replace('Z', '+00:00'))
                    timestamp_display = dt.strftime('%Y-%m-%d %H:%M')
            except Exception:
                # Unparseable value: fall back to a truncated, escaped raw string.
                timestamp_display = escape(str(timestamp)[:16]) if timestamp else 'N/A'
        else:
            timestamp_display = 'N/A'

        # Format Run ID (show first 8 characters). str() guards against
        # non-string values (e.g. NaN floats), which previously crashed len().
        run_id = str(row.get('run_id', 'N/A'))
        run_id_short = run_id[:8] + '...' if len(run_id) > 8 else run_id

        # Escape user-controlled text so it cannot inject markup or break
        # out of the title/data attributes below.
        run_id_attr = escape(run_id)
        run_id_short = escape(run_id_short)
        model_display = escape(str(model))
        submitted_display = escape(str(submitted_by))

        html += f"""
                <tr data-run-id="{run_id_attr}" data-rank="{rank}" class="tm-clickable-row">
                    <td>{get_rank_badge(rank)}</td>
                    <td class="tm-run-id" title="{run_id_attr}">{run_id_short}</td>
                    <td class="tm-model-name">{model_display}</td>
                    <td class="tm-badge-cell">{get_agent_type_badge(agent_type)}</td>
                    <td class="tm-badge-cell">{get_provider_badge(provider)}</td>
                    <td class="tm-badge-cell">{get_hardware_badge(has_gpu)}</td>
                    <td>{get_success_rate_bar(success_rate)}</td>
                    <td class="tm-numeric-cell">
                        <strong>{total_tests}</strong>
                        <span style="color: #CBD5E1; margin: 0 4px;">/</span>
                        <span style="color: #10B981; font-weight: 600;">{successful_tests}</span>
                        <span style="color: #CBD5E1; margin: 0 4px;">/</span>
                        <span style="color: #EF4444; font-weight: 600;">{failed_tests}</span>
                    </td>
                    <td class="tm-numeric-cell">{avg_steps:.1f}</td>
                    <td class="tm-numeric-cell">{format_duration(avg_duration_ms)}</td>
                    <td class="tm-numeric-cell">{total_tokens:,}</td>
                    <td class="tm-numeric-cell">{format_cost(total_cost_usd)}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{co2_display}</td>
                    <td class="tm-hide-mobile">{gpu_display}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{gpu_mem_display}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{gpu_temp_display}</td>
                    <td class="tm-numeric-cell tm-hide-mobile">{gpu_power_display}</td>
                    <td class="tm-hide-mobile tm-text-cell">{timestamp_display}</td>
                    <td class="tm-hide-mobile tm-text-cell">
                        {submitted_display}
                    </td>
                </tr>
        """

    html += """
            </tbody>
        </table>
    </div>

    <script>
    // Add click handler for Run ID cells - runs on each table render
    (function() {
        // Function to attach handlers
        function attachRowClickHandlers() {
            const cells = document.querySelectorAll('.tm-run-id');
            console.log('Found', cells.length, 'Run ID cells');

            cells.forEach(function(cell) {
                // Remove existing listener to avoid duplicates
                cell.replaceWith(cell.cloneNode(true));
            });

            // Re-select after cloning
            document.querySelectorAll('.tm-run-id').forEach(function(cell) {
                cell.addEventListener('click', function(e) {
                    e.stopPropagation();
                    const row = this.closest('tr');
                    const rowIndex = Array.from(row.parentNode.children).indexOf(row);

                    console.log('Run ID clicked, row index:', rowIndex);

                    // Try multiple ways to find the textbox
                    let textbox = null;

                    // Method 1: By elem_id
                    const container1 = document.getElementById('selected_row_index');
                    if (container1) {
                        textbox = container1.querySelector('textarea, input[type="text"]');
                        console.log('Method 1 (elem_id):', textbox ? 'Found' : 'Not found');
                    }

                    // Method 2: By data-testid
                    if (!textbox) {
                        const containers = document.querySelectorAll('[data-testid="textbox"]');
                        console.log('Method 2: Found', containers.length, 'textbox containers');
                        for (let container of containers) {
                            const input = container.querySelector('textarea, input[type="text"]');
                            if (input && !container.closest('.label-wrap')) {
                                textbox = input;
                                console.log('Method 2: Using hidden textbox');
                                break;
                            }
                        }
                    }

                    if (textbox) {
                        // Set the row index
                        textbox.value = rowIndex.toString();

                        // Trigger multiple events to ensure Gradio picks it up
                        textbox.dispatchEvent(new Event('input', { bubbles: true }));
                        textbox.dispatchEvent(new Event('change', { bubbles: true }));
                        textbox.dispatchEvent(new Event('blur', { bubbles: true }));

                        // Also try triggering on the container
                        const container = textbox.closest('[data-testid="textbox"]');
                        if (container) {
                            container.dispatchEvent(new Event('input', { bubbles: true }));
                        }

                        console.log('Textbox updated to:', rowIndex);
                    } else {
                        console.error('Could not find hidden textbox!');
                    }
                });
            });
        }

        // Attach immediately
        attachRowClickHandlers();

        // Also attach after a short delay (in case table loads async)
        setTimeout(attachRowClickHandlers, 500);
        setTimeout(attachRowClickHandlers, 1000);
        setTimeout(attachRowClickHandlers, 2000);
    })();
    </script>
    """

    return html
+
468
+
469
def generate_empty_state_html() -> str:
    """
    Build the placeholder markup shown when the leaderboard has no rows.

    Returns:
        HTML string for the empty-state panel.
    """
    # Static markup: an icon, a headline, a hint, and a call-to-action button.
    empty_state = """
    <div style="
        text-align: center;
        padding: 60px 20px;
        background: var(--tm-bg-card, #1E293B);
        border-radius: 16px;
        border: 2px dashed var(--tm-border-default, rgba(148, 163, 184, 0.2));
        margin: 20px 0;
    ">
        <div style="font-size: 48px; margin-bottom: 16px;">📊</div>
        <h3 style="
            color: var(--tm-text-primary, #F1F5F9);
            margin: 0 0 12px 0;
            font-size: 1.5rem;
        ">
            No Evaluation Results Yet
        </h3>
        <p style="
            color: var(--tm-text-secondary, #94A3B8);
            margin: 0 0 24px 0;
            font-size: 1rem;
        ">
            Run your first evaluation to see results appear here.
        </p>
        <button style="
            padding: 12px 24px;
            background: var(--tm-primary, #4F46E5);
            color: white;
            border: none;
            border-radius: 8px;
            font-weight: 600;
            cursor: pointer;
            font-size: 1rem;
        ">
            Start New Evaluation
        </button>
    </div>
    """
    return empty_state
514
+
515
+
516
def generate_filter_summary_html(
    total_runs: int,
    filtered_runs: int,
    active_filters: dict
) -> str:
    """
    Render a small banner summarising the currently active filters.

    Args:
        total_runs: Total number of runs available.
        filtered_runs: Number of runs remaining after filtering.
        active_filters: Mapping of filter name -> selected value;
            entries that are falsy or equal to "All" are ignored.

    Returns:
        HTML string with the filter summary.
    """
    # No filtering in effect: a single "showing everything" line suffices.
    if filtered_runs == total_runs:
        return f"""
        <div style="
            padding: 12px 16px;
            background: var(--tm-bg-secondary, #334155);
            border-radius: 8px;
            margin-bottom: 16px;
            color: var(--tm-text-secondary, #94A3B8);
            font-size: 0.9em;
        ">
            Showing all <strong style="color: var(--tm-text-primary, #F1F5F9);">{total_runs}</strong> evaluation runs
        </div>
        """

    # One chip per non-default filter value.
    chip_template = """
            <span style="
                display: inline-flex;
                align-items: center;
                padding: 4px 10px;
                background: var(--tm-primary, #4F46E5);
                color: white;
                border-radius: 6px;
                font-size: 0.85em;
                margin-right: 8px;
                font-weight: 500;
            ">
                {key}: {value}
            </span>
        """
    filters_html = "".join(
        chip_template.format(key=name, value=choice)
        for name, choice in active_filters.items()
        if choice and choice != "All"
    )

    return f"""
    <div style="
        padding: 12px 16px;
        background: var(--tm-bg-secondary, #334155);
        border-radius: 8px;
        margin-bottom: 16px;
        color: var(--tm-text-secondary, #94A3B8);
        font-size: 0.9em;
    ">
        <div style="margin-bottom: 8px;">
            Showing <strong style="color: var(--tm-text-primary, #F1F5F9);">{filtered_runs}</strong> of
            <strong style="color: var(--tm-text-primary, #F1F5F9);">{total_runs}</strong> runs
        </div>
        {filters_html}
    </div>
    """
components/metric_displays.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Metric Display Components
3
+ Reusable HTML generators for badges, progress bars, and visual metrics
4
+ """
5
+
6
def get_rank_badge(rank: int) -> str:
    """
    Render a rank indicator; the top three positions get medal badges.

    Args:
        rank: Position in leaderboard (1-indexed)

    Returns:
        HTML string for rank badge

    Examples:
        >>> get_rank_badge(1)
        '<span ...>🥇 1st</span>'
    """
    # (label, background gradient, text colour, drop shadow) per medal rank.
    medals = {
        1: ("🥇 1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000", "0 2px 8px rgba(255, 215, 0, 0.4)"),
        2: ("🥈 2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff", "0 2px 8px rgba(156, 163, 175, 0.4)"),
        3: ("🥉 3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff", "0 2px 8px rgba(205, 127, 50, 0.4)"),
    }

    medal = medals.get(rank)
    if medal is None:
        # Ranks 4+ are plain muted numbers.
        return f"""
        <span style="
            display: inline-flex;
            align-items: center;
            justify-content: center;
            min-width: 32px;
            color: var(--tm-text-muted, #64748B);
            font-weight: 500;
            font-size: 0.95em;
        ">
            {rank}
        </span>
        """

    label, gradient, text_color, shadow = medal
    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        justify-content: center;
        min-width: 60px;
        padding: 6px 12px;
        background: {gradient};
        color: {text_color};
        border-radius: 8px;
        font-weight: 700;
        font-size: 0.9em;
        box-shadow: {shadow};
        letter-spacing: 0.5px;
    ">
        {label}
    </span>
    """
60
+
61
+
62
def get_success_rate_bar(success_rate: float) -> str:
    """
    Render a success-rate progress bar whose colour reflects performance.

    Args:
        success_rate: Success percentage (0-100)

    Returns:
        HTML string with progress bar and numeric value

    Color Logic:
        - < 50%: Red → Orange (danger)
        - 50-79%: Orange → Yellow (warning)
        - 80-100%: Green → Cyan (success)
    """
    # The fill width is clamped to [0, 100]; the printed percentage is not.
    fill = min(max(success_rate, 0), 100)

    # Pick the gradient band by threshold.
    if success_rate < 50:
        fill_gradient, glow = "linear-gradient(90deg, #EF4444, #F59E0B)", "#EF4444"
    elif success_rate < 80:
        fill_gradient, glow = "linear-gradient(90deg, #F59E0B, #FBBF24)", "#F59E0B"
    else:
        fill_gradient, glow = "linear-gradient(90deg, #10B981, #06B6D4)", "#10B981"

    return f"""
    <div style="display: flex; align-items: center; gap: 10px; width: 100%;">
        <div style="
            flex: 1;
            height: 8px;
            background: rgba(148, 163, 184, 0.15);
            border-radius: 4px;
            overflow: hidden;
            max-width: 160px;
            position: relative;
        ">
            <div style="
                width: {fill}%;
                height: 100%;
                background: {fill_gradient};
                border-radius: 4px;
                transition: width 0.5s cubic-bezier(0.4, 0, 0.2, 1);
                box-shadow: 0 0 8px {glow}40;
            "></div>
        </div>
        <span style="
            font-family: 'Monaco', 'Menlo', monospace;
            font-weight: 600;
            color: var(--tm-text-primary, #000000);
            min-width: 55px;
            font-size: 0.9em;
        ">{success_rate:.1f}%</span>
    </div>
    """
119
+
120
+
121
def get_gpu_utilization_bar(utilization: float) -> str:
    """
    Render a GPU utilization progress bar.

    Args:
        utilization: GPU utilization percentage (0-100)

    Returns:
        HTML string with progress bar

    Color Logic:
        - < 30%: Low utilization (yellow/amber)
        - 30-70%: Medium utilization (orange)
        - > 70%: High utilization (red/orange) - good efficiency!
    """
    # Clamp only the bar width, not the printed percentage.
    fill = min(max(utilization, 0), 100)

    # Warmer colours for higher (more efficient) utilization.
    if utilization < 30:
        fill_gradient = "linear-gradient(90deg, #FBBF24, #F59E0B)"
    elif utilization < 70:
        fill_gradient = "linear-gradient(90deg, #F59E0B, #FB923C)"
    else:
        fill_gradient = "linear-gradient(90deg, #FB923C, #F97316)"

    return f"""
    <div style="display: flex; align-items: center; gap: 8px;">
        <div style="
            flex: 1;
            height: 6px;
            background: rgba(148, 163, 184, 0.15);
            border-radius: 3px;
            max-width: 100px;
        ">
            <div style="
                width: {fill}%;
                height: 100%;
                background: {fill_gradient};
                border-radius: 3px;
                transition: width 0.4s ease;
            "></div>
        </div>
        <span style="
            font-family: monospace;
            font-size: 0.85em;
            color: var(--tm-text-secondary, #000000);
            min-width: 45px;
        ">{utilization:.1f}%</span>
    </div>
    """
171
+
172
+
173
def get_provider_badge(provider: str) -> str:
    """
    Render a coloured badge for the inference provider.

    Args:
        provider: Provider name (litellm, transformers, etc.)

    Returns:
        HTML string for colored badge

    Colors:
        - litellm: Blue (API providers)
        - transformers: Green (GPU/local models)
    """
    # Known providers get brand-ish colours; everything else falls back to gray.
    palette = {
        "litellm": "#3B82F6",
        "transformers": "#10B981",
        "openai": "#10A37F",
        "anthropic": "#D97757",
    }
    background = palette.get(provider.lower(), "#6B7280")

    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        padding: 4px 10px;
        background: {background};
        color: white;
        border-radius: 5px;
        font-size: 0.75em;
        font-weight: 600;
        text-transform: uppercase;
        letter-spacing: 0.5px;
        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
    ">
        {provider.upper()}
    </span>
    """
213
+
214
+
215
def get_agent_type_badge(agent_type: str) -> str:
    """
    Render a coloured badge for the agent type.

    Args:
        agent_type: Agent type (tool, code, both)

    Returns:
        HTML string for colored badge

    Colors:
        - tool: Purple
        - code: Amber/Orange
        - both: Cyan
    """
    # Known agent types map to fixed colours; unknown values fall back to gray.
    palette = {
        "tool": "#8B5CF6",
        "code": "#F59E0B",
        "both": "#06B6D4",
    }
    background = palette.get(agent_type.lower(), "#6B7280")

    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        padding: 4px 10px;
        background: {background};
        color: white;
        border-radius: 5px;
        font-size: 0.75em;
        font-weight: 600;
        text-transform: uppercase;
        letter-spacing: 0.5px;
        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
    ">
        {agent_type.upper()}
    </span>
    """
255
+
256
+
257
def get_hardware_badge(has_gpu: bool) -> str:
    """
    Render a GPU/CPU hardware badge.

    Args:
        has_gpu: Whether job used GPU

    Returns:
        HTML string for badge
    """
    # GPU badge uses a warm gradient; CPU is a flat gray.
    if has_gpu:
        background = "linear-gradient(135deg, #F59E0B, #EF4444)"
        label = "🖥️ GPU"
    else:
        background = "#6B7280"
        label = "💻 CPU"

    return f"""
    <span style="
        display: inline-flex;
        align-items: center;
        gap: 4px;
        padding: 4px 10px;
        background: {background};
        color: white;
        border-radius: 5px;
        font-size: 0.75em;
        font-weight: 600;
        letter-spacing: 0.5px;
        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
    ">
        {label}
    </span>
    """
303
+
304
+
305
def format_cost(cost_usd: float) -> str:
    """
    Render a dollar amount with a colour that reflects its magnitude.

    Args:
        cost_usd: Cost in USD

    Returns:
        HTML string with formatted cost
    """
    # Green under a cent, amber under five cents, red otherwise.
    if cost_usd < 0.01:
        tone = "#10B981"
    elif cost_usd < 0.05:
        tone = "#F59E0B"
    else:
        tone = "#EF4444"

    return f"""
    <span style="
        font-family: monospace;
        font-weight: 600;
        color: {tone};
        font-size: 0.9em;
    ">
        ${cost_usd:.4f}
    </span>
    """
333
+
334
+
335
def format_duration(duration_ms: float) -> str:
    """
    Render a duration with a magnitude-appropriate unit and colour.

    Args:
        duration_ms: Duration in milliseconds

    Returns:
        HTML string with formatted duration
    """
    # Under 1s stays in ms (green); 1-10s in seconds (amber);
    # 10s+ also in seconds but flagged red as slow.
    if duration_ms < 1000:
        value, unit, tone = duration_ms, "ms", "#10B981"
    elif duration_ms < 10000:
        value, unit, tone = duration_ms / 1000, "s", "#F59E0B"
    else:
        value, unit, tone = duration_ms / 1000, "s", "#EF4444"

    return f"""
    <span style="
        font-family: monospace;
        color: {tone};
        font-weight: 500;
        font-size: 0.9em;
    ">
        {value:.1f}{unit}
    </span>
    """
368
+
369
+
370
def get_tooltip_icon(tooltip_text: str) -> str:
    """
    Generate info icon with tooltip

    Args:
        tooltip_text: Text to show in tooltip. Escaped before being
            placed in the title attribute, so quotes or angle brackets
            in the text can no longer break out of the markup.

    Returns:
        HTML string for icon with tooltip
    """
    # Local import keeps the module's top-level dependencies unchanged.
    from html import escape

    # Previously the raw text was interpolated directly into title="...";
    # a double quote in the text terminated the attribute (HTML injection).
    safe_text = escape(tooltip_text)
    return f"""
    <span title="{safe_text}" style="
        color: var(--tm-secondary, #06B6D4);
        cursor: help;
        font-size: 0.9em;
        margin-left: 4px;
    ">ⓘ</span>
    """
data_loader.py CHANGED
@@ -1,255 +1,430 @@
1
  """
2
- Data Loader for TraceMind-AI
3
- Loads real data from HuggingFace datasets (not mock data)
4
  """
5
 
6
  import os
7
- from typing import Optional, Dict, Any, List
 
 
8
  import pandas as pd
9
  from datasets import load_dataset
10
- from dotenv import load_dotenv
 
11
 
12
- # Load environment variables
13
- load_dotenv()
14
 
 
15
 
16
- class TraceMindDataLoader:
17
- """Loads evaluation data from HuggingFace datasets"""
 
 
 
 
 
 
 
 
 
18
 
19
  def __init__(
20
  self,
21
- leaderboard_repo: Optional[str] = None,
 
 
22
  hf_token: Optional[str] = None
23
  ):
24
- """
25
- Initialize data loader
 
 
26
 
27
- Args:
28
- leaderboard_repo: HuggingFace dataset repo for leaderboard
29
- hf_token: HuggingFace API token for private datasets
30
- """
31
- self.leaderboard_repo = leaderboard_repo or os.getenv(
32
- 'LEADERBOARD_REPO',
33
- 'kshitijthakkar/smoltrace-leaderboard'
34
- )
35
- self.hf_token = hf_token or os.getenv('HF_TOKEN')
36
-
37
- # Cache for loaded datasets
38
- self._leaderboard_df: Optional[pd.DataFrame] = None
39
- self._results_cache: Dict[str, pd.DataFrame] = {}
40
- self._traces_cache: Dict[str, List[Dict]] = {}
41
- self._metrics_cache: Dict[str, Dict] = {}
42
-
43
- def load_leaderboard(self, force_refresh: bool = False) -> pd.DataFrame:
44
- """
45
- Load leaderboard dataset from HuggingFace
46
 
47
- Args:
48
- force_refresh: Force reload from HF (ignore cache)
 
49
 
50
  Returns:
51
  DataFrame with leaderboard data
52
  """
53
- if self._leaderboard_df is not None and not force_refresh:
54
- return self._leaderboard_df
55
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  try:
57
- print(f"📊 Loading leaderboard from {self.leaderboard_repo}...")
 
 
 
 
 
 
58
 
59
- # Load dataset from HuggingFace
60
- dataset = load_dataset(
61
- self.leaderboard_repo,
62
- split='train',
63
- token=self.hf_token
64
- )
65
 
66
- # Convert to DataFrame
67
- self._leaderboard_df = pd.DataFrame(dataset)
68
 
69
- print(f"✅ Loaded {len(self._leaderboard_df)} evaluation runs")
70
- return self._leaderboard_df
71
 
72
- except Exception as e:
73
- print(f" Error loading leaderboard: {e}")
74
- # Return empty DataFrame with expected columns
75
- return pd.DataFrame(columns=[
76
- 'run_id', 'model', 'agent_type', 'provider',
77
- 'success_rate', 'total_tests', 'successful_tests', 'failed_tests',
78
- 'avg_steps', 'avg_duration_ms', 'total_duration_ms',
79
- 'total_tokens', 'avg_tokens_per_test', 'total_cost_usd', 'avg_cost_per_test_usd',
80
- 'co2_emissions_g', 'gpu_utilization_avg', 'gpu_memory_max_mib',
81
- 'results_dataset', 'traces_dataset', 'metrics_dataset',
82
- 'timestamp', 'submitted_by', 'hf_job_id', 'job_type',
83
- 'dataset_used', 'smoltrace_version'
84
- ])
85
-
86
- def load_results(self, results_repo: str, force_refresh: bool = False) -> pd.DataFrame:
87
  """
88
  Load results dataset for a specific run
89
 
90
  Args:
91
- results_repo: HuggingFace dataset repo for results (e.g., 'user/agent-results-gpt4')
92
- force_refresh: Force reload from HF
93
 
94
  Returns:
95
  DataFrame with test case results
96
  """
97
- if results_repo in self._results_cache and not force_refresh:
98
- return self._results_cache[results_repo]
99
-
100
- try:
101
- print(f"📊 Loading results from {results_repo}...")
102
-
103
- dataset = load_dataset(
104
- results_repo,
105
- split='train',
106
- token=self.hf_token
107
- )
108
-
109
- df = pd.DataFrame(dataset)
110
- self._results_cache[results_repo] = df
111
-
112
- print(f"✅ Loaded {len(df)} test cases")
113
- return df
114
-
115
- except Exception as e:
116
- print(f"❌ Error loading results: {e}")
117
- return pd.DataFrame(columns=[
118
- 'run_id', 'task_id', 'test_index',
119
- 'prompt', 'expected_tool', 'difficulty', 'category',
120
- 'success', 'response', 'tool_called', 'tool_correct',
121
- 'expected_keywords', 'keywords_matched',
122
- 'execution_time_ms', 'total_tokens', 'prompt_tokens', 'completion_tokens', 'cost_usd',
123
- 'trace_id', 'start_time', 'end_time', 'start_time_unix_nano', 'end_time_unix_nano',
124
- 'error', 'error_type'
125
- ])
126
-
127
- def load_traces(self, traces_repo: str, force_refresh: bool = False) -> List[Dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  """
129
  Load traces dataset for a specific run
130
 
131
  Args:
132
- traces_repo: HuggingFace dataset repo for traces
133
- force_refresh: Force reload from HF
134
-
135
- Returns:
136
- List of trace dictionaries (OpenTelemetry format)
137
- """
138
- if traces_repo in self._traces_cache and not force_refresh:
139
- return self._traces_cache[traces_repo]
140
-
141
- try:
142
- print(f"🔍 Loading traces from {traces_repo}...")
143
-
144
- dataset = load_dataset(
145
- traces_repo,
146
- split='train',
147
- token=self.hf_token
148
- )
149
-
150
- # Convert to list of dicts
151
- traces = [dict(item) for item in dataset]
152
- self._traces_cache[traces_repo] = traces
153
-
154
- print(f"✅ Loaded {len(traces)} traces")
155
- return traces
156
-
157
- except Exception as e:
158
- print(f"❌ Error loading traces: {e}")
159
- return []
160
-
161
- def load_metrics(self, metrics_repo: str, force_refresh: bool = False) -> Dict[str, Any]:
162
- """
163
- Load GPU metrics dataset for a specific run
164
-
165
- Args:
166
- metrics_repo: HuggingFace dataset repo for metrics
167
- force_refresh: Force reload from HF
168
 
169
  Returns:
170
- Metrics data (OpenTelemetry metrics format)
171
  """
172
- if metrics_repo in self._metrics_cache and not force_refresh:
173
- return self._metrics_cache[metrics_repo]
174
-
175
- try:
176
- print(f"📈 Loading metrics from {metrics_repo}...")
177
-
178
- dataset = load_dataset(
179
- metrics_repo,
180
- split='train',
181
- token=self.hf_token
182
- )
183
-
184
- # Assume metrics dataset has one row with all metrics
185
- if len(dataset) > 0:
186
- metrics = dict(dataset[0])
187
- self._metrics_cache[metrics_repo] = metrics
188
- print(f"✅ Loaded metrics data")
189
- return metrics
190
- else:
191
- print(f"⚠️ No metrics data found")
192
- return {}
193
-
194
- except Exception as e:
195
- print(f" Error loading metrics: {e}")
196
- return {}
197
-
198
- def get_run_by_id(self, run_id: str) -> Optional[Dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  """
200
- Get a specific run from the leaderboard by run_id
201
 
202
  Args:
203
- run_id: Run ID to fetch
204
 
205
  Returns:
206
- Run data as dict, or None if not found
207
  """
208
- leaderboard_df = self.load_leaderboard()
209
 
210
- run_rows = leaderboard_df[leaderboard_df['run_id'] == run_id]
 
211
 
212
- if len(run_rows) > 0:
213
- return run_rows.iloc[0].to_dict()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  else:
215
- return None
216
-
217
- def get_trace_by_id(self, traces_repo: str, trace_id: str) -> Optional[Dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  """
219
- Get a specific trace by trace_id
220
 
221
  Args:
222
- traces_repo: HuggingFace dataset repo for traces
223
- trace_id: Trace ID to fetch
224
 
225
  Returns:
226
- Trace data as dict, or None if not found
227
  """
228
- traces = self.load_traces(traces_repo)
229
 
230
  for trace in traces:
231
- if trace.get('trace_id') == trace_id or trace.get('traceId') == trace_id:
 
 
 
 
 
 
 
 
232
  return trace
233
 
234
  return None
235
 
236
- def clear_cache(self):
237
- """Clear all cached data"""
238
- self._leaderboard_df = None
239
- self._results_cache.clear()
240
- self._traces_cache.clear()
241
- self._metrics_cache.clear()
242
- print("🧹 Cache cleared")
 
 
 
243
 
244
 
245
- def create_data_loader_from_env() -> TraceMindDataLoader:
246
  """
247
- Create a data loader using environment variables
248
 
249
  Returns:
250
- TraceMindDataLoader instance
251
  """
252
- return TraceMindDataLoader(
253
- leaderboard_repo=os.getenv('LEADERBOARD_REPO'),
254
- hf_token=os.getenv('HF_TOKEN')
 
 
 
 
255
  )
 
1
  """
2
+ Data Loader for MockTraceMind
3
+ Supports loading from both JSON files and HuggingFace datasets
4
  """
5
 
6
  import os
7
+ import json
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Any, Literal
10
  import pandas as pd
11
  from datasets import load_dataset
12
+ from huggingface_hub import HfApi
13
+ import gradio as gr
14
 
 
 
15
 
16
+ DataSource = Literal["json", "huggingface", "both"]
17
 
18
+
19
+ class DataLoader:
20
+ """
21
+ Unified data loader for MockTraceMind
22
+
23
+ Supports:
24
+ - Local JSON files
25
+ - HuggingFace datasets
26
+ - Automatic fallback between sources
27
+ - Caching for performance
28
+ """
29
 
30
  def __init__(
31
  self,
32
+ data_source: DataSource = "both",
33
+ json_data_path: Optional[str] = None,
34
+ leaderboard_dataset: Optional[str] = None,
35
  hf_token: Optional[str] = None
36
  ):
37
+ self.data_source = data_source
38
+ self.json_data_path = Path(json_data_path or os.getenv("JSON_DATA_PATH", "./sample_data"))
39
+ self.leaderboard_dataset = leaderboard_dataset or os.getenv("LEADERBOARD_DATASET", "huggingface/smolagents-leaderboard")
40
+ self.hf_token = hf_token or os.getenv("HF_TOKEN")
41
 
42
+ # Cache
43
+ self._cache: Dict[str, Any] = {}
44
+ self.hf_api = HfApi(token=self.hf_token) if self.hf_token else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ def load_leaderboard(self) -> pd.DataFrame:
47
+ """
48
+ Load leaderboard dataset
49
 
50
  Returns:
51
  DataFrame with leaderboard data
52
  """
53
+ cache_key = "leaderboard"
54
+
55
+ if cache_key in self._cache:
56
+ return self._cache[cache_key]
57
+
58
+ # Try HuggingFace first
59
+ if self.data_source in ["huggingface", "both"]:
60
+ try:
61
+ df = self._load_leaderboard_from_hf()
62
+ self._cache[cache_key] = df
63
+ return df
64
+ except Exception as e:
65
+ print(f"Failed to load from HuggingFace: {e}")
66
+ if self.data_source == "huggingface":
67
+ raise
68
+
69
+ # Fallback to JSON
70
+ if self.data_source in ["json", "both"]:
71
+ try:
72
+ df = self._load_leaderboard_from_json()
73
+ self._cache[cache_key] = df
74
+ return df
75
+ except Exception as e:
76
+ print(f"Failed to load from JSON: {e}")
77
+ raise
78
+
79
+ raise ValueError("No valid data source available")
80
+
81
+ def _load_leaderboard_from_hf(self) -> pd.DataFrame:
82
+ """Load leaderboard from HuggingFace dataset"""
83
  try:
84
+ ds = load_dataset(self.leaderboard_dataset, split="train", token=self.hf_token)
85
+ df = ds.to_pandas()
86
+ print(f"[OK] Loaded leaderboard from HuggingFace: {len(df)} rows")
87
+ return df
88
+ except Exception as e:
89
+ print(f"[ERROR] Loading from HuggingFace: {e}")
90
+ raise
91
 
92
+ def _load_leaderboard_from_json(self) -> pd.DataFrame:
93
+ """Load leaderboard from local JSON file"""
94
+ json_path = self.json_data_path / "leaderboard.json"
 
 
 
95
 
96
+ if not json_path.exists():
97
+ raise FileNotFoundError(f"Leaderboard JSON not found: {json_path}")
98
 
99
+ with open(json_path, "r") as f:
100
+ data = json.load(f)
101
 
102
+ df = pd.DataFrame(data)
103
+ print(f"[OK] Loaded leaderboard from JSON: {len(df)} rows")
104
+ return df
105
+
106
+ def load_results(self, results_dataset: str) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
107
  """
108
  Load results dataset for a specific run
109
 
110
  Args:
111
+ results_dataset: Dataset reference (e.g., "user/agent-results-gpt4")
 
112
 
113
  Returns:
114
  DataFrame with test case results
115
  """
116
+ cache_key = f"results_{results_dataset}"
117
+
118
+ if cache_key in self._cache:
119
+ return self._cache[cache_key]
120
+
121
+ # Try HuggingFace first
122
+ if self.data_source in ["huggingface", "both"]:
123
+ try:
124
+ df = self._load_results_from_hf(results_dataset)
125
+ self._cache[cache_key] = df
126
+ return df
127
+ except Exception as e:
128
+ print(f"Failed to load results from HuggingFace: {e}")
129
+ if self.data_source == "huggingface":
130
+ raise
131
+
132
+ # Fallback to JSON
133
+ if self.data_source in ["json", "both"]:
134
+ try:
135
+ df = self._load_results_from_json(results_dataset)
136
+ self._cache[cache_key] = df
137
+ return df
138
+ except Exception as e:
139
+ print(f"Failed to load results from JSON: {e}")
140
+ raise
141
+
142
+ raise ValueError("No valid data source available")
143
+
144
+ def _load_results_from_hf(self, dataset_id: str) -> pd.DataFrame:
145
+ """Load results from HuggingFace dataset"""
146
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token)
147
+ df = ds.to_pandas()
148
+ print(f"[OK] Loaded results from HuggingFace: {len(df)} rows")
149
+ return df
150
+
151
+ def _load_results_from_json(self, dataset_id: str) -> pd.DataFrame:
152
+ """Load results from local JSON file"""
153
+ # Extract filename from dataset ID (e.g., "user/agent-results-gpt4" -> "results_gpt4.json")
154
+ filename = dataset_id.split("/")[-1].replace("agent-", "") + ".json"
155
+ json_path = self.json_data_path / filename
156
+
157
+ if not json_path.exists():
158
+ raise FileNotFoundError(f"Results JSON not found: {json_path}")
159
+
160
+ with open(json_path, "r") as f:
161
+ data = json.load(f)
162
+
163
+ df = pd.DataFrame(data)
164
+ print(f"[OK] Loaded results from JSON: {len(df)} rows")
165
+ return df
166
+
167
+ def load_traces(self, traces_dataset: str) -> List[Dict[str, Any]]:
168
  """
169
  Load traces dataset for a specific run
170
 
171
  Args:
172
+ traces_dataset: Dataset reference (e.g., "user/agent-traces-gpt4")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  Returns:
175
+ List of trace objects (OpenTelemetry format)
176
  """
177
+ cache_key = f"traces_{traces_dataset}"
178
+
179
+ if cache_key in self._cache:
180
+ return self._cache[cache_key]
181
+
182
+ # Try HuggingFace first
183
+ if self.data_source in ["huggingface", "both"]:
184
+ try:
185
+ traces = self._load_traces_from_hf(traces_dataset)
186
+ self._cache[cache_key] = traces
187
+ return traces
188
+ except Exception as e:
189
+ print(f"Failed to load traces from HuggingFace: {e}")
190
+ if self.data_source == "huggingface":
191
+ raise
192
+
193
+ # Fallback to JSON
194
+ if self.data_source in ["json", "both"]:
195
+ try:
196
+ traces = self._load_traces_from_json(traces_dataset)
197
+ self._cache[cache_key] = traces
198
+ return traces
199
+ except Exception as e:
200
+ print(f"Failed to load traces from JSON: {e}")
201
+ raise
202
+
203
+ raise ValueError("No valid data source available")
204
+
205
+ def _load_traces_from_hf(self, dataset_id: str) -> List[Dict[str, Any]]:
206
+ """Load traces from HuggingFace dataset"""
207
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token)
208
+ traces = ds.to_pandas().to_dict("records")
209
+ print(f"[OK] Loaded traces from HuggingFace: {len(traces)} traces")
210
+ return traces
211
+
212
+ def _load_traces_from_json(self, dataset_id: str) -> List[Dict[str, Any]]:
213
+ """Load traces from local JSON file"""
214
+ filename = dataset_id.split("/")[-1].replace("agent-", "") + ".json"
215
+ json_path = self.json_data_path / filename
216
+
217
+ if not json_path.exists():
218
+ raise FileNotFoundError(f"Traces JSON not found: {json_path}")
219
+
220
+ with open(json_path, "r") as f:
221
+ data = json.load(f)
222
+
223
+ print(f"[OK] Loaded traces from JSON: {len(data)} traces")
224
+ return data
225
+
226
+ def load_metrics(self, metrics_dataset: str) -> pd.DataFrame:
227
  """
228
+ Load metrics dataset for a specific run (GPU metrics)
229
 
230
  Args:
231
+ metrics_dataset: Dataset reference (e.g., "user/agent-metrics-gpt4")
232
 
233
  Returns:
234
+ DataFrame with GPU metrics in flat format (columns: timestamp, gpu_utilization_percent, etc.)
235
  """
236
+ cache_key = f"metrics_{metrics_dataset}"
237
 
238
+ if cache_key in self._cache:
239
+ return self._cache[cache_key]
240
 
241
+ # Try HuggingFace first
242
+ if self.data_source in ["huggingface", "both"]:
243
+ try:
244
+ metrics = self._load_metrics_from_hf(metrics_dataset)
245
+ self._cache[cache_key] = metrics
246
+ return metrics
247
+ except Exception as e:
248
+ print(f"Failed to load metrics from HuggingFace: {e}")
249
+ if self.data_source == "huggingface":
250
+ raise
251
+
252
+ # Fallback to JSON
253
+ if self.data_source in ["json", "both"]:
254
+ try:
255
+ metrics = self._load_metrics_from_json(metrics_dataset)
256
+ self._cache[cache_key] = metrics
257
+ return metrics
258
+ except Exception as e:
259
+ print(f"Failed to load metrics from JSON: {e}")
260
+ # Metrics might not exist for API models, don't raise
261
+ print("⚠️ No metrics available (expected for API models)")
262
+ return pd.DataFrame()
263
+
264
+ return pd.DataFrame()
265
+
266
+ def _load_metrics_from_hf(self, dataset_id: str) -> pd.DataFrame:
267
+ """Load metrics from HuggingFace dataset (flat format)"""
268
+ ds = load_dataset(dataset_id, split="train", token=self.hf_token)
269
+ df = ds.to_pandas()
270
+
271
+ # Convert timestamp strings to datetime if needed
272
+ if 'timestamp' in df.columns:
273
+ df['timestamp'] = pd.to_datetime(df['timestamp'])
274
+
275
+ print(f"[OK] Loaded metrics from HuggingFace: {len(df)} rows")
276
+ print(f" Columns: {list(df.columns)}")
277
+ return df
278
+
279
+ def _load_metrics_from_json(self, dataset_id: str) -> pd.DataFrame:
280
+ """Load metrics from local JSON file"""
281
+ filename = dataset_id.split("/")[-1].replace("agent-", "") + ".json"
282
+ json_path = self.json_data_path / filename
283
+
284
+ if not json_path.exists():
285
+ # Metrics might not exist for API models
286
+ return pd.DataFrame()
287
+
288
+ with open(json_path, "r") as f:
289
+ data = json.load(f)
290
+
291
+ # Check if it's OpenTelemetry format (nested) or flat format
292
+ if isinstance(data, dict) and 'resourceMetrics' in data:
293
+ # Legacy OpenTelemetry format - convert to flat format
294
+ df = self._convert_otel_to_flat(data)
295
+ elif isinstance(data, list):
296
+ df = pd.DataFrame(data)
297
  else:
298
+ df = pd.DataFrame()
299
+
300
+ # Convert timestamp strings to datetime if needed
301
+ if 'timestamp' in df.columns and not df.empty:
302
+ df['timestamp'] = pd.to_datetime(df['timestamp'])
303
+
304
+ print(f"[OK] Loaded metrics from JSON: {len(df)} rows")
305
+ return df
306
+
307
+ def _convert_otel_to_flat(self, otel_data: Dict[str, Any]) -> pd.DataFrame:
308
+ """Convert OpenTelemetry resourceMetrics format to flat DataFrame"""
309
+ rows = []
310
+
311
+ for resource_metric in otel_data.get('resourceMetrics', []):
312
+ for scope_metric in resource_metric.get('scopeMetrics', []):
313
+ for metric in scope_metric.get('metrics', []):
314
+ metric_name = metric.get('name', '')
315
+
316
+ # Handle gauge metrics
317
+ if 'gauge' in metric:
318
+ for data_point in metric['gauge'].get('dataPoints', []):
319
+ row = self._extract_data_point(metric_name, data_point, metric.get('unit', ''))
320
+ if row:
321
+ rows.append(row)
322
+
323
+ # Handle sum metrics (like CO2)
324
+ elif 'sum' in metric:
325
+ for data_point in metric['sum'].get('dataPoints', []):
326
+ row = self._extract_data_point(metric_name, data_point, metric.get('unit', ''))
327
+ if row:
328
+ rows.append(row)
329
+
330
+ return pd.DataFrame(rows)
331
+
332
+ def _extract_data_point(self, metric_name: str, data_point: Dict, unit: str) -> Optional[Dict[str, Any]]:
333
+ """Extract a single data point from OpenTelemetry format to flat row"""
334
+ # Get GPU attributes
335
+ gpu_id = None
336
+ gpu_name = None
337
+ for attr in data_point.get('attributes', []):
338
+ if attr.get('key') == 'gpu_id':
339
+ gpu_id = attr.get('value', {}).get('stringValue', '')
340
+ elif attr.get('key') == 'gpu_name':
341
+ gpu_name = attr.get('value', {}).get('stringValue', '')
342
+
343
+ # Get value
344
+ value = None
345
+ if 'asInt' in data_point and data_point['asInt'] is not None:
346
+ value = int(data_point['asInt'])
347
+ elif 'asDouble' in data_point and data_point['asDouble'] is not None:
348
+ value = float(data_point['asDouble'])
349
+
350
+ # Get timestamp
351
+ timestamp_nano = data_point.get('timeUnixNano', '')
352
+ if timestamp_nano:
353
+ timestamp_sec = int(timestamp_nano) / 1e9
354
+ timestamp = pd.to_datetime(timestamp_sec, unit='s')
355
+ else:
356
+ timestamp = None
357
+
358
+ # Map metric names to column names
359
+ metric_col_map = {
360
+ 'gen_ai.gpu.utilization': 'gpu_utilization_percent',
361
+ 'gen_ai.gpu.memory.used': 'gpu_memory_used_mib',
362
+ 'gen_ai.gpu.temperature': 'gpu_temperature_celsius',
363
+ 'gen_ai.gpu.power': 'gpu_power_watts',
364
+ 'gen_ai.co2.emissions': 'co2_emissions_gco2e'
365
+ }
366
+
367
+ return {
368
+ 'timestamp': timestamp,
369
+ 'timestamp_unix_nano': timestamp_nano,
370
+ 'gpu_id': gpu_id,
371
+ 'gpu_name': gpu_name,
372
+ 'metric_name': metric_name,
373
+ 'value': value,
374
+ 'unit': unit
375
+ }
376
+
377
+ def get_trace_by_id(self, traces_dataset: str, trace_id: str) -> Optional[Dict[str, Any]]:
378
  """
379
+ Get a specific trace by ID
380
 
381
  Args:
382
+ traces_dataset: Dataset reference
383
+ trace_id: Trace ID to find
384
 
385
  Returns:
386
+ Trace object or None if not found
387
  """
388
+ traces = self.load_traces(traces_dataset)
389
 
390
  for trace in traces:
391
+ if trace.get("trace_id") == trace_id or trace.get("traceId") == trace_id:
392
+ # Ensure spans is a proper list (not numpy array or pandas Series)
393
+ if "spans" in trace:
394
+ spans = trace["spans"]
395
+ if hasattr(spans, 'tolist'):
396
+ trace["spans"] = spans.tolist()
397
+ elif not isinstance(spans, list):
398
+ trace["spans"] = list(spans) if spans is not None else []
399
+
400
  return trace
401
 
402
  return None
403
 
404
+ def clear_cache(self) -> None:
405
+ """Clear the internal cache"""
406
+ self._cache.clear()
407
+ print("[OK] Cache cleared")
408
+
409
+ def refresh_leaderboard(self) -> pd.DataFrame:
410
+ """Refresh leaderboard data (clear cache and reload)"""
411
+ if "leaderboard" in self._cache:
412
+ del self._cache["leaderboard"]
413
+ return self.load_leaderboard()
414
 
415
 
416
def create_data_loader_from_env() -> DataLoader:
    """
    Create a DataLoader instance configured entirely from environment variables.

    Reads DATA_SOURCE (default "both"), JSON_DATA_PATH, LEADERBOARD_DATASET,
    and HF_TOKEN.

    Returns:
        Configured DataLoader instance
    """
    return DataLoader(
        data_source=os.getenv("DATA_SOURCE", "both"),
        json_data_path=os.getenv("JSON_DATA_PATH"),
        leaderboard_dataset=os.getenv("LEADERBOARD_DATASET"),
        hf_token=os.getenv("HF_TOKEN"),
    )
sample_data/generate_sample_metrics.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate sample metrics data in OpenTelemetry resourceMetrics format.
3
+ This simulates what SMOLTRACE would produce for GPU and API evaluation runs.
4
+ """
5
+
6
+ import json
7
+ import time
8
+ from datetime import datetime, timedelta
9
+ from pathlib import Path
10
+
11
+
12
def _gpu_data_point(time_unix_nano: str, value_key: str, value, with_gpu_name: bool = True):
    """Build one OpenTelemetry dataPoint for GPU 0 (optionally tagged with the GPU name)."""
    attributes = [{"key": "gpu_id", "value": {"stringValue": "0"}}]
    if with_gpu_name:
        attributes.append({"key": "gpu_name", "value": {"stringValue": "NVIDIA H200"}})
    return {"attributes": attributes, "timeUnixNano": time_unix_nano, value_key: value}


def generate_gpu_sample_metrics(
    run_id: str = "run_002_llama31",
    duration_seconds: int = 120,
    interval_seconds: int = 10
):
    """
    Generate sample GPU metrics data for a GPU model run.

    Args:
        run_id: Run identifier
        duration_seconds: Total duration of simulated run
        interval_seconds: Interval between data points

    Returns:
        Dict in OpenTelemetry resourceMetrics format with five metrics
        (utilization, memory, temperature, power as gauges; CO2 as a
        monotonic cumulative sum), each with duration/interval data points.
    """
    start_time = datetime.now()
    num_points = duration_seconds // interval_seconds

    utilization_points = []
    memory_points = []
    temperature_points = []
    power_points = []
    co2_points = []

    cumulative_co2 = 0.0

    for i in range(num_points):
        timestamp = start_time + timedelta(seconds=i * interval_seconds)
        time_unix_nano = str(int(timestamp.timestamp() * 1e9))

        # Simulate realistic GPU metrics with some variation
        # Pattern: Higher utilization during inference, lower during idle
        utilization = 45 + (i % 5) * 10 + (i % 2) * 5  # 45-70%
        memory = 4096 + i * 100  # Gradually increasing memory usage
        temperature = 70 + (i % 6) * 2  # 70-80°C
        power = 250 + (i % 7) * 30  # 250-400W

        # Cumulative CO2 (monotonic increasing)
        # Rough estimate: power (W) * time (h) * carbon intensity (g/kWh)
        cumulative_co2 += (power / 1000.0) * (interval_seconds / 3600.0) * 400  # 400g/kWh assumed

        # Integers are encoded as strings under asInt, floats under asDouble.
        utilization_points.append(_gpu_data_point(time_unix_nano, "asInt", str(utilization)))
        memory_points.append(_gpu_data_point(time_unix_nano, "asDouble", float(memory)))
        temperature_points.append(_gpu_data_point(time_unix_nano, "asInt", str(temperature)))
        power_points.append(_gpu_data_point(time_unix_nano, "asDouble", float(power)))
        co2_points.append(_gpu_data_point(time_unix_nano, "asDouble", cumulative_co2, with_gpu_name=False))

    # Construct resourceMetrics structure (OpenTelemetry format)
    return {
        "run_id": run_id,
        "resourceMetrics": [{
            "resource": {
                "attributes": [
                    {"key": "telemetry.sdk.language", "value": {"stringValue": "python"}},
                    {"key": "telemetry.sdk.name", "value": {"stringValue": "opentelemetry"}},
                    {"key": "telemetry.sdk.version", "value": {"stringValue": "1.37.0"}},
                    {"key": "service.name", "value": {"stringValue": "smoltrace-eval"}},
                    {"key": "run.id", "value": {"stringValue": run_id}}
                ]
            },
            "scopeMetrics": [{
                "scope": {"name": "genai.gpu", "version": None},
                "metrics": [
                    {
                        "name": "gen_ai.gpu.utilization",
                        "description": "GPU utilization percentage",
                        "unit": "%",
                        "gauge": {"dataPoints": utilization_points}
                    },
                    {
                        "name": "gen_ai.gpu.memory.used",
                        "description": "GPU memory used in MiB",
                        "unit": "MiB",
                        "gauge": {"dataPoints": memory_points}
                    },
                    {
                        "name": "gen_ai.gpu.temperature",
                        "description": "GPU temperature in Celsius",
                        "unit": "Cel",
                        "gauge": {"dataPoints": temperature_points}
                    },
                    {
                        "name": "gen_ai.gpu.power",
                        "description": "GPU power consumption in Watts",
                        "unit": "W",
                        "gauge": {"dataPoints": power_points}
                    },
                    {
                        "name": "gen_ai.co2.emissions",
                        "description": "Cumulative CO2 equivalent emissions in grams",
                        "unit": "gCO2e",
                        "sum": {
                            "dataPoints": co2_points,
                            "aggregationTemporality": 2,  # CUMULATIVE
                            "isMonotonic": True
                        }
                    }
                ]
            }]
        }]
    }
157
+
158
+
159
def generate_api_sample_metrics(run_id: str = "run_001_gpt4"):
    """
    Generate minimal sample metrics for an API model run (no GPU).

    Args:
        run_id: Run identifier

    Returns:
        Dict with empty resourceMetrics (API models don't have GPU)
    """
    return {"run_id": run_id, "resourceMetrics": []}
173
+
174
+
175
+ if __name__ == "__main__":
176
+ # Create output directory
177
+ output_dir = Path(__file__).parent
178
+ output_dir.mkdir(parents=True, exist_ok=True)
179
+
180
+ print("Generating sample metrics data...")
181
+
182
+ # Generate GPU model metrics (Llama 3.1 on H200)
183
+ gpu_metrics = generate_gpu_sample_metrics(
184
+ run_id="run_002_llama31",
185
+ duration_seconds=120,
186
+ interval_seconds=10
187
+ )
188
+
189
+ output_file = output_dir / "metrics_llama31.json"
190
+ with open(output_file, "w") as f:
191
+ json.dump(gpu_metrics, f, indent=2)
192
+ print(f"[OK] Generated GPU metrics: {output_file}")
193
+ print(f" - {len(gpu_metrics['resourceMetrics'][0]['scopeMetrics'][0]['metrics'])} metric types")
194
+ print(f" - {len(gpu_metrics['resourceMetrics'][0]['scopeMetrics'][0]['metrics'][0]['gauge']['dataPoints'])} data points per metric")
195
+
196
+ # Generate API model metrics (GPT-4 - no GPU)
197
+ api_metrics = generate_api_sample_metrics(run_id="run_001_gpt4")
198
+
199
+ output_file = output_dir / "metrics_gpt4.json"
200
+ with open(output_file, "w") as f:
201
+ json.dump(api_metrics, f, indent=2)
202
+ print(f"[OK] Generated API metrics: {output_file}")
203
+ print(f" - Empty resourceMetrics (API model has no GPU)")
204
+
205
+ print("\n[SUCCESS] Sample metrics data generation complete!")
206
+ print("\nYou can now test the visualization with:")
207
+ print(" python gpu_metrics_with_time_series.py")
sample_data/leaderboard.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "run_id": "run_001_gpt4",
4
+ "model": "openai/gpt-4",
5
+ "agent_type": "both",
6
+ "provider": "litellm",
7
+ "success_rate": 95.8,
8
+ "total_tests": 100,
9
+ "successful_tests": 96,
10
+ "failed_tests": 4,
11
+ "avg_steps": 2.5,
12
+ "avg_duration_ms": 3200.0,
13
+ "total_duration_ms": 320000.0,
14
+ "total_tokens": 15000,
15
+ "avg_tokens_per_test": 150,
16
+ "total_cost_usd": 0.05,
17
+ "avg_cost_per_test_usd": 0.0005,
18
+ "co2_emissions_g": 0.22,
19
+ "gpu_utilization_avg": null,
20
+ "gpu_memory_max_mib": null,
21
+ "results_dataset": "test/results_gpt4",
22
+ "traces_dataset": "test/traces_gpt4",
23
+ "metrics_dataset": "test/metrics_gpt4",
24
+ "timestamp": "2025-01-16T14:23:00Z",
25
+ "submitted_by": "test_user",
26
+ "hf_job_id": "job_12345",
27
+ "job_type": "cpu",
28
+ "dataset_used": "huggingface/smolagents/tasks",
29
+ "smoltrace_version": "0.1.0"
30
+ },
31
+ {
32
+ "run_id": "run_002_llama31",
33
+ "model": "meta-llama/Llama-3.1-8B",
34
+ "agent_type": "both",
35
+ "provider": "transformers",
36
+ "success_rate": 93.4,
37
+ "total_tests": 100,
38
+ "successful_tests": 93,
39
+ "failed_tests": 7,
40
+ "avg_steps": 2.8,
41
+ "avg_duration_ms": 2100.0,
42
+ "total_duration_ms": 210000.0,
43
+ "total_tokens": 12500,
44
+ "avg_tokens_per_test": 125,
45
+ "total_cost_usd": 0.002,
46
+ "avg_cost_per_test_usd": 0.00002,
47
+ "co2_emissions_g": 1.45,
48
+ "gpu_utilization_avg": 67.5,
49
+ "gpu_memory_max_mib": 512.34,
50
+ "results_dataset": "test/results_llama31",
51
+ "traces_dataset": "test/traces_llama31",
52
+ "metrics_dataset": "test/metrics_llama31",
53
+ "timestamp": "2025-01-16T15:10:00Z",
54
+ "submitted_by": "test_user",
55
+ "hf_job_id": "job_12346",
56
+ "job_type": "gpu_h200",
57
+ "dataset_used": "huggingface/smolagents/tasks",
58
+ "smoltrace_version": "0.1.0"
59
+ },
60
+ {
61
+ "run_id": "run_003_claude",
62
+ "model": "anthropic/claude-3-haiku",
63
+ "agent_type": "tool",
64
+ "provider": "litellm",
65
+ "success_rate": 92.1,
66
+ "total_tests": 100,
67
+ "successful_tests": 92,
68
+ "failed_tests": 8,
69
+ "avg_steps": 2.2,
70
+ "avg_duration_ms": 2800.0,
71
+ "total_duration_ms": 280000.0,
72
+ "total_tokens": 11200,
73
+ "avg_tokens_per_test": 112,
74
+ "total_cost_usd": 0.012,
75
+ "avg_cost_per_test_usd": 0.00012,
76
+ "co2_emissions_g": 0.15,
77
+ "gpu_utilization_avg": null,
78
+ "gpu_memory_max_mib": null,
79
+ "results_dataset": "test/results_claude",
80
+ "traces_dataset": "test/traces_claude",
81
+ "metrics_dataset": "test/metrics_claude",
82
+ "timestamp": "2025-01-16T16:45:00Z",
83
+ "submitted_by": "test_user",
84
+ "hf_job_id": "job_12347",
85
+ "job_type": "cpu",
86
+ "dataset_used": "huggingface/smolagents/tasks",
87
+ "smoltrace_version": "0.1.0"
88
+ }
89
+ ]
sample_data/metrics_gpt4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ []
sample_data/metrics_llama31.json ADDED
@@ -0,0 +1,1106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_id": "run_002_llama31",
3
+ "resourceMetrics": [
4
+ {
5
+ "resource": {
6
+ "attributes": [
7
+ {
8
+ "key": "telemetry.sdk.language",
9
+ "value": {
10
+ "stringValue": "python"
11
+ }
12
+ },
13
+ {
14
+ "key": "telemetry.sdk.name",
15
+ "value": {
16
+ "stringValue": "opentelemetry"
17
+ }
18
+ },
19
+ {
20
+ "key": "telemetry.sdk.version",
21
+ "value": {
22
+ "stringValue": "1.37.0"
23
+ }
24
+ },
25
+ {
26
+ "key": "service.name",
27
+ "value": {
28
+ "stringValue": "smoltrace-eval"
29
+ }
30
+ },
31
+ {
32
+ "key": "run.id",
33
+ "value": {
34
+ "stringValue": "run_002_llama31"
35
+ }
36
+ }
37
+ ]
38
+ },
39
+ "scopeMetrics": [
40
+ {
41
+ "scope": {
42
+ "name": "genai.gpu",
43
+ "version": null
44
+ },
45
+ "metrics": [
46
+ {
47
+ "name": "gen_ai.gpu.utilization",
48
+ "description": "GPU utilization percentage",
49
+ "unit": "%",
50
+ "gauge": {
51
+ "dataPoints": [
52
+ {
53
+ "attributes": [
54
+ {
55
+ "key": "gpu_id",
56
+ "value": {
57
+ "stringValue": "0"
58
+ }
59
+ },
60
+ {
61
+ "key": "gpu_name",
62
+ "value": {
63
+ "stringValue": "NVIDIA H200"
64
+ }
65
+ }
66
+ ],
67
+ "timeUnixNano": "1761242554199441920",
68
+ "asInt": "45"
69
+ },
70
+ {
71
+ "attributes": [
72
+ {
73
+ "key": "gpu_id",
74
+ "value": {
75
+ "stringValue": "0"
76
+ }
77
+ },
78
+ {
79
+ "key": "gpu_name",
80
+ "value": {
81
+ "stringValue": "NVIDIA H200"
82
+ }
83
+ }
84
+ ],
85
+ "timeUnixNano": "1761242564199441920",
86
+ "asInt": "60"
87
+ },
88
+ {
89
+ "attributes": [
90
+ {
91
+ "key": "gpu_id",
92
+ "value": {
93
+ "stringValue": "0"
94
+ }
95
+ },
96
+ {
97
+ "key": "gpu_name",
98
+ "value": {
99
+ "stringValue": "NVIDIA H200"
100
+ }
101
+ }
102
+ ],
103
+ "timeUnixNano": "1761242574199441920",
104
+ "asInt": "65"
105
+ },
106
+ {
107
+ "attributes": [
108
+ {
109
+ "key": "gpu_id",
110
+ "value": {
111
+ "stringValue": "0"
112
+ }
113
+ },
114
+ {
115
+ "key": "gpu_name",
116
+ "value": {
117
+ "stringValue": "NVIDIA H200"
118
+ }
119
+ }
120
+ ],
121
+ "timeUnixNano": "1761242584199441920",
122
+ "asInt": "80"
123
+ },
124
+ {
125
+ "attributes": [
126
+ {
127
+ "key": "gpu_id",
128
+ "value": {
129
+ "stringValue": "0"
130
+ }
131
+ },
132
+ {
133
+ "key": "gpu_name",
134
+ "value": {
135
+ "stringValue": "NVIDIA H200"
136
+ }
137
+ }
138
+ ],
139
+ "timeUnixNano": "1761242594199441920",
140
+ "asInt": "85"
141
+ },
142
+ {
143
+ "attributes": [
144
+ {
145
+ "key": "gpu_id",
146
+ "value": {
147
+ "stringValue": "0"
148
+ }
149
+ },
150
+ {
151
+ "key": "gpu_name",
152
+ "value": {
153
+ "stringValue": "NVIDIA H200"
154
+ }
155
+ }
156
+ ],
157
+ "timeUnixNano": "1761242604199441920",
158
+ "asInt": "50"
159
+ },
160
+ {
161
+ "attributes": [
162
+ {
163
+ "key": "gpu_id",
164
+ "value": {
165
+ "stringValue": "0"
166
+ }
167
+ },
168
+ {
169
+ "key": "gpu_name",
170
+ "value": {
171
+ "stringValue": "NVIDIA H200"
172
+ }
173
+ }
174
+ ],
175
+ "timeUnixNano": "1761242614199441920",
176
+ "asInt": "55"
177
+ },
178
+ {
179
+ "attributes": [
180
+ {
181
+ "key": "gpu_id",
182
+ "value": {
183
+ "stringValue": "0"
184
+ }
185
+ },
186
+ {
187
+ "key": "gpu_name",
188
+ "value": {
189
+ "stringValue": "NVIDIA H200"
190
+ }
191
+ }
192
+ ],
193
+ "timeUnixNano": "1761242624199441920",
194
+ "asInt": "70"
195
+ },
196
+ {
197
+ "attributes": [
198
+ {
199
+ "key": "gpu_id",
200
+ "value": {
201
+ "stringValue": "0"
202
+ }
203
+ },
204
+ {
205
+ "key": "gpu_name",
206
+ "value": {
207
+ "stringValue": "NVIDIA H200"
208
+ }
209
+ }
210
+ ],
211
+ "timeUnixNano": "1761242634199441920",
212
+ "asInt": "75"
213
+ },
214
+ {
215
+ "attributes": [
216
+ {
217
+ "key": "gpu_id",
218
+ "value": {
219
+ "stringValue": "0"
220
+ }
221
+ },
222
+ {
223
+ "key": "gpu_name",
224
+ "value": {
225
+ "stringValue": "NVIDIA H200"
226
+ }
227
+ }
228
+ ],
229
+ "timeUnixNano": "1761242644199441920",
230
+ "asInt": "90"
231
+ },
232
+ {
233
+ "attributes": [
234
+ {
235
+ "key": "gpu_id",
236
+ "value": {
237
+ "stringValue": "0"
238
+ }
239
+ },
240
+ {
241
+ "key": "gpu_name",
242
+ "value": {
243
+ "stringValue": "NVIDIA H200"
244
+ }
245
+ }
246
+ ],
247
+ "timeUnixNano": "1761242654199441920",
248
+ "asInt": "45"
249
+ },
250
+ {
251
+ "attributes": [
252
+ {
253
+ "key": "gpu_id",
254
+ "value": {
255
+ "stringValue": "0"
256
+ }
257
+ },
258
+ {
259
+ "key": "gpu_name",
260
+ "value": {
261
+ "stringValue": "NVIDIA H200"
262
+ }
263
+ }
264
+ ],
265
+ "timeUnixNano": "1761242664199441920",
266
+ "asInt": "60"
267
+ }
268
+ ]
269
+ }
270
+ },
271
+ {
272
+ "name": "gen_ai.gpu.memory.used",
273
+ "description": "GPU memory used in MiB",
274
+ "unit": "MiB",
275
+ "gauge": {
276
+ "dataPoints": [
277
+ {
278
+ "attributes": [
279
+ {
280
+ "key": "gpu_id",
281
+ "value": {
282
+ "stringValue": "0"
283
+ }
284
+ },
285
+ {
286
+ "key": "gpu_name",
287
+ "value": {
288
+ "stringValue": "NVIDIA H200"
289
+ }
290
+ }
291
+ ],
292
+ "timeUnixNano": "1761242554199441920",
293
+ "asDouble": 4096.0
294
+ },
295
+ {
296
+ "attributes": [
297
+ {
298
+ "key": "gpu_id",
299
+ "value": {
300
+ "stringValue": "0"
301
+ }
302
+ },
303
+ {
304
+ "key": "gpu_name",
305
+ "value": {
306
+ "stringValue": "NVIDIA H200"
307
+ }
308
+ }
309
+ ],
310
+ "timeUnixNano": "1761242564199441920",
311
+ "asDouble": 4196.0
312
+ },
313
+ {
314
+ "attributes": [
315
+ {
316
+ "key": "gpu_id",
317
+ "value": {
318
+ "stringValue": "0"
319
+ }
320
+ },
321
+ {
322
+ "key": "gpu_name",
323
+ "value": {
324
+ "stringValue": "NVIDIA H200"
325
+ }
326
+ }
327
+ ],
328
+ "timeUnixNano": "1761242574199441920",
329
+ "asDouble": 4296.0
330
+ },
331
+ {
332
+ "attributes": [
333
+ {
334
+ "key": "gpu_id",
335
+ "value": {
336
+ "stringValue": "0"
337
+ }
338
+ },
339
+ {
340
+ "key": "gpu_name",
341
+ "value": {
342
+ "stringValue": "NVIDIA H200"
343
+ }
344
+ }
345
+ ],
346
+ "timeUnixNano": "1761242584199441920",
347
+ "asDouble": 4396.0
348
+ },
349
+ {
350
+ "attributes": [
351
+ {
352
+ "key": "gpu_id",
353
+ "value": {
354
+ "stringValue": "0"
355
+ }
356
+ },
357
+ {
358
+ "key": "gpu_name",
359
+ "value": {
360
+ "stringValue": "NVIDIA H200"
361
+ }
362
+ }
363
+ ],
364
+ "timeUnixNano": "1761242594199441920",
365
+ "asDouble": 4496.0
366
+ },
367
+ {
368
+ "attributes": [
369
+ {
370
+ "key": "gpu_id",
371
+ "value": {
372
+ "stringValue": "0"
373
+ }
374
+ },
375
+ {
376
+ "key": "gpu_name",
377
+ "value": {
378
+ "stringValue": "NVIDIA H200"
379
+ }
380
+ }
381
+ ],
382
+ "timeUnixNano": "1761242604199441920",
383
+ "asDouble": 4596.0
384
+ },
385
+ {
386
+ "attributes": [
387
+ {
388
+ "key": "gpu_id",
389
+ "value": {
390
+ "stringValue": "0"
391
+ }
392
+ },
393
+ {
394
+ "key": "gpu_name",
395
+ "value": {
396
+ "stringValue": "NVIDIA H200"
397
+ }
398
+ }
399
+ ],
400
+ "timeUnixNano": "1761242614199441920",
401
+ "asDouble": 4696.0
402
+ },
403
+ {
404
+ "attributes": [
405
+ {
406
+ "key": "gpu_id",
407
+ "value": {
408
+ "stringValue": "0"
409
+ }
410
+ },
411
+ {
412
+ "key": "gpu_name",
413
+ "value": {
414
+ "stringValue": "NVIDIA H200"
415
+ }
416
+ }
417
+ ],
418
+ "timeUnixNano": "1761242624199441920",
419
+ "asDouble": 4796.0
420
+ },
421
+ {
422
+ "attributes": [
423
+ {
424
+ "key": "gpu_id",
425
+ "value": {
426
+ "stringValue": "0"
427
+ }
428
+ },
429
+ {
430
+ "key": "gpu_name",
431
+ "value": {
432
+ "stringValue": "NVIDIA H200"
433
+ }
434
+ }
435
+ ],
436
+ "timeUnixNano": "1761242634199441920",
437
+ "asDouble": 4896.0
438
+ },
439
+ {
440
+ "attributes": [
441
+ {
442
+ "key": "gpu_id",
443
+ "value": {
444
+ "stringValue": "0"
445
+ }
446
+ },
447
+ {
448
+ "key": "gpu_name",
449
+ "value": {
450
+ "stringValue": "NVIDIA H200"
451
+ }
452
+ }
453
+ ],
454
+ "timeUnixNano": "1761242644199441920",
455
+ "asDouble": 4996.0
456
+ },
457
+ {
458
+ "attributes": [
459
+ {
460
+ "key": "gpu_id",
461
+ "value": {
462
+ "stringValue": "0"
463
+ }
464
+ },
465
+ {
466
+ "key": "gpu_name",
467
+ "value": {
468
+ "stringValue": "NVIDIA H200"
469
+ }
470
+ }
471
+ ],
472
+ "timeUnixNano": "1761242654199441920",
473
+ "asDouble": 5096.0
474
+ },
475
+ {
476
+ "attributes": [
477
+ {
478
+ "key": "gpu_id",
479
+ "value": {
480
+ "stringValue": "0"
481
+ }
482
+ },
483
+ {
484
+ "key": "gpu_name",
485
+ "value": {
486
+ "stringValue": "NVIDIA H200"
487
+ }
488
+ }
489
+ ],
490
+ "timeUnixNano": "1761242664199441920",
491
+ "asDouble": 5196.0
492
+ }
493
+ ]
494
+ }
495
+ },
496
+ {
497
+ "name": "gen_ai.gpu.temperature",
498
+ "description": "GPU temperature in Celsius",
499
+ "unit": "Cel",
500
+ "gauge": {
501
+ "dataPoints": [
502
+ {
503
+ "attributes": [
504
+ {
505
+ "key": "gpu_id",
506
+ "value": {
507
+ "stringValue": "0"
508
+ }
509
+ },
510
+ {
511
+ "key": "gpu_name",
512
+ "value": {
513
+ "stringValue": "NVIDIA H200"
514
+ }
515
+ }
516
+ ],
517
+ "timeUnixNano": "1761242554199441920",
518
+ "asInt": "70"
519
+ },
520
+ {
521
+ "attributes": [
522
+ {
523
+ "key": "gpu_id",
524
+ "value": {
525
+ "stringValue": "0"
526
+ }
527
+ },
528
+ {
529
+ "key": "gpu_name",
530
+ "value": {
531
+ "stringValue": "NVIDIA H200"
532
+ }
533
+ }
534
+ ],
535
+ "timeUnixNano": "1761242564199441920",
536
+ "asInt": "72"
537
+ },
538
+ {
539
+ "attributes": [
540
+ {
541
+ "key": "gpu_id",
542
+ "value": {
543
+ "stringValue": "0"
544
+ }
545
+ },
546
+ {
547
+ "key": "gpu_name",
548
+ "value": {
549
+ "stringValue": "NVIDIA H200"
550
+ }
551
+ }
552
+ ],
553
+ "timeUnixNano": "1761242574199441920",
554
+ "asInt": "74"
555
+ },
556
+ {
557
+ "attributes": [
558
+ {
559
+ "key": "gpu_id",
560
+ "value": {
561
+ "stringValue": "0"
562
+ }
563
+ },
564
+ {
565
+ "key": "gpu_name",
566
+ "value": {
567
+ "stringValue": "NVIDIA H200"
568
+ }
569
+ }
570
+ ],
571
+ "timeUnixNano": "1761242584199441920",
572
+ "asInt": "76"
573
+ },
574
+ {
575
+ "attributes": [
576
+ {
577
+ "key": "gpu_id",
578
+ "value": {
579
+ "stringValue": "0"
580
+ }
581
+ },
582
+ {
583
+ "key": "gpu_name",
584
+ "value": {
585
+ "stringValue": "NVIDIA H200"
586
+ }
587
+ }
588
+ ],
589
+ "timeUnixNano": "1761242594199441920",
590
+ "asInt": "78"
591
+ },
592
+ {
593
+ "attributes": [
594
+ {
595
+ "key": "gpu_id",
596
+ "value": {
597
+ "stringValue": "0"
598
+ }
599
+ },
600
+ {
601
+ "key": "gpu_name",
602
+ "value": {
603
+ "stringValue": "NVIDIA H200"
604
+ }
605
+ }
606
+ ],
607
+ "timeUnixNano": "1761242604199441920",
608
+ "asInt": "80"
609
+ },
610
+ {
611
+ "attributes": [
612
+ {
613
+ "key": "gpu_id",
614
+ "value": {
615
+ "stringValue": "0"
616
+ }
617
+ },
618
+ {
619
+ "key": "gpu_name",
620
+ "value": {
621
+ "stringValue": "NVIDIA H200"
622
+ }
623
+ }
624
+ ],
625
+ "timeUnixNano": "1761242614199441920",
626
+ "asInt": "70"
627
+ },
628
+ {
629
+ "attributes": [
630
+ {
631
+ "key": "gpu_id",
632
+ "value": {
633
+ "stringValue": "0"
634
+ }
635
+ },
636
+ {
637
+ "key": "gpu_name",
638
+ "value": {
639
+ "stringValue": "NVIDIA H200"
640
+ }
641
+ }
642
+ ],
643
+ "timeUnixNano": "1761242624199441920",
644
+ "asInt": "72"
645
+ },
646
+ {
647
+ "attributes": [
648
+ {
649
+ "key": "gpu_id",
650
+ "value": {
651
+ "stringValue": "0"
652
+ }
653
+ },
654
+ {
655
+ "key": "gpu_name",
656
+ "value": {
657
+ "stringValue": "NVIDIA H200"
658
+ }
659
+ }
660
+ ],
661
+ "timeUnixNano": "1761242634199441920",
662
+ "asInt": "74"
663
+ },
664
+ {
665
+ "attributes": [
666
+ {
667
+ "key": "gpu_id",
668
+ "value": {
669
+ "stringValue": "0"
670
+ }
671
+ },
672
+ {
673
+ "key": "gpu_name",
674
+ "value": {
675
+ "stringValue": "NVIDIA H200"
676
+ }
677
+ }
678
+ ],
679
+ "timeUnixNano": "1761242644199441920",
680
+ "asInt": "76"
681
+ },
682
+ {
683
+ "attributes": [
684
+ {
685
+ "key": "gpu_id",
686
+ "value": {
687
+ "stringValue": "0"
688
+ }
689
+ },
690
+ {
691
+ "key": "gpu_name",
692
+ "value": {
693
+ "stringValue": "NVIDIA H200"
694
+ }
695
+ }
696
+ ],
697
+ "timeUnixNano": "1761242654199441920",
698
+ "asInt": "78"
699
+ },
700
+ {
701
+ "attributes": [
702
+ {
703
+ "key": "gpu_id",
704
+ "value": {
705
+ "stringValue": "0"
706
+ }
707
+ },
708
+ {
709
+ "key": "gpu_name",
710
+ "value": {
711
+ "stringValue": "NVIDIA H200"
712
+ }
713
+ }
714
+ ],
715
+ "timeUnixNano": "1761242664199441920",
716
+ "asInt": "80"
717
+ }
718
+ ]
719
+ }
720
+ },
721
+ {
722
+ "name": "gen_ai.gpu.power",
723
+ "description": "GPU power consumption in Watts",
724
+ "unit": "W",
725
+ "gauge": {
726
+ "dataPoints": [
727
+ {
728
+ "attributes": [
729
+ {
730
+ "key": "gpu_id",
731
+ "value": {
732
+ "stringValue": "0"
733
+ }
734
+ },
735
+ {
736
+ "key": "gpu_name",
737
+ "value": {
738
+ "stringValue": "NVIDIA H200"
739
+ }
740
+ }
741
+ ],
742
+ "timeUnixNano": "1761242554199441920",
743
+ "asDouble": 250.0
744
+ },
745
+ {
746
+ "attributes": [
747
+ {
748
+ "key": "gpu_id",
749
+ "value": {
750
+ "stringValue": "0"
751
+ }
752
+ },
753
+ {
754
+ "key": "gpu_name",
755
+ "value": {
756
+ "stringValue": "NVIDIA H200"
757
+ }
758
+ }
759
+ ],
760
+ "timeUnixNano": "1761242564199441920",
761
+ "asDouble": 280.0
762
+ },
763
+ {
764
+ "attributes": [
765
+ {
766
+ "key": "gpu_id",
767
+ "value": {
768
+ "stringValue": "0"
769
+ }
770
+ },
771
+ {
772
+ "key": "gpu_name",
773
+ "value": {
774
+ "stringValue": "NVIDIA H200"
775
+ }
776
+ }
777
+ ],
778
+ "timeUnixNano": "1761242574199441920",
779
+ "asDouble": 310.0
780
+ },
781
+ {
782
+ "attributes": [
783
+ {
784
+ "key": "gpu_id",
785
+ "value": {
786
+ "stringValue": "0"
787
+ }
788
+ },
789
+ {
790
+ "key": "gpu_name",
791
+ "value": {
792
+ "stringValue": "NVIDIA H200"
793
+ }
794
+ }
795
+ ],
796
+ "timeUnixNano": "1761242584199441920",
797
+ "asDouble": 340.0
798
+ },
799
+ {
800
+ "attributes": [
801
+ {
802
+ "key": "gpu_id",
803
+ "value": {
804
+ "stringValue": "0"
805
+ }
806
+ },
807
+ {
808
+ "key": "gpu_name",
809
+ "value": {
810
+ "stringValue": "NVIDIA H200"
811
+ }
812
+ }
813
+ ],
814
+ "timeUnixNano": "1761242594199441920",
815
+ "asDouble": 370.0
816
+ },
817
+ {
818
+ "attributes": [
819
+ {
820
+ "key": "gpu_id",
821
+ "value": {
822
+ "stringValue": "0"
823
+ }
824
+ },
825
+ {
826
+ "key": "gpu_name",
827
+ "value": {
828
+ "stringValue": "NVIDIA H200"
829
+ }
830
+ }
831
+ ],
832
+ "timeUnixNano": "1761242604199441920",
833
+ "asDouble": 400.0
834
+ },
835
+ {
836
+ "attributes": [
837
+ {
838
+ "key": "gpu_id",
839
+ "value": {
840
+ "stringValue": "0"
841
+ }
842
+ },
843
+ {
844
+ "key": "gpu_name",
845
+ "value": {
846
+ "stringValue": "NVIDIA H200"
847
+ }
848
+ }
849
+ ],
850
+ "timeUnixNano": "1761242614199441920",
851
+ "asDouble": 430.0
852
+ },
853
+ {
854
+ "attributes": [
855
+ {
856
+ "key": "gpu_id",
857
+ "value": {
858
+ "stringValue": "0"
859
+ }
860
+ },
861
+ {
862
+ "key": "gpu_name",
863
+ "value": {
864
+ "stringValue": "NVIDIA H200"
865
+ }
866
+ }
867
+ ],
868
+ "timeUnixNano": "1761242624199441920",
869
+ "asDouble": 250.0
870
+ },
871
+ {
872
+ "attributes": [
873
+ {
874
+ "key": "gpu_id",
875
+ "value": {
876
+ "stringValue": "0"
877
+ }
878
+ },
879
+ {
880
+ "key": "gpu_name",
881
+ "value": {
882
+ "stringValue": "NVIDIA H200"
883
+ }
884
+ }
885
+ ],
886
+ "timeUnixNano": "1761242634199441920",
887
+ "asDouble": 280.0
888
+ },
889
+ {
890
+ "attributes": [
891
+ {
892
+ "key": "gpu_id",
893
+ "value": {
894
+ "stringValue": "0"
895
+ }
896
+ },
897
+ {
898
+ "key": "gpu_name",
899
+ "value": {
900
+ "stringValue": "NVIDIA H200"
901
+ }
902
+ }
903
+ ],
904
+ "timeUnixNano": "1761242644199441920",
905
+ "asDouble": 310.0
906
+ },
907
+ {
908
+ "attributes": [
909
+ {
910
+ "key": "gpu_id",
911
+ "value": {
912
+ "stringValue": "0"
913
+ }
914
+ },
915
+ {
916
+ "key": "gpu_name",
917
+ "value": {
918
+ "stringValue": "NVIDIA H200"
919
+ }
920
+ }
921
+ ],
922
+ "timeUnixNano": "1761242654199441920",
923
+ "asDouble": 340.0
924
+ },
925
+ {
926
+ "attributes": [
927
+ {
928
+ "key": "gpu_id",
929
+ "value": {
930
+ "stringValue": "0"
931
+ }
932
+ },
933
+ {
934
+ "key": "gpu_name",
935
+ "value": {
936
+ "stringValue": "NVIDIA H200"
937
+ }
938
+ }
939
+ ],
940
+ "timeUnixNano": "1761242664199441920",
941
+ "asDouble": 370.0
942
+ }
943
+ ]
944
+ }
945
+ },
946
+ {
947
+ "name": "gen_ai.co2.emissions",
948
+ "description": "Cumulative CO2 equivalent emissions in grams",
949
+ "unit": "gCO2e",
950
+ "sum": {
951
+ "dataPoints": [
952
+ {
953
+ "attributes": [
954
+ {
955
+ "key": "gpu_id",
956
+ "value": {
957
+ "stringValue": "0"
958
+ }
959
+ }
960
+ ],
961
+ "timeUnixNano": "1761242554199441920",
962
+ "asDouble": 0.2777777777777778
963
+ },
964
+ {
965
+ "attributes": [
966
+ {
967
+ "key": "gpu_id",
968
+ "value": {
969
+ "stringValue": "0"
970
+ }
971
+ }
972
+ ],
973
+ "timeUnixNano": "1761242564199441920",
974
+ "asDouble": 0.5888888888888889
975
+ },
976
+ {
977
+ "attributes": [
978
+ {
979
+ "key": "gpu_id",
980
+ "value": {
981
+ "stringValue": "0"
982
+ }
983
+ }
984
+ ],
985
+ "timeUnixNano": "1761242574199441920",
986
+ "asDouble": 0.9333333333333333
987
+ },
988
+ {
989
+ "attributes": [
990
+ {
991
+ "key": "gpu_id",
992
+ "value": {
993
+ "stringValue": "0"
994
+ }
995
+ }
996
+ ],
997
+ "timeUnixNano": "1761242584199441920",
998
+ "asDouble": 1.3111111111111111
999
+ },
1000
+ {
1001
+ "attributes": [
1002
+ {
1003
+ "key": "gpu_id",
1004
+ "value": {
1005
+ "stringValue": "0"
1006
+ }
1007
+ }
1008
+ ],
1009
+ "timeUnixNano": "1761242594199441920",
1010
+ "asDouble": 1.7222222222222223
1011
+ },
1012
+ {
1013
+ "attributes": [
1014
+ {
1015
+ "key": "gpu_id",
1016
+ "value": {
1017
+ "stringValue": "0"
1018
+ }
1019
+ }
1020
+ ],
1021
+ "timeUnixNano": "1761242604199441920",
1022
+ "asDouble": 2.166666666666667
1023
+ },
1024
+ {
1025
+ "attributes": [
1026
+ {
1027
+ "key": "gpu_id",
1028
+ "value": {
1029
+ "stringValue": "0"
1030
+ }
1031
+ }
1032
+ ],
1033
+ "timeUnixNano": "1761242614199441920",
1034
+ "asDouble": 2.644444444444445
1035
+ },
1036
+ {
1037
+ "attributes": [
1038
+ {
1039
+ "key": "gpu_id",
1040
+ "value": {
1041
+ "stringValue": "0"
1042
+ }
1043
+ }
1044
+ ],
1045
+ "timeUnixNano": "1761242624199441920",
1046
+ "asDouble": 2.9222222222222225
1047
+ },
1048
+ {
1049
+ "attributes": [
1050
+ {
1051
+ "key": "gpu_id",
1052
+ "value": {
1053
+ "stringValue": "0"
1054
+ }
1055
+ }
1056
+ ],
1057
+ "timeUnixNano": "1761242634199441920",
1058
+ "asDouble": 3.2333333333333334
1059
+ },
1060
+ {
1061
+ "attributes": [
1062
+ {
1063
+ "key": "gpu_id",
1064
+ "value": {
1065
+ "stringValue": "0"
1066
+ }
1067
+ }
1068
+ ],
1069
+ "timeUnixNano": "1761242644199441920",
1070
+ "asDouble": 3.577777777777778
1071
+ },
1072
+ {
1073
+ "attributes": [
1074
+ {
1075
+ "key": "gpu_id",
1076
+ "value": {
1077
+ "stringValue": "0"
1078
+ }
1079
+ }
1080
+ ],
1081
+ "timeUnixNano": "1761242654199441920",
1082
+ "asDouble": 3.9555555555555557
1083
+ },
1084
+ {
1085
+ "attributes": [
1086
+ {
1087
+ "key": "gpu_id",
1088
+ "value": {
1089
+ "stringValue": "0"
1090
+ }
1091
+ }
1092
+ ],
1093
+ "timeUnixNano": "1761242664199441920",
1094
+ "asDouble": 4.366666666666667
1095
+ }
1096
+ ],
1097
+ "aggregationTemporality": 2,
1098
+ "isMonotonic": true
1099
+ }
1100
+ }
1101
+ ]
1102
+ }
1103
+ ]
1104
+ }
1105
+ ]
1106
+ }
sample_data/results_gpt4.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "run_id": "run_001_gpt4",
4
+ "task_id": "task_001",
5
+ "test_index": 0,
6
+ "prompt": "What's the weather in Tokyo?",
7
+ "expected_tool": "get_weather",
8
+ "difficulty": "easy",
9
+ "category": "tool_usage",
10
+ "success": true,
11
+ "response": "The weather in Tokyo is 18°C and clear.",
12
+ "tool_called": "get_weather",
13
+ "tool_correct": true,
14
+ "expected_keywords": ["18°C", "clear"],
15
+ "keywords_matched": ["18°C", "clear"],
16
+ "execution_time_ms": 2450.0,
17
+ "total_tokens": 234,
18
+ "prompt_tokens": 78,
19
+ "completion_tokens": 156,
20
+ "cost_usd": 0.0012,
21
+ "trace_id": "trace_abc123",
22
+ "start_time": "2025-01-16T14:23:01Z",
23
+ "end_time": "2025-01-16T14:23:03.450Z",
24
+ "start_time_unix_nano": "1760947217774556600",
25
+ "end_time_unix_nano": "1760947220224556600",
26
+ "error": null,
27
+ "error_type": null
28
+ },
29
+ {
30
+ "run_id": "run_001_gpt4",
31
+ "task_id": "task_002",
32
+ "test_index": 1,
33
+ "prompt": "Search for recent news about AI",
34
+ "expected_tool": "web_search",
35
+ "difficulty": "medium",
36
+ "category": "information_retrieval",
37
+ "success": true,
38
+ "response": "Here are the latest AI news headlines: 1) New breakthrough in LLMs...",
39
+ "tool_called": "web_search",
40
+ "tool_correct": true,
41
+ "expected_keywords": ["AI", "news"],
42
+ "keywords_matched": ["AI"],
43
+ "execution_time_ms": 3800.0,
44
+ "total_tokens": 456,
45
+ "prompt_tokens": 120,
46
+ "completion_tokens": 336,
47
+ "cost_usd": 0.0018,
48
+ "trace_id": "trace_def456",
49
+ "start_time": "2025-01-16T14:23:05Z",
50
+ "end_time": "2025-01-16T14:23:08.800Z",
51
+ "start_time_unix_nano": "1760947221000000000",
52
+ "end_time_unix_nano": "1760947224800000000",
53
+ "error": null,
54
+ "error_type": null
55
+ },
56
+ {
57
+ "run_id": "run_001_gpt4",
58
+ "task_id": "task_003",
59
+ "test_index": 2,
60
+ "prompt": "Calculate 234 * 567",
61
+ "expected_tool": "calculator",
62
+ "difficulty": "easy",
63
+ "category": "tool_usage",
64
+ "success": true,
65
+ "response": "The result of 234 * 567 is 132678",
66
+ "tool_called": "calculator",
67
+ "tool_correct": true,
68
+ "expected_keywords": ["132678"],
69
+ "keywords_matched": ["132678"],
70
+ "execution_time_ms": 1200.0,
71
+ "total_tokens": 89,
72
+ "prompt_tokens": 45,
73
+ "completion_tokens": 44,
74
+ "cost_usd": 0.0004,
75
+ "trace_id": "trace_ghi789",
76
+ "start_time": "2025-01-16T14:23:10Z",
77
+ "end_time": "2025-01-16T14:23:11.200Z",
78
+ "start_time_unix_nano": "1760947226000000000",
79
+ "end_time_unix_nano": "1760947227200000000",
80
+ "error": null,
81
+ "error_type": null
82
+ },
83
+ {
84
+ "run_id": "run_001_gpt4",
85
+ "task_id": "task_004",
86
+ "test_index": 3,
87
+ "prompt": "Send an email to john@example.com with subject 'Meeting' and body 'Let's meet tomorrow'",
88
+ "expected_tool": "send_email",
89
+ "difficulty": "hard",
90
+ "category": "multi_step",
91
+ "success": false,
92
+ "response": "I apologize, I don't have access to an email sending function.",
93
+ "tool_called": null,
94
+ "tool_correct": false,
95
+ "expected_keywords": ["email", "sent"],
96
+ "keywords_matched": [],
97
+ "execution_time_ms": 1800.0,
98
+ "total_tokens": 123,
99
+ "prompt_tokens": 67,
100
+ "completion_tokens": 56,
101
+ "cost_usd": 0.0006,
102
+ "trace_id": "trace_jkl012",
103
+ "start_time": "2025-01-16T14:23:13Z",
104
+ "end_time": "2025-01-16T14:23:14.800Z",
105
+ "start_time_unix_nano": "1760947229000000000",
106
+ "end_time_unix_nano": "1760947230800000000",
107
+ "error": "Tool not found: send_email",
108
+ "error_type": "tool_not_found"
109
+ },
110
+ {
111
+ "run_id": "run_001_gpt4",
112
+ "task_id": "task_005",
113
+ "test_index": 4,
114
+ "prompt": "What is 2+2?",
115
+ "expected_tool": "calculator",
116
+ "difficulty": "easy",
117
+ "category": "reasoning",
118
+ "success": true,
119
+ "response": "2+2 equals 4",
120
+ "tool_called": "calculator",
121
+ "tool_correct": true,
122
+ "expected_keywords": ["4"],
123
+ "keywords_matched": ["4"],
124
+ "execution_time_ms": 900.0,
125
+ "total_tokens": 67,
126
+ "prompt_tokens": 34,
127
+ "completion_tokens": 33,
128
+ "cost_usd": 0.0003,
129
+ "trace_id": "trace_mno345",
130
+ "start_time": "2025-01-16T14:23:16Z",
131
+ "end_time": "2025-01-16T14:23:16.900Z",
132
+ "start_time_unix_nano": "1760947232000000000",
133
+ "end_time_unix_nano": "1760947232900000000",
134
+ "error": null,
135
+ "error_type": null
136
+ }
137
+ ]
sample_data/traces_gpt4.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "trace_id": "trace_abc123",
4
+ "run_id": "run_001_gpt4",
5
+ "traceId": "trace_abc123",
6
+ "spans": [
7
+ {
8
+ "spanId": "span_001",
9
+ "parentSpanId": null,
10
+ "name": "Agent Execution",
11
+ "kind": "INTERNAL",
12
+ "startTime": 1760947217774556600,
13
+ "endTime": 1760947220224556600,
14
+ "attributes": {
15
+ "agent.type": "both",
16
+ "agent.name": "ToolCallingAgent",
17
+ "gen_ai.system": "openai",
18
+ "gen_ai.request.model": "gpt-4"
19
+ },
20
+ "status": {"code": "OK"}
21
+ },
22
+ {
23
+ "spanId": "span_002",
24
+ "parentSpanId": "span_001",
25
+ "name": "LLM Call - Reasoning",
26
+ "kind": "CLIENT",
27
+ "startTime": 1760947217774556600,
28
+ "endTime": 1760947218974556600,
29
+ "attributes": {
30
+ "gen_ai.system": "openai",
31
+ "gen_ai.request.model": "gpt-4",
32
+ "gen_ai.operation.name": "chat",
33
+ "gen_ai.usage.prompt_tokens": 78,
34
+ "gen_ai.usage.completion_tokens": 45,
35
+ "gen_ai.usage.total_tokens": 123,
36
+ "gen_ai.usage.cost.total": 0.0006,
37
+ "gen_ai.response.finish_reasons": ["stop"]
38
+ },
39
+ "status": {"code": "OK"}
40
+ },
41
+ {
42
+ "spanId": "span_003",
43
+ "parentSpanId": "span_001",
44
+ "name": "Tool Call - get_weather",
45
+ "kind": "CLIENT",
46
+ "startTime": 1760947219000556600,
47
+ "endTime": 1760947219890556600,
48
+ "attributes": {
49
+ "tool.name": "get_weather",
50
+ "tool.input": "{\"location\": \"Tokyo\"}",
51
+ "tool.output": "{\"temp\": \"18°C\", \"condition\": \"clear\"}",
52
+ "tool.latency_ms": 890
53
+ },
54
+ "status": {"code": "OK"}
55
+ },
56
+ {
57
+ "spanId": "span_004",
58
+ "parentSpanId": "span_001",
59
+ "name": "LLM Call - Final Response",
60
+ "kind": "CLIENT",
61
+ "startTime": 1760947219900556600,
62
+ "endTime": 1760947220224556600,
63
+ "attributes": {
64
+ "gen_ai.system": "openai",
65
+ "gen_ai.request.model": "gpt-4",
66
+ "gen_ai.usage.prompt_tokens": 145,
67
+ "gen_ai.usage.completion_tokens": 111,
68
+ "gen_ai.usage.cost.total": 0.0006
69
+ },
70
+ "status": {"code": "OK"}
71
+ }
72
+ ]
73
+ },
74
+ {
75
+ "trace_id": "trace_def456",
76
+ "run_id": "run_001_gpt4",
77
+ "traceId": "trace_def456",
78
+ "spans": [
79
+ {
80
+ "spanId": "span_005",
81
+ "parentSpanId": null,
82
+ "name": "Agent Execution",
83
+ "kind": "INTERNAL",
84
+ "startTime": 1760947221000000000,
85
+ "endTime": 1760947224800000000,
86
+ "attributes": {
87
+ "agent.type": "both",
88
+ "agent.name": "ToolCallingAgent",
89
+ "gen_ai.system": "openai",
90
+ "gen_ai.request.model": "gpt-4"
91
+ },
92
+ "status": {"code": "OK"}
93
+ },
94
+ {
95
+ "spanId": "span_006",
96
+ "parentSpanId": "span_005",
97
+ "name": "LLM Call - Reasoning",
98
+ "kind": "CLIENT",
99
+ "startTime": 1760947221000000000,
100
+ "endTime": 1760947222200000000,
101
+ "attributes": {
102
+ "gen_ai.system": "openai",
103
+ "gen_ai.request.model": "gpt-4",
104
+ "gen_ai.operation.name": "chat",
105
+ "gen_ai.usage.prompt_tokens": 120,
106
+ "gen_ai.usage.completion_tokens": 67,
107
+ "gen_ai.usage.total_tokens": 187
108
+ },
109
+ "status": {"code": "OK"}
110
+ },
111
+ {
112
+ "spanId": "span_007",
113
+ "parentSpanId": "span_005",
114
+ "name": "Tool Call - web_search",
115
+ "kind": "CLIENT",
116
+ "startTime": 1760947222300000000,
117
+ "endTime": 1760947224000000000,
118
+ "attributes": {
119
+ "tool.name": "web_search",
120
+ "tool.input": "{\"query\": \"recent AI news\"}",
121
+ "tool.output": "{\"results\": [...]}",
122
+ "tool.latency_ms": 1700
123
+ },
124
+ "status": {"code": "OK"}
125
+ },
126
+ {
127
+ "spanId": "span_008",
128
+ "parentSpanId": "span_005",
129
+ "name": "LLM Call - Final Response",
130
+ "kind": "CLIENT",
131
+ "startTime": 1760947224100000000,
132
+ "endTime": 1760947224800000000,
133
+ "attributes": {
134
+ "gen_ai.system": "openai",
135
+ "gen_ai.request.model": "gpt-4",
136
+ "gen_ai.usage.prompt_tokens": 189,
137
+ "gen_ai.usage.completion_tokens": 269
138
+ },
139
+ "status": {"code": "OK"}
140
+ }
141
+ ]
142
+ }
143
+ ]