every_eval_ever_space / ui_components.py
deepmage121's picture
fixes to search + some css
b3c0458
import html

import gradio as gr
import plotly.graph_objects as go

from data_loader import get_eval_metadata
def get_theme():
    """Build the light Gradio theme used by the app.

    Starts from ``gr.themes.Base`` with a blue primary hue and a slate
    neutral hue, then applies the colour overrides listed below through
    the theme's ``set`` method.

    Returns:
        Whatever ``gr.themes.Base(...).set(...)`` returns.
    """
    base_theme = gr.themes.Base(primary_hue="blue", neutral_hue="slate")

    # Keyword overrides, kept in one mapping so the palette reads as a table.
    color_overrides = {
        "body_background_fill": "#f5f5f5",
        "body_text_color": "#0a0a0a",
        "body_text_color_subdued": "#525252",
        "block_background_fill": "#ffffff",
        "block_border_color": "#e5e5e5",
        "block_label_text_color": "#525252",
        "block_title_text_color": "#0a0a0a",
        "input_background_fill": "#ffffff",
        "input_border_color": "#e5e5e5",
        "button_primary_background_fill": "#3b82f6",
        "button_primary_text_color": "#ffffff",
        "button_secondary_background_fill": "#ffffff",
        "button_secondary_text_color": "#0a0a0a",
        "button_secondary_border_color": "#e5e5e5",
    }
    return base_theme.set(**color_overrides)
def get_custom_css():
    """Return the application's custom stylesheet as a single CSS string.

    The stylesheet covers the page chrome (header, banners, pagination),
    the metric cards, the comparison heat-map table, form controls, tabs,
    dropdowns, dataframes, multiselect tokens and scrollbars.

    The ``.dataframe`` rule previously declared ``border-radius: px``, which
    is not a valid length and is therefore ignored by browsers. It is now
    ``8px``, matching the otherwise identical ``.heatmap-table`` rule
    (same border colour, same ``overflow: hidden``).

    Returns:
        The CSS text.
    """
    return """
:root {
--brand-black: #0a0a0a;
--brand-dark: #1a1a1a;
--brand-gray: #2a2a2a;
--brand-light: #f5f5f5;
--brand-accent: #3b82f6;
}
body, .gradio-container {
background: var(--brand-light) !important;
color: var(--brand-black) !important;
}
.gradio-container {
max-width: 100%;
padding: 1.25rem 2.5rem 2rem;
}
.gradio-container *:focus-visible {
outline: none !important;
box-shadow: inset 0 0 0 1.5px #3b82f6 !important;
}
.gradio-container .block,
.gradio-container .wrap,
.gradio-container .form,
.gradio-container .container {
box-shadow: none !important;
}
/* Match pill styling */
.match-pills .wrap,
.match-pills .container {
display: flex !important;
flex-wrap: wrap !important;
gap: 0.35rem !important;
}
.match-pills .wrap > div,
.match-pills .container > div {
margin: 0 !important;
}
.match-pills input[type="checkbox"] {
display: none;
}
.match-pills label {
display: inline-flex;
align-items: center;
border: 1px solid #d6d9de;
background: #f5f7fb;
border-radius: 999px;
padding: 0.28rem 0.75rem;
font-weight: 500;
color: #0a0a0a;
transition: all 120ms ease;
cursor: pointer;
}
.match-pills label:hover {
border-color: #3b82f6;
background: #eef4ff;
}
.match-pills input[type="checkbox"]:checked + label {
border-color: #3b82f6;
background: rgba(59, 130, 246, 0.12);
color: #0a0a0a;
font-weight: 600;
}
.app-header {
display: flex;
align-items: center;
gap: 1rem;
margin-bottom: 1.5rem;
padding: 1rem 1.25rem;
background: #ffffff;
border: 1px solid #e5e5e5;
border-radius: 12px;
}
.logo-mark {
width: 48px;
height: 48px;
border-radius: 12px;
display: flex;
align-items: center;
justify-content: center;
font-weight: 800;
font-size: 1.1rem;
color: #ffffff;
}
.brand h1 { margin: 0; font-size: 1.5rem; font-weight: 700; color: #0a0a0a; }
.brand .tagline { color: #525252; font-size: 0.9rem; }
.header-right { margin-left: auto; }
.version-badge {
background: rgba(59, 130, 246, 0.1);
border: 1px solid #3b82f6;
border-radius: 8px;
padding: 0.35rem 0.6rem;
font-size: 0.78rem;
color: #3b82f6;
}
.info-banner {
background: #ffffff;
border: 1px solid #e5e5e5;
border-left: 3px solid #3b82f6;
border-radius: 10px;
padding: 1rem 1.25rem;
margin-bottom: 1rem;
}
.info-banner h3 { margin: 0; font-weight: 600; color: #0a0a0a; }
.leaderboard-header {
display: flex;
justify-content: space-between;
align-items: center;
gap: 1rem;
flex-wrap: wrap;
margin-bottom: 0.4rem;
}
.lb-title {
font-size: 1.2rem;
font-weight: 700;
color: #0a0a0a;
margin: 0;
line-height: 1.35;
}
.lb-by {
font-size: 0.9rem;
color: #525252;
margin: 0.1rem 0 0 0;
line-height: 1.35;
}
.lb-meta {
display: flex;
flex-direction: column;
gap: 0.1rem;
}
.eval-tags { display: flex; flex-wrap: wrap; gap: 0.4rem; }
.eval-tags { margin-top: 0.35rem; }
.eval-tag {
border-radius: 10px;
padding: 0.3rem 0.65rem;
font-size: 0.82rem;
font-weight: 600;
color: #0a0a0a;
border: 1px solid #e5e5e5;
background: #f8fafc;
}
.eval-tag:nth-child(5n + 1) { border-color: #3b82f6; background: rgba(59, 130, 246, 0.12); color: #0a1d4a; }
.eval-tag:nth-child(5n + 2) { border-color: #10b981; background: rgba(16, 185, 129, 0.12); color: #0b3b2b; }
.eval-tag:nth-child(5n + 3) { border-color: #f97316; background: rgba(249, 115, 22, 0.12); color: #4b1f07; }
.eval-tag:nth-child(5n + 4) { border-color: #8b5cf6; background: rgba(139, 92, 246, 0.12); color: #2f0f5a; }
.eval-tag:nth-child(5n) { border-color: #06b6d4; background: rgba(6, 182, 212, 0.12); color: #053f46; }
.source-link {
font-size: 0.75rem;
color: #3b82f6;
text-decoration: none;
padding: 0.375rem 0.75rem;
border: 1px solid #3b82f6;
border-radius: 6px;
}
.source-link:hover { background: rgba(59, 130, 246, 0.1); }
.pagination-bar {
margin-top: 0.75rem;
padding: 0.85rem 0 0.25rem;
display: flex;
justify-content: center;
align-items: center;
gap: 0.85rem;
}
.page-info { font-size: 1rem; min-width: 80px; text-align: center; color: #0a0a0a; }
.metrics-section {
margin-top: 1.25rem;
padding: 1.25rem 1rem;
border-top: 1px solid #e5e5e5;
}
.metrics-section h3 {
font-size: 0.9rem;
font-weight: 700;
color: #525252;
margin: 0 0 0.9rem 0;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.metrics-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
gap: 0.75rem;
}
@media (max-width: 768px) {
.metrics-grid {
grid-template-columns: repeat(auto-fill, minmax(160px, 1fr));
gap: 0.5rem;
}
.metric-card-header {
padding: 0.65rem 0.8rem;
flex-direction: column;
align-items: flex-start;
gap: 0.25rem;
}
.metric-card-body {
padding: 0.65rem 0.8rem;
font-size: 0.85rem;
}
.metrics-section {
padding: 1rem 0.5rem;
}
}
.metrics-grid .metric-card {
align-self: start;
}
.metric-card {
background: #ffffff;
border: 1px solid #e5e5e5;
border-radius: 10px;
overflow: hidden;
position: relative;
}
.metric-card-header {
display: flex;
justify-content: space-between;
align-items: center;
padding: 0.85rem 1rem;
cursor: pointer;
}
.metric-card-header:hover {
background: #f9f9f9;
}
.metric-card-name { font-weight: 600; color: #0a0a0a; }
.metric-card-direction { font-size: 0.82rem; color: #525252; }
.metric-card-direction .arrow { color: #22c55e; font-weight: 700; }
.metric-card-body {
display: none;
padding: 0.85rem 1rem;
border-top: 1px solid #e5e5e5;
color: #0a0a0a;
}
.metric-card input.metric-toggle {
display: none;
}
.metric-card input.metric-toggle:checked ~ .metric-card-body {
display: block;
}
.metric-card input.metric-toggle:checked ~ .metric-card-header {
background: #f9f9f9;
border-bottom: 1px solid #e5e5e5;
}
.metric-card input.metric-toggle:checked ~ .metric-card-header .metric-card-name,
.metric-card input.metric-toggle:checked ~ .metric-card-header .metric-card-direction {
color: #0a0a0a;
}
/* Ensure multiple cards can be open at once and are closable */
.metric-card input.metric-toggle:not(:checked) ~ .metric-card-body {
display: none;
}
.metric-type-badge {
font-size: 0.68rem;
text-transform: uppercase;
padding: 0.2rem 0.45rem;
background: rgba(59, 130, 246, 0.1);
border: 1px solid #3b82f6;
border-radius: 6px;
color: #3b82f6;
}
.heatmap-table { width: 100%; border-collapse: collapse; font-size: 0.85rem; }
.heatmap-table th { padding: 0.55rem 0.65rem; font-weight: 700; font-size: 0.72rem; text-transform: uppercase; color: #525252; background: #f5f5f5; }
.heatmap-table td { padding: 0.45rem 0.65rem; text-align: center; border-bottom: 1px solid #e5e5e5; }
.heatmap-table td.metric-name { text-align: left; font-weight: 600; color: #0a0a0a; }
.heatmap-table td.score-cell { font-weight: 600; }
.heatmap-table td.score-cell.best { background: rgba(34, 197, 94, 0.15); color: #16a34a; }
.heatmap-table td.score-cell.good { background: rgba(34, 197, 94, 0.08); color: #16a34a; }
.heatmap-table td.score-cell.mid { background: rgba(234, 179, 8, 0.15); color: #ca8a04; }
.heatmap-table td.score-cell.low { background: rgba(239, 68, 68, 0.12); color: #dc2626; }
.heatmap-table td.score-cell.worst { background: rgba(239, 68, 68, 0.18); color: #b91c1c; }
.heatmap-table td.score-cell.na { color: #525252; font-style: italic; }
/* Model chips */
.selected-models-group label {
display: inline-flex !important;
background: #ffffff;
border: 1px solid #e5e5e5;
border-radius: 16px;
padding: 0.35rem 0.85rem;
font-size: 0.88rem;
color: #0a0a0a;
cursor: pointer;
margin: 0.18rem 0.32rem 0.18rem 0 !important;
}
.selected-models-group input[type="checkbox"] { display: none; }
.no-results { text-align: center; padding: 2.5rem 1rem; color: #525252; }
.gradio-container footer { display: none; }
.block, .form, .wrap, .container { background: #ffffff !important; }
body, .gradio-container, p, span, div, h1, h2, h3, h4, h5, h6, label, td, th {
color: #0a0a0a !important;
}
.label-wrap span, .prose, .markdown, .prose p, .prose li, .markdown p, .markdown li {
color: #525252 !important;
}
input, textarea, select {
background: #ffffff !important;
color: #0a0a0a !important;
border: 1px solid #e5e5e5 !important;
border-radius: 8px !important;
}
input::placeholder, textarea::placeholder {
color: #a1a1a1 !important;
}
input:focus, textarea:focus, select:focus {
border-color: #3b82f6 !important;
outline: none !important;
box-shadow: inset 0 0 0 1.5px #3b82f6 !important;
}
select, .wrap select, .wrap input, input[type="text"], textarea {
min-height: 44px !important;
padding: 0.55rem 0.75rem !important;
font-size: 0.96rem !important;
}
button {
border-radius: 8px !important;
font-weight: 500 !important;
transition: all 0.15s ease !important;
}
button.primary, button[variant="primary"] {
background: #3b82f6 !important;
color: #ffffff !important;
border: none !important;
}
button.primary:hover, button[variant="primary"]:hover {
background: #2563eb !important;
}
button.secondary, button[variant="secondary"], button:not(.primary):not([variant="primary"]) {
background: #ffffff !important;
color: #0a0a0a !important;
border: 1px solid #e5e5e5 !important;
}
button.secondary:hover, button[variant="secondary"]:hover {
border-color: #3b82f6 !important;
background: #f5f5f5 !important;
}
.tab-nav, .tabs {
border-bottom: 1px solid #e5e5e5 !important;
}
.tab-nav button, .tabs button {
color: #525252 !important;
background: transparent !important;
border: none !important;
border-bottom: 2px solid transparent !important;
}
.tab-nav button.selected, .tabs button.selected {
color: #3b82f6 !important;
border-bottom-color: #3b82f6 !important;
}
.wrap, .secondary-wrap, .primary-wrap {
background: transparent !important;
border: none !important;
border-radius: 0 !important;
box-shadow: none !important;
padding: 0 !important;
}
ul[role="listbox"], .dropdown, .options {
background: #ffffff !important;
border: 1px solid #e5e5e5 !important;
border-radius: 8px !important;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1) !important;
}
ul[role="listbox"] li, .dropdown li, .options li {
color: #0a0a0a !important;
}
ul[role="listbox"] li:hover, .dropdown li:hover, .options li:hover {
background: #f5f5f5 !important;
}
ul[role="listbox"] li.active, .dropdown li.active, .options li.active {
background: #f5f5f5 !important;
color: #0a0a0a !important;
}
ul[role="listbox"] li.selected, .dropdown li.selected {
background: rgba(59, 130, 246, 0.1) !important;
color: #3b82f6 !important;
}
.accordion {
border: 1px solid #e5e5e5 !important;
border-radius: 8px !important;
background: #ffffff !important;
}
.accordion > button {
color: #0a0a0a !important;
}
.selected-models-group label, .checkbox-group label {
display: inline-flex !important;
background: #ffffff;
border: 1px solid #e5e5e5;
border-radius: 20px !important;
padding: 0.4rem 0.9rem !important;
font-size: 0.88rem !important;
color: #0a0a0a !important;
cursor: pointer !important;
margin: 0.2rem !important;
transition: all 0.15s ease !important;
}
.selected-models-group label:hover, .checkbox-group label:hover {
border-color: #3b82f6 !important;
background: #f5f5f5 !important;
}
.selected-models-group input[type="checkbox"], .checkbox-group input[type="checkbox"] {
display: none !important;
}
table {
width: 100% !important;
border-collapse: collapse !important;
background: #ffffff !important;
}
table th {
background: #f5f5f5 !important;
color: #525252 !important;
font-weight: 600 !important;
text-transform: uppercase !important;
font-size: 0.75rem !important;
padding: 0.75rem !important;
border-bottom: 1px solid #e5e5e5 !important;
text-align: left !important;
}
table td {
padding: 0.65rem 0.75rem !important;
border-bottom: 1px solid #e5e5e5 !important;
color: #0a0a0a !important;
}
table tr:hover td {
background: #f9f9f9 !important;
}
.dataframe {
background: #ffffff !important;
border: 1px solid #e5e5e5 !important;
box-shadow: none !important;
border-radius: 8px !important;
overflow: hidden !important;
}
.dataframe table {
width: 100% !important;
border-collapse: collapse !important;
font-size: 0.75rem !important;
table-layout: auto !important;
background: #ffffff !important;
}
.dataframe thead,
.dataframe thead tr {
background: #ffffff !important;
position: sticky !important;
top: 0 !important;
z-index: 10 !important;
}
.dataframe thead th {
padding: 0.875rem 1rem !important;
font-weight: 700 !important;
font-size: 0.75rem !important;
text-transform: uppercase !important;
letter-spacing: 0.05em !important;
color: #0a0a0a !important;
border-bottom: 2px solid #e5e5e5 !important;
border-top: none !important;
text-align: left !important;
background: #ffffff !important;
white-space: nowrap !important;
border-radius: 0 !important;
}
.dataframe thead th span,
.dataframe thead th div,
.dataframe thead th button {
background: transparent !important;
border: none !important;
border-radius: 0 !important;
box-shadow: none !important;
margin: 0 !important;
outline: none !important;
}
.dataframe thead th span[role="button"],
.dataframe thead th span[class*="svelte"] {
background: transparent !important;
border: none !important;
box-shadow: none !important;
outline: none !important;
padding: 0 !important;
width: auto !important;
}
/* Also target the SVG icon if it exists to ensure it doesn't have a background */
.dataframe thead th svg {
background: transparent !important;
box-shadow: none !important;
}
.dataframe thead th span:hover,
.dataframe thead th span[role="button"]:hover,
.dataframe thead th span[class*="svelte"]:hover,
.dataframe thead th button:hover {
background: transparent !important;
border: none !important;
box-shadow: none !important;
color: #3b82f6 !important;
}
.token {
background-color: rgba(59, 130, 246, 0.12) !important;
border: 1px solid rgba(59, 130, 246, 0.3) !important;
color: #1e3a8a !important;
border-radius: 6px !important;
padding: 2px 8px !important;
gap: 4px !important;
}
.token-remove {
background-color: rgba(255, 255, 255, 0.4) !important;
border: 1px solid rgba(30, 58, 138, 0.5) !important; /* Dark blue outline */
color: #1e3a8a !important;
border-radius: 4px !important;
margin-left: 6px !important;
padding: 1px !important;
opacity: 0.9 !important;
min-width: 18px !important;
min-height: 18px !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
}
.token-remove svg {
width: 12px !important;
height: 12px !important;
}
.token-remove:hover {
background-color: #1e3a8a !important;
color: #ffffff !important;
border-color: #1e3a8a !important;
}
.selector-item {
border-radius: 6px !important;
}
.gradio-container .token {
box-shadow: none !important;
font-weight: 500 !important;
}
.gradio-container .token span {
color: #1e3a8a !important;
}
.dataframe tbody,
.dataframe tbody tr {
background: #ffffff !important;
}
.dataframe tbody tr {
border-bottom: 1px solid #e5e5e5 !important;
}
.dataframe tbody tr:hover {
background: #f9f9f9 !important;
}
.dataframe tbody td {
padding: 0.75rem 1rem !important;
color: #0a0a0a !important;
background: #ffffff !important;
border: none !important;
border-bottom: 1px solid #e5e5e5 !important;
}
.dataframe tbody td:first-child {
font-weight: 700 !important;
color: #0a0a0a !important;
white-space: normal !important;
word-break: break-word !important;
max-width: 400px;
min-width: 250px;
}
.dataframe tbody td:not(:first-child) {
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important;
text-align: left !important;
white-space: nowrap !important;
min-width: 80px !important;
}
.dataframe td:nth-child(2),
.dataframe th:nth-child(2) {
max-width: 220px;
min-width: 140px;
}
.column-selector-dropdown {
min-width: 300px;
}
.column-selector-dropdown .wrap {
flex-wrap: nowrap !important;
overflow-x: auto !important;
gap: 0.25rem !important;
padding: 0.5rem !important;
}
.column-selector-dropdown .wrap input {
width: 100% !important;
padding-left: 0.5rem !important;
border: none !important;
box-shadow: none !important;
}
.heatmap-table {
border: 1px solid #e5e5e5 !important;
border-radius: 8px !important;
overflow: hidden !important;
}
.heatmap-table th {
background: #f5f5f5 !important;
color: #525252 !important;
padding: 0.6rem 0.75rem !important;
font-size: 0.72rem !important;
border-bottom: 2px solid #e5e5e5 !important;
}
.heatmap-table td {
padding: 0.5rem 0.75rem !important;
border-bottom: 1px solid #e5e5e5 !important;
}
.heatmap-table td.metric-name {
background: #f5f5f5 !important;
font-weight: 600 !important;
}
.heatmap-table td.score-cell.best { background: rgba(34, 197, 94, 0.2) !important; color: #15803d !important; }
.heatmap-table td.score-cell.good { background: rgba(34, 197, 94, 0.1) !important; color: #16a34a !important; }
.heatmap-table td.score-cell.mid { background: rgba(234, 179, 8, 0.15) !important; color: #a16207 !important; }
.heatmap-table td.score-cell.low { background: rgba(239, 68, 68, 0.12) !important; color: #dc2626 !important; }
.heatmap-table td.score-cell.worst { background: rgba(239, 68, 68, 0.2) !important; color: #b91c1c !important; }
.heatmap-table td.score-cell.na { color: #a1a1a1 !important; font-style: italic !important; }
.gradio-container footer { display: none !important; }
::-webkit-scrollbar { width: 8px; height: 8px; }
::-webkit-scrollbar-track { background: #f5f5f5; }
::-webkit-scrollbar-thumb { background: #d4d4d4; border-radius: 4px; }
::-webkit-scrollbar-thumb:hover { background: #a1a1a1; }
"""
def format_leaderboard_header(selected_leaderboard, metadata):
    """Render the HTML banner shown above a leaderboard.

    All interpolated values are HTML-escaped so that names, organizations
    or URLs containing ``<``, ``&`` or quotes cannot break the markup.

    Args:
        selected_leaderboard: Leaderboard name. When falsy, a centered
            "Select a leaderboard to explore" placeholder is returned.
        metadata: Mapping that may contain ``"evals"`` (keyed by eval name)
            and ``"source_info"`` (with ``"organization"`` and ``"url"``).
            When it is falsy or has no evals, only the title banner is
            rendered.

    Returns:
        An HTML fragment as a string.
    """
    if not selected_leaderboard:
        return '<div style="text-align: center; padding: 2rem; color: #525252;">Select a leaderboard to explore</div>'

    title = html.escape(str(selected_leaderboard))
    if not metadata or not metadata.get("evals"):
        return f'<div class="info-banner"><h3>{title}</h3></div>'

    # Tolerate an explicit ``"source_info": None`` as well as a missing key.
    source_info = metadata.get("source_info") or {}
    org = html.escape(str(source_info.get("organization", "Unknown")))
    url = html.escape(str(source_info.get("url", "#")), quote=True)

    eval_tags = "".join(
        f'<span class="eval-tag">{html.escape(str(name))}</span>'
        for name in sorted(metadata["evals"])
    )

    return (
        "\n"
        '<div class="info-banner">\n'
        '<div class="leaderboard-header">\n'
        '<div class="lb-meta">\n'
        f'<div class="lb-title">{title}</div>\n'
        f'<div class="lb-by">By {org}</div>\n'
        "</div>\n"
        f'<a href="{url}" target="_blank" class="source-link">Source →</a>\n'
        "</div>\n"
        f'<div class="eval-tags">{eval_tags}</div>\n'
        "</div>\n"
    )
def format_metric_details(selected_leaderboard, metadata):
    """Render the "Metric Reference" section: one collapsible card per eval.

    Each card shows the eval name, whether lower or higher is better, the
    description, the score type and, depending on the score type, either
    the score range or the list of level names.

    Fixes over the previous version:
      * the range is rendered as ``[min, max]`` (the separator was missing,
        producing e.g. ``[0100]``);
      * a missing ``max_score`` no longer raises ``KeyError``;
      * ``score_type`` set to ``None`` no longer raises ``AttributeError``;
      * interpolated text is HTML-escaped.

    Args:
        selected_leaderboard: Leaderboard name; only checked for truthiness.
        metadata: Mapping with an ``"evals"`` mapping of eval name -> info.

    Returns:
        An HTML fragment, or ``""`` when there is nothing to show.
    """
    if not selected_leaderboard or not metadata or not metadata.get("evals"):
        return ""

    cards = []
    for index, (eval_name, info) in enumerate(metadata["evals"].items()):
        info = info or {}
        raw_type = info.get("score_type") or ""
        score_type = html.escape(str(raw_type).upper()) or "—"

        lower_is_better = bool(info.get("lower_is_better"))
        direction = "Lower is better" if lower_is_better else "Higher is better"
        arrow = "↓" if lower_is_better else "↑"

        details = ""
        if raw_type == "continuous" and info.get("min_score") is not None:
            max_score = info.get("max_score")
            upper = "—" if max_score is None else max_score
            details = f"Range: [{info['min_score']}, {upper}]"
        elif raw_type == "levels" and info.get("level_names"):
            details = "Levels: " + ", ".join(str(level) for level in info["level_names"])
        details = html.escape(details)

        description = html.escape(str(info.get("description") or "No description"))
        name = html.escape(str(eval_name))
        card_id = f"mc{index}"

        cards.append(
            "\n"
            f'<div class="metric-card" id="{card_id}">\n'
            f'<input type="checkbox" id="toggle-{card_id}" class="metric-toggle" />\n'
            f'<label class="metric-card-header" for="toggle-{card_id}">\n'
            f'<span class="metric-card-name">{name}</span>\n'
            f'<span class="metric-card-direction"><span class="arrow">{arrow}</span> {direction}</span>\n'
            "</label>\n"
            '<div class="metric-card-body">\n'
            f"<div>{description}</div>\n"
            '<div style="display: flex; justify-content: space-between; align-items: center; margin-top: 0.5rem;">\n'
            f'<span style="font-size: 0.75rem; color: #525252;">{details}</span>\n'
            f'<span class="metric-type-badge">{score_type}</span>\n'
            "</div>\n"
            "</div>\n"
            "</div>\n"
        )

    cards_html = "".join(cards)
    return (
        "\n"
        '<div class="metrics-section">\n'
        "<h3>Metric Reference</h3>\n"
        f'<div class="metrics-grid">{cards_html}</div>\n'
        "</div>\n"
    )
def format_model_card(model_name, model_data):
    """Render an HTML card summarising one model across leaderboards.

    The header (developer, parameter count, architecture) is taken from the
    first leaderboard entry in ``model_data``. For each leaderboard that has
    results, the average of the non-``None`` scores is shown followed by one
    chip per metric, highest score first and missing scores last.

    Fixes over the previous version:
      * an average of exactly ``0`` is shown as ``0.00`` instead of "—";
      * metrics with a ``None`` score sort after negative scores rather
        than being treated as ``0``;
      * interpolated text is HTML-escaped.

    Args:
        model_name: Display name of the model.
        model_data: Mapping of leaderboard name -> entry. Each entry may
            contain ``"developer"``, ``"params"``, ``"architecture"`` and a
            ``"results"`` mapping of metric name -> numeric score or ``None``.

    Returns:
        An HTML fragment as a string.
    """
    if not model_data:
        return '<div class="no-results"><h3>No results found</h3><p>Try a different model name</p></div>'

    first = next(iter(model_data.values()))
    developer = html.escape(str(first.get("developer", "Unknown")))
    arch = html.escape(str(first.get("architecture", "Unknown")))
    params = first.get("params")
    params_str = html.escape(f"{params}B") if params else "—"

    parts = [
        "\n"
        '<div style="padding: 1rem; background: #ffffff; border-radius: 10px; border: 1px solid #e5e5e5;">\n'
        f'<h2 style="margin: 0 0 0.5rem 0; color: #0a0a0a;">{html.escape(str(model_name))}</h2>\n'
        '<div style="color: #525252; margin-bottom: 1rem;">\n'
        f"<span>Developer: {developer}</span> ·\n"
        f"<span>Params: {params_str}</span> ·\n"
        f"<span>Arch: {arch}</span>\n"
        "</div>\n"
    ]

    for leaderboard_name, data in model_data.items():
        results = data.get("results", {})
        if not results:
            continue

        scores = [value for value in results.values() if value is not None]
        # Test the list, not the mean: a mean of 0.0 is a real value.
        avg_str = f"{sum(scores) / len(scores):.2f}" if scores else "—"

        parts.append(
            f'<div style="margin-bottom: 1rem;"><h4 style="color: #0a0a0a;">{html.escape(str(leaderboard_name))} '
            f'<span style="color: #525252;">(avg: {avg_str})</span></h4>'
        )
        parts.append('<div style="display: flex; flex-wrap: wrap; gap: 0.5rem;">')

        ranked = sorted(
            results.items(),
            key=lambda item: float("-inf") if item[1] is None else item[1],
            reverse=True,
        )
        for metric_name, score in ranked:
            score_display = f"{score:.2f}" if score is not None else "—"
            parts.append(
                '<div style="padding: 0.4rem 0.8rem; border-radius: 6px; background: #f5f5f5; border: 1px solid #e5e5e5;">'
                f'<span style="color: #525252;">{html.escape(str(metric_name))}:</span> '
                f'<strong style="color: #0a0a0a;">{score_display}</strong></div>'
            )
        parts.append("</div></div>")

    parts.append("</div>")
    return "".join(parts)
def _score_cell_class(score, valid_scores):
    """Return the heat-map CSS class for ``score`` among ``valid_scores``.

    With fewer than two scores there is nothing to compare, so the class is
    empty. Otherwise the highest score is "best" and the rest are bucketed by
    their position between the lowest and highest score.
    """
    if len(valid_scores) < 2:
        return ""
    highest = max(valid_scores)
    lowest = min(valid_scores)
    if score == highest:
        return "best"
    # score != highest implies highest > lowest, so the division is safe.
    pct = (score - lowest) / (highest - lowest)
    if pct >= 0.75:
        return "good"
    if pct >= 0.5:
        return "mid"
    if pct >= 0.25:
        return "low"
    return "worst"


def format_model_comparison(selected_models, all_results):
    """Render per-leaderboard heat-map tables comparing the selected models.

    One table is produced for every leaderboard that has at least one metric
    among the selected models. Rows are metrics, columns are the selected
    models in the given order; missing scores are shown as "—".

    Fixes over the previous version:
      * colouring was skipped whenever the lowest or highest score in a row
        was ``0`` because the bounds were tested for truthiness;
      * model, leaderboard and metric names are HTML-escaped, including the
        ``title`` attribute of the column headers.

    Args:
        selected_models: Sequence of model names to compare.
        all_results: Mapping of model name -> leaderboard name -> entry with
            a ``"results"`` mapping of metric name -> numeric score or
            ``None``.

    Returns:
        An HTML fragment as a string.
    """
    if not selected_models or not all_results:
        return '<div class="no-results"><h3>Select models to compare</h3><p>Choose models from the dropdown</p></div>'

    model_data_dict = {
        name: all_results[name] for name in selected_models if name in all_results
    }
    if not model_data_dict:
        return '<div class="no-results"><h3>No data found</h3></div>'

    all_leaderboards = sorted({lb for data in model_data_dict.values() for lb in data})

    parts = ['<div style="padding: 1rem; background: #ffffff; border-radius: 10px; border: 1px solid #e5e5e5;">']
    for leaderboard_name in all_leaderboards:
        metrics = sorted(
            {
                metric
                for data in model_data_dict.values()
                if leaderboard_name in data
                for metric in data[leaderboard_name].get("results", {})
            }
        )
        if not metrics:
            continue

        parts.append(f'<h3 style="margin: 1rem 0 0.5rem; color: #0a0a0a;">{html.escape(str(leaderboard_name))}</h3>')
        parts.append('<div style="overflow-x: auto;"><table class="heatmap-table"><thead><tr><th>Metric</th>')
        for model_name in selected_models:
            short = model_name[:20] + "…" if len(model_name) > 20 else model_name
            parts.append(f'<th title="{html.escape(model_name, quote=True)}">{html.escape(short)}</th>')
        parts.append("</tr></thead><tbody>")

        for metric_name in metrics:
            parts.append(f'<tr><td class="metric-name">{html.escape(str(metric_name))}</td>')
            scores = {
                name: model_data_dict[name][leaderboard_name].get("results", {}).get(metric_name)
                for name in selected_models
                if name in model_data_dict and leaderboard_name in model_data_dict[name]
            }
            valid = [value for value in scores.values() if value is not None]

            for model_name in selected_models:
                score = scores.get(model_name)
                if score is None:
                    parts.append('<td class="score-cell na">—</td>')
                else:
                    cls = _score_cell_class(score, valid)
                    parts.append(f'<td class="score-cell {cls}">{score:.2f}</td>')
            parts.append("</tr>")
        parts.append("</tbody></table></div>")

    parts.append("</div>")
    return "".join(parts)
def create_radar_plot(selected_models, all_results):
    """Build a radar chart of normalized scores for the selected models.

    Every (leaderboard, metric) pair with at least one non-``None`` score
    among the selected models becomes an axis labelled
    ``"<leaderboard>: <metric>"``. Scores are scaled to ``[0, 1]`` using the
    eval's ``min_score`` / ``max_score`` from ``get_eval_metadata``; when
    those are absent the lower bound defaults to ``0`` and the upper bound
    to ``1`` or ``100`` (whichever fits the observed values), raised to the
    observed maximum if that is larger.

    Changes over the previous version:
      * tolerates ``get_eval_metadata`` returning ``None`` (or metadata
        without ``"evals"``) instead of raising ``AttributeError``;
      * keeps the (leaderboard, metric) pair alongside each axis label
        instead of re-splitting the label on ``": "``, which picked the wrong
        leaderboard when its name itself contained ``": "``;
      * axis bounds are computed once per axis rather than once per model.

    Args:
        selected_models: Sequence of model names; one trace is added per
            entry, in order.
        all_results: Mapping of model name -> leaderboard name -> entry with
            a ``"results"`` mapping of metric name -> score or ``None``.

    Returns:
        A ``go.Figure``, or ``None`` when there is nothing to plot.
    """
    if not selected_models or not all_results:
        return None

    # label -> {"source": (leaderboard, metric), "scores": {model: score}}
    axes = {}
    leaderboards_involved = set()
    for model in selected_models:
        if model not in all_results:
            continue
        for lb_name, lb_data in all_results[model].items():
            leaderboards_involved.add(lb_name)
            for metric, score in lb_data.get("results", {}).items():
                if score is None:
                    continue
                axis = axes.setdefault(
                    f"{lb_name}: {metric}",
                    {"source": (lb_name, metric), "scores": {}},
                )
                axis["scores"][model] = score

    if not axes:
        return None

    meta_cache = {lb: get_eval_metadata(lb) or {} for lb in leaderboards_involved}
    categories = sorted(axes)

    # Normalization bounds depend only on the axis, not on the model.
    bounds = {}
    for cat in categories:
        lb_name, metric_name = axes[cat]["source"]
        meta = (meta_cache.get(lb_name, {}).get("evals") or {}).get(metric_name) or {}
        min_s = meta.get("min_score")
        max_s = meta.get("max_score")
        # Each axis holds at least one score, so max() is never given an empty input.
        observed_max = max(axes[cat]["scores"].values())
        if min_s is None:
            min_s = 0
        if max_s is None:
            max_s = max(100 if observed_max > 1 else 1, observed_max)
        bounds[cat] = (min_s, max_s)

    fig = go.Figure()
    for model in selected_models:
        r_values = []
        theta_values = []
        hover_texts = []
        for cat in categories:
            val = axes[cat]["scores"].get(model)
            theta_values.append(cat)
            if val is None:
                r_values.append(None)
                hover_texts.append(f"{cat}<br>N/A")
                continue

            min_s, max_s = bounds[cat]
            if max_s == min_s:
                norm_val = 1.0
            else:
                norm_val = (val - min_s) / (max_s - min_s)
            norm_val = max(0.0, min(1.0, norm_val))
            r_values.append(norm_val)
            hover_texts.append(f"{cat}<br>Score: {val:.2f} (Norm: {norm_val:.2f})")

        if r_values:
            # Repeat the first point so the polygon is closed.
            r_values.append(r_values[0])
            theta_values.append(theta_values[0])
            hover_texts.append(hover_texts[0])

        fig.add_trace(go.Scatterpolar(
            r=r_values,
            theta=theta_values,
            name=model,
            hovertext=hover_texts,
            hoverinfo="text",
            fill='toself'
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1]
            )
        ),
        showlegend=True,
        margin=dict(l=80, r=80, t=20, b=20),
        title="Model Comparison Radar (Normalized Scores)"
    )
    return fig