AMA-bench-Leaderboard / visualization.py
NorahYujieZhao
refine the model scope
fb8b1d9
"""
Visualization module for AMA-Bench leaderboard
Adapted from lmGAME_bench patterns with AMA-specific customizations
"""
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import json
import os
from typing import Dict, List, Optional, Tuple
# Constants
# Names of the individual evaluation metrics. This list is the default metric
# selection when a caller does not pass `selected_metrics`.
METRICS = ["Recall", "Causal Inference", "State Updating", "State Abstraction"]
# The individual metrics followed by the aggregate "Average" entry.
ALL_METRICS = METRICS + ["Average"]
def load_model_colors(
    filepath: str = "assets/model_colors.json",
) -> Tuple[Dict[str, str], str]:
    """
    Load color scheme for models and methods from JSON file.

    The optional 'models' and 'methods' mappings of the JSON document are
    merged into one dictionary ('methods' wins on duplicate names because it
    is applied last). Loading is best-effort: any failure is reported on
    stdout and an empty color map with the default fallback is returned, so
    chart creation never fails because of a missing/broken color file.

    Args:
        filepath: Path to color configuration JSON

    Returns:
        Tuple ``(colors, fallback)`` where ``colors`` maps model/method names
        to hex colors and ``fallback`` is the color to use for names that are
        not present in ``colors`` ('#808080' unless the file overrides it).
    """
    default_fallback = '#808080'
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            color_data = json.load(f)

        # Merge models and methods into single dictionary
        colors: Dict[str, str] = {}
        for section in ('models', 'methods'):
            if section in color_data:
                colors.update(color_data[section])

        # A missing or non-string fallback (e.g. JSON null) would break the
        # chart code downstream, so fall back to the built-in gray instead.
        fallback = color_data.get('fallback', default_fallback)
        if not isinstance(fallback, str):
            fallback = default_fallback

        return colors, fallback
    except Exception as e:
        # Deliberately broad: colors are cosmetic, never fatal.
        print(f"Warning: Could not load colors from {filepath}: {e}")
        return {}, default_fallback
def normalize_scores(values: List[float], mean: float, std: float) -> List[float]:
    """
    Normalize scores using z-score and scale to 0-100 range.
    Adapted from lmGAME_bench's normalize_values() function.

    Args:
        values: List of accuracy values (0-1 range)
        mean: Mean value for normalization
        std: Standard deviation for normalization; values below 0.05
            (including 0 and NaN) are raised to 0.05

    Returns:
        List of normalized scores (0-100 range), one per input value.
        A NaN input (missing score) stays NaN instead of being clamped.

    Formula:
        z_score = (value - mean) / std
        normalized = clamp((z_score * 30) + 35, 0, 100)
    """
    import math  # local import: the module does not import math at file level

    # Minimum std threshold to prevent extreme values when all scores are
    # (nearly) identical. Written as `not std >= ...` so a NaN std is floored
    # as well (`nan < 0.05` would be False and let NaN through).
    min_std = 0.05
    if not std >= min_std:
        std = min_std

    normalized = []
    for v in values:
        scaled = ((v - mean) / std) * 30 + 35
        if math.isnan(scaled):
            # max(0, min(100, nan)) evaluates to 100, which would present a
            # missing score as a perfect one; propagate NaN instead.
            normalized.append(float('nan'))
        else:
            normalized.append(max(0, min(100, scaled)))
    return normalized
def filter_by_category(data: Dict, category: str) -> Dict:
    """
    Restrict a dataset to the entries of one category.

    Args:
        data: Full dataset; its 'entries' value is the list to filter
        category: "All", "RAG", or "Agent Memory"

    Returns:
        ``data`` itself (same object) when category is "All"; otherwise a
        shallow copy of ``data`` whose 'entries' list keeps only the entries
        whose 'category' equals ``category``. The input is not modified.
    """
    # "All" is the no-filter sentinel: hand the caller's object straight back.
    if category == "All":
        return data

    def _in_category(entry) -> bool:
        # Entries without a 'category' key never match a concrete category.
        return entry.get('category') == category

    result = data.copy()
    result['entries'] = list(filter(_in_category, data['entries']))
    return result
def prepare_dataframe_for_visualization(
    data: Dict,
    top_n: Optional[int] = None,
    category_filter: str = "All",
    selected_metrics: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Build DataFrame with both raw and normalized scores.

    Args:
        data: Raw data from model_data.json or method_data.json
        top_n: Number of top entries to include (None = all)
        category_filter: "All", "RAG", or "Agent Memory" (for methods only)
        selected_metrics: List of metrics to include (None = all)

    Returns:
        Empty DataFrame when there are no entries (after category filtering).
        Otherwise a DataFrame sorted by 'Average' (descending) with columns:
        - Name (taken from each entry's 'method' field)
        - Category (if applicable)
        - {Metric} (raw accuracy 0-1) for each metric; 0.0 when the entry
          has no score for that metric, NaN when the stored value is
          null/non-numeric
        - Average (raw average accuracy stored in the entry)
        - norm_{Metric} (normalized 0-100) for each metric; NaN where the
          raw value is NaN
        - Avg Normalized Score (mean of normalized scores)
    """
    # Filter by category first
    if category_filter != "All":
        data = filter_by_category(data, category_filter)

    # Return empty DataFrame if no entries, so chart builders can rely on
    # `df.empty` instead of hitting a missing 'Average' column below.
    if not data['entries']:
        return pd.DataFrame()

    # Use all metrics if none specified
    if selected_metrics is None:
        selected_metrics = METRICS

    # Build basic DataFrame
    rows = []
    for entry in data['entries']:
        scores = entry['scores']
        # NOTE(review): the display name is read from 'method' for every
        # entry; confirm model_data.json uses the same key.
        row = {'Name': entry['method']}
        # Add category if present
        if entry.get('category') is not None:
            row['Category'] = entry['category']
        # Add raw scores
        for metric in selected_metrics:
            row[metric] = scores.get(metric, {}).get('accuracy', 0.0)
        # Add average
        row['Average'] = scores.get('Average', {}).get('accuracy', 0.0)
        rows.append(row)
    df = pd.DataFrame(rows)

    # Force score columns to float so a null/garbage accuracy becomes NaN
    # instead of turning the whole column into dtype=object.
    for column in dict.fromkeys([*selected_metrics, 'Average']):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Sort by average accuracy (descending)
    df = df.sort_values(by='Average', ascending=False)

    # Calculate normalization parameters from FULL dataset (before limiting),
    # ignoring missing values so one NaN does not poison mean/std.
    norm_params = {}
    for metric in selected_metrics:
        valid = df[metric].dropna().to_numpy(dtype=float)
        if valid.size:
            norm_params[metric] = (valid.mean(), valid.std())
        else:
            # No usable values: parameters are irrelevant, every normalized
            # score for this metric is masked to NaN below.
            norm_params[metric] = (0.0, 0.0)

    # Apply top_n limit if specified; copy so the column assignments below
    # write to an independent frame rather than a slice of the sorted one.
    if top_n is not None and top_n > 0:
        df = df.head(top_n).copy()

    # Add normalized scores
    for metric in selected_metrics:
        mean, std = norm_params[metric]
        raw = df[metric]
        normalized = pd.Series(
            normalize_scores(raw.tolist(), mean, std),
            index=df.index,
            dtype=float,
        )
        # Keep missing raw scores missing, whatever the normalizer returned.
        df[f'norm_{metric}'] = normalized.where(raw.notna())

    # Calculate average normalized score
    norm_cols = [f'norm_{metric}' for metric in selected_metrics]
    df['Avg Normalized Score'] = df[norm_cols].mean(axis=1)

    # Reset index
    df = df.reset_index(drop=True)
    return df
def hex_to_rgba(hex_color: str, alpha: float = 0.2) -> str:
    """
    Convert hex color to RGBA with specified alpha.

    Accepts full-length codes ("#RRGGBB"; any digits after the first six,
    such as an "#RRGGBBAA" alpha pair, are ignored) and CSS shorthand
    ("#RGB" / "#RGBA", each digit doubled). The leading '#' is optional.

    Args:
        hex_color: Hex color code (e.g., "#FF0000" or "#F00")
        alpha: Alpha value (0-1)

    Returns:
        RGBA color string, e.g. "rgba(255, 0, 0, 0.2)"

    Raises:
        ValueError: If the string is not a valid hex color
    """
    digits = hex_color.strip().lstrip('#')

    # Expand CSS shorthand: "f0a" -> "ff00aa".
    if len(digits) in (3, 4):
        digits = ''.join(ch * 2 for ch in digits)

    # Anything still shorter than RRGGBB cannot be sliced into three
    # channels; fail loudly instead of parsing a truncated channel.
    if len(digits) < 6:
        raise ValueError(f"Invalid hex color: {hex_color!r}")

    r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
    return f'rgba({r}, {g}, {b}, {alpha})'
def create_radar_chart(
    df: pd.DataFrame,
    selected_metrics: List[str],
    title: str = "Performance Across Metrics",
    color_map: Optional[Dict[str, str]] = None
) -> go.Figure:
    """
    Create radar chart with normalized scores.
    Adapted from lmGAME_bench's create_single_radar_chart().

    Args:
        df: DataFrame from prepare_dataframe_for_visualization()
        selected_metrics: List of metric names to include as axes
        title: Chart title
        color_map: Dictionary mapping names to colors; when None the colors
            are loaded via load_model_colors()

    Returns:
        Plotly Figure with radar chart. A figure containing only an
        explanatory title is returned when ``df`` is empty, when no metrics
        are selected, or when a ``norm_{metric}`` column is missing.

    Features:
        - Each axis = one metric
        - Each trace = one model/method
        - Range: 0-100 (normalized)
        - Interactive legend (click to isolate, double-click to toggle)
    """
    if df.empty:
        fig = go.Figure()
        fig.update_layout(title="No data available")
        return fig

    # Work on a real list: the polygon-closing concatenation below needs one,
    # and an empty selection would otherwise crash on `r[0]`.
    metrics = list(selected_metrics) if selected_metrics is not None else []
    if not metrics:
        fig = go.Figure()
        fig.update_layout(title="No metrics selected")
        return fig

    # Load colors if not provided
    if color_map is None:
        color_map, fallback_color = load_model_colors()
    else:
        fallback_color = '#808080'

    # Check if we have normalized columns
    norm_cols = [f'norm_{metric}' for metric in metrics]
    if not all(col in df.columns for col in norm_cols):
        fig = go.Figure()
        fig.update_layout(title="Missing normalized data")
        return fig

    fig = go.Figure()
    # Repeat the first axis at the end so the outline is a closed polygon.
    theta = metrics + [metrics[0]]

    # Add trace for each model/method
    for _, row in df.iterrows():
        name = row['Name']
        # Get normalized values for selected metrics
        r = [row[col] for col in norm_cols]
        # Get color
        color = color_map.get(name, fallback_color)
        fillcolor = hex_to_rgba(color, 0.2)
        # Add trace
        fig.add_trace(go.Scatterpolar(
            r=r + [r[0]],  # Close the polygon
            theta=theta,
            mode='lines+markers',
            fill='toself',
            name=str(name).lower(),  # Lowercase for legend
            line=dict(color=color, width=2),
            marker=dict(color=color, size=6),
            fillcolor=fillcolor,
            opacity=0.7,
            hovertemplate='<b>%{fullData.name}</b><br>%{theta}: %{r:.1f}<extra></extra>'
        ))

    # Update layout
    fig.update_layout(
        title=dict(
            text=title,
            x=0.5,
            xanchor='center',
            font=dict(size=18)
        ),
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                tickfont=dict(size=11),
                gridcolor='lightgray',
                gridwidth=1
            ),
            angularaxis=dict(
                tickfont=dict(size=12, weight='bold')
            )
        ),
        legend=dict(
            font=dict(size=11),
            title=dict(text="Models/Methods 💡", font=dict(size=12)),
            itemsizing='trace',
            x=1.05,
            y=1,
            xanchor='left',
            yanchor='top',
            bgcolor='rgba(255,255,255,0.6)',
            bordercolor='gray',
            borderwidth=1,
            # Single click isolates a trace, double click toggles it.
            itemclick="toggleothers",
            itemdoubleclick="toggle"
        ),
        height=550,
        margin=dict(l=80, r=200, t=80, b=80)
    )
    return fig
def create_group_bar_chart(
    df: pd.DataFrame,
    selected_metrics: List[str],
    top_n: int = 5,
    color_map: Optional[Dict[str, str]] = None
) -> go.Figure:
    """
    Build a grouped bar chart of the top N performers for every metric.
    Adapted from lmGAME_bench's create_group_bar_chart().

    Args:
        df: DataFrame with a 'Name' column and norm_{metric} columns
        selected_metrics: List of metrics to display
        top_n: Number of top performers to show per metric
        color_map: Dictionary mapping names to colors; loaded via
            load_model_colors() when None

    Returns:
        Plotly Figure with grouped bar chart, or a figure carrying only an
        explanatory title when ``df`` is empty or a normalized column is
        missing.

    Structure:
        - X-axis: one slot per (metric, rank), labelled "{metric}<br>#{rank}"
        - Y-axis: Normalized score (0-100)
        - Bars: one trace per model/method, traces ordered by sorted name
    """
    if df.empty:
        placeholder = go.Figure()
        placeholder.update_layout(title="No data available")
        return placeholder

    # Resolve the palette: explicit map wins, otherwise read the JSON config.
    if color_map is None:
        color_map, fallback_color = load_model_colors()
    else:
        fallback_color = '#808080'

    # Every selected metric needs its normalized column.
    required_cols = [f'norm_{metric}' for metric in selected_metrics]
    if any(col not in df.columns for col in required_cols):
        placeholder = go.Figure()
        placeholder.update_layout(title="Missing normalized data")
        return placeholder

    # Pass 1: rank the top performers per metric and record the x-slot order.
    category_order = []
    ranked_names = set()
    rankings = {}
    for metric in selected_metrics:
        norm_col = f'norm_{metric}'
        scored = df[df[norm_col].notna()].copy()
        leaders = scored.sort_values(by=norm_col, ascending=False).head(top_n)

        ranked = []
        for rank, (_, row) in enumerate(leaders.iterrows(), 1):
            slot = f"{metric}<br>#{rank}"
            ranked.append({
                'name': row['Name'],
                'score': row[norm_col],
                'x_category': slot,
                'rank': rank
            })
            category_order.append(slot)
            ranked_names.add(row['Name'])
        rankings[metric] = ranked

    # Pass 2: one bar trace per name, holding its slot in each metric where
    # it made the top N (first match only, in metric order).
    fig = go.Figure()
    for name in sorted(ranked_names):
        placements = []
        for metric in selected_metrics:
            hit = next(
                (item for item in rankings[metric] if item['name'] == name),
                None
            )
            if hit is not None:
                placements.append(hit)

        if not placements:
            continue

        fig.add_trace(go.Bar(
            name=name,
            x=[item['x_category'] for item in placements],
            y=[item['score'] for item in placements],
            marker_color=color_map.get(name, fallback_color),
            hovertemplate="<b>%{fullData.name}</b><br>Score: %{y:.1f}<extra></extra>"
        ))

    # Layout: fixed category order keeps the slots grouped metric by metric.
    legend_cfg = dict(
        font=dict(size=11),
        title=dict(text="Models/Methods 💡", font=dict(size=12)),
        itemsizing='trace',
        x=1.05,
        y=1,
        xanchor='left',
        yanchor='top',
        bgcolor='rgba(255,255,255,0.6)',
        bordercolor='gray',
        borderwidth=1
    )
    fig.update_layout(
        title=dict(
            text=f"Top {top_n} Performers by Metric",
            x=0.5,
            xanchor='center',
            font=dict(size=18)
        ),
        xaxis_title="Metrics (Ranked by Performance)",
        yaxis_title="Normalized Score",
        xaxis=dict(
            categoryorder='array',
            categoryarray=category_order,
            tickangle=0
        ),
        yaxis=dict(range=[0, 100]),
        barmode='group',
        bargap=0.15,
        bargroupgap=0.1,
        height=550,
        margin=dict(l=60, r=200, t=80, b=80),
        legend=legend_cfg
    )
    return fig
def create_horizontal_bar_chart(
    df: pd.DataFrame,
    metric: str,
    color_map: Optional[Dict[str, str]] = None
) -> go.Figure:
    """
    Build the single-metric detail view as a horizontal bar chart.
    Adapted from lmGAME_bench's create_horizontal_bar_chart().

    Args:
        df: DataFrame with a 'Name' column and a column named ``metric``
        metric: Metric name (e.g., "Recall")
        color_map: Dictionary mapping names to colors; loaded via
            load_model_colors() when None

    Returns:
        Plotly Figure with horizontal bar chart, or a figure carrying only
        an explanatory title when there is nothing to plot.

    Features:
        - Y-axis: Model/method names, rows passed in ascending score order
        - X-axis: Raw accuracy score (0-1 range)
        - Uses raw scores, not normalized
    """
    def _message_figure(text: str) -> go.Figure:
        # Placeholder figure that only carries a title.
        blank = go.Figure()
        blank.update_layout(title=text)
        return blank

    if df.empty or metric not in df.columns:
        return _message_figure(f"No data available for {metric}")

    # Resolve the palette: explicit map wins, otherwise read the JSON config.
    if color_map is None:
        color_map, fallback_color = load_model_colors()
    else:
        fallback_color = '#808080'

    # Drop rows without a score, then order ascending so the best score is
    # the last category handed to Plotly.
    # NOTE(review): Plotly lists y categories bottom-up by default, so this
    # should put the best entry at the top of the chart; confirm visually.
    has_score = df[metric].notna()
    metric_df = df[has_score].copy()
    metric_df = metric_df.sort_values(by=metric, ascending=True)

    if metric_df.empty:
        return _message_figure(f"No valid data for {metric}")

    # One color per bar, looked up by name.
    bar_colors = [
        color_map.get(label, fallback_color) for label in metric_df['Name']
    ]
    bars = go.Bar(
        y=metric_df['Name'],
        x=metric_df[metric],
        orientation='h',
        marker=dict(
            color=bar_colors,
            line=dict(color='#2c3e50', width=1)
        ),
        hovertemplate='%{y}<br>Accuracy: %{x:.4f}<extra></extra>'
    )
    fig = go.Figure(bars)

    # 30px per bar, never shorter than 400px.
    chart_height = max(400, len(metric_df) * 30)

    fig.update_layout(
        title=dict(
            text=f'{metric} - Detailed Rankings',
            x=0.5,
            xanchor='center',
            font=dict(size=18)
        ),
        xaxis_title="Accuracy",
        yaxis_title="Model/Method",
        xaxis=dict(
            range=[0, 1],
            gridcolor='#e0e0e0'
        ),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#2c3e50'),
        height=chart_height,
        margin=dict(l=200, r=40, t=80, b=60),
        showlegend=False
    )
    return fig
def create_multi_metric_bar_chart(
    df: pd.DataFrame,
    selected_metrics: List[str],
    color_map: Optional[Dict[str, str]] = None
) -> go.Figure:
    """
    Create grouped horizontal bar chart showing multiple metrics for each model/method.

    Args:
        df: DataFrame with scores
        selected_metrics: List of metrics to display (e.g., ["Recall", "Causal Inference"])
        color_map: Accepted for signature parity with the other chart
            builders but currently unused: bars are shaded from a single
            base color according to each entry's average score.

    Returns:
        Plotly Figure with grouped horizontal bar chart, or a figure
        carrying only an explanatory title when there is nothing to plot.

    Features:
        - Y-axis: Model/method names
        - X-axis: Raw accuracy score (0-1 range)
        - Multiple bars per model/method (one per selected metric)
        - Sorted by average score across selected metrics
        - Bar shade deepens with the entry's average score
    """
    if df.empty or not selected_metrics:
        fig = go.Figure()
        fig.update_layout(title="No data available")
        return fig

    # A real list is required for DataFrame column selection (a tuple would
    # be interpreted as a single column key).
    metrics = list(selected_metrics)

    # Check if all selected metrics exist
    missing_metrics = [m for m in metrics if m not in df.columns]
    if missing_metrics:
        fig = go.Figure()
        fig.update_layout(title=f"Missing metrics: {', '.join(missing_metrics)}")
        return fig

    # Filter to entries that have at least one selected metric. Copy AFTER
    # filtering so the 'avg_score' assignment below targets an independent
    # frame rather than a slice.
    metric_df = df[df[metrics].notna().any(axis=1)].copy()
    if metric_df.empty:
        fig = go.Figure()
        fig.update_layout(title="No valid data for selected metrics")
        return fig

    # Calculate average score across selected metrics for sorting. Rows go
    # in ascending so the best entry is the last category handed to Plotly.
    # NOTE(review): Plotly lists y categories bottom-up by default, so this
    # should put the best entry at the top of the chart; confirm visually.
    metric_df['avg_score'] = metric_df[metrics].mean(axis=1)
    metric_df = metric_df.sort_values(by='avg_score', ascending=True)

    # Use single base color with gradient based on capability
    base_color = "#636EFA"  # Blue color
    base_hex = base_color.lstrip('#')
    base_rgb = tuple(int(base_hex[i:i + 2], 16) for i in (0, 2, 4))

    min_score = metric_df['avg_score'].min()
    max_score = metric_df['avg_score'].max()
    # Falls back to 1 when all averages are equal, so the division below is
    # always by a positive number.
    score_range = max_score - min_score if max_score > min_score else 1

    def _shade(score) -> str:
        """Blend white toward the base color; higher score = deeper color."""
        normalized = (score - min_score) / score_range
        intensity = 0.3 + (normalized * 0.7)  # Range: 0.3 (light) to 1.0 (deep)
        r, g, b = (int(255 - (255 - channel) * intensity) for channel in base_rgb)
        return f'rgba({r}, {g}, {b}, 0.5)'  # 50% transparency

    # The shade depends only on the entry, not on the metric, so compute the
    # per-entry color list once and share it across all traces.
    colors = [_shade(score) for score in metric_df['avg_score']]

    # Create grouped bar chart: one trace per metric
    fig = go.Figure()
    for metric in metrics:
        fig.add_trace(go.Bar(
            name=metric,
            y=metric_df['Name'],
            x=metric_df[metric],
            orientation='h',
            marker=dict(
                color=colors,
                line=dict(color='#2c3e50', width=0.5)
            ),
            hovertemplate=f'<b>%{{y}}</b><br>{metric}: %{{x:.4f}}<extra></extra>'
        ))

    # Update layout
    fig.update_layout(
        title=dict(
            text=f'Detailed Comparison - {", ".join(metrics)}',
            x=0.5,
            xanchor='center',
            font=dict(size=18)
        ),
        xaxis_title="Accuracy",
        yaxis_title="Model/Method",
        xaxis=dict(
            range=[0, 1],
            gridcolor='#e0e0e0'
        ),
        barmode='group',
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#2c3e50'),
        height=max(500, len(metric_df) * 40),  # Dynamic height
        margin=dict(l=200, r=40, t=80, b=80),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
            font=dict(size=12)
        )
    )
    return fig