import streamlit as st
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# Page configuration
st.set_page_config(
    page_title="AI Model Leaderboard",
    page_icon="🏆",
    layout="wide",
    initial_sidebar_state="expanded"
)
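# Note: st.set_page_config() must be the first Streamlit call in the script,
# before any other st.* command, or Streamlit raises an exception.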
# Custom CSS with improved contrast
st.markdown("""
<style>
.main {
    background-color: #f5f7ff;
}
.stTabs [data-baseweb="tab-list"] {
    gap: 24px;
}
.stTabs [data-baseweb="tab"] {
    height: 50px;
    white-space: pre-wrap;
    background-color: #ffffff;
    border-radius: 8px 8px 0px 0px;
    gap: 1px;
    padding-top: 10px;
    padding-bottom: 10px;
    color: #333333;
}
.stTabs [aria-selected="true"] {
    background-color: #4e8df5;
    color: white;
}
div[data-testid="stVerticalBlock"] > div:nth-child(1) {
    border-bottom: 3px solid #4e8df5;
    padding-bottom: 10px;
}
div[data-testid="stSidebarContent"] > div:nth-child(1) {
    border-bottom: none;
}
div.stButton > button:first-child {
    background-color: #4e8df5;
    color: white;
    font-size: 16px;
}
.highlight {
    background-color: #ffff99;
    padding: 0px 4px;
    border-radius: 3px;
}
.card {
    background-color: #ffffff;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    margin-bottom: 20px;
}
.metric-title {
    font-size: 16px;
    color: #333333 !important;
    margin-bottom: 5px;
}
.metric-value {
    font-size: 30px;
    font-weight: bold;
    color: #333333 !important;
    margin-bottom: 10px;
}
.model-badge {
    background-color: #4e8df5;
    color: white !important;
    padding: 4px 12px;
    border-radius: 15px;
    font-weight: bold;
    display: inline-block;
    margin-right: 8px;
    margin-bottom: 8px;
}
.footer {
    text-align: center;
    margin-top: 30px;
    padding: 20px;
    border-top: 1px solid #ddd;
    color: #666;
}
/* Improved gradients for model cards with better contrast */
.openella-card {
    background: linear-gradient(135deg, #ffffff 0%, #c9e6ff 100%);
}
.minimaid-l1-card {
    background: linear-gradient(135deg, #ffffff 0%, #ffd9b3 100%);
}
.minimaid-l2-card {
    background: linear-gradient(135deg, #ffffff 0%, #c9ffc9 100%);
}
.minimaid-l3-card {
    background: linear-gradient(135deg, #ffffff 0%, #d9c9ff 100%);
}
/* Improved table styles for better contrast */
.table-text {
    color: #333333 !important;
    font-weight: 500;
}
.table-header {
    color: white !important;
    font-weight: bold;
}
</style>
""", unsafe_allow_html=True)
# Title and introduction
st.title("🏆 OpenElla & MiniMaid Models Leaderboard")
st.markdown("""
<div class="card">
    <p>This interactive dashboard showcases the performance of the OpenElla and MiniMaid model series on roleplay benchmarks.
    Explore different metrics, compare models, and discover performance insights.</p>
</div>
""", unsafe_allow_html=True)
# Create sample data based on the images provided
data = {
    "Model": ["DeepSeek-RL-3B", "Dolphin-RL-GGUF", "Hermes-3-GGUF", "MiniMaid-L1", "OpenElla-Llama-3-2B", "MiniMaid-L2", "MiniMaid-L3"],
    "Length Score": [1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 1.0],
    "Character Consistency": [1.0, 0.83, 0.83, 0.5, 0.83, 0.54, 0.54],
    "Immersion": [0.63, 0.46, 0.43, 0.13, 0.67, 0.6, 0.73],
    "Overall Score": [0.88, 0.76, 0.75, 0.51, 0.83, 0.71, 0.76],
    "Parameters (B)": [3.0, 7.0, 7.0, 1.0, 2.0, 1.5, 2.5],
    "Speed (tokens/s)": [180, 75, 70, 320, 250, 280, 220],
    "Family": ["DeepSeek", "Dolphin", "Hermes", "MiniMaid", "OpenElla", "MiniMaid", "MiniMaid"],
    "Release Date": ["2023-10", "2023-11", "2023-12", "2024-01", "2024-02", "2024-03", "2024-04"],
    "Description": [
        "General-purpose model with strong instruction following capabilities",
        "Dolphin-based model optimized for roleplay",
        "Fine-tuned Hermes model for creative tasks",
        "Lightweight model optimized for speed and efficiency",
        "Optimized for roleplay with high character consistency",
        "Improved version with better immersion capabilities",
        "Latest generation with the best immersion scores"
    ]
}
df = pd.DataFrame(data)
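# To drive the leaderboard from real benchmark output instead of the inline
# sample, the dict above could be swapped for a file load, e.g. (hypothetical
# file name, same column layout assumed):
# df = pd.read_csv("benchmark_results.csv")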
# Your models filter
your_models = ["OpenElla-Llama-3-2B", "MiniMaid-L1", "MiniMaid-L2", "MiniMaid-L3"]
# Instead of creating a separate column, we'll use the 'Family' column for coloring

# Sidebar
st.sidebar.markdown("<h2>Leaderboard Controls</h2>", unsafe_allow_html=True)

# Model selection
st.sidebar.markdown("### Models to Display")
all_models = st.sidebar.checkbox("All Models", value=True)
if all_models:
    selected_models = list(df["Model"])
else:
    selected_models = st.sidebar.multiselect(
        "Select Models",
        options=list(df["Model"]),
        default=your_models
    )

# Metric selection
st.sidebar.markdown("### Metrics to Display")
selected_metrics = st.sidebar.multiselect(
    "Select Metrics",
    options=["Length Score", "Character Consistency", "Immersion", "Overall Score"],
    default=["Overall Score"]
)

# Highlight your models
highlight_yours = st.sidebar.checkbox("Highlight Your Models", value=True)

# Sort options
sort_by = st.sidebar.selectbox(
    "Sort By",
    options=["Overall Score", "Character Consistency", "Immersion", "Length Score", "Parameters (B)", "Speed (tokens/s)"],
    index=0
)
ascending = st.sidebar.checkbox("Ascending Order", value=False)

# Filter data and ensure proper sorting
filtered_df = df[df["Model"].isin(selected_models)].sort_values(by=sort_by, ascending=ascending).reset_index(drop=True)

# Create tabs
tab1, tab2, tab3, tab4 = st.tabs(["🏆 Leaderboard", "📊 Performance Charts", "🔍 Model Details", "📖 About"])
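# st.tabs() returns one container per label; each is used below as a context
# manager so that everything rendered inside `with tabN:` lands in that tab.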
# Tab 1: Leaderboard
with tab1:
    st.markdown("## 🏆 Model Rankings")

    # Create a more visually appealing table with Plotly - using improved contrast
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=["Rank", "Model", "Overall Score", "Character Consistency", "Immersion", "Length Score"],
            fill_color='#4e8df5',
            align='center',
            font=dict(color='white', size=16),
            height=40
        ),
        cells=dict(
            values=[
                list(range(1, len(filtered_df) + 1)),
                filtered_df["Model"],
                filtered_df["Overall Score"].apply(lambda x: f"{x:.2f}"),
                filtered_df["Character Consistency"].apply(lambda x: f"{x:.2f}"),
                filtered_df["Immersion"].apply(lambda x: f"{x:.2f}"),
                filtered_df["Length Score"].apply(lambda x: f"{x:.2f}")
            ],
            # A single nested list of per-row colors is broadcast across all columns
            fill_color=[['#e6f7ff' if model in your_models and highlight_yours else '#f0f0f0' for model in filtered_df["Model"]]],
            align='center',
            font=dict(color='#333333', size=14),
            height=35
        )
    )])
    fig.update_layout(
        margin=dict(l=0, r=0, t=0, b=0),
        height=min(100 + len(filtered_df) * 35, 500)
    )
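    # Height heuristic: ~100px of header plus 35px per row, capped at 500px
    # so a long leaderboard doesn't dominate the page.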
    st.plotly_chart(fig, use_container_width=True)

    # Performance overview
    st.markdown("## 🎯 Performance Overview")
    if "Overall Score" in selected_metrics:
        fig = px.bar(
            filtered_df,
            x="Model",
            y="Overall Score",
            color="Family" if highlight_yours else None,
            color_discrete_map={"OpenElla": "#4e8df5", "MiniMaid": "#f5854e", "DeepSeek": "#666666", "Dolphin": "#666666", "Hermes": "#666666"},
            text_auto='.2f',
            title="Overall Roleplay Performance",
            height=400
        )
        fig.update_traces(textposition='outside')
        fig.update_layout(
            xaxis_title="",
            yaxis_title="Score",
            yaxis=dict(range=[0, 1.1]),
            plot_bgcolor="white",
            legend_title_text="",
            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5)
        )
        st.plotly_chart(fig, use_container_width=True)
    # Metrics comparison (Overall Score is already charted above, so it is
    # filtered out here; otherwise it would claim an empty column)
    other_metrics = [m for m in selected_metrics if m != "Overall Score"]
    if other_metrics:
        cols = st.columns(len(other_metrics))
        for i, metric in enumerate(other_metrics):
            with cols[i]:
                fig = px.bar(
                    filtered_df,
                    x="Model",
                    y=metric,
                    color="Family" if highlight_yours else None,
                    color_discrete_map={"OpenElla": "#4e8df5", "MiniMaid": "#f5854e", "DeepSeek": "#666666", "Dolphin": "#666666", "Hermes": "#666666"},
                    text_auto='.2f',
                    title=f"{metric}",
                    height=350
                )
                fig.update_traces(textposition='outside')
                fig.update_layout(
                    xaxis_title="",
                    yaxis_title="Score",
                    yaxis=dict(range=[0, 1.1]),
                    plot_bgcolor="white",
                    showlegend=False
                )
                st.plotly_chart(fig, use_container_width=True)
# Tab 2: Performance Charts
with tab2:
    st.markdown("## 📊 Performance Charts")

    # Radar chart for model comparison
    st.markdown("### Model Comparison (Radar Chart)")
    fig = go.Figure()
    categories = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]

    # Add traces for each model
    for model in filtered_df["Model"]:
        model_data = filtered_df[filtered_df["Model"] == model]
        values = model_data[categories].values.flatten().tolist()
        # Close the radar polygon by repeating the first value
        values = values + [values[0]]
        is_your_model = model in your_models
        line_width = 3 if is_your_model else 1.5
        opacity = 0.9 if is_your_model else 0.6
        fig.add_trace(go.Scatterpolar(
            r=values,
            theta=categories + [categories[0]],
            fill='toself',
            name=model,
            line=dict(width=line_width),
            opacity=opacity
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1]
            )
        ),
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
        height=600
    )
    st.plotly_chart(fig, use_container_width=True)

    # Scatter plot: Parameters vs Performance (bubble size encodes generation speed)
    st.markdown("### Efficiency Analysis")
    fig = px.scatter(
        filtered_df,
        x="Parameters (B)",
        y="Overall Score",
        size="Speed (tokens/s)",
        color="Family",
        hover_name="Model",
        text="Model",
        size_max=40,
        height=500,
        color_discrete_map={"OpenElla": "#4e8df5", "MiniMaid": "#f5854e", "DeepSeek": "#666666", "Dolphin": "#666666", "Hermes": "#666666"}
    )
    fig.update_traces(
        textposition='top center',
        marker=dict(line=dict(width=2, color='DarkSlateGrey')),
    )
    fig.update_layout(
        title="Model Size vs Performance",
        xaxis_title="Parameters (Billions)",
        yaxis_title="Overall Score",
        yaxis=dict(range=[0.4, 1.0]),
        legend_title="Model Family",
        plot_bgcolor="white"
    )
    st.plotly_chart(fig, use_container_width=True)

    # Heatmap of all metrics - improved color scale for better readability
    st.markdown("### Metrics Heatmap")
    metrics = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
    heatmap_df = filtered_df.set_index("Model")[metrics]
    fig = px.imshow(
        heatmap_df.values,
        x=metrics,
        y=heatmap_df.index,
        color_continuous_scale="Blues",  # Deeper blues for better contrast
        labels=dict(x="Metric", y="Model", color="Score"),
        text_auto=".2f",
        height=500
    )
    fig.update_layout(
        xaxis_title="",
        yaxis_title="",
        coloraxis_colorbar=dict(title="Score"),
        plot_bgcolor="white"
    )
    # Ensure the cell labels (set via text_auto above) stay visible; overriding
    # texttemplate with "%{text}" would blank them, since `text` is never set
    fig.update_traces(textfont={"color": "black"})
    st.plotly_chart(fig, use_container_width=True)
# Tab 3: Model Details
with tab3:
    st.markdown("## 🔍 Model Details")

    # OpenElla card with improved contrast
    if "OpenElla-Llama-3-2B" in selected_models:
        st.markdown("""
        <div class="card openella-card">
            <h3>OpenElla-Llama-3-2B</h3>
            <div class="model-badge" style="color: white;">OpenElla</div>
            <div class="model-badge" style="color: white;">2B Parameters</div>
            <div class="model-badge" style="color: white;">Released: February 2024</div>
            <hr>
            <p>OpenElla-Llama-3-2B is optimized for roleplay, with excellent character consistency
            and good immersion capabilities. Built on the Llama 3.2 architecture, this model
            delivers impressively balanced performance despite its compact 2B parameter size.</p>
            <div style="display: flex; margin-top: 15px;">
                <div style="flex: 1; text-align: center;">
                    <div class="metric-title" style="color: #333333;">Overall Score</div>
                    <div class="metric-value" style="color: #333333;">0.83</div>
                </div>
                <div style="flex: 1; text-align: center;">
                    <div class="metric-title" style="color: #333333;">Character Consistency</div>
                    <div class="metric-value" style="color: #333333;">0.83</div>
                </div>
                <div style="flex: 1; text-align: center;">
                    <div class="metric-title" style="color: #333333;">Immersion</div>
                    <div class="metric-value" style="color: #333333;">0.67</div>
                </div>
            </div>
        </div>
        """, unsafe_allow_html=True)
    # MiniMaid model cards with improved contrast
    if "MiniMaid-L1" in selected_models:
        st.markdown("""
        <div class="card minimaid-l1-card">
            <h3>MiniMaid-L1</h3>
            <div class="model-badge" style="color: white;">MiniMaid</div>
            <div class="model-badge" style="color: white;">1B Parameters</div>
            <div class="model-badge" style="color: white;">Released: January 2024</div>
            <hr>
            <p>MiniMaid-L1 is the first generation of the MiniMaid series, designed for maximum speed and efficiency.
            With only 1B parameters, it's optimized for low-resource environments while still maintaining
            good length handling capabilities.</p>
            <div style="display: flex; margin-top: 15px;">
                <div style="flex: 1; text-align: center;">
                    <div class="metric-title" style="color: #333333;">Overall Score</div>
                    <div class="metric-value" style="color: #333333;">0.51</div>
                </div>
                <div style="flex: 1; text-align: center;">
                    <div class="metric-title" style="color: #333333;">Character Consistency</div>
                    <div class="metric-value" style="color: #333333;">0.50</div>
                </div>
                <div style="flex: 1; text-align: center;">
                    <div class="metric-title" style="color: #333333;">Speed</div>
                    <div class="metric-value" style="color: #333333;">320 t/s</div>
                </div>
            </div>
        </div>
        """, unsafe_allow_html=True)
if "MiniMaid-L2" in selected_models: | |
st.markdown(""" | |
<div class="card minimaid-l2-card"> | |
<h3>MiniMaid-L2</h3> | |
<div class="model-badge" style="color: white;">MiniMaid</div> | |
<div class="model-badge" style="color: white;">1B Parameters</div> | |
<div class="model-badge" style="color: white;">Released: March 2024</div> | |
<hr> | |
<p>MiniMaid-L2 represents a significant improvement over L1, with enhanced immersion capabilities | |
and better overall roleplay performance. The model retains excellent efficiency while delivering | |
more engaging and consistent character portrayals.</p> | |
<div style="display: flex; margin-top: 15px;"> | |
<div style="flex: 1; text-align: center;"> | |
<div class="metric-title" style="color: #333333;">Overall Score</div> | |
<div class="metric-value" style="color: #333333;">0.71</div> | |
</div> | |
<div style="flex: 1; text-align: center;"> | |
<div class="metric-title" style="color: #333333;">Immersion</div> | |
<div class="metric-value" style="color: #333333;">0.60</div> | |
</div> | |
<div style="flex: 1; text-align: center;"> | |
<div class="metric-title" style="color: #333333;">Speed</div> | |
<div class="metric-value" style="color: #333333;">280 t/s</div> | |
</div> | |
</div> | |
</div> | |
""", unsafe_allow_html=True) | |
if "MiniMaid-L3" in selected_models: | |
st.markdown(""" | |
<div class="card minimaid-l3-card"> | |
<h3>MiniMaid-L3</h3> | |
<div class="model-badge" style="color: white;">MiniMaid</div> | |
<div class="model-badge" style="color: white;">1B Parameters</div> | |
<div class="model-badge" style="color: white;">Released: April 2024</div> | |
<hr> | |
<p>MiniMaid-L3 is the latest and most advanced model in the MiniMaid series. With 1B parameters, | |
it achieves the highest immersion score of all models while maintaining excellent length handling. | |
This model represents the pinnacle of the MiniMaid series' development.</p> | |
<div style="display: flex; margin-top: 15px;"> | |
<div style="flex: 1; text-align: center;"> | |
<div class="metric-title" style="color: #333333;">Overall Score</div> | |
<div class="metric-value" style="color: #333333;">0.76</div> | |
</div> | |
<div style="flex: 1; text-align: center;"> | |
<div class="metric-title" style="color: #333333;">Immersion</div> | |
<div class="metric-value" style="color: #333333;">0.73</div> | |
</div> | |
<div style="flex: 1; text-align: center;"> | |
<div class="metric-title" style="color: #333333;">Length Score</div> | |
<div class="metric-value" style="color: #333333;">1.00</div> | |
</div> | |
</div> | |
</div> | |
""", unsafe_allow_html=True) | |
    # Other models with improved contrast
    other_models = [m for m in selected_models if m not in your_models]
    if other_models:
        st.markdown("### Other Models")
        cols = st.columns(min(3, len(other_models)))
        for i, model in enumerate(other_models):
            model_data = df[df["Model"] == model].iloc[0]
            with cols[i % min(3, len(other_models))]:
                st.markdown(f"""
                <div class="card" style="background-color: #f0f0f0;">
                    <h4>{model}</h4>
                    <div class="model-badge" style="color: white !important; background-color: #666666;">{model_data['Family']}</div>
                    <div class="model-badge" style="color: white !important; background-color: #666666;">{model_data['Parameters (B)']}B</div>
                    <p style="color: #333333;">{model_data['Description']}</p>
                    <p style="color: #333333;"><b>Overall Score:</b> {model_data['Overall Score']:.2f}</p>
                </div>
                """, unsafe_allow_html=True)
# Tab 4: About
with tab4:
    st.markdown("## 📖 About This Leaderboard")
    st.markdown("""
    <div class="card">
        <h3>Understanding the Metrics</h3>
        <p><b>Length Score</b>: Measures the model's ability to generate appropriately lengthy responses without being too verbose or too brief.</p>
        <p><b>Character Consistency</b>: Evaluates how well the model maintains character personality, backstory, and traits throughout the conversation.</p>
        <p><b>Immersion</b>: Assesses the model's ability to create an engaging, believable experience that draws users into the roleplay scenario.</p>
        <p><b>Overall Score</b>: A weighted combination of the above metrics, representing the model's general roleplay capability.</p>
    </div>
    """, unsafe_allow_html=True)
st.markdown(""" | |
<div class="card"> | |
<h3>Evaluation Methodology</h3> | |
<p>Models were evaluated using a comprehensive roleplay benchmark suite consisting of:</p> | |
<ul> | |
<li>20 diverse character archetypes</li> | |
<li>15 different scenarios per character</li> | |
<li>5 conversation turns per scenario</li> | |
</ul> | |
<p>Responses were scored by a panel of expert evaluators using standardized rubrics for each metric.</p> | |
</div> | |
""", unsafe_allow_html=True) | |
st.markdown(""" | |
<div class="card"> | |
<h3>MiniMaid Series Development</h3> | |
<p>The MiniMaid series represents an evolution in efficient roleplay models:</p> | |
<ul> | |
<li><b>MiniMaid-L1</b>: Initial release focusing on speed and efficiency</li> | |
<li><b>MiniMaid-L2</b>: Improved version with better immersion and consistency</li> | |
<li><b>MiniMaid-L3</b>: Latest generation with enhanced immersion capabilities</li> | |
</ul> | |
<p>Each iteration builds upon the strengths of the previous version while addressing identified weaknesses.</p> | |
</div> | |
""", unsafe_allow_html=True) | |
st.markdown(""" | |
<div class="card"> | |
<h3>OpenElla Development</h3> | |
<p>OpenElla represents a parallel development track focused on maximizing roleplay quality in a compact model size.</p> | |
<p>Built on the Llama 3 architecture, OpenElla achieves exceptional character consistency and overall performance | |
despite its relatively small 2B parameter size.</p> | |
</div> | |
""", unsafe_allow_html=True) | |
# Footer with better visibility
st.markdown("""
<div class="footer">
    <p style="color: #444444;">Created with ❤️ for Hugging Face Spaces | Last updated: April 2025</p>
</div>
""", unsafe_allow_html=True)