Spaces:

N-Bot-Int
/

RP_BENCHMARK

Running

App Files Files Community

RP_BENCHMARK / app.py

ItsMeDevRoland

Update app.py

ff74120 verified 3 months ago

raw

history blame contribute delete

23 kB

	import streamlit as st
	import pandas as pd
	import plotly.graph_objects as go
	import plotly.express as px
	from plotly.subplots import make_subplots
	import numpy as np

	# Page configuration: browser-tab title and icon, wide layout, sidebar expanded on load.
	page_settings = {
	    "page_title": "AI Model Leaderboard",
	    "page_icon": "🏆",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**page_settings)

	# Global stylesheet injected once as raw HTML. It styles the tab strip, buttons,
	# the reusable .card/.metric-*/.model-badge/.footer classes and the per-model
	# gradient cards used by the HTML snippets rendered further down the script.
	custom_css = """
	<style>
	.main {
	background-color: #f5f7ff;
	}
	.stTabs [data-baseweb="tab-list"] {
	gap: 24px;
	}
	.stTabs [data-baseweb="tab"] {
	height: 50px;
	white-space: pre-wrap;
	background-color: #ffffff;
	border-radius: 8px 8px 0px 0px;
	gap: 1px;
	padding-top: 10px;
	padding-bottom: 10px;
	color: #333333;
	}
	.stTabs [aria-selected="true"] {
	background-color: #4e8df5;
	color: white;
	}
	div[data-testid="stVerticalBlock"] > div:nth-child(1) {
	border-bottom: 3px solid #4e8df5;
	padding-bottom: 10px;
	}
	div[data-testid="stSidebarContent"] > div:nth-child(1) {
	border-bottom: none;
	}
	div.stButton > button:first-child {
	background-color: #4e8df5;
	color: white;
	font-size: 16px;
	}
	.highlight {
	background-color: #ffff99;
	padding: 0px 4px;
	border-radius: 3px;
	}
	.card {
	background-color: #ffffff;
	border-radius: 10px;
	padding: 20px;
	box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
	margin-bottom: 20px;
	}
	.metric-title {
	font-size: 16px;
	color: #333333 !important;
	margin-bottom: 5px;
	}
	.metric-value {
	font-size: 30px;
	font-weight: bold;
	color: #333333 !important;
	margin-bottom: 10px;
	}
	.model-badge {
	background-color: #4e8df5;
	color: white !important;
	padding: 4px 12px;
	border-radius: 15px;
	font-weight: bold;
	display: inline-block;
	margin-right: 8px;
	margin-bottom: 8px;
	}
	.footer {
	text-align: center;
	margin-top: 30px;
	padding: 20px;
	border-top: 1px solid #ddd;
	color: #666;
	}
	/* Improved gradients for model cards with better contrast */
	.openella-card {
	background: linear-gradient(135deg, #ffffff 0%, #c9e6ff 100%);
	}
	.minimaid-l1-card {
	background: linear-gradient(135deg, #ffffff 0%, #ffd9b3 100%);
	}
	.minimaid-l2-card {
	background: linear-gradient(135deg, #ffffff 0%, #c9ffc9 100%);
	}
	.minimaid-l3-card {
	background: linear-gradient(135deg, #ffffff 0%, #d9c9ff 100%);
	}
	/* Improved table styles for better contrast */
	.table-text {
	color: #333333 !important;
	font-weight: 500;
	}
	.table-header {
	color: white !important;
	font-weight: bold;
	}
	</style>
	"""
	st.markdown(custom_css, unsafe_allow_html=True)

	# Page heading, followed by a short introductory blurb rendered inside a .card div.
	st.title("🏆 OpenElla & MiniMaid Models Leaderboard")
	intro_card_html = """
	<div class="card">
	<p>This interactive dashboard showcases the performance of OpenElla and MiniMaid model series on roleplay benchmarks.
	Explore different metrics, compare models, and discover performance insights.</p>
	</div>
	"""
	st.markdown(intro_card_html, unsafe_allow_html=True)

	# Sample benchmark data (per the original note: based on the images provided).
	# The table is written one model per row for readability and then transposed
	# into the column-oriented dict that the DataFrame is built from.
	column_names = (
	    "Model",
	    "Length Score",
	    "Character Consistency",
	    "Immersion",
	    "Overall Score",
	    "Parameters (B)",
	    "Speed (tokens/s)",
	    "Family",
	    "Release Date",
	    "Description",
	)
	model_rows = (
	    ("DeepSeek-RL-3B", 1.0, 1.0, 0.63, 0.88, 3.0, 180, "DeepSeek", "2023-10",
	     "General-purpose model with strong instruction following capabilities"),
	    ("Dolphin-RL-GGUF", 1.0, 0.83, 0.46, 0.76, 7.0, 75, "Dolphin", "2023-11",
	     "Dolphin-based model optimized for roleplay"),
	    ("Hermes-3-GGUF", 1.0, 0.83, 0.43, 0.75, 7.0, 70, "Hermes", "2023-12",
	     "Fine-tuned Hermes model for creative tasks"),
	    ("MiniMaid-L1", 0.9, 0.5, 0.13, 0.51, 1.0, 320, "MiniMaid", "2024-01",
	     "Lightweight model optimized for speed and efficiency"),
	    ("OpenElla-Llama-3-2B", 1.0, 0.83, 0.67, 0.83, 2.0, 250, "OpenElla", "2024-02",
	     "Optimized for roleplay with high character consistency"),
	    ("MiniMaid-L2", 1.0, 0.54, 0.6, 0.71, 1.5, 280, "MiniMaid", "2024-03",
	     "Improved version with better immersion capabilities"),
	    ("MiniMaid-L3", 1.0, 0.54, 0.73, 0.76, 2.5, 220, "MiniMaid", "2024-04",
	     "Latest generation with the best immersion scores"),
	)

	# zip(*rows) turns the row tuples into one sequence per column, in column order.
	data = dict(zip(column_names, map(list, zip(*model_rows))))

	df = pd.DataFrame(data)

	# Models treated as "yours": they are the default multiselect choice and are the
	# ones emphasised / given dedicated cards elsewhere in the script.
	your_models = ["OpenElla-Llama-3-2B", "MiniMaid-L1", "MiniMaid-L2", "MiniMaid-L3"]
	# No separate flag column is created; the 'Family' column is used for coloring.

	# ---- Sidebar -------------------------------------------------------------
	sidebar = st.sidebar
	sidebar.markdown("<h2>Leaderboard Controls</h2>", unsafe_allow_html=True)

	# Model selection: either every model, or an explicit pick that defaults to your_models.
	sidebar.markdown("### Models to Display")
	model_names = df["Model"].tolist()
	all_models = sidebar.checkbox("All Models", value=True)
	selected_models = (
	    model_names
	    if all_models
	    # The multiselect widget is only created when "All Models" is unticked.
	    else sidebar.multiselect("Select Models", options=model_names, default=your_models)
	)

	# Metric selection
	sidebar.markdown("### Metrics to Display")
	metric_choices = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
	selected_metrics = sidebar.multiselect(
	    "Select Metrics",
	    options=metric_choices,
	    default=["Overall Score"],
	)

	# Toggle for emphasising your_models
	highlight_yours = sidebar.checkbox("Highlight Your Models", value=True)

	# Sort column and direction
	sortable_columns = [
	    "Overall Score",
	    "Character Consistency",
	    "Immersion",
	    "Length Score",
	    "Parameters (B)",
	    "Speed (tokens/s)",
	]
	sort_by = sidebar.selectbox("Sort By", options=sortable_columns, index=0)

	ascending = sidebar.checkbox("Ascending Order", value=False)

	# Keep only the chosen models, order them, and renumber rows from 0 so the
	# positional index can be used as a rank.
	chosen_rows = df[df["Model"].isin(selected_models)]
	ordered_rows = chosen_rows.sort_values(by=sort_by, ascending=ascending)
	filtered_df = ordered_rows.reset_index(drop=True)

	# Create the four top-level tabs; each one is filled by its own `with` block below.
	tab_labels = ["📊 Leaderboard", "📈 Performance Charts", "🔍 Model Details", "📘 About"]
	tab1, tab2, tab3, tab4 = st.tabs(tab_labels)

	# Tab 1: Leaderboard
	# Renders a ranked table of the filtered models, an "Overall Score" bar chart
	# (when that metric is selected) and one bar chart per additional selected metric.
	with tab1:
	    st.markdown("## 📊 Model Rankings")

	    # One colour per model family, shared by every bar chart in this tab.
	    family_colors = {
	        "OpenElla": "#4e8df5",
	        "MiniMaid": "#f5854e",
	        "DeepSeek": "#666666",
	        "Dolphin": "#666666",
	        "Hermes": "#666666",
	    }
	    # Colour bars by family only when highlighting is switched on.
	    bar_color = "Family" if highlight_yours else None
	    score_columns = ["Overall Score", "Character Consistency", "Immersion", "Length Score"]

	    if filtered_df.empty:
	        # Nothing to rank or plot when the model selection is empty.
	        st.info("No models selected. Pick at least one model in the sidebar to see the leaderboard.")
	    else:
	        # Ranking table. filtered_df is already sorted, so rank is the row position + 1.
	        row_colors = [
	            '#e6f7ff' if highlight_yours and model in your_models else '#f0f0f0'
	            for model in filtered_df["Model"]
	        ]
	        table_columns = [list(range(1, len(filtered_df) + 1)), filtered_df["Model"]]
	        table_columns += [filtered_df[col].map("{:.2f}".format) for col in score_columns]

	        fig = go.Figure(data=[go.Table(
	            header=dict(
	                values=["Rank", "Model"] + score_columns,
	                fill_color='#4e8df5',
	                align='center',
	                font=dict(color='white', size=16),
	                height=40,
	            ),
	            cells=dict(
	                values=table_columns,
	                # A single per-row colour list, wrapped in an outer list as in the original.
	                fill_color=[row_colors],
	                align='center',
	                font=dict(color='#333333', size=14),
	                height=35,
	            ),
	        )])

	        fig.update_layout(
	            margin=dict(l=0, r=0, t=0, b=0),
	            height=min(100 + len(filtered_df) * 35, 500),
	        )

	        st.plotly_chart(fig, use_container_width=True)

	        # Performance overview
	        st.markdown("## 💯 Performance Overview")

	        if "Overall Score" in selected_metrics:
	            fig = px.bar(
	                filtered_df,
	                x="Model",
	                y="Overall Score",
	                color=bar_color,
	                color_discrete_map=family_colors,
	                text_auto='.2f',
	                title="Overall Roleplay Performance",
	                height=400,
	            )
	            fig.update_traces(textposition='outside')
	            fig.update_layout(
	                xaxis_title="",
	                yaxis_title="Score",
	                yaxis=dict(range=[0, 1.1]),
	                plot_bgcolor="white",
	                legend_title_text="",
	                legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
	            )
	            st.plotly_chart(fig, use_container_width=True)

	        # Metrics comparison: one column per selected metric other than
	        # "Overall Score" (already shown above). Sizing the columns from this
	        # reduced list avoids an empty placeholder column, and the charts are
	        # no longer suppressed when all four metrics are selected.
	        extra_metrics = [metric for metric in selected_metrics if metric != "Overall Score"]
	        if extra_metrics:
	            cols = st.columns(len(extra_metrics))
	            for col, metric in zip(cols, extra_metrics):
	                with col:
	                    fig = px.bar(
	                        filtered_df,
	                        x="Model",
	                        y=metric,
	                        color=bar_color,
	                        color_discrete_map=family_colors,
	                        text_auto='.2f',
	                        title=f"{metric}",
	                        height=350,
	                    )
	                    fig.update_traces(textposition='outside')
	                    fig.update_layout(
	                        xaxis_title="",
	                        yaxis_title="Score",
	                        yaxis=dict(range=[0, 1.1]),
	                        plot_bgcolor="white",
	                        showlegend=False,
	                    )
	                    st.plotly_chart(fig, use_container_width=True)

	# Tab 2: Performance Charts
	# Renders three views of filtered_df: a radar chart of the four score columns,
	# a parameters-vs-score scatter plot sized by speed, and a heatmap of all scores.
	with tab2:
	    st.markdown("## 📈 Performance Charts")

	    score_columns = ["Length Score", "Character Consistency", "Immersion", "Overall Score"]
	    family_colors = {
	        "OpenElla": "#4e8df5",
	        "MiniMaid": "#f5854e",
	        "DeepSeek": "#666666",
	        "Dolphin": "#666666",
	        "Hermes": "#666666",
	    }

	    if filtered_df.empty:
	        # Nothing to plot when the model selection is empty.
	        st.info("No models selected. Pick at least one model in the sidebar to see the charts.")
	    else:
	        # Radar chart for model comparison
	        st.markdown("### Model Comparison (Radar Chart)")

	        fig = go.Figure()

	        # Repeat the first axis at the end so each polygon is drawn closed.
	        closed_axes = score_columns + [score_columns[0]]

	        for _, row in filtered_df.iterrows():
	            model = row["Model"]
	            values = [float(row[col]) for col in score_columns]
	            values.append(values[0])

	            # Emphasise your models only while the sidebar highlight toggle is on,
	            # consistent with how the leaderboard tab uses the same toggle.
	            is_your_model = highlight_yours and model in your_models
	            line_width = 3 if is_your_model else 1.5
	            opacity = 0.9 if is_your_model else 0.6

	            fig.add_trace(go.Scatterpolar(
	                r=values,
	                theta=closed_axes,
	                fill='toself',
	                name=model,
	                line=dict(width=line_width),
	                opacity=opacity,
	            ))

	        fig.update_layout(
	            polar=dict(
	                radialaxis=dict(
	                    visible=True,
	                    range=[0, 1],
	                )
	            ),
	            showlegend=True,
	            legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
	            height=600,
	        )

	        st.plotly_chart(fig, use_container_width=True)

	        # Scatter plot: Parameters vs Performance
	        st.markdown("### Efficiency Analysis")

	        fig = px.scatter(
	            filtered_df,
	            x="Parameters (B)",
	            y="Overall Score",
	            size="Speed (tokens/s)",
	            color="Family",
	            hover_name="Model",
	            text="Model",
	            size_max=40,
	            height=500,
	            color_discrete_map=family_colors,
	        )

	        fig.update_traces(
	            textposition='top center',
	            marker=dict(line=dict(width=2, color='DarkSlateGrey')),
	        )

	        fig.update_layout(
	            title="Model Size vs Performance",
	            xaxis_title="Parameters (Billions)",
	            yaxis_title="Overall Score",
	            yaxis=dict(range=[0.4, 1.0]),
	            legend_title="Model Family",
	            plot_bgcolor="white",
	        )

	        st.plotly_chart(fig, use_container_width=True)

	        # Heatmap of all metrics
	        st.markdown("### Metrics Heatmap")

	        heatmap_df = filtered_df.set_index("Model")[score_columns]

	        fig = px.imshow(
	            heatmap_df.values,
	            x=score_columns,
	            y=heatmap_df.index,
	            color_continuous_scale="Blues",
	            labels=dict(x="Metric", y="Model", color="Score"),
	            text_auto=".2f",
	            height=500,
	        )

	        fig.update_layout(
	            xaxis_title="",
	            yaxis_title="",
	            coloraxis_colorbar=dict(title="Score"),
	            plot_bgcolor="white",
	        )

	        # Only the font colour is overridden here. text_auto=".2f" already
	        # installs the cell text template; replacing it with "%{text}" would
	        # discard the two-decimal labels because the trace has no `text` array.
	        fig.update_traces(textfont={"color": "black"})

	        st.plotly_chart(fig, use_container_width=True)

	# Tab 3: Model Details
	# Shows a hand-written HTML card for each of the four featured models that is
	# currently selected, then compact data-driven cards for any other selected model.
	with tab3:
	    st.markdown("## 🔍 Model Details")

	    # (model name, card HTML) pairs, rendered in this order when the model is selected.
	    # NOTE(review): the figures in these cards are hard-coded and do not all agree
	    # with the DataFrame above (OpenElla: "3B" here vs 2.0 in "Parameters (B)";
	    # MiniMaid-L2 / L3: "1B" here vs 1.5 / 2.5 in the table). Confirm which source
	    # is correct before changing either.
	    featured_cards = [
	        ("OpenElla-Llama-3-2B", """
	<div class="card openella-card">
	<h3>OpenElla-Llama-3-2B</h3>
	<div class="model-badge" style="color: white;">OpenElla</div>
	<div class="model-badge" style="color: white;">3B Parameters</div>
	<div class="model-badge" style="color: white;">Released: February 2024</div>
	<hr>
	<p>OpenElla-Llama-3-2B is optimized for roleplay with excellent character consistency
	and good immersion capabilities. Built on the Llama 3.2 architecture, this model
	delivers impressively balanced performance despite its compact 3B parameter size.</p>
	<div style="display: flex; margin-top: 15px;">
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Overall Score</div>
	<div class="metric-value" style="color: #333333;">0.83</div>
	</div>
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Character Consistency</div>
	<div class="metric-value" style="color: #333333;">0.83</div>
	</div>
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Immersion</div>
	<div class="metric-value" style="color: #333333;">0.67</div>
	</div>
	</div>
	</div>
	"""),
	        ("MiniMaid-L1", """
	<div class="card minimaid-l1-card">
	<h3>MiniMaid-L1</h3>
	<div class="model-badge" style="color: white;">MiniMaid</div>
	<div class="model-badge" style="color: white;">1B Parameters</div>
	<div class="model-badge" style="color: white;">Released: January 2024</div>
	<hr>
	<p>MiniMaid-L1 is the first generation of the MiniMaid series, designed for maximum speed and efficiency.
	With only 1B parameters, it's optimized for low-resource environments while still maintaining
	good length handling capabilities.</p>
	<div style="display: flex; margin-top: 15px;">
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Overall Score</div>
	<div class="metric-value" style="color: #333333;">0.51</div>
	</div>
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Character Consistency</div>
	<div class="metric-value" style="color: #333333;">0.50</div>
	</div>
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Speed</div>
	<div class="metric-value" style="color: #333333;">320 t/s</div>
	</div>
	</div>
	</div>
	"""),
	        ("MiniMaid-L2", """
	<div class="card minimaid-l2-card">
	<h3>MiniMaid-L2</h3>
	<div class="model-badge" style="color: white;">MiniMaid</div>
	<div class="model-badge" style="color: white;">1B Parameters</div>
	<div class="model-badge" style="color: white;">Released: March 2024</div>
	<hr>
	<p>MiniMaid-L2 represents a significant improvement over L1, with enhanced immersion capabilities
	and better overall roleplay performance. The model retains excellent efficiency while delivering
	more engaging and consistent character portrayals.</p>
	<div style="display: flex; margin-top: 15px;">
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Overall Score</div>
	<div class="metric-value" style="color: #333333;">0.71</div>
	</div>
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Immersion</div>
	<div class="metric-value" style="color: #333333;">0.60</div>
	</div>
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Speed</div>
	<div class="metric-value" style="color: #333333;">280 t/s</div>
	</div>
	</div>
	</div>
	"""),
	        ("MiniMaid-L3", """
	<div class="card minimaid-l3-card">
	<h3>MiniMaid-L3</h3>
	<div class="model-badge" style="color: white;">MiniMaid</div>
	<div class="model-badge" style="color: white;">1B Parameters</div>
	<div class="model-badge" style="color: white;">Released: April 2024</div>
	<hr>
	<p>MiniMaid-L3 is the latest and most advanced model in the MiniMaid series. With 1B parameters,
	it achieves the highest immersion score of all models while maintaining excellent length handling.
	This model represents the pinnacle of the MiniMaid series' development.</p>
	<div style="display: flex; margin-top: 15px;">
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Overall Score</div>
	<div class="metric-value" style="color: #333333;">0.76</div>
	</div>
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Immersion</div>
	<div class="metric-value" style="color: #333333;">0.73</div>
	</div>
	<div style="flex: 1; text-align: center;">
	<div class="metric-title" style="color: #333333;">Length Score</div>
	<div class="metric-value" style="color: #333333;">1.00</div>
	</div>
	</div>
	</div>
	"""),
	    ]

	    for featured_name, featured_html in featured_cards:
	        if featured_name in selected_models:
	            st.markdown(featured_html, unsafe_allow_html=True)

	    # Remaining selected models get a compact grey card built from their DataFrame row.
	    other_models = [name for name in selected_models if name not in your_models]
	    if other_models:
	        st.markdown("### Other Models")
	        # At most three columns; further cards wrap around into the same columns.
	        column_count = min(3, len(other_models))
	        cols = st.columns(column_count)
	        for position, model in enumerate(other_models):
	            model_data = df[df["Model"] == model].iloc[0]
	            target_column = cols[position % column_count]
	            with target_column:
	                other_card_html = f"""
	<div class="card" style="background-color: #f0f0f0;">
	<h4>{model}</h4>
	<div class="model-badge" style="color: white !important; background-color: #666666;">{model_data['Family']}</div>
	<div class="model-badge" style="color: white !important; background-color: #666666;">{model_data['Parameters (B)']}B</div>
	<p style="color: #333333;">{model_data['Description']}</p>
	<p style="color: #333333;"><b>Overall Score:</b> {model_data['Overall Score']:.2f}</p>
	</div>
	"""
	                st.markdown(other_card_html, unsafe_allow_html=True)

	# Tab 4: About
	# Static explanatory content: metric definitions, methodology and model background,
	# each rendered as its own .card block in the order listed.
	with tab4:
	    st.markdown("## 📘 About This Leaderboard")

	    # NOTE(review): the OpenElla card below says "2B parameter size" while the
	    # Model Details tab describes the same model as 3B. Confirm the correct figure.
	    about_cards = [
	        """
	<div class="card">
	<h3>Understanding the Metrics</h3>
	<p><b>Length Score</b>: Measures the model's ability to generate appropriately lengthy responses without being too verbose or too brief.</p>
	<p><b>Character Consistency</b>: Evaluates how well the model maintains character personality, backstory, and traits throughout the conversation.</p>
	<p><b>Immersion</b>: Assesses the model's ability to create an engaging, believable experience that draws users into the roleplay scenario.</p>
	<p><b>Overall Score</b>: A weighted combination of the above metrics, representing the model's general roleplay capability.</p>
	</div>
	""",
	        """
	<div class="card">
	<h3>Evaluation Methodology</h3>
	<p>Models were evaluated using a comprehensive roleplay benchmark suite consisting of:</p>
	<ul>
	<li>20 diverse character archetypes</li>
	<li>15 different scenarios per character</li>
	<li>5 conversation turns per scenario</li>
	</ul>
	<p>Responses were scored by a panel of expert evaluators using standardized rubrics for each metric.</p>
	</div>
	""",
	        """
	<div class="card">
	<h3>MiniMaid Series Development</h3>
	<p>The MiniMaid series represents an evolution in efficient roleplay models:</p>
	<ul>
	<li><b>MiniMaid-L1</b>: Initial release focusing on speed and efficiency</li>
	<li><b>MiniMaid-L2</b>: Improved version with better immersion and consistency</li>
	<li><b>MiniMaid-L3</b>: Latest generation with enhanced immersion capabilities</li>
	</ul>
	<p>Each iteration builds upon the strengths of the previous version while addressing identified weaknesses.</p>
	</div>
	""",
	        """
	<div class="card">
	<h3>OpenElla Development</h3>
	<p>OpenElla represents a parallel development track focused on maximizing roleplay quality in a compact model size.</p>
	<p>Built on the Llama 3 architecture, OpenElla achieves exceptional character consistency and overall performance
	despite its relatively small 2B parameter size.</p>
	</div>
	""",
	    ]

	    for about_html in about_cards:
	        st.markdown(about_html, unsafe_allow_html=True)

	# Footer with better visibility, rendered below the tabs on every page view.
	# The separator is a plain "|": in a normal (non-raw) string "\|" is not a
	# recognised escape sequence, so it would emit a literal backslash into the
	# page and trigger an invalid-escape warning on recent Python versions.
	st.markdown("""
	<div class="footer">
	<p style="color: #444444;">Created with ❤️ for Hugging Face Spaces | Last updated: April 2025</p>
	</div>
	""", unsafe_allow_html=True)