# leaderboard/app.py
import csv
import json
import pathlib

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
CATEGORY_MAP = {
"Overall": ["Overall Pass Rate"],
# You can define sets, e.g. "Vision-hard": ["Squiggle", "Shadow_Plausible"]
}
def get_results_path():
"""Get the path to results.csv, resolving relative to this file's location."""
this_file = pathlib.Path(__file__).resolve()
results_path = this_file.parent / "results.csv"
return results_path
def get_runs_path():
"""Get the path to runs directory, resolving relative to this file's location."""
this_file = pathlib.Path(__file__).resolve()
runs_path = this_file.parent / "runs"
runs_path.mkdir(parents=True, exist_ok=True)
return runs_path
def infer_type(row):
"""Infer model type (Proprietary/Open source) from Provider or Model name."""
provider = str(row.get("Provider", "")).lower()
model = str(row.get("Model", "")).lower()
# Open source indicators
open_source_keywords = [
"llama", "mistral", "qwen", "phi", "gemma", "falcon", "mpt",
"vicuna", "alpaca", "wizard", "openchat", "neural-chat",
"browser-use", "browseruse", "open source", "opensource"
]
# Check if any open source keyword appears
for keyword in open_source_keywords:
if keyword in provider or keyword in model:
return "Open source"
# Default to Proprietary if not found
return "Proprietary"
def load_df(path=None):
"""Load the results CSV, creating empty dataframe if file doesn't exist."""
if path is None:
path = get_results_path()
metadata_cols = ["Model", "Provider", "Agent Framework", "Type"]
metric_cols = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
expected_cols = metadata_cols + metric_cols
if not pathlib.Path(path).exists():
# Return empty dataframe with expected columns
return pd.DataFrame(columns=expected_cols)
try:
df = pd.read_csv(path)
# Handle empty CSV (only headers)
if len(df) == 0:
return pd.DataFrame(columns=expected_cols)
# Ensure required columns exist
if "Agent Framework" not in df.columns:
# Try legacy "Notes" column
if "Notes" in df.columns:
df["Agent Framework"] = df["Notes"]
else:
df["Agent Framework"] = ""
# Handle legacy "Overall" column
if "Overall" in df.columns and "Overall Pass Rate" not in df.columns:
df["Overall Pass Rate"] = df["Overall"]
# Add Type column if missing, infer from Provider/Model
if "Type" not in df.columns:
df["Type"] = df.apply(infer_type, axis=1)
# Convert numeric columns
numeric_cols = metric_cols + [c for c in df.columns if c not in metadata_cols + metric_cols]
for c in numeric_cols:
if c in df.columns:
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
except Exception as e:
print(f"Error loading results.csv: {e}")
return pd.DataFrame(columns=expected_cols)
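# Minimal usage sketch for load_df (assumes a results.csv next to this file;
# the explicit path below is only an illustration):
#   df = load_df()                   # resolves leaderboard/results.csv via get_results_path()
#   df = load_df("/tmp/other.csv")   # or read any other CSV with the same schema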
def compute_score(df, category):
# Get columns to compute score from
# Map "Overall" category to "Overall Pass Rate" column
if category == "Overall":
# Use CATEGORY_MAP which maps "Overall" to ["Overall Pass Rate"]
cols = CATEGORY_MAP.get("Overall", ["Overall Pass Rate"])
elif category in CATEGORY_MAP:
# Use predefined category mapping
cols = CATEGORY_MAP[category]
elif category in df.columns:
# Category is a direct column name
cols = [category]
else:
# Fallback: use "Overall Pass Rate" if it exists, otherwise all numeric columns
if "Overall Pass Rate" in df.columns:
cols = ["Overall Pass Rate"]
else:
numeric_cols = [c for c in df.columns if c not in ["Model", "Provider", "Agent Framework", "Type"]]
cols = numeric_cols if numeric_cols else []
# Filter to only existing columns
cols = [c for c in cols if c in df.columns]
# If no valid columns found, use all numeric columns except metadata/metrics
if not cols:
exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Avg Duration (s)", "Avg Cost ($)"]
numeric_cols = [c for c in df.columns if c not in exclude_cols]
cols = numeric_cols if numeric_cols else []
    # Fall back to a zero score if no usable columns remain
    df = df.copy()
    if cols:
        df["Category Pass Rate"] = df[cols].mean(axis=1, skipna=True)
    else:
        df["Category Pass Rate"] = 0.0
    return df
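# compute_score never drops rows; it only adds a "Category Pass Rate" column.
# Illustrative calls ("Dice_Count" is an example puzzle-type column name):
#   scored = compute_score(load_df(), "Overall")      # mean of ["Overall Pass Rate"]
#   scored = compute_score(load_df(), "Dice_Count")   # a single per-type column, if present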
def table_html(df):
if len(df) == 0:
return """
<style>
.leaderboard-container {
background: #ffffff;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
overflow: hidden;
margin: 20px 0;
}
table.lb {
width: 100%;
border-collapse: collapse;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
font-size: 14px;
}
table.lb thead {
background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);
color: white;
}
table.lb th {
padding: 16px 20px;
text-align: left;
font-weight: 600;
font-size: 13px;
text-transform: uppercase;
letter-spacing: 0.5px;
}
table.lb td {
padding: 16px 20px;
border-bottom: 1px solid #e5e7eb;
color: #374151;
}
table.lb tbody tr {
transition: background-color 0.2s ease;
}
table.lb tbody tr:hover {
background: #f9fafb;
}
table.lb tbody tr:last-child td {
border-bottom: none;
}
.rank-badge {
display: inline-block;
width: 32px;
height: 32px;
line-height: 32px;
text-align: center;
border-radius: 50%;
font-weight: 700;
font-size: 14px;
}
.rank-1 { background: linear-gradient(135deg, #ffd700 0%, #ffed4e 100%); color: #000; box-shadow: 0 2px 8px rgba(255, 215, 0, 0.4); }
.rank-2 { background: linear-gradient(135deg, #c0c0c0 0%, #e8e8e8 100%); color: #000; box-shadow: 0 2px 8px rgba(192, 192, 192, 0.4); }
.rank-3 { background: linear-gradient(135deg, #cd7f32 0%, #e6a55d 100%); color: #fff; box-shadow: 0 2px 8px rgba(205, 127, 50, 0.4); }
.rank-other { background: #f1f5f9; color: #64748b; }
.pass-rate-cell {
font-weight: 600;
font-size: 15px;
}
.metric-cell {
font-weight: 500;
font-size: 14px;
color: #6b7280;
}
</style>
<div class="leaderboard-container">
<table class="lb">
<thead><tr><th>#</th><th>Model</th><th>Provider</th><th>Type</th><th>Agent Framework</th><th>Pass Rate</th><th>Avg Duration (s)</th><th>Avg Cost ($)</th></tr></thead>
<tbody><tr><td colspan="8" style="text-align:center;padding:40px;color:#9ca3af;font-size:16px;">No results yet. Run evaluations to populate the leaderboard.</td></tr></tbody>
</table>
</div>
"""
rows = []
for i, r in df.iterrows():
rank = i + 1
rank_class = "rank-1" if rank == 1 else "rank-2" if rank == 2 else "rank-3" if rank == 3 else "rank-other"
pass_rate = r['Category Pass Rate']
pass_rate_color = "#10b981" if pass_rate >= 0.7 else "#f59e0b" if pass_rate >= 0.4 else "#ef4444"
# Format duration and cost
duration = r.get('Avg Duration (s)', None)
duration_str = f"{duration:.2f}" if pd.notna(duration) and duration is not None else "N/A"
cost = r.get('Avg Cost ($)', None)
cost_str = f"${cost:.4f}" if pd.notna(cost) and cost is not None else "N/A"
type_val = r.get('Type', 'Proprietary')
type_color = "#10b981" if type_val == "Open source" else "#6366f1"
rows.append(f"""
<tr>
<td><span class="rank-badge {rank_class}">{rank}</span></td>
<td><strong style="color: #111827;">{r['Model']}</strong></td>
<td style="color: #6b7280;">{r.get('Provider','')}</td>
<td><span style="color: {type_color}; font-weight: 600;">{type_val}</span></td>
<td style="color: #6b7280;">{r.get('Agent Framework','')}</td>
<td class="pass-rate-cell" style="color: {pass_rate_color};">{pass_rate:.3f}</td>
<td class="metric-cell">{duration_str}</td>
<td class="metric-cell">{cost_str}</td>
</tr>""")
return f"""
<style>
.leaderboard-container {{
background: #ffffff;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
overflow: hidden;
margin: 20px 0;
}}
table.lb {{
width: 100%;
border-collapse: collapse;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
font-size: 14px;
}}
table.lb thead {{
background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);
color: white;
}}
table.lb th {{
padding: 16px 20px;
text-align: left;
font-weight: 600;
font-size: 13px;
text-transform: uppercase;
letter-spacing: 0.5px;
}}
table.lb td {{
padding: 16px 20px;
border-bottom: 1px solid #e5e7eb;
color: #374151;
}}
table.lb tbody tr {{
transition: background-color 0.2s ease;
}}
table.lb tbody tr:hover {{
background: #f9fafb;
}}
table.lb tbody tr:last-child td {{
border-bottom: none;
}}
.rank-badge {{
display: inline-block;
width: 32px;
height: 32px;
line-height: 32px;
text-align: center;
border-radius: 50%;
font-weight: 700;
font-size: 14px;
}}
.rank-1 {{ background: linear-gradient(135deg, #ffd700 0%, #ffed4e 100%); color: #000; box-shadow: 0 2px 8px rgba(255, 215, 0, 0.4); }}
.rank-2 {{ background: linear-gradient(135deg, #c0c0c0 0%, #e8e8e8 100%); color: #000; box-shadow: 0 2px 8px rgba(192, 192, 192, 0.4); }}
.rank-3 {{ background: linear-gradient(135deg, #cd7f32 0%, #e6a55d 100%); color: #fff; box-shadow: 0 2px 8px rgba(205, 127, 50, 0.4); }}
.rank-other {{ background: #f1f5f9; color: #64748b; }}
.pass-rate-cell {{
font-weight: 600;
font-size: 15px;
}}
.metric-cell {{
font-weight: 500;
font-size: 14px;
color: #6b7280;
}}
</style>
<div class="leaderboard-container">
<table class="lb">
<thead><tr><th>#</th><th>Model</th><th>Provider</th><th>Type</th><th>Agent Framework</th><th>Pass Rate</th><th>Avg Duration (s)</th><th>Avg Cost ($)</th></tr></thead>
<tbody>{''.join(rows)}</tbody>
</table>
</div>
"""
def perf_bar(df):
plt.close("all")
if len(df) == 0:
fig, ax = plt.subplots(figsize=(10, 4), facecolor='white', dpi=150)
ax.text(0.5, 0.5, "No data available", ha="center", va="center", fontsize=14, color="gray")
ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
fig.tight_layout(); return fig
d = df.sort_values("Category Pass Rate", ascending=True)
fig, ax = plt.subplots(figsize=(10, max(4, 0.5*len(d))), facecolor='white', dpi=150)
# Create gradient colors based on pass rate - CAPTCHA themed
colors = []
for pass_rate in d["Category Pass Rate"]:
if pass_rate >= 0.7:
colors.append('#10b981') # verification green
elif pass_rate >= 0.4:
colors.append('#f59e0b') # warning amber
else:
colors.append('#ef4444') # error red
bars = ax.barh(range(len(d)), d["Category Pass Rate"], color=colors, alpha=0.8, edgecolor='white', linewidth=1.5)
# Add value labels on bars
for i, (bar, pass_rate) in enumerate(zip(bars, d["Category Pass Rate"])):
width = bar.get_width()
ax.text(width + 0.01, bar.get_y() + bar.get_height()/2,
f'{pass_rate:.3f}', ha='left', va='center', fontsize=11, fontweight='600')
ax.set_yticks(range(len(d)))
ax.set_yticklabels(d["Model"], fontsize=12)
ax.set_xlabel("Pass Rate", fontsize=12, fontweight='600', color='#374151')
ax.set_xlim(0, 1.1)
ax.set_title("Performance Comparison", fontsize=16, fontweight='700', color='#111827', pad=20)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_color('#e5e7eb')
ax.spines['bottom'].set_color('#e5e7eb')
ax.grid(axis='x', alpha=0.3, linestyle='--')
ax.set_facecolor('#fafafa')
fig.tight_layout()
return fig
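# Illustrative use of perf_bar (it expects the "Category Pass Rate" column that
# compute_score adds):
#   fig = perf_bar(compute_score(load_df(), "Overall"))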
def perf_by_type(df_full, model_filter="Models Avg"):
"""
Show average performance by puzzle type.
Args:
df_full: Full dataframe with all models
model_filter: "Models Avg" for average across all models, or a specific model name
"""
plt.close("all")
# Filter by model if specified
if model_filter and model_filter != "Models Avg":
df_filtered = df_full[df_full["Model"] == model_filter].copy()
if len(df_filtered) == 0:
fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
ax.text(0.5, 0.5, f"No data available for model: {model_filter}", ha="center", va="center", fontsize=14, color="gray")
ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
fig.tight_layout(); return fig
df_plot = df_filtered
plot_title = f"Performance by Type - {model_filter}"
else:
df_plot = df_full
plot_title = "Average Performance by CAPTCHA Type (All Models)"
if len(df_plot) == 0:
fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
ax.text(0.5, 0.5, "No data available", ha="center", va="center", fontsize=14, color="gray")
ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
fig.tight_layout(); return fig
# Average each per-type column across models (exclude metadata and metric columns)
exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)", "Category Pass Rate"]
numeric_cols = [c for c in df_plot.columns if c not in exclude_cols]
type_cols = [c for c in numeric_cols if df_plot[c].notna().any() and df_plot[c].dtype in ['float64', 'int64', 'float32', 'int32']]
if len(type_cols) == 0:
fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
ax.text(0.5, 0.5, "No per-type data available", ha="center", va="center", fontsize=14, color="gray")
ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
fig.tight_layout(); return fig
# Calculate means, handling NaN values properly
if model_filter == "Models Avg":
# Average across all models
means = df_plot[type_cols].mean(numeric_only=True)
else:
# For a single model, just get its values (should be one row)
if len(df_plot) == 1:
means = df_plot[type_cols].iloc[0]
else:
# If multiple rows (shouldn't happen), average them
means = df_plot[type_cols].mean(numeric_only=True)
# Filter out any NaN means
means = means.dropna()
if len(means) == 0:
fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
ax.text(0.5, 0.5, "No valid per-type data available", ha="center", va="center", fontsize=14, color="gray")
ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
fig.tight_layout(); return fig
fig, ax = plt.subplots(figsize=(max(12, len(means) * 0.8), 6), facecolor='white', dpi=150)
# Create gradient colors based on performance - CAPTCHA themed
colors = []
for val in means.values:
if pd.isna(val):
colors.append('#94a3b8') # slate gray for NaN
elif val >= 0.7:
colors.append('#10b981') # verification green
elif val >= 0.4:
colors.append('#f59e0b') # warning amber
else:
colors.append('#ef4444') # error red
bars = ax.bar(range(len(means)), means.values, color=colors, alpha=0.8, edgecolor='white', linewidth=1.5)
# Add value labels on bars
for bar, val in zip(bars, means.values):
if not pd.isna(val):
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
f'{val:.2f}', ha='center', va='bottom', fontsize=10, fontweight='600')
ax.set_xticks(range(len(means)))
ax.set_xticklabels(means.index, rotation=45, ha="right", fontsize=10)
ax.set_ylim(0, max(1.1, means.max() * 1.1) if not means.empty else 1.1)
ax.set_ylabel("Average Pass Rate", fontsize=12, fontweight='600', color='#374151')
ax.set_title(plot_title, fontsize=16, fontweight='700', color='#111827', pad=20)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_color('#e5e7eb')
ax.spines['bottom'].set_color('#e5e7eb')
ax.grid(axis='y', alpha=0.3, linestyle='--')
ax.set_facecolor('#fafafa')
fig.tight_layout()
return fig
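# Illustrative calls for perf_by_type (the model name is an example only):
#   fig = perf_by_type(load_df())                           # average across all models
#   fig = perf_by_type(load_df(), model_filter="gpt-4o")    # one model's per-type pass rates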
def cost_effectiveness_plot(df):
"""
Create a cost-effectiveness scatter plot: Performance (X) vs Cost (Y).
Color-coded by Type (Proprietary vs Open source).
"""
plt.close("all")
if len(df) == 0:
fig, ax = plt.subplots(figsize=(10, 6), facecolor='white', dpi=150)
ax.text(0.5, 0.5, "No data available", ha="center", va="center", fontsize=14, color="gray")
ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
fig.tight_layout(); return fig
# Filter to rows with valid performance and cost data
df_plot = df.copy()
df_plot = df_plot[df_plot['Category Pass Rate'].notna() & df_plot['Avg Cost ($)'].notna()]
if len(df_plot) == 0:
fig, ax = plt.subplots(figsize=(10, 6), facecolor='white', dpi=150)
ax.text(0.5, 0.5, "No data with both performance and cost metrics", ha="center", va="center", fontsize=14, color="gray")
ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
fig.tight_layout(); return fig
# Create figure with higher DPI for better resolution
fig, ax = plt.subplots(figsize=(14, 9), facecolor='white', dpi=150)
# Separate by type
proprietary = df_plot[df_plot.get('Type', 'Proprietary') == 'Proprietary']
open_source = df_plot[df_plot.get('Type', 'Proprietary') == 'Open source']
# Plot points
if len(proprietary) > 0:
ax.scatter(proprietary['Category Pass Rate'], proprietary['Avg Cost ($)'],
c='#6366f1', s=200, alpha=0.75, edgecolors='white', linewidth=2.5,
label='Proprietary', zorder=3)
# Add labels for proprietary models
for idx, row in proprietary.iterrows():
ax.annotate(row['Model'],
(row['Category Pass Rate'], row['Avg Cost ($)']),
fontsize=10, alpha=0.85, ha='left', va='bottom',
bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7, edgecolor='none'))
if len(open_source) > 0:
ax.scatter(open_source['Category Pass Rate'], open_source['Avg Cost ($)'],
c='#10b981', s=200, alpha=0.75, edgecolors='white', linewidth=2.5,
label='Open source', zorder=3)
# Add labels for open source models
for idx, row in open_source.iterrows():
ax.annotate(row['Model'],
(row['Category Pass Rate'], row['Avg Cost ($)']),
fontsize=10, alpha=0.85, ha='left', va='bottom',
bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7, edgecolor='none'))
# Calculate thresholds for quadrants (median or fixed thresholds)
perf_threshold = df_plot['Category Pass Rate'].median() if len(df_plot) > 1 else 0.4
cost_threshold = df_plot['Avg Cost ($)'].median() if len(df_plot) > 1 else 0.01
# Add quadrant lines
ax.axvline(x=perf_threshold, color='gray', linestyle='--', linewidth=1.5, alpha=0.5, zorder=1)
ax.axhline(y=cost_threshold, color='gray', linestyle='--', linewidth=1.5, alpha=0.5, zorder=1)
# Add quadrant annotations
x_range = df_plot['Category Pass Rate'].max() - df_plot['Category Pass Rate'].min()
y_range = df_plot['Avg Cost ($)'].max() - df_plot['Avg Cost ($)'].min()
# Top-left: Low Performance, High Cost
ax.text(df_plot['Category Pass Rate'].min() + x_range * 0.05,
df_plot['Avg Cost ($)'].max() - y_range * 0.05,
'▲ Low Performance\nHigh Cost',
fontsize=12, color='#ef4444', weight='bold',
ha='left', va='top', alpha=0.8,
bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8, edgecolor='#ef4444', linewidth=1.5))
# Bottom-right: High Performance, Low Cost
ax.text(df_plot['Category Pass Rate'].max() - x_range * 0.05,
df_plot['Avg Cost ($)'].min() + y_range * 0.05,
'▼ High Performance\nLow Cost',
fontsize=12, color='#10b981', weight='bold',
ha='right', va='bottom', alpha=0.8,
bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8, edgecolor='#10b981', linewidth=1.5))
# Styling
ax.set_xlabel("Performance (Pass Rate)", fontsize=14, fontweight='600', color='#374151')
ax.set_ylabel("Avg Cost ($)", fontsize=14, fontweight='600', color='#374151')
ax.set_title("Cost-Effectiveness Analysis", fontsize=17, fontweight='700', color='#111827', pad=25)
# Add padding to axes (more padding on right for legend space)
x_pad = x_range * 0.15 if x_range > 0 else 0.1
y_pad = y_range * 0.15 if y_range > 0 else 0.001
ax.set_xlim(df_plot['Category Pass Rate'].min() - x_pad * 0.5, df_plot['Category Pass Rate'].max() + x_pad)
ax.set_ylim(max(0, df_plot['Avg Cost ($)'].min() - y_pad * 0.5), df_plot['Avg Cost ($)'].max() + y_pad)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_color('#e5e7eb')
ax.spines['bottom'].set_color('#e5e7eb')
ax.grid(alpha=0.3, linestyle='--', zorder=0, linewidth=1)
ax.set_facecolor('#fafafa')
# Add legend - position it outside the plot area to avoid covering data
# Use bbox_to_anchor to place it outside the plot
ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1), frameon=True,
fancybox=True, shadow=True, fontsize=12, framealpha=0.95,
edgecolor='#e5e7eb', facecolor='white')
# Adjust layout to make room for legend
fig.tight_layout(rect=[0, 0, 0.95, 1])
return fig
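# Sketch of producing the scatter plot outside Gradio (the output filename is an
# assumption); cost_effectiveness_plot expects "Category Pass Rate" to exist, so
# compute_score must run first:
#   df = compute_score(load_df(), "Overall")
#   cost_effectiveness_plot(df).savefig("cost_effectiveness.png")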
def convert_benchmark_results_json(file_path, model_name=None, provider=None, agent_framework=None):
"""
Convert benchmark_results.json format (per-puzzle results) to aggregated format.
Args:
file_path: Path to benchmark_results.json file (Path object or string)
model_name: Model name (if None, will try to infer from filename or use "Unknown")
provider: Provider name (if None, will try to infer from model_name)
agent_framework: Agent framework name (if None, will use "browser-use" as default)
Returns:
dict: Aggregated record with Model, Provider, Agent Framework, Type, metrics, and per-type pass rates
"""
# Convert to Path object if needed
file_path = pathlib.Path(file_path) if not isinstance(file_path, pathlib.Path) else file_path
# Read the file - it's a JSONL file (one JSON object per line)
puzzle_results = []
with open(file_path, 'r') as f:
for line in f:
line = line.strip()
if line:
try:
puzzle_results.append(json.loads(line))
except json.JSONDecodeError:
continue
if not puzzle_results:
raise ValueError("No valid puzzle results found in file")
# Try to extract model/provider from puzzle results first (if they were included)
extracted_model = None
extracted_provider = None
extracted_agent_framework = None
for result in puzzle_results[:10]: # Check first 10 results
if 'model' in result and result['model']:
extracted_model = result['model']
if 'provider' in result and result['provider']:
extracted_provider = result['provider']
if 'agent_framework' in result and result['agent_framework']:
extracted_agent_framework = result['agent_framework']
# Also check camelCase variants
if 'agentFramework' in result and result['agentFramework']:
extracted_agent_framework = result['agentFramework']
# Use extracted values if available, otherwise use provided parameters
if model_name is None:
model_name = extracted_model
if provider is None:
provider = extracted_provider
if agent_framework is None:
agent_framework = extracted_agent_framework
# Infer model/provider if still not available
if model_name is None:
# Try to infer from filename (e.g., "gpt-4_results.json" -> "gpt-4")
filename = file_path.stem.lower()
if 'benchmark_results' in filename:
model_name = "Unknown Model"
else:
# Try to extract model name from filename
model_name = filename.replace('_results', '').replace('_benchmark', '').replace('-', ' ').title()
if provider is None:
# Try to infer provider from model name
model_lower = model_name.lower()
if any(x in model_lower for x in ['gpt', 'openai']):
provider = "OpenAI"
elif any(x in model_lower for x in ['claude', 'anthropic']):
provider = "Anthropic"
elif any(x in model_lower for x in ['gemini', 'google']):
provider = "Google"
elif any(x in model_lower for x in ['llama', 'mistral', 'qwen', 'phi', 'gemma']):
provider = "Open Source"
else:
provider = "Unknown"
if agent_framework is None:
agent_framework = "browser-use" # Default assumption
# Aggregate results
# Group by puzzle_type
puzzle_type_stats = {}
total_correct = 0
total_attempts = len(puzzle_results)
total_duration = 0.0
total_cost = 0.0
cost_count = 0
for result in puzzle_results:
puzzle_type = result.get('puzzle_type', 'Unknown')
# Initialize puzzle type stats if needed
if puzzle_type not in puzzle_type_stats:
puzzle_type_stats[puzzle_type] = {'correct': 0, 'total': 0}
puzzle_type_stats[puzzle_type]['total'] += 1
if result.get('correct', False):
puzzle_type_stats[puzzle_type]['correct'] += 1
total_correct += 1
# Aggregate duration
elapsed_time = result.get('elapsed_time')
if elapsed_time is not None:
try:
total_duration += float(elapsed_time)
except (ValueError, TypeError):
pass
# Aggregate cost
cost = result.get('cost')
if cost is not None:
try:
total_cost += float(cost)
cost_count += 1
except (ValueError, TypeError):
pass
# Calculate overall pass rate
overall_pass_rate = total_correct / total_attempts if total_attempts > 0 else 0.0
# Calculate average duration
avg_duration = total_duration / total_attempts if total_attempts > 0 else None
# Calculate average cost
avg_cost = total_cost / cost_count if cost_count > 0 else None
# Build aggregated record
record = {
"Model": model_name,
"Provider": provider,
"Agent Framework": agent_framework,
"Overall Pass Rate": overall_pass_rate,
"Avg Duration (s)": avg_duration,
"Avg Cost ($)": avg_cost,
}
# Add per-type pass rates
for puzzle_type, stats in puzzle_type_stats.items():
pass_rate = stats['correct'] / stats['total'] if stats['total'] > 0 else 0.0
record[puzzle_type] = pass_rate
# Infer Type
record["Type"] = infer_type(record)
return record
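# Example of the aggregation (file contents are hypothetical): a JSONL file with
# two "Dice_Count" attempts, one of them correct, converts to roughly
#   {"Model": "gpt-4o", "Provider": "OpenAI", "Agent Framework": "browser-use",
#    "Overall Pass Rate": 0.5, "Dice_Count": 0.5, "Type": "Proprietary", ...}
#   record = convert_benchmark_results_json("benchmark_results.json", model_name="gpt-4o")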
def is_benchmark_results_format(data):
"""
Check if the data is in benchmark_results.json format (per-puzzle results).
Args:
data: List of dictionaries or single dictionary
Returns:
bool: True if data appears to be in benchmark_results format
"""
if isinstance(data, dict):
data = [data]
if not isinstance(data, list) or len(data) == 0:
return False
# Check if first record has benchmark_results.json structure
first = data[0]
required_fields = ['puzzle_type', 'puzzle_id', 'correct']
has_required = all(field in first for field in required_fields)
# Check if it's NOT the aggregated format (which would have Model, Provider, etc.)
aggregated_fields = ['Model', 'Provider', 'Overall Pass Rate']
is_not_aggregated = not any(field in first for field in aggregated_fields)
return has_required and is_not_aggregated
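# Format-detection examples (the records are illustrative):
#   is_benchmark_results_format({"puzzle_type": "Mirror", "puzzle_id": "m1.png", "correct": True})  -> True
#   is_benchmark_results_format({"Model": "gpt-4o", "Overall Pass Rate": 0.6})                      -> False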
def process_uploaded_file(file, model_name=None, provider=None, agent_framework=None):
"""
Process an uploaded CSV or JSON file and merge with existing results.
Args:
file: File path string (from Gradio File component with type="filepath")
model_name: Optional model name (for benchmark_results.json conversion)
provider: Optional provider name (for benchmark_results.json conversion)
agent_framework: Optional agent framework name (for benchmark_results.json conversion)
Returns:
tuple: (success_message, error_message)
"""
if file is None:
return None, "No file uploaded"
try:
# Gradio returns a file path string when type="filepath"
file_path = pathlib.Path(file) if isinstance(file, str) else pathlib.Path(file.name)
# Read the file based on extension
if file_path.suffix.lower() == '.json':
# Try reading as JSONL first (benchmark_results.json format)
try:
# Read first few lines to detect format
with open(file_path, 'r') as f:
first_lines = [f.readline().strip() for _ in range(5)]
f.seek(0)
# Try to parse as JSONL (one JSON object per line)
puzzle_results = []
for line in first_lines:
if line:
try:
puzzle_results.append(json.loads(line))
except json.JSONDecodeError:
break
# Check if it's benchmark_results format
if puzzle_results and is_benchmark_results_format(puzzle_results):
# Read entire file as JSONL
puzzle_results = []
with open(file_path, 'r') as f:
for line in f:
line = line.strip()
if line:
try:
puzzle_results.append(json.loads(line))
except json.JSONDecodeError:
continue
# Convert to aggregated format
record = convert_benchmark_results_json(
file_path,
model_name=model_name,
provider=provider,
agent_framework=agent_framework
)
records = [record]
else:
# Try reading as regular JSON
f.seek(0)
data = json.load(f)
# Normalize to list of records
if isinstance(data, dict):
records = [data]
elif isinstance(data, list):
records = data
else:
return None, f"Invalid JSON format: expected object or array, got {type(data).__name__}"
# Check if it's benchmark_results format
if is_benchmark_results_format(records):
# Convert to aggregated format
record = convert_benchmark_results_json(
file_path,
model_name=model_name,
provider=provider,
agent_framework=agent_framework
)
records = [record]
except Exception as e:
# Fallback: try reading as regular JSON
try:
with open(file_path, 'r') as f:
data = json.load(f)
# Normalize to list of records
if isinstance(data, dict):
records = [data]
elif isinstance(data, list):
records = data
else:
return None, f"Invalid JSON format: expected object or array, got {type(data).__name__}"
# Check if it's benchmark_results format
if is_benchmark_results_format(records):
# Convert to aggregated format
record = convert_benchmark_results_json(
file_path,
model_name=model_name,
provider=provider,
agent_framework=agent_framework
)
records = [record]
except Exception as json_err:
return None, f"Error reading JSON file: {str(json_err)}"
# Handle legacy column names
legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
for record in records:
for old_key, new_key in legacy_map.items():
if old_key in record and new_key not in record:
record[new_key] = record.pop(old_key)
# Infer Type if not present
if "Type" not in record:
record["Type"] = infer_type(record)
# Save individual JSON files to runs directory for aggregation
runs_path = get_runs_path()
import time
            for i, record in enumerate(records):
                run_file = runs_path / f"run_{int(time.time() * 1000)}_{i}.json"  # index suffix avoids same-millisecond filename collisions
with open(run_file, 'w') as f:
json.dump(record, f, indent=2)
num_records = len(records)
elif file_path.suffix.lower() == '.csv':
# Handle CSV file
df_uploaded = pd.read_csv(file_path)
# Handle legacy column names
if "Notes" in df_uploaded.columns and "Agent Framework" not in df_uploaded.columns:
df_uploaded["Agent Framework"] = df_uploaded["Notes"]
if "Overall" in df_uploaded.columns and "Overall Pass Rate" not in df_uploaded.columns:
df_uploaded["Overall Pass Rate"] = df_uploaded["Overall"]
# Add Type column if missing
if "Type" not in df_uploaded.columns:
df_uploaded["Type"] = df_uploaded.apply(infer_type, axis=1)
# Convert to records and save as JSON files (for consistency with aggregation script)
records = df_uploaded.to_dict('records')
runs_path = get_runs_path()
import time
            for i, record in enumerate(records):
                run_file = runs_path / f"run_{int(time.time() * 1000)}_{i}.json"  # index suffix avoids same-millisecond filename collisions
with open(run_file, 'w') as f:
json.dump(record, f, indent=2)
num_records = len(records)
else:
return None, f"Unsupported file type: {file_path.suffix}. Please upload a .csv or .json file."
# Aggregate runs into results.csv
aggregate_runs_to_csv()
return f"✅ Successfully uploaded {num_records} record(s). Leaderboard updated!", None
except json.JSONDecodeError as e:
return None, f"Invalid JSON file: {str(e)}"
except pd.errors.EmptyDataError:
return None, "CSV file is empty"
except Exception as e:
return None, f"Error processing file: {str(e)}"
def clean_nan_values(record):
"""Convert NaN values to None for proper CSV serialization."""
import math
cleaned = {}
for key, value in record.items():
if pd.isna(value) or (isinstance(value, float) and math.isnan(value)):
cleaned[key] = None
else:
cleaned[key] = value
return cleaned
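# Example (illustrative values): NaN metrics become None so csv.DictWriter writes
# an empty cell rather than the string "nan":
#   clean_nan_values({"Model": "gpt-4o", "Avg Cost ($)": float("nan")})
#   -> {"Model": "gpt-4o", "Avg Cost ($)": None}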
def aggregate_runs_to_csv():
"""
Aggregate all JSON files in runs/ directory into results.csv.
This consolidates all uploaded evaluation results into a single CSV file.
Deduplicates records based on (Model, Provider, Agent Framework) combination,
keeping the most recent entry for each unique combination.
Preserves existing records from results.csv that aren't in runs/ directory.
"""
runs_path = get_runs_path()
results_path = get_results_path()
# First, load existing results.csv to preserve models not in new uploads
existing_records_with_time = []
if results_path.exists():
try:
df_existing = load_df(results_path)
if len(df_existing) > 0:
# Convert existing records to dict format
for _, row in df_existing.iterrows():
record = row.to_dict()
# Clean NaN values
record = clean_nan_values(record)
# Use file modification time - 1 day as timestamp (older than new uploads)
# This ensures new uploads take precedence, but existing records are preserved
existing_mtime = results_path.stat().st_mtime - 86400 # 1 day ago
existing_records_with_time.append((existing_mtime, record))
except Exception as e:
print(f"Warning: Error loading existing results.csv: {e}")
# Gather all JSON files with their modification times
records_with_time = []
for path in runs_path.glob("*.json"):
try:
record = json.loads(path.read_text())
# Store modification time for deduplication (most recent wins)
mtime = path.stat().st_mtime
records_with_time.append((mtime, record))
except Exception as e:
print(f"Warning: Skipping invalid JSON file {path}: {e}")
# Combine existing records with new records from runs/
all_records_with_time = existing_records_with_time + records_with_time
if not all_records_with_time:
# Create empty CSV with headers
fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
with results_path.open("w", newline="") as f:
w = csv.DictWriter(f, fieldnames=fixed_metadata + fixed_metrics)
w.writeheader()
return
# Sort by modification time (most recent first)
all_records_with_time.sort(key=lambda x: x[0], reverse=True)
# Handle legacy column names and infer Type
legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
processed_records = []
for mtime, record in all_records_with_time:
for old_key, new_key in legacy_map.items():
if old_key in record and new_key not in record:
record[new_key] = record.pop(old_key)
# Infer Type if not present
if "Type" not in record:
record["Type"] = infer_type(record)
processed_records.append(record)
# Deduplicate: keep only the most recent record for each (Model, Provider, Agent Framework) combination
seen = {}
deduplicated_records = []
for record in processed_records:
# Create unique key from Model, Provider, and Agent Framework
model = str(record.get("Model", "")).strip()
provider = str(record.get("Provider", "")).strip()
agent_framework = str(record.get("Agent Framework", "")).strip()
unique_key = (model, provider, agent_framework)
# Only add if we haven't seen this combination before
# Since records are sorted by time (most recent first), the first occurrence is kept
if unique_key not in seen:
seen[unique_key] = True
deduplicated_records.append(record)
if not deduplicated_records:
# Create empty CSV with headers
fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
with results_path.open("w", newline="") as f:
w = csv.DictWriter(f, fieldnames=fixed_metadata + fixed_metrics)
w.writeheader()
return
# Build header: metadata → metrics → puzzle types
fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
puzzle_types = sorted({k for r in deduplicated_records for k in r.keys()
if k not in fixed_metadata + fixed_metrics})
header = fixed_metadata + fixed_metrics + puzzle_types
# Write CSV
results_path.parent.mkdir(parents=True, exist_ok=True)
with results_path.open("w", newline="") as f:
w = csv.DictWriter(f, fieldnames=header)
w.writeheader()
for r in deduplicated_records:
w.writerow(r)
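# Deduplication sketch (filenames and model names are hypothetical): if
# runs/run_1700000000000.json and runs/run_1700000100000.json both describe
# ("gpt-4o", "OpenAI", "browser-use"), only the newer record survives, while rows
# already in results.csv for other models are preserved:
#   aggregate_runs_to_csv()   # rewrites leaderboard/results.csv in place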
def render(category, sort_column, sort_direction, model_filter="Models Avg"):
df_full = load_df() # Keep full dataset for perf_by_type
df = df_full.copy()
df = compute_score(df, category)
# Determine sort column and direction
ascending = (sort_direction == "Low→High")
# Map sort column names to actual column names (only numeric/metric columns)
sort_column_map = {
"Pass Rate": "Category Pass Rate",
"Avg Duration (s)": "Avg Duration (s)",
"Avg Cost ($)": "Avg Cost ($)"
}
actual_sort_column = sort_column_map.get(sort_column, "Category Pass Rate")
# Check if column exists
if actual_sort_column not in df.columns:
actual_sort_column = "Category Pass Rate"
# Handle NaN values for numeric sorting
df = df.copy()
df['_sort_helper'] = df[actual_sort_column].fillna(float('inf') if ascending else float('-inf'))
df = df.sort_values('_sort_helper', ascending=ascending).drop(columns=['_sort_helper'])
df = df.reset_index(drop=True)
# perf_by_type uses full dataset to show all puzzle types, with optional model filter
# cost_effectiveness_plot needs df with Category Pass Rate computed
return table_html(df), perf_bar(df), perf_by_type(df_full, model_filter), cost_effectiveness_plot(df)
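# render() feeds all four Gradio outputs at once; an illustrative call mirroring
# the UI defaults:
#   html, bar_fig, pertype_fig, cost_fig = render("Overall", "Pass Rate", "High→Low")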
def app():
df = load_df()
cats = ["Overall"]
if len(df) > 0:
# Get all puzzle type columns (exclude metadata and metric columns)
exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
puzzle_cols = [c for c in df.columns if c not in exclude_cols]
cats = ["Overall"] + puzzle_cols
with gr.Blocks(title="CAPTCHAv2 Leaderboard", theme=gr.themes.Soft(primary_hue="indigo")) as demo:
gr.Markdown("""
<div style="text-align: center; padding: 30px 0;">
<h1 style="font-size: 42px; font-weight: 700; margin: 0; background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;">
CAPTCHAv2 Leaderboard
</h1>
<p style="font-size: 16px; color: #64748b; margin-top: 10px;">
Compare model performance across different CAPTCHA types
</p>
</div>
""")
# Upload section
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### 📤 Upload Results")
# Main accordion for the entire guide
with gr.Accordion("📖 Step-by-Step Guide to Submit Results", open=False):
# Step 1: Run Evaluation Protocol
with gr.Accordion("Step 1: Run the Evaluation Protocol", open=False):
gr.Markdown("""
**Option A: Using browser-use Agent Framework**
1. Start the CAPTCHA server:
```bash
python app.py
```
The server will run on `http://127.0.0.1:7860`
2. Run the browser-use agent evaluation (the default is browser-use's in-house model, BU1.0):
```bash
python -m agent_frameworks.browseruse_cli \\
--url http://127.0.0.1:7860 \\
--llm browser-use
```
Or with a different LLM:
```bash
python -m agent_frameworks.browseruse_cli \\
--url http://127.0.0.1:7860 \\
--llm openai \\
--model gpt-4o
```
3. The evaluation will automatically save results to `benchmark_results.json` in the project root.
Each puzzle attempt is logged as a JSON object with fields:
- `puzzle_type`, `puzzle_id`, `user_answer`, `correct_answer`, `correct`
- `elapsed_time`, `timestamp`
- `model`, `provider`, `agent_framework`
**Option B: Using Other Agent Frameworks**
Follow your framework's evaluation protocol. Ensure results are saved in `benchmark_results.json` format
(JSONL: one JSON object per line) with the same field structure.
""")
# Step 2: Convert Results
with gr.Accordion("Step 2: Convert Results to CSV Format", open=False):
gr.Markdown("""
**Method 1: Convert to CSV Format (Recommended)**
Use the provided conversion script (`convert_benchmark_to_csv.py` in the project root):
```bash
python convert_benchmark_to_csv.py benchmark_results.json leaderboard/results.csv
```
**Method 2: Directly Upload to Leaderboard (Auto-conversion)**
You can upload `benchmark_results.json` directly here; the system will convert it to the aggregated format automatically.
Optionally provide metadata below if auto-detection fails:
- Model Name (e.g., "gpt-4", "claude-3-sonnet", "bu-1-0")
- Provider (e.g., "OpenAI", "Anthropic", "browser-use")
- Agent Framework (e.g., "browser-use", "crewai")
""")
# Step 3: Upload Results
with gr.Accordion("Step 3: Upload Results", open=False):
gr.Markdown("""
**Supported file formats:**
- ✅ `benchmark_results.json` - Per-puzzle results (JSONL format)
- ✅ `results.csv` - Aggregated results **Recommended**
- ✅ JSON files - Single object or array of aggregated results
**File format requirements:**
For `benchmark_results.json` (per-puzzle format):
```json
{"puzzle_type": "Dice_Count", "puzzle_id": "dice1.png", "user_answer": "24", "correct_answer": 24, "correct": true, "elapsed_time": "12.5", "timestamp": "2025-01-01T00:00:00Z", "model": "bu-1-0", "provider": "browser-use", "agent_framework": "browser-use"}
```
For CSV (aggregated format):
- Required columns: `Model`, `Provider`, `Agent Framework`, `Type`, `Overall Pass Rate`, `Avg Duration (s)`, `Avg Cost ($)`, and puzzle type columns (e.g., `Dice_Count`, `Mirror`, etc.)
""")
file_upload = gr.File(
label="Upload Results File",
file_types=[".csv", ".json"],
type="filepath"
)
with gr.Row():
model_name_input = gr.Textbox(
label="Model Name (optional, for benchmark_results.json)",
placeholder="e.g., gpt-4, claude-3-sonnet",
container=True
)
provider_input = gr.Textbox(
label="Provider (optional, for benchmark_results.json)",
placeholder="e.g., OpenAI, Anthropic, Google",
container=True
)
agent_framework_input = gr.Textbox(
label="Agent Framework (optional, for benchmark_results.json)",
placeholder="e.g., browser-use, crewai",
value="browser-use",
container=True
)
upload_btn = gr.Button("Upload & Update Leaderboard", variant="primary")
upload_status = gr.Markdown("")
gr.Markdown("---")
with gr.Row():
cat = gr.Dropdown(choices=cats, value="Overall", label="Category/Type", container=True)
sort_col = gr.Dropdown(
choices=["Pass Rate", "Avg Duration (s)", "Avg Cost ($)"],
value="Pass Rate",
label="Sort by",
container=True
)
sort_dir = gr.Radio(
choices=["High→Low", "Low→High"],
value="High→Low",
label="Sort Direction",
container=True
)
# Model filter for Performance by Type plot
model_choices = ["Models Avg"]
if len(df) > 0 and "Model" in df.columns:
model_choices.extend(sorted(df["Model"].unique().tolist()))
with gr.Row():
model_filter = gr.Dropdown(
choices=model_choices,
value="Models Avg",
label="Model Filter (for Performance by Type plot)",
container=True
)
out = gr.HTML(elem_classes="leaderboard-table")
bar = gr.Plot(label="Performance Comparison")
pertype_plot = gr.Plot(label="Performance by Type")
cost_eff_plot = gr.Plot(label="Cost-Effectiveness Analysis")
def handle_upload(file, model_filter_val, model_name_input_val, provider_input_val, agent_framework_input_val):
if file is None:
# Return current state if no file
table, bar_fig, pertype_fig, cost_fig = render("Overall", "Pass Rate", "High→Low", model_filter_val or "Models Avg")
return "Please select a file to upload.", table, bar_fig, pertype_fig, cost_fig
# Use provided metadata or None (which will trigger auto-detection)
model_name_val = model_name_input_val.strip() if model_name_input_val else None
provider_val = provider_input_val.strip() if provider_input_val else None
agent_framework_val = agent_framework_input_val.strip() if agent_framework_input_val else None
success_msg, error_msg = process_uploaded_file(
file,
model_name=model_name_val,
provider=provider_val,
agent_framework=agent_framework_val
)
if error_msg:
# Return current state with error message
table, bar_fig, pertype_fig, cost_fig = render("Overall", "Pass Rate", "High→Low", model_filter_val or "Models Avg")
return f"❌ {error_msg}", table, bar_fig, pertype_fig, cost_fig
# Reload and render after successful upload
# Re-render with current settings (use Overall as default since we can't access component values directly)
table, bar_fig, pertype_fig, cost_fig = render("Overall", "Pass Rate", "High→Low", model_filter_val or "Models Avg")
return success_msg, table, bar_fig, pertype_fig, cost_fig
upload_btn.click(
handle_upload,
inputs=[file_upload, model_filter, model_name_input, provider_input, agent_framework_input],
outputs=[upload_status, out, bar, pertype_plot, cost_eff_plot]
)
demo.load(lambda: render("Overall", "Pass Rate", "High→Low", "Models Avg"), outputs=[out, bar, pertype_plot, cost_eff_plot])
for comp in (cat, sort_col, sort_dir, model_filter):
comp.change(render, inputs=[cat, sort_col, sort_dir, model_filter], outputs=[out, bar, pertype_plot, cost_eff_plot])
return demo
if __name__ == "__main__":
app().launch()