Spaces:

AtlaAI
/

judge-arena

Running

App Files Files Community

judge-arena / leaderboard.py

kaikaidai

Create leaderboard.py

5267683 verified about 1 month ago

raw

history blame

3.77 kB

	from collections import defaultdict
	from datetime import datetime, timezone
	from typing import Dict, List

	# Constants: Elo rating parameters read by the functions in this module.
	# A model's rating starts at DEFAULT_ELO; each processed vote then shifts it
	# by K_FACTOR * (actual score - expected score).
	DEFAULT_ELO = 1200 # Starting ELO for new models
	K_FACTOR = 32 # Standard chess K-factor

	def get_leaderboard(
	    model_data: Dict,
	    voting_data: List,
	    show_preliminary=True,
	    *,
	    min_votes: int = 500,
	    k_factor: "float | None" = None,
	    default_elo: "float | None" = None,
	):
	    """Build leaderboard rows by replaying pairwise votes through Elo updates.

	    Votes are processed in the order given. A vote is ignored when any of
	    its "model_a", "model_b" or "winner" fields is missing/falsy, or when
	    either model is not a key of ``model_data``. A vote that raises while
	    being processed (e.g. a record without ``.get``) is reported with
	    ``print`` and skipped, so one bad record never aborts the leaderboard.

	    Args:
	        model_data: Mapping of model name to a metadata mapping that
	            provides the "organization" and "license" keys.
	        voting_data: Iterable of vote records read through ``.get`` with
	            the keys "model_a", "model_b" and "winner". A winner of "A" or
	            "B" scores a win for that side; any other truthy value is
	            scored as a tie.
	        show_preliminary: When false, models with fewer than ``min_votes``
	            counted votes are left out of the result.
	        min_votes: Vote threshold applied when ``show_preliminary`` is
	            false.
	        k_factor: Multiplier applied to (actual - expected) score on each
	            update. ``None`` means use the module-level ``K_FACTOR``.
	        default_elo: Rating of a model before its first counted vote.
	            ``None`` means use the module-level ``DEFAULT_ELO``.

	    Returns:
	        A list of row dicts with the keys "Model", "ELO Score", "95% CI",
	        "# Votes", "Organization" and "License", ordered by rating from
	        highest to lowest. Models with equal ratings keep their
	        ``model_data`` order.

	    Raises:
	        KeyError: If a listed model's metadata lacks "organization" or
	            "license".
	    """
	    # Resolve the tunables at call time so the module constants stay the
	    # defaults without being frozen into the signature.
	    k = K_FACTOR if k_factor is None else k_factor
	    start_rating = DEFAULT_ELO if default_elo is None else default_elo

	    ratings = defaultdict(lambda: start_rating)
	    matches = defaultdict(int)

	    for vote in voting_data:
	        try:
	            model_a = vote.get("model_a")
	            model_b = vote.get("model_b")
	            winner = vote.get("winner")

	            # Skip incomplete votes and votes for models no longer listed.
	            if (
	                not all([model_a, model_b, winner])
	                or model_a not in model_data
	                or model_b not in model_data
	            ):
	                continue

	            matches[model_a] += 1
	            matches[model_b] += 1

	            # Read both ratings before either is changed so the update is
	            # symmetric.
	            elo_a = ratings[model_a]
	            elo_b = ratings[model_b]

	            expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
	            expected_b = 1 - expected_a

	            score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
	            score_b = 1 - score_a

	            ratings[model_a] += k * (score_a - expected_a)
	            ratings[model_b] += k * (score_b - expected_b)

	        except Exception as e:
	            # Deliberately broad: malformed records are reported and skipped.
	            print(f"Error processing vote: {e}")
	            continue

	    # Keep the unrounded rating next to each row: sorting on the truncated
	    # display string would order models that share an integer part by
	    # their position in model_data instead of by rating.
	    ranked = []
	    for model, info in model_data.items():
	        votes = matches[model]
	        if not show_preliminary and votes < min_votes:
	            continue

	        elo = ratings[model]
	        # Heuristic interval width that shrinks with the square root of the
	        # vote count; shown as 0 for models without any votes.
	        ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
	        row = {
	            "Model": model,
	            "ELO Score": f"{int(elo)}",
	            "95% CI": f"±{int(ci)}",
	            "# Votes": votes,
	            "Organization": info["organization"],
	            "License": info["license"],
	        }
	        ranked.append((elo, row))

	    # list.sort is stable, so equal ratings stay in model_data order.
	    ranked.sort(key=lambda pair: pair[0], reverse=True)

	    return [row for _, row in ranked]

	def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
	    """Render a short Markdown summary block for the leaderboard.

	    The summary reports ``len(model_data)`` as the model count,
	    ``len(voting_data)`` as the vote count (every record is counted, whether
	    or not it would be usable for rating), and the current UTC time shown
	    to the hour.
	    """
	    # Only the hour is taken from the clock; the minutes are the literal
	    # "00" in the format string, so the stamp always reads as a whole hour.
	    hour_stamp = datetime.now(timezone.utc).strftime("%B %d, %Y at %H:00 UTC")
	    model_count = len(model_data)
	    vote_count = len(voting_data)

	    return f"""
	### Leaderboard Stats
	- Total Models: {model_count}
	- Total Votes: {vote_count}
	- Last Updated: {hour_stamp}
	"""

	def calculate_elo_change(
	    rating_a: float,
	    rating_b: float,
	    winner: str,
	    *,
	    k_factor: "float | None" = None,
	) -> tuple[float, float]:
	    """Compute the Elo rating deltas for one match between A and B.

	    Args:
	        rating_a: Current rating of side A.
	        rating_b: Current rating of side B.
	        winner: "A" if side A won, "B" if side B won. Any other value is
	            scored as a tie (half a point each).
	        k_factor: Multiplier applied to (actual - expected) score. ``None``
	            means use the module-level ``K_FACTOR``.

	    Returns:
	        ``(change_a, change_b)``: the amounts to add to ``rating_a`` and
	        ``rating_b`` respectively.
	    """
	    # Resolved at call time so the module constant remains the default.
	    k = K_FACTOR if k_factor is None else k_factor

	    # Expected score of A from the logistic curve on the rating gap; the
	    # two expected scores always sum to 1.
	    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
	    expected_b = 1 - expected_a

	    if winner == "A":
	        score_a, score_b = 1, 0
	    elif winner == "B":
	        score_a, score_b = 0, 1
	    else:  # Anything else is treated as a tie.
	        score_a, score_b = 0.5, 0.5

	    change_a = k * (score_a - expected_a)
	    change_b = k * (score_b - expected_b)

	    return change_a, change_b

	def get_model_rankings(leaderboard: List[Dict]) -> Dict[str, int]:
	    """Map each model name to its 1-based position in ``leaderboard``.

	    Positions follow the order of the rows as given. If a model name occurs
	    more than once, its last position is the one reported.
	    """
	    rankings: Dict[str, int] = {}
	    for position, row in enumerate(leaderboard, start=1):
	        rankings[row["Model"]] = position
	    return rankings