SLM-RAG-Arena

Running on Zero

App Files Files Community

SLM-RAG-Arena / utils /leaderboard.py

oliver-aizip

make hub download once per app load

5e2794d 4 days ago

raw

history blame

14.6 kB

	import os
	import pandas as pd
	import math
	from datetime import datetime
	from .models import models
	from huggingface_hub import CommitScheduler, hf_hub_download

	# Default K-factor (determines how much a single match affects ratings).
	# A rating never moves by more than this many points per match.
	DEFAULT_K_FACTOR = 32

	# Default starting Elo for a model with no recorded games
	DEFAULT_ELO = 1500

	# Local working copy of the leaderboard. It lives inside the folder that the
	# CommitScheduler below is pointed at.
	LEADERBOARD_FN = './utils/leaderboard/arena_elo_leaderboard.csv'
	REPO_ID = "aizip-dev/Arena-Metadata"

	# Download the leaderboard from the Hub once, when this module is imported,
	# and mirror it to the local working copy.
	hub_leaderboard_path = hf_hub_download(
	    repo_id=REPO_ID,
	    filename="arena_elo_leaderboard.csv",
	    repo_type="dataset",
	)
	df = pd.read_csv(hub_leaderboard_path)
	print(f"Successfully loaded leaderboard from the Hub. {len(df)} models.")
	# DataFrame.to_csv does not create missing parent directories, so make sure
	# the target folder exists before writing the local copy.
	os.makedirs(os.path.dirname(LEADERBOARD_FN), exist_ok=True)
	df.to_csv(LEADERBOARD_FN, index=False)
	print(f"Leaderboard copied to {LEADERBOARD_FN} for CommitScheduler.")


	# Background uploader for the leaderboard folder. Writers of LEADERBOARD_FN
	# should hold `leaderboard_scheduler.lock` while writing.
	# NOTE(review): `every` is expressed in minutes per the huggingface_hub docs,
	# so this pushes roughly once a minute -- confirm that cadence is intended.
	leaderboard_scheduler = CommitScheduler(
	    repo_id=REPO_ID,
	    folder_path="utils/leaderboard",
	    repo_type="dataset",
	    every=1,
	)


	def prepare_url(model_dict: dict) -> dict:
	    """
	    Build the Hugging Face URL for every model in a name -> path mapping.

	    Parameters:
	    - model_dict: Dictionary mapping a model name to the path that follows
	      "https://huggingface.co/" (the value is interpolated verbatim)

	    Returns:
	    - Dictionary mapping each model name to its full URL string, in the
	      same order as the input
	    """
	    return {
	        name: f"https://huggingface.co/{half_url}"
	        for name, half_url in model_dict.items()
	    }


	# Mapping of model names to their Hugging Face URLs, built from the `models`
	# mapping imported from .models. The result has this shape (entries below
	# are illustrative only; the real keys come from `models`):
	# model_to_hf = {
	# "Qwen2.5-1.5b-Instruct": "https://huggingface.co/qwen/qwen2.5-1.5b-instruct",
	# "Qwen2.5-3b-Instruct": "https://huggingface.co/qwen/qwen2.5-3b-instruct",
	# }

	model_to_hf = prepare_url(models)

	def calculate_elo_changes(winner_rating, loser_rating, k_factor=DEFAULT_K_FACTOR, draw=False):
	    """
	    Compute the Elo adjustments produced by one match between two models.

	    Parameters:
	    - winner_rating: Current rating of the winning side (or of the first
	      side when the match is a draw)
	    - loser_rating: Current rating of the losing side (or of the second
	      side when the match is a draw)
	    - k_factor: Scale applied to the difference between actual and
	      expected score
	    - draw: True when the match ended in a draw

	    Returns:
	    - (winner_change, loser_change): Amounts to add to each side's rating
	    """
	    # Actual scores: half a point each for a draw, otherwise 1 for the
	    # winner and 0 for the loser.
	    if draw:
	        score_winner, score_loser = 0.5, 0.5
	    else:
	        score_winner, score_loser = 1.0, 0.0

	    # Expected scores from the logistic Elo curve on a 400-point scale.
	    expectation_winner = 1 / (1 + 10 ** ((loser_rating - winner_rating) / 400))
	    expectation_loser = 1 / (1 + 10 ** ((winner_rating - loser_rating) / 400))

	    return (
	        k_factor * (score_winner - expectation_winner),
	        k_factor * (score_loser - expectation_loser),
	    )

	def calculate_confidence_interval(elo_rating, num_games, confidence=0.95):
	    """
	    Calculate the margin of error of a confidence interval for an Elo rating.

	    Parameters:
	    - elo_rating: The current Elo rating (accepted for interface
	      compatibility; the margin does not depend on it)
	    - num_games: Number of games played
	    - confidence: Confidence level (default: 0.95 for 95% confidence)

	    Returns:
	    - margin: The margin of error for the confidence interval; infinity
	      when no games (or a non-positive count) have been played
	    """
	    # With no games there is no information about the rating. Negative
	    # counts are treated the same way instead of failing inside math.sqrt.
	    if num_games <= 0:
	        return float('inf')

	    # Conventional rounded z-scores for the common confidence levels.
	    z = None
	    for level, z_value in ((0.90, 1.645), (0.95, 1.96), (0.99, 2.576)):
	        if math.isclose(confidence, level, abs_tol=1e-9):
	            z = z_value
	            break

	    if z is None:
	        if 0 < confidence < 1:
	            # Any other valid level: two-sided z from the standard normal
	            # quantile function, rather than silently using the 95% value.
	            from statistics import NormalDist
	            z = NormalDist().inv_cdf((1 + confidence) / 2)
	        else:
	            # Not a valid probability: keep the historical 95% fallback.
	            z = 1.96

	    # Standard deviation of the Elo rating
	    # The factor 400/sqrt(num_games) is a common approximation
	    std_dev = 400 / math.sqrt(num_games)

	    return z * std_dev

	def load_leaderboard_data():
	    """
	    Loads the leaderboard data from the leaderboard CSV file.

	    Reads the 'model', 'wins', 'losses', 'ties', 'elo' and 'games_played'
	    columns of LEADERBOARD_FN. If the file does not exist, every model in
	    `model_names` (from .models) starts with zero counts and DEFAULT_ELO.

	    Returns:
	    - results: Dictionary with per-model "wins", "losses", "ties", "elo" and
	      "games_played" mappings, a total "votes" count and a "last_updated"
	      timestamp string. If loading raises, the error is printed and
	      whatever was populated so far is returned.
	    """
	    # Initialize the results structure with both win/loss/tie counts and Elo ratings
	    results = {
	        "wins": {},
	        "losses": {},
	        "ties": {},
	        "votes": 0,
	        "elo": {},
	        "games_played": {},
	        "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	    }

	    try:
	        # Define the path to the CSV file for leaderboard
	        csv_path = LEADERBOARD_FN
	        # Check if the file exists and load it
	        if os.path.exists(csv_path):
	            df = pd.read_csv(csv_path)
	            # Process the data into our structure
	            for _, row in df.iterrows():
	                model = row['model']
	                results["wins"][model] = row['wins']
	                results["losses"][model] = row['losses']
	                results["ties"][model] = row['ties']
	                results["elo"][model] = row['elo']
	                results["games_played"][model] = row['games_played']

	            # Calculate total votes. A decisive vote is stored as one win
	            # plus one loss, and a tie is stored on both models, so every
	            # vote appears twice across the counters: add everything up
	            # first, then halve the grand total.
	            recorded_outcomes = sum(
	                results["wins"][model]
	                + results["losses"][model]
	                + results["ties"][model]
	                for model in results["wins"]
	            )
	            results["votes"] = recorded_outcomes // 2
	        else:
	            # If file doesn't exist, pre-populate with some reasonable data
	            print("Leaderboard file not found. Initializing with default values.")
	            from .models import model_names
	            for model in model_names:
	                results["wins"][model] = 0
	                results["losses"][model] = 0
	                results["ties"][model] = 0
	                results["elo"][model] = DEFAULT_ELO  # Start everyone at 1500 Elo
	                results["games_played"][model] = 0

	        return results
	    except Exception as e:
	        print(f"Error loading leaderboard data: {e}")
	        # Return the initialized structure if file can't be loaded
	        return results

	def update_elo_ratings(results, model_a, model_b, winner, k_factor=DEFAULT_K_FACTOR):
	    """
	    Apply the outcome of one comparison to the leaderboard in place.

	    Parameters:
	    - results: The current leaderboard results dictionary (mutated)
	    - model_a: Name of model A
	    - model_b: Name of model B
	    - winner: 'left' for model A, 'right' for model B, 'tie' for a tie;
	      any other value (e.g. 'neither') leaves ratings unchanged
	    - k_factor: How much this match affects ratings

	    Returns:
	    - The same results dictionary, after the update
	    """
	    def _increment(counter_name, model):
	        # Add one to a per-model counter, starting from zero if absent.
	        counter = results[counter_name]
	        counter[model] = counter.get(model, 0) + 1

	    # Models without a rating yet start at the default Elo with no games.
	    for contestant in (model_a, model_b):
	        if contestant not in results["elo"]:
	            results["elo"][contestant] = DEFAULT_ELO
	            results["games_played"][contestant] = 0

	    rating_a = results["elo"][model_a]
	    rating_b = results["elo"][model_b]

	    # No winner: ratings stay put, but the game is still counted below.
	    delta_a, delta_b = 0, 0

	    if winner == 'left':
	        delta_a, delta_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=False)
	        _increment("wins", model_a)
	        _increment("losses", model_b)
	    elif winner == 'right':
	        # Model B is passed as the winner, so the returned pair is (B, A).
	        delta_b, delta_a = calculate_elo_changes(rating_b, rating_a, k_factor, draw=False)
	        _increment("wins", model_b)
	        _increment("losses", model_a)
	    elif winner == 'tie':
	        delta_a, delta_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=True)
	        _increment("ties", model_a)
	        _increment("ties", model_b)

	    results["elo"][model_a] = rating_a + delta_a
	    results["elo"][model_b] = rating_b + delta_b

	    # Both models played a game regardless of the outcome.
	    _increment("games_played", model_a)
	    _increment("games_played", model_b)

	    results["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	    return results

	def save_leaderboard_data(results):
	    """
	    Write the leaderboard to LEADERBOARD_FN as CSV, highest Elo first.

	    Any exception raised while building or writing the table is caught
	    and reported with a printed message; nothing is re-raised.

	    Parameters:
	    - results: The results dictionary with wins, losses, ties, elo, etc.
	    """
	    def _as_row(name, rating):
	        # One CSV record for a model; missing counters default to zero.
	        played = results["games_played"].get(name, 0)
	        margin = calculate_confidence_interval(rating, played)
	        return {
	            'model': name,
	            'elo': round(rating, 1),
	            'wins': results["wins"].get(name, 0),
	            'losses': results["losses"].get(name, 0),
	            'ties': results["ties"].get(name, 0),
	            'games_played': played,
	            'confidence_interval': round(margin, 1)
	        }

	    try:
	        target = LEADERBOARD_FN

	        records = [_as_row(name, rating) for name, rating in results["elo"].items()]
	        table = pd.DataFrame(records).sort_values(by='elo', ascending=False)

	        # Hold the scheduler's lock while the file is being written.
	        with leaderboard_scheduler.lock:
	            table.to_csv(target, index=False)
	        print(f"Leaderboard data saved successfully to {target}")
	    except Exception as e:
	        print(f"Error saving leaderboard data: {e}")

	def generate_leaderboard_html(results):
	    """
	    Render the leaderboard as an HTML table ordered by Elo rating.

	    Each row shows rank, model (linked when the model has an entry in
	    model_to_hf), Elo with its confidence margin, win rate, wins, losses,
	    ties and total comparisons. Win rate counts a tie as half a win and is
	    0.0 for a model with no comparisons.

	    NOTE(review): model names are interpolated into the markup as-is,
	    without HTML escaping.

	    Parameters:
	    - results: The current leaderboard results dictionary

	    Returns:
	    - HTML string for the leaderboard
	    """
	    def _summarize(name, rating):
	        # Collect the display values for a single model.
	        won = results["wins"].get(name, 0)
	        lost = results["losses"].get(name, 0)
	        tied = results["ties"].get(name, 0)
	        total = won + lost + tied
	        played = results["games_played"].get(name, 0)
	        return {
	            "model": name,
	            "elo": rating,
	            "wins": won,
	            "losses": lost,
	            "ties": tied,
	            "comparisons": total,
	            "win_rate": (won + 0.5 * tied) / total if total > 0 else 0.0,
	            "confidence": calculate_confidence_interval(rating, played)
	        }

	    # Highest Elo first; models with equal ratings keep their input order.
	    ranking = sorted(
	        (_summarize(name, rating) for name, rating in results["elo"].items()),
	        key=lambda entry: entry["elo"],
	        reverse=True,
	    )

	    # Table header
	    pieces = ["""
	<table class="leaderboard-table">
	<thead>
	<tr>
	<th class="centered">Rank</th>
	<th>Model</th>
	<th>Elo Rating</th>
	<th class="centered">Win Rate (%)</th>
	<th class="centered">Wins</th>
	<th class="centered">Losses</th>
	<th class="centered">Ties</th>
	<th class="centered">Comparisons</th>
	</tr>
	</thead>
	<tbody>
	"""]

	    # One table row per model, ranked from 1
	    for rank, entry in enumerate(ranking, 1):
	        model = entry["model"]
	        elo = entry["elo"]
	        wins = entry["wins"]
	        losses = entry["losses"]
	        ties = entry["ties"]
	        comparisons = entry["comparisons"]
	        win_rate = entry["win_rate"]
	        confidence = entry["confidence"]

	        # Link the name only when a URL is known for this model
	        model_html = model
	        if model in model_to_hf:
	            model_html = f'<a href="{model_to_hf[model]}" target="_blank" rel="noopener noreferrer" class="model-link">{model}<span class="external-icon">↗</span></a>'

	        # Elo followed by its confidence margin
	        elo_html = f"{elo:.1f} <span class='confidence-value'>± {confidence:.1f}</span>"

	        pieces.append(f"""
	<tr>
	<td class="centered"><strong>{rank}</strong></td>
	<td>{model_html}</td>
	<td class="elo-col">{elo_html}</td>
	<td class="centered">{win_rate:.1%}</td>
	<td class="centered">{wins}</td>
	<td class="centered">{losses}</td>
	<td class="centered">{ties}</td>
	<td class="centered">{comparisons}</td>
	</tr>
	""")

	    # Table footer
	    pieces.append("""
	</tbody>
	</table>
	""")

	    return "".join(pieces)

	def submit_vote_with_elo(m_a, m_b, winner, feedback, current_results):
	    """
	    Record a vote: update Elo ratings, persist the leaderboard and build
	    the values returned to the UI.

	    Parameters:
	    - m_a: Model A name
	    - m_b: Model B name
	    - winner: 'left', 'right', 'tie', or 'neither'
	    - feedback: List of feedback options selected (not used here)
	    - current_results: The current leaderboard state

	    Returns:
	    - An empty dict when no winner was selected; otherwise a 16-item list
	      holding True, the updated results and a sequence of gr.update
	      objects (the order must match the outputs wired by the caller)
	    """
	    # Guard: nothing to record without a selection.
	    if winner is None:
	        print("Warning: Submit called without a winner selected.")
	        return {}

	    # NOTE(review): dict.copy() is shallow, so the nested per-model dicts of
	    # current_results are shared with (and mutated through) this copy.
	    working_copy = current_results.copy()
	    updated_results = update_elo_ratings(working_copy, m_a, m_b, winner)

	    # Count this submission as one more vote
	    previous_votes = updated_results.get("votes", 0)
	    updated_results["votes"] = previous_votes + 1

	    # Persist, then render the refreshed table
	    save_leaderboard_data(updated_results)
	    leaderboard_html = generate_leaderboard_html(updated_results)

	    # Imported here to obtain the gr.update objects
	    import gradio as gr

	    def _disabled():
	        # A fresh "not interactive" update for each output slot.
	        return gr.update(interactive=False)

	    return [
	        True,
	        updated_results,
	        _disabled(),
	        _disabled(),
	        _disabled(),
	        _disabled(),
	        _disabled(),
	        gr.update(visible=True),
	        gr.update(visible=False),
	        gr.update(visible=True),
	        _disabled(),
	        gr.update(value=leaderboard_html, visible=True),
	        gr.update(elem_classes=["results-revealed"]),
	        gr.update(interactive=True),
	        gr.update(value=m_a),
	        gr.update(value=m_b),
	    ]