import os
import math
from datetime import datetime

import pandas as pd
from huggingface_hub import CommitScheduler, hf_hub_download

from .models import models

# Default K-factor (determines how much a single match affects ratings)
DEFAULT_K_FACTOR = 32
# Default starting Elo
DEFAULT_ELO = 1500

LEADERBOARD_FN = './utils/leaderboard/arena_elo_leaderboard.csv'
REPO_ID = "aizip-dev/Arena-Metadata"

# Pull the latest leaderboard from the Hub and keep a local copy that the
# CommitScheduler can sync back
hub_leaderboard_path = hf_hub_download(
    repo_id=REPO_ID,
    filename="arena_elo_leaderboard.csv",
    repo_type="dataset",
)
df = pd.read_csv(hub_leaderboard_path)
print(f"Successfully loaded leaderboard from the Hub. {len(df)} models.")
df.to_csv(LEADERBOARD_FN, index=False)
print(f"Leaderboard copied to {LEADERBOARD_FN} for CommitScheduler.")

# csv_path = os.path.join('utils', 'arena_elo_leaderboard.csv')
leaderboard_scheduler = CommitScheduler(
    repo_id=REPO_ID,
    folder_path="utils/leaderboard",
    repo_type="dataset",
    every=1,
)


def prepare_url(model_dict: dict):
    """
    Prepare the Hugging Face URL for each model.

    Parameters:
    - model_dict: Dictionary mapping model names to Hugging Face repo paths

    Returns:
    - Dictionary mapping model names to their Hugging Face URLs
    """
    url_dict = {}
    for name, half_url in model_dict.items():
        # Construct the URL from the repo path
        url_dict[name] = f"https://huggingface.co/{half_url}"
    return url_dict


# Mapping of model names to their Hugging Face URLs
# model_to_hf = {
#     "Qwen2.5-1.5b-Instruct": "https://huggingface.co/qwen/qwen2.5-1.5b-instruct",
#     "Qwen2.5-3b-Instruct": "https://huggingface.co/qwen/qwen2.5-3b-instruct",
#     # Add more models and their HF links here
# }
model_to_hf = prepare_url(models)


def calculate_elo_changes(winner_rating, loser_rating, k_factor=DEFAULT_K_FACTOR, draw=False):
    """
    Calculate Elo rating changes for two models.

    Parameters:
    - winner_rating: Winner's current rating
    - loser_rating: Loser's current rating
    - k_factor: How much a single match affects ratings
    - draw: Whether the match was a draw

    Returns:
    - (winner_change, loser_change): Rating changes to apply
    """
    # Calculate expected scores (probability of winning)
    expected_winner = 1 / (1 + 10 ** ((loser_rating - winner_rating) / 400))
    expected_loser = 1 / (1 + 10 ** ((winner_rating - loser_rating) / 400))

    if draw:
        # For a draw, both get 0.5 points
        actual_winner = 0.5
        actual_loser = 0.5
    else:
        # For a win, winner gets 1 point, loser gets 0
        actual_winner = 1.0
        actual_loser = 0.0

    # Calculate rating changes
    winner_change = k_factor * (actual_winner - expected_winner)
    loser_change = k_factor * (actual_loser - expected_loser)

    return winner_change, loser_change


def calculate_confidence_interval(elo_rating, num_games, confidence=0.95):
    """
    Calculate a confidence interval for an Elo rating.

    Parameters:
    - elo_rating: The current Elo rating
    - num_games: Number of games played
    - confidence: Confidence level (default: 0.95 for 95% confidence)

    Returns:
    - margin: The margin of error for the confidence interval
    """
    if num_games == 0:
        return float('inf')

    # Z-score for the given confidence level (1.96 for 95% confidence)
    z = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}.get(confidence, 1.96)

    # Standard deviation of the Elo rating;
    # 400 / sqrt(num_games) is a common approximation
    std_dev = 400 / math.sqrt(num_games)

    # Margin of error
    margin = z * std_dev

    return margin


def load_leaderboard_data():
    """
    Load the leaderboard data from the local CSV file.

    Returns the data in a format compatible with the application.
    """
    # Initialize the results structure with both win/loss/tie counts and Elo ratings
    results = {
        "wins": {},
        "losses": {},
        "ties": {},
        "votes": 0,
        "elo": {},
        "games_played": {},
        "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    try:
        # Path to the leaderboard CSV file
        csv_path = LEADERBOARD_FN

        # Check if the file exists and load it
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)

            # Process the data into our structure
            for _, row in df.iterrows():
                model = row['model']
                results["wins"][model] = row['wins']
                results["losses"][model] = row['losses']
                results["ties"][model] = row['ties']
                results["elo"][model] = row['elo']
                results["games_played"][model] = row['games_played']

            # Calculate total votes: every match is recorded in two rows (a win and
            # a loss, or two ties), so halve the grand total to count each vote once
            total_outcomes = sum(
                results["wins"][m] + results["losses"][m] + results["ties"][m]
                for m in results["wins"]
            )
            results["votes"] = total_outcomes // 2
        else:
            # If the file doesn't exist, pre-populate with default values
            print("Leaderboard file not found. Initializing with default values.")
            from .models import model_names
            for model in model_names:
                results["wins"][model] = 0
                results["losses"][model] = 0
                results["ties"][model] = 0
                results["elo"][model] = DEFAULT_ELO  # Start everyone at 1500 Elo
                results["games_played"][model] = 0

        return results
    except Exception as e:
        print(f"Error loading leaderboard data: {e}")
        # Return the initialized structure if the file can't be loaded
        return results


def update_elo_ratings(results, model_a, model_b, winner, k_factor=DEFAULT_K_FACTOR):
    """
    Update Elo ratings based on a match result.

    Parameters:
    - results: The current leaderboard results dictionary
    - model_a: Name of model A
    - model_b: Name of model B
    - winner: 'left' for model A, 'right' for model B, 'tie' for a tie,
      'neither' for no winner
    - k_factor: How much this match affects ratings

    Returns:
    - Updated results dictionary
    """
    # Initialize ratings if not present
    if model_a not in results["elo"]:
        results["elo"][model_a] = DEFAULT_ELO
        results["games_played"][model_a] = 0
    if model_b not in results["elo"]:
        results["elo"][model_b] = DEFAULT_ELO
        results["games_played"][model_b] = 0

    # Get current ratings
    rating_a = results["elo"][model_a]
    rating_b = results["elo"][model_b]

    # Handle the different outcomes
    if winner == 'left':
        # Model A won
        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=False)
        results["wins"][model_a] = results["wins"].get(model_a, 0) + 1
        results["losses"][model_b] = results["losses"].get(model_b, 0) + 1
    elif winner == 'right':
        # Model B won
        change_b, change_a = calculate_elo_changes(rating_b, rating_a, k_factor, draw=False)
        results["wins"][model_b] = results["wins"].get(model_b, 0) + 1
        results["losses"][model_a] = results["losses"].get(model_a, 0) + 1
    elif winner == 'tie':
        # It's a tie
        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=True)
        results["ties"][model_a] = results["ties"].get(model_a, 0) + 1
        results["ties"][model_b] = results["ties"].get(model_b, 0) + 1
    else:
        # 'neither' case - no winner; no rating changes, but still log the game
        change_a, change_b = 0, 0

    # Apply rating changes
    results["elo"][model_a] = rating_a + change_a
    results["elo"][model_b] = rating_b + change_b

    # Update games-played counters
    results["games_played"][model_a] = results["games_played"].get(model_a, 0) + 1
    results["games_played"][model_b] = results["games_played"].get(model_b, 0) + 1

    # Update timestamp
    results["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    return results
def save_leaderboard_data(results):
    """
    Save the current leaderboard results back to the CSV file.

    Parameters:
    - results: The results dictionary with wins, losses, ties, elo, etc.
    """
    try:
        # Path to the leaderboard CSV file
        csv_path = LEADERBOARD_FN

        # Convert the results dictionary to a DataFrame
        data = []
        for model in results["elo"].keys():
            # Calculate the confidence interval for this model's rating
            games_played = results["games_played"].get(model, 0)
            confidence_interval = calculate_confidence_interval(results["elo"][model], games_played)

            data.append({
                'model': model,
                'elo': round(results["elo"].get(model, DEFAULT_ELO), 1),
                'wins': results["wins"].get(model, 0),
                'losses': results["losses"].get(model, 0),
                'ties': results["ties"].get(model, 0),
                'games_played': games_played,
                'confidence_interval': round(confidence_interval, 1),
            })

        df = pd.DataFrame(data)

        # Sort by Elo rating (descending)
        df = df.sort_values(by='elo', ascending=False)

        # Save to CSV under the scheduler's lock
        with leaderboard_scheduler.lock:
            df.to_csv(csv_path, index=False)

        print(f"Leaderboard data saved successfully to {csv_path}")
    except Exception as e:
        print(f"Error saving leaderboard data: {e}")
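# Typical voting flow (illustrative sketch; the model names are placeholders):
#
#   results = load_leaderboard_data()
#   results = update_elo_ratings(results, "model-a", "model-b", winner="left")
#   save_leaderboard_data(results)
#
# With the default K-factor of 32, a decisive result between two equally rated
# models moves each rating by 16 points, and the 95% confidence margin after
# 100 games is roughly 1.96 * 400 / sqrt(100) ≈ ±78 Elo. Saving rewrites the
# CSV under the scheduler's lock, and the CommitScheduler pushes the refreshed
# file to the Hub dataset on its next scheduled commit.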
def generate_leaderboard_html(results):
    """
    Generate HTML for displaying the leaderboard with Elo ratings.

    Parameters:
    - results: The current leaderboard results dictionary

    Returns:
    - HTML string for the leaderboard
    """
    # Prepare model data for the HTML table
    model_data = []
    for model in results["elo"]:
        elo = results["elo"].get(model, DEFAULT_ELO)
        wins = results["wins"].get(model, 0)
        losses = results["losses"].get(model, 0)
        ties = results["ties"].get(model, 0)
        total_comparisons = wins + losses + ties
        win_rate = (wins + 0.5 * ties) / total_comparisons if total_comparisons > 0 else 0.0

        # Calculate the confidence interval for this model's rating
        games_played = results["games_played"].get(model, 0)
        confidence = calculate_confidence_interval(elo, games_played)

        model_data.append({
            "model": model,
            "elo": elo,
            "wins": wins,
            "losses": losses,
            "ties": ties,
            "comparisons": total_comparisons,
            "win_rate": win_rate,
            "confidence": confidence,
        })

    # Sort by Elo rating (highest first)
    model_data.sort(key=lambda x: x["elo"], reverse=True)

    # Start building the HTML table
    html = """
    <table>
        <thead>
            <tr>
                <th>Rank</th>
                <th>Model</th>
                <th>Elo Rating</th>
                <th>Win Rate (%)</th>
                <th>Wins</th>
                <th>Losses</th>
                <th>Ties</th>
                <th>Comparisons</th>
            </tr>
        </thead>
        <tbody>
    """

    # Add one row per model, ranked by Elo
    for rank, data in enumerate(model_data, start=1):
        model = data["model"]
        wins, losses, ties = data["wins"], data["losses"], data["ties"]
        comparisons, win_rate = data["comparisons"], data["win_rate"]

        # Link the model name to its Hugging Face page when a URL is known
        model_html = (
            f'<a href="{model_to_hf[model]}" target="_blank">{model}</a>'
            if model in model_to_hf
            else model
        )
        # Show the rating together with its confidence-interval margin
        elo_html = f'{data["elo"]:.0f} ± {data["confidence"]:.0f}'

        html += f"""
            <tr>
                <td>{rank}</td>
                <td>{model_html}</td>
                <td>{elo_html}</td>
                <td>{win_rate:.1%}</td>
                <td>{wins}</td>
                <td>{losses}</td>
                <td>{ties}</td>
                <td>{comparisons}</td>
            </tr>
        """

    html += """
        </tbody>
    </table>
    """

    return html
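# Example use (illustrative): build the table for the current standings with
#
#   leaderboard_html = generate_leaderboard_html(load_leaderboard_data())
#
# The returned string is plain HTML, so the front end can render it directly
# wherever the arena leaderboard is displayed.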