import os
import pandas as pd
import math
from datetime import datetime
from .models import models
from huggingface_hub import CommitScheduler, hf_hub_download

# Default K-factor (determines how much a single match affects ratings)
DEFAULT_K_FACTOR = 32
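# With K=32, the largest possible single-match swing is just under ±32
# points (against an opponent with a near-certain expected score).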

# Default starting Elo
DEFAULT_ELO = 1500

LEADERBOARD_FN = './utils/leaderboard/arena_elo_leaderboard.csv'
REPO_ID = "aizip-dev/Arena-Metadata"

try:
    hub_leaderboard_path = hf_hub_download(
        repo_id=REPO_ID,
        filename="arena_elo_leaderboard.csv",
        repo_type="dataset",
    )
    df = pd.read_csv(hub_leaderboard_path)
    print(f"Successfully loaded leaderboard from the Hub. {len(df)} models.")
    # Ensure the local folder exists before writing the working copy.
    os.makedirs(os.path.dirname(LEADERBOARD_FN), exist_ok=True)
    df.to_csv(LEADERBOARD_FN, index=False)
    print(f"Leaderboard copied to {LEADERBOARD_FN} for CommitScheduler.")
except Exception as e:
    # If the Hub copy can't be fetched, keep whatever is on disk;
    # load_leaderboard_data() falls back to default values if nothing exists.
    print(f"Could not fetch leaderboard from the Hub: {e}")

leaderboard_scheduler = CommitScheduler(
    repo_id=REPO_ID,
    folder_path="utils/leaderboard",
    repo_type="dataset",
    every=1,  # minutes between scheduled commits to the Hub
)


def prepare_url(model_dict: dict):
    """
    Build a mapping from model names to their Hugging Face URLs.

    Parameters:
    - model_dict: Dictionary mapping model names to Hub repo paths (e.g. "org/repo")

    Returns:
    - Dictionary mapping each model name to its full Hugging Face URL
    """
    url_dict = {}
    for name, repo_path in model_dict.items():
        url_dict[name] = f"https://huggingface.co/{repo_path}"

    return url_dict


# Example of the resulting mapping:
# model_to_hf = {
#     "Qwen2.5-1.5b-Instruct": "https://huggingface.co/qwen/qwen2.5-1.5b-instruct",
#     "Qwen2.5-3b-Instruct": "https://huggingface.co/qwen/qwen2.5-3b-instruct",
# }

model_to_hf = prepare_url(models)

def calculate_elo_changes(winner_rating, loser_rating, k_factor=DEFAULT_K_FACTOR, draw=False):
    """
    Calculate Elo rating changes for two models.
    
    Parameters:
    - winner_rating: Winner's current rating
    - loser_rating: Loser's current rating
    - k_factor: How much a single match affects ratings
    - draw: Whether the match was a draw
    
    Returns:
    - (winner_change, loser_change): Rating changes to apply
    """
    # Calculate expected scores (probability of winning)
    expected_winner = 1 / (1 + 10 ** ((loser_rating - winner_rating) / 400))
    expected_loser = 1 / (1 + 10 ** ((winner_rating - loser_rating) / 400))
    
    if draw:
        # For a draw, both get 0.5 points
        actual_winner = 0.5
        actual_loser = 0.5
    else:
        # For a win, winner gets 1 point, loser gets 0
        actual_winner = 1.0
        actual_loser = 0.0
    
    # Calculate rating changes
    winner_change = k_factor * (actual_winner - expected_winner)
    loser_change = k_factor * (actual_loser - expected_loser)
    
    return winner_change, loser_change
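
# Sanity check for the formula above: with equal ratings (1500 vs. 1500) and
# K=32, a decisive result yields +16 / -16 and a draw yields 0 / 0; a
# 400-point underdog who wins gains about +29 (expected score ~0.09).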

def calculate_confidence_interval(elo_rating, num_games, confidence=0.95):
    """
    Calculate a confidence interval for an Elo rating.
    
    Parameters:
    - elo_rating: The current Elo rating (unused; the margin depends only on num_games)
    - num_games: Number of games played
    - confidence: Confidence level (default: 0.95 for 95% confidence)
    
    Returns:
    - margin: The margin of error for the confidence interval
    """
    if num_games == 0:
        return float('inf')
    
    # Z-scores for common confidence levels (default to 1.96 for 95%)
    z_scores = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}
    z = z_scores.get(confidence, 1.96)
    
    # Standard deviation of the Elo rating
    # The factor 400/sqrt(num_games) is a common approximation
    std_dev = 400 / math.sqrt(num_games)
    
    # Margin of error
    margin = z * std_dev
    
    return margin
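
# Example with the defaults: after 64 games the 95% margin is
# 1.96 * 400 / sqrt(64) ≈ ±98 Elo points; the interval narrows only as
# more games are played.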

def load_leaderboard_data():
    """
    Loads the leaderboard data from the leaderboard CSV file.
    Returns the data in a format compatible with the application.
    """
    # Initialize the results structure with both win/loss/tie counts and Elo ratings
    results = {
        "wins": {}, 
        "losses": {}, 
        "ties": {}, 
        "votes": 0,
        "elo": {}, 
        "games_played": {},
        "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }
    
    try:
        # Define the path to the CSV file for leaderboard
        csv_path = LEADERBOARD_FN
        # Check if the file exists and load it
        if os.path.exists(csv_path):
            df = pd.read_csv(LEADERBOARD_FN)
            # Process the data into our structure
            for _, row in df.iterrows():
                model = row['model']
                results["wins"][model] = row['wins']
                results["losses"][model] = row['losses']
                results["ties"][model] = row['ties']
                results["elo"][model] = row['elo']
                results["games_played"][model] = row['games_played']
                
            # Calculate total votes: each match is counted once for each of
            # its two participants, so the total is half the combined tally.
            total = sum(
                results["wins"][m] + results["losses"][m] + results["ties"][m]
                for m in results["wins"]
            )
            results["votes"] = total // 2
        else:
            # If the file doesn't exist, initialize every known model with default values
            print("Leaderboard file not found. Initializing with default values.")
            from .models import model_names
            for model in model_names:
                results["wins"][model] = 0
                results["losses"][model] = 0
                results["ties"][model] = 0
                results["elo"][model] = DEFAULT_ELO  # Start everyone at 1500 Elo
                results["games_played"][model] = 0
            
        return results
    except Exception as e:
        print(f"Error loading leaderboard data: {e}")
        # Return the initialized structure if file can't be loaded
        return results
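
# In the structure returned above, "wins", "losses", "ties", "elo", and
# "games_played" each map model names to values, while "votes" and
# "last_updated" are board-wide scalars.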

def update_elo_ratings(results, model_a, model_b, winner, k_factor=DEFAULT_K_FACTOR):
    """
    Updates Elo ratings based on a match result.
    
    Parameters:
    - results: The current leaderboard results dictionary
    - model_a: Name of model A
    - model_b: Name of model B
    - winner: 'left' for model A, 'right' for model B, 'tie' for a tie, 'neither' for no winner
    - k_factor: How much this match affects ratings
    
    Returns:
    - Updated results dictionary
    """
    # Initialize ratings if not present
    if model_a not in results["elo"]:
        results["elo"][model_a] = DEFAULT_ELO
        results["games_played"][model_a] = 0
    
    if model_b not in results["elo"]:
        results["elo"][model_b] = DEFAULT_ELO
        results["games_played"][model_b] = 0
    
    # Get current ratings
    rating_a = results["elo"][model_a]
    rating_b = results["elo"][model_b]
    
    # Handle different winning scenarios
    if winner == 'left':
        # Model A won
        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=False)
        results["wins"][model_a] = results["wins"].get(model_a, 0) + 1
        results["losses"][model_b] = results["losses"].get(model_b, 0) + 1
    elif winner == 'right':
        # Model B won
        change_b, change_a = calculate_elo_changes(rating_b, rating_a, k_factor, draw=False)
        results["wins"][model_b] = results["wins"].get(model_b, 0) + 1
        results["losses"][model_a] = results["losses"].get(model_a, 0) + 1
    elif winner == 'tie':
        # It's a tie
        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=True)
        results["ties"][model_a] = results["ties"].get(model_a, 0) + 1
        results["ties"][model_b] = results["ties"].get(model_b, 0) + 1
    else:  # 'neither' case - no winner
        # No rating changes, but still log the game
        change_a, change_b = 0, 0
    
    # Apply rating changes
    results["elo"][model_a] = rating_a + change_a
    results["elo"][model_b] = rating_b + change_b
    
    # Update games played counters
    results["games_played"][model_a] = results["games_played"].get(model_a, 0) + 1
    results["games_played"][model_b] = results["games_played"].get(model_b, 0) + 1
    
    # Update timestamp
    results["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    return results
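
# Usage sketch (hypothetical model names): update_elo_ratings(results,
# "model-a", "model-b", winner="left") credits model-a with a win, debits
# model-b with a loss, shifts both Elo scores, and bumps games_played.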

def save_leaderboard_data(results):
    """
    Saves the current leaderboard results back to the CSV file.
    
    Parameters:
    - results: The results dictionary with wins, losses, ties, elo, etc.
    """
    try:
        # Define the path to the CSV file
        csv_path = LEADERBOARD_FN
        
        # Convert the results dictionary to a DataFrame
        data = []
        for model in results["elo"].keys():
            # Calculate confidence interval
            games_played = results["games_played"].get(model, 0)
            confidence_interval = calculate_confidence_interval(results["elo"][model], games_played)
            
            data.append({
                'model': model,
                'elo': round(results["elo"].get(model, DEFAULT_ELO), 1),
                'wins': results["wins"].get(model, 0),
                'losses': results["losses"].get(model, 0),
                'ties': results["ties"].get(model, 0),
                'games_played': results["games_played"].get(model, 0),
                'confidence_interval': round(confidence_interval, 1)
            })
        
        df = pd.DataFrame(data)
        
        # Sort by Elo rating (descending)
        df = df.sort_values(by='elo', ascending=False)
        
        # Save to CSV
        with leaderboard_scheduler.lock:
            df.to_csv(csv_path, index=False)
            print(f"Leaderboard data saved successfully to {csv_path}")
    except Exception as e:
        print(f"Error saving leaderboard data: {e}")

def generate_leaderboard_html(results):
    """
    Generate HTML for displaying the leaderboard with Elo ratings.
    
    Parameters:
    - results: The current leaderboard results dictionary
    
    Returns:
    - HTML string for the leaderboard
    """
    # Prepare model data for the HTML table
    model_data = []
    for model in results["elo"]:
        elo = results["elo"].get(model, DEFAULT_ELO)
        wins = results["wins"].get(model, 0)
        losses = results["losses"].get(model, 0)
        ties = results["ties"].get(model, 0)
        total_comparisons = wins + losses + ties
        win_rate = (wins + 0.5 * ties) / total_comparisons if total_comparisons > 0 else 0.0
        
        # Calculate confidence interval
        games_played = results["games_played"].get(model, 0)
        confidence = calculate_confidence_interval(elo, games_played)
        
        model_data.append({
            "model": model,
            "elo": elo,
            "wins": wins,
            "losses": losses,
            "ties": ties,
            "comparisons": total_comparisons,
            "win_rate": win_rate,
            "confidence": confidence
        })
    
    # Sort by Elo rating (descending)
    model_data.sort(key=lambda x: x["elo"], reverse=True)
    
    # Start building HTML table
    html = """
    <table class="leaderboard-table">
        <thead>
            <tr>
                <th class="centered">Rank</th>
                <th>Model</th>
                <th>Elo Rating</th>
                <th class="centered">Win Rate (%)</th>
                <th class="centered">Wins</th>
                <th class="centered">Losses</th>
                <th class="centered">Ties</th>
                <th class="centered">Comparisons</th>
            </tr>
        </thead>
        <tbody>
    """
    
    # Add rows to the HTML table
    for rank, data in enumerate(model_data, 1):
        model = data["model"]
        elo = data["elo"]
        wins = data["wins"]
        losses = data["losses"]
        ties = data["ties"]
        comparisons = data["comparisons"]
        win_rate = data["win_rate"]
        confidence = data["confidence"]
        
        # Create model link if in the mapping
        if model in model_to_hf:
            model_html = f'<a href="{model_to_hf[model]}" target="_blank" rel="noopener noreferrer" class="model-link">{model}<span class="external-icon">↗</span></a>'
        else:
            model_html = model
        
        # Format Elo with confidence interval
        elo_html = f"{elo:.1f} <span class='confidence-value'>± {confidence:.1f}</span>"
        
        # Add row to table
        html += f"""
        <tr>
            <td class="centered"><strong>{rank}</strong></td>
            <td>{model_html}</td>
            <td class="elo-col">{elo_html}</td>
            <td class="centered">{win_rate:.1%}</td>
            <td class="centered">{wins}</td>
            <td class="centered">{losses}</td>
            <td class="centered">{ties}</td>
            <td class="centered">{comparisons}</td>
        </tr>
        """
    
    # Close the HTML table
    html += """
        </tbody>
    </table>
    """
    
    return html

def submit_vote_with_elo(m_a, m_b, winner, feedback, current_results):
    """
    Enhanced version of submit_vote that calculates and applies Elo rating changes.
    This replaces the original submit_vote_fixed function.
    
    Parameters:
    - m_a: Model A name
    - m_b: Model B name
    - winner: 'left', 'right', 'tie', or 'neither'
    - feedback: List of feedback options selected
    - current_results: The current leaderboard state
    
    Returns:
    - Updated results and UI components
    """
    if winner is None:
        # Note: this early return hands back an empty dict rather than the
        # usual 16-element list of UI updates; the caller must handle it.
        print("Warning: Submit called without a winner selected.")
        return {}

    # Update Elo ratings
    updated_results = update_elo_ratings(current_results.copy(), m_a, m_b, winner)

    # Update vote count
    updated_results["votes"] = updated_results.get("votes", 0) + 1
    
    # Save updated results
    save_leaderboard_data(updated_results)

    # Generate HTML leaderboard
    leaderboard_html = generate_leaderboard_html(updated_results)
    
    # Import gradio for the gr.update objects
    import gradio as gr
    
    return [
        True, updated_results,
        gr.update(interactive=False), gr.update(interactive=False),
        gr.update(interactive=False), gr.update(interactive=False),
        gr.update(interactive=False), gr.update(visible=True),
        gr.update(visible=False), gr.update(visible=True),
        gr.update(interactive=False), gr.update(value=leaderboard_html, visible=True),
        gr.update(elem_classes=["results-revealed"]),
        gr.update(interactive=True), gr.update(value=m_a), gr.update(value=m_b)
    ]
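
# Minimal end-to-end sketch of this module's flow (hypothetical model names):
#
#   results = load_leaderboard_data()
#   results = update_elo_ratings(results, "model-a", "model-b", winner="left")
#   save_leaderboard_data(results)             # written locally, then synced
#   html = generate_leaderboard_html(results)  # by leaderboard_scheduler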