Spaces:

lmarena
/

chatbot-arena-leaderboard

Running

File size: 51,667 Bytes

import argparse
import ast
import glob
import pickle
import traceback
import numpy as np

import pandas as pd
import gradio as gr
import numpy as np


# Raw HTML snippet (yellow, bold, centred box with a black border) that
# make_default_md_1() interpolates directly below the page title.
# The leading and trailing newlines are part of the value.
promo_banner = """
<div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;">
    USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
</div>
"""

# Names of models considered deprecated, one per line.
# NOTE(review): not referenced in the visible part of this file; presumably
# passed as `hidden_models` to the table/plot helpers -- confirm at call sites.
deprecated_model_name = [
    "GigaChat 3.1.25.3",
    "GigaChat-Pro 2.2.25.3",
    "saiga_llama3_8b_v6",
    "saiga_phi3_medium",
    "GigaChat-Plus 3.1.25.3",
    "GigaChat-Pro 4.0.26.8",
    "GigaChat 4.0.26.8",
    "xAI: Grok 2",
    "GigaChat-Pro 4.0.26.15",
    "GigaChat 4.0.26.15",
    "YandexGPT Experimental",
    "yandex-gpt-arena",
    "RefalMachine/ruadapt_llama3_instruct_lep_saiga_kto_ablitirated",
]

# Model names grouped under the "10b" label.
# NOTE(review): presumably the models with at most ~10B parameters; the list
# is not referenced in the visible part of this file -- confirm its consumer.
models_10b = [
    "saiga_llama3_8b_v7",
    "Vikhrmodels/Vikhr-YandexGPT-5-Lite-8B-it",
    "T-lite-instruct-0.1",
    "t-tech/T-lite-it-1.0",
    "LLaMA-3 Chat (8B)",
    "Llama 3.1 8B Instruct Turbo",
    "MTSAIR/Cotype-Nano",
]


def make_default_md_1():
    """Return the page-header Markdown: the H1 title followed by `promo_banner`."""
    # The literal starts at column 0 on purpose: Markdown headings and the
    # embedded HTML must not be indented.
    return f"""
# 🏆 LLM Arena in Russian: Leaderboard

{promo_banner}

"""

def make_default_md_2():
    """Return the introductory Markdown: what the arena is and how to contribute."""
    # Plain literal: the text has no placeholders, so no f-prefix is needed.
    return """

    The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLM) in Russian. We collect pairwise comparisons from people to rank LLMs using the Bradley-Terry model and display model ratings on the Elo scale.
    Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!

    - To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
    - If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
    - You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
    """


def make_arena_leaderboard_md(arena_df, last_updated_time):
    """Return the summary Markdown shown above the leaderboard table.

    Args:
        arena_df: DataFrame with one row per model and a "num_battles" column
            (the column is only read when the frame is non-empty).
        last_updated_time: Value interpolated after "Last updated:".

    Returns:
        str: Model count, thousands-separated vote total, the update date and
        a fixed explanation of the "Rank (UB)" column.
    """
    model_count = len(arena_df)
    if arena_df.empty:
        vote_count = 0
    else:
        vote_count = sum(arena_df["num_battles"])
    space = "   "  # separator placed between the summary items

    return f"""
Total # of models: **{model_count}**.{space} Total # of votes: **{vote_count:,}**.{space} Last updated: {last_updated_time}.

***Rank (UB)**: model rating (upper bound), determined as one plus the number of models that are statistically better than the target model.
Model A is statistically better than Model B when the lower bound of Model A's rating is higher than the upper bound of Model B's rating (with a 95% confidence interval).
See Figure 1 below for a visualization of the confidence intervals of model ratings.
"""


def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
    """Return the Markdown header for a category view of the leaderboard.

    Args:
        arena_df: DataFrame of all models; "num_battles" is summed when the
            frame is non-empty.
        arena_subset_df: DataFrame of the models in the selected category,
            with the same "num_battles" column.
        name: Category identifier. Either an internal key (a key of
            `key_to_category_name`) or a display name (a key of
            `cat_name_to_explanation`) is accepted.

    Returns:
        str: An H3 line with the category explanation (or `name` itself when
        no explanation is known) and an H4 line with the subset's model and
        vote counts plus their share of the totals in percent.
    """
    total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
    total_models = len(arena_df)
    space = "   "
    total_subset_votes = sum(arena_subset_df["num_battles"]) if not arena_subset_df.empty else 0
    total_subset_models = len(arena_subset_df)

    # Guard the divisions: an empty main frame yields 0% rather than an error.
    perc_models = round(total_subset_models / total_models * 100) if total_models > 0 else 0
    perc_votes = round(total_subset_votes / total_votes * 100) if total_votes > 0 else 0

    # Explanations are keyed by display name, while the default `name` is an
    # internal key ("...:style control" vs "...:style_control"). Translate
    # keys to display names first so the default resolves to its explanation;
    # display names pass through unchanged.
    display_name = key_to_category_name.get(name, name)
    heading = cat_name_to_explanation.get(display_name, name)

    leaderboard_md = f"""### {heading}
#### {space} #models: **{total_subset_models} ({perc_models}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({perc_votes}%)**{space}
"""
    return leaderboard_md

def model_hyperlink(model_name, link):
    """Return an HTML anchor showing *model_name* that opens *link* in a new tab.

    Both arguments are interpolated verbatim; no HTML escaping is applied.
    """
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'


def _visible_mask(values, hidden_models):
    """Return a boolean mask over *values* that is False for names in *hidden_models*."""
    if hidden_models is None:
        return np.ones_like(values, dtype=bool)
    return ~np.isin(values, hidden_models)


def _masked_error_bars(trace, key, mask):
    """Return trace.error_y[key] restricted by *mask*, or None when it is not set."""
    if 'error_y' in trace and key in trace.error_y and trace.error_y[key] is not None:
        return np.array(trace.error_y[key])[mask]
    return None


def filter_deprecated_models_plots(fig, hidden_models=None, limit_to_top=25):
    """
    Filters Plotly plots to show only top N models and optionally removes specific models.

    Only the first trace (``fig.data[0]``) is filtered, and only when its type
    is 'heatmap', 'scatter' or 'bar'; other trace types come back as an
    unmodified copy. The input figure is never mutated.

    Args:
        fig: The Plotly figure object.
        hidden_models (list, optional): A list of model names to remove. Defaults to None.
        limit_to_top (int, optional): Limit display to top N models (0 or None means no limit). Defaults to 25.
            For heatmaps the first N remaining x labels are kept (the axis is
            assumed to be ordered by rank already); for scatter and bar
            traces the N largest y values are kept, largest first.

    Returns:
        Plotly figure: The filtered figure object or the original if filtering fails or is not applicable.
        The original is also returned when 2 or fewer models would remain
        after removing hidden_models.
    """
    if fig is None:
        return None

    # Nothing to filter without at least one typed trace.
    if not hasattr(fig, 'data') or len(fig.data) == 0:
        return fig
    first_trace = fig.data[0]
    if not hasattr(first_trace, 'type'):
        return fig

    # Model names normally sit on the x axis; fall back to y when x is unset.
    # The axis may be a list, tuple, ndarray or None depending on how the
    # figure was built, so normalise to a plain list before inspecting it.
    labels = getattr(first_trace, 'x', None)
    if labels is None:
        labels = getattr(first_trace, 'y', None)
    labels = [] if labels is None else list(labels)

    if hidden_models is not None and labels:
        remaining = [label for label in labels if label not in hidden_models]
        if len(remaining) <= 2:
            # Too few models left for a meaningful plot: keep the original.
            return fig

    if limit_to_top is not None and limit_to_top <= 0:
        limit_to_top = None

    try:
        # Pickle round-trip = deep copy, so the caller's figure stays intact.
        fig_copy = pickle.loads(pickle.dumps(fig))
        trace = fig_copy.data[0]

        if trace.type == 'heatmap':
            axis_x = np.array(trace.x)
            axis_y = np.array(trace.y)
            keep_x = _visible_mask(trace.x, hidden_models)
            keep_y = _visible_mask(trace.y, hidden_models)
            kept_x = axis_x[keep_x]

            if limit_to_top is not None and len(kept_x) > limit_to_top:
                # Masks are rebuilt against the full axes because they are
                # also used below to slice the full z matrix.
                top_models = kept_x[:limit_to_top]
                keep_x = np.isin(axis_x, top_models)
                keep_y = np.isin(axis_y, top_models)
            elif len(kept_x) <= 2:
                return fig

            z_values = np.array(trace.z)
            trace.x = axis_x[keep_x]
            trace.y = axis_y[keep_y]
            # Rows of z follow the y axis, columns follow the x axis.
            trace.z = z_values[np.ix_(keep_y, keep_x)]

        elif trace.type == 'scatter':
            keep = _visible_mask(trace.x, hidden_models)
            current_x = np.array(trace.x)[keep]
            current_y = np.array(trace.y)[keep]
            has_text = hasattr(trace, 'text') and trace.text is not None
            current_text = np.array(trace.text)[keep] if has_text else None
            error_plus = _masked_error_bars(trace, 'array', keep)
            error_minus = _masked_error_bars(trace, 'arrayminus', keep)

            if limit_to_top is not None and len(current_x) > limit_to_top:
                # Highest ratings first; every parallel array is reordered
                # with the same indices so points stay aligned.
                order = np.argsort(-current_y)[:limit_to_top]
                current_x = current_x[order]
                current_y = current_y[order]
                if current_text is not None:
                    current_text = current_text[order]
                if error_plus is not None:
                    error_plus = error_plus[order]
                if error_minus is not None:
                    error_minus = error_minus[order]
            elif len(current_x) <= 2:
                return fig

            trace.x, trace.y = current_x, current_y
            if current_text is not None:
                trace.text = current_text
            # A non-None error array implies trace.error_y already exists.
            if error_plus is not None:
                trace.error_y['array'] = error_plus
            if error_minus is not None:
                trace.error_y['arrayminus'] = error_minus

        elif trace.type == 'bar':
            keep = _visible_mask(trace.x, hidden_models)
            current_x = np.array(trace.x)[keep]
            current_y = np.array(trace.y)[keep]

            if limit_to_top is not None and len(current_x) > limit_to_top:
                order = np.argsort(-current_y)[:limit_to_top]
                current_x = current_x[order]
                current_y = current_y[order]
            elif len(current_x) <= 2:
                return fig

            trace.x, trace.y = current_x, current_y

        return fig_copy

    except Exception as e:
        # Best effort: a plot that cannot be filtered is shown unfiltered.
        print(f"Error filtering plot: {e}")
        traceback.print_exc()
        return fig


def _parse_elo_rating(text):
    """Parse an "Arena Elo rating" cell into an int; "-" or unparsable text gives NaN."""
    if text == "-":
        return np.nan
    try:
        return int(ast.literal_eval(text))
    except (ValueError, TypeError, SyntaxError, OverflowError, MemoryError, RecursionError):
        # Anything that is not a numeric literal counts as "no rating".
        return np.nan


def load_leaderboard_table_csv(filename, add_hyperlink=True):
    """Load the model metadata table from a simple comma-separated file.

    The first line holds the column names. Every line is split on plain
    commas (quoted fields are NOT supported) and each cell is stripped of
    surrounding whitespace. Blank lines are skipped.

    Args:
        filename: Path of the CSV file, read as UTF-8.
        add_hyperlink: When True and a row has both "Model" and "Link", with
            a link that is neither empty nor "-", the "Model" value is
            replaced by the anchor built by `model_hyperlink`.

    Returns:
        list[dict]: One dict per data row mapping column name to cell text.
        "Arena Elo rating" is converted to int, or NaN for "-" and for
        values that cannot be parsed. An empty file yields an empty list.
    """
    # The context manager closes the handle even if reading fails.
    with open(filename, encoding="utf-8") as csv_file:
        lines = csv_file.readlines()
    if not lines:
        return []

    heads = [v.strip() for v in lines[0].split(",")]
    rows = []
    for line in lines[1:]:
        if not line.strip():
            # A blank (often trailing) line would otherwise become a bogus
            # row holding only an empty first column.
            continue
        cells = [v.strip() for v in line.split(",")]
        item = {}
        for h, v in zip(heads, cells):
            if h == "Arena Elo rating":
                v = _parse_elo_rating(v)
            item[h] = v
        if add_hyperlink and "Model" in item and "Link" in item:
            # Rows without a usable link keep the plain model name.
            if item["Link"] and item["Link"] != "-":
                item["Model"] = model_hyperlink(item["Model"], item["Link"])
        rows.append(item)
    return rows


def create_ranking_str(ranking, ranking_difference):
    """Format a rank for display, with an arrow showing the direction of change.

    Args:
        ranking: Rank value; anything `float()` accepts is truncated to int.
        ranking_difference: Rank delta; positive adds " ↑", negative adds
            " ↓", zero adds nothing.

    Returns:
        str: The formatted rank, or `str(ranking)` unchanged when either
        argument cannot be converted to a number (e.g. NaN, None, text).
    """
    try:
        # Go through float first so values such as 3.0 or "3.0" are accepted.
        rank = int(float(ranking))
        delta = int(float(ranking_difference))
    except (ValueError, TypeError):
        return str(ranking)

    if delta == 0:
        return f"{rank}"
    arrow = "↑" if delta > 0 else "↓"
    return f"{rank} {arrow}"


def recompute_final_ranking(arena_df):
    """Compute the upper-bound rank ("Rank (UB)") of every row of *arena_df*.

    A model's rank is one plus the number of OTHER models whose lower bound
    ("rating_q025") is strictly greater than this model's upper bound
    ("rating_q975").

    Ranks are computed by row position, so the result always has one entry
    per row, even when index labels repeat.

    Args:
        arena_df: DataFrame with one row per model and the columns
            "rating_q025" and "rating_q975".

    Returns:
        list: One entry per row, in row order. Each entry is an int rank, or
        NaN when the row's "rating_q975" is missing. Rows whose
        "rating_q025" is missing never count against other models. When
        either column is absent a warning is printed and a list of NaN is
        returned; an empty frame yields an empty list.
    """
    if arena_df.empty:
        return []

    if "rating_q025" not in arena_df.columns or "rating_q975" not in arena_df.columns:
        print("Warning: Confidence interval columns ('rating_q025', 'rating_q975') not found in DataFrame. Cannot compute UB Rank.")
        return [np.nan] * len(arena_df.index)

    lower = arena_df["rating_q025"].to_numpy(dtype=float, na_value=np.nan)
    upper = arena_df["rating_q975"].to_numpy(dtype=float, na_value=np.nan)

    # better[a, b] is True when model b is statistically better than model a.
    # Comparisons involving NaN are False, so rows with a missing lower bound
    # are ignored automatically.
    with np.errstate(invalid="ignore"):
        better = lower[np.newaxis, :] > upper[:, np.newaxis]
    # A model is never compared with itself.
    np.fill_diagonal(better, False)
    ranks = 1 + better.sum(axis=1)

    return [np.nan if np.isnan(up) else int(rank) for up, rank in zip(upper, ranks)]


def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
    """
    Generates the leaderboard table data (a list of rows for display).

    Args:
        arena_df: DataFrame indexed by model key. Reads the columns "rating"
            and "num_battles", and "rating_q025" / "rating_q975" when present.
        model_table_df: Model metadata. Reads the columns "key" (matched
            against the arena_df index), "Model", "Organization", "License"
            and "Knowledge cutoff date".
        arena_subset_df: Optional DataFrame with the same layout as arena_df
            for a category. When given (and non-empty, with CI columns), only
            its models are listed, ordered by their rank within the subset.
        hidden_models: Optional collection of model keys dropped from both
            frames before ranking.

    Returns:
        list[list]: One row per model that has metadata, in display order:
        [display rank, (rank delta - only when the subset was processed),
        model name, rounded rating, "+U/-L" CI string, rounded vote count,
        organization, license, knowledge cutoff or "Unknown"].
        Rating, CI and vote count are always taken from arena_df, not from
        arena_subset_df.
    """
    # Work on copies so the caller's DataFrames are never modified.
    arena_df_processed = arena_df.copy()
    if arena_subset_df is not None:
        arena_subset_df_processed = arena_subset_df.copy()
    else:
        arena_subset_df_processed = None

    # Sort by rating initially to have a stable order before ranking
    arena_df_processed = arena_df_processed.sort_values(by=["rating"], ascending=False)
    # Compute 'final_ranking' based on CIs if possible
    if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
         arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
         arena_df_processed = arena_df_processed.sort_values(
             by=["final_ranking", "rating"], ascending=[True, False]
         )
    else:
         # Fallback to simple ordering if CI columns are missing
         arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)

    if hidden_models:
        arena_df_processed = arena_df_processed[~arena_df_processed.index.isin(hidden_models)].copy()
        # Recompute ranks so hidden models no longer push visible ones down
        if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
            arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
            # Re-sort based on new ranks
            arena_df_processed = arena_df_processed.sort_values(
                by=["final_ranking", "rating"], ascending=[True, False]
            )
        else:
             arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)


    if arena_subset_df_processed is not None:
        # Filter subset by hidden_models first
        if hidden_models:
             arena_subset_df_processed = arena_subset_df_processed[~arena_subset_df_processed.index.isin(hidden_models)].copy()

        # Ensure models in the subset are also present in the (filtered) main view
        arena_subset_df_processed = arena_subset_df_processed[arena_subset_df_processed.index.isin(arena_df_processed.index)]

        # Proceed only if subset is not empty and has CI columns
        if not arena_subset_df_processed.empty and "rating_q025" in arena_subset_df_processed.columns and "rating_q975" in arena_subset_df_processed.columns:
            # Rank within the subset
            arena_subset_df_processed = arena_subset_df_processed.sort_values(by=["rating"], ascending=False)
            arena_subset_df_processed["final_ranking_subset"] = recompute_final_ranking(arena_subset_df_processed) # Rank within category

            # Restrict the main frame to the subset's models. Note that
            # 'final_ranking' is NOT recomputed here: the baseline rank is the
            # one computed above over all (non-hidden) models of arena_df.
            arena_df_for_join = arena_df_processed[arena_df_processed.index.isin(arena_subset_df_processed.index)][["final_ranking", "rating"]].copy()
            arena_df_for_join.rename(columns={"final_ranking": "final_ranking_baseline"}, inplace=True)


            # Join the subset ranks and baseline ranks
            # (the "rating" column of this combined frame is the subset's rating)
            arena_df_combined = arena_subset_df_processed[["final_ranking_subset", "rating"]].join(
                 arena_df_for_join["final_ranking_baseline"], how="inner"
            )

            # Positive difference = the model ranks better in the subset than in the baseline
            arena_df_combined["ranking_difference"] = arena_df_combined["final_ranking_baseline"] - arena_df_combined["final_ranking_subset"]

            # Sort by subset rank and rating
            arena_df_combined = arena_df_combined.sort_values(
                by=["final_ranking_subset", "rating"], ascending=[True, False]
            )

            # Format the rank string with delta for display
            arena_df_combined["display_ranking"] = arena_df_combined.apply(
                lambda x: create_ranking_str(x["final_ranking_subset"], x["ranking_difference"]),
                axis=1,
            )
            # Keep only the subset's models in the main frame, in subset order
            arena_df_processed = arena_df_processed.loc[arena_df_combined.index]

            # Attach the display columns; "rating" is deliberately not joined,
            # so arena_df_processed keeps the main frame's rating.
            columns_to_join = ["display_ranking", "ranking_difference", "final_ranking_subset"]
            columns_to_join = [col for col in columns_to_join if col in arena_df_combined.columns]
            arena_df_processed = arena_df_processed.join(arena_df_combined[columns_to_join], how="inner")


            # Final order: subset rank first, main-frame rating as tie-breaker.
            if "final_ranking_subset" in arena_df_processed.columns:
                arena_df_processed.sort_values(by=["final_ranking_subset", "rating"], ascending=[True, False], inplace=True)
            else:
                # Fallback sort if the subset rank column is missing
                arena_df_processed.sort_values(by=["rating"], ascending=False, inplace=True)


        else:
            # If subset is empty or lacks CI, disable subset logic
            # (this also suppresses the delta column in the rows below)
            arena_subset_df_processed = None
            # Use the baseline ranking as the display ranking
            arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
            arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)


    else:
        # If no subset is used, display ranking is just the final rank from the main DF
        arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
        # Ensure it's sorted correctly
        arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)


    values = []
    # Build one output row per model, following the final sorted order
    for model_key in arena_df_processed.index:
        row_data = arena_df_processed.loc[model_key]
        # Look up the model's metadata by its key
        model_info = model_table_df[model_table_df["key"] == model_key]
        if model_info.empty:
            continue # Models without metadata are silently left out of the table

        row = []
        # Rank (Display)
        row.append(row_data.get("display_ranking", "")) # Use the calculated display rank

        # Delta (only if subset was processed successfully)
        if arena_subset_df_processed is not None:
            row.append(row_data.get("ranking_difference", 0))

        # Model Name, taken as-is from the metadata table
        row.append(model_info["Model"].values[0])

        # Arena Elo
        row.append(round(row_data["rating"]))

        # 95% CI, shown as distances from the rating; '?' replaces a side
        # whose bound (or the rating itself) is missing
        upper_rating = row_data.get("rating_q975")
        lower_rating = row_data.get("rating_q025")
        current_rating = row_data.get("rating")
        upper_diff = round(upper_rating - current_rating) if pd.notna(upper_rating) and pd.notna(current_rating) else '?'
        lower_diff = round(current_rating - lower_rating) if pd.notna(current_rating) and pd.notna(lower_rating) else '?'
        row.append(f"+{upper_diff}/-{lower_diff}")


        # Votes
        row.append(round(row_data["num_battles"]))

        # Organization
        row.append(model_info["Organization"].values[0])

        # License
        row.append(model_info["License"].values[0])

        # Knowledge Cutoff ("-" in the metadata means unknown)
        cutoff_date = model_info["Knowledge cutoff date"].values[0]
        row.append("Unknown" if cutoff_date == "-" else cutoff_date)

        values.append(row)

    return values


# Internal category key (as found in the Elo results) -> display name.
# Display names stay in English; note that the style-control entry swaps the
# space of the key for an underscore.
key_to_category_name = {
    "full": "Overall",  # might not be used if filtered out later
    "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
    "site_visitors/medium_prompts": "site_visitors/medium_prompts",
    "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style_control",
}

# Display name -> explanation text shown for the category.
cat_name_to_explanation = {
    "Overall": "All queries",
    "crowdsourcing/simple_prompts": "Queries collected via crowdsourcing. Mostly simple ones.",
    "site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
    "site_visitors/medium_prompts:style_control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating.",
}

# Baseline category used for comparison; currently empty but kept.
cat_name_to_baseline = {}

# Categories offered for selection, given as *keys* of key_to_category_name.
# "Overall" and "crowdsourcing/simple_prompts" were removed from the list.
actual_categories = [
    "site_visitors/medium_prompts",
    "site_visitors/medium_prompts:style control",
]

# Default selected category: the requested key if available, otherwise
# "site_visitors/medium_prompts", otherwise the first available category
# (None when there are no categories at all).
req_cat_key = "site_visitors/medium_prompts:style control"
selected_category_key = next(
    (
        candidate
        for candidate in (req_cat_key, "site_visitors/medium_prompts")
        if candidate in actual_categories
    ),
    actual_categories[0] if actual_categories else None,
)
# Display name of the selected category; falls back to the key itself.
selected_category_display_name = key_to_category_name.get(selected_category_key, selected_category_key)


def read_elo_file(elo_results_file, leaderboard_table_file, min_battles=200):
    """Load the pickled Elo results and the model metadata table.

    Loading is best effort: any failure is printed and whatever was loaded
    up to that point is returned, so callers always receive the same five
    values. The metadata CSV is read only after the Elo results have been
    processed successfully.

    Args:
        elo_results_file: Path of the pickle holding a mapping from category
            key to that category's results.
        leaderboard_table_file: Path of the metadata CSV, read with
            `load_leaderboard_table_csv`.
        min_battles: Models need strictly more than this many battles
            ("num_battles") to be kept in a category table. Defaults to 200.

    Returns:
        tuple: (last_updated_time, arena_dfs, category_elo_results,
        elo_results, model_table_df) where
        - last_updated_time is the date part of "last_updated_datetime" of
          the first available category among the style-control, medium
          prompts and "full" keys, or "N/A";
        - arena_dfs maps category display name to the filtered leaderboard
          DataFrame;
        - category_elo_results maps category display name to the raw
          per-category results;
        - elo_results is the unpickled object ({} when loading failed);
        - model_table_df is the metadata DataFrame (empty on failure).
    """
    print('Reading Elo file...')
    arena_dfs = {}
    category_elo_results = {}
    last_updated_time = "N/A"
    elo_results = {}
    model_table_df = pd.DataFrame()

    # Tracks which input is being read so a missing-file message names the
    # file that is actually missing.
    file_label, file_path = "Elo results", elo_results_file

    try:
        # NOTE(security): pickle.load can execute arbitrary code; the results
        # file must come from a trusted source.
        with open(elo_results_file, "rb") as fin:
            elo_results = pickle.load(fin)

        # The first category that carries a timestamp wins, in priority order.
        timestamp_keys = (
            "site_visitors/medium_prompts:style control",
            "site_visitors/medium_prompts",
            "full",
        )
        for cat_key in timestamp_keys:
            if cat_key in elo_results and "last_updated_datetime" in elo_results[cat_key]:
                # Keep only the date part ("YYYY-MM-DD hh:mm" -> "YYYY-MM-DD").
                last_updated_time = elo_results[cat_key]["last_updated_datetime"].split(" ")[0]
                break

        for key, display_name in key_to_category_name.items():
            if key not in elo_results:
                continue
            category_result = elo_results[key]
            if "leaderboard_table_df" not in category_result:
                continue
            df = category_result["leaderboard_table_df"]
            if not isinstance(df, pd.DataFrame):
                continue
            # Results are stored under the display name, which is what the
            # rest of the file uses to look categories up.
            arena_dfs[display_name] = df[df["num_battles"] > min_battles].copy()
            category_elo_results[display_name] = category_result

        file_label, file_path = "Leaderboard table", leaderboard_table_file
        data = load_leaderboard_table_csv(leaderboard_table_file)
        model_table_df = pd.DataFrame(data)

    except FileNotFoundError:
        print(f"Error: {file_label} file not found at {file_path}")
    except Exception as e:
        print(f"Error reading elo file: {e}")
        traceback.print_exc()

    # The same five values are returned even after an error.
    return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df


def build_leaderboard_tab(
    elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
):
    """Build the "Arena" leaderboard UI and wire up its update callbacks.

    The function creates Gradio components, so it has to run while a
    ``gr.Blocks`` context is open (``build_demo`` calls it that way).

    It relies on names defined elsewhere in this module:
    ``selected_category_display_name``, ``selected_category_key``,
    ``actual_categories``, ``key_to_category_name``, ``deprecated_model_name``,
    ``models_10b`` and the helpers ``read_elo_file``, ``get_arena_table``,
    ``filter_deprecated_models_plots``, ``make_default_md_1``,
    ``make_default_md_2``, ``make_arena_leaderboard_md`` and
    ``make_category_arena_leaderboard_md``.

    Args:
        elo_results_file: Path handed to ``read_elo_file`` (Elo results).
        leaderboard_table_file: Path handed to ``read_elo_file`` (model
            metadata table).
        show_plot: When true, the "More Statistics" header and the four
            plots are created and refreshed by the callbacks.
        mirror: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A list of components: ``[md_1, md_2, lb_description, category_deets,
        elo_display_df]``, followed by the four plots and the statistics
        header when ``show_plot`` is true.
    """
    # Table layouts, without and with the "Delta" column.
    base_columns = [
        "Rank* (UB)", "Model", "Arena Elo", "95% CI",
        "Votes", "Organization", "License", "Knowledge Cutoff"
    ]
    delta_columns = [
        "Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
        "Votes", "Organization", "License", "Knowledge Cutoff"
    ]
    base_datatypes = [
        "str", "markdown", "number", "str",
        "number", "str", "str", "str"
    ]
    delta_datatypes = [
        "str", "number", "markdown", "number", "str",
        "number", "str", "str", "str"
    ]
    base_widths = [70, 190, 100, 100, 90, 130, 150, 100]
    delta_widths = [70, 70, 200, 90, 100, 90, 120, 150, 100]

    # Keys of the four figures inside a category result, in the order
    # plot_1 .. plot_4 (which is also the callback output order).
    plot_keys = (
        "win_fraction_heatmap",
        "battle_count_heatmap",
        "bootstrap_elo_rating",
        "average_win_rate_bar",
    )

    def _leaderboard_dataframe(value, with_delta):
        """Create a leaderboard ``gr.Dataframe`` for the given layout."""
        # Fresh list copies are passed so Gradio never shares the constants.
        return gr.Dataframe(
            headers=list(delta_columns if with_delta else base_columns),
            datatype=list(delta_datatypes if with_delta else base_datatypes),
            value=value,
            elem_id="arena_leaderboard_dataframe",
            height=700,
            column_widths=list(delta_widths if with_delta else base_widths),
            wrap=True,
        )

    # Load data once during build time
    try:
        last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(
            elo_results_file, leaderboard_table_file
        )
    except Exception as e:
        print(f"Failed to load initial data: {e}")
        # Set empty defaults to prevent app crash
        last_updated_time = "Error"
        arena_dfs = {}
        category_elo_results = {}
        elo_results = {}
        model_table_df = pd.DataFrame()

    # Decide which category (by display name) feeds the initial view.
    initial_display_name = None
    if selected_category_display_name in arena_dfs:
        initial_display_name = selected_category_display_name
    else:
        # Fallback: try the first actual category's display name.
        fallback_cat_display_name = None
        if actual_categories:
            first_cat_key = actual_categories[0]
            fallback_cat_display_name = key_to_category_name.get(first_cat_key, first_cat_key)

        if fallback_cat_display_name and fallback_cat_display_name in arena_dfs:
            print(f"Warning: Selected category '{selected_category_display_name}' not found. Falling back to '{fallback_cat_display_name}'.")
            initial_display_name = fallback_cat_display_name
        else:
            print(f"Warning: Default category '{selected_category_display_name}' and fallback categories not found in data.")

    if initial_display_name is not None:
        arena_df = arena_dfs[initial_display_name]
        elo_subset_results_init = category_elo_results[initial_display_name]
        initial_plots = [elo_subset_results_init.get(key) for key in plot_keys]
    else:
        arena_df = pd.DataFrame()  # Empty DataFrame
        initial_plots = [None, None, None, None]

    # Apply initial filtering to plots
    p1_init, p2_init, p3_init, p4_init = [
        filter_deprecated_models_plots(plot, hidden_models=deprecated_model_name)
        for plot in initial_plots
    ]

    default_md = make_default_md_1()
    default_md_2 = make_default_md_2()

    with gr.Row():
        with gr.Column(scale=4):
            md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
        with gr.Column(scale=1):
            # Link button that sends users to the voting site.
            vote_button = gr.Button("Vote!", link="https://llmarena.ru")
    md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")

    # Generate initial table data
    if not arena_df.empty and not model_table_df.empty:
        # Initially no subset is passed, so no difference column is produced
        arena_table_vals_init = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
    else:
        arena_table_vals_init = []

    # Single "Arena" tab
    with gr.Tab("Arena", id=0):
        md_arena = make_arena_leaderboard_md(arena_df, last_updated_time)
        lb_description = gr.Markdown(md_arena, elem_id="leaderboard_markdown")

        with gr.Row():
            with gr.Column(scale=2):
                # Choices and value are category *keys*; the callback maps the
                # received key to its display name.
                category_dropdown = gr.Dropdown(
                    choices=actual_categories,
                    value=selected_category_key,
                    label="Category",
                )

            with gr.Column(scale=2):
                category_checkbox = gr.CheckboxGroup(
                    ["Show Deprecated", "Only <10B Models"],
                    label="Apply Filter",
                    info="",
                    value=[],  # Filters off by default
                )

            # Category details (arena_df is passed twice for the initial display)
            default_category_details = make_category_arena_leaderboard_md(
                arena_df, arena_df, name=selected_category_display_name
            ) if not arena_df.empty else "No data for category"

            with gr.Column(scale=4, variant="panel"):
                category_deets = gr.Markdown(
                    default_category_details, elem_id="category_deets"
                )

        # DataFrame for displaying the table; the initial view has no 'Delta' column
        arena_vals = pd.DataFrame(
            arena_table_vals_init, columns=base_columns
        ) if arena_table_vals_init else pd.DataFrame(columns=base_columns)

        # Sort by Elo for initial display
        if "Arena Elo" in arena_vals.columns:
            arena_vals = arena_vals.sort_values(by="Arena Elo", ascending=False)

        elo_display_df = _leaderboard_dataframe(arena_vals.style, with_delta=False)

        gr.Markdown(elem_id="leaderboard_markdown")  # Empty markdown for spacing

        plot_1, plot_2, plot_3, plot_4 = None, None, None, None
        more_stats_md = None
        if show_plot:
            more_stats_md = gr.Markdown(
                f"""## More Statistics for Chatbot Arena""",
                elem_id="leaderboard_header_markdown",
            )
            with gr.Row(elem_id="leaderboard_bars"):
                with gr.Column():
                    gr.Markdown(
                        "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
                        elem_id="plot-title",
                    )
                    plot_3 = gr.Plot(p3_init, show_label=False)
                with gr.Column():
                    gr.Markdown(
                        "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
                        elem_id="plot-title",
                    )
                    plot_4 = gr.Plot(p4_init, show_label=False)
            with gr.Row(elem_id="leaderboard_plots"):
                with gr.Column():
                    gr.Markdown(
                        "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
                        elem_id="plot-title",
                    )
                    plot_1 = gr.Plot(
                        p1_init, show_label=False, elem_id="plot-container"
                    )
                with gr.Column():
                    gr.Markdown(
                        "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
                        elem_id="plot-title",
                    )
                    plot_2 = gr.Plot(p2_init, show_label=False)

    def update_leaderboard_df(arena_table_vals):
        """Turn 9-column (Delta) table rows into a styled DataFrame.

        Returns an empty styled frame with the Delta layout when the rows
        are missing, malformed, or cannot be turned into a DataFrame.
        """
        if not arena_table_vals or not isinstance(arena_table_vals, list) or not arena_table_vals[0] or len(arena_table_vals[0]) != 9:
            print("Warning: Invalid data for styling in update_leaderboard_df. Returning empty DataFrame.")
            return pd.DataFrame(columns=delta_columns).style

        try:
            elo_dataframe = pd.DataFrame(arena_table_vals, columns=delta_columns)
        except Exception as e:
            # Rebuilding the same frame would fail the same way, so hand
            # back an empty styled frame the caller can detect.
            print(f"Error building DataFrame in update_leaderboard_df: {e}")
            traceback.print_exc()
            return pd.DataFrame(columns=delta_columns).style

        def highlight_max(s):
            # Colour the rank cell by the arrow contained in its text
            return [
                "color: green; font-weight: bold" if "↑" in str(v) else
                "color: red; font-weight: bold" if "↓" in str(v) else ""
                for v in s
            ]

        def highlight_rank_max(s):
            # Colour the Delta cell by sign; non-numeric values stay unstyled
            return [
                "color: green; font-weight: bold" if isinstance(v, (int, float)) and v > 0 else
                "color: red; font-weight: bold" if isinstance(v, (int, float)) and v < 0 else ""
                for v in s
            ]

        try:
            return elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
                highlight_rank_max, subset=["Delta"]
            )
        except Exception as e:
            print(f"Error applying styles in update_leaderboard_df: {e}")
            traceback.print_exc()
            # Return unstyled DataFrame on error
            return elo_dataframe.style

    def _empty_updates(message):
        """Return placeholder updates for every wired output.

        ``outputs_list`` always has seven entries (table, four plots, two
        markdowns), whether or not ``show_plot`` is set, so the early-return
        paths must always produce seven values as well.
        """
        empty_md_update = gr.Markdown(value=message)
        return (
            [gr.Dataframe(value=pd.DataFrame().style)]
            + [gr.Plot(value=None)] * 4
            + [empty_md_update, empty_md_update]
        )

    def update_leaderboard_and_plots(category_key, filters):
        """Callback: rebuild table, plots and markdown for a category/filter.

        ``category_key`` is the dropdown value (a category key) and
        ``filters`` the list of ticked checkbox labels. Data is reloaded
        from disk on every call (no caching).
        """
        try:
            _, current_arena_dfs, current_category_elo_results, _, current_model_table_df = read_elo_file(
                elo_results_file, leaderboard_table_file
            )
        except Exception as e:
            print(f"Error reloading data in callback: {e}")
            return _empty_updates("Error loading data.")

        # Data dictionaries are keyed by display name, not by category key
        category_display_name = key_to_category_name.get(category_key, category_key)

        if not current_arena_dfs or category_display_name not in current_arena_dfs or category_display_name not in current_category_elo_results or current_model_table_df.empty:
            print(f"Warning: Data missing for category '{category_display_name}' (key: '{category_key}') after reload.")
            return _empty_updates(f"No data available for category: {category_display_name}")

        arena_subset_df = current_arena_dfs[category_display_name]
        elo_subset_results = current_category_elo_results[category_display_name]

        # Hardcoded baseline category that other categories are compared against
        baseline_key = "site_visitors/medium_prompts:style control"
        baseline_display_name = key_to_category_name.get(baseline_key, baseline_key)

        if baseline_display_name not in current_arena_dfs:
            print(f"Warning: Baseline category '{baseline_display_name}' not found. Using selected category '{category_display_name}' as baseline.")
            baseline_display_name = category_display_name

        arena_df_baseline = current_arena_dfs[baseline_display_name]
        is_baseline = category_display_name == baseline_display_name

        hidden_models_list = None  # Default: show all
        # Filter labels must match the CheckboxGroup choices
        if "Show Deprecated" not in filters:
            hidden_models_list = deprecated_model_name.copy()

        if "Only <10B Models" in filters:
            allowed_models = set(models_10b)
            # Hide every model in the baseline view that is not in the <10B list
            models_to_hide = [
                model for model in arena_df_baseline.index.tolist()
                if model not in allowed_models
            ]

            if hidden_models_list is None:
                hidden_models_list = models_to_hide
            else:
                # Use set to avoid duplicates
                hidden_models_list = list(set(hidden_models_list + models_to_hide))

        arena_table_values = get_arena_table(
            arena_df_baseline,
            current_model_table_df,
            # Pass subset only if it's different from the baseline
            arena_subset_df=(None if is_baseline else arena_subset_df),
            hidden_models=hidden_models_list
        )

        empty_table_update = gr.Dataframe(value=pd.DataFrame().style)
        if not arena_table_values:
            dataframe_update = empty_table_update
        elif not is_baseline:
            # Non-baseline category: rows carry the Delta column
            styled_arena_values = update_leaderboard_df(arena_table_values)
            if isinstance(styled_arena_values, pd.io.formats.style.Styler) and not styled_arena_values.data.empty:
                dataframe_update = _leaderboard_dataframe(styled_arena_values, with_delta=True)
            else:
                dataframe_update = empty_table_update
        else:
            # Baseline category: rows have no Delta column
            df_no_delta = pd.DataFrame(arena_table_values, columns=base_columns)
            dataframe_update = _leaderboard_dataframe(df_no_delta.style, with_delta=False)

        plot_updates = [gr.Plot(value=None)] * 4  # Default empty plot updates
        if show_plot:
            plot_updates = [
                filter_deprecated_models_plots(
                    elo_subset_results.get(key), hidden_models=hidden_models_list
                )
                for key in plot_keys
            ]

        more_stats_md_updated_text = f"""## More Statistics for Chatbot Arena - {category_display_name} """ if show_plot else ""
        more_stats_md_update = gr.Markdown(value=more_stats_md_updated_text)

        # Baseline DF and subset DF are both handed to the markdown helper
        category_details_md_updated_text = make_category_arena_leaderboard_md(
            arena_df_baseline, arena_subset_df, name=category_display_name
        )
        category_deets_update = gr.Markdown(value=category_details_md_updated_text)

        # Order: df, p1, p2, p3, p4, more_stats_md, category_deets
        return [dataframe_update] + plot_updates + [more_stats_md_update, category_deets_update]

    # Output components, always seven: table, 4 plots, stats header, details
    outputs_list = [elo_display_df]
    if show_plot:
        outputs_list.extend([plot_1, plot_2, plot_3, plot_4])
        if more_stats_md is not None:
            outputs_list.append(more_stats_md)
        else:
            outputs_list.append(gr.Markdown(visible=False))
    else:
        # Hidden placeholders keep the output count identical without plots
        outputs_list.extend([gr.Plot(visible=False)] * 4)
        outputs_list.append(gr.Markdown(visible=False))
    outputs_list.append(category_deets)  # Always update category details

    # Both controls trigger the same refresh with the same inputs/outputs
    category_dropdown.change(
        fn=update_leaderboard_and_plots,
        inputs=[category_dropdown, category_checkbox],
        outputs=outputs_list
    )
    category_checkbox.change(
        fn=update_leaderboard_and_plots,
        inputs=[category_dropdown, category_checkbox],
        outputs=outputs_list
    )

    return_components = [md_1, md_2, lb_description, category_deets, elo_display_df]
    if show_plot:
        return_components.extend([plot_1, plot_2, plot_3, plot_4])
        if more_stats_md is not None:
            return_components.append(more_stats_md)

    return return_components


def build_demo(elo_results_file, leaderboard_table_file):
    """Create the Gradio ``Blocks`` app containing the leaderboard tab.

    CSS comes from ``fastchat.serve.gradio_web_server.block_css`` when that
    import succeeds, otherwise a small built-in fallback is used. The theme
    is loaded from ``theme.json``; if that fails for any reason the default
    Gradio theme is used instead.

    Args:
        elo_results_file: Path forwarded to ``build_leaderboard_tab``.
        leaderboard_table_file: Path forwarded to ``build_leaderboard_tab``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    try:
        from fastchat.serve.gradio_web_server import block_css
    except ImportError:
        print("Warning: fastchat.serve.gradio_web_server.block_css not found. Using fallback CSS.")
        # Define a minimal fallback CSS or copy the content here
        block_css = """
        /* Add minimal CSS rules here if needed */
        #arena_leaderboard_dataframe table { font-size: 105%; }
        #leaderboard_markdown .prose { font-size: 110% !important; }
        .app { max-width: 100% !important; padding: 20px !important; }
        a { color: #1976D2; text-decoration: none; }
        a:hover { color: #63A4FF; text-decoration: underline; }
        """

    text_size = gr.themes.sizes.text_lg
    try:
        theme = gr.themes.Default.load("theme.json")
    except Exception as e:
        # Catch Exception, not a bare except, so Ctrl-C and SystemExit still
        # propagate; report the real cause since it may not be a missing file.
        print(f"Warning: could not load theme.json ({e}). Using default Gradio theme.")
        theme = gr.themes.Default(text_size=text_size)  # Fallback theme

    if hasattr(theme, 'text_size'):
        theme.text_size = text_size
    # Apply custom settings if theme object supports it
    if hasattr(theme, 'set'):
        theme.set(
            button_large_text_size="40px",
            button_small_text_size="40px",
            button_large_text_weight="1000",
            button_small_text_weight="1000",
            button_shadow="*shadow_drop_lg",
            button_shadow_hover="*shadow_drop_lg",
            checkbox_label_shadow="*shadow_drop_lg",
            button_shadow_active="*shadow_inset",
            button_secondary_background_fill="*primary_300",
            button_secondary_background_fill_dark="*primary_700",
            button_secondary_background_fill_hover="*primary_200",
            button_secondary_background_fill_hover_dark="*primary_500",
            button_secondary_text_color="*primary_800",
            button_secondary_text_color_dark="white",
        )

    with gr.Blocks(
        title="LLM Arena: Leaderboard",
        theme=theme,
        css=block_css,  # Use loaded or fallback CSS
    ) as demo:
        # Build only the leaderboard tab content, with plots enabled.
        # The returned component list is not needed here.
        build_leaderboard_tab(
            elo_results_file, leaderboard_table_file, show_plot=True, mirror=False
        )
    return demo


if __name__ == "__main__":
    # Script entry point: pick the newest data files in the working
    # directory, build the app and launch it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true", default=False) # Default False for HF
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=7860)
    args = parser.parse_args()

    try:
        elo_result_files = glob.glob("elo_results_*.pkl")
        if not elo_result_files:
            raise FileNotFoundError("No elo_results_*.pkl files found.")
        # Sort by the integer after the last underscore so the highest
        # number is picked; a non-numeric suffix raises and is reported below.
        elo_result_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
        elo_result_file = elo_result_files[-1]
        print(f"Using Elo results file: {elo_result_file}")
    except Exception as e:
        print(f"Error finding Elo results file: {e}")
        print("Please ensure a file matching 'elo_results_NUMBER.pkl' exists.")
        # SystemExit is raised directly: the exit() builtin is injected by
        # the site module and is not guaranteed to exist in every runtime.
        raise SystemExit(1)

    try:
        leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
        if not leaderboard_table_files:
            raise FileNotFoundError("No leaderboard_table_*.csv files found.")
        # Same numeric-suffix ordering as for the Elo results files.
        leaderboard_table_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
        leaderboard_table_file = leaderboard_table_files[-1]
        print(f"Using leaderboard table file: {leaderboard_table_file}")
    except Exception as e:
        print(f"Error finding leaderboard table file: {e}")
        print("Please ensure a file matching 'leaderboard_table_NUMBER.csv' exists.")
        raise SystemExit(1)

    demo = build_demo(elo_result_file, leaderboard_table_file)
    # Launch with the host/port/share settings from the command line
    demo.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
        show_api=False
    )