# Hugging Face Spaces: lmarena-ai / arena-hard-viewer (Running)
# File size: 19,765 Bytes
# ad39573

import os
import json
import pandas as pd
import glob
import gradio as gr

# Parsed files are kept here, keyed by the path they were read from
data_cache = {}

# JSONL loading with memoisation
def load_jsonl(file_path):
    """Return the contents of a JSONL file as a pandas DataFrame.

    Successfully parsed files are remembered in ``data_cache`` and the same
    DataFrame object is handed back on later calls. A path that does not
    exist, or a file that fails to parse, yields a fresh empty DataFrame
    that is not cached; parse failures are reported with ``print``.
    """
    try:
        return data_cache[file_path]
    except KeyError:
        pass

    if os.path.exists(file_path):
        try:
            frame = pd.read_json(file_path, lines=True)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
        else:
            data_cache[file_path] = frame
            return frame

    return pd.DataFrame()

def get_available_benchmarks():
    """Get the sorted list of available benchmarks in the data directory.

    Every sub-directory of ``data`` (relative to the current working
    directory) counts as one benchmark; plain files are ignored.

    ``os.listdir`` returns entries in arbitrary order, so the names are
    sorted to keep the result stable across runs and filesystems.

    Raises:
        OSError: propagated from ``os.listdir`` when ``data`` cannot be
            listed (for example ``FileNotFoundError`` if it does not exist).
    """
    return sorted(
        entry for entry in os.listdir("data")
        if os.path.isdir(os.path.join("data", entry))
    )

def get_categories(benchmark):
    """Get the sorted list of categories for a given benchmark.

    Reads ``data/{benchmark}/question.jsonl`` through ``load_jsonl``.

    Returns:
        The distinct non-null values of the ``category`` column, sorted.
        An empty list is returned when the question file yields no rows or
        has no ``category`` column.
    """
    questions = load_jsonl(f"data/{benchmark}/question.jsonl")
    if questions.empty or 'category' not in questions.columns:
        return []
    # Rows without a category show up as NaN, which cannot be ordered
    # against strings; leave them out instead of letting sorted() raise.
    return sorted(questions['category'].dropna().unique().tolist())

def get_languages(benchmark):
    """Return the sorted distinct languages recorded for a benchmark.

    Reads ``data/{benchmark}/question.jsonl``. When that yields no rows or
    has no ``language`` column, the single-item list ``["English"]`` is
    returned as the fallback.
    """
    question_df = load_jsonl(f"data/{benchmark}/question.jsonl")

    has_language_data = not question_df.empty and 'language' in question_df.columns
    if not has_language_data:
        # Fallback used when nothing language-specific is available
        return ["English"]

    distinct = question_df['language'].unique().tolist()
    distinct.sort()
    return distinct

def get_judges(benchmark):
    """Get the sorted list of available judges for a benchmark.

    Each sub-directory of ``data/{benchmark}/model_judgment`` is treated as
    one judge; plain files are ignored.

    Returns:
        Judge directory names in sorted order, or an empty list when the
        judgment directory is missing or is not a directory. Sorting makes
        the result independent of the arbitrary order in which
        ``os.listdir`` reports entries.
    """
    judgment_dir = f"data/{benchmark}/model_judgment"
    if not os.path.isdir(judgment_dir):
        return []
    return sorted(
        entry for entry in os.listdir(judgment_dir)
        if os.path.isdir(os.path.join(judgment_dir, entry))
    )

def get_models(benchmark, judge):
    """Get the sorted list of models that have judgments by the given judge.

    A model is any ``*.jsonl`` file directly inside
    ``data/{benchmark}/model_judgment/{judge}``; the model name is the file
    name without its final extension.

    Returns:
        Model names in sorted order, or an empty list when ``judge`` is
        falsy or the judge directory does not exist.
    """
    if not judge:
        return []

    judgment_dir = f"data/{benchmark}/model_judgment/{judge}"
    if not os.path.isdir(judgment_dir):
        return []

    # Escape the directory part so that glob metacharacters in a benchmark
    # or judge name (e.g. "[", "*", "?") are matched literally.
    pattern = os.path.join(glob.escape(judgment_dir), "*.jsonl")
    return sorted(
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob(pattern)
    )

def get_questions(benchmark, category=None, language=None):
    """Get ``(uid, preview)`` pairs for a benchmark's questions.

    Args:
        benchmark: Benchmark directory name under ``data``.
        category: Keep only rows with this category. Falsy values and
            ``"All"`` disable the filter, as does a missing ``category``
            column.
        language: Keep only rows with this language. Falsy values and
            ``"All"`` disable the filter, as does a missing ``language``
            column.

    Returns:
        A list of ``(uid, preview)`` tuples in file order. The preview is
        the prompt, cut to 100 characters plus ``"..."`` when longer. When
        several questions share the same preview, each of them gets
        `` [uid]`` appended, so labels stay distinct when callers use them
        as dictionary keys or look them up by value.
    """
    from collections import Counter

    questions = load_jsonl(f"data/{benchmark}/question.jsonl")
    if questions.empty:
        return []

    # Apply category filter if provided and column exists
    if category and category != "All" and 'category' in questions.columns:
        questions = questions[questions['category'] == category]

    # Apply language filter if provided and column exists
    if language and language != "All" and 'language' in questions.columns:
        questions = questions[questions['language'] == language]

    uids = questions['uid'].tolist()
    previews = [
        prompt[:100] + "..." if len(prompt) > 100 else prompt
        for prompt in questions['prompt'].tolist()
    ]

    # Identical previews would be indistinguishable in a dropdown; tag
    # every member of such a group with its uid.
    preview_counts = Counter(previews)
    return [
        (uid, f"{preview} [{uid}]" if preview_counts[preview] > 1 else preview)
        for uid, preview in zip(uids, previews)
    ]

def get_model_answer(benchmark, model, uid):
    """Return the answer text a model gave for the question ``uid``.

    Looks in ``data/{benchmark}/model_answer/{model}.jsonl`` for the first
    row whose ``uid`` matches and reads the second entry of its
    ``messages`` list. Placeholder strings are returned when nothing is
    found, when that entry is not an assistant message, or when extraction
    raises.
    """
    answers_df = load_jsonl(f"data/{benchmark}/model_answer/{model}.jsonl")
    if answers_df.empty:
        return "No answer found"

    matches = answers_df[answers_df['uid'] == uid]
    if matches.empty:
        return "No answer found"

    try:
        messages = matches.iloc[0]['messages']
        if len(messages) < 2:
            return "No answer found"

        # The reply is expected in the second slot of the conversation
        reply = messages[1]
        if not ('role' in reply and reply['role'] == 'assistant'):
            return "Invalid message format"

        content = reply['content']
        # Content may be a plain string, a dict holding 'answer', or anything else
        if isinstance(content, str):
            return content
        if isinstance(content, dict) and 'answer' in content:
            return content['answer']
        return str(content)
    except Exception as e:
        return f"Error extracting answer: {str(e)}"

def get_judgment(benchmark, judge, model, uid):
    """Return the first two judged games for a model on question ``uid``.

    Reads ``data/{benchmark}/model_judgment/{judge}/{model}.jsonl`` and takes
    the ``games`` value of the first row whose ``uid`` matches.

    Returns:
        A ``(first_game, second_game)`` pair; a missing game is ``None``.
    """
    judgment_df = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
    if judgment_df.empty:
        return None, None

    rows = judgment_df[judgment_df['uid'] == uid]
    if rows.empty:
        return None, None

    games = rows.iloc[0]['games']
    if len(games) >= 2:
        return games[0], games[1]

    # Fewer than two games: report whatever single game exists, if any
    only_game = games[0] if games else None
    return only_game, None

def format_judgment(game):
    """Render one judged game as Markdown: a score heading plus the judgment text.

    A falsy ``game`` produces the placeholder "No judgment available".
    The text is the ``answer`` entry when ``judgment`` is a dict that has
    one, otherwise ``str()`` of the ``judgment`` value (``{}`` if absent).
    """
    if not game:
        return "No judgment available"

    verdict = game.get('judgment', {})
    has_answer = isinstance(verdict, dict) and 'answer' in verdict
    body = verdict['answer'] if has_answer else str(verdict)

    score = game.get('score', 'No score')
    return f"### Score: {score}\n\n{body}"

# Gradio interface functions
def update_categories(benchmark):
    """Build the category dropdown for ``benchmark``: "All" followed by its categories, with "All" selected."""
    return gr.Dropdown(choices=["All", *get_categories(benchmark)], value="All")

def update_languages(benchmark):
    """Build the language dropdown for ``benchmark``.

    Choices are "All" followed by the benchmark's languages; "English" is
    preselected when present, otherwise the first choice.
    """
    language_choices = ["All", *get_languages(benchmark)]
    if "English" in language_choices:
        selected = "English"
    else:
        selected = language_choices[0]
    return gr.Dropdown(choices=language_choices, value=selected)

def update_judges(benchmark):
    """Build the judge dropdown for ``benchmark``, preselecting the first judge (or nothing if there are none)."""
    judge_choices = get_judges(benchmark)
    if judge_choices:
        selected = judge_choices[0]
    else:
        selected = None
    return gr.Dropdown(choices=judge_choices, value=selected)

def update_models(benchmark, judge):
    """Build the model dropdown for a benchmark/judge pair, preselecting the first model (or nothing if there are none)."""
    model_choices = get_models(benchmark, judge)
    if model_choices:
        selected = model_choices[0]
    else:
        selected = None
    return gr.Dropdown(choices=model_choices, value=selected)

def update_questions(benchmark, category, language):
    """Build the question dropdown for the current benchmark/category/language.

    Returns:
        A ``(dropdown, preview_to_uid)`` pair. The dict maps every preview
        shown in the dropdown to its question uid; with no matching
        questions the dropdown is empty and the dict is ``{}``.
    """
    pairs = get_questions(benchmark, category, language)
    if not pairs:
        return gr.Dropdown(choices=[], value=None), {}

    # Previews are what the user sees; keep the way back to each uid
    preview_to_uid = {}
    for uid, preview in pairs:
        preview_to_uid[preview] = uid

    previews = list(preview_to_uid)
    return gr.Dropdown(choices=previews, value=previews[0]), preview_to_uid

def display_content(benchmark, category, language, judge, model, question, question_dict):
    """Collect everything shown for one selected question.

    Args:
        benchmark: Benchmark directory name under ``data``.
        category: Unused here; accepted so the UI can pass its full state.
        language: Unused here; accepted so the UI can pass its full state.
        judge: Judge directory name under ``model_judgment``.
        model: Name of the evaluated model (file stem of its ``.jsonl``).
        question: Preview string currently selected in the dropdown.
        question_dict: Mapping from preview string to question uid.

    Returns:
        A 5-tuple ``(question_text, baseline_answer, model_answer,
        judgment1, judgment2)`` of display strings. Placeholder strings are
        used for whatever cannot be found; a missing question or judgment
        file is reported the same way as a missing row.
    """
    placeholders = ("No baseline answer", "No model answer", "No judgment", "No judgment")

    if not question or not question_dict or question not in question_dict:
        return ("No question selected", *placeholders)

    uid = question_dict[question]

    # Load the question text. load_jsonl returns a column-less DataFrame
    # when the file is missing, so check before indexing by 'uid'.
    questions_df = load_jsonl(f"data/{benchmark}/question.jsonl")
    if questions_df.empty:
        return ("Question not found", *placeholders)
    question_row = questions_df[questions_df['uid'] == uid]
    if question_row.empty:
        return ("Question not found", *placeholders)

    question_text = question_row.iloc[0]['prompt']

    # Load judgments and identify the baseline model; the same empty-file
    # guard applies (e.g. no judge or model selected yet).
    judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
    if judgments.empty:
        return (question_text, *placeholders)
    judgment_row = judgments[judgments['uid'] == uid]
    if judgment_row.empty:
        return (question_text, *placeholders)

    baseline_model = judgment_row.iloc[0]['baseline']

    # Get answers
    baseline_answer = get_model_answer(benchmark, baseline_model, uid)
    model_answer = get_model_answer(benchmark, model, uid)

    # Get judgments
    game1, game2 = get_judgment(benchmark, judge, model, uid)

    return (
        question_text,
        baseline_answer,
        model_answer,
        format_judgment(game1),
        format_judgment(game2),
    )

# Initialize app components based on selected benchmark
def init_app(benchmark):
    """Compute the full initial UI state for ``benchmark``.

    Returns an 11-tuple: the category, language, judge, model and question
    dropdowns (each with its default selected), the preview-to-uid dict,
    and the five display strings (question text, baseline answer, model
    answer, judgment 1, judgment 2) for the default selection.
    """
    default_category = "All"
    category_choices = ["All", *get_categories(benchmark)]

    language_choices = ["All", *get_languages(benchmark)]
    if "English" in language_choices:
        default_language = "English"
    else:
        default_language = language_choices[0]

    judge_choices = get_judges(benchmark)
    default_judge = judge_choices[0] if judge_choices else None

    model_choices = get_models(benchmark, default_judge) if default_judge else []
    default_model = model_choices[0] if model_choices else None

    pairs = get_questions(benchmark, default_category, default_language)
    question_dict = {preview: uid for uid, preview in pairs}
    question_options = list(question_dict)
    default_question = question_options[0] if question_options else None

    # Content for the default selection, or placeholders when incomplete
    if default_question and default_model and default_judge:
        shown = display_content(
            benchmark, default_category, default_language,
            default_judge, default_model, default_question, question_dict
        )
    else:
        shown = (
            "No question available",
            "No baseline answer",
            "No model answer",
            "No judgment",
            "No judgment",
        )
    question_text, baseline_ans, model_ans, judgment1, judgment2 = shown

    return (
        gr.Dropdown(choices=category_choices, value=default_category),
        gr.Dropdown(choices=language_choices, value=default_language),
        gr.Dropdown(choices=judge_choices, value=default_judge),
        gr.Dropdown(choices=model_choices, value=default_model),
        gr.Dropdown(choices=question_options, value=default_question),
        question_dict,
        question_text,
        baseline_ans, model_ans,
        judgment1, judgment2
    )

# Function to go to the next question
def next_question(benchmark, category, language, current_question, question_dict):
    """Return a dropdown update selecting the question after ``current_question``.

    The list wraps around after the last question. If the current preview is
    not among the filtered questions, the first one is selected (or nothing
    when the list is empty). ``question_dict`` is accepted but not used.
    """
    previews = [preview for _, preview in get_questions(benchmark, category, language)]

    try:
        position = previews.index(current_question)
    except ValueError:
        return gr.Dropdown(value=previews[0] if previews else None)

    following = (position + 1) % len(previews)
    return gr.Dropdown(value=previews[following])

# Create Gradio app
def create_app():
    """Build the Gradio Blocks UI of the benchmark viewer.

    The default benchmark is "arena-hard-v2.0" when available, otherwise the
    first entry returned by ``get_available_benchmarks``. All widgets are
    pre-populated from ``init_app`` for that benchmark, and change/click
    handlers keep the dropdowns, the question text, both answer panels and
    both judgments in sync.

    Returns:
        The assembled ``gr.Blocks`` application (not launched).

    Raises:
        IndexError: if ``get_available_benchmarks`` returns an empty list.
    """
    benchmarks = get_available_benchmarks()
    default_benchmark = "arena-hard-v2.0" if "arena-hard-v2.0" in benchmarks else benchmarks[0]

    # Initial widget state for the default benchmark
    (
        category_init, language_init, judge_init, model_init, question_init,
        initial_question_dict,
        initial_question_text,
        initial_baseline, initial_model,
        initial_judgment1, initial_judgment2,
    ) = init_app(default_benchmark)

    with gr.Blocks() as app:
        gr.Markdown(
            '''# Arena-Hard-Auto Benchmark Viewer
            
            Arena-Hard-Auto is an automatic evaluation tool for instruction-tuned LLMs. It has the highest correlation and separability to LMArena (Chatbot Arena) among popular open-ended LLM benchmarks. If you are curious to see how well your model might perform on LMArena before deploying, we recommend trying Arena-Hard-Auto's newest evaluation set, **Arena-Hard-v2.0-Preview**.
            
            **Repo:** https://github.com/lmarena/arena-hard-auto
            
            **Paper:** https://arxiv.org/abs/2406.11939
            '''
        )

        with gr.Row():
            with gr.Column():
                benchmark_dropdown = gr.Dropdown(
                    choices=benchmarks,
                    value=default_benchmark,
                    label="Benchmark"
                )

                category_dropdown = gr.Dropdown(
                    choices=category_init.choices,
                    value=category_init.value,
                    label="Category"
                )

                language_dropdown = gr.Dropdown(
                    choices=language_init.choices,
                    value=language_init.value,
                    label="Language"
                )

            with gr.Column():
                judge_dropdown = gr.Dropdown(
                    choices=judge_init.choices,
                    value=judge_init.value,
                    label="Judge Model"
                )

                model_dropdown = gr.Dropdown(
                    label="Model to Evaluate",
                    choices=model_init.choices,
                    value=model_init.value,
                )

        question_dict = gr.State(initial_question_dict)
        question_dropdown = gr.Dropdown(
            choices=question_init.choices,
            value=question_init.value,
            label="Select Question"
        )

        # Add a next question button
        next_button = gr.Button("Next Question")

        # Display the question. The heading is its own static component:
        # every handler writes only the bare prompt into question_display,
        # so a heading baked into the initial value would vanish on the
        # first update.
        gr.Markdown("---")
        gr.Markdown("### Question")
        question_display = gr.Markdown(value=initial_question_text)

        with gr.Tabs():
            with gr.TabItem("Game 1: Baseline (A) vs Model (B)"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Baseline (A)")
                        baseline_answer1 = gr.Markdown(value=initial_baseline)
                    with gr.Column():
                        gr.Markdown("### Model (B)")
                        model_answer1 = gr.Markdown(value=initial_model)
                gr.Markdown("---")
                gr.Markdown("### Judgment")
                judgment1 = gr.Markdown(value=initial_judgment1)

            with gr.TabItem("Game 2: Model (A) vs Baseline (B)"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Model (A)")
                        model_answer2 = gr.Markdown(value=initial_model)
                    with gr.Column():
                        gr.Markdown("### Baseline (B)")
                        baseline_answer2 = gr.Markdown(value=initial_baseline)
                gr.Markdown("---")
                gr.Markdown("### Judgment")
                judgment2 = gr.Markdown(value=initial_judgment2)

        gr.Markdown("---")
        gr.Markdown("### Citation")
        gr.Markdown("If you find this tool useful, please cite the following papers:")
        gr.Markdown(
            '''```bibtex
@article{li2024crowdsourced,
  title={From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline},
  author={Li, Tianle and Chiang, Wei-Lin and Frick, Evan and Dunlap, Lisa and Wu, Tianhao and Zhu, Banghua and Gonzalez, Joseph E and Stoica, Ion},
  journal={arXiv preprint arXiv:2406.11939},
  year={2024}
}
@misc{arenahard2024,
    title = {From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline},
    url = {https://lmsys.org/blog/2024-04-19-arena-hard/},
    author = {Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica},
    month = {April},
    year = {2024}
}
```''')

        # --- Event wiring helpers -------------------------------------

        def display_kwargs():
            """Arguments for an event that recomputes the shown content."""
            return dict(
                fn=display_content,
                inputs=[
                    benchmark_dropdown, category_dropdown, language_dropdown,
                    judge_dropdown, model_dropdown, question_dropdown, question_dict,
                ],
                outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2],
            )

        def mirror_to_game2(event):
            """After ``event``, copy the Game 1 answers into the Game 2 tab."""
            return event.then(
                fn=lambda model, baseline: (model, baseline),
                inputs=[model_answer1, baseline_answer1],
                outputs=[model_answer2, baseline_answer2],
            )

        def refresh_after(event):
            """After ``event``, recompute the shown content and mirror it."""
            return mirror_to_game2(event.then(**display_kwargs()))

        # --- Event handlers (registered in a fixed order) -------------

        # Benchmark change: rebuild every widget from scratch
        mirror_to_game2(
            benchmark_dropdown.change(
                fn=init_app,
                inputs=benchmark_dropdown,
                outputs=[
                    category_dropdown, language_dropdown, judge_dropdown, model_dropdown,
                    question_dropdown, question_dict,
                    question_display,
                    baseline_answer1, model_answer1,
                    judgment1, judgment2
                ]
            )
        )

        # Update questions when category changes
        refresh_after(
            category_dropdown.change(
                fn=update_questions,
                inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
                outputs=[question_dropdown, question_dict]
            )
        )

        # Update questions when language changes
        refresh_after(
            language_dropdown.change(
                fn=update_questions,
                inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
                outputs=[question_dropdown, question_dict]
            )
        )

        # Update models when judge changes
        refresh_after(
            judge_dropdown.change(
                fn=update_models,
                inputs=[benchmark_dropdown, judge_dropdown],
                outputs=model_dropdown
            )
        )

        # Display content when model changes
        mirror_to_game2(model_dropdown.change(**display_kwargs()))

        # Display content when question changes
        mirror_to_game2(question_dropdown.change(**display_kwargs()))

        # Handle next question button
        next_button.click(
            fn=next_question,
            inputs=[benchmark_dropdown, category_dropdown, language_dropdown, question_dropdown, question_dict],
            outputs=question_dropdown
        )

    return app

if __name__ == "__main__":
    import argparse

    # Command-line options: bind address, optional port, optional share flag
    cli = argparse.ArgumentParser()
    cli.add_argument("--host", type=str, default="0.0.0.0")
    cli.add_argument("--port", type=int)
    cli.add_argument("--share", action="store_true")
    options = cli.parse_args()

    # Build the UI, then serve it with the requested settings
    app = create_app()
    app.launch(
        server_name=options.host,
        server_port=options.port,
        share=options.share,
    )