File size: 3,677 Bytes
15188ef
 
91d4a22
 
 
15188ef
91d4a22
d715186
47a8f30
 
 
 
15188ef
91d4a22
 
 
 
 
 
15188ef
91d4a22
15188ef
91d4a22
 
 
 
 
15188ef
91d4a22
 
15188ef
91d4a22
 
 
 
15188ef
91d4a22
f7f1bb2
15188ef
 
 
 
91d4a22
15188ef
91d4a22
 
15188ef
 
91d4a22
 
 
15188ef
 
f7f1bb2
15188ef
 
91d4a22
47a8f30
 
15188ef
91d4a22
15188ef
91d4a22
a7cf972
e72ba15
fa30761
7ebb846
38431a0
e72ba15
 
91d4a22
501c623
91d4a22
 
 
ac3517b
91d4a22
f7f1bb2
91d4a22
274dd90
 
 
 
 
 
 
91d4a22
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import gradio as gr
import pandas as pd
import numpy as np
from collections import defaultdict
from gradio_leaderboard import Leaderboard, SelectColumns

# Read the per-row run/success counts used for every pass@k figure below
df = pd.read_csv('results.csv')

# Coerce the 'Model' and 'Scenario' columns to str in a single pass
df = df.astype({'Model': str, 'Scenario': str})

# Function to estimate pass@k
def estimate_pass_at_k(num_samples, num_correct, k):
    """Estimate pass@k for each (samples, correct) pair.

    For a pair with ``n`` samples of which ``c`` are correct, the estimate is
    ``1 - prod(1 - k / i for i in n-c+1 .. n)``, which is algebraically
    ``1 - C(n - c, k) / C(n, k)``.

    Args:
        num_samples: Iterable of per-problem sample counts (``n``).
        num_correct: Iterable of per-problem correct counts (``c``), paired
            positionally with ``num_samples`` (``zip`` stops at the shorter).
        k: The ``k`` in pass@k.

    Returns:
        A NumPy array with one estimate per (n, c) pair.
    """
    def estimator(n, c):
        # No correct sample at all: the pass rate is 0 whatever k is.
        # Without this guard the ``n - c < k`` shortcut below would report
        # 1.0 whenever k exceeds n, even though nothing ever passed.
        if c <= 0:
            return 0.0
        # Fewer than k incorrect samples: any k draws must include a correct one.
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    return np.array([estimator(n, c) for n, c in zip(num_samples, num_correct)])

# Function to calculate pass@k
def calculate_pass_at_k(df, model, scenario, k_values=(1, 5, 10)):
    """Compute the mean pass@k of one model within one scenario.

    Args:
        df: DataFrame with 'Model', 'Scenario', 'Runs' and 'Successes' columns.
        model: Value compared for equality against the 'Model' column.
        scenario: Value compared for equality against the 'Scenario' column.
        k_values: Iterable of k values to evaluate. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        A dict mapping ``"pass@{k}"`` to the mean of the per-row estimates
        over the rows matching both ``model`` and ``scenario``. When no row
        matches, each value is the mean of an empty array.
    """
    row_mask = (df['Model'] == model) & (df['Scenario'] == scenario)
    matching_rows = df[row_mask]
    num_samples = matching_rows['Runs'].values
    num_correct = matching_rows['Successes'].values

    return {
        f"pass@{k}": estimate_pass_at_k(num_samples, num_correct, k).mean()
        for k in k_values
    }

# Callback for the "Filter" button: pass@k of one model in one scenario
def filter_data(model, scenario):
    """Return a one-row DataFrame of pass@k values for ``model`` and ``scenario``.

    Reads the module-level ``df`` and delegates the computation to
    ``calculate_pass_at_k``; the resulting dict becomes the single row, so
    its keys are the column names.
    """
    return pd.DataFrame([calculate_pass_at_k(df, model, scenario)])

# Build the leaderboard widget
def init_leaderboard(dataframe, height=600):
    """Create a non-interactive ``Leaderboard`` component showing ``dataframe``.

    Args:
        dataframe: Table to display; its columns are declared below as one
            markdown column followed by three number columns.
        height: Value forwarded to the component's ``height`` argument.

    Returns:
        The constructed ``Leaderboard`` component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or has no rows/columns.
    """
    # Refuse to build a widget that has nothing to show.
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    column_picker = SelectColumns(
        # Columns displayed by default
        default_selection=["Model", "pass@1", "pass@5", "pass@10"],
        # No column is locked in place
        cant_deselect=[],
        label="Select Columns to Display:",
    )

    return Leaderboard(
        value=dataframe,
        # One type per column of the table
        datatype=["markdown", "number", "number", "number"],
        select_columns=column_picker,
        # Only the model name is searchable
        search_columns=["Model"],
        # Nothing hidden, no column filters
        hide_columns=[],
        filter_columns=[],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
        height=height,
    )

# Gradio interface
# Dropdown choices are the distinct values present in the loaded data.
models = df['Model'].unique().tolist()
scenarios = df['Scenario'].unique().tolist()

demo = gr.Blocks()

with demo:
    # Page title followed by a row of project links
    gr.Markdown("# 🏆 WebApp1K Models Leaderboard")
    gr.Markdown(
        "## [Discord](https://discord.gg/3qpAbWC7) "
        "[HF Daily Papers](https://huggingface.co/papers?date=2024-09-10) "
        "[Blog](https://huggingface.co/blog/onekq/all-llms-write-great-code) "
        "[Github](https://github.com/onekq/WebApp1k) "
        "[AI Models](https://www.aimodels.fyi/papers/arxiv/webapp1k-practical-code-generation-benchmark-web-app)"
    )

    # Overall table: per model, the mean pass@1/5/10 across all of its rows
    complete_pass_at_k = (
        df.groupby('Model')[['Runs', 'Successes']]
        .apply(lambda grp: pd.Series({
            f"pass@{k}": estimate_pass_at_k(
                grp['Runs'].values, grp['Successes'].values, k
            ).mean()
            for k in (1, 5, 10)
        }))
        .reset_index()
    )

    leaderboard = init_leaderboard(complete_pass_at_k, height=800)

    # Per-model / per-category drill-down below the leaderboard
    model_input = gr.Dropdown(choices=models, label="Select Model")
    scenario_input = gr.Dropdown(choices=scenarios, label="Select Category")
    output = gr.DataFrame(headers=["pass@1", "pass@5", "pass@10"])

    filter_button = gr.Button("Filter")
    filter_button.click(
        fn=filter_data,
        inputs=[model_input, scenario_input],
        outputs=output,
    )

# Start serving the app
demo.launch()