File size: 3,822 Bytes
f1f9b0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c69ae0d
f1f9b0c
 
 
 
 
 
 
 
 
 
c69ae0d
f1f9b0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import gradio as gr # type: ignore
import pandas as pd
from sotopia_space.constants import MODEL_OPTIONS
# from sotopia_space.utils import apply_length_penalty 

# Length-penalty settings. In the visible code these are only referenced by the
# commented-out slider handlers below.
LP_MODE = "v2"
DEFAULT_LP = 0.5
LP_original_dfs = {}

available_models = []  # to be filled in later

# Leaderboard frames. ``original_df`` is rebound by ``benchmark_table`` when it
# loads the results file; both start out unset.
original_df, ablation_df = None, None

# def slider_change_main(length_penalty):
#     global original_df, ablation_df, LP_MODE
#     adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs) 
#     adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
#     adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
#     # adjusted_df = add_winrates(adjusted_df, LP=length_penalty) 
#     # adjusted_df = adjusted_df.drop(columns=["Length"])
#     adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
#     return adjusted_df

# def slider_change_full(length_penalty, show_winrate):
#     global original_df, ablation_df, LP_MODE
#     adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
#     # sort the models by the "Overall Elo" column
#     adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
#     adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
#     if show_winrate == "none":
#         adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
#         return adjusted_df
#     elif show_winrate == "gpt-3.5":
#         adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
#     elif show_winrate == "gpt-4":
#         adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
#     adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
#     return adjusted_df

def benchmark_table():
    """Build the "BARTScore on 100 samples" leaderboard tab.

    Creates a ``gr.TabItem`` and, inside it, a one-line Markdown caption and a
    read-only ``Dataframe`` showing the contents of
    ``data_dir/bartscore_100_results.jsonl`` (JSON Lines; the path is relative,
    so it is resolved against the current working directory).

    Side effects:
        Rebinds the module-level ``original_df`` to the loaded DataFrame.

    Returns:
        None. The components are registered through the Gradio context
        managers rather than returned.

    NOTE(review): presumably this must be called inside an enclosing
    ``gr.Tabs`` / ``gr.Blocks`` context -- confirm against the caller.
    """
    # ``original_df`` is the only module-level name rebound here, so it is the
    # only one that needs a ``global`` declaration.
    global original_df

    with gr.TabItem(
        "BARTScore on 100 samples",
        elem_id="od-benchmark-tab-table-ablation",
        id=0,
        elem_classes="subtab",
    ):
        original_df = pd.read_json('data_dir/bartscore_100_results.jsonl', lines=True)

        with gr.Row():
            with gr.Column(scale=4):
                gr.Markdown("Models are evaluated using BARTScore")

        # NOTE: the length-penalty slider, the "skip empty results" checkbox and
        # the Rank column that earlier revisions had here are currently
        # disabled; the table shows the file contents as loaded.

        # One display type per column, in column order.
        # NOTE(review): assumes the results file yields exactly three columns
        # (number, markdown, number) -- confirm against the data file.
        column_types = ["number", "markdown", "number"]
        gr.components.Dataframe(
            value=original_df,
            datatype=column_types,
            height=1000,
            elem_id="leaderboard-table",
            interactive=False,
            visible=True,
            min_width=60,
        )