File size: 4,206 Bytes
507a14d
 
9ceb843
 
 
507a14d
 
 
 
 
 
9ceb843
 
 
 
507a14d
 
 
 
 
 
9ceb843
 
 
e4cd4cd
9ceb843
 
507a14d
 
9ceb843
507a14d
9ceb843
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507a14d
9ceb843
 
 
 
507a14d
9ceb843
 
 
 
507a14d
 
9ceb843
507a14d
9ceb843
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507a14d
 
9ceb843
 
 
 
 
 
 
 
507a14d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gradio as gr
import os
from huggingface_hub import HfApi, snapshot_download
from src.utils import load_all_data
from src.md import ABOUT_TEXT
import numpy as np

# Hub client handle; in the visible code it is only referenced by the
# disabled restart helper kept below.
api = HfApi()

# Token handed to the Hub downloads (None when the variable is not set).
COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")

# Dataset repos holding the results, and the local directory each is mirrored to.
evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
prefs_repo = "ai2-rlhf-collab/rm-testset-results"
repo_dir_herm = "./evals/herm/"
repo_dir_prefs = "./evals/prefs/"

# def restart_space():
#     api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)


def _pull_results(repo_id, local_dir):
    """Snapshot the dataset repo ``repo_id`` into ``local_dir``.

    Returns whatever ``snapshot_download`` returns for that call.
    NOTE(review): newer huggingface_hub releases may prefer ``token=`` over
    ``use_auth_token=`` -- confirm against the pinned version before changing.
    """
    return snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        repo_type="dataset",
        use_auth_token=COLLAB_TOKEN,
        etag_timeout=30,
        tqdm_class=None,
    )


print("Pulling evaluation results")
repo = _pull_results(evals_repo, repo_dir_herm)
repo_pref_sets = _pull_results(prefs_repo, repo_dir_prefs)

def avg_over_herm(dataframe):
    """
    Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.

    For each subset name, every column whose name contains that name (substring
    match) is averaged row-wise with NaNs ignored, rounded to 2 decimals, and
    stored in a column named after the subset. The "average" column is then
    recomputed as the rounded NaN-ignoring mean of the five subset columns.

    Args:
        dataframe: pandas DataFrame that must contain "model" and "average"
            columns plus the per-subset score columns. It is NOT modified.

    Returns:
        A new DataFrame with columns
        ["model", "average", "alpacaeval", "mt-bench", "llmbar", "refusals", "hep"].

    Raises:
        KeyError: if "model" or "average" is missing from ``dataframe``.
    """
    subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]

    # Work on a copy: adding the aggregate columns to the caller's frame would
    # leak them into every later use of that frame.
    new_df = dataframe.copy()

    # for each subset, avg the columns that have the subset in the column name, then add a new column with subset name and avg
    for subset in subsets:
        subset_cols = [col for col in new_df.columns if subset in col]
        new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)

    keep_columns = ["model", "average"] + subsets
    # Explicit copy so the assignment below targets an independent frame
    # rather than a slice of ``new_df`` (avoids chained-assignment warnings).
    new_df = new_df[keep_columns].copy()

    # replace average column with new average
    new_df["average"] = np.round(np.nanmean(new_df[subsets].values, axis=1), 2)
    return new_df

def expand_subsets(dataframe):
    """Placeholder for expanding a results table by subset.

    Not implemented yet: the argument is ignored and None is returned.
    """
    # TODO need to modify data/ script to do this
    return None
    
def _rank_by_average(frame):
    """Return ``frame`` ordered by its ``average`` column, highest first."""
    return frame.sort_values(by='average', ascending=False)


def _column_datatypes(frame):
    """Datatype list for a table: "markdown" for the first column, "number" for each remaining one."""
    return ["markdown"] + ["number"] * (len(frame.columns) - 1)


# Tables shown in the UI, each sorted best-first by average.
herm_data = _rank_by_average(load_all_data(repo_dir_herm))
herm_data_avg = _rank_by_average(avg_over_herm(herm_data))
prefs_data = _rank_by_average(load_all_data(repo_dir_prefs))
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)

# Column counts are read only after all tables above have been built, so the
# datatype lists always match the frames as they are at display time.
col_types_herm = _column_datatypes(herm_data)
col_types_herm_avg = _column_datatypes(herm_data_avg)
col_types_prefs = _column_datatypes(prefs_data)
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)

with gr.Blocks() as app:
    # Layout: a title row, then one tab per results table and an "About" tab.
    with gr.Row():
        gr.Markdown("# HERM Results Viewer")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("HERM - Overview"):
            with gr.Row():
                # Bound to its own name: reusing `herm_table` for both HERM
                # tables dropped the only reference to this component.
                herm_table_avg = gr.Dataframe(
                    herm_data_avg.values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    elem_id="herm_dataframe_avg",
                )
        with gr.TabItem("HERM - Detailed"):
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
                )
        with gr.TabItem("Pref Sets - Overview"):
            # Wrapped in a Row like every other tab for a consistent layout.
            with gr.Row():
                pref_sets_table = gr.Dataframe(
                    prefs_data.values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    elem_id="prefs_dataframe",
                )

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)


# Load data when app starts
def load_data_on_start():
    """Reload the three result tables from the local result directories.

    Reads the HERM and preference-set results again and passes each frame to
    the `update` method of its table component: the detailed HERM table, the
    averaged HERM overview table, and the preference-sets table.

    NOTE(review): nothing in the visible code calls this function or registers
    it on an event, and depending on the Gradio version `Component.update`
    may only build an update payload instead of refreshing the component in
    place -- confirm how this is meant to be wired (e.g. via `app.load`).
    """
    data_herm = load_all_data(repo_dir_herm)
    herm_table.update(data_herm)

    # Average the loaded frame (not the directory path), and give the helper
    # its own copy so the detailed frame handed over above is left as loaded.
    data_herm_avg = avg_over_herm(data_herm.copy())
    herm_table_avg.update(data_herm_avg)

    data_prefs = load_all_data(repo_dir_prefs)
    pref_sets_table.update(data_prefs)

# Start serving the Blocks app assembled above.
app.launch()