import gradio as gr
import pandas as pd
from pathlib import Path
from datasets import load_dataset
import os
from huggingface_hub import HfApi, Repository
import numpy as np

api = HfApi()

COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
BASE_DIR = "./evals/"
# def restart_space():
#     api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)
    

# From Open LLM Leaderboard
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

print("Pulling evaluation results")
repo = Repository(
    local_dir=BASE_DIR,
    clone_from=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    repo_type="dataset",
)
repo.git_pull()

# Fetch all per-model results and assemble them into a single leaderboard dataframe.
# Result files are laid out as data/<org>/<file>.json in the evals repo cloned above.
def fetch_and_display_data():
    data_dir = Path(BASE_DIR) / "data"
    orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
    # collect every JSON results file within each org subfolder
    models_results = []
    for org in orgs:
        org_dir = data_dir / org
        files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))]
        for file in files:
            if file.endswith(".json"):
                models_results.append(org + "/" + file)

    # create empty dataframe to add all data to
    df = pd.DataFrame()

    # load each JSON result file separately so models with differing columns can still be combined
    for model in models_results:
        model_data = load_dataset("json", data_files=BASE_DIR + "data/" + model, split="train")
        df2 = pd.DataFrame(model_data)
        # append to the combined dataframe
        df = pd.concat([df2, df])

    # remove the chat_template column
    df = df.drop(columns=["chat_template"])

    # move column "model" to the front
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index('model')))
    df = df.loc[:, cols]

    # select all columns except "model"
    cols = df.columns.tolist()
    cols.remove("model")
    # round the per-subset scores to two decimals
    df[cols] = df[cols].round(2)
    # add a per-model average column
    avg = np.mean(df[cols].values, axis=1).round(2)
    df["average"] = avg
    
    # apply model_hyperlink function to column "model"
    df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))

    # move the average column to the second position
    cols = list(df.columns)
    cols.insert(1, cols.pop(cols.index('average')))
    df = df.loc[:, cols]
    return df

benchmark_text = """
# HERM Results Viewer

We compute the win percentage for a reward model over hand-curated chosen-rejected response pairs for each prompt.
A win is counted when the reward model scores the chosen response higher than the rejected response.

### Subset summary

| Subset                 | Num. Samples (Pre-filtering, post-filtering) | Description                                                       |
| :--------------------- | :------------------------------------------: | :---------------------------------------------------------------- |
| alpacaeval-easy        |                     805                     | Great model vs poor model                                         |
| alpacaeval-length      |                     805                     | Good model vs low-quality model, equal length                     |
| alpacaeval-hard        |                     805                     | Great model vs baseline model                                     |
| mt-bench-easy          |                  28, 28                    | MT Bench 10s vs 1s                                                |
| mt-bench-medium        |                  45, 40                    | MT Bench 9s vs 2-5s                                               |
| mt-bench-hard          |                  45, 37                    | MT Bench 7-8 vs 5-6                                               |
| refusals-dangerous     |                     505                     | Dangerous response vs no response                                 |
| refusals-offensive     |                     704                     | Offensive response vs no response                                 |
| llmbar-natural         |                     100                     | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
| llmbar-adver-neighbor  |                     134                     | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
| llmbar-adver-GPTInst   |                     92                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
| llmbar-adver-GPTOut    |                     47                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
| llmbar-adver-manual    |                     46                      | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
| XSTest                 |                     450                     | TODO curate                                                       |
| (?) repetitiveness     |                                               |                                                                   |
| (?) grammar            |                                               |                                                                   |


For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
"""
leaderboard_data = fetch_and_display_data()
col_types = ["markdown"] + ["number"] * (len(leaderboard_data.columns) - 1)
with gr.Blocks() as app:
    with gr.Row():
        gr.Markdown(benchmark_text)
        
    with gr.Row():
        output_table = gr.Dataframe(
            leaderboard_data.values,
            datatype=col_types,
            headers=leaderboard_data.columns.tolist(),
            elem_id="leaderboard_dataframe",
        )
    
    # Refresh the table when the app loads in the browser
    app.load(fetch_and_display_data, inputs=None, outputs=output_table)

app.launch()