import gradio as gr
import pandas as pd
from pathlib import Path
from datasets import load_dataset
import os
from huggingface_hub import HfApi, Repository
import numpy as np

api = HfApi()

COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
BASE_DIR = "./evals/"
# def restart_space():
#     api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)
    

# From Open LLM Leaderboard
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

print("Pulling evaluation results")
repo = Repository(
    local_dir=BASE_DIR,
    clone_from=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    repo_type="dataset",
)
repo.git_pull()

# Fetch all per-model results and assemble them into a single leaderboard dataframe.
# Result files are laid out as data/<org>/<file>.json in the evals repo cloned above.
def fetch_and_display_data():
    data_dir = Path(BASE_DIR) / "data"
    orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
    # collect every JSON results file within each org subfolder
    models_results = []
    for org in orgs:
        org_dir = data_dir / org
        files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))]
        for file in files:
            if file.endswith(".json"):
                models_results.append(org + "/" + file)

    # create empty dataframe to add all data to
    df = pd.DataFrame()

    # load each JSON result file separately so models with differing columns can still be combined
    for model in models_results:
        model_data = load_dataset("json", data_files=BASE_DIR + "data/" + model, split="train")
        df2 = pd.DataFrame(model_data)
        # append to the combined dataframe
        df = pd.concat([df2, df])

    # remove the chat_template column
    df = df.drop(columns=["chat_template"])

    # move column "model" to the front
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index('model')))
    df = df.loc[:, cols]

    # select all columns except "model"
    cols = df.columns.tolist()
    cols.remove("model")
    # round the per-subset scores to two decimals
    df[cols] = df[cols].round(2)
    # add a per-model average column
    avg = np.mean(df[cols].values, axis=1).round(2)
    df["average"] = avg
    
    # apply model_hyperlink function to column "model"
    df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))

    # move the average column to the second position
    cols = list(df.columns)
    cols.insert(1, cols.pop(cols.index('average')))
    df = df.loc[:, cols]
    return df

benchmark_text = """
# HERM Results Viewer

We compute the win percentage for a reward model over hand-curated chosen-rejected response pairs for each prompt.
A win is counted when the reward model scores the chosen response higher than the rejected response.

### Subset summary

| Subset                 | Num. Samples (Pre-filtering, post-filtering) | Description                                                       |
| :--------------------- | :------------------------------------------: | :---------------------------------------------------------------- |
| alpacaeval-easy        |                     805                     | Great model vs poor model                                         |
| alpacaeval-length      |                     805                     | Good model vs low-quality model, equal length                     |
| alpacaeval-hard        |                     805                     | Great model vs baseline model                                     |
| mt-bench-easy          |                  28, 28                    | MT Bench 10s vs 1s                                                |
| mt-bench-medium        |                  45, 40                    | MT Bench 9s vs 2-5s                                               |
| mt-bench-hard          |                  45, 37                    | MT Bench 7-8 vs 5-6                                               |
| refusals-dangerous     |                     505                     | Dangerous response vs no response                                 |
| refusals-offensive     |                     704                     | Offensive response vs no response                                 |
| llmbar-natural         |                     100                     | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
| llmbar-adver-neighbor  |                     134                     | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
| llmbar-adver-GPTInst   |                     92                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
| llmbar-adver-GPTOut    |                     47                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
| llmbar-adver-manual    |                     46                      | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
| XSTest                 |                     450                     | TODO curate                                                       |
| (?) repetitiveness     |                                               |                                                                   |
| (?) grammar            |                                               |                                                                   |


For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
"""
leaderboard_data = fetch_and_display_data()
col_types = ["markdown"] + ["number"] * (len(leaderboard_data.columns) - 1)
with gr.Blocks() as app:
    with gr.Row():
        gr.Markdown(benchmark_text)
        
    with gr.Row():
        output_table = gr.Dataframe(
            leaderboard_data.values,
            datatype=col_types,
            headers=leaderboard_data.columns.tolist(),
            elem_id="leaderboard_dataframe",
        )
    
    # Refresh the table when the app loads in the browser
    app.load(fetch_and_display_data, inputs=None, outputs=output_table)

app.launch()