Spaces:

allenai
/

reward-bench

Running

App Files Files Community

natolambert committed on Jan 2, 2024

Commit

507a14d

1 Parent(s): d7244ec

init

Browse files

Files changed (3) hide show

.gitignore +1 -0
app.py +129 -0
requirements.txt +2 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ evals/

app.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import gradio as gr
+import pandas as pd
+from pathlib import Path
+from datasets import load_dataset
+import json
+import os
+from huggingface_hub import HfApi, Repository
+import numpy as np
+# Hugging Face Hub client. In the visible code it is only referenced by the
+# commented-out restart helper below.
+api = HfApi()
+# Access token read from the environment; None when COLLAB_TOKEN is unset.
+COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
+# Hub dataset repository that holds the evaluation result files.
+evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
+# Local directory the results repository is cloned into. Note the trailing
+# slash: some code in this file builds paths from it by string concatenation.
+BASE_DIR = "./evals/"
+# Disabled helper that would restart the viewer Space through the Hub API.
+# def restart_space():
+#     api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)
+# Adapted from the Open LLM Leaderboard
+def model_hyperlink(link, model_name):
+    """Return an HTML anchor that opens *link* in a new tab, labelled *model_name*.
+
+    Both arguments are interpolated into the markup as-is (no escaping).
+    """
+    anchor_style = (
+        "color: var(--link-text-color); "
+        "text-decoration: underline;"
+        "text-decoration-style: dotted;"
+    )
+    opening_tag = f'<a target="_blank" href="{link}" style="{anchor_style}">'
+    return opening_tag + f"{model_name}</a>"
+# Import-time side effect: set up a local checkout of the results dataset
+# repository in BASE_DIR and pull it, so the result files are on disk before
+# the leaderboard table is built.
+print("Pulling evaluation results")
+# NOTE(review): newer huggingface_hub releases appear to deprecate `Repository`
+# and the `use_auth_token` argument -- confirm against the installed version.
+repo = Repository(
+    local_dir=BASE_DIR,
+    clone_from=evals_repo,
+    use_auth_token=COLLAB_TOKEN,
+    repo_type="dataset",
+)
+# Bring the checkout up to date in case the directory already existed.
+repo.git_pull()
+def _list_result_files(data_dir):
+    """Return every ``<org>/<name>.json`` file directly under *data_dir*, sorted by path."""
+    result_files = []
+    for org_dir in sorted(p for p in data_dir.iterdir() if p.is_dir()):
+        result_files.extend(
+            sorted(f for f in org_dir.iterdir() if f.is_file() and f.name.endswith(".json"))
+        )
+    return result_files
+# Define a function to fetch and process data
+def fetch_and_display_data():
+    """Build the leaderboard table from the result files under ``BASE_DIR/data``.
+
+    Each ``<org>/<name>.json`` file is loaded with ``datasets.load_dataset`` and
+    the rows are stacked into one DataFrame. Files are read in sorted path
+    order so the row order does not depend on the filesystem listing order.
+
+    Returns:
+        A DataFrame whose first column is ``model`` (as an HTML link to the
+        model page on huggingface.co), whose second column is ``average`` (row
+        mean of all score columns, rounded to 2 decimals), followed by the
+        score columns rounded to 2 decimals. When no result files exist, an
+        empty DataFrame with just the ``model`` and ``average`` columns.
+
+    Raises:
+        FileNotFoundError: if ``BASE_DIR/data`` does not exist.
+        KeyError: if the loaded results have no ``model`` column.
+    """
+    data_dir = Path(BASE_DIR) / "data"
+    # Load each file on its own: files may not share the same set of columns,
+    # and concat fills the gaps with NaN.
+    frames = [
+        pd.DataFrame(load_dataset("json", data_files=str(path), split="train"))
+        for path in _list_result_files(data_dir)
+    ]
+    if not frames:
+        # Nothing has been evaluated yet: return an empty table instead of crashing.
+        return pd.DataFrame(columns=["model", "average"])
+    # One concat instead of re-copying the growing frame on every iteration;
+    # ignore_index avoids duplicate row labels.
+    df = pd.concat(frames, ignore_index=True)
+    # Remove the chat_template column; tolerate result files that lack it.
+    df = df.drop(columns=["chat_template"], errors="ignore")
+    # Every column except "model" is treated as a numeric score.
+    score_cols = [col for col in df.columns if col != "model"]
+    df[score_cols] = df[score_cols].round(2)
+    # NOTE: a model missing any score (NaN) gets a NaN average, as before.
+    df["average"] = np.mean(df[score_cols].values, axis=1).round(2)
+    # Turn each model id into a link to its Hub page.
+    df["model"] = df["model"].apply(
+        lambda name: model_hyperlink(f"https://huggingface.co/{name}", name)
+    )
+    # Final layout: model, average, then the per-subset scores.
+    return df.loc[:, ["model", "average", *score_cols]]
+# Markdown source for the page header: a short description of the win-rate
+# metric followed by a table summarising each evaluation subset.
+benchmark_text = """
+# HERM Results Viewer
+We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
+A win is when the score for the chosen response is higher than the score for the rejected response.
+### Subset summary
+| Subset                 | Num. Samples (Pre-filtering, post-filtering) | Description                                                       |
+| :--------------------- | :------------------------------------------: | :---------------------------------------------------------------- |
+| alpacaeval-easy        |                     805                     | Great model vs poor model                                         |
+| alpacaeval-length      |                     805                     | Good model vs low model, equal length                             |
+| alpacaeval-hard        |                     805                     | Great model vs baseline model                                     |
+| mt-bench-easy          |                  28, 28                    | MT Bench 10s vs 1s                                                |
+| mt-bench-medium        |                  45, 40                    | MT Bench 9s vs 2-5s                                               |
+| mt-bench-hard          |                  45, 37                    | MT Bench 7-8 vs 5-6                                               |
+| refusals-dangerous     |                     505                     | Dangerous response vs no response                                 |
+| refusals-offensive     |                     704                     | Offensive response vs no response                                 |
+| llmbar-natural         |                     100                     | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
+| llmbar-adver-neighbor  |                     134                     | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
+| llmbar-adver-GPTInst   |                     92                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
+| llmbar-adver-GPTOut    |                     47                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
+| llmbar-adver-manual    |                     46                      | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
+| XSTest                 |                     450                     | TODO curate                                                       |
+| (?) repetitiveness     |                                               |                                                                   |
+| (?) grammar            |                                               |                                                                   |
+For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
+"""
+# Build the table once at import time so the first render already has data.
+leaderboard_data = fetch_and_display_data()
+# Load data when app starts
+def load_data_on_start():
+    """Re-read the result files and return the table for the leaderboard component.
+
+    Used as a page-load event handler: the returned DataFrame becomes the new
+    value of the output table. (Calling ``output_table.update(...)`` and
+    discarding its result, as before, never changed what was displayed.)
+    """
+    return fetch_and_display_data()
+with gr.Blocks() as app:
+    with gr.Row():
+        gr.Markdown(benchmark_text)
+    with gr.Row():
+        output_table = gr.Dataframe(
+            leaderboard_data.values,
+            headers=leaderboard_data.columns.tolist(),
+            # The first column holds HTML links, so it must be rendered as
+            # markdown rather than plain text; the rest are numeric scores.
+            datatype=["markdown"] + ["number"] * (leaderboard_data.shape[1] - 1),
+        )
+    # Event listeners must be registered inside the Blocks context; this one
+    # refreshes the table each time the page is loaded.
+    app.load(load_data_on_start, inputs=None, outputs=output_table)
+app.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ pandas
2	+ datasets