Spaces:

allenai
/

reward-bench

Running

App Files Files Community

natolambert committed on Jan 18, 2024

Commit

9ceb843

1 Parent(s): b514443

update

Browse files

Files changed (4) hide show

.gitignore +2 -0
app.py +89 -105
src/md.py +28 -0
src/utils.py +60 -0

.gitignore CHANGED Viewed

	@@ -1 +1,3 @@
1	evals/

 evals/
+__pycache__/*
+*.pyc

app.py CHANGED Viewed

@@ -1,131 +1,115 @@
 import gradio as gr
-import pandas as pd
-from pathlib import Path
-from datasets import load_dataset
 import os
-from huggingface_hub import HfApi, Repository
 import numpy as np
 api = HfApi()
 COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
 evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
-BASE_DIR = "./evals/"
 # def restart_space():
 #     api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)
-# From Open LLM Leaderboard
-def model_hyperlink(link, model_name):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 print("Pulling evaluation results")
-repo = Repository(
-    local_dir=BASE_DIR,
-    clone_from=evals_repo,
-    use_auth_token=COLLAB_TOKEN,
     repo_type="dataset",
 )
-repo.git_pull()
-# Define a function to fetch and process data
-def fetch_and_display_data():    # use HF api to pull the git repo
-    dir = Path(BASE_DIR)
-    data_dir = dir / "data"
-    orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
-    # get all files within the sub folders orgs
-    models_results = []
-    for org in orgs:
-        org_dir = data_dir / org
-        files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))]
-        for file in files:
-            if file.endswith(".json"):
-                models_results.append(org + "/" + file)
-    # create empty dataframe to add all data to
-    df = pd.DataFrame()
-    # load all json data in the list models_results one by one to avoid not having the same entries
-    for model in models_results:
-        model_data = load_dataset("json", data_files=BASE_DIR + "data/" + model, split="train")
-        df2 = pd.DataFrame(model_data)
-        # add to df
-        df = pd.concat([df2, df])
-    # remove chat_template comlumn
-    df = df.drop(columns=["chat_template"])
-    # move column "model" to the front
-    cols = list(df.columns)
-    cols.insert(0, cols.pop(cols.index('model')))
-    df = df.loc[:, cols]
-    # select all columns except "model"
-    cols = df.columns.tolist()
-    cols.remove("model")
-    # round
-    df[cols] = df[cols].round(2)
-    avg = np.mean(df[cols].values,axis=1).round(2)
-    # add average column
-    df["average"] = avg
-    # apply model_hyperlink function to column "model"
-    df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))
-    # move average column to the second
-    cols = list(df.columns)
-    cols.insert(1, cols.pop(cols.index('average')))
-    df = df.loc[:, cols]
-    return df
-benchmark_text = """
-# HERM Results Viewer
-We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
-A win is when the score for the chosen response is higher than the score for the rejected response.
-### Subset summary
-| Subset                 | Num. Samples (Pre-filtering, post-filtering) | Description                                                       |
-| :--------------------- | :------------------------------------------: | :---------------------------------------------------------------- |
-| alpacaeval-easy        |                     805                     | Great model vs poor model                                         |
-| alpacaeval-length      |                     805                     | Good model vs low model, equal length                             |
-| alpacaeval-hard        |                     805                     | Great model vs baseline model                                     |
-| mt-bench-easy          |                  28, 28                    | MT Bench 10s vs 1s                                                |
-| mt-bench-medium        |                  45, 40                    | MT Bench 9s vs 2-5s                                               |
-| mt-bench-hard          |                  45, 37                    | MT Bench 7-8 vs 5-6                                               |
-| refusals-dangerous     |                     505                     | Dangerous response vs no response                                 |
-| refusals-offensive     |                     704                     | Offensive response vs no response                                 |
-| llmbar-natural         |                     100                     | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
-| llmbar-adver-neighbor  |                     134                     | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
-| llmbar-adver-GPTInst   |                     92                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
-| llmbar-adver-GPTOut    |                     47                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
-| llmbar-adver-manual    |                     46                      | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
-| XSTest                 |                     450                     | TODO curate                                                       |
-| (?) repetitiveness     |                                               |                                                                   |
-| (?) grammar            |                                               |                                                                   |
-For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
-"""
-leaderboard_data = fetch_and_display_data()
-col_types = ["markdown"] + ["number"] * (len(leaderboard_data.columns) - 1)
 with gr.Blocks() as app:
     with gr.Row():
-        gr.Markdown(benchmark_text)
-    with gr.Row():
-        output_table = gr.Dataframe(
-            leaderboard_data.values,
-            datatype=col_types,
-            headers=leaderboard_data.columns.tolist(),
-            elem_id="leaderboard_dataframe",
-        )
 # Load data when app starts
 def load_data_on_start():
-    data = fetch_and_display_data()
-    output_table.update(data)
 app.launch()

 import gradio as gr
 import os
+from huggingface_hub import HfApi, snapshot_download
+from src.utils import load_all_data
+from src.md import ABOUT_TEXT
 import numpy as np
 api = HfApi()
 COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
 evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
+prefs_repo = "ai2-rlhf-collab/rm-testset-results"
+# Local directories the two results datasets are downloaded into.
+repo_dir_herm = "./evals/herm/"
+repo_dir_prefs = "./evals/prefs/"
 # def restart_space():
 #     api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)
 print("Pulling evaluation results")
+# Both downloads pass COLLAB_TOKEN so the two results repos are fetched with
+# the same credentials.
+repo = snapshot_download(
+    local_dir=repo_dir_herm,
+    repo_id=evals_repo,
+    use_auth_token=COLLAB_TOKEN,
+    tqdm_class=None,
+    etag_timeout=30,
     repo_type="dataset",
 )
+repo_pref_sets = snapshot_download(
+    local_dir=repo_dir_prefs,
+    repo_id=prefs_repo,
+    use_auth_token=COLLAB_TOKEN,
+    tqdm_class=None,
+    etag_timeout=30,
+    repo_type="dataset",
+)
+def avg_over_herm(dataframe):
+    """
+    Collapse the per-subset score columns into one averaged column per benchmark family.
+
+    The families are alpacaeval, mt-bench, llmbar, refusals and hep. A column belongs to a
+    family when the family name occurs in the column name. NaN scores are ignored.
+
+    Args:
+        dataframe: table with a "model" column plus per-subset score columns. It is not modified.
+
+    Returns:
+        A new dataframe with the columns "model", "average" and one column per family, where
+        "average" is the mean over the family columns. All values are rounded to 2 decimals.
+    """
+    subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]
+    # Build the result in a fresh frame: the caller keeps using `dataframe` for the detailed
+    # table, so the aggregate columns must not be added to it.
+    summary = dataframe[["model"]].copy()
+    for subset in subsets:
+        subset_cols = [col for col in dataframe.columns if subset in col]
+        if subset_cols:
+            summary[subset] = np.round(np.nanmean(dataframe[subset_cols].values, axis=1), 2)
+        else:
+            # no column for this family: keep the column, but do not average an empty selection
+            summary[subset] = np.nan
+    # the overall average is taken over the family averages, not over the raw subset columns
+    summary.insert(1, "average", np.round(np.nanmean(summary[subsets].values, axis=1), 2))
+    return summary
+def expand_subsets(dataframe):
+    """Placeholder that does nothing yet and returns None.
+
+    TODO need to modify data/ script to do this
+    """
+    return None
+# Every leaderboard table is ranked best-first on its "average" column.
+_rank_by_average = dict(by='average', ascending=False)
+def _column_types(frame):
+    """Return "markdown" for the first column and "number" for each remaining column of `frame`."""
+    return ["markdown"] + ["number"] * (frame.shape[1] - 1)
+herm_data = load_all_data(repo_dir_herm).sort_values(**_rank_by_average)
+herm_data_avg = avg_over_herm(herm_data).sort_values(**_rank_by_average)
+prefs_data = load_all_data(repo_dir_prefs).sort_values(**_rank_by_average)
+# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
+# Column types are derived only after all of the frames above have been built.
+col_types_herm = _column_types(herm_data)
+col_types_herm_avg = _column_types(herm_data_avg)
+col_types_prefs = _column_types(prefs_data)
+# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
 with gr.Blocks() as app:
+    # One tab per table (HERM overview, HERM detailed, preference sets) plus an "About" tab.
     with gr.Row():
+        gr.Markdown("# HERM Results Viewer")
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("HERM - Overview"):
+            with gr.Row():
+                # Separate name from the detailed table so both handles stay reachable.
+                herm_table_avg = gr.Dataframe(
+                    herm_data_avg.values,
+                    datatype=col_types_herm_avg,
+                    headers=herm_data_avg.columns.tolist(),
+                    elem_id="herm_dataframe_avg",
+                )
+        with gr.TabItem("HERM - Detailed"):
+            with gr.Row():
+                herm_table = gr.Dataframe(
+                    herm_data.values,
+                    datatype=col_types_herm,
+                    headers=herm_data.columns.tolist(),
+                    elem_id="herm_dataframe",
+                )
+        with gr.TabItem("Pref Sets - Overview"):
+            with gr.Row():
+                pref_sets_table = gr.Dataframe(
+                    prefs_data.values,
+                    datatype=col_types_prefs,
+                    headers=prefs_data.columns.tolist(),
+                    elem_id="prefs_dataframe",
+                )
+        with gr.TabItem("About"):
+            with gr.Row():
+                gr.Markdown(ABOUT_TEXT)
 # Load data when app starts
 def load_data_on_start():
+    """
+    Reload all three tables from the local result directories and push them to the UI tables.
+
+    The frames are rebuilt and sorted the same way as the ones the tables are created with.
+    NOTE(review): nothing in the code shown here calls or registers this function - confirm
+    how it is meant to be triggered.
+    """
+    data_herm = load_all_data(repo_dir_herm).sort_values(by='average', ascending=False)
+    # avg_over_herm takes the loaded dataframe, not the directory path
+    data_herm_avg = avg_over_herm(data_herm).sort_values(by='average', ascending=False)
+    data_prefs = load_all_data(repo_dir_prefs).sort_values(by='average', ascending=False)
+    herm_table_avg.update(data_herm_avg)
+    herm_table.update(data_herm)
+    pref_sets_table.update(data_prefs)
 app.launch()

src/md.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Markdown text describing how the scores are computed and which evaluation subsets exist.
+ABOUT_TEXT = """
+We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
+A win is when the score for the chosen response is higher than the score for the rejected response.
+### Subset summary
+| Subset                 | Num. Samples (Pre-filtering, post-filtering) | Description                                                       |
+| :--------------------- | :------------------------------------------: | :---------------------------------------------------------------- |
+| alpacaeval-easy        |                     805                     | Great model vs poor model                                         |
+| alpacaeval-length      |                     805                     | Good model vs low model, equal length                             |
+| alpacaeval-hard        |                     805                     | Great model vs baseline model                                     |
+| mt-bench-easy          |                  28, 28                    | MT Bench 10s vs 1s                                                |
+| mt-bench-medium        |                  45, 40                    | MT Bench 9s vs 2-5s                                               |
+| mt-bench-hard          |                  45, 37                    | MT Bench 7-8 vs 5-6                                               |
+| refusals-dangerous     |                     505                     | Dangerous response vs no response                                 |
+| refusals-offensive     |                     704                     | Offensive response vs no response                                 |
+| llmbar-natural         |                     100                     | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
+| llmbar-adver-neighbor  |                     134                     | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
+| llmbar-adver-GPTInst   |                     92                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
+| llmbar-adver-GPTOut    |                     47                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
+| llmbar-adver-manual    |                     46                      | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
+| XSTest                 |                     450                     | TODO curate                                                       |
+| (?) repetitiveness     |                                               |                                                                   |
+| (?) grammar            |                                               |                                                                   |
+For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
+"""

src/utils.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import pandas as pd
+from pathlib import Path
+from datasets import load_dataset
+import numpy as np
+import os
+# From Open LLM Leaderboard
+def model_hyperlink(link, model_name):
+    """Return an HTML anchor that opens `link` in a new tab and displays `model_name`."""
+    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
+    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'
+# Define a function to fetch and process data
+def load_all_data(data_repo, subsubsets=False):
+    """
+    Build a leaderboard table from the per-model JSON result files of a local results checkout.
+
+    Reads every `<data_repo>/data/<org>/<file>.json` file, one at a time, so result files
+    with different sets of score columns can be combined.
+
+    Args:
+        data_repo: path of the local results directory, with or without a trailing slash.
+        subsubsets: currently unused.
+
+    Returns:
+        A dataframe with "model" (an HTML link to the model's Hugging Face page) as first
+        column, "average" (NaN-ignoring mean of the score columns) as second column, and the
+        score columns rounded to 2 decimals. It has only the two leading columns and no rows
+        when no result file is found.
+
+    Raises:
+        FileNotFoundError: if `<data_repo>/data` is not a directory.
+    """
+    data_dir = Path(data_repo) / "data"
+    if not data_dir.is_dir():
+        raise FileNotFoundError(f"No results directory at {data_dir}")
+    # sorted() keeps the scan order reproducible across file systems
+    result_files = sorted(path for path in data_dir.glob("*/*.json") if path.is_file())
+    if not result_files:
+        return pd.DataFrame(columns=["model", "average"])
+    # load the files one by one, then concatenate once instead of growing a frame in the loop
+    frames = [
+        pd.DataFrame(load_dataset("json", data_files=str(path), split="train"))
+        for path in result_files
+    ]
+    df = pd.concat(frames, ignore_index=True)
+    # remove the chat_template column; tolerate result files that do not have it
+    df = df.drop(columns=["chat_template"], errors="ignore")
+    # every column except "model" is treated as a score
+    score_cols = [col for col in df.columns if col != "model"]
+    df[score_cols] = df[score_cols].round(2)
+    df["average"] = np.nanmean(df[score_cols].values, axis=1).round(2)
+    # turn the model id into a link to its Hugging Face page
+    df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))
+    # "model" first, "average" second, scores in their original order
+    return df[["model", "average"] + score_cols]