Spaces:

allenai
/

reward-bench

Running

App Files Files Community

natolambert committed on Jan 21

Commit

e5d5995

•

1 Parent(s): 8e499f4

smol improvements

Browse files

Files changed (2) hide show

app.py +38 -21
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 import os
 from huggingface_hub import HfApi, snapshot_download
 from datasets import load_dataset
 from src.utils import load_all_data
 from src.md import ABOUT_TEXT
@@ -15,10 +16,8 @@ eval_set_repo = "ai2-rlhf-collab/rm-benchmark-dev"
 repo_dir_herm = "./evals/herm/"
 repo_dir_prefs = "./evals/prefs/"
-# def restart_space():
-#     api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)
 print("Pulling evaluation results")
 repo = snapshot_download(
@@ -43,17 +42,18 @@ def avg_over_herm(dataframe):
     """
     Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
     """
     subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]
     # for each subset, avg the columns that have the subset in the column name, then add a new column with subset name and avg
     for subset in subsets:
-        subset_cols = [col for col in dataframe.columns if subset in col]
-        dataframe[subset] = np.round(np.nanmean(dataframe[subset_cols].values, axis=1), 2)
     keep_columns = ["model", "average"] + subsets
-    dataframe = dataframe[keep_columns]
     # replace average column with new average
-    dataframe["average"] = np.round(np.nanmean(dataframe[subsets].values, axis=1), 2)
-    return dataframe
 def expand_subsets(dataframe):
     # TODO need to modify data/ script to do this
@@ -71,12 +71,23 @@ col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
 # for showing random samples
 eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
-def random_sample(r: gr.Request):
-    sample_index = np.random.randint(0, len(eval_set) - 1)
-    sample = eval_set[sample_index]
     markdown_text = '\n\n'.join([f"**{key}**: {value}" for key, value in sample.items()])
     return markdown_text
 with gr.Blocks() as app:
     # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
     with gr.Row():
@@ -114,23 +125,29 @@ with gr.Blocks() as app:
             with gr.Row():
                 # loads one sample
                 gr.Markdown("## Random Dataset Sample Viewer")
                 button = gr.Button("Show Random Sample")
             with gr.Row():
                 sample_display = gr.Markdown("{sampled data loads here}")
-            button.click(fn=random_sample, outputs=sample_display)
 # Load data when app starts, TODO make this used somewhere...
-def load_data_on_start():
-    data_herm = load_all_data(repo_dir_herm)
-    herm_table.update(data_herm)
-    data_herm_avg = avg_over_herm(repo_dir_herm)
-    herm_table.update(data_herm_avg)
-    data_prefs = load_all_data(repo_dir_prefs)
-    pref_sets_table.update(data_prefs)
-app.launch()

 import gradio as gr
 import os
 from huggingface_hub import HfApi, snapshot_download
+from apscheduler.schedulers.background import BackgroundScheduler
 from datasets import load_dataset
 from src.utils import load_all_data
 from src.md import ABOUT_TEXT
 repo_dir_herm = "./evals/herm/"
 repo_dir_prefs = "./evals/prefs/"
+def restart_space():
+    """
+    Restarts the "ai2-rlhf-collab/rm-benchmark-viewer" Space, authenticating with COLLAB_TOKEN.
+    NOTE(review): `api` is presumably an HfApi instance created elsewhere in this file -- confirm.
+    """
+    api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)
 print("Pulling evaluation results")
 repo = snapshot_download(
     """
     Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
     """
+    new_df = dataframe.copy()
     subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]
     # for each subset, avg the columns that have the subset in the column name, then add a new column with subset name and avg
     for subset in subsets:
+        subset_cols = [col for col in new_df.columns if subset in col]
+        new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
     keep_columns = ["model", "average"] + subsets
+    new_df = new_df[keep_columns]
     # replace average column with new average
+    new_df["average"] = np.round(np.nanmean(new_df[subsets].values, axis=1), 2)
+    return new_df
 def expand_subsets(dataframe):
     # TODO need to modify data/ script to do this
 # for showing random samples
 eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
+def random_sample(r: gr.Request, subset):
+    """
+    Returns one randomly chosen example from `eval_set`, formatted as markdown.
+
+    `subset` may be None or an empty list (sample from the whole set), a single
+    subset name, or a list of subset names to restrict sampling to.
+    Every key/value pair of the chosen example is rendered as "**key**: value",
+    with pairs separated by blank lines.
+    """
+    if subset is None or subset == []:
+        pool = eval_set
+    else: # filter by subsets (can be list)
+        if isinstance(subset, str):
+            subset = [subset]
+        # filter down dataset to only include the subset(s)
+        pool = eval_set.filter(lambda x: x["subset"] in subset)
+    # nothing to draw from: return a notice instead of letting randint raise
+    if len(pool) == 0:
+        return "No samples available for the selected subset(s)."
+    # np.random.randint excludes the upper bound, so pass len(pool) itself:
+    # `len(pool) - 1` never picks the last row and raises when only one row is left
+    sample_index = np.random.randint(0, len(pool))
+    sample = pool[sample_index]
     markdown_text = '\n\n'.join([f"**{key}**: {value}" for key, value in sample.items()])
     return markdown_text
+subsets = eval_set.unique("subset")
 with gr.Blocks() as app:
     # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
     with gr.Row():
             with gr.Row():
                 # loads one sample
                 gr.Markdown("## Random Dataset Sample Viewer")
+                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                 button = gr.Button("Show Random Sample")
             with gr.Row():
                 sample_display = gr.Markdown("{sampled data loads here}")
+            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
 # Load data when app starts, TODO make this used somewhere...
+# def load_data_on_start():
+#     data_herm = load_all_data(repo_dir_herm)
+#     herm_table.update(data_herm)
+#     data_herm_avg = avg_over_herm(repo_dir_herm)
+#     herm_table.update(data_herm_avg)
+#     data_prefs = load_all_data(repo_dir_prefs)
+#     pref_sets_table.update(data_prefs)
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
+scheduler.start()
+app.queue().launch()

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 pandas
 datasets

+APScheduler==3.10.1
 pandas
 datasets