import gradio as gr
import os
from huggingface_hub import HfApi, snapshot_download
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import load_dataset
from src.utils import load_all_data
from src.md import ABOUT_TEXT, TOP_TEXT
from src.plt import plot_avg_correlation
from src.constants import subset_mapping, length_categories, example_counts
import numpy as np

api = HfApi()

COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
evals_repo = "allenai/reward-bench-results"

eval_set_repo = "allenai/reward-bench"
repo_dir_rewardbench = "./evals/rewardbench/"

def restart_space():
    api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)

print("Pulling evaluation results")
repo = snapshot_download(
    local_dir=repo_dir_rewardbench,
    ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
    repo_id=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)

def avg_over_rewardbench(dataframe_core, dataframe_prefs):
    """
    Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns a dataframe with only these columns.

    We average over 4 core sections (per-prompt weighting):
    1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
    2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
    3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
    4. Code: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
    """
    new_df = dataframe_core.copy()
    dataframe_prefs = dataframe_prefs.copy()

    # for the main subsets (keys in subset_mapping), take the weighted avg by example_counts and store for the models
    for subset, sub_subsets in subset_mapping.items():
        subset_cols = [col for col in new_df.columns if col in sub_subsets]
        sub_data = new_df[subset_cols].values  # take the relevant column values
        sub_counts = [example_counts[s] for s in sub_subsets]  # take the example counts
        new_df[subset] = np.round(np.average(sub_data, axis=1, weights=sub_counts), 2)  # take the weighted average
        # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)

    data_cols = list(subset_mapping.keys())
    keep_columns = ["model", "model_type"] + data_cols
    # keep_columns = ["model", "average"] + subsets
    new_df = new_df[keep_columns]

    # selected average from pref_sets
    pref_columns = ["anthropic_helpful", "mtbench_gpt4", "shp", "summarize"]
    pref_data = dataframe_prefs[pref_columns].values

    # add column test sets knowing the rows are not identical, take superset
    dataframe_prefs["Test Sets"] = np.round(np.nanmean(pref_data, axis=1), 2)

    # add empty column Test Sets to new_df
    new_df["Test Sets"] = np.nan
    # per row in new_df, if the model is in dataframe_prefs, add its value to new_df["Test Sets"]
    values = []
    for i, row in new_df.iterrows():
        model = row["model"]
        if model in dataframe_prefs["model"].values:
            values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Test Sets"].values[0])
            # new_df.at[i, "Test Sets"] = dataframe_prefs[dataframe_prefs["model"] == model]["Test Sets"].values[0]
        else:
            values.append(np.nan)
    new_df["Test Sets"] = values

    # add total average
    data_cols += ["Test Sets"]
    new_df["average"] = np.round(np.nanmean(new_df[data_cols].values, axis=1), 2)

    # make average the third column
    keep_columns = ["model", "model_type", "average"] + data_cols
    new_df = new_df[keep_columns]
    return new_df
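# Illustrative sketch of the per-prompt weighting above (made-up numbers, not real results):
# a section with two subsets scored 90.0 and 60.0 and example_counts of 100 and 50 prompts
# averages to np.average([90.0, 60.0], weights=[100, 50]) == 80.0, i.e. each prompt counts
# equally rather than each subset.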
def expand_subsets(dataframe):
    # TODO need to modify data/ script to do this
    pass

def length_bias_check(dataframe):
    """
    Takes the raw rewardbench dataframe and splits the data into new buckets according to length_categories,
    then averages each bucket per model.
    """
    new_df = dataframe.copy()
    existing_subsets = new_df.columns[3:]  # model, model_type, average
    final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
    # new data is an empty list dict for each final subset
    new_data = {s: [] for s in final_subsets}

    # now, subsets correspond to those with True, Neutral, and False length bias
    # check if length_categories[subset] == "True" or "False" or "Neutral"
    for subset in existing_subsets:
        subset_data = new_df[subset].values
        subset_length = length_categories[subset]
        # route to the correct bucket
        if subset_length == "True":
            new_data["Length Bias"].append(subset_data)
        elif subset_length == "Neutral":
            new_data["Neutral"].append(subset_data)
        elif subset_length == "False":
            new_data["Terse Bias"].append(subset_data)

    # take average of new_data and add to new_df (removing columns other than model)
    for subset in final_subsets:
        new_df[subset] = np.round(np.nanmean(new_data[subset], axis=0), 2)
    keep_columns = ["model"] + final_subsets
    new_df = new_df[keep_columns]
    # recompute average
    # new_df["average"] = np.round(np.nanmean(new_df[final_subsets].values, axis=1), 2)

    return new_df

rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)

rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)

col_types_rewardbench = ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
col_types_rewardbench_avg = ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)

# for showing random samples
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")

def random_sample(r: gr.Request, subset):
    if subset is None or subset == []:
        sample_index = np.random.randint(0, len(eval_set))
        sample = eval_set[sample_index]
    else:
        # filter by subsets (can be a list)
        if isinstance(subset, str):
            subset = [subset]
        # filter down the dataset to only include the subset(s)
        eval_set_filtered = eval_set.filter(lambda x: x["subset"] in subset)
        sample_index = np.random.randint(0, len(eval_set_filtered))
        sample = eval_set_filtered[sample_index]

    markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
    return markdown_text

subsets = eval_set.unique("subset")

def regex_table(dataframe, regex, filter_button):
    """
    Takes a comma-delimited list of model-name regexes and returns only the rows whose model matches,
    subject to the selected model-type filters.
    """
    # Split regex statement by comma and trim whitespace around regexes
    regex_list = [x.strip() for x in regex.split(",")]
    # Join the list into a single regex pattern with '|' acting as OR
    combined_regex = '|'.join(regex_list)

    # if filter_button, remove all rows with "ai2" in the model name
    if isinstance(filter_button, list) or isinstance(filter_button, str):
        if "AI2 Experiments" not in filter_button and ("ai2" not in regex):
            dataframe = dataframe[~dataframe["model"].str.contains("ai2", case=False, na=False)]
        if "Seq. Classifiers" not in filter_button:
            dataframe = dataframe[~dataframe["model_type"].str.contains("Seq. Classifier", case=False, na=False)]
        if "DPO" not in filter_button:
            dataframe = dataframe[~dataframe["model_type"].str.contains("DPO", case=False, na=False)]
        if "Custom Classifiers" not in filter_button:
            dataframe = dataframe[~dataframe["model_type"].str.contains("Custom Classifier", case=False, na=False)]
    # Filter the dataframe such that 'model' contains any of the regex patterns
    return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]

with gr.Blocks() as app:
    # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
    with gr.Row():
        with gr.Column(scale=2.2):
            # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
            # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
            # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
            gr.Markdown("""
![](file/src/logo.png)
""")
        with gr.Column(scale=3):
            gr.Markdown(TOP_TEXT)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 RewardBench Leaderboard"):
            with gr.Row():
                search_1 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
                model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                 label="Model Types",
                                                 # info="Which model types to include.",
                                                 )
            with gr.Row():
                # reference data
                rewardbench_table_hidden = gr.Dataframe(
                    rewardbench_data_avg.values,
                    datatype=col_types_rewardbench_avg,
                    headers=rewardbench_data_avg.columns.tolist(),
                    visible=False,
                )
                rewardbench_table = gr.Dataframe(
                    regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
                    datatype=col_types_rewardbench_avg,
                    headers=rewardbench_data_avg.columns.tolist(),
                    elem_id="rewardbench_dataframe_avg",
                    height=1000,
                )
        with gr.TabItem("🔍 RewardBench - Detailed"):
            with gr.Row():
                search_2 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
                model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                 label="Model Types",
                                                 # info="Which model types to include."
                                                 )
            with gr.Row():
                # ref data
                rewardbench_table_detailed_hidden = gr.Dataframe(
                    rewardbench_data.values,
                    datatype=col_types_rewardbench,
                    headers=rewardbench_data.columns.tolist(),
                    visible=False,
                )
                rewardbench_table_detailed = gr.Dataframe(
                    regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
                    datatype=col_types_rewardbench,
                    headers=rewardbench_data.columns.tolist(),
                    elem_id="rewardbench_dataframe",
                    height=1000,
                )
        # with gr.TabItem("rewardbench Eval Set - Length Bias"):
        #     with gr.Row():
        #         # backup
        #         rewardbench_table_len_hidden = gr.Dataframe(
        #             rewardbench_data_length.values,
        #             datatype=cols_rewardbench_data_length,
        #             headers=rewardbench_data_length.columns.tolist(),
        #             visible=False,
        #         )
        #         rewardbench_table_len = gr.Dataframe(
        #             regex_table(rewardbench_data_length.copy(), "", False).values,
        #             datatype=cols_rewardbench_data_length,
        #             headers=rewardbench_data_length.columns.tolist(),
        #             elem_id="rewardbench_dataframe_length",
        #             height=1000,
        #         )
        with gr.TabItem("Existing Test Sets"):
            with gr.Row():
                search_3 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
                model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                 label="Model Types",
                                                 # info="Which model types to include.",
                                                 )
            with gr.Row():
                PREF_SET_TEXT = """
For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
"""
                gr.Markdown(PREF_SET_TEXT)
            with gr.Row():
                # backup
                pref_sets_table_hidden = gr.Dataframe(
                    prefs_data.values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    visible=False,
                )
                pref_sets_table = gr.Dataframe(
                    regex_table(prefs_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    elem_id="prefs_dataframe",
                    height=1000,
                )

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                # loads one sample
                gr.Markdown("## Random Dataset Sample Viewer")
                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                button = gr.Button("Show Random Sample")

            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")

            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])

        # removed plot because not pretty enough
        # with gr.TabItem("Model Correlation"):
        #     with gr.Row():
        #         plot = plot_avg_correlation(rewardbench_data_avg, prefs_data)
        #         gr.Plot(plot)

    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
    # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
    search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)

    model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
    model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
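    # The hidden gr.Dataframe components above hold the unfiltered tables; each search or
    # model-type change passes them to regex_table as pandas DataFrames (the component's
    # default type) and writes the filtered result to the visible table. Hypothetical query:
    # searching "llama, mistral" with only "DPO" checked keeps DPO-type rows whose model
    # name matches "llama" or "mistral" (comma-delimited patterns are OR-joined).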
    # Load data when app starts, TODO make this used somewhere...
    # def load_data_on_start():
    #     data_rewardbench = load_all_data(repo_dir_rewardbench)
    #     rewardbench_table.update(data_rewardbench)

    #     data_rewardbench_avg = avg_over_rewardbench(repo_dir_rewardbench)
    #     rewardbench_table.update(data_rewardbench_avg)

    #     data_prefs = load_all_data(repo_dir_prefs)
    #     pref_sets_table.update(data_prefs)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=10800)  # restart every 3 hours
scheduler.start()
app.launch()  # previously used .queue() before launch; not sure whether that's necessary
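# Note: restart_space only works if COLLAB_TOKEN has write access to the allenai/reward-bench
# Space; the periodic restart is what refreshes the leaderboard, since snapshot_download and
# load_dataset above run once at startup.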