Spaces:

allenai
/

reward-bench

Running

App Files Files Community

reward-bench / app.py

natolambert

major improvements

31bff5a 6 months ago

raw

history blame

No virus

17 kB

	import gradio as gr
	import os
	from huggingface_hub import HfApi, snapshot_download
	from apscheduler.schedulers.background import BackgroundScheduler
	from datasets import load_dataset
	from src.utils import load_all_data
	from src.md import ABOUT_TEXT, TOP_TEXT
	from src.plt import plot_avg_correlation
	from src.constants import subset_mapping, length_categories, example_counts
	import numpy as np

	# Hub client shared by the whole module (used to restart the Space).
	api = HfApi()

	# Access token read from the environment; None when the variable is unset.
	COLLAB_TOKEN = os.getenv("COLLAB_TOKEN")

	# Dataset repo holding the evaluation results, and the evaluation set itself.
	evals_repo = "allenai/reward-bench-results"
	eval_set_repo = "allenai/reward-bench"

	# Local directory the results repo is downloaded into.
	repo_dir_rewardbench = "./evals/rewardbench/"

	def restart_space():
	    """Ask the Hub to restart the ``allenai/reward-bench`` Space.

	    Uses the module-level ``api`` client and authenticates with
	    ``COLLAB_TOKEN``. Returns nothing.
	    """
	    space_id = "allenai/reward-bench"
	    api.restart_space(repo_id=space_id, token=COLLAB_TOKEN)

	print("Pulling evaluation results")
	# Download a snapshot of the results dataset into the local results
	# directory, skipping the two per-example score folders.
	# NOTE(review): `use_auth_token` is deprecated in recent huggingface_hub
	# releases in favour of `token` -- confirm against the pinned version.
	repo = snapshot_download(
	    repo_id=evals_repo,
	    repo_type="dataset",
	    local_dir=repo_dir_rewardbench,
	    ignore_patterns=["pref-sets-scores/", "eval-set-scores/"],
	    use_auth_token=COLLAB_TOKEN,
	    etag_timeout=30,
	    tqdm_class=None,
	)


	def avg_over_rewardbench(dataframe_core, dataframe_prefs, subsets_by_section=None, counts_by_subset=None):
	    """
	    Collapse the per-subset RewardBench scores into one score per section.

	    Each section score is the average of its subset columns weighted by the
	    number of examples in each subset (per-prompt weighting). A "Test Sets"
	    column (mean of selected preference-set scores) is attached per model,
	    and "average" is the NaN-ignoring mean of all section columns plus
	    "Test Sets".

	    Args:
	        dataframe_core: frame with "model", "model_type" and one column per
	            subset.
	        dataframe_prefs: frame with "model" and the columns
	            "anthropic_helpful", "mtbench_gpt4", "shp", "summarize".
	        subsets_by_section: section name -> subset names. Defaults to the
	            module-level ``subset_mapping``.
	        counts_by_subset: subset name -> example count. Defaults to the
	            module-level ``example_counts``.

	    Returns:
	        A new frame with columns
	        ["model", "model_type", "average", <sections...>, "Test Sets"],
	        rows in the same order as ``dataframe_core``.
	    """
	    if subsets_by_section is None:
	        subsets_by_section = subset_mapping
	    if counts_by_subset is None:
	        counts_by_subset = example_counts

	    new_df = dataframe_core.copy()
	    dataframe_prefs = dataframe_prefs.copy()

	    for subset, sub_subsets in subsets_by_section.items():
	        # Columns come back in dataframe order and may be a strict subset of
	        # sub_subsets, so the weights must be looked up from the same list.
	        subset_cols = [col for col in new_df.columns if col in sub_subsets]
	        if not subset_cols:
	            # Nothing measured for this section: np.average would fail on
	            # empty weights, so record the section as missing instead.
	            new_df[subset] = np.nan
	            continue
	        sub_data = new_df[subset_cols].values
	        sub_counts = [counts_by_subset[s] for s in subset_cols]
	        new_df[subset] = np.round(np.average(sub_data, axis=1, weights=sub_counts), 2)

	    data_cols = list(subsets_by_section.keys())
	    # .copy() so the column assignments below act on an independent frame.
	    new_df = new_df[["model", "model_type"] + data_cols].copy()

	    # selected average from pref_sets
	    pref_columns = ["anthropic_helpful", "mtbench_gpt4", "shp", "summarize"]
	    pref_data = dataframe_prefs[pref_columns].values
	    dataframe_prefs["Test Sets"] = np.round(np.nanmean(pref_data, axis=1), 2)

	    # The two frames do not list the same models; look each core model up by
	    # name (first occurrence wins) and leave NaN where there is no match.
	    pref_scores = dataframe_prefs.drop_duplicates(subset="model").set_index("model")["Test Sets"]
	    new_df["Test Sets"] = new_df["model"].map(pref_scores)

	    # add total average over the sections and the test sets
	    score_cols = data_cols + ["Test Sets"]
	    new_df["average"] = np.round(np.nanmean(new_df[score_cols].values, axis=1), 2)

	    # make average the third column
	    return new_df[["model", "model_type", "average"] + score_cols]

	def expand_subsets(dataframe):
	    """Placeholder for expanding subsets; currently does nothing.

	    The argument is ignored and ``None`` is returned.
	    """
	    # TODO need to modify data/ script to do this
	    return None


	def length_bias_check(dataframe):
	    """
	    Regroup the raw rewardbench columns into three length-bias buckets.

	    Every column after the first three (model, model_type, average) is
	    looked up in ``length_categories``; "True" goes to "Length Bias",
	    "Neutral" to "Neutral" and "False" to "Terse Bias". Any other label is
	    left out. Each bucket becomes the NaN-ignoring mean of its columns,
	    rounded to two decimals.

	    Returns a frame with the columns "model", "Length Bias", "Neutral" and
	    "Terse Bias".
	    """
	    final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
	    bucket_for_label = {
	        "True": "Length Bias",
	        "Neutral": "Neutral",
	        "False": "Terse Bias",
	    }
	    grouped = {bucket: [] for bucket in final_subsets}

	    new_df = dataframe.copy()
	    for column in new_df.columns[3:]:  # skip model, model_type, average
	        scores = new_df[column].values
	        bucket = bucket_for_label.get(length_categories[column])
	        if bucket is not None:
	            grouped[bucket].append(scores)

	    # average the stacked columns of each bucket, one value per model
	    for bucket in final_subsets:
	        new_df[bucket] = np.round(np.nanmean(grouped[bucket], axis=0), 2)

	    # drop everything except the model name and the three buckets
	    return new_df[["model"] + final_subsets]



	# Leaderboard tables, each sorted so the best entry comes first.
	rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
	rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
	prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
	# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)

	rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)

	# One gradio datatype per column. Both rewardbench tables start with
	# "model" (markdown) and "model_type" (str), so the numeric remainder is
	# len(columns) - 2; the previous "- 1" made these lists one entry too long.
	col_types_rewardbench = ["markdown", "str"] + ["number"] * (len(rewardbench_data.columns) - 2)
	col_types_rewardbench_avg = ["markdown", "str"] + ["number"] * (len(rewardbench_data_avg.columns) - 2)
	# The length-bias table only keeps "model" plus the three numeric buckets.
	cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
	# NOTE(review): regex_table filters this table on "model_type", so
	# prefs_data presumably has that text column too, yet it is typed as
	# "number" here -- confirm the column order before changing it.
	col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
	# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)

	# for showing random samples
	eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
	def random_sample(r: gr.Request, subset):
	    """Return one random example from ``eval_set`` formatted as markdown text.

	    Args:
	        r: request object (annotated as ``gr.Request``); not read here.
	        subset: ``None`` or an empty selection to draw from the whole
	            evaluation set, otherwise a subset name or a list of names to
	            restrict the draw to.

	    Returns:
	        A string with one "key:" / value entry per field of the sampled row,
	        separated by blank lines, or a short notice when no row matches.
	    """
	    if not subset:
	        candidates = eval_set
	    else:
	        if isinstance(subset, str):
	            subset = [subset]
	        # filter down dataset to only include the subset(s)
	        candidates = eval_set.filter(lambda x: x["subset"] in subset)

	    if len(candidates) == 0:
	        return "No samples found for the selected subset(s)."

	    # The upper bound of randint is exclusive: passing len(candidates)
	    # makes every row (including the last) eligible and also works when
	    # there is exactly one row.
	    sample_index = int(np.random.randint(0, len(candidates)))
	    sample = candidates[sample_index]

	    markdown_text = '\n\n'.join([f"{key}:\n\n{value}" for key, value in sample.items()])
	    return markdown_text

	# Distinct values of the "subset" column; offered as the choices of the
	# Dataset Viewer dropdown.
	subsets = eval_set.unique("subset")

	def regex_table(dataframe, regex, filter_button):
	    """
	    Filter a leaderboard table by model type and by a model-name search.

	    Args:
	        dataframe: frame with a "model" column (and "model_type" when
	            ``filter_button`` is a list or string).
	        regex: comma-separated search terms. Each term is a regular
	            expression matched case-insensitively against "model"; a row is
	            kept when ANY term matches. Blank terms are ignored and an empty
	            search keeps every row. A term that is not a valid regex is
	            matched literally instead of raising.
	        filter_button: the selected model-type labels. Types whose label is
	            absent are removed; rows whose model name contains "ai2" are
	            removed unless "AI2 Experiments" is selected or the search
	            itself mentions "ai2". Any other value type disables these
	            filters.

	    Returns:
	        The filtered frame (rows whose "model" is missing are dropped).
	    """
	    import re  # local import keeps this block self-contained

	    regex = regex or ""

	    if isinstance(filter_button, (list, str)):
	        if "AI2 Experiments" not in filter_button and ("ai2" not in regex):
	            dataframe = dataframe[~dataframe["model"].str.contains("ai2", case=False, na=False)]
	        # (checkbox label, text looked for in the model_type column)
	        type_filters = (
	            ("Seq. Classifiers", "Seq. Classifier"),
	            ("DPO", "DPO"),
	            ("Custom Classifiers", "Custom Classifier"),
	        )
	        for label, marker in type_filters:
	            if label not in filter_button:
	                dataframe = dataframe[~dataframe["model_type"].str.contains(marker, case=False, na=False)]

	    # Split on commas, trim, and drop blanks so "a," does not add an empty
	    # alternative that would match every row.
	    terms = []
	    for term in (piece.strip() for piece in regex.split(",")):
	        if not term:
	            continue
	        try:
	            re.compile(term)
	        except re.error:
	            # Half-typed or invalid pattern: search for the text as written.
	            term = re.escape(term)
	        terms.append(term)

	    # In Python regex syntax a bare "|" is alternation; "\|" would match a
	    # literal pipe character.
	    combined_regex = "|".join(terms)

	    # Filter the dataframe such that 'model' contains any of the regex patterns
	    return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]


	# Gradio UI. Layout: a header row (logo + intro text), then tabs for the
	# averaged leaderboard, the detailed per-subset table, the existing
	# preference test sets, the about text and a random dataset sample viewer.
	# Each leaderboard tab holds a hidden table with the full data and a
	# visible table; the search box and model-type checkboxes re-run
	# regex_table on the hidden copy and write the result to the visible one.
	with gr.Blocks() as app:
	    # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
	    with gr.Row():
	        with gr.Column(scale=2.2):
	            # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
	            # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
	            # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
	            gr.Markdown("""
	![](file/src/logo.png)
	""")
	        with gr.Column(scale=3):
	            gr.Markdown(TOP_TEXT)
	    with gr.Tabs(elem_classes="tab-buttons") as tabs:
	        with gr.TabItem("🏆 RewardBench Leaderboard"):
	            with gr.Row():
	                search_1 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
	                # "AI2 Experiments" is offered but not ticked by default.
	                model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
	                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
	                                                 label="Model Types",
	                                                 # info="Which model types to include.",
	                                                 )
	            with gr.Row():
	                # reference data: unfiltered copy that the callbacks read from
	                rewardbench_table_hidden = gr.Dataframe(
	                    rewardbench_data_avg.values,
	                    datatype=col_types_rewardbench_avg,
	                    headers=rewardbench_data_avg.columns.tolist(),
	                    visible=False,
	                )
	                # visible table, pre-filtered with the default checkbox selection
	                rewardbench_table = gr.Dataframe(
	                    regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
	                    datatype=col_types_rewardbench_avg,
	                    headers=rewardbench_data_avg.columns.tolist(),
	                    elem_id="rewardbench_dataframe_avg",
	                    height=1000,
	                )

	        with gr.TabItem("🔍 RewardBench - Detailed"):
	            with gr.Row():
	                search_2 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
	                model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
	                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
	                                                 label="Model Types",
	                                                 # info="Which model types to include."
	                                                 )
	            with gr.Row():
	                # ref data
	                rewardbench_table_detailed_hidden = gr.Dataframe(
	                    rewardbench_data.values,
	                    datatype=col_types_rewardbench,
	                    headers=rewardbench_data.columns.tolist(),
	                    visible=False,
	                )
	                rewardbench_table_detailed = gr.Dataframe(
	                    regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
	                    datatype=col_types_rewardbench,
	                    headers=rewardbench_data.columns.tolist(),
	                    elem_id="rewardbench_dataframe",
	                    height=1000,
	                )
	        # with gr.TabItem("rewardbench Eval Set - Length Bias"):
	        # with gr.Row():
	        # # backup
	        # rewardbench_table_len_hidden = gr.Dataframe(
	        # rewardbench_data_length.values,
	        # datatype=cols_rewardbench_data_length,
	        # headers=rewardbench_data_length.columns.tolist(),
	        # visible=False,
	        # )
	        # rewardbench_table_len = gr.Dataframe(
	        # regex_table(rewardbench_data_length.copy(), "", False).values,
	        # datatype=cols_rewardbench_data_length,
	        # headers=rewardbench_data_length.columns.tolist(),
	        # elem_id="rewardbench_dataframe_length",
	        # height=1000,
	        # )
	        with gr.TabItem("Existing Test Sets"):
	            with gr.Row():
	                search_3 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
	                model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
	                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
	                                                 label="Model Types",
	                                                 # info="Which model types to include.",
	                                                 )
	            with gr.Row():
	                PREF_SET_TEXT = """
	For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
	"""
	                gr.Markdown(PREF_SET_TEXT)
	            with gr.Row():
	                # backup
	                pref_sets_table_hidden = gr.Dataframe(
	                    prefs_data.values,
	                    datatype=col_types_prefs,
	                    headers=prefs_data.columns.tolist(),
	                    visible=False,
	                )
	                pref_sets_table = gr.Dataframe(
	                    regex_table(prefs_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
	                    datatype=col_types_prefs,
	                    headers=prefs_data.columns.tolist(),
	                    elem_id="prefs_dataframe",
	                    height=1000,
	                )


	        with gr.TabItem("About"):
	            with gr.Row():
	                gr.Markdown(ABOUT_TEXT)

	        with gr.TabItem("Dataset Viewer"):
	            with gr.Row():
	                # loads one sample
	                gr.Markdown("## Random Dataset Sample Viewer")
	                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
	                button = gr.Button("Show Random Sample")

	            with gr.Row():
	                sample_display = gr.Markdown("{sampled data loads here}")

	            # Only the dropdown is listed as an input; random_sample's first
	            # parameter is the gr.Request-annotated one.
	            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
	        # removed plot because not pretty enough
	        # with gr.TabItem("Model Correlation"):
	        # with gr.Row():
	        # plot = plot_avg_correlation(rewardbench_data_avg, prefs_data)
	        # gr.Plot(plot)

	    # Re-filter the visible table whenever the search text changes ...
	    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
	    search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
	    # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
	    search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)

	    # ... and whenever the model-type selection changes.
	    model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
	    model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
	    model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)

	# TODO: load/refresh the table data when the app starts (an earlier,
	# never-used draft updated rewardbench_table and pref_sets_table from
	# freshly loaded data); make this used somewhere...

	# Restart the Space on a fixed interval from a background thread.
	RESTART_INTERVAL_SECONDS = 3 * 60 * 60  # restarted every 3h
	scheduler = BackgroundScheduler()
	scheduler.add_job(restart_space, "interval", seconds=RESTART_INTERVAL_SECONDS)
	scheduler.start()

	# had .queue() before launch before... not sure if that's necessary
	app.launch()