import gradio as gr
import numpy as np
import os
import pandas as pd
import scipy.stats as st

LEADERBOARD_FILE = "leaderboard.csv"

def get_leaderboard_df():
    df = pd.read_csv(LEADERBOARD_FILE)
    df = df.sort_values(by = ["Score"], ascending = False)
    df = df.reset_index(drop = True)
    return df
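
# A minimal sketch of the assumed leaderboard.csv layout, inferred from the
# columns written by process_upload ("Model", "Score", "95% CI"); the rows
# below are hypothetical examples, not real results:
#
#   Model,Score,95% CI
#   example-model-a,7.51,+0.12/-0.13
#   example-model-b,6.98,+0.15/-0.14
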
def get_model_stats(uploaded_df):
    # Mean score across all evaluated instructions.
    overall_score = uploaded_df["avg_score"].mean()
    data = np.array(list(uploaded_df["avg_score"]))
    # Bootstrap a 95% confidence interval for the mean score.
    bootstrap_res = st.bootstrap((data,),
                                 np.mean,
                                 confidence_level = 0.95,
                                 n_resamples = 10000,
                                 method = "percentile")
    ci_high = bootstrap_res.confidence_interval.high
    ci_low = bootstrap_res.confidence_interval.low
    formatted_upper_diff = str(round(ci_high - overall_score, 2))
    formatted_lower_diff = str(round(overall_score - ci_low, 2))
    formatted_score = round(overall_score, 2)
    formatted_ci = f"+{formatted_upper_diff}/-{formatted_lower_diff}"
    return (formatted_score, formatted_ci)
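
# Example usage of get_model_stats (hypothetical scores, shown only to
# illustrate the return format):
#
#   sample_df = pd.DataFrame({"avg_score": [3.5, 4.0, 3.0, 4.5]})
#   score, ci = get_model_stats(sample_df)
#   # score == 3.75; ci is a "+upper/-lower" string giving how far the
#   # bootstrap confidence bounds sit above and below the mean.
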
def process_upload(file):
    uploaded_df = pd.read_csv(file.name).dropna()
    if "avg_score" not in list(uploaded_df.columns):
        return "Upload failed: file must have column 'avg_score'."
    overall_score, confidence_interval = get_model_stats(uploaded_df)
    leaderboard_df = get_leaderboard_df()
    # The uploaded file's name (without its extension) becomes the model name.
    model_name = os.path.splitext(os.path.basename(file.name))[0]
    new_entry = {"Model": model_name, "Score": overall_score, "95% CI": confidence_interval}
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
    leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame([new_entry])], ignore_index = True)
    leaderboard_df.to_csv(LEADERBOARD_FILE, index = False)
    return "Upload complete! The leaderboard has been updated."
# theme = gr.themes.Default(radius_size = "none")

def create_ui():
    text_size = gr.themes.sizes.text_lg
    # load theme from theme.json
    theme = gr.themes.Default.load("theme.json")
    # set text size to large
    theme.text_size = text_size

    with gr.Blocks(theme = theme) as demo:
        with gr.Row():
            gr.Image("https://ai.stanford.edu/wp-content/themes/sail/img/logo.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False,
                     container = False,
                     min_width = 200,
                     scale = 0)
            gr.Image("https://crfm.stanford.edu/static/img/header/crfm-rgb.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False,
                     container = False,
                     min_width = 200,
                     scale = 0)
        gr.Markdown(
            """
            # **RubricEval: A Scalable Human-LLM Evaluation Framework for Open-Ended Tasks**
            ######
            """)
with gr.TabItem("Leaderboard"): | |
overall_leaderboard_table = gr.Dataframe(get_leaderboard_df, | |
gr.Timer(5), | |
column_widths = ["33.3%", "33.3%", "33.3%"], | |
height = 600) | |
            gr.Markdown(
                """
                ######
                ## RubricEval leaderboard statistics (Overall)
                """
            )
            gr.Image("lb_stats.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False,
                     width = 800)
            gr.Markdown(
                """
                ######
                ## RubricEval scores by category
                """
            )
            gr.Image("category_scores.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False)
        with gr.TabItem("About"):
            gr.Image("eval_about.jpg",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False)
with gr.Accordion("What is RubricEval?"): | |
gr.Markdown( | |
""" | |
###### | |
#### Overview | |
RubricEval is a framework for evaluating instruction-following models. | |
The core idea is to create example-specific rubrics designed by human experts, which are then applied by an GPT-4o to evaluate model outputs at scale. This process results in more scalable, trustworthy, and interpretable evaluations of language models. | |
#### Features | |
**Open-Ended:** The responses of chat models are open-ended in nature, and a small set of reference | |
answers often can’t capture all acceptable responses. This is a key limitation of reference-based | |
evaluators like BLEU and BERTScore. | |
**Multidimensional:** Responses can be good and bad in different ways, which isn’t captured by "head | |
to head" evaluators like Chatbot Arena and AlpacaEval that simply decide if one response is better | |
than another generally. | |
**Absolute:** Evaluators like Chatbot Arena and AlpacaEval use win rates based on pairwise comparisons. | |
This means that we don’t know how good a model is in absolute terms. For example, a model may | |
have a low win rate against GPT-4o but still be formidable, and the highest win rate model may not | |
be perfect despite topping the leaderboard. | |
**Varying Criteria:** The criteria for what makes a good response is different for each instruction. While | |
HELM Instruct is open-ended, multidimensional, and absolute, it uses the same set of scoring criteria | |
for each instruction, missing nuances at the instruction level. Most pairwise comparison evaluators | |
may implicitly consider varying criteria for each instruction, but these criteria are not explicitly laid | |
out (WildBench is a notable exception). | |
**Feedback:** To the best of our knowledge, no current language model evaluation system provides | |
textual feedback on a model’s overall strengths and weaknesses with respect to some set of | |
instructions. However, we believe that such feedback would be highly valuable for model developers. | |
Evaluation is a key piece of iterative model development, and textual feedback could provide insight | |
on what exactly needs to be improved rather than solely a score which is hard to interpret. | |
###### | |
""") | |
gr.Image("feature_comp.png", | |
show_label = False, | |
show_download_button = False, | |
show_share_button = False) | |
with gr.Accordion("Where do evaluation instructions come from?"): | |
gr.Markdown( | |
""" | |
###### | |
We utilize a set of approximately 1,000 instructions from WildBench ([https://huggingface.co/spaces/allenai/WildBench](https://huggingface.co/spaces/allenai/WildBench)) which was made publicly available. From this, 392 of the hardest instructions were chosen via a GPT-4 based pairwise comparison method. | |
Using the WildBench dataset has three primary benefits: | |
1) It contains a manually curated selection of instructions from real users. | |
2) The instructions are well spread out across 11 categories, which is useful for benchmarking. | |
3) Each instruction comes with user-defined criteria of what they’re looking for, which we can make use of directly in our framework | |
###### | |
""") | |
with gr.Accordion("How does RubricEval correlate with human preferences?"): | |
gr.Markdown( | |
""" | |
###### | |
We used RubricEval to score 13 leading large language models across 11 categories and 392 instructions from WildBench. | |
Notably, the ranking of these models based on RubricEval scores correlates highly with the ranking of the same models using Chatbot Arena ELO ratings (spearman ρ = 0.98). | |
The main discordance is in the ranking of Claude 3 Opus (which is ranked relatively lower by RubricEval compared to Chatbot Arena). | |
RubricEval’s correlation of ρ = 0.98 with human preferences ties length-corrected AlpacaEval’s record 0.98 correlation, while being higher than regular AlpacaEval (ρ = 0.94), MT-Bench (ρ = 0.94), and MMLU (ρ = 0.87). | |
###### | |
""") | |
with gr.Accordion("Additional details"): | |
gr.Markdown( | |
""" | |
###### | |
See our detailed report at [insert blog link]. | |
###### | |
""") | |
with gr.Accordion("Citation"): | |
gr.Markdown( | |
""" | |
###### | |
[insert citation] | |
###### | |
""") | |
with gr.TabItem("Submit Model"): | |
gr.Markdown( | |
""" | |
###### | |
#### Want to add a model to this leaderboard? | |
#### 1. Run RubricEval locally for <$x (see [insert github link]). | |
#### 2. Upload the evaluation file generated by RubricEval below. Note: the file name will be used as the model name. | |
#### 3. Wait ~5 seconds and refresh the leaderboard page to see that your model has been added! | |
###### | |
""") | |
model_submission = gr.File(file_types = [".csv"], file_count = "single") | |
model_submission.upload(fn = process_upload, inputs = [model_submission], outputs = []) | |
    demo.launch()


if __name__ == "__main__":
    create_ui()