import gradio as gr
import numpy as np
import os
import pandas as pd
import scipy.stats as st

LEADERBOARD_FILE = "leaderboard.csv"

def get_leaderboard_df():
    df = pd.read_csv(LEADERBOARD_FILE)
    df = df.sort_values(by = ["Score"], ascending = False)
    df = df.reset_index(drop = True)
    return df
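
# A minimal sketch of the assumed leaderboard.csv layout, inferred from the
# columns written by process_upload ("Model", "Score", "95% CI"); the rows
# below are hypothetical examples, not real results:
#
#   Model,Score,95% CI
#   example-model-a,7.51,+0.12/-0.13
#   example-model-b,6.98,+0.15/-0.14
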
def get_model_stats(uploaded_df):
    # Mean score across all evaluated instructions.
    overall_score = uploaded_df["avg_score"].mean()
    data = np.array(list(uploaded_df["avg_score"]))
    # Bootstrap a 95% confidence interval for the mean score.
    bootstrap_res = st.bootstrap((data,),
                                 np.mean,
                                 confidence_level = 0.95,
                                 n_resamples = 10000,
                                 method = "percentile")
    ci_high = bootstrap_res.confidence_interval.high
    ci_low = bootstrap_res.confidence_interval.low
    formatted_upper_diff = str(round(ci_high - overall_score, 2))
    formatted_lower_diff = str(round(overall_score - ci_low, 2))
    formatted_score = round(overall_score, 2)
    formatted_ci = f"+{formatted_upper_diff}/-{formatted_lower_diff}"
    return (formatted_score, formatted_ci)
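
# Example usage of get_model_stats (hypothetical scores, shown only to
# illustrate the return format):
#
#   sample_df = pd.DataFrame({"avg_score": [3.5, 4.0, 3.0, 4.5]})
#   score, ci = get_model_stats(sample_df)
#   # score == 3.75; ci is a "+upper/-lower" string giving how far the
#   # bootstrap confidence bounds sit above and below the mean.
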
def process_upload(file):
    uploaded_df = pd.read_csv(file.name).dropna()
    if "avg_score" not in list(uploaded_df.columns):
        return "Upload failed: file must have column 'avg_score'."
    overall_score, confidence_interval = get_model_stats(uploaded_df)
    leaderboard_df = get_leaderboard_df()
    # The uploaded file's name (without its extension) becomes the model name.
    model_name = os.path.splitext(os.path.basename(file.name))[0]
    new_entry = {"Model": model_name, "Score": overall_score, "95% CI": confidence_interval}
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
    leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame([new_entry])], ignore_index = True)
    leaderboard_df.to_csv(LEADERBOARD_FILE, index = False)
    return "Upload complete! The leaderboard has been updated."
# theme = gr.themes.Default(radius_size = "none")

def create_ui():
    text_size = gr.themes.sizes.text_lg
    # load theme from theme.json
    theme = gr.themes.Default.load("theme.json")
    # set text size to large
    theme.text_size = text_size

    with gr.Blocks(theme = theme) as demo:
        with gr.Row():
            gr.Image("https://ai.stanford.edu/wp-content/themes/sail/img/logo.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False,
                     container = False,
                     min_width = 200,
                     scale = 0)
            gr.Image("https://crfm.stanford.edu/static/img/header/crfm-rgb.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False,
                     container = False,
                     min_width = 200,
                     scale = 0)
        gr.Markdown(
            """
            # **RubricEval: A Scalable Human-LLM Evaluation Framework for Open-Ended Tasks**
            ######
            """)
with gr.TabItem("Leaderboard"): | |
overall_leaderboard_table = gr.Dataframe(get_leaderboard_df, | |
gr.Timer(5), | |
column_widths = ["33.3%", "33.3%", "33.3%"], | |
height = 600) | |
            gr.Markdown(
                """
                ######
                ## RubricEval leaderboard statistics (Overall)
                """
            )
            gr.Image("lb_stats.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False,
                     width = 800)
            gr.Markdown(
                """
                ######
                ## RubricEval scores by category
                """
            )
            gr.Image("category_scores.png",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False)
        with gr.TabItem("About"):
            gr.Image("eval_about.jpg",
                     show_label = False,
                     show_download_button = False,
                     show_share_button = False)
with gr.Accordion("What is RubricEval?"): | |
gr.Markdown( | |
""" | |
###### | |
#### Overview | |
RubricEval is a framework for evaluating instruction-following models. | |
The core idea is to create example-specific rubrics designed by human experts, which are then applied by an GPT-4o to evaluate model outputs at scale. This process results in more scalable, trustworthy, and interpretable evaluations of language models. | |
#### Features | |
**Open-Ended:** The responses of chat models are open-ended in nature, and a small set of reference | |
answers often can’t capture all acceptable responses. This is a key limitation of reference-based | |
evaluators like BLEU and BERTScore. | |
**Multidimensional:** Responses can be good and bad in different ways, which isn’t captured by "head | |
to head" evaluators like Chatbot Arena and AlpacaEval that simply decide if one response is better | |
than another generally. | |
**Absolute:** Evaluators like Chatbot Arena and AlpacaEval use win rates based on pairwise comparisons. | |
This means that we don’t know how good a model is in absolute terms. For example, a model may | |
have a low win rate against GPT-4o but still be formidable, and the highest win rate model may not | |
be perfect despite topping the leaderboard. | |
**Varying Criteria:** The criteria for what makes a good response is different for each instruction. While | |
HELM Instruct is open-ended, multidimensional, and absolute, it uses the same set of scoring criteria | |
for each instruction, missing nuances at the instruction level. Most pairwise comparison evaluators | |
may implicitly consider varying criteria for each instruction, but these criteria are not explicitly laid | |
out (WildBench is a notable exception). | |
**Feedback:** To the best of our knowledge, no current language model evaluation system provides | |
textual feedback on a model’s overall strengths and weaknesses with respect to some set of | |
instructions. However, we believe that such feedback would be highly valuable for model developers. | |
Evaluation is a key piece of iterative model development, and textual feedback could provide insight | |
on what exactly needs to be improved rather than solely a score which is hard to interpret. | |
###### | |
""") | |
gr.Image("feature_comp.png", | |
show_label = False, | |
show_download_button = False, | |
show_share_button = False) | |
with gr.Accordion("Where do evaluation instructions come from?"): | |
gr.Markdown( | |
""" | |
###### | |
We utilize a set of approximately 1,000 instructions from WildBench ([https://huggingface.co/spaces/allenai/WildBench](https://huggingface.co/spaces/allenai/WildBench)) which was made publicly available. From this, 392 of the hardest instructions were chosen via a GPT-4 based pairwise comparison method. | |
Using the WildBench dataset has three primary benefits: | |
1) It contains a manually curated selection of instructions from real users. | |
2) The instructions are well spread out across 11 categories, which is useful for benchmarking. | |
3) Each instruction comes with user-defined criteria of what they’re looking for, which we can make use of directly in our framework | |
###### | |
""") | |
with gr.Accordion("How does RubricEval correlate with human preferences?"): | |
gr.Markdown( | |
""" | |
###### | |
We used RubricEval to score 13 leading large language models across 11 categories and 392 instructions from WildBench. | |
Notably, the ranking of these models based on RubricEval scores correlates highly with the ranking of the same models using Chatbot Arena ELO ratings (spearman ρ = 0.98). | |
The main discordance is in the ranking of Claude 3 Opus (which is ranked relatively lower by RubricEval compared to Chatbot Arena). | |
RubricEval’s correlation of ρ = 0.98 with human preferences ties length-corrected AlpacaEval’s record 0.98 correlation, while being higher than regular AlpacaEval (ρ = 0.94), MT-Bench (ρ = 0.94), and MMLU (ρ = 0.87). | |
###### | |
""") | |
with gr.Accordion("Additional details"): | |
gr.Markdown( | |
""" | |
###### | |
See our detailed report at [insert blog link]. | |
###### | |
""") | |
with gr.Accordion("Citation"): | |
gr.Markdown( | |
""" | |
###### | |
[insert citation] | |
###### | |
""") | |
with gr.TabItem("Submit Model"): | |
gr.Markdown( | |
""" | |
###### | |
#### Want to add a model to this leaderboard? | |
#### 1. Run RubricEval locally for <$x (see [insert github link]). | |
#### 2. Upload the evaluation file generated by RubricEval below. Note: the file name will be used as the model name. | |
#### 3. Wait ~5 seconds and refresh the leaderboard page to see that your model has been added! | |
###### | |
""") | |
model_submission = gr.File(file_types = [".csv"], file_count = "single") | |
model_submission.upload(fn = process_upload, inputs = [model_submission], outputs = []) | |
    demo.launch()


if __name__ == "__main__":
    create_ui()