yuchenlin committed on
Commit a415f27 • 1 Parent(s): 4401f8e

urial bench

Files changed (8)
  1. .gitignore +2 -0
  2. README.md +4 -5
  3. app.py +137 -0
  4. constants.py +91 -0
  5. init.py +95 -0
  6. leaderboard_data.jsonl +13 -0
  7. requirements.txt +1 -0
  8. utils_display.py +43 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__
+ .DS_Store
README.md CHANGED
@@ -1,13 +1,12 @@
  ---
- title: URIAL Bench
- emoji: 📊
- colorFrom: gray
- colorTo: indigo
+ title: URIAL Bench (Eval Base LLMs on MT-Bench)
+ emoji: 👏
+ colorFrom: blue
+ colorTo: yellow
  sdk: gradio
  sdk_version: 4.19.2
  app_file: app.py
  pinned: false
- license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,137 @@
+ """A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
+ import ast
+ import argparse
+ import glob
+ import pickle
+
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ import gradio as gr
+ import pandas as pd
+ from pathlib import Path
+ import json
+ from constants import BANNER, INTRODUCTION_TEXT, CITATION_TEXT, METRICS_TAB_TEXT, DIR_OUTPUT_REQUESTS
+ from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub
+ from utils_display import AutoEvalColumn, fields, make_clickable_model, styled_error, styled_message
+ from datetime import datetime, timezone
+
+ LAST_UPDATED = "Feb 27th 2024"
+
+ css = """
+ .markdown-text{font-size: 16pt}
+ th {
+     text-align: center;
+ }
+ td {
+     font-size: 16px; /* Adjust the font size as needed */
+     text-align: center;
+ }
+
+ """
+
+ column_names = {
+     "model": "Model",
+     "Overall": "All 🎯",
+     "Turn 1": "Turn 1️⃣",
+     "Turn 2": "Turn 2️⃣",
+ }
+
+ model_info = {
+     "gpt-4": {"hf_name": "https://platform.openai.com/", "pretty_name": "gpt-4"},
+     "gpt-3.5-turbo": {"hf_name": "https://platform.openai.com/", "pretty_name": "gpt-3.5-turbo"},
+     "Llama-2-70b-hf": {"hf_name": "meta-llama/Llama-2-70b-hf", "pretty_name": "Llama-2-70B"},
+     "Llama-2-13b-hf": {"hf_name": "meta-llama/Llama-2-13b-hf", "pretty_name": "Llama-2-13B"},
+     "Llama-2-7b-hf": {"hf_name": "meta-llama/Llama-2-7b-hf", "pretty_name": "Llama-2-7B"},
+     "Mixtral-8x7B-v0.1": {"hf_name": "mistralai/Mixtral-8x7B-v0.1", "pretty_name": "Mixtral-8x7B"},
+     "Mistral-7b-v0.1": {"hf_name": "mistralai/Mistral-7B-v0.1", "pretty_name": "Mistral-7B"},
+     "Yi-34B": {"hf_name": "01-ai/Yi-34B", "pretty_name": "Yi-34B"},
+     "Yi-6B": {"hf_name": "01-ai/Yi-6B", "pretty_name": "Yi-6B"},
+     "gemma-7b": {"hf_name": "google/gemma-7b", "pretty_name": "Gemma-7B"},
+     "gemma-2b": {"hf_name": "google/gemma-2b", "pretty_name": "Gemma-2B"},
+     "phi-2": {"hf_name": "microsoft/phi-2", "pretty_name": "Phi-2 (2.7B)"},
+     "olmo": {"hf_name": "allenai/OLMo-7B", "pretty_name": "OLMo-7B"},
+ }
+
+
+ # Formats the columns
+ def formatter(x):
+     if type(x) is str:
+         x = x
+     else:
+         x = round(x, 2)
+     return x
+
+
+ def build_demo(original_df, TYPES):
+     with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
+         # gr.HTML(BANNER, elem_id="banner")
+         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+         with gr.Tabs(elem_classes="tab-buttons") as tabs:
+             with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
+                 leaderboard_table = gr.components.Dataframe(
+                     value=original_df,
+                     datatype=TYPES,
+                     height=1000,
+                     wrap=False,
+                     elem_id="leaderboard-table",
+                     interactive=False,
+                     visible=True,
+                     min_width=60,
+                 )
+
+             with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
+                 gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")
+
+         gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")
+
+         with gr.Row():
+             with gr.Accordion("📙 Citation", open=False):
+                 gr.Textbox(
+                     value=CITATION_TEXT, lines=7,
+                     label="Copy the BibTeX to cite URIAL and MT-Bench",
+                     elem_id="citation-button",
+                     show_copy_button=True)
+                 # ).style(show_copy_button=True)
+
+     return demo
+
+ if __name__ == "__main__":
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--share", action="store_true")
+     parser.add_argument("--result_file", help="Path to results table", default="leaderboard_data.jsonl")
+     args = parser.parse_args()
+
+     bench_results = args.result_file
+
+     original_df = pd.read_json(bench_results, lines=True)
+
+     print(original_df.columns)
+
+     for col in original_df.columns:
+         if col == "model":
+             original_df[col] = original_df[col].apply(lambda x: x.replace(x, make_clickable_model(x, model_info)))
+         else:
+             original_df[col] = original_df[col].apply(formatter)  # For numerical values
+
+     # Define the first column explicitly, add 'Overall' as the second column, and then append the rest excluding 'Overall'
+     new_order = [original_df.columns[0], 'Overall'] + [col for col in original_df.columns if col not in [original_df.columns[0], 'Overall']]
+
+     # Reorder the DataFrame columns using the new order
+     reordered_df = original_df[new_order]
+
+     reordered_df.sort_values(by='Overall', inplace=True, ascending=False)
+
+     reordered_df.rename(columns=column_names, inplace=True)
+
+     # COLS = [c.name for c in fields(AutoEvalColumn)]
+     # TYPES = [c.type for c in fields(AutoEvalColumn)]
+
+     TYPES = ["markdown", "number"]
+     demo = build_demo(reordered_df, TYPES)
+     demo.launch(share=args.share)
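The `__main__` block above is the whole data path for the Space: read the JSONL results, linkify the model names, move `Overall` next to the model column, and sort descending before handing the table to Gradio. A minimal standalone sketch of that table preparation, assuming `leaderboard_data.jsonl` sits in the working directory:

```python
# Minimal sketch of the table preparation from app.py's __main__ block,
# without the Gradio UI. Assumes leaderboard_data.jsonl is in the working directory.
import pandas as pd

df = pd.read_json("leaderboard_data.jsonl", lines=True)

# Keep the model column first, put "Overall" second, then the remaining columns.
new_order = [df.columns[0], "Overall"] + [c for c in df.columns if c not in [df.columns[0], "Overall"]]
df = df[new_order].sort_values(by="Overall", ascending=False)

print(df[["model", "Overall", "Turn 1", "Turn 2"]].head())
```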
constants.py ADDED
@@ -0,0 +1,91 @@
+ from pathlib import Path
+
+ # Directory where requests by models are stored
+ DIR_OUTPUT_REQUESTS = Path("requested_models")
+ EVAL_REQUESTS_PATH = Path("eval_requests")
+
+ ##########################
+ # Text definitions #
+ ##########################
+
+ banner_url = "https://huggingface.co/spaces/WildEval/WildBench-Leaderboard/resolve/main/%E2%80%8Eleaderboard_logo_v2.png"  # the same repo here.
+ BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'
+
+ TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> URIAL Bench </h1> </body> </html>"
+
+
+ INTRODUCTION_TEXT = """
+ # URIAL Bench (Evaluating Base LLMs with URIAL on MT-Bench)
+ [🛜 Website](https://allenai.github.io/re-align/index.html) | [💻 GitHub](https://github.com/Re-Align/URIAL) | [📖 Paper](https://arxiv.org/abs/2312.01552) | [🐦 Twitter](https://x.com/billyuchenlin/status/1759541978881311125?s=20)
+
+ > URIAL Bench tests the capacity of base LLMs for alignment without introducing the factors of fine-tuning (learning rate, data, etc.), which are hard to control for fair comparisons.
+ Specifically, we use [URIAL](https://github.com/Re-Align/URIAL/tree/main/run_scripts/mt-bench#run-urial-inference) to align a base LLM and evaluate its performance on MT-Bench.
+
+ - [👏 URIAL](https://arxiv.org/abs/2312.01552) uses three constant examples to align BASE LLMs with in-context learning.
+ - [📊 MT-Bench](https://huggingface.co/spaces/lmsys/mt-bench) is a small, curated benchmark of two-turn instruction-following tasks in 8 domains.
+
+ """
+
+
+ CITATION_TEXT = """@inproceedings{
+     Lin2024ReAlign,
+     title={The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning},
+     author={Bill Yuchen Lin and Abhilasha Ravichander and Ximing Lu and Nouha Dziri and Melanie Sclar and Khyathi Chandu and Chandra Bhagavatula and Yejin Choi},
+     booktitle={International Conference on Learning Representations},
+     year={2024},
+     url={https://arxiv.org/abs/2312.01552}
+ }
+
+ @misc{zheng2023judging,
+     title={Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena},
+     author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric P. Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
+     year={2023},
+     eprint={2306.05685},
+     archivePrefix={arXiv},
+     primaryClass={cs.CL}
+ }
+ """
+
+ METRICS_TAB_TEXT = """
+ Here you will find details about the different metrics reported in our leaderboard.
+
+ ## Metrics
+
+ 🎯 Win Rate and Elo Rating are popular metrics for evaluating LLMs' general capabilities by comparing them with a strong reference model. [WIP]
+
+ ### Win Rate vs. ChatGPT
+
+ [WIP]
+
+ ```
+ Example:
+ ```
+
+
+ ### Elo Rating
+
+ [WIP]
+
+ ```
+ Example:
+ ```
+
+ ## How to reproduce our results
+
+ The WildBench Leaderboard will be a continued effort to benchmark open source/access LLMs.
+ Along with the Leaderboard we're open-sourcing the codebase used for running these evaluations.
+ For more details head over to our repo at: https://github.com/WildEval/WildBench-Leaderboard
+
+ P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️
+
+ ## Benchmark datasets
+
+ | Dataset | Domain | Source | Size | License |
+ |-----------------------------------------------------------------|--------------------------|--------------|------|---------|
+ | [WildBench](https://huggingface.co/datasets/WildEval/WildBench) | in-the-wild user queries | [WildChat]() | XXX | XXX |
+
+ """
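The introduction text above is the only place the recipe is described: a fixed set of in-context (query, answer) pairs is prepended to every test question so the base model answers in an aligned style. The exact prompt templates live in the linked URIAL repository; purely as an illustration of the shape, with placeholder pairs and markers that are not the real URIAL prompt, such a prompt could be assembled like this:

```python
# Illustrative only: a URIAL-style prompt prepends the same few in-context
# examples to every query. The pair and markers below are placeholders;
# the actual templates are in https://github.com/Re-Align/URIAL.
CONSTANT_EXAMPLES = [
    ("What is the best way to learn a new language?",
     "Thank you for the question! A practical plan is to ... (full aligned-style answer)"),
    # ... two more fixed (query, answer) pairs in the real setup ...
]

def build_urial_style_prompt(user_query: str) -> str:
    blocks = [f"# Query:\n{q}\n\n# Answer:\n{a}\n" for q, a in CONSTANT_EXAMPLES]
    blocks.append(f"# Query:\n{user_query}\n\n# Answer:\n")
    return "\n".join(blocks)
```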
init.py ADDED
@@ -0,0 +1,95 @@
+ import os
+ import pandas as pd
+ from constants import EVAL_REQUESTS_PATH
+ from pathlib import Path
+ from huggingface_hub import HfApi, Repository
+
+ TOKEN_HUB = os.environ.get("TOKEN_HUB", None)
+ QUEUE_REPO = os.environ.get("QUEUE_REPO", None)
+ QUEUE_PATH = os.environ.get("QUEUE_PATH", None)
+
+ hf_api = HfApi(
+     endpoint="https://huggingface.co",
+     token=TOKEN_HUB,
+ )
+
+ def load_all_info_from_dataset_hub():
+     eval_queue_repo = None
+     results_csv_path = None
+     requested_models = None
+
+     passed = True
+     if TOKEN_HUB is None:
+         passed = False
+     else:
+         print("Pulling evaluation requests and results.")
+
+         eval_queue_repo = Repository(
+             local_dir=QUEUE_PATH,
+             clone_from=QUEUE_REPO,
+             use_auth_token=TOKEN_HUB,
+             repo_type="dataset",
+         )
+         eval_queue_repo.git_pull()
+
+         # Local directory where dataset repo is cloned + folder with eval requests
+         directory = QUEUE_PATH / EVAL_REQUESTS_PATH
+         requested_models = get_all_requested_models(directory)
+         requested_models = [p.stem for p in requested_models]
+         # Local directory where dataset repo is cloned
+         csv_results = get_csv_with_results(QUEUE_PATH)
+         # csv_results = pd.read_json(QUEUE_PATH, lines=True)
+         if csv_results is None:
+             passed = False
+     if not passed:
+         print("No HuggingFace token or result path provided. Skipping evaluation requests and results.")
+
+     return eval_queue_repo, requested_models, csv_results
+
+
+ def upload_file(requested_model_name, path_or_fileobj):
+     dest_repo_file = Path(EVAL_REQUESTS_PATH) / path_or_fileobj.name
+     dest_repo_file = str(dest_repo_file)
+     hf_api.upload_file(
+         path_or_fileobj=path_or_fileobj,
+         path_in_repo=str(dest_repo_file),
+         repo_id=QUEUE_REPO,
+         token=TOKEN_HUB,
+         repo_type="dataset",
+         commit_message=f"Add {requested_model_name} to eval queue")
+
+ def get_all_requested_models(directory):
+     directory = Path(directory)
+     all_requested_models = list(directory.glob("*.txt"))
+     return all_requested_models
+
+ def get_csv_with_results(directory):
+     directory = Path(directory)
+     all_csv_files = list(directory.glob("*.csv"))
+     latest = [f for f in all_csv_files if f.stem.endswith("latest")]
+     if len(latest) != 1:
+         return None
+     return latest[0]
+
+
+ def is_model_on_hub(model_name, revision="main") -> bool:
+     try:
+         model_name = model_name.replace(" ", "")
+         author = model_name.split("/")[0]
+         model_id = model_name.split("/")[1]
+         if len(author) == 0 or len(model_id) == 0:
+             return False, "is not a valid model name. Please use the format `author/model_name`."
+     except Exception as e:
+         return False, "is not a valid model name. Please use the format `author/model_name`."
+
+     try:
+         models = list(hf_api.list_models(author=author, search=model_id))
+         matched = [model_name for m in models if m.modelId == model_name]
+         if len(matched) != 1:
+             return False, "was not found on the hub!"
+         else:
+             return True, None
+     except Exception as e:
+         print(f"Could not get the model from the hub.: {e}")
+         return False, "was not found on hub!"
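Note that `is_model_on_hub` returns a `(passed, message)` pair rather than the bare bool its annotation suggests. A quick illustrative call, assuming public models can be listed without a token:

```python
# Illustrative use of the helper above; listing public models works anonymously.
from init import is_model_on_hub

ok, msg = is_model_on_hub("meta-llama/Llama-2-7b-hf")
print(ok if ok else f"meta-llama/Llama-2-7b-hf {msg}")
```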
leaderboard_data.jsonl ADDED
@@ -0,0 +1,13 @@
+ {"model": "gpt-4", "Turn 1": 8.95625, "Turn 2": 9.025, "Overall": 8.990625, "coding": 8.55, "extraction": 9.375, "humanities": 9.95, "math": 6.8, "reasoning": 9.0, "roleplay": 8.9, "stem": 9.7, "writing": 9.65}
+ {"model": "gpt-3.5-turbo", "Turn 1": 8.075, "Turn 2": 7.8125, "Overall": 7.94375, "coding": 6.9, "extraction": 8.85, "humanities": 9.55, "math": 6.3, "reasoning": 5.65, "roleplay": 8.4, "stem": 8.7, "writing": 9.2}
+ {"model": "Llama-2-70b-hf", "Turn 1": 7.60625, "Turn 2": 6.6125, "Overall": 7.109375, "coding": 4.15, "extraction": 7.7, "humanities": 9.75, "math": 3.6, "reasoning": 6.1, "roleplay": 7.325, "stem": 8.75, "writing": 9.5}
+ {"model": "Mixtral-8x7B-v0.1", "Turn 1": 7.69375, "Turn 2": 6.1875, "Overall": 6.940625, "coding": 5.3, "extraction": 7.05, "humanities": 9.2, "math": 4.85, "reasoning": 5.3, "roleplay": 7.4, "stem": 8.225, "writing": 8.2}
+ {"model": "Mistral-7b-v0.1", "Turn 1": 7.4875, "Turn 2": 5.8625, "Overall": 6.675, "coding": 4.6, "extraction": 7.75, "humanities": 9.075, "math": 3.4, "reasoning": 4.9, "roleplay": 7.65, "stem": 8.275, "writing": 7.75}
+ {"model": "Yi-34B", "Turn 1": 7.19375, "Turn 2": 6.15625, "Overall": 6.675, "coding": 3.85, "extraction": 6.8, "humanities": 8.475, "math": 4.8, "reasoning": 6.0, "roleplay": 7.75, "stem": 7.825, "writing": 7.9}
+ {"model": "gemma-7b", "Turn 1": 6.96875, "Turn 2": 5.0375, "Overall": 6.003125, "coding": 3.95, "extraction": 6.25, "humanities": 8.825, "math": 4.35, "reasoning": 4.5, "roleplay": 6.25, "stem": 7.25, "writing": 6.65}
+ {"model": "phi-2", "Turn 1": 7.0375, "Turn 2": 4.6625, "Overall": 5.85, "coding": 4.25, "extraction": 4.45, "humanities": 8.85, "math": 3.8, "reasoning": 4.55, "roleplay": 7.2, "stem": 7.0, "writing": 6.7}
+ {"model": "Llama-2-13b-hf", "Turn 1": 6.26875, "Turn 2": 4.4125, "Overall": 5.340625, "coding": 2.8, "extraction": 4.7, "humanities": 8.3, "math": 2.85, "reasoning": 2.9, "roleplay": 6.625, "stem": 7.025, "writing": 7.525}
+ {"model": "Yi-6B", "Turn 1": 5.95625, "Turn 2": 3.9875, "Overall": 4.971875, "coding": 2.3, "extraction": 2.95, "humanities": 8.775, "math": 2.5, "reasoning": 3.5, "roleplay": 6.95, "stem": 7.7, "writing": 5.1}
+ {"model": "Llama-2-7b-hf", "Turn 1": 5.75, "Turn 2": 3.9125, "Overall": 4.83125, "coding": 1.65, "extraction": 3.4, "humanities": 8.075, "math": 1.6, "reasoning": 3.45, "roleplay": 7.475, "stem": 6.8, "writing": 6.2}
+ {"model": "gemma-2b", "Turn 1": 5.08125, "Turn 2": 2.8625, "Overall": 3.971875, "coding": 1.8, "extraction": 3.1, "humanities": 5.65, "math": 3.3, "reasoning": 2.55, "roleplay": 5.7, "stem": 5.725, "writing": 3.95}
+ {"model": "olmo", "Turn 1": 3.95, "Turn 2": 2.8625, "Overall": 3.40625, "coding": 1.65, "extraction": 2.45, "humanities": 4.9, "math": 1.25, "reasoning": 2.45, "roleplay": 5.3, "stem": 5.3, "writing": 3.95}
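In these rows, `Overall` is the unweighted mean of the two turn scores, and it also equals the mean of the eight per-domain scores. A quick check against the file above:

```python
# Schema sanity check for leaderboard_data.jsonl: "Overall" is the mean of the
# two turn scores and also the mean of the eight per-domain scores.
import pandas as pd

df = pd.read_json("leaderboard_data.jsonl", lines=True)
domains = ["coding", "extraction", "humanities", "math",
           "reasoning", "roleplay", "stem", "writing"]

assert ((df["Turn 1"] + df["Turn 2"]) / 2 - df["Overall"]).abs().max() < 1e-9
assert (df[domains].mean(axis=1) - df["Overall"]).abs().max() < 1e-9
```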
requirements.txt ADDED
@@ -0,0 +1 @@
+ gradio==4.19.2
utils_display.py ADDED
@@ -0,0 +1,43 @@
+ from dataclasses import dataclass
+
+ # These classes are for user facing column names, to avoid having to change them
+ # all around the code when a modif is needed
+ @dataclass
+ class ColumnContent:
+     name: str
+     type: str
+
+ def fields(raw_class):
+     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+ @dataclass(frozen=True)
+ class AutoEvalColumn:  # Auto evals column
+     model = ColumnContent("Model", "markdown")
+     avg_wer = ColumnContent("Average WER ⬇️", "number")
+     rtf = ColumnContent("RTF (1e-3) ⬇️", "number")
+     ami_wer = ColumnContent("AMI", "number")
+     e22_wer = ColumnContent("Earnings22", "number")
+     gs_wer = ColumnContent("Gigaspeech", "number")
+     lsc_wer = ColumnContent("LS Clean", "number")
+     lso_wer = ColumnContent("LS Other", "number")
+     ss_wer = ColumnContent("SPGISpeech", "number")
+     tl_wer = ColumnContent("Tedlium", "number")
+     vp_wer = ColumnContent("Voxpopuli", "number")
+     cv_wer = ColumnContent("Common Voice", "number")
+
+
+ def make_clickable_model(model_name, model_info):
+     if model_info[model_name]['hf_name'].startswith("http"):
+         link = model_info[model_name]['hf_name']
+     else:
+         link = f"https://huggingface.co/{model_info[model_name]['hf_name']}"
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
+
+ def styled_error(error):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+ def styled_warning(warn):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+ def styled_message(message):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
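`make_clickable_model` relies on the `model_info` mapping defined in app.py; for a Hub-hosted entry it produces a dotted-underline anchor pointing at the model page, e.g.:

```python
# Example of the markup produced by make_clickable_model, using one entry
# from the model_info mapping defined in app.py.
from utils_display import make_clickable_model

model_info = {"Mistral-7b-v0.1": {"hf_name": "mistralai/Mistral-7B-v0.1",
                                  "pretty_name": "Mistral-7B"}}
print(make_clickable_model("Mistral-7b-v0.1", model_info))
# <a target="_blank" href="https://huggingface.co/mistralai/Mistral-7B-v0.1" ...>Mistral-7B</a>
```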