open_llm_leaderboard

Runtime error

App Files Files Community

edbeeching committed on Apr 17, 2023

Commit

9346f1c

1 Parent(s): 4ff62ee

creates leaderboard

Browse files

Files changed (3) hide show

.gitignore +2 -0
app.py +110 -0
requirements.txt +66 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ evals/
2	+ venv/

app.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import os
+import shutil
+import numpy as np
+import gradio as gr
+from huggingface_hub import Repository
+import json
+from apscheduler.schedulers.background import BackgroundScheduler
+import pandas as pd
+# Clone the lmeh eval data into ./evals/ and pull the latest revision.
+# Without an H4_TOKEN in the environment nothing is cloned and `repo` stays None.
+H4_TOKEN = os.environ.get("H4_TOKEN")
+repo = None
+if H4_TOKEN:
+    repo = Repository(
+        local_dir="./evals/",
+        clone_from="HuggingFaceH4/lmeh_evaluations",
+        repo_type="dataset",
+        use_auth_token=H4_TOKEN,
+    )
+    repo.git_pull()
+# Benchmarks that make up the leaderboard, in column order.
+BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
+# Maps the benchmark identifier used in result file names to its column title.
+BENCH_TO_NAME = {
+    "arc_challenge": "ARC",
+    "hellaswag": "HellaSwag",
+    "hendrycks": "MMLU",
+    "truthfulqa_mc": "TruthQA",
+}
+# METRICS[i] is the metric key read for BENCHMARKS[i]; keep both lists aligned.
+METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
+# "evals" only exists once the results repo has been cloned; treat a missing
+# directory as an empty result set rather than raising FileNotFoundError here.
+entries = (
+    [entry for entry in os.listdir("evals") if not entry.startswith(".")]
+    if os.path.isdir("evals")
+    else []
+)
+# Keep only sub-directories; dot-prefixed entries were already dropped above.
+model_directories = [entry for entry in entries if os.path.isdir(os.path.join("evals", entry))]
+def make_clickable_model(model_name):
+    """Return an HTML anchor linking to the model's page on huggingface.co.
+
+    The name is used unmodified both as the URL path and as the link text,
+    and the link opens in a new tab.
+    """
+    url = "https://huggingface.co/" + model_name
+    anchor_style = "color: blue; text-decoration: underline;text-decoration-style: dotted;"
+    return '<a target="_blank" href="{}" style="{}">{}</a>'.format(url, anchor_style, model_name)
+def load_results(model, benchmark, metric):
+    """Load one benchmark result file for a model and average the chosen metric.
+
+    Reads ``evals/<model>/<model>-eval_<benchmark>.json`` and averages
+    ``metric`` over every entry of its ``"results"`` mapping.
+
+    Args:
+        model: Name of the model directory under ``evals``.
+        benchmark: Benchmark identifier used in the result file name.
+        metric: Key to read from each per-task result dict.
+
+    Returns:
+        A ``(mean_score, model_args)`` tuple, where ``model_args`` is the
+        value stored under ``config.model_args``. Returns ``(0.0, None)``
+        when the result file does not exist, and ``(0.0, model_args)`` when
+        the file lists no results.
+
+    Raises:
+        KeyError: If the file lacks ``results``, ``config.model_args`` or the
+            requested metric.
+    """
+    file_path = os.path.join("evals", model, f"{model}-eval_{benchmark}.json")
+    try:
+        # JSON is UTF-8 by specification; do not depend on the locale default.
+        with open(file_path, encoding="utf-8") as fp:
+            data = json.load(fp)
+    except FileNotFoundError:
+        # A benchmark that has not been run (yet) scores zero.
+        return 0.0, None
+    scores = [task_scores[metric] for task_scores in data["results"].values()]
+    model_args = data["config"]["model_args"]
+    if not scores:
+        # np.mean([]) would yield NaN (plus a RuntimeWarning) and poison the total.
+        return 0.0, model_args
+    return np.mean(np.array(scores)), model_args
+# Leaderboard columns in display order.
+COLS = [
+    "eval_name",
+    "total",
+    "ARC",
+    "HellaSwag",
+    "MMLU",
+    "TruthQA",
+    "base_model",
+]
+# Datatype passed to the Gradio Dataframe for each column; TYPES[i] describes COLS[i].
+TYPES = [
+    "str",
+    "number",
+    "number",
+    "number",
+    "number",
+    "number",
+    "markdown",
+]
+def get_leaderboard():
+    """Build the leaderboard table from the result files under ``evals``.
+
+    Pulls the results repo first when one was cloned, then loads every
+    benchmark score for every model directory, sums the scores into
+    ``total`` and turns the base model name into a clickable link.
+
+    Returns:
+        A pandas DataFrame with the columns listed in ``COLS``, sorted by
+        ``total`` in descending order. The frame has the columns but no rows
+        when there are no model directories.
+    """
+    if repo:
+        repo.git_pull()
+    # Re-scan after the pull so models added since start-up are picked up;
+    # a list computed once at import time would never see them.
+    if os.path.isdir("evals"):
+        model_names = [
+            entry
+            for entry in os.listdir("evals")
+            if not entry.startswith(".") and os.path.isdir(os.path.join("evals", entry))
+        ]
+    else:
+        model_names = []
+    all_data = []
+    for model in model_names:
+        # Seed both keys in ONE dict. Two consecutive assignments would discard
+        # "base_model" and make the lookup below raise KeyError for a model
+        # whose result files are all missing.
+        model_data = {"eval_name": model, "base_model": None}
+        for benchmark, metric in zip(BENCHMARKS, METRICS):
+            value, base_model = load_results(model, benchmark, metric)
+            model_data[BENCH_TO_NAME[benchmark]] = value
+            if base_model is not None:  # in case the last benchmark failed
+                model_data["base_model"] = base_model
+        model_data["total"] = sum(model_data[column] for column in BENCH_TO_NAME.values())
+        if model_data["base_model"] is not None:
+            model_data["base_model"] = make_clickable_model(model_data["base_model"])
+        all_data.append(model_data)
+    if not all_data:
+        # from_records([]) has no "total" column, so sort_values would raise.
+        return pd.DataFrame(columns=COLS)
+    dataframe = pd.DataFrame.from_records(all_data)
+    dataframe = dataframe.sort_values(by=["total"], ascending=False)
+    return dataframe[COLS]
+# Initial table contents, computed once at start-up.
+leaderboard = get_leaderboard()
+block = gr.Blocks()
+with block:
+    gr.Markdown(f"""
+    # H4 Model Evaluation leaderboard using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> LMEH benchmark suite </a>.
+    Evaluation is performed against 4 popular benchmarks AI2 Reasoning Challenge, HellaSwag, MMLU, and TruthFul QC MC. To run your own benchmarks, refer to the README in the H4 repo.
+    """)
+    with gr.Row():
+        leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
+                                                    datatype=TYPES, max_rows=5)
+    with gr.Row():
+        refresh_button = gr.Button("Refresh")
+        refresh_button.click(get_leaderboard, inputs=[], outputs=leaderboard_table)
+    # Rebuild the table on every page load so visitors do not see the
+    # start-up snapshot until they press Refresh.
+    block.load(get_leaderboard, inputs=[], outputs=leaderboard_table)
+def refresh_leaderboard():
+    """Scheduled job: rebuild the leaderboard, pulling the results repo if cloned.
+
+    The returned frame is discarded on purpose: a background job cannot push
+    data into the page, so the visible table is re-rendered by the page-load
+    event and the Refresh button instead.
+    """
+    get_leaderboard()
+    print("leaderboard updated")
+scheduler = BackgroundScheduler()
+scheduler.add_job(func=refresh_leaderboard, trigger="interval", seconds=300)  # refresh every 5 mins
+scheduler.start()
+# launch() blocks the script while the server runs, so it has to come last;
+# anything placed after it (such as the scheduler set-up) would never execute.
+block.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,66 @@

+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+anyio==3.6.2
+APScheduler==3.10.1
+async-timeout==4.0.2
+attrs==23.1.0
+certifi==2022.12.7
+charset-normalizer==3.1.0
+click==8.1.3
+contourpy==1.0.7
+cycler==0.11.0
+entrypoints==0.4
+fastapi==0.95.1
+ffmpy==0.3.0
+filelock==3.11.0
+fonttools==4.39.3
+frozenlist==1.3.3
+fsspec==2023.4.0
+gradio==3.27.0
+gradio_client==0.1.3
+h11==0.14.0
+httpcore==0.17.0
+httpx==0.24.0
+huggingface-hub==0.13.4
+idna==3.4
+Jinja2==3.1.2
+jsonschema==4.17.3
+kiwisolver==1.4.4
+linkify-it-py==2.0.0
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+matplotlib==3.7.1
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+multidict==6.0.4
+numpy==1.24.2
+orjson==3.8.10
+packaging==23.1
+pandas==2.0.0
+Pillow==9.5.0
+pydantic==1.10.7
+pydub==0.25.1
+pyparsing==3.0.9
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-multipart==0.0.6
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+requests==2.28.2
+semantic-version==2.10.0
+six==1.16.0
+sniffio==1.3.0
+starlette==0.26.1
+toolz==0.12.0
+tqdm==4.65.0
+typing_extensions==4.5.0
+tzdata==2023.3
+tzlocal==4.3
+uc-micro-py==1.0.1
+urllib3==1.26.15
+uvicorn==0.21.1
+websockets==11.0.1
+yarl==1.8.2