import os import shutil import numpy as np import gradio as gr from huggingface_hub import Repository import json from apscheduler.schedulers.background import BackgroundScheduler import pandas as pd # clone / pull the lmeh eval data H4_TOKEN = os.environ.get("H4_TOKEN", None) repo=None if H4_TOKEN: # try: # shutil.rmtree("./evals/") # except: # pass repo = Repository( local_dir="./evals/", clone_from="HuggingFaceH4/lmeh_evaluations", use_auth_token=H4_TOKEN, repo_type="dataset" ) repo.git_pull() # parse the results BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"] BENCH_TO_NAME = { "arc_challenge":"ARC", "hellaswag":"HellaSwag", "hendrycks":"MMLU", "truthfulqa_mc":"TruthQA", } METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"] entries = [entry for entry in os.listdir("evals") if not entry.startswith('.')] model_directories = [entry for entry in entries if os.path.isdir(os.path.join("evals", entry))] def make_clickable_model(model_name): # remove user from model name #model_name_show = ' '.join(model_name.split('/')[1:]) link = "https://huggingface.co/" + model_name return f'{model_name}' def load_results(model, benchmark, metric): file_path = os.path.join("evals", model, f"{model}-eval_{benchmark}.json") if not os.path.exists(file_path): return 0.0, None with open(file_path) as fp: data = json.load(fp) accs = np.array([v[metric] for k, v in data["results"].items()]) mean_acc = np.mean(accs) return mean_acc, data["config"]["model_args"] COLS = ["eval_name", "total", "ARC", "HellaSwag", "MMLU", "TruthQA", "base_model"] TYPES = ["str", "number", "number", "number", "number", "number","markdown", ] def get_leaderboard(): if repo: repo.git_pull() all_data = [] for model in model_directories: model_data = {"base_model": None} model_data = {"eval_name": model} for benchmark, metric in zip(BENCHMARKS, METRICS): value, base_model = load_results(model, benchmark, metric) model_data[BENCH_TO_NAME[benchmark]] = value if base_model is not None: # in case the last benchmark failed model_data["base_model"] = base_model model_data["total"] = sum(model_data[benchmark] for benchmark in BENCH_TO_NAME.values()) if model_data["base_model"] is not None: model_data["base_model"] = make_clickable_model(model_data["base_model"]) all_data.append(model_data) dataframe = pd.DataFrame.from_records(all_data) dataframe = dataframe.sort_values(by=['total'], ascending=False) dataframe = dataframe[COLS] return dataframe leaderboard = get_leaderboard() block = gr.Blocks() with block: gr.Markdown(f""" # H4 Model Evaluation leaderboard using the LMEH benchmark suite . Evaluation is performed against 4 popular benchmarks AI2 Reasoning Challenge, HellaSwag, MMLU, and TruthFul QC MC. To run your own benchmarks, refer to the README in the H4 repo. """) with gr.Row(): leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS, datatype=TYPES, max_rows=5) with gr.Row(): refresh_button = gr.Button("Refresh") refresh_button.click(get_leaderboard, inputs=[], outputs=leaderboard_table) block.launch() def refresh_leaderboard(): leaderboard_table = get_leaderboard() print("leaderboard updated") scheduler = BackgroundScheduler() scheduler.add_job(func=refresh_leaderboard, trigger="interval", seconds=300) # refresh every 5 mins scheduler.start()