# Clémentine
# init
# 728a44a
import json
import os
from datetime import datetime, timezone
import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from src.assets.text_content import *
from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
from src.assets.css_html_js import custom_css, get_window_url_params # left in case you need them
from src.utils_display import EloEvalColumn, fields, styled_error, styled_warning, styled_message
from src.init import load_all_info_from_hub
# Runtime configuration: the Hub token, the two evaluation-data repos that are
# loaded below, and feature flags for this Space.
H4_TOKEN = os.environ.get("H4_TOKEN", None)

HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"

# Environment variables are always strings, and bool() of any non-empty string
# is True -- so bool(os.environ.get(...)) could never be switched off with
# IS_PUBLIC=False or IS_PUBLIC=0. Parse the text instead; an unset variable
# still defaults to True and an empty one still yields False.
IS_PUBLIC = os.environ.get("IS_PUBLIC", "true").strip().lower() not in {
    "",
    "0",
    "false",
    "no",
    "off",
}
ADD_PLOTS = False

EVAL_REQUESTS_PATH = "auto_evals/eval_requests"
# Shared Hub client used by restart_space().
api = HfApi()


def restart_space():
    """Request a restart of the leaderboard Space on the Hub.

    Authenticates with ``H4_TOKEN``; any return value of the API call is
    discarded.
    """
    space_id = "HuggingFaceH4/open_llm_leaderboard"
    api.restart_space(repo_id=space_id, token=H4_TOKEN)
# Load both evaluation repos at import time. human_eval_repo may be falsy
# (get_elo_leaderboard checks it before pulling); presumably these are local
# git clones of the Hub repos -- confirm in src.init.
human_eval_repo, gpt_4_eval_repo = load_all_info_from_hub(
    HUMAN_EVAL_REPO,
    GPT_4_EVAL_REPO,
)

# Column names / datatypes of the Elo table, taken from the EloEvalColumn
# field declarations, plus the column the table is ranked by.
ELO_COLS = [column.name for column in fields(EloEvalColumn)]
ELO_TYPES = [column.type for column in fields(EloEvalColumn)]
ELO_SORT_COL = EloEvalColumn.gpt4.name
def has_no_nan_values(df, columns):
    """Return a row-wise boolean mask: True where none of ``columns`` is missing.

    Args:
        df: DataFrame to inspect.
        columns: list of column labels to check.

    Returns:
        A boolean Series indexed like ``df``.
    """
    selected = df[columns]
    is_present = selected.notna()
    return is_present.all(axis=1)
def has_nan_values(df, columns):
    """Return a row-wise boolean mask: True where any of ``columns`` is missing.

    Args:
        df: DataFrame to inspect.
        columns: list of column labels to check.

    Returns:
        A boolean Series indexed like ``df``.
    """
    selected = df[columns]
    is_missing = selected.isna()
    return is_missing.any(axis=1)
def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
    """Build the Elo leaderboard table, ranked by ``ELO_SORT_COL``.

    If ``human_eval_repo`` was loaded, it is pulled first. The Elo records
    from ``get_elo_results_dicts`` are then turned into a DataFrame, sorted
    in descending order of ``ELO_SORT_COL`` and restricted to ``ELO_COLS``.

    Args:
        df_instruct: comparison data forwarded to ``get_elo_results_dicts``.
        df_code_instruct: comparison data forwarded to ``get_elo_results_dicts``.
        tie_allowed: forwarded as-is to ``get_elo_results_dicts``.

    Returns:
        The sorted DataFrame with exactly the ``ELO_COLS`` columns.
    """
    if human_eval_repo:
        print("Pulling human_eval_repo changes")
        human_eval_repo.git_pull()

    records = get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed)
    ranked = pd.DataFrame.from_records(records).sort_values(
        by=ELO_SORT_COL, ascending=False
    )
    return ranked[ELO_COLS]
def get_elo_elements():
    """Read the human-eval JSON files and derive both leaderboards and the plots.

    Returns:
        A 6-tuple ``(leaderboard, leaderboard_with_ties, plot_1, plot_2,
        plot_3, plot_4)``. The first leaderboard and the plots are computed
        with ``tie_allowed=False``, the second leaderboard with
        ``tie_allowed=True``.
    """
    instruct_df = pd.read_json("human_evals/without_code.json")
    code_instruct_df = pd.read_json("human_evals/with_code.json")

    # Same order as before: the no-tie table is computed first, then the
    # tie-allowed one.
    no_tie_board, tie_board = [
        get_elo_leaderboard(instruct_df, code_instruct_df, tie_allowed=allow_ties)
        for allow_ties in (False, True)
    ]

    fig_1, fig_2, fig_3, fig_4 = get_elo_plots(
        instruct_df, code_instruct_df, tie_allowed=False
    )

    return no_tie_board, tie_board, fig_1, fig_2, fig_3, fig_4
# Compute the leaderboards and plots once, at import time; the UI below reads
# these module-level names.
elo_leaderboard, elo_leaderboard_with_tie_allowed, plot_1, plot_2, plot_3, plot_4 = (
    get_elo_elements()
)
# Gradio UI: title and introduction, the two Elo leaderboard tables, the
# optional plots section (only when ADD_PLOTS is set), then citation/changelog.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    with gr.Row():
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Column():
        # Header row: explanatory text next to the logo.
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
            with gr.Column(scale=1):
                gr.Image(
                    "src/assets/scale-hf-logo.png",
                    elem_id="scale-logo",
                    show_label=False,
                )

        gr.Markdown("## No tie allowed")
        elo_leaderboard_table = gr.components.Dataframe(
            value=elo_leaderboard,
            headers=ELO_COLS,
            datatype=ELO_TYPES,
            max_rows=5,
        )

        gr.Markdown("## Tie allowed*")
        elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
            value=elo_leaderboard_with_tie_allowed,
            headers=ELO_COLS,
            datatype=ELO_TYPES,
            max_rows=5,
        )

        # "\\*" keeps the runtime text (backslash + asterisk) unchanged while
        # avoiding the invalid "\*" escape sequence, which Python warns about.
        gr.Markdown(
            "\\* Results when the scores of 4 and 5 were treated as ties.",
            elem_classes="markdown-text",
        )

        gr.Markdown(
            "Let us know in [this discussion](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/65) which models we should add!",
            elem_id="models-to-add-text",
        )

    if ADD_PLOTS:
        with gr.Box():
            visualization_title = gr.HTML(VISUALIZATION_TITLE)
            with gr.Row():
                with gr.Column():
                    gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
                    # Each plot_N name is rebound from the figure to the
                    # component that displays it.
                    plot_1 = gr.Plot(plot_1, show_label=False)
                with gr.Column():
                    gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
                    plot_2 = gr.Plot(plot_2, show_label=False)
            with gr.Row():
                with gr.Column():
                    gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
                    plot_3 = gr.Plot(plot_3, show_label=False)
                with gr.Column():
                    gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
                    plot_4 = gr.Plot(plot_4, show_label=False)

    with gr.Row():
        with gr.Column():
            with gr.Accordion("📙 Citation", open=False):
                citation_button = gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    elem_id="citation-button",
                ).style(show_copy_button=True)
        with gr.Column():
            with gr.Accordion("✨ CHANGELOG", open=False):
                changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
# Call restart_space() once an hour from a background thread; presumably this
# is how the Space picks up fresh evaluation data -- confirm with the owners.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=60 * 60)
scheduler.start()

# Enable request queuing, then start serving the app.
demo.queue(concurrency_count=40).launch()