lukecq's picture
update results to include SeaBench and private dataset
4ecf403
import gradio as gr
import pandas as pd
import os
from huggingface_hub import snapshot_download, login
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter
from src.display.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
CONTACT_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
SUB_TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import API
from src.leaderboard.load_results import load_data
# Download the evaluation-results dataset into the local cache directory.
TOKEN = os.environ.get("TOKEN", None)
# Only authenticate when a token is actually configured: calling login()
# without a token falls back to an interactive prompt, which cannot be
# answered when the app runs headless.
if TOKEN:
    login(token=TOKEN)

RESULTS_REPO = "SeaLLMs/SeaExam-results"
# Results are stored under HF_HOME when it is set, otherwise under the
# current working directory.
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
print(EVAL_RESULTS_PATH)

snapshot_download(
    repo_id=RESULTS_REPO,
    local_dir=EVAL_RESULTS_PATH,
    repo_type="dataset",
    token=TOKEN,
)
def restart_space():
    """Restart the ``SeaLLMs/SeaExam_leaderboard`` Space via the project ``API`` client.

    Uses the module-level ``TOKEN`` for authentication. Returns nothing.
    """
    space_id = "SeaLLMs/SeaExam_leaderboard"
    API.restart_space(repo_id=space_id, token=TOKEN)
# Column layouts for the leaderboard tables. Each TYPES* list gives the
# display datatype of the column at the same position in the matching
# show_columns* list.
_LEADING_COLUMNS = ['R', 'Model', 'type', 'open?']
_LEADING_TYPES = ['number', 'markdown', 'str', 'str']
_PARAMS_COLUMN = '#P(B)'
_DETAIL_METRICS = [
    'avg-pub', 'avg-prv ⬇️',
    'id-pub', 'th-pub', 'vi-pub',
    'id-prv', 'th-prv', 'vi-prv',
]
_OVERALL_METRICS = ['SeaExam-pub', 'SeaExam-prv ⬇️', 'SeaBench-pub', 'SeaBench-prv']

# Same columns in both lists; only the position of the parameter count differs.
all_columns = [*_LEADING_COLUMNS, *_DETAIL_METRICS, _PARAMS_COLUMN]
show_columns = [*_LEADING_COLUMNS, _PARAMS_COLUMN, *_DETAIL_METRICS]
# Everything after the four leading columns is numeric.
TYPES = _LEADING_TYPES + ['number'] * (len(show_columns) - len(_LEADING_TYPES))

show_columns_overall = [*_LEADING_COLUMNS, _PARAMS_COLUMN, *_OVERALL_METRICS]
TYPES_overall = _LEADING_TYPES + ['number'] * (len(show_columns_overall) - len(_LEADING_TYPES))
# Location of the results CSV inside the downloaded results snapshot.
csv_path = EVAL_RESULTS_PATH + '/SeaExam_results_20241030.csv'

# Raw read of the file: the first line is skipped and the following line is
# used as the header row.
df = pd.read_csv(csv_path, header=0, skiprows=1)

# Per-tab tables (SeaExam, SeaBench, overall) produced by the project loader.
df_seaexam, df_seabench, df_overall = load_data(csv_path)
def _add_leaderboard(frame, columns, datatypes, size_filter_label=None):
    """Create one leaderboard table in the currently open Gradio context.

    Args:
        frame: DataFrame holding the results; only ``columns`` are displayed.
        columns: Column names to show, which are also the default selection.
        datatypes: Display datatype for each entry of ``columns``.
        size_filter_label: Optional label for the ``#P(B)`` filter. When
            omitted, the filter is created without an explicit label.

    Returns:
        The created ``Leaderboard`` component.
    """
    size_filter_kwargs = {"default": [7, 9]}
    if size_filter_label is not None:
        size_filter_kwargs["label"] = size_filter_label

    return Leaderboard(
        value=frame[columns],
        select_columns=SelectColumns(
            default_selection=columns,
            # The rank and model name must always stay visible.
            cant_deselect=["R", "Model"],
            label="Select Columns to Display:",
        ),
        search_columns=["Model"],
        filter_columns=[
            "type",
            "open?",
            ColumnFilter("#P(B)", **size_filter_kwargs),
        ],
        datatype=datatypes,
    )


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.HTML(SUB_TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Tab("🏅 Overall"):
            _add_leaderboard(
                df_overall,
                show_columns_overall,
                TYPES_overall,
                size_filter_label="Parameters(B)",
            )

        with gr.Tab("SeaExam"):
            _add_leaderboard(df_seaexam, show_columns, TYPES)

        with gr.Tab("SeaBench"):
            _add_leaderboard(df_seabench, show_columns, TYPES)

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    # NOTE: the citation accordion (CITATION_BUTTON_TEXT /
    # CITATION_BUTTON_LABEL) is currently disabled.
    # NOTE(review): the contact text is assumed to sit below the tabs rather
    # than inside the "About" tab -- confirm against the intended layout.
    gr.Markdown(CONTACT_TEXT, elem_classes="markdown-text")
# Periodically restart the Space (every 1800 seconds). A restart re-runs this
# script from the top, including the results snapshot download.
# The scheduler must be started BEFORE launching: launch() blocks the main
# thread while the server is running, so anything placed after it would not
# execute until the app shuts down.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

# Launch exactly once, with the request queue configured.
demo.queue(default_concurrency_limit=40).launch(share=True)