Ryan McConville
tweak styling on main leaderboard
23f0448
raw
history blame
11.9 kB
import gradio as gr
import pandas as pd
from pathlib import Path
import plotly.express as px
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
import base64
def restart_space():
    """Ask `API` to restart the Space identified by `REPO_ID`.

    Takes no arguments and returns nothing; it is registered further down
    as the periodic job of the background scheduler.
    """
    API.restart_space(repo_id=REPO_ID)
def make_rate_chart(df: pd.DataFrame):
    """Build a grouped Plotly bar chart comparing both hallucination rates.

    Args:
        df: Frame with a "Models" column plus the two rate columns
            "RAG Hallucination Rate (%)" and "Non-RAG Hallucination Rate (%)".

    Returns:
        The figure produced by `px.bar`, one group of bars per model and one
        bar colour per benchmark.
    """
    rate_columns = ["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"]

    # Reshape wide -> long so that each (model, benchmark) pair is one row,
    # which is the layout `px.bar` needs to colour/group by benchmark.
    tidy = df.melt(
        id_vars="Models",
        value_vars=rate_columns,
        var_name="Benchmark",
        value_name="Rate",
    )

    chart = px.bar(
        tidy,
        x="Models",
        y="Rate",
        color="Benchmark",
        barmode="group",
        title="Hallucination Rates by Model",
        height=400,
    )
    chart.update_layout(xaxis_title="", yaxis_title="%")
    return chart
def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """
    Return a horizontal bar chart of `col` per model, best model at the top.

    The rows are sorted by `col` in *descending* order before plotting.
    Plotly places the categories of a horizontal bar chart starting from the
    bottom of the axis, so the first (highest, i.e. worst) row is drawn at the
    bottom and the lowest value (best) ends up at the top.

    Args:
        df: Frame containing a "Models" column and the numeric column `col`.
        col: Name of the column plotted on the x axis and used for sorting.
        title: Chart title.
        bar_color: Single colour applied to every bar.

    Returns:
        The figure produced by `px.bar`.
    """
    # Descending sort: worst (highest rate) first -> drawn at the bottom.
    df_sorted = df.sort_values(col, ascending=False)
    fig = px.bar(
        df_sorted,
        x=col,
        y="Models",
        orientation="h",
        title=title,
        text_auto=".2f",
        height=400,
        color_discrete_sequence=[bar_color],
    )
    # Value labels sit outside the bars; cliponaxis=False keeps the label of
    # the longest bar from being cut off at the plot edge.
    fig.update_traces(textposition="outside", cliponaxis=False)
    fig.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),  # ensure every model shown
        margin=dict(l=140, r=60, t=60, b=40),
    )
    return fig
def color_scale(s, cmap):
    """
    Return background-colour styles for a numeric Series.

    Values are min-max normalised and then inverted, so the smallest value
    maps to the *last* colour of the palette and the largest value to the
    first one. Works with any palette length.

    Args:
        s: Numeric pandas Series to colour.
        cmap: Name of a palette in `px.colors.sequential` (a `KeyError` is
            raised for an unknown name).

    Returns:
        A list with one CSS string per element of `s`. Missing values (NaN)
        get an empty style instead of raising.
    """
    colours = vars(px.colors.sequential)[cmap]
    n = len(colours) - 1  # max valid index

    lowest = s.min()
    rng = s.max() - lowest
    # A constant series (rng == 0) or an all-NaN/empty one (rng is NaN) has no
    # spread to normalise by; fall back to 1 to avoid dividing by zero/NaN.
    if pd.isna(rng) or rng == 0:
        rng = 1
    norm = (s - lowest) / rng

    styles = []
    for v in 1 - norm:
        if pd.isna(v):
            # int(NaN) would raise ValueError; leave missing cells unstyled.
            styles.append("")
        else:
            styles.append(f"background-color:{colours[int(v * n)]}")
    return styles
### Space initialisation
# Both dataset syncs are best-effort: a failure is reported and start-up
# continues (the restart-on-failure call is intentionally left disabled).
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception as exc:
    # restart_space()
    # Report the caught error itself (not the Exception class) and name the
    # repository that actually failed.
    print(f"[WARN] Skipping QUEUE sync: {exc}")
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception as exc:
    # restart_space()
    print(f"[WARN] Skipping RESULTS sync: {exc}")

# The leaderboard is currently built from a CSV path rather than from the
# downloaded results/requests directories.
# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")
# (
#     finished_eval_queue_df,
#     running_eval_queue_df,
#     pending_eval_queue_df,
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
def init_leaderboard(df: pd.DataFrame):
    """Create the read-only `Leaderboard` component for `df`.

    Args:
        df: Leaderboard data; the component is configured for the columns
            "Rank", "Models" and the three hallucination-rate columns.

    Returns:
        A non-interactive `Leaderboard` searchable by the "Models" column.

    Raises:
        ValueError: If `df` is `None` or has no rows/columns.
    """
    if df is None or df.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Columns shown by default; "Models" and "Rank" can never be hidden.
    shown_by_default = [
        "Rank", "Models",
        "Average Hallucination Rate (%)",
        "RAG Hallucination Rate (%)",
        "Non-RAG Hallucination Rate (%)",
    ]
    column_picker = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=["Models", "Rank"],
        label="Select Columns to Display:",
    )

    # One datatype entry per column listed above, in the same order.
    column_types = ["markdown", "markdown", "number", "number", "number"]

    return Leaderboard(
        value=df,
        datatype=column_types,
        select_columns=column_picker,
        search_columns=["Models"],
        # column_widths=["3%"],
        bool_checkboxgroup_label=None,
        interactive=False,
    )
# Read the logo once and base64-encode it so it can be inlined into the
# header HTML as a data URI.
image_path = "static/kluster-color.png"
with open(image_path, "rb") as img_file:
    b64_string = base64.b64encode(img_file.read()).decode("utf-8")

# Directory of this file; the markdown shown in the tabs lives next to it.
_APP_DIR = Path(__file__).parent

# Page header: logo, title and tagline with links to kluster.ai.
_HEADER_HTML = f"""
<div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">
<img src="data:image/png;base64,{b64_string}" alt="kluster.ai logo"
style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />
<div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em; color: var(--text-color);">
LLM Hallucination Detection Leaderboard
</div>
<div style="font-size: 1.5em; margin-top: 0.5em;">
Evaluating factual accuracy and faithfulness of LLMs in both RAG and real-world knowledge settings with
<a href="https://platform.kluster.ai/verify" target="_blank">
Verify
</a> by
<a href="https://platform.kluster.ai/" target="_blank">
kluster.ai
</a>
</div>
</div>
"""

# print("CUSTOM CSS\n", custom_css[-1000:], "\n---------")
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(_HEADER_HTML)
    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Hallucination Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            # ---------- Chart ----------
            # One bar chart per benchmark, placed in the same row.
            with gr.Row():
                gr.Plot(
                    make_leaderboard_plot(
                        LEADERBOARD_DF,
                        "RAG Hallucination Rate (%)",
                        "RAG Hallucination Rate (lower is better)",
                        bar_color="#4CAF50",
                    ),
                    show_label=False,
                )
                gr.Plot(
                    make_leaderboard_plot(
                        LEADERBOARD_DF,
                        "Non-RAG Hallucination Rate (%)",
                        "Non-RAG Hallucination Rate (lower is better)",
                        bar_color="#FF7043",
                    ),
                    show_label=False,
                )
            # ---------- Leaderboard ----------
            leaderboard = init_leaderboard(LEADERBOARD_DF)
        with gr.TabItem("📝 Details", elem_id="llm-benchmark-tab-table", id=2):
            # Explicit UTF-8 so the markdown decodes the same way regardless
            # of the process locale.
            gr.Markdown((_APP_DIR / "docs.md").read_text(encoding="utf-8"))
        with gr.TabItem("🚀 Submit Here! ", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown((_APP_DIR / "submit.md").read_text(encoding="utf-8"))
# with gr.Column():
# with gr.Row():
# gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
# with gr.Column():
# with gr.Accordion(
# f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
# open=False,
# ):
# with gr.Row():
# finished_eval_table = gr.components.Dataframe(
# value=finished_eval_queue_df,
# headers=EVAL_COLS,
# datatype=EVAL_TYPES,
# row_count=5,
# )
# with gr.Accordion(
# f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
# open=False,
# ):
# with gr.Row():
# running_eval_table = gr.components.Dataframe(
# value=running_eval_queue_df,
# headers=EVAL_COLS,
# datatype=EVAL_TYPES,
# row_count=5,
# )
# with gr.Accordion(
# f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
# open=False,
# ):
# with gr.Row():
# pending_eval_table = gr.components.Dataframe(
# value=pending_eval_queue_df,
# headers=EVAL_COLS,
# datatype=EVAL_TYPES,
# row_count=5,
# )
# with gr.Row():
# gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
# with gr.Row():
# with gr.Column():
# model_name_textbox = gr.Textbox(label="Model name")
# revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
# model_type = gr.Dropdown(
# choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
# label="Model type",
# multiselect=False,
# value=None,
# interactive=True,
# )
# with gr.Column():
# precision = gr.Dropdown(
# choices=[i.value.name for i in Precision if i != Precision.Unknown],
# label="Precision",
# multiselect=False,
# value="float16",
# interactive=True,
# )
# weight_type = gr.Dropdown(
# choices=[i.value.name for i in WeightType],
# label="Weights type",
# multiselect=False,
# value="Original",
# interactive=True,
# )
# base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
# submit_button = gr.Button("Submit Eval")
# submission_result = gr.Markdown()
# submit_button.click(
# add_new_eval,
# [
# model_name_textbox,
# base_model_name_textbox,
# revision_name_textbox,
# precision,
# weight_type,
# model_type,
# ],
# submission_result,
# )
# with gr.Row():
# with gr.Accordion("📙 Citation", open=False):
# citation_button = gr.Textbox(
# value=CITATION_BUTTON_TEXT,
# label=CITATION_BUTTON_LABEL,
# lines=20,
# elem_id="citation-button",
# show_copy_button=True,
# )
# Schedule `restart_space` to run in the background every 1800 seconds
# (30 minutes), then serve the UI.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=1800)
scheduler.start()

# Enable the request queue (default concurrency limit of 40) and launch.
demo.queue(default_concurrency_limit=40).launch()