|
import gradio as gr
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
import plotly.express as px
|
|
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
|
from apscheduler.schedulers.background import BackgroundScheduler
|
|
from huggingface_hub import snapshot_download
|
|
|
|
from src.about import (
|
|
CITATION_BUTTON_LABEL,
|
|
CITATION_BUTTON_TEXT,
|
|
EVALUATION_QUEUE_TEXT,
|
|
INTRODUCTION_TEXT,
|
|
LLM_BENCHMARKS_TEXT,
|
|
TITLE,
|
|
)
|
|
from src.display.css_html_js import custom_css
|
|
from src.display.utils import (
|
|
BENCHMARK_COLS,
|
|
COLS,
|
|
EVAL_COLS,
|
|
EVAL_TYPES,
|
|
AutoEvalColumn,
|
|
ModelType,
|
|
fields,
|
|
WeightType,
|
|
Precision
|
|
)
|
|
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
|
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
|
from src.submission.submit import add_new_eval
|
|
import base64
|
|
|
|
|
|
def restart_space():
    """Call ``API.restart_space`` for the repository named by ``REPO_ID``.

    Takes no arguments and returns nothing; it is the callback handed to the
    background scheduler at the bottom of this module.
    """
    API.restart_space(repo_id=REPO_ID)
|
|
|
|
|
|
|
|
def make_rate_chart(df: pd.DataFrame):
    """Build a grouped Plotly bar chart of the two hallucination-rate columns.

    Args:
        df: Table with a ``"Models"`` column plus the
            ``"RAG Hallucination Rate (%)"`` and
            ``"Non-RAG Hallucination Rate (%)"`` columns.

    Returns:
        The figure produced by ``px.bar``, one bar group per model.
    """
    rate_columns = ["RAG Hallucination Rate (%)", "Non-RAG Hallucination Rate (%)"]

    # Unpivot to long form so each (model, benchmark) pair becomes one row,
    # which lets Plotly colour and group the bars by benchmark.
    long_form = pd.melt(
        df,
        id_vars="Models",
        value_vars=rate_columns,
        var_name="Benchmark",
        value_name="Rate",
    )

    chart = px.bar(
        long_form,
        x="Models",
        y="Rate",
        color="Benchmark",
        barmode="group",
        title="Hallucination Rates by Model",
        height=400,
    )
    chart.update_layout(xaxis_title="", yaxis_title="%")
    return chart
|
|
|
|
def make_leaderboard_plot(df: pd.DataFrame, col: str, title: str, bar_color: str):
    """Return a horizontal bar chart of ``col`` per model, lowest value on top.

    The rows are deliberately sorted in *descending* order of ``col``: Plotly
    places categorical y-axis entries from the bottom upwards in data order,
    so the largest value ends up at the bottom and the lowest (best) value at
    the top of the chart.

    Args:
        df: Table with a ``"Models"`` column and the numeric column ``col``.
        col: Name of the column to plot on the x-axis.
        title: Chart title.
        bar_color: Single colour used for every bar.

    Returns:
        The figure produced by ``px.bar``.
    """
    ranked = df.sort_values(col, ascending=False)

    fig = px.bar(
        ranked,
        x=col,
        y="Models",
        orientation="h",
        title=title,
        text_auto=".2f",
        height=400,
        color_discrete_sequence=[bar_color],
    )

    # Value labels sit outside the bars; cliponaxis=False keeps them from
    # being cut off at the plot area's edge.
    fig.update_traces(textposition="outside", cliponaxis=False)

    fig.update_layout(
        xaxis_title="Hallucination Rate (%)",
        yaxis_title="",
        yaxis=dict(dtick=1),  # one tick per category so every model is labelled
        margin=dict(l=140, r=60, t=60, b=40),
    )
    return fig
|
|
|
|
|
|
def color_scale(s, cmap):
    """Return one CSS ``background-color`` declaration per value of ``s``.

    Values are min-max normalised and then inverted (``1 - norm``), so the
    smallest value maps to the *last* colour of the palette and the largest
    value to the *first*. Works with any palette length.

    Args:
        s: Numeric pandas Series.
        cmap: Either the name of a palette in ``px.colors.sequential`` or an
            explicit sequence of CSS colour strings.

    Returns:
        A list of style strings, one per element of ``s``. Missing values
        (NaN) yield an empty string, i.e. no styling, instead of raising.

    Raises:
        KeyError: If ``cmap`` is a name that ``px.colors.sequential`` lacks.
        ValueError: If the palette is empty.
    """
    if isinstance(cmap, str):
        colours = vars(px.colors.sequential)[cmap]
    else:
        colours = list(cmap)
    if not colours:
        raise ValueError("Colour palette must contain at least one colour.")
    last_index = len(colours) - 1

    lowest = s.min()
    span = s.max() - lowest
    # A constant series has zero span and an empty/all-NaN series has a NaN
    # span; dividing by 1 instead keeps the normalisation well defined.
    if pd.isna(span) or span == 0:
        span = 1
    inverted = 1 - (s - lowest) / span

    return [
        "" if pd.isna(v) else f"background-color:{colours[int(v * last_index)]}"
        for v in inverted
    ]
|
|
|
|
|
|
|
|
# Best-effort download of the queue and results datasets into their local
# directories. A failure is reported and skipped so the app can still start.
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception as exc:
    # Report the caught instance (not the Exception class) and name the
    # dataset that actually failed.
    print(f"[WARN] Skipping QUEUE sync: {exc}")

try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception as exc:
    print(f"[WARN] Skipping RESULTS sync: {exc}")
|
|
|
|
|
|
|
|
# Module-level table that feeds both bar charts and the leaderboard widget
# below; built once at import time by src.populate.get_leaderboard_df.
LEADERBOARD_DF = get_leaderboard_df("leaderboard/data/leaderboard.csv")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def init_leaderboard(df: pd.DataFrame):
    """Create the ``Leaderboard`` component that displays ``df``.

    Args:
        df: Leaderboard table; must be non-empty.

    Returns:
        A non-interactive ``Leaderboard`` searchable by the ``"Models"``
        column, with all five columns selected by default.

    Raises:
        ValueError: If ``df`` is ``None`` or has no rows.
    """
    has_data = df is not None and not df.empty
    if not has_data:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    shown_by_default = [
        "Rank",
        "Models",
        "Average Hallucination Rate (%)",
        "RAG Hallucination Rate (%)",
        "Non-RAG Hallucination Rate (%)",
    ]
    column_picker = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=["Models", "Rank"],
        label="Select Columns to Display:",
    )

    return Leaderboard(
        value=df,
        datatype=["markdown", "markdown", "number", "number", "number"],
        select_columns=column_picker,
        search_columns=["Models"],
        bool_checkboxgroup_label=None,
        interactive=False,
    )
|
|
|
|
# Read the logo and base64-encode it so the page header can embed it
# directly in a data: URI.
image_path = "static/kluster-color.png"
with open(image_path, mode="rb") as img_file:
    _logo_bytes = img_file.read()
b64_string = base64.b64encode(_logo_bytes).decode("utf-8")
|
|
|
|
|
|
|
|
# Top-level Gradio app: a header banner followed by three tabs
# (leaderboard, details, submission instructions).
demo = gr.Blocks(css=custom_css)
with demo:
    # Header: inlined logo, page title and a tagline linking to kluster.ai.
    gr.HTML(f"""
    <div style="text-align: center; margin-top: 2em; margin-bottom: 1em;">
        <img src="data:image/png;base64,{b64_string}" alt="kluster.ai logo"
             style="height: 80px; display: block; margin-left: auto; margin-right: auto;" />

        <div style="font-size: 2.5em; font-weight: bold; margin-top: 0.4em; color: var(--text-color);">
            LLM Hallucination Detection Leaderboard
        </div>

        <div style="font-size: 1.5em; margin-top: 0.5em;">
            Evaluating factual accuracy and faithfulness of LLMs in both RAG and real-world knowledge settings with
            <a href="https://platform.kluster.ai/verify" target="_blank">
                Verify
            </a> by
            <a href="https://platform.kluster.ai/" target="_blank">
                kluster.ai
            </a>
        </div>
    </div>
    """)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # The label's leading emoji (U+1F3C5, sports medal) had been
        # mis-decoded into "π" plus a line break, which split the string
        # literal in two; it is written as an escape so it survives
        # re-encoding.
        with gr.TabItem("\U0001F3C5 Hallucination Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            # Two bar charts, one per benchmark, in a single row.
            with gr.Row():
                gr.Plot(
                    make_leaderboard_plot(
                        LEADERBOARD_DF,
                        "RAG Hallucination Rate (%)",
                        "RAG Hallucination Rate (lower is better)",
                        bar_color="#4CAF50",
                    ),
                    show_label=False,
                )
                gr.Plot(
                    make_leaderboard_plot(
                        LEADERBOARD_DF,
                        "Non-RAG Hallucination Rate (%)",
                        "Non-RAG Hallucination Rate (lower is better)",
                        bar_color="#FF7043",
                    ),
                    show_label=False,
                )

            leaderboard = init_leaderboard(LEADERBOARD_DF)

        # NOTE(review): the leading "π" in the next two labels looks like the
        # same mis-decoded emoji; confirm the intended characters and restore.
        with gr.TabItem("π Details", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown((Path(__file__).parent / "docs.md").read_text())

        with gr.TabItem("π Submit Here! ", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown((Path(__file__).parent / "submit.md").read_text())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run restart_space every 1800 seconds (30 minutes) on a background
# scheduler, then enable request queuing and launch the app.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=30 * 60)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch()