# Hugging Face Space (status when captured: Sleeping)
import json | |
from pathlib import Path | |
from typing import List, Dict | |
import gradio as gr | |
from pydantic import BaseModel, Field, validator | |
# --------------- Configuration --------------- | |
# JSON file holding the persisted leaderboard; a relative path, so it is
# resolved against the process's current working directory.
LEADERBOARD_PATH = Path("leaderboard_data.json")
# Example model identifier in "org/model" form.
DEFAULT_MODEL_NAME = "example/model"
# --------------- Data models --------------- | |
class Metrics(BaseModel):
    """Subjective quality scores attached to one leaderboard entry.

    Every score is required and must lie in the closed interval [0.0, 1.0];
    pydantic rejects values outside that range when the model is built.
    """

    readability: float = Field(..., ge=0.0, le=1.0)
    relevance: float = Field(..., ge=0.0, le=1.0)
    explanation_clarity: float = Field(..., ge=0.0, le=1.0)
    problem_identification: float = Field(..., ge=0.0, le=1.0)
    actionability: float = Field(..., ge=0.0, le=1.0)
    completeness: float = Field(..., ge=0.0, le=1.0)
    specificity: float = Field(..., ge=0.0, le=1.0)
    contextual_adequacy: float = Field(..., ge=0.0, le=1.0)
    consistency: float = Field(..., ge=0.0, le=1.0)
    brevity: float = Field(..., ge=0.0, le=1.0)
class LeaderboardEntry(BaseModel):
    """One leaderboard row: a model name, its headline scores and its metrics."""

    model_name: str
    bleu: float
    llm_pass_1: float
    llm_pass_5: float
    llm_pass_10: float
    metrics: Metrics

    # Registered as a pydantic validator so the range check actually runs for
    # each headline score when an entry is constructed.
    @validator("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10")
    def score_range(cls, v: float):
        """Return ``v`` unchanged if it lies in [0, 1].

        Raises:
            ValueError: If ``v`` is outside the closed interval [0.0, 1.0].
        """
        if not 0.0 <= v <= 1.0:
            raise ValueError("Scores should be between 0 and 1")
        return v
# --------------- Persistence helpers --------------- | |
def _load_leaderboard() -> List[Dict]:
    """Read the stored leaderboard entries from ``LEADERBOARD_PATH``.

    Returns:
        The value stored under the ``"leaderboard"`` key of the JSON file,
        or an empty list when the file does not exist or the key is absent.
    """
    if LEADERBOARD_PATH.exists():
        with LEADERBOARD_PATH.open("r", encoding="utf-8") as handle:
            payload = json.load(handle)
        return payload.get("leaderboard", [])
    return []
def _save_leaderboard(data: List[Dict]):
    """Overwrite ``LEADERBOARD_PATH`` with ``data`` wrapped under ``"leaderboard"``.

    Args:
        data: Leaderboard entries to persist; written as indented JSON.
    """
    with LEADERBOARD_PATH.open("w", encoding="utf-8") as handle:
        json.dump({"leaderboard": data}, handle, indent=2)
# --------------- Utility --------------- | |
def _flatten_entry(entry: Dict) -> Dict: | |
"""Flatten nested metrics so that every metric is a column.""" | |
flat = { | |
"Model": entry["model_name"], | |
"BLEU": entry["bleu"], | |
"Pass@1": entry["llm_pass_1"], | |
"Pass@5": entry["llm_pass_5"], | |
"Pass@10": entry["llm_pass_10"], | |
} | |
for metric_name, score in entry["metrics"].items(): | |
flat[metric_name.replace("_", " ").title()] = score | |
return flat | |
def _table_data() -> List[Dict]:
    """Return the stored leaderboard as flattened rows, highest Pass@1 first."""

    def pass_at_1(row: Dict):
        return row["llm_pass_1"]

    rows = _load_leaderboard()
    # Descending order: the best Pass@1 score ends up at the top of the table.
    rows.sort(key=pass_at_1, reverse=True)
    return list(map(_flatten_entry, rows))
# --------------- Gradio callbacks --------------- | |
def submit_model(
    model_name: str,
    bleu: float,
    llm_pass_1: float,
    llm_pass_5: float,
    llm_pass_10: float,
    readability: float,
    relevance: float,
    explanation_clarity: float,
    problem_identification: float,
    actionability: float,
    completeness: float,
    specificity: float,
    contextual_adequacy: float,
    consistency: float,
    brevity: float,
):
    """Validate a submission and store it in the leaderboard.

    An existing entry with the same (whitespace-stripped) model name is
    replaced rather than duplicated.

    Args:
        model_name: Name of the submitted model; surrounding whitespace is
            stripped and a blank name is rejected.
        bleu, llm_pass_1, llm_pass_5, llm_pass_10: Headline scores.
        readability, relevance, explanation_clarity, problem_identification,
        actionability, completeness, specificity, contextual_adequacy,
        consistency, brevity: Per-metric scores stored under ``metrics``.

    Returns:
        A pair of ``gr.update`` objects: the refreshed table rows and a
        status message describing success or the reason for failure.
    """
    # Guard against a missing value as well as a whitespace-only name, so an
    # anonymous row can never be written to the leaderboard.
    cleaned_name = (model_name or "").strip()
    if not cleaned_name:
        return (
            gr.update(value=_table_data()),
            gr.update(value="❌ Submission failed: model name must not be empty"),
        )

    try:
        entry = LeaderboardEntry(
            model_name=cleaned_name,
            bleu=bleu,
            llm_pass_1=llm_pass_1,
            llm_pass_5=llm_pass_5,
            llm_pass_10=llm_pass_10,
            metrics={
                "readability": readability,
                "relevance": relevance,
                "explanation_clarity": explanation_clarity,
                "problem_identification": problem_identification,
                "actionability": actionability,
                "completeness": completeness,
                "specificity": specificity,
                "contextual_adequacy": contextual_adequacy,
                "consistency": consistency,
                "brevity": brevity,
            },
        )
    except Exception as e:
        # UI boundary: any construction/validation problem is reported to the
        # user instead of propagating out of the callback.
        return gr.update(value=_table_data()), gr.update(value=f"❌ Submission failed: {e}")

    # Drop any previous entry for this model so the new one replaces it.
    data = [d for d in _load_leaderboard() if d["model_name"] != entry.model_name]
    data.append(entry.dict())
    _save_leaderboard(data)
    return gr.update(value=_table_data()), gr.update(value="✅ Submission recorded!")
# --------------- Interface --------------- | |
def _score_slider(label: str):
    """Build one metric slider: range 0.0 to 1.0, step 0.05, initial value 0.0."""
    return gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label=label)


with gr.Blocks(title="Custom LLM Leaderboard") as demo:
    gr.Markdown("""# 🏆 LLM Leaderboard\nSubmit your model results below. Leaderboard is sorted by **Pass@1**. """)

    # Load the stored rows once and reuse them for both the headers and the
    # initial value, instead of re-reading the JSON file for each argument.
    _initial_rows = _table_data()
    leaderboard_df = gr.Dataframe(
        headers=list(_initial_rows[0].keys()) if _initial_rows else [],
        value=_initial_rows,
        label="Current Leaderboard",
        interactive=False,
    )

    gr.Markdown("## 📝 Submit new model results")
    with gr.Accordion("Submission form", open=False):
        with gr.Row():
            model_name_inp = gr.Text(label="Model name (org/model)", value="")
            bleu_inp = gr.Number(label="BLEU", value=0.0, minimum=0.0, maximum=1.0)
            pass1_inp = gr.Number(label="Pass@1", value=0.0, minimum=0.0, maximum=1.0)
            pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0)
            pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0)

        gr.Markdown("### Multi-metric subjective scores (0.0 – 1.0)")
        with gr.Row():
            readability_inp = _score_slider("Readability")
            relevance_inp = _score_slider("Relevance")
            explanation_inp = _score_slider("Explanation Clarity")
            problem_inp = _score_slider("Problem Identification")
            actionability_inp = _score_slider("Actionability")
            completeness_inp = _score_slider("Completeness")
            specificity_inp = _score_slider("Specificity")
            contextual_inp = _score_slider("Contextual Adequacy")
            consistency_inp = _score_slider("Consistency")
            brevity_inp = _score_slider("Brevity")

        submit_btn = gr.Button("Submit")
        status_markdown = gr.Markdown("")

    # The input order must match the positional parameters of submit_model.
    submit_btn.click(
        fn=submit_model,
        inputs=[
            model_name_inp,
            bleu_inp,
            pass1_inp,
            pass5_inp,
            pass10_inp,
            readability_inp,
            relevance_inp,
            explanation_inp,
            problem_inp,
            actionability_inp,
            completeness_inp,
            specificity_inp,
            contextual_inp,
            consistency_inp,
            brevity_inp,
        ],
        outputs=[leaderboard_df, status_markdown],
    )

# Expose app variable for Spaces
app = demo