import json
from pathlib import Path
from typing import List, Dict

import gradio as gr
import pandas as pd
from pydantic import BaseModel, field_validator
# --------------- Configuration ---------------
LEADERBOARD_PATH = Path("leaderboard_data.json")
DEFAULT_MODEL_NAME = "example/model"


# --------------- Data models ---------------
class Metrics(BaseModel):
    readability: float
    relevance: float
    explanation_clarity: float
    problem_identification: float
    actionability: float
    completeness: float
    specificity: float
    contextual_adequacy: float
    consistency: float
    brevity: float

class LeaderboardEntry(BaseModel):
    model_name: str
    bleu: float
    llm_pass_1: float
    llm_pass_5: float
    llm_pass_10: float
    metrics: Metrics

    @field_validator("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10")
    @classmethod
    def score_range(cls, v: float):
        if not 0.0 <= v <= 1.0:
            raise ValueError("Scores should be between 0 and 1")
        return v
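
# Illustrative example (not executed): constructing an entry runs the validator above,
# e.g. LeaderboardEntry(model_name="example/model", bleu=1.2, llm_pass_1=0.5, ...)
# raises a pydantic ValidationError because 1.2 falls outside [0, 1].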

# --------------- Persistence helpers ---------------
def _load_leaderboard() -> List[Dict]:
    if not LEADERBOARD_PATH.exists():
        return []
    with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
        data = json.load(f)
    return data.get("leaderboard", [])


def _save_leaderboard(data: List[Dict]):
    to_store = {"leaderboard": data}
    with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
        json.dump(to_store, f, indent=2)
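
# For reference, the stored file has this shape (illustrative values; the remaining
# metric keys follow the same pattern as "readability"):
#
#   {
#     "leaderboard": [
#       {
#         "model_name": "example/model",
#         "bleu": 0.42,
#         "llm_pass_1": 0.55, "llm_pass_5": 0.71, "llm_pass_10": 0.80,
#         "metrics": {"readability": 0.9, "relevance": 0.8, ...}
#       }
#     ]
#   }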

# --------------- Utility ---------------
def _flatten_entry(entry: Dict) -> Dict:
    """Flatten nested metrics so that every metric is a column."""
    flat = {
        "Model": entry["model_name"],
        "BLEU": entry["bleu"],
        "Pass@1": entry["llm_pass_1"],
        "Pass@5": entry["llm_pass_5"],
        "Pass@10": entry["llm_pass_10"],
    }
    for metric_name, score in entry["metrics"].items():
        flat[metric_name.replace("_", " ").title()] = score
    return flat
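
# Example (illustrative): a metric key such as "explanation_clarity" becomes the
# column header "Explanation Clarity", so each of the ten metrics gets its own column.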

def _table_data() -> pd.DataFrame:
    data = _load_leaderboard()
    # Sort descending by Pass@1 as requested
    data.sort(key=lambda x: x["llm_pass_1"], reverse=True)
    # A DataFrame is the most reliable value type for gr.Dataframe
    return pd.DataFrame([_flatten_entry(e) for e in data])

# --------------- Gradio callbacks ---------------
def submit_model(
    model_name: str,
    bleu: float,
    llm_pass_1: float,
    llm_pass_5: float,
    llm_pass_10: float,
    readability: float,
    relevance: float,
    explanation_clarity: float,
    problem_identification: float,
    actionability: float,
    completeness: float,
    specificity: float,
    contextual_adequacy: float,
    consistency: float,
    brevity: float,
):
| """Validate and append a new model entry to the leaderboard.""" | |
| try: | |
| entry = LeaderboardEntry( | |
| model_name=model_name.strip(), | |
| bleu=bleu, | |
| llm_pass_1=llm_pass_1, | |
| llm_pass_5=llm_pass_5, | |
| llm_pass_10=llm_pass_10, | |
| metrics={ | |
| "readability": readability, | |
| "relevance": relevance, | |
| "explanation_clarity": explanation_clarity, | |
| "problem_identification": problem_identification, | |
| "actionability": actionability, | |
| "completeness": completeness, | |
| "specificity": specificity, | |
| "contextual_adequacy": contextual_adequacy, | |
| "consistency": consistency, | |
| "brevity": brevity, | |
| }, | |
| ) | |
| except Exception as e: | |
| return gr.update(value=_table_data()), gr.update(value=f"β Submission failed: {e}") | |
| data = _load_leaderboard() | |
| # Replace existing model entry if any | |
| data = [d for d in data if d["model_name"] != entry.model_name] | |
| data.append(entry.dict()) | |
| _save_leaderboard(data) | |
| return gr.update(value=_table_data()), gr.update(value="β Submission recorded!") | |

# --------------- Interface ---------------
with gr.Blocks(title="Custom LLM Leaderboard") as demo:
    gr.Markdown("""# LLM Leaderboard\nSubmit your model results below. The leaderboard is sorted by **Pass@1**.""")

    _initial_table = _table_data()
    leaderboard_df = gr.Dataframe(
        headers=_initial_table.columns.tolist() if not _initial_table.empty else None,
        value=_initial_table,
        label="Current Leaderboard",
        interactive=False,
    )
| gr.Markdown("## π Submit new model results") | |
| with gr.Accordion("Submission form", open=False): | |
| with gr.Row(): | |
| model_name_inp = gr.Text(label="Model name (org/model)", value="") | |
| bleu_inp = gr.Number(label="BLEU", value=0.0, minimum=0.0, maximum=1.0) | |
| pass1_inp = gr.Number(label="Pass@1", value=0.0, minimum=0.0, maximum=1.0) | |
| pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0) | |
| pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0) | |
| gr.Markdown("### Multi-metric subjective scores (0.0 β 1.0)") | |
| with gr.Row(): | |
| readability_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Readability") | |
| relevance_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Relevance") | |
| explanation_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Explanation Clarity") | |
| problem_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Problem Identification") | |
| actionability_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Actionability") | |
| completeness_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Completeness") | |
| specificity_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Specificity") | |
| contextual_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Contextual Adequacy") | |
| consistency_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Consistency") | |
| brevity_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Brevity") | |
| submit_btn = gr.Button("Submit") | |
| status_markdown = gr.Markdown("") | |
    # api_name also exposes this endpoint for programmatic submissions.
    submit_btn.click(
        fn=submit_model,
        inputs=[
            model_name_inp,
            bleu_inp,
            pass1_inp,
            pass5_inp,
            pass10_inp,
            readability_inp,
            relevance_inp,
            explanation_inp,
            problem_inp,
            actionability_inp,
            completeness_inp,
            specificity_inp,
            contextual_inp,
            consistency_inp,
            brevity_inp,
        ],
        outputs=[leaderboard_df, status_markdown],
        api_name="submit_model",
    )
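
    # Sketch of a programmatic submission via gradio_client (assumes the Space is
    # deployed and <your-space-url> is replaced with its actual URL); arguments are
    # positional and follow the same order as the `inputs` list above:
    #
    #   from gradio_client import Client
    #   client = Client("<your-space-url>")
    #   client.predict(
    #       "org/model", 0.42, 0.55, 0.71, 0.80,                # name, BLEU, Pass@1/5/10
    #       0.9, 0.8, 0.7, 0.6, 0.5, 0.9, 0.8, 0.7, 0.6, 0.5,   # the ten metric sliders
    #       api_name="/submit_model",
    #   )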

# ----------------- Launch -----------------
if __name__ == "__main__":
    demo.queue().launch()

# For HF Spaces runtime (gradio SDK) expose `demo`
app = demo