CodeReviewBench / app.py
Alex
error
2017254
raw
history blame
6.81 kB
import json
from pathlib import Path
from typing import List, Dict
import gradio as gr
from pydantic import BaseModel, Field, validator
# --------------- Configuration ---------------
# JSON file holding the leaderboard. The path is relative, so it is resolved
# against the process's current working directory.
LEADERBOARD_PATH = Path("leaderboard_data.json")
# Example model identifier; not referenced by any code visible in this file.
DEFAULT_MODEL_NAME = "example/model"
# --------------- Data models ---------------
class Metrics(BaseModel):
    """Subjective per-dimension quality scores attached to a leaderboard entry.

    Every field is required and must lie in the closed interval [0.0, 1.0],
    the range the submission form advertises for these scores. Out-of-range
    values are rejected by pydantic with a validation error instead of being
    stored silently.
    """

    readability: float = Field(..., ge=0.0, le=1.0)
    relevance: float = Field(..., ge=0.0, le=1.0)
    # The former alias was identical to the field name (a no-op), so it is
    # dropped; the field is still populated and serialized under this name.
    explanation_clarity: float = Field(..., ge=0.0, le=1.0)
    problem_identification: float = Field(..., ge=0.0, le=1.0)
    actionability: float = Field(..., ge=0.0, le=1.0)
    completeness: float = Field(..., ge=0.0, le=1.0)
    specificity: float = Field(..., ge=0.0, le=1.0)
    contextual_adequacy: float = Field(..., ge=0.0, le=1.0)
    consistency: float = Field(..., ge=0.0, le=1.0)
    brevity: float = Field(..., ge=0.0, le=1.0)
class LeaderboardEntry(BaseModel):
    """One leaderboard row: a model name, its headline scores and its metrics.

    The BLEU and Pass@k scores must each lie in [0.0, 1.0]; construction
    fails with a pydantic validation error otherwise.
    """

    model_name: str
    bleu: float
    llm_pass_1: float
    llm_pass_5: float
    llm_pass_10: float
    metrics: Metrics

    # No `each_item=True` here: that flag targets the elements of collection
    # fields, and these four fields are plain floats. Pydantic 2's `@validator`
    # compatibility layer refuses the flag on non-collection fields while the
    # class is being built, which stops the module from importing at all.
    @validator("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10")
    def score_range(cls, v: float):
        """Return ``v`` unchanged if it is within [0.0, 1.0].

        Raises:
            ValueError: if ``v`` is below 0.0 or above 1.0.
        """
        if not 0.0 <= v <= 1.0:
            raise ValueError("Scores should be between 0 and 1")
        return v
# --------------- Persistence helpers ---------------
def _load_leaderboard() -> List[Dict]:
    """Read the stored leaderboard entries from ``LEADERBOARD_PATH``.

    Returns an empty list when the file does not exist, or when the parsed
    JSON object has no ``"leaderboard"`` key; otherwise returns the value
    stored under that key.
    """
    if LEADERBOARD_PATH.exists():
        with LEADERBOARD_PATH.open("r", encoding="utf-8") as handle:
            payload = json.load(handle)
        return payload.get("leaderboard", [])
    return []
def _save_leaderboard(data: List[Dict]):
    """Overwrite ``LEADERBOARD_PATH`` with ``data`` as indented JSON.

    The entries are wrapped in an object under the ``"leaderboard"`` key,
    which is the key ``_load_leaderboard`` reads back.
    """
    with LEADERBOARD_PATH.open("w", encoding="utf-8") as sink:
        json.dump({"leaderboard": data}, sink, indent=2)
# --------------- Utility ---------------
def _flatten_entry(entry: Dict) -> Dict:
"""Flatten nested metrics so that every metric is a column."""
flat = {
"Model": entry["model_name"],
"BLEU": entry["bleu"],
"Pass@1": entry["llm_pass_1"],
"Pass@5": entry["llm_pass_5"],
"Pass@10": entry["llm_pass_10"],
}
for metric_name, score in entry["metrics"].items():
flat[metric_name.replace("_", " ").title()] = score
return flat
def _table_data() -> List[Dict]:
    """Return the stored leaderboard as flattened rows, best Pass@1 first."""
    # Highest ``llm_pass_1`` first; entries with equal scores keep file order.
    ranked = sorted(
        _load_leaderboard(),
        key=lambda row: row["llm_pass_1"],
        reverse=True,
    )
    return [_flatten_entry(row) for row in ranked]
# --------------- Gradio callbacks ---------------
def submit_model(
    model_name: str,
    bleu: float,
    llm_pass_1: float,
    llm_pass_5: float,
    llm_pass_10: float,
    readability: float,
    relevance: float,
    explanation_clarity: float,
    problem_identification: float,
    actionability: float,
    completeness: float,
    specificity: float,
    contextual_adequacy: float,
    consistency: float,
    brevity: float,
):
    """Validate a submission and store it in the leaderboard file.

    The model name is stripped of surrounding whitespace and must not be
    empty. An existing entry with the same model name is replaced by the
    new one.

    Returns:
        A pair of ``gr.update`` objects: the refreshed table rows and a
        status message describing success or the reason for failure.
    """
    # Reject a missing or blank name up front; otherwise an entry with an
    # empty model name would be written to the leaderboard.
    cleaned_name = (model_name or "").strip()
    if not cleaned_name:
        return (
            gr.update(value=_table_data()),
            gr.update(value="❌ Submission failed: model name must not be empty"),
        )

    try:
        entry = LeaderboardEntry(
            model_name=cleaned_name,
            bleu=bleu,
            llm_pass_1=llm_pass_1,
            llm_pass_5=llm_pass_5,
            llm_pass_10=llm_pass_10,
            metrics={
                "readability": readability,
                "relevance": relevance,
                "explanation_clarity": explanation_clarity,
                "problem_identification": problem_identification,
                "actionability": actionability,
                "completeness": completeness,
                "specificity": specificity,
                "contextual_adequacy": contextual_adequacy,
                "consistency": consistency,
                "brevity": brevity,
            },
        )
    except Exception as e:
        # UI boundary: report any validation problem to the user in the
        # status area instead of letting the callback fail.
        return (
            gr.update(value=_table_data()),
            gr.update(value=f"❌ Submission failed: {e}"),
        )

    # Replace an existing entry for the same model, if any.
    data = [d for d in _load_leaderboard() if d["model_name"] != entry.model_name]
    data.append(entry.dict())
    _save_leaderboard(data)

    return gr.update(value=_table_data()), gr.update(value="✅ Submission recorded!")
# --------------- Interface ---------------
# Read the stored leaderboard once at start-up rather than re-reading the
# file separately for the emptiness check, the headers and the initial value.
_initial_rows = _table_data()

# Column names to show while no rows are stored yet. They mirror the keys
# `_flatten_entry` produces for an entry carrying every `Metrics` field.
_default_headers = [
    "Model",
    "BLEU",
    "Pass@1",
    "Pass@5",
    "Pass@10",
    "Readability",
    "Relevance",
    "Explanation Clarity",
    "Problem Identification",
    "Actionability",
    "Completeness",
    "Specificity",
    "Contextual Adequacy",
    "Consistency",
    "Brevity",
]

with gr.Blocks(title="Custom LLM Leaderboard") as demo:
    gr.Markdown("""# 🏆 LLM Leaderboard\nSubmit your model results below. Leaderboard is sorted by **Pass@1**. """)

    # NOTE(review): the value passed here is a list of dicts; confirm that the
    # installed Gradio version renders that shape as one column per key.
    leaderboard_df = gr.Dataframe(
        headers=list(_initial_rows[0].keys()) if _initial_rows else _default_headers,
        value=_initial_rows,
        label="Current Leaderboard",
        interactive=False,
    )

    gr.Markdown("## 🔄 Submit new model results")
    with gr.Accordion("Submission form", open=False):
        # Headline scores.
        with gr.Row():
            model_name_inp = gr.Text(label="Model name (org/model)", value="")
            bleu_inp = gr.Number(label="BLEU", value=0.0, minimum=0.0, maximum=1.0)
            pass1_inp = gr.Number(label="Pass@1", value=0.0, minimum=0.0, maximum=1.0)
            pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0)
            pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0)

        gr.Markdown("### Multi-metric subjective scores (0.0 – 1.0)")
        # One slider per `Metrics` field, in the order `submit_model` expects.
        with gr.Row():
            readability_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Readability")
            relevance_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Relevance")
            explanation_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Explanation Clarity")
            problem_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Problem Identification")
            actionability_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Actionability")
            completeness_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Completeness")
            specificity_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Specificity")
            contextual_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Contextual Adequacy")
            consistency_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Consistency")
            brevity_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Brevity")

        submit_btn = gr.Button("Submit")
        status_markdown = gr.Markdown("")

    # The input order must match the positional parameters of `submit_model`;
    # its two return values refresh the table and the status message.
    submit_btn.click(
        fn=submit_model,
        inputs=[
            model_name_inp,
            bleu_inp,
            pass1_inp,
            pass5_inp,
            pass10_inp,
            readability_inp,
            relevance_inp,
            explanation_inp,
            problem_inp,
            actionability_inp,
            completeness_inp,
            specificity_inp,
            contextual_inp,
            consistency_inp,
            brevity_inp,
        ],
        outputs=[leaderboard_df, status_markdown],
    )

# Expose app variable for Spaces
app = demo