# CodeReviewBench / app.py
import json
from pathlib import Path
from typing import List, Dict
import gradio as gr
from pydantic import BaseModel, Field, field_validator
# --------------- Configuration ---------------
LEADERBOARD_PATH = Path("leaderboard_data.json")
DEFAULT_MODEL_NAME = "example/model"
# --------------- Data models ---------------
class Metrics(BaseModel):
    readability: int
    relevance: int
    explanation_clarity: int = Field(alias="explanation_clarity")
    problem_identification: int
    actionability: int
    completeness: int
    specificity: int
    contextual_adequacy: int
    consistency: int
    brevity: int

    @field_validator(
        "readability", "relevance", "explanation_clarity", "problem_identification",
        "actionability", "completeness", "specificity", "contextual_adequacy",
        "consistency", "brevity",
    )
    @classmethod
    def metric_range(cls, v: int):
        if not 0 <= v <= 10:
            raise ValueError("Multi-metrics should be between 0 and 10")
        return v


class LeaderboardEntry(BaseModel):
    # Allow the "model_name" field without tripping Pydantic v2's protected
    # "model_" namespace warning.
    model_config = {"protected_namespaces": ()}

    model_name: str
    bleu: float
    llm_pass_1: float
    llm_pass_5: float
    llm_pass_10: float
    metrics: Metrics

    @field_validator("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10")
    @classmethod
    def score_range(cls, v: float):
        if not 0.0 <= v <= 1.0:
            raise ValueError("Scores should be between 0 and 1")
        return v

# --------------- Persistence helpers ---------------
def _load_leaderboard() -> List[Dict]:
    if not LEADERBOARD_PATH.exists():
        return []
    with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
        data = json.load(f)
    return data.get("leaderboard", [])


def _save_leaderboard(data: List[Dict]):
    to_store = {"leaderboard": data}
    with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
        json.dump(to_store, f, indent=2)
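
# For reference, a rough sketch of what leaderboard_data.json looks like after
# _save_leaderboard() has run (the model name and numbers below are made-up,
# purely illustrative values):
#
#   {
#     "leaderboard": [
#       {
#         "model_name": "example/model",
#         "bleu": 0.42,
#         "llm_pass_1": 0.61,
#         "llm_pass_5": 0.74,
#         "llm_pass_10": 0.80,
#         "metrics": {
#           "readability": 7, "relevance": 8, "explanation_clarity": 6,
#           "problem_identification": 7, "actionability": 6, "completeness": 7,
#           "specificity": 6, "contextual_adequacy": 7, "consistency": 8, "brevity": 5
#         }
#       }
#     ]
#   }
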
# --------------- Utility ---------------
def _flatten_entry(entry: Dict) -> Dict:
    """Flatten nested metrics so that every metric is a column."""
    flat = {
        "Model": entry["model_name"],
        "BLEU": entry["bleu"],
        "Pass@1": entry["llm_pass_1"],
        "Pass@5": entry["llm_pass_5"],
        "Pass@10": entry["llm_pass_10"],
    }
    for metric_name, score in entry["metrics"].items():
        flat[metric_name.replace("_", " ").title()] = score
    return flat


def _table_data() -> List[Dict]:
    data = _load_leaderboard()
    # Sort descending by Pass@1 so the best-performing model is listed first
    data.sort(key=lambda x: x["llm_pass_1"], reverse=True)
    return [_flatten_entry(e) for e in data]
# --------------- Gradio callbacks ---------------
def submit_model(
    model_name: str,
    bleu: float,
    llm_pass_1: float,
    llm_pass_5: float,
    llm_pass_10: float,
    readability: int,
    relevance: int,
    explanation_clarity: int,
    problem_identification: int,
    actionability: int,
    completeness: int,
    specificity: int,
    contextual_adequacy: int,
    consistency: int,
    brevity: int,
):
    """Validate and append a new model entry to the leaderboard."""
    try:
        entry = LeaderboardEntry(
            model_name=model_name.strip(),
            bleu=bleu,
            llm_pass_1=llm_pass_1,
            llm_pass_5=llm_pass_5,
            llm_pass_10=llm_pass_10,
            metrics={
                "readability": readability,
                "relevance": relevance,
                "explanation_clarity": explanation_clarity,
                "problem_identification": problem_identification,
                "actionability": actionability,
                "completeness": completeness,
                "specificity": specificity,
                "contextual_adequacy": contextual_adequacy,
                "consistency": consistency,
                "brevity": brevity,
            },
        )
    except Exception as e:
        return gr.update(value=_table_data()), gr.update(value=f"❌ Submission failed: {e}")

    data = _load_leaderboard()
    # Replace any existing entry for the same model so resubmissions overwrite it
    data = [d for d in data if d["model_name"] != entry.model_name]
    data.append(entry.model_dump())  # .dict() is deprecated in Pydantic v2
    _save_leaderboard(data)
    return gr.update(value=_table_data()), gr.update(value="✅ Submission recorded!")

# --------------- Interface ---------------
with gr.Blocks(title="Custom LLM Leaderboard") as demo:
    gr.Markdown(
        "# 🏆 LLM Leaderboard\n"
        "Submit your model results below. The leaderboard is sorted by **Pass@1**."
    )

    # Initialize table data
    initial_data = _table_data()
    leaderboard_df = gr.Dataframe(
        headers=list(initial_data[0].keys()) if initial_data else [
            "Model", "BLEU", "Pass@1", "Pass@5", "Pass@10",
            "Readability", "Relevance", "Explanation Clarity", "Problem Identification",
            "Actionability", "Completeness", "Specificity", "Contextual Adequacy",
            "Consistency", "Brevity",
        ],
        value=initial_data,
        label="Current Leaderboard",
        interactive=False,
    )

    gr.Markdown("## 🔄 Submit new model results")
    with gr.Accordion("Submission form", open=False):
        with gr.Row():
            model_name_inp = gr.Text(label="Model name (org/model)", value="")
            bleu_inp = gr.Number(label="BLEU", value=0.0, minimum=0.0, maximum=1.0)
            pass1_inp = gr.Number(label="Pass@1", value=0.0, minimum=0.0, maximum=1.0)
            pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0)
            pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0)
        gr.Markdown("### Multi-metric subjective scores (0 – 10)")
        with gr.Row():
            readability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Readability")
            relevance_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Relevance")
            explanation_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Explanation Clarity")
            problem_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Problem Identification")
            actionability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Actionability")
            completeness_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Completeness")
            specificity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Specificity")
            contextual_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Contextual Adequacy")
            consistency_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Consistency")
            brevity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Brevity")
        submit_btn = gr.Button("Submit")
        status_markdown = gr.Markdown("")

    submit_btn.click(
        fn=submit_model,
        inputs=[
            model_name_inp,
            bleu_inp,
            pass1_inp,
            pass5_inp,
            pass10_inp,
            readability_inp,
            relevance_inp,
            explanation_inp,
            problem_inp,
            actionability_inp,
            completeness_inp,
            specificity_inp,
            contextual_inp,
            consistency_inp,
            brevity_inp,
        ],
        outputs=[leaderboard_df, status_markdown],
        api_name="submit_model",
    )

# ----------------- Launch -----------------
if __name__ == "__main__":
    demo.queue().launch()

# For HF Spaces runtime (gradio SDK) expose `demo`
app = demo
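
# Programmatic submissions: because the click handler sets api_name="submit_model",
# the same endpoint can be called over the Gradio client API. A minimal sketch,
# assuming the Space ID "your-org/CodeReviewBench" (a placeholder) and made-up scores:
#
#   from gradio_client import Client
#
#   client = Client("your-org/CodeReviewBench")
#   client.predict(
#       "org/model", 0.42, 0.61, 0.74, 0.80,  # model name, BLEU, Pass@1, Pass@5, Pass@10
#       7, 8, 6, 7, 6, 7, 6, 7, 8, 5,         # the ten 0-10 metric sliders, in form order
#       api_name="/submit_model",
#   )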