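"""NPFL (No Placeholders, Full Logic) AI Benchmark.

A Gradio app that loads a user-specified Hugging Face model, answers a CSV of
benchmark questions with it, scores each answer with a small AI judge model,
appends the run to a JSONL results file, and displays a leaderboard of past runs.
"""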
import gradio as gr
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import json
import os
from datetime import datetime, timezone
import time
# --- Configuration ---
QA_FILE = "qa.txt"
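# qa.txt is a CSV with the columns: ID, Question_Type, Question, Golden_Answer_Summary.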
RESULTS_FILE = "Eval_results.jsonl"
JUDGE_MODEL_REPO = "google/flan-t5-base" # A capable but relatively small model for judging
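# Note: flan-t5 is a seq2seq model, so the judge is loaded below with the
# "text2text-generation" pipeline task rather than "text-generation".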
# --- Setup: Ensure files exist ---
if not os.path.exists(RESULTS_FILE):
with open(RESULTS_FILE, "w") as f:
pass # Create an empty file if it doesn't exist
if not os.path.exists(QA_FILE):
# Create a dummy qa.txt if it's missing, with a few example questions
dummy_data = """ID,Question_Type,Question,Golden_Answer_Summary
1,Code,"Create a Python function that implements the Bubble Sort algorithm.","The function should take a list, use nested loops to compare adjacent elements, and swap them if they are in the wrong order. The outer loop runs n times, and the inner loop runs n-i-1 times."
2,Common Chat,"What is the capital of France?","The answer must be Paris."
3,Advanced Code,"Write a Python script that connects to a public FTP server, lists the files in the root directory, and then disconnects.","The script must import the `ftplib` library. It should create an FTP object, for example `FTP('ftp.dlptest.com')`, call the `login()` method, then `retrlines('LIST')` to print the directory listing, and finally `quit()` to close the connection."
"""
with open(QA_FILE, "w") as f:
f.write(dummy_data)
# --- AI Judge Logic ---
def get_ai_judge_verdict(judge_pipeline, question, golden_summary, ai_answer):
"""
Uses the AI Judge model to give a verdict on the tested model's answer.
"""
system_instruction = f"""
You are an expert evaluator for an AI model benchmark. Your task is to determine if the AI's answer is a correct and satisfactory response to the user's question. You must only respond with a single character: '1' for a correct/passing answer, or '0' for an incorrect/failing answer.
A '1' means the AI's answer correctly addresses the main components of the question and is similar in spirit to the expected golden answer summary.
A '0' means the AI's answer is factually wrong, does not address the question, is a refusal to answer, or is fundamentally incomplete.
---
User Question:
{question}
Expected Golden Answer Summary:
{golden_summary}
---
AI Model's Answer:
{ai_answer}
---
Based on this, is the AI Model's Answer correct? Respond with only '1' or '0'.
"""
try:
response = judge_pipeline(system_instruction, max_new_tokens=5)
# Extract the generated text and clean it up
verdict = response[0]['generated_text'].strip()
        # Treat any output containing '1' as a pass; anything else counts as a fail.
if '1' in verdict:
return 1
else:
return 0
except Exception:
# If the judge fails for any reason, default to a failing grade
return 0
# --- Core Evaluation Logic ---
def run_evaluation(model_repo, model_nickname, progress=gr.Progress()):
"""
Loads a user-specified model, runs it against the benchmark, evaluates the answers
using an AI judge, and saves the results.
"""
if not model_repo or not model_nickname:
gr.Warning("Model Repository and Nickname cannot be empty.")
return pd.DataFrame(), None
# Load benchmark questions
try:
questions_df = pd.read_csv(QA_FILE)
# Use a small subset for quick demos if needed
# questions_df = questions_df.head(3)
    except Exception as e:
        # gr.Error must be raised (not just constructed) to surface in the UI and stop the run.
        raise gr.Error(f"Failed to load benchmark questions from {QA_FILE}: {e}")
# --- Load Models ---
progress(0, desc="Loading AI Judge Model...")
try:
judge_pipeline = pipeline("text2text-generation", model=JUDGE_MODEL_REPO, device_map="auto", torch_dtype=torch.bfloat16)
    except Exception as e:
        raise gr.Error(f"Failed to load AI Judge model '{JUDGE_MODEL_REPO}': {e}")
progress(0.1, desc=f"Loading test model: {model_repo}")
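    # device_map="auto" lets accelerate place the model on whatever hardware is
    # available, and bfloat16 roughly halves memory use compared to float32.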
try:
model_to_test_tokenizer = AutoTokenizer.from_pretrained(model_repo)
model_to_test = AutoModelForCausalLM.from_pretrained(
model_repo,
device_map="auto",
torch_dtype=torch.bfloat16 # bfloat16 is good for ZeroGPU
)
test_pipeline = pipeline(
"text-generation",
model=model_to_test,
tokenizer=model_to_test_tokenizer,
max_new_tokens=1024, # Set a reasonable limit for code generation
do_sample=True,
temperature=0.7,
top_p=0.95
)
    except Exception as e:
        raise gr.Error(f"Failed to load the specified test model '{model_repo}': {e}")
# --- Run Benchmark Loop ---
detailed_results = []
total_score = 0
total_questions = len(questions_df)
for i, row in enumerate(questions_df.itertuples()):
progress_val = 0.1 + (0.8 * (i / total_questions))
        progress(progress_val, desc=f"Question {i + 1}/{total_questions} (ID {row.ID})")
# Generate answer from the model being tested
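        # By default the text-generation pipeline returns the prompt followed by
        # the completion, so the prompt prefix is stripped from the output below.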
try:
prompt = f"Question: {row.Question}\n\nAnswer:"
response = test_pipeline(prompt)
ai_answer = response[0]['generated_text'].replace(prompt, "").strip()
except Exception as e:
ai_answer = f"Error during generation: {e}"
# Get verdict from the AI Judge
score = get_ai_judge_verdict(judge_pipeline, row.Question, row.Golden_Answer_Summary, ai_answer)
total_score += score
detailed_results.append({
"ID": row.ID,
"Question": row.Question,
"AI_Answer": ai_answer,
"Score": score
})
time.sleep(0.1) # Small delay to allow UI to update
# --- Finalize and Save Results ---
progress(0.95, desc="Finalizing and saving...")
final_score_percent = (total_score / total_questions) * 100 if total_questions > 0 else 0
run_summary = {
"model_nickname": model_nickname,
"model_repo": model_repo,
"score_percent": round(final_score_percent, 2),
"timestamp": datetime.utcnow().isoformat(),
"detailed_results": detailed_results
}
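    # Append the run as a single JSON object per line (JSON Lines), which is the
    # format load_leaderboard() reads back.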
try:
with open(RESULTS_FILE, "a") as f:
f.write(json.dumps(run_summary) + "\n")
except Exception as e:
gr.Warning(f"Could not save results to {RESULTS_FILE}: {e}")
progress(1, desc="Evaluation Complete!")
return pd.DataFrame(detailed_results), gr.Markdown(f"**Overall Score: {final_score_percent:.2f}%**")
# --- Leaderboard Logic ---
def load_leaderboard():
"""
Loads and displays the leaderboard from the results file.
"""
if not os.path.exists(RESULTS_FILE) or os.path.getsize(RESULTS_FILE) == 0:
return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])
results_data = []
with open(RESULTS_FILE, "r") as f:
for line in f:
try:
data = json.loads(line)
results_data.append({
"Model Nickname": data.get("model_nickname"),
"Score (%)": data.get("score_percent"),
"Model Repo": data.get("model_repo"),
"Date": datetime.fromisoformat(data.get("timestamp")).strftime('%Y-%m-%d %H:%M:%S')
})
            except (json.JSONDecodeError, KeyError, TypeError, ValueError):
# Skip corrupted or malformed lines
continue
if not results_data:
return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])
leaderboard_df = pd.DataFrame(results_data)
leaderboard_df = leaderboard_df.sort_values(by="Score (%)", ascending=False).reset_index(drop=True)
leaderboard_df["Rank"] = leaderboard_df.index + 1
# Reorder columns for display
leaderboard_df = leaderboard_df[["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"]]
return leaderboard_df
# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="NPFL Benchmark") as demo:
gr.Markdown("# NPFL (No Placeholders, Full Logic) AI Benchmark")
with gr.Tabs():
with gr.TabItem("Run Evaluation"):
with gr.Row():
with gr.Column(scale=2):
model_repo_input = gr.Textbox(
label="Hugging Face Model Repository",
placeholder="e.g., google/gemma-2b-it",
info="The model to be tested. Must be compatible with the text-generation pipeline."
)
model_nickname_input = gr.Textbox(
label="Model Nickname",
placeholder="e.g., Gemma-2B-v1",
info="A unique name to display on the leaderboard."
)
run_button = gr.Button("Start Evaluation", variant="primary")
with gr.Column(scale=1):
final_score_output = gr.Markdown("**Overall Score: --**")
gr.Markdown("---")
gr.Markdown("### Detailed Run Results")
results_output = gr.DataFrame(
headers=["ID", "Question", "AI_Answer", "Score"],
wrap=True,
height=600
)
with gr.TabItem("Leaderboard"):
leaderboard_refresh_button = gr.Button("Refresh Leaderboard")
leaderboard_output = gr.DataFrame(
headers=["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"],
wrap=True,
height=700
)
# --- Event Handlers ---
run_button.click(
fn=run_evaluation,
inputs=[model_repo_input, model_nickname_input],
outputs=[results_output, final_score_output]
)
leaderboard_refresh_button.click(
fn=load_leaderboard,
inputs=[],
outputs=[leaderboard_output]
)
# Load leaderboard once on startup
demo.load(load_leaderboard, None, leaderboard_output)
if __name__ == "__main__":
demo.launch(debug=True)