import gradio as gr
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import json
import os
from datetime import datetime
import time

# --- Configuration ---
QA_FILE = "qa.txt"
RESULTS_FILE = "Eval_results.jsonl"
JUDGE_MODEL_REPO = "google/flan-t5-base"  # A capable but relatively small model for judging
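# Note: flan-t5 is a seq2seq model served through the "text2text-generation" pipeline,
# and its checkpoints were trained with roughly 512-token inputs, so very long judge
# prompts (question + golden summary + model answer) may be truncated or judged poorly.
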
# --- Setup: Ensure files exist ---
if not os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE, "w") as f:
        pass  # Create an empty file if it doesn't exist

if not os.path.exists(QA_FILE):
    # Create a dummy qa.txt if it's missing, with a few example questions
    dummy_data = """ID,Question_Type,Question,Golden_Answer_Summary
1,Code,"Create a Python function that implements the Bubble Sort algorithm.","The function should take a list, use nested loops to compare adjacent elements, and swap them if they are in the wrong order. The outer loop runs n times, and the inner loop runs n-i-1 times."
2,Common Chat,"What is the capital of France?","The answer must be Paris."
3,Advanced Code,"Write a Python script that connects to a public FTP server, lists the files in the root directory, and then disconnects.","The script must import the `ftplib` library. It should create an FTP object, for example `FTP('ftp.dlptest.com')`, call the `login()` method, then `retrlines('LIST')` to print the directory listing, and finally `quit()` to close the connection."
"""
    with open(QA_FILE, "w") as f:
        f.write(dummy_data)

# --- AI Judge Logic ---
def get_ai_judge_verdict(judge_pipeline, question, golden_summary, ai_answer):
    """
    Uses the AI Judge model to give a verdict on the tested model's answer.
    """
    system_instruction = f"""
You are an expert evaluator for an AI model benchmark. Your task is to determine if the AI's answer is a correct and satisfactory response to the user's question. You must only respond with a single character: '1' for a correct/passing answer, or '0' for an incorrect/failing answer.
A '1' means the AI's answer correctly addresses the main components of the question and is similar in spirit to the expected golden answer summary.
A '0' means the AI's answer is factually wrong, does not address the question, is a refusal to answer, or is fundamentally incomplete.
---
User Question:
{question}
Expected Golden Answer Summary:
{golden_summary}
---
AI Model's Answer:
{ai_answer}
---
Based on this, is the AI Model's Answer correct? Respond with only '1' or '0'.
"""
    try:
        response = judge_pipeline(system_instruction, max_new_tokens=5)
        # Extract the generated text and clean it up
        verdict = response[0]['generated_text'].strip()
        # Ensure the verdict is either '1' or '0'
        if '1' in verdict:
            return 1
        else:
            return 0
    except Exception:
        # If the judge fails for any reason, default to a failing grade
        return 0

# --- Core Evaluation Logic ---
def run_evaluation(model_repo, model_nickname, progress=gr.Progress()):
    """
    Loads a user-specified model, runs it against the benchmark, evaluates the answers
    using an AI judge, and saves the results.
    """
    if not model_repo or not model_nickname:
        gr.Warning("Model Repository and Nickname cannot be empty.")
        return pd.DataFrame(), None

    # Load benchmark questions
    try:
        questions_df = pd.read_csv(QA_FILE)
        # Use a small subset for quick demos if needed
        # questions_df = questions_df.head(3)
    except Exception as e:
        # gr.Error must be raised (not just instantiated) to surface in the UI
        raise gr.Error(f"Failed to load benchmark questions from {QA_FILE}: {e}")

    # --- Load Models ---
    progress(0, desc="Loading AI Judge Model...")
    try:
        judge_pipeline = pipeline("text2text-generation", model=JUDGE_MODEL_REPO, device_map="auto", torch_dtype=torch.bfloat16)
    except Exception as e:
        raise gr.Error(f"Failed to load AI Judge model '{JUDGE_MODEL_REPO}': {e}")

    progress(0.1, desc=f"Loading test model: {model_repo}")
    try:
        model_to_test_tokenizer = AutoTokenizer.from_pretrained(model_repo)
        model_to_test = AutoModelForCausalLM.from_pretrained(
            model_repo,
            device_map="auto",
            torch_dtype=torch.bfloat16  # bfloat16 is good for ZeroGPU
        )
        test_pipeline = pipeline(
            "text-generation",
            model=model_to_test,
            tokenizer=model_to_test_tokenizer,
            max_new_tokens=1024,  # Set a reasonable limit for code generation
            do_sample=True,
            temperature=0.7,
            top_p=0.95
        )
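        # Note: with do_sample=True the generations (and therefore the benchmark scores)
        # are non-deterministic; do_sample=False would give reproducible greedy runs.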
    except Exception as e:
        raise gr.Error(f"Failed to load the specified test model '{model_repo}': {e}")

    # --- Run Benchmark Loop ---
    detailed_results = []
    total_score = 0
    total_questions = len(questions_df)

    for i, row in enumerate(questions_df.itertuples()):
        progress_val = 0.1 + (0.8 * (i / total_questions))
        progress(progress_val, desc=f"Running Q{row.ID}/{total_questions}")

        # Generate answer from the model being tested
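        # Note: this uses a plain "Question:/Answer:" prompt; chat-tuned models may respond
        # better when formatted with tokenizer.apply_chat_template, which is not used here.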
        try:
            prompt = f"Question: {row.Question}\n\nAnswer:"
            response = test_pipeline(prompt)
            ai_answer = response[0]['generated_text'].replace(prompt, "").strip()
        except Exception as e:
            ai_answer = f"Error during generation: {e}"

        # Get verdict from the AI Judge
        score = get_ai_judge_verdict(judge_pipeline, row.Question, row.Golden_Answer_Summary, ai_answer)
        total_score += score

        detailed_results.append({
            "ID": row.ID,
            "Question": row.Question,
            "AI_Answer": ai_answer,
            "Score": score
        })
        time.sleep(0.1)  # Small delay to allow UI to update

    # --- Finalize and Save Results ---
    progress(0.95, desc="Finalizing and saving...")
    final_score_percent = (total_score / total_questions) * 100 if total_questions > 0 else 0

    run_summary = {
        "model_nickname": model_nickname,
        "model_repo": model_repo,
        "score_percent": round(final_score_percent, 2),
        "timestamp": datetime.utcnow().isoformat(),
        "detailed_results": detailed_results
    }
    try:
        with open(RESULTS_FILE, "a") as f:
            f.write(json.dumps(run_summary) + "\n")
    except Exception as e:
        gr.Warning(f"Could not save results to {RESULTS_FILE}: {e}")

    progress(1, desc="Evaluation Complete!")
    return pd.DataFrame(detailed_results), f"**Overall Score: {final_score_percent:.2f}%**"

# --- Leaderboard Logic ---
def load_leaderboard():
    """
    Loads and displays the leaderboard from the results file.
    """
    if not os.path.exists(RESULTS_FILE) or os.path.getsize(RESULTS_FILE) == 0:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    results_data = []
    with open(RESULTS_FILE, "r") as f:
        for line in f:
            try:
                data = json.loads(line)
                results_data.append({
                    "Model Nickname": data.get("model_nickname"),
                    "Score (%)": data.get("score_percent"),
                    "Model Repo": data.get("model_repo"),
                    "Date": datetime.fromisoformat(data.get("timestamp")).strftime('%Y-%m-%d %H:%M:%S')
                })
            except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                # Skip corrupted or malformed lines (including missing/invalid timestamps)
                continue

    if not results_data:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    leaderboard_df = pd.DataFrame(results_data)
    leaderboard_df = leaderboard_df.sort_values(by="Score (%)", ascending=False).reset_index(drop=True)
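    # Note: every run is kept as its own row; repeated evaluations of the same model
    # are not deduplicated or averaged.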
leaderboard_df["Rank"] = leaderboard_df.index + 1 | |
# Reorder columns for display | |
leaderboard_df = leaderboard_df[["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"]] | |
return leaderboard_df | |
# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="NPFL Benchmark") as demo:
    gr.Markdown("# NPFL (No Placeholders, Full Logic) AI Benchmark")

    with gr.Tabs():
        with gr.TabItem("Run Evaluation"):
            with gr.Row():
                with gr.Column(scale=2):
                    model_repo_input = gr.Textbox(
                        label="Hugging Face Model Repository",
                        placeholder="e.g., google/gemma-2b-it",
                        info="The model to be tested. Must be compatible with the text-generation pipeline."
                    )
                    model_nickname_input = gr.Textbox(
                        label="Model Nickname",
                        placeholder="e.g., Gemma-2B-v1",
                        info="A unique name to display on the leaderboard."
                    )
                    run_button = gr.Button("Start Evaluation", variant="primary")
                with gr.Column(scale=1):
                    final_score_output = gr.Markdown("**Overall Score: --**")

            gr.Markdown("---")
            gr.Markdown("### Detailed Run Results")
            results_output = gr.DataFrame(
                headers=["ID", "Question", "AI_Answer", "Score"],
                wrap=True,
                height=600
            )

        with gr.TabItem("Leaderboard"):
            leaderboard_refresh_button = gr.Button("Refresh Leaderboard")
            leaderboard_output = gr.DataFrame(
                headers=["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"],
                wrap=True,
                height=700
            )

    # --- Event Handlers ---
    run_button.click(
        fn=run_evaluation,
        inputs=[model_repo_input, model_nickname_input],
        outputs=[results_output, final_score_output]
    )
    leaderboard_refresh_button.click(
        fn=load_leaderboard,
        inputs=[],
        outputs=[leaderboard_output]
    )

    # Load leaderboard once on startup
    demo.load(load_leaderboard, None, leaderboard_output)

if __name__ == "__main__":
    demo.launch(debug=True)