import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime
def normalize_answer(s):
"""Identical to extractor's normalization"""
def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text): return ' '.join(text.split())
def remove_punc(text):
return ''.join(ch for ch in text if ch not in set(string.punctuation))
def lower(text): return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score_qa(prediction, ground_truth):
"""Identical to original"""
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0: return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
return (2 * precision * recall) / (precision + recall)
def exact_match_score(prediction, ground_truth):
"""Identical to original"""
return normalize_answer(prediction) == normalize_answer(ground_truth)
def get_qa_confidence(model, tokenizer, question, context):
"""Identical to extractor's confidence calculation"""
inputs = tokenizer(
question, context,
return_tensors="pt",
truncation=True,
max_length=512,
stride=128,
padding=True
)
if torch.cuda.is_available():
inputs = {k:v.cuda() for k,v in inputs.items()}
with torch.no_grad():
outputs = model(**inputs)
start_probs = torch.softmax(outputs.start_logits, dim=1)
end_probs = torch.softmax(outputs.end_logits, dim=1)
answer_start = torch.argmax(outputs.start_logits)
answer_end = torch.argmax(outputs.end_logits) + 1
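    # Confidence = geometric mean of the softmax probabilities of the chosen start
    # and end tokens (the same scheme the extractor uses).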
confidence = np.sqrt(
start_probs[0, answer_start].item() *
end_probs[0, answer_end-1].item()
)
answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
return answer, float(confidence)
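
# Illustrative example of the helper above (made-up text, kept as a comment so the
# app itself is unchanged):
#   answer, conf = get_qa_confidence(model, tokenizer,
#                                    "What is the governing law of the agreement?",
#                                    "This Agreement shall be governed by the laws of the State of Delaware.")
#   # `answer` is a span decoded from the tokenized window, `conf` is a value in (0, 1].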
def run_evaluation(num_samples, progress=gr.Progress()):
"""Modified to use extractor's confidence calculation"""
# Authentication
hf_token = os.getenv("EVAL_TOKEN")
if hf_token:
try:
login(token=hf_token)
except Exception as e:
print(f"Auth error: {e}")
# Load model (raw instead of pipeline)
model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
try:
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
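        # The token is only needed if the model repo is private or gated; public repos load without it.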
if torch.cuda.is_available():
model = model.cuda()
except Exception as e:
return f"❌ Model load failed: {e}", pd.DataFrame(), None
# Load dataset
progress(0.1, desc="Loading CUAD dataset...")
try:
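        # trust_remote_code lets the dataset repo's own loading script run; the token is
        # passed in case access to the dataset requires authentication.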
dataset = load_dataset(
"theatticusproject/cuad-qa",
trust_remote_code=True,
token=hf_token
)
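        # Cap the requested sample count at the size of the test split.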
test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
except Exception as e:
return f"❌ Dataset load failed: {e}", pd.DataFrame(), None
predictions = []
    total = len(test_data)
    for i, example in enumerate(test_data):
        progress(0.2 + 0.7 * i / total, desc=f"Processing {i+1}/{total}")
try:
context = example["context"]
question = example["question"]
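            # CUAD questions can have an empty answer list (the clause is absent from the
            # contract); fall back to "" so the metrics still compute.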
gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
# Use extractor-style confidence
pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)
predictions.append({
"Sample_ID": i+1,
"Question": question[:100] + "..." if len(question) > 100 else question,
"Predicted_Answer": pred_answer,
"Ground_Truth": gt_answer,
"Exact_Match": exact_match_score(pred_answer, gt_answer),
"F1_Score": round(f1_score_qa(pred_answer, gt_answer), 3),
"Confidence": round(confidence, 3) # Now matches extractor
})
except Exception as e:
print(f"Error sample {i}: {e}")
continue
    # Generate report
if not predictions:
return "❌ No valid predictions", pd.DataFrame(), None
    df = pd.DataFrame(predictions)
    avg_em = df["Exact_Match"].mean() * 100
    avg_f1 = df["F1_Score"].mean() * 100
    # Guard against an empty high-confidence slice so the report shows "n/a" instead of NaN.
    high_conf = df[df["Confidence"] > 0.8]
    high_conf_acc = f"{high_conf['Exact_Match'].mean():.1%}" if len(high_conf) else "n/a"
    results_summary = f"""
# 📊 Evaluation Results (n={len(df)})
## 🎯 Metrics
- Exact Match: {avg_em:.2f}%
- F1 Score: {avg_f1:.2f}%
- Avg Confidence: {df['Confidence'].mean():.2%}
## 🔍 Confidence Analysis
- High-Confidence (>80%) Accuracy: {high_conf_acc}
"""
    # Save results to a timestamped JSON file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = f"cuad_eval_{timestamp}.json"
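    # The saved JSON mirrors the on-screen report: model name, aggregate metrics,
    # and the per-sample rows shown in the dataframe.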
with open(results_file, "w") as f:
json.dump({
"model": model_name,
"metrics": {
"exact_match": float(avg_em),
"f1_score": float(avg_f1),
"avg_confidence": float(df['Confidence'].mean())
},
"samples": predictions
}, f, indent=2)
return results_summary, df, results_file
# Gradio interface
def create_gradio_interface():
with gr.Blocks(title="CUAD Model Evaluator", theme=gr.themes.Soft()) as demo:
gr.HTML("""
<div style="text-align: center; padding: 20px;">
            <h1>🏛️ CUAD Model Evaluation Dashboard</h1>
<p>Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model</p>
<p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v2</p>
</div>
""")
with gr.Row():
with gr.Column(scale=1):
gr.HTML("<h3>βš™οΈ Evaluation Settings</h3>")
num_samples = gr.Slider(10, 500, value=100, step=10, label="Number of samples")
                evaluate_btn = gr.Button("🚀 Start Evaluation", variant="primary")
with gr.Column(scale=2):
                results_summary = gr.Markdown("Click '🚀 Start Evaluation' to begin...")
gr.HTML("<hr>")
detailed_results = gr.Dataframe(interactive=False, wrap=True)
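        # Hidden until handle_eval produces a results file to download.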
download_file = gr.File(visible=False)
        def handle_eval(num_samples):
            summary, df, file = run_evaluation(num_samples)
            # An early failure returns an empty DataFrame without the display columns.
            if df.empty:
                return summary, df, gr.File(visible=False)
            return (
                summary,
                df[["Sample_ID", "Question", "Predicted_Answer", "Confidence", "Exact_Match"]],
                gr.File(visible=True, value=file) if file else gr.File(visible=False)
            )
evaluate_btn.click(
fn=handle_eval,
inputs=num_samples,
outputs=[results_summary, detailed_results, download_file],
show_progress=True
)
return demo
if __name__ == "__main__":
demo = create_gradio_interface()
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=True
)