Spaces:

AvocadoMuffin
/

eval_model

Running

App Files Files Community

eval_model / app.py

AvocadoMuffin

Update app.py

98d17bf verified about 6 hours ago

raw

history blame contribute delete

7.75 kB

	import os
	import json
	import numpy as np
	from datasets import load_dataset
	from transformers import AutoTokenizer, AutoModelForQuestionAnswering
	import torch
	from sklearn.metrics import f1_score
	import re
	from collections import Counter
	import string
	from huggingface_hub import login
	import gradio as gr
	import pandas as pd
	from datetime import datetime

	def normalize_answer(s):
	    """Normalize an answer string for SQuAD-style comparison.

	    Steps, applied in order: lowercase the text, strip ASCII punctuation,
	    replace the standalone articles "a", "an" and "the" with a space, and
	    collapse every run of whitespace into a single space.

	    Args:
	        s: Raw answer text.

	    Returns:
	        The normalized text, tokens separated by single spaces.
	    """
	    punctuation = set(string.punctuation)  # built once, not once per character
	    lowered = s.lower()
	    without_punc = ''.join(ch for ch in lowered if ch not in punctuation)
	    # The alternation needs bare `|`; an escaped `\|` matches a literal pipe,
	    # which can never occur here because punctuation was already stripped.
	    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punc)
	    return ' '.join(without_articles.split())

	def f1_score_qa(prediction, ground_truth):
	    """Compute the token-level F1 between a predicted and a reference answer.

	    Both strings are passed through ``normalize_answer`` and split on
	    whitespace; overlap is counted as a multiset intersection of tokens.

	    Args:
	        prediction: Answer text produced by the model.
	        ground_truth: Reference answer text.

	    Returns:
	        A float in [0.0, 1.0]. If either side has no tokens after
	        normalization, the score is 1.0 when both are empty and 0.0
	        otherwise, so that empty-vs-empty agrees with ``exact_match_score``.
	    """
	    prediction_tokens = normalize_answer(prediction).split()
	    ground_truth_tokens = normalize_answer(ground_truth).split()

	    # No-answer handling: two empty answers are a perfect match, an empty
	    # answer against a non-empty one is a complete miss.
	    if not prediction_tokens or not ground_truth_tokens:
	        return float(prediction_tokens == ground_truth_tokens)

	    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
	    num_same = sum(common.values())
	    if num_same == 0:
	        return 0.0

	    precision = num_same / len(prediction_tokens)
	    recall = num_same / len(ground_truth_tokens)
	    return (2 * precision * recall) / (precision + recall)

	def exact_match_score(prediction, ground_truth):
	    """Return True when both answers are equal after ``normalize_answer``.

	    Args:
	        prediction: Answer text produced by the model.
	        ground_truth: Reference answer text.

	    Returns:
	        bool: whether the two normalized strings are identical.
	    """
	    normalized_prediction = normalize_answer(prediction)
	    normalized_truth = normalize_answer(ground_truth)
	    return normalized_prediction == normalized_truth

	def get_qa_confidence(model, tokenizer, question, context):
	    """Extract an answer span and a confidence score for one question.

	    The question/context pair is tokenized (truncated to ``max_length=512``),
	    the model is run without gradient tracking, and the span runs from the
	    arg-max of the start logits through the arg-max of the end logits
	    (inclusive). The confidence is the square root of the product of the
	    softmax probabilities at those two positions.

	    Args:
	        model: Callable taking the tokenizer output as keyword arguments and
	            returning an object with ``start_logits`` and ``end_logits``.
	        tokenizer: Callable producing a mapping that contains ``input_ids``;
	            must also provide ``decode``.
	        question: Question text.
	        context: Text to search for the answer.

	    Returns:
	        ``(answer, confidence)`` where ``answer`` is the decoded, stripped span
	        text and ``confidence`` is a Python float.
	    """
	    # NOTE(review): `stride` presumably has no effect unless overflowing
	    # windows are requested from the tokenizer -- confirm against the extractor.
	    encoded = tokenizer(
	        question,
	        context,
	        return_tensors="pt",
	        truncation=True,
	        max_length=512,
	        stride=128,
	        padding=True,
	    )
	    if torch.cuda.is_available():
	        encoded = {name: tensor.cuda() for name, tensor in encoded.items()}

	    with torch.no_grad():
	        outputs = model(**encoded)

	    start_logits = outputs.start_logits
	    end_logits = outputs.end_logits

	    # Arg-max over the flattened logits, exactly as the original does.
	    first_idx = torch.argmax(start_logits)
	    last_idx = torch.argmax(end_logits)

	    start_prob = torch.softmax(start_logits, dim=1)[0, first_idx].item()
	    end_prob = torch.softmax(end_logits, dim=1)[0, last_idx].item()
	    confidence = np.sqrt(start_prob * end_prob)

	    # The slice end is exclusive, hence the +1 to include the end token.
	    span_ids = encoded["input_ids"][0][first_idx:last_idx + 1]
	    answer = tokenizer.decode(span_ids, skip_special_tokens=True).strip()
	    return answer, float(confidence)

	def run_evaluation(num_samples, progress=gr.Progress()):
	    """Evaluate the CUAD QA model on the first ``num_samples`` test examples.

	    Logs in with the ``EVAL_TOKEN`` environment variable when it is set, loads
	    the tokenizer/model and the ``theatticusproject/cuad-qa`` test split,
	    scores each example with ``get_qa_confidence``, and writes a timestamped
	    JSON report to the current working directory.

	    Args:
	        num_samples: Number of test examples requested; coerced to ``int`` and
	            capped at the size of the test split.
	        progress: Callable invoked as ``progress(fraction, desc=...)`` to
	            report progress.

	    Returns:
	        ``(summary, df, results_file)``: a Markdown summary string, a DataFrame
	        with one row per scored sample, and the path of the JSON report. On
	        failure the summary holds the error message, the DataFrame is empty
	        and the path is ``None``.
	    """
	    # The caller may hand over a float (e.g. from a UI slider); range() and
	    # the progress labels need a plain int.
	    num_samples = int(num_samples)

	    # Authentication (best effort: a failed login is reported, not fatal)
	    hf_token = os.getenv("EVAL_TOKEN")
	    if hf_token:
	        try:
	            login(token=hf_token)
	        except Exception as e:
	            print(f"Auth error: {e}")

	    # Load model (raw instead of pipeline)
	    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
	    try:
	        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
	        model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
	        if torch.cuda.is_available():
	            model = model.cuda()
	    except Exception as e:
	        return f"❌ Model load failed: {e}", pd.DataFrame(), None

	    # Load dataset
	    progress(0.1, desc="Loading CUAD dataset...")
	    try:
	        # NOTE(review): trust_remote_code=True executes the dataset's loading
	        # script from the Hub -- keep only while that repository is trusted.
	        dataset = load_dataset(
	            "theatticusproject/cuad-qa",
	            trust_remote_code=True,
	            token=hf_token,
	        )
	        test_split = dataset["test"]
	        test_data = test_split.select(range(min(num_samples, len(test_split))))
	    except Exception as e:
	        return f"❌ Dataset load failed: {e}", pd.DataFrame(), None

	    # Report progress against what was actually selected, which can be fewer
	    # examples than requested.
	    total = len(test_data)

	    predictions = []
	    for i, example in enumerate(test_data):
	        progress((0.2 + 0.7 * i / total), desc=f"Processing {i+1}/{total}")

	        try:
	            context = example["context"]
	            question = example["question"]
	            # Only the first reference answer is used; "" marks "no answer".
	            gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

	            # Use extractor-style confidence
	            pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)

	            predictions.append({
	                "Sample_ID": i+1,
	                "Question": question[:100] + "..." if len(question) > 100 else question,
	                "Predicted_Answer": pred_answer,
	                "Ground_Truth": gt_answer,
	                "Exact_Match": exact_match_score(pred_answer, gt_answer),
	                "F1_Score": round(f1_score_qa(pred_answer, gt_answer), 3),
	                "Confidence": round(confidence, 3),
	            })
	        except Exception as e:
	            # Deliberate best effort: one bad sample must not abort the run.
	            print(f"Error sample {i}: {e}")
	            continue

	    # Generate report
	    if not predictions:
	        return "❌ No valid predictions", pd.DataFrame(), None

	    df = pd.DataFrame(predictions)
	    avg_em = df["Exact_Match"].mean() * 100
	    avg_f1 = df["F1_Score"].mean() * 100

	    # The mean of an empty selection is NaN, which would render as "nan%".
	    high_conf_matches = df.loc[df["Confidence"] > 0.8, "Exact_Match"]
	    if high_conf_matches.empty:
	        high_conf_accuracy = "N/A"
	    else:
	        high_conf_accuracy = f"{high_conf_matches.mean():.1%}"

	    results_summary = f"""
	# 📊 Evaluation Results (n={len(df)})
	## 🎯 Metrics
	- Exact Match: {avg_em:.2f}%
	- F1 Score: {avg_f1:.2f}%
	- Avg Confidence: {df['Confidence'].mean():.2%}
	## 🔍 Confidence Analysis
	- High-Confidence (>80%) Accuracy: {high_conf_accuracy}
	"""

	    # Save results
	    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	    results_file = f"cuad_eval_{timestamp}.json"
	    with open(results_file, "w", encoding="utf-8") as f:
	        json.dump({
	            "model": model_name,
	            "metrics": {
	                "exact_match": float(avg_em),
	                "f1_score": float(avg_f1),
	                "avg_confidence": float(df['Confidence'].mean()),
	            },
	            "samples": predictions,
	        }, f, indent=2)

	    return results_summary, df, results_file

	# Gradio interface
	def create_gradio_interface():
	    """Build the Gradio Blocks UI for running the CUAD evaluation.

	    The layout has a settings column (sample-count slider and start button),
	    a Markdown summary panel, a read-only results table and a file component
	    for the JSON report. The button is wired to ``run_evaluation``.

	    Returns:
	        The ``gr.Blocks`` instance, not yet launched.
	    """
	    with gr.Blocks(title="CUAD Model Evaluator", theme=gr.themes.Soft()) as demo:
	        gr.HTML("""
	<div style="text-align: center; padding: 20px;">
	<h1>🏛️ CUAD Model Evaluation Dashboard</h1>
	<p>Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model</p>
	<p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v2</p>
	</div>
	""")

	        with gr.Row():
	            with gr.Column(scale=1):
	                gr.HTML("<h3>⚙️ Evaluation Settings</h3>")
	                num_samples = gr.Slider(10, 500, value=100, step=10, label="Number of samples")
	                evaluate_btn = gr.Button("🚀 Start Evaluation", variant="primary")

	            with gr.Column(scale=2):
	                results_summary = gr.Markdown("Click '🚀 Start Evaluation' to begin...")

	        gr.HTML("<hr>")
	        detailed_results = gr.Dataframe(interactive=False, wrap=True)
	        download_file = gr.File(visible=False)

	        # The progress tracker has to be a default argument of the event
	        # handler itself for Gradio to hook it up; it is then forwarded.
	        def handle_eval(num_samples, progress=gr.Progress()):
	            summary, df, file = run_evaluation(num_samples, progress)
	            # Failure paths return an empty frame without the expected
	            # columns; selecting them would raise KeyError and hide `summary`.
	            if df.empty:
	                return summary, df, gr.File(visible=False)
	            return (
	                summary,
	                df[["Sample_ID", "Question", "Predicted_Answer", "Confidence", "Exact_Match"]],
	                gr.File(visible=True, value=file) if file else gr.File(visible=False),
	            )

	        evaluate_btn.click(
	            fn=handle_eval,
	            inputs=num_samples,
	            outputs=[results_summary, detailed_results, download_file],
	            show_progress=True,
	        )

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the interface, then start the Gradio server
	    # with host 0.0.0.0, port 7860 and sharing enabled.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	    }
	    demo = create_gradio_interface()
	    demo.launch(**launch_options)