import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from sklearn.metrics import f1_score
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime


def normalize_answer(s):
    """Identical to extractor's normalization"""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score_qa(prediction, ground_truth):
    """Identical to original"""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Identical to original"""
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def get_qa_confidence(model, tokenizer, question, context):
    """Identical to extractor's confidence calculation"""
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    confidence = np.sqrt(
        start_probs[0, answer_start].item() * end_probs[0, answer_end - 1].item()
    )
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
    return answer, float(confidence)


def run_evaluation(num_samples, progress=gr.Progress()):
    """Modified to use extractor's confidence calculation"""
    # Authentication
    hf_token = os.getenv("EVAL_TOKEN")
    if hf_token:
        try:
            login(token=hf_token)
        except Exception as e:
            print(f"Auth error: {e}")

    # Load model (raw instead of pipeline)
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
        if torch.cuda.is_available():
            model = model.cuda()
    except Exception as e:
        return f"❌ Model load failed: {e}", pd.DataFrame(), None

    # Load dataset
    progress(0.1, desc="Loading CUAD dataset...")
    try:
        dataset = load_dataset(
            "theatticusproject/cuad-qa",
            trust_remote_code=True,
            token=hf_token
        )
        test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
    except Exception as e:
        return f"❌ Dataset load failed: {e}", pd.DataFrame(), None

    predictions = []
    for i, example in enumerate(test_data):
        progress((0.2 + 0.7 * i / num_samples), desc=f"Processing {i+1}/{num_samples}")
        try:
            context = example["context"]
            question = example["question"]
            gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

            # Use extractor-style confidence
            pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)
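            # Record the per-sample result: SQuAD-style EM/F1 plus the extractor-matched confidence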
            predictions.append({
                "Sample_ID": i + 1,
                "Question": question[:100] + "..." if len(question) > 100 else question,
                "Predicted_Answer": pred_answer,
                "Ground_Truth": gt_answer,
                "Exact_Match": exact_match_score(pred_answer, gt_answer),
                "F1_Score": round(f1_score_qa(pred_answer, gt_answer), 3),
                "Confidence": round(confidence, 3)  # Now matches extractor
            })
        except Exception as e:
            print(f"Error sample {i}: {e}")
            continue

    # Generate report (identical to original)
    if not predictions:
        return "❌ No valid predictions", pd.DataFrame(), None

    df = pd.DataFrame(predictions)
    avg_em = df["Exact_Match"].mean() * 100
    avg_f1 = df["F1_Score"].mean() * 100

    results_summary = f"""
# 📊 Evaluation Results (n={len(df)})

## 🎯 Metrics
- Exact Match: {avg_em:.2f}%
- F1 Score: {avg_f1:.2f}%
- Avg Confidence: {df['Confidence'].mean():.2%}

## 🔍 Confidence Analysis
- High-Confidence (>80%) Accuracy: {df[df['Confidence'] > 0.8]['Exact_Match'].mean():.1%}
"""

    # Save results (identical to original)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"cuad_eval_{timestamp}.json"
    with open(results_file, "w") as f:
        json.dump({
            "model": model_name,
            "metrics": {
                "exact_match": float(avg_em),
                "f1_score": float(avg_f1),
                "avg_confidence": float(df['Confidence'].mean())
            },
            "samples": predictions
        }, f, indent=2)

    return results_summary, df, results_file


# YOUR ORIGINAL GRADIO INTERFACE (COMPLETELY UNCHANGED)
def create_gradio_interface():
    with gr.Blocks(title="CUAD Model Evaluator", theme=gr.themes.Soft()) as demo:
        gr.HTML("""
Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model
Model: AvocadoMuffin/roberta-cuad-qa-v2