import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime
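
# -----------------------------------------------------------------------------
# CUAD QA evaluation dashboard: loads AvocadoMuffin/roberta-cuad-qa-v2, runs it
# over a sample of the CUAD test split, computes exact-match / F1 / confidence
# scores, and serves the results through a Gradio UI with a downloadable
# JSON report.
# -----------------------------------------------------------------------------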
def normalize_answer(s):
    """SQuAD-style normalization (identical to the extractor's): lowercase, strip punctuation and articles, collapse whitespace."""
    def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text): return ' '.join(text.split())
    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))
    def lower(text): return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score_qa(prediction, ground_truth):
    """Token-level F1 between the normalized prediction and ground truth (SQuAD-style)."""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)
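
# For example, f1_score_qa("the Governing Law clause", "Governing Law clause")
# returns 1.0: normalization drops the article "the", so the token sets match exactly.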
def exact_match_score(prediction, ground_truth):
    """True if the normalized prediction and ground truth are identical strings."""
    return normalize_answer(prediction) == normalize_answer(ground_truth)
def get_qa_confidence(model, tokenizer, question, context):
    """Run extractive QA and return (answer, confidence); mirrors the extractor's confidence calculation."""
    inputs = tokenizer(
        question, context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    # Confidence is the geometric mean of the start- and end-token probabilities.
    confidence = np.sqrt(
        start_probs[0, answer_start].item() *
        end_probs[0, answer_end - 1].item()
    )
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
    return answer, float(confidence)
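
# Hypothetical usage, assuming `model` and `tokenizer` are already loaded:
#   answer, conf = get_qa_confidence(model, tokenizer,
#                                    "What is the governing law?", contract_text)
# `conf` lies in [0, 1]; the dashboard later buckets predictions with conf > 0.8
# as "high confidence".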
def run_evaluation(num_samples, progress=gr.Progress()):
    """Evaluate the CUAD model on num_samples test examples using the extractor's confidence calculation."""
    # Authentication
    hf_token = os.getenv("EVAL_TOKEN")
    if hf_token:
        try:
            login(token=hf_token)
        except Exception as e:
            print(f"Auth error: {e}")

    # Load model (raw model + tokenizer rather than a pipeline)
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
        if torch.cuda.is_available():
            model = model.cuda()
    except Exception as e:
        return f"❌ Model load failed: {e}", pd.DataFrame(), None
    # Load dataset
    progress(0.1, desc="Loading CUAD dataset...")
    try:
        dataset = load_dataset(
            "theatticusproject/cuad-qa",
            trust_remote_code=True,
            token=hf_token
        )
        test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
    except Exception as e:
        return f"❌ Dataset load failed: {e}", pd.DataFrame(), None
    predictions = []
    for i, example in enumerate(test_data):
        progress((0.2 + 0.7 * i / num_samples), desc=f"Processing {i+1}/{num_samples}")
        try:
            context = example["context"]
            question = example["question"]
            gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

            # Use the extractor-style confidence
            pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)

            predictions.append({
                "Sample_ID": i + 1,
                "Question": question[:100] + "..." if len(question) > 100 else question,
                "Predicted_Answer": pred_answer,
                "Ground_Truth": gt_answer,
                "Exact_Match": exact_match_score(pred_answer, gt_answer),
                "F1_Score": round(f1_score_qa(pred_answer, gt_answer), 3),
                "Confidence": round(confidence, 3)  # same scale as the extractor
            })
        except Exception as e:
            print(f"Error sample {i}: {e}")
            continue
    # Generate report
    if not predictions:
        return "❌ No valid predictions", pd.DataFrame(), None

    df = pd.DataFrame(predictions)
    avg_em = df["Exact_Match"].mean() * 100
    avg_f1 = df["F1_Score"].mean() * 100
    high_conf = df[df["Confidence"] > 0.8]
    high_conf_acc = high_conf["Exact_Match"].mean() if len(high_conf) else float("nan")

    results_summary = f"""
# 📊 Evaluation Results (n={len(df)})
## 🎯 Metrics
- Exact Match: {avg_em:.2f}%
- F1 Score: {avg_f1:.2f}%
- Avg Confidence: {df['Confidence'].mean():.2%}
## 📈 Confidence Analysis
- High-Confidence (>80%) Accuracy: {high_conf_acc:.1%}
"""
    # Save results to a timestamped JSON file for download
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"cuad_eval_{timestamp}.json"
    with open(results_file, "w") as f:
        json.dump({
            "model": model_name,
            "metrics": {
                "exact_match": float(avg_em),
                "f1_score": float(avg_f1),
                "avg_confidence": float(df['Confidence'].mean())
            },
            "samples": predictions
        }, f, indent=2)

    return results_summary, df, results_file
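
# The JSON report written above can be reloaded for offline analysis, e.g.:
#   with open("cuad_eval_<timestamp>.json") as f:
#       report = json.load(f)
#   print(report["metrics"])   # exact_match, f1_score, avg_confidence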
# Gradio interface
def create_gradio_interface():
    with gr.Blocks(title="CUAD Model Evaluator", theme=gr.themes.Soft()) as demo:
        gr.HTML("""
            <div style="text-align: center; padding: 20px;">
                <h1>🏛️ CUAD Model Evaluation Dashboard</h1>
                <p>Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model</p>
                <p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v2</p>
            </div>
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("<h3>⚙️ Evaluation Settings</h3>")
                num_samples = gr.Slider(10, 500, value=100, step=10, label="Number of samples")
                evaluate_btn = gr.Button("🚀 Start Evaluation", variant="primary")
            with gr.Column(scale=2):
                results_summary = gr.Markdown("Click '🚀 Start Evaluation' to begin...")

        gr.HTML("<hr>")
        detailed_results = gr.Dataframe(interactive=False, wrap=True)
        download_file = gr.File(visible=False)
        def handle_eval(num_samples):
            summary, df, file = run_evaluation(num_samples)
            if df.empty:
                # Evaluation failed upstream; surface the error message and hide the download.
                return summary, df, gr.File(visible=False)
            return (
                summary,
                df[["Sample_ID", "Question", "Predicted_Answer", "Confidence", "Exact_Match"]],
                gr.File(visible=True, value=file) if file else gr.File(visible=False)
            )

        evaluate_btn.click(
            fn=handle_eval,
            inputs=num_samples,
            outputs=[results_summary, detailed_results, download_file],
            show_progress=True
        )

    return demo
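
# When executed directly (e.g. on a Hugging Face Space), the dashboard is served
# on port 7860; share=True additionally requests a temporary public Gradio link
# when the script is run locally.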
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )