import os
import json
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from sklearn.metrics import f1_score
import re
from collections import Counter
import string
from huggingface_hub import login
import gradio as gr
import pandas as pd
from datetime import datetime


def normalize_answer(s):
    """Identical to extractor's normalization"""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score_qa(prediction, ground_truth):
    """Identical to original"""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Identical to original"""
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def get_qa_confidence(model, tokenizer, question, context):
    """Identical to extractor's confidence calculation"""
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True
    )
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    confidence = np.sqrt(
        start_probs[0, answer_start].item() * end_probs[0, answer_end - 1].item()
    )
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
    return answer, float(confidence)


def run_evaluation(num_samples, progress=gr.Progress()):
    """Modified to use extractor's confidence calculation"""
    # Authentication
    hf_token = os.getenv("EVAL_TOKEN")
    if hf_token:
        try:
            login(token=hf_token)
        except Exception as e:
            print(f"Auth error: {e}")

    # Load model (raw instead of pipeline)
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
        if torch.cuda.is_available():
            model = model.cuda()
    except Exception as e:
        return f"❌ Model load failed: {e}", pd.DataFrame(), None

    # Load dataset
    progress(0.1, desc="Loading CUAD dataset...")
    try:
        dataset = load_dataset(
            "theatticusproject/cuad-qa",
            trust_remote_code=True,
            token=hf_token
        )
        test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
    except Exception as e:
        return f"❌ Dataset load failed: {e}", pd.DataFrame(), None

    predictions = []
    for i, example in enumerate(test_data):
        progress((0.2 + 0.7 * i / num_samples), desc=f"Processing {i+1}/{num_samples}")
        try:
            context = example["context"]
            question = example["question"]
            gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

            # Use extractor-style confidence
            pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)
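            # Record the per-sample result: SQuAD-style EM/F1 plus the extractor-matched confidence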
            predictions.append({
                "Sample_ID": i + 1,
                "Question": question[:100] + "..." if len(question) > 100 else question,
                "Predicted_Answer": pred_answer,
                "Ground_Truth": gt_answer,
                "Exact_Match": exact_match_score(pred_answer, gt_answer),
                "F1_Score": round(f1_score_qa(pred_answer, gt_answer), 3),
                "Confidence": round(confidence, 3)  # Now matches extractor
            })
        except Exception as e:
            print(f"Error sample {i}: {e}")
            continue

    # Generate report (identical to original)
    if not predictions:
        return "❌ No valid predictions", pd.DataFrame(), None

    df = pd.DataFrame(predictions)
    avg_em = df["Exact_Match"].mean() * 100
    avg_f1 = df["F1_Score"].mean() * 100

    results_summary = f"""
# 📊 Evaluation Results (n={len(df)})

## 🎯 Metrics
- Exact Match: {avg_em:.2f}%
- F1 Score: {avg_f1:.2f}%
- Avg Confidence: {df['Confidence'].mean():.2%}

## 🔍 Confidence Analysis
- High-Confidence (>80%) Accuracy: {df[df['Confidence'] > 0.8]['Exact_Match'].mean():.1%}
"""

    # Save results (identical to original)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"cuad_eval_{timestamp}.json"
    with open(results_file, "w") as f:
        json.dump({
            "model": model_name,
            "metrics": {
                "exact_match": float(avg_em),
                "f1_score": float(avg_f1),
                "avg_confidence": float(df['Confidence'].mean())
            },
            "samples": predictions
        }, f, indent=2)

    return results_summary, df, results_file


# YOUR ORIGINAL GRADIO INTERFACE (COMPLETELY UNCHANGED)
def create_gradio_interface():
    with gr.Blocks(title="CUAD Model Evaluator", theme=gr.themes.Soft()) as demo:
        gr.HTML("""
Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model
Model: AvocadoMuffin/roberta-cuad-qa-v2