#!/usr/bin/env python3
"""
Phi-3.5-MoE Expert Assistant
Robust application with CPU/GPU environment detection and dependency handling
"""

import os
import sys

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Apply the model patch if available
try:
    import model_patch
    print("✅ Applied model patch for handling missing dependencies")
except ImportError:
    print("ℹ️ Model patch not found, continuing without it")

# Environment detection
ON_GPU = torch.cuda.is_available()
MODEL_ID = os.getenv("HF_MODEL_ID", "microsoft/Phi-3.5-MoE-instruct")
REVISION = os.getenv("HF_REVISION")

# Configuration based on environment
if ON_GPU:
    attn_impl = "sdpa"       # Fast attention for GPU
    dtype = torch.bfloat16   # Mixed precision for GPU
    device_map = "auto"      # Auto device mapping for GPU
    low_cpu_mem = False      # Don't need low memory usage on GPU
else:
    attn_impl = "eager"      # Standard attention for CPU
    dtype = torch.float32    # Full precision for CPU
    device_map = "cpu"       # Force CPU device
    low_cpu_mem = True       # Enable low memory usage on CPU

print(f"🚀 Loading model: {MODEL_ID}")
print(f"🔧 Environment: {'GPU' if ON_GPU else 'CPU'}")
print(f"📊 Configuration: attn={attn_impl}, dtype={dtype}, device={device_map}, revision={REVISION}")

# Expert categories for query classification
EXPERT_CATEGORIES = {
    "Code": ["programming", "software", "development", "coding", "algorithm", "python",
             "javascript", "java", "function", "code", "debug", "api", "framework",
             "library", "class", "method", "variable"],
    "Math": ["mathematics", "calculation", "equation", "formula", "statistics", "derivative",
             "integral", "algebra", "calculus", "math", "solve", "calculate", "probability",
             "geometry", "trigonometry"],
    "Reasoning": ["logic", "analysis", "reasoning", "problem-solving", "critical", "explain",
                  "why", "how", "because", "analyze", "evaluate", "compare", "contrast",
                  "deduce", "infer"],
    "Multilingual": ["translation", "language", "multilingual", "localization", "translate",
                     "spanish", "french", "german", "chinese", "japanese", "korean", "arabic",
                     "russian", "portuguese"],
    "General": ["general", "conversation", "assistance", "help", "hello", "hi", "what", "who",
                "when", "where", "tell", "describe", "explain"]
}

# Load model with robust error handling
model = None
tokenizer = None

try:
    # Load tokenizer
    print("📝 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION
    )

    # Load model with environment-specific settings
    print("🧠 Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION,
        attn_implementation=attn_impl,
        dtype=dtype,  # Fixed: Use dtype instead of torch_dtype
        device_map=device_map,
        low_cpu_mem_usage=low_cpu_mem
    ).eval()
    print("✅ Model loaded successfully!")

    # Verify model works with a simple generation
    print("🔍 Running quick model test...")
    test_input = tokenizer("Hello, I am", return_tensors="pt").to(
        device_map if device_map != "auto" else model.device
    )
    with torch.no_grad():
        test_output = model.generate(**test_input, max_new_tokens=5)
    print("✅ Model test successful!")

except Exception as e:
    print(f"⚠️ Model loading failed: {e}")
    print("⚠️ Continuing with limited functionality")


def classify_expert(query):
    """Classify query to determine which expert should handle it."""
    query_lower = query.lower()
    scores = {}

    for expert, keywords in EXPERT_CATEGORIES.items():
        score = sum(1 for keyword in keywords if keyword in query_lower)
        scores[expert] = score

    # Get expert with highest score, default to General if tied or no matches
    max_score = max(scores.values()) if scores else 0
    if max_score > 0:
        experts = [expert for expert, score in scores.items() if score == max_score]
        return experts[0]
    return "General"


def generate_response(prompt, max_tokens=512, temperature=0.7, expert=None):
    """Generate response from the model."""
    if model is None or tokenizer is None:
        return "⚠️ Model not loaded. Please check the logs for errors."

    try:
        # Determine expert if not provided
        if expert is None:
            expert = classify_expert(prompt)

        # Create expert-specific prompt
        system_prompt = f"You are an AI assistant specialized in {expert}. "
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"

        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt")
        if ON_GPU:
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens so the prompt is not echoed back
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        return response

    except Exception as e:
        return f"⚠️ Generation failed: {str(e)}"


def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="Phi-3.5-MoE Expert Assistant") as demo:
        gr.Markdown("# 🤖 Phi-3.5-MoE Expert Assistant")
        gr.Markdown(f"**Environment:** {'GPU' if ON_GPU else 'CPU'} | **Model:** {MODEL_ID}")

        if model is None:
            gr.Markdown("⚠️ **Model failed to load. Limited functionality available.**")

        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything...",
                    lines=4
                )

                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50,
                        maximum=1024,
                        value=512,
                        step=50,
                        label="Max Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature"
                    )

                expert = gr.Dropdown(
                    choices=list(EXPERT_CATEGORIES.keys()),
                    value=None,
                    label="Expert (Optional)",
                    allow_custom_value=False
                )

                generate_btn = gr.Button("Generate Response", variant="primary")

            with gr.Column(scale=2):
                response = gr.Textbox(
                    label="Response",
                    lines=10,
                    interactive=False
                )

        # Example prompts
        gr.Examples(
            examples=[
                ["Explain quantum computing in simple terms", None],
                ["Write a Python function to calculate fibonacci numbers", "Code"],
                ["What are the benefits of renewable energy?", "General"],
                ["How does machine learning work?", "Reasoning"],
                ["Translate 'Hello, how are you?' to Spanish", "Multilingual"],
                ["Solve the equation 3x^2 + 5x - 2 = 0", "Math"]
            ],
            inputs=[prompt, expert]
        )

        # Event handlers
        generate_btn.click(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
        prompt.submit(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )