#!/usr/bin/env python3
"""
Phi-3.5-MoE Expert Assistant
Robust application with CPU/GPU environment detection and dependency handling
"""

import os
import sys

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Apply the model patch if available
try:
    import model_patch
    print("✅ Applied model patch for handling missing dependencies")
except ImportError:
    print("ℹ️ Model patch not found, continuing without it")

# Environment detection
ON_GPU = torch.cuda.is_available()
MODEL_ID = os.getenv("HF_MODEL_ID", "microsoft/Phi-3.5-MoE-instruct")
REVISION = os.getenv("HF_REVISION")

# Configuration based on environment
if ON_GPU:
    attn_impl = "sdpa"       # Fast attention for GPU
    dtype = torch.bfloat16   # Mixed precision for GPU
    device_map = "auto"      # Auto device mapping for GPU
    low_cpu_mem = False      # Don't need low memory usage on GPU
else:
    attn_impl = "eager"      # Standard attention for CPU
    dtype = torch.float32    # Full precision for CPU
    device_map = "cpu"       # Force CPU device
    low_cpu_mem = True       # Enable low memory usage on CPU

print(f"🚀 Loading model: {MODEL_ID}")
print(f"🔧 Environment: {'GPU' if ON_GPU else 'CPU'}")
print(f"📊 Configuration: attn={attn_impl}, dtype={dtype}, device={device_map}, revision={REVISION}")

# Expert categories for query classification
EXPERT_CATEGORIES = {
    "Code": ["programming", "software", "development", "coding", "algorithm", "python",
             "javascript", "java", "function", "code", "debug", "api", "framework",
             "library", "class", "method", "variable"],
    "Math": ["mathematics", "calculation", "equation", "formula", "statistics", "derivative",
             "integral", "algebra", "calculus", "math", "solve", "calculate", "probability",
             "geometry", "trigonometry"],
    "Reasoning": ["logic", "analysis", "reasoning", "problem-solving", "critical", "explain",
                  "why", "how", "because", "analyze", "evaluate", "compare", "contrast",
                  "deduce", "infer"],
    "Multilingual": ["translation", "language", "multilingual", "localization", "translate",
                     "spanish", "french", "german", "chinese", "japanese", "korean", "arabic",
                     "russian", "portuguese"],
    "General": ["general", "conversation", "assistance", "help", "hello", "hi", "what", "who",
                "when", "where", "tell", "describe", "explain"]
}

# Load model with robust error handling
model = None
tokenizer = None

try:
    # Load tokenizer
    print("📝 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION
    )

    # Load model with environment-specific settings
    print("🧠 Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION,
        attn_implementation=attn_impl,
        dtype=dtype,  # Fixed: Use dtype instead of torch_dtype
        device_map=device_map,
        low_cpu_mem_usage=low_cpu_mem
    ).eval()
    print("✅ Model loaded successfully!")

    # Verify model works with a simple generation
    print("🔍 Running quick model test...")
    test_input = tokenizer("Hello, I am", return_tensors="pt").to(
        device_map if device_map != "auto" else model.device
    )
    with torch.no_grad():
        test_output = model.generate(**test_input, max_new_tokens=5)
    print("✅ Model test successful!")

except Exception as e:
    print(f"⚠️ Model loading failed: {e}")
    print("⚠️ Continuing with limited functionality")


def classify_expert(query):
    """Classify query to determine which expert should handle it."""
    query_lower = query.lower()
    scores = {}

    for expert, keywords in EXPERT_CATEGORIES.items():
        score = sum(1 for keyword in keywords if keyword in query_lower)
        scores[expert] = score

    # Get expert with highest score, default to General if tied or no matches
    max_score = max(scores.values()) if scores else 0
    if max_score > 0:
        experts = [expert for expert, score in scores.items() if score == max_score]
        return experts[0]
    return "General"


def generate_response(prompt, max_tokens=512, temperature=0.7, expert=None):
    """Generate response from the model."""
    if model is None or tokenizer is None:
        return "⚠️ Model not loaded. Please check the logs for errors."

    try:
        # Determine expert if not provided
        if expert is None:
            expert = classify_expert(prompt)

        # Create expert-specific prompt
        system_prompt = f"You are an AI assistant specialized in {expert}. "
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"

        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt")
        if ON_GPU:
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens so the prompt is not echoed back
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        return response

    except Exception as e:
        return f"⚠️ Generation failed: {str(e)}"


def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="Phi-3.5-MoE Expert Assistant") as demo:
        gr.Markdown("# 🤖 Phi-3.5-MoE Expert Assistant")
        gr.Markdown(f"**Environment:** {'GPU' if ON_GPU else 'CPU'} | **Model:** {MODEL_ID}")

        if model is None:
            gr.Markdown("⚠️ **Model failed to load. Limited functionality available.**")

        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything...",
                    lines=4
                )

                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50,
                        maximum=1024,
                        value=512,
                        step=50,
                        label="Max Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature"
                    )

                expert = gr.Dropdown(
                    choices=list(EXPERT_CATEGORIES.keys()),
                    value=None,
                    label="Expert (Optional)",
                    allow_custom_value=False
                )

                generate_btn = gr.Button("Generate Response", variant="primary")

            with gr.Column(scale=2):
                response = gr.Textbox(
                    label="Response",
                    lines=10,
                    interactive=False
                )

        # Example prompts
        gr.Examples(
            examples=[
                ["Explain quantum computing in simple terms", None],
                ["Write a Python function to calculate fibonacci numbers", "Code"],
                ["What are the benefits of renewable energy?", "General"],
                ["How does machine learning work?", "Reasoning"],
                ["Translate 'Hello, how are you?' to Spanish", "Multilingual"],
                ["Solve the equation 3x^2 + 5x - 2 = 0", "Math"]
            ],
            inputs=[prompt, expert]
        )

        # Event handlers
        generate_btn.click(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
        prompt.submit(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )