#!/usr/bin/env python3
"""
Phi-3.5-MoE Expert Assistant
Robust application with CPU/GPU environment detection and dependency handling
"""
import os
import sys
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Apply the model patch if available
try:
    import model_patch  # imported for its side effects
    print("✅ Applied model patch for handling missing dependencies")
except ImportError:
    print("ℹ️ Model patch not found, continuing without it")
# Environment detection
ON_GPU = torch.cuda.is_available()
MODEL_ID = os.getenv("HF_MODEL_ID", "microsoft/Phi-3.5-MoE-instruct")
REVISION = os.getenv("HF_REVISION")
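
# Example override (any valid Hub repo id / git revision should work, e.g.):
#   HF_MODEL_ID=microsoft/Phi-3.5-mini-instruct HF_REVISION=main python app.py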
# Configuration based on environment
if ON_GPU:
    attn_impl = "sdpa"       # Fast attention for GPU
    dtype = torch.bfloat16   # Mixed precision for GPU
    device_map = "auto"      # Auto device mapping for GPU
    low_cpu_mem = False      # Don't need low memory usage on GPU
else:
    attn_impl = "eager"      # Standard attention for CPU
    dtype = torch.float32    # Full precision for CPU
    device_map = "cpu"       # Force CPU device
    low_cpu_mem = True       # Enable low memory usage on CPU
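
# Optional alternative (illustrative, not used below): on a memory-constrained
# GPU, 4-bit quantization via bitsandbytes can shrink the footprint of a large
# MoE checkpoint considerably. This assumes the `bitsandbytes` package is
# installed; it is not part of this app's tested path.
#
#   from transformers import BitsAndBytesConfig
#   quant_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_compute_dtype=torch.bfloat16,
#   )
#   # ...then pass quantization_config=quant_config to from_pretrained()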
print(f"πŸš€ Loading model: {MODEL_ID}")
print(f"πŸ”§ Environment: {'GPU' if ON_GPU else 'CPU'}")
print(f"πŸ“Š Configuration: attn={attn_impl}, dtype={dtype}, device={device_map}, revision={REVISION}")
# Expert categories for query classification
EXPERT_CATEGORIES = {
    "Code": ["programming", "software", "development", "coding", "algorithm", "python", "javascript", "java", "function", "code", "debug", "api", "framework", "library", "class", "method", "variable"],
    "Math": ["mathematics", "calculation", "equation", "formula", "statistics", "derivative", "integral", "algebra", "calculus", "math", "solve", "calculate", "probability", "geometry", "trigonometry"],
    "Reasoning": ["logic", "analysis", "reasoning", "problem-solving", "critical", "explain", "why", "how", "because", "analyze", "evaluate", "compare", "contrast", "deduce", "infer"],
    "Multilingual": ["translation", "language", "multilingual", "localization", "translate", "spanish", "french", "german", "chinese", "japanese", "korean", "arabic", "russian", "portuguese"],
    "General": ["general", "conversation", "assistance", "help", "hello", "hi", "what", "who", "when", "where", "tell", "describe", "explain"]
}
# Load model with robust error handling
model = None
tokenizer = None
try:
    # Load tokenizer
    print("📝 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION
    )

    # Load model with environment-specific settings
    print("🧠 Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION,
        attn_implementation=attn_impl,
        dtype=dtype,  # recent transformers versions accept `dtype` in place of the older `torch_dtype`
        device_map=device_map,
        low_cpu_mem_usage=low_cpu_mem
    ).eval()
    print("✅ Model loaded successfully!")

    # Verify the model works with a simple generation.
    # model.device resolves correctly for both device_map="cpu" and "auto".
    print("🔍 Running quick model test...")
    test_input = tokenizer("Hello, I am", return_tensors="pt").to(model.device)
    with torch.no_grad():
        test_output = model.generate(**test_input, max_new_tokens=5)
    print("✅ Model test successful!")
except Exception as e:
    print(f"⚠️ Model loading failed: {e}")
    print("⚠️ Continuing with limited functionality")
def classify_expert(query):
    """Classify a query to determine which expert should handle it."""
    query_lower = query.lower()
    scores = {}
    for expert, keywords in EXPERT_CATEGORIES.items():
        score = sum(1 for keyword in keywords if keyword in query_lower)
        scores[expert] = score

    # Return the first expert with the highest score (ties resolve in
    # dictionary order); fall back to "General" when no keywords match.
    max_score = max(scores.values()) if scores else 0
    if max_score > 0:
        experts = [expert for expert, score in scores.items() if score == max_score]
        return experts[0]
    return "General"
def generate_response(prompt, max_tokens=512, temperature=0.7, expert=None):
    """Generate a response from the model."""
    if model is None or tokenizer is None:
        return "⚠️ Model not loaded. Please check the logs for errors."
    try:
        # Determine the expert if not provided
        if expert is None:
            expert = classify_expert(prompt)

        # Create an expert-specific prompt
        system_prompt = f"You are an AI assistant specialized in {expert}. "
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"

        # Tokenize the input and move it to the model's device on GPU
        inputs = tokenizer(full_prompt, return_tensors="pt")
        if ON_GPU:
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens; slicing the decoded string
        # by len(full_prompt) is fragile because detokenization does not
        # round-trip the prompt exactly.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
        return response
    except Exception as e:
        return f"⚠️ Generation failed: {str(e)}"
def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="Phi-3.5-MoE Expert Assistant") as demo:
        gr.Markdown("# 🤖 Phi-3.5-MoE Expert Assistant")
        gr.Markdown(f"**Environment:** {'GPU' if ON_GPU else 'CPU'} | **Model:** {MODEL_ID}")
        if model is None:
            gr.Markdown("⚠️ **Model failed to load. Limited functionality available.**")

        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything...",
                    lines=4
                )
                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50, maximum=1024, value=512, step=50,
                        label="Max Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                        label="Temperature"
                    )
                expert = gr.Dropdown(
                    choices=list(EXPERT_CATEGORIES.keys()),
                    value=None,
                    label="Expert (Optional)",
                    allow_custom_value=False
                )
                generate_btn = gr.Button("Generate Response", variant="primary")
            with gr.Column(scale=2):
                response = gr.Textbox(
                    label="Response",
                    lines=10,
                    interactive=False
                )

        # Example prompts
        gr.Examples(
            examples=[
                ["Explain quantum computing in simple terms", None],
                ["Write a Python function to calculate fibonacci numbers", "Code"],
                ["What are the benefits of renewable energy?", "General"],
                ["How does machine learning work?", "Reasoning"],
                ["Translate 'Hello, how are you?' to Spanish", "Multilingual"],
                ["Solve the equation 3x^2 + 5x - 2 = 0", "Math"]
            ],
            inputs=[prompt, expert]
        )

        # Event handlers
        generate_btn.click(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
        prompt.submit(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
    return demo
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
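
# Local run sketch: `python app.py` serves the UI at http://localhost:7860;
# Hugging Face Spaces exposes the same port (7860) by default for Gradio apps.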