#!/usr/bin/env python3
"""
Phi-3.5-MoE Expert Assistant
Robust application with CPU/GPU environment detection and dependency handling
"""
import os
import sys
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Apply the model patch if available
try:
    import model_patch  # imported for its side effects
    print("✅ Applied model patch for handling missing dependencies")
except ImportError:
    print("ℹ️ Model patch not found, continuing without it")
# Environment detection
ON_GPU = torch.cuda.is_available()
MODEL_ID = os.getenv("HF_MODEL_ID", "microsoft/Phi-3.5-MoE-instruct")
REVISION = os.getenv("HF_REVISION")
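
# Example override (any valid Hub repo id / git revision should work, e.g.):
#   HF_MODEL_ID=microsoft/Phi-3.5-mini-instruct HF_REVISION=main python app.py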
# Configuration based on environment
if ON_GPU:
    attn_impl = "sdpa"       # Fast attention for GPU
    dtype = torch.bfloat16   # Mixed precision for GPU
    device_map = "auto"      # Auto device mapping for GPU
    low_cpu_mem = False      # Don't need low memory usage on GPU
else:
    attn_impl = "eager"      # Standard attention for CPU
    dtype = torch.float32    # Full precision for CPU
    device_map = "cpu"       # Force CPU device
    low_cpu_mem = True       # Enable low memory usage on CPU
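
# Optional alternative (illustrative, not used below): on a memory-constrained
# GPU, 4-bit quantization via bitsandbytes can shrink the footprint of a large
# MoE checkpoint considerably. This assumes the `bitsandbytes` package is
# installed; it is not part of this app's tested path.
#
#   from transformers import BitsAndBytesConfig
#   quant_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_compute_dtype=torch.bfloat16,
#   )
#   # ...then pass quantization_config=quant_config to from_pretrained()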
print(f"πŸš€ Loading model: {MODEL_ID}")
print(f"πŸ”§ Environment: {'GPU' if ON_GPU else 'CPU'}")
print(f"πŸ“Š Configuration: attn={attn_impl}, dtype={dtype}, device={device_map}, revision={REVISION}")
# Expert categories for query classification
EXPERT_CATEGORIES = {
    "Code": ["programming", "software", "development", "coding", "algorithm", "python", "javascript", "java", "function", "code", "debug", "api", "framework", "library", "class", "method", "variable"],
    "Math": ["mathematics", "calculation", "equation", "formula", "statistics", "derivative", "integral", "algebra", "calculus", "math", "solve", "calculate", "probability", "geometry", "trigonometry"],
    "Reasoning": ["logic", "analysis", "reasoning", "problem-solving", "critical", "explain", "why", "how", "because", "analyze", "evaluate", "compare", "contrast", "deduce", "infer"],
    "Multilingual": ["translation", "language", "multilingual", "localization", "translate", "spanish", "french", "german", "chinese", "japanese", "korean", "arabic", "russian", "portuguese"],
    "General": ["general", "conversation", "assistance", "help", "hello", "hi", "what", "who", "when", "where", "tell", "describe", "explain"]
}
# Load model with robust error handling
model = None
tokenizer = None
try:
    # Load tokenizer
    print("📝 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION
    )

    # Load model with environment-specific settings
    print("🧠 Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION,
        attn_implementation=attn_impl,
        dtype=dtype,  # recent transformers versions accept `dtype` in place of the older `torch_dtype`
        device_map=device_map,
        low_cpu_mem_usage=low_cpu_mem
    ).eval()
    print("✅ Model loaded successfully!")

    # Verify the model works with a simple generation.
    # model.device resolves correctly for both device_map="cpu" and "auto".
    print("🔍 Running quick model test...")
    test_input = tokenizer("Hello, I am", return_tensors="pt").to(model.device)
    with torch.no_grad():
        test_output = model.generate(**test_input, max_new_tokens=5)
    print("✅ Model test successful!")
except Exception as e:
    print(f"⚠️ Model loading failed: {e}")
    print("⚠️ Continuing with limited functionality")
def classify_expert(query):
    """Classify a query to determine which expert should handle it."""
    query_lower = query.lower()
    scores = {}
    for expert, keywords in EXPERT_CATEGORIES.items():
        score = sum(1 for keyword in keywords if keyword in query_lower)
        scores[expert] = score

    # Return the first expert with the highest score (ties resolve in
    # dictionary order); fall back to "General" when no keywords match.
    max_score = max(scores.values()) if scores else 0
    if max_score > 0:
        experts = [expert for expert, score in scores.items() if score == max_score]
        return experts[0]
    return "General"
def generate_response(prompt, max_tokens=512, temperature=0.7, expert=None):
    """Generate a response from the model."""
    if model is None or tokenizer is None:
        return "⚠️ Model not loaded. Please check the logs for errors."
    try:
        # Determine the expert if not provided
        if expert is None:
            expert = classify_expert(prompt)

        # Create an expert-specific prompt
        system_prompt = f"You are an AI assistant specialized in {expert}. "
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"

        # Tokenize the input and move it to the model's device on GPU
        inputs = tokenizer(full_prompt, return_tensors="pt")
        if ON_GPU:
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens; slicing the decoded string
        # by len(full_prompt) is fragile because detokenization does not
        # round-trip the prompt exactly.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
        return response
    except Exception as e:
        return f"⚠️ Generation failed: {str(e)}"
def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="Phi-3.5-MoE Expert Assistant") as demo:
        gr.Markdown("# 🤖 Phi-3.5-MoE Expert Assistant")
        gr.Markdown(f"**Environment:** {'GPU' if ON_GPU else 'CPU'} | **Model:** {MODEL_ID}")
        if model is None:
            gr.Markdown("⚠️ **Model failed to load. Limited functionality available.**")

        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything...",
                    lines=4
                )
                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50, maximum=1024, value=512, step=50,
                        label="Max Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                        label="Temperature"
                    )
                expert = gr.Dropdown(
                    choices=list(EXPERT_CATEGORIES.keys()),
                    value=None,
                    label="Expert (Optional)",
                    allow_custom_value=False
                )
                generate_btn = gr.Button("Generate Response", variant="primary")
            with gr.Column(scale=2):
                response = gr.Textbox(
                    label="Response",
                    lines=10,
                    interactive=False
                )

        # Example prompts
        gr.Examples(
            examples=[
                ["Explain quantum computing in simple terms", None],
                ["Write a Python function to calculate fibonacci numbers", "Code"],
                ["What are the benefits of renewable energy?", "General"],
                ["How does machine learning work?", "Reasoning"],
                ["Translate 'Hello, how are you?' to Spanish", "Multilingual"],
                ["Solve the equation 3x^2 + 5x - 2 = 0", "Math"]
            ],
            inputs=[prompt, expert]
        )

        # Event handlers
        generate_btn.click(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
        prompt.submit(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
    return demo
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
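
# Local run sketch: `python app.py` serves the UI at http://localhost:7860;
# Hugging Face Spaces exposes the same port (7860) by default for Gradio apps.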