#!/usr/bin/env python3
"""
Phi-3.5-MoE Expert Assistant
Robust application with CPU/GPU environment detection and dependency handling
"""
import os
import sys
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Apply the model patch if available
try:
    import model_patch
    print("✅ Applied model patch for handling missing dependencies")
except ImportError:
    print("ℹ️ Model patch not found, continuing without it")

# Environment detection
ON_GPU = torch.cuda.is_available()
MODEL_ID = os.getenv("HF_MODEL_ID", "microsoft/Phi-3.5-MoE-instruct")
REVISION = os.getenv("HF_REVISION")

# Configuration based on environment
if ON_GPU:
    attn_impl = "sdpa"       # Fast attention for GPU
    dtype = torch.bfloat16   # Mixed precision for GPU
    device_map = "auto"      # Auto device mapping for GPU
    low_cpu_mem = False      # Don't need low memory usage on GPU
else:
    attn_impl = "eager"      # Standard attention for CPU
    dtype = torch.float32    # Full precision for CPU
    device_map = "cpu"       # Force CPU device
    low_cpu_mem = True       # Enable low memory usage on CPU

print(f"🚀 Loading model: {MODEL_ID}")
print(f"🔧 Environment: {'GPU' if ON_GPU else 'CPU'}")
print(f"📋 Configuration: attn={attn_impl}, dtype={dtype}, device={device_map}, revision={REVISION}")

# Expert categories for query classification
EXPERT_CATEGORIES = {
    "Code": ["programming", "software", "development", "coding", "algorithm", "python", "javascript", "java", "function", "code", "debug", "api", "framework", "library", "class", "method", "variable"],
    "Math": ["mathematics", "calculation", "equation", "formula", "statistics", "derivative", "integral", "algebra", "calculus", "math", "solve", "calculate", "probability", "geometry", "trigonometry"],
    "Reasoning": ["logic", "analysis", "reasoning", "problem-solving", "critical", "explain", "why", "how", "because", "analyze", "evaluate", "compare", "contrast", "deduce", "infer"],
    "Multilingual": ["translation", "language", "multilingual", "localization", "translate", "spanish", "french", "german", "chinese", "japanese", "korean", "arabic", "russian", "portuguese"],
    "General": ["general", "conversation", "assistance", "help", "hello", "hi", "what", "who", "when", "where", "tell", "describe", "explain"]
}
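# classify_expert() below counts keyword hits per category; the winning category
# name is injected into the system prompt built in generate_response().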

# Load model with robust error handling
model = None
tokenizer = None
try:
    # Load tokenizer
    print("📝 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION
    )
    # Load model with environment-specific settings
    print("🧠 Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION,
        attn_implementation=attn_impl,
        dtype=dtype,  # newer transformers releases accept `dtype` here; older ones expect `torch_dtype`
        device_map=device_map,
        low_cpu_mem_usage=low_cpu_mem
    ).eval()
    print("✅ Model loaded successfully!")
    # Verify model works with a simple generation
    print("🔍 Running quick model test...")
    test_input = tokenizer("Hello, I am", return_tensors="pt").to(device_map if device_map != "auto" else model.device)
    with torch.no_grad():
        test_output = model.generate(**test_input, max_new_tokens=5)
    print("✅ Model test successful!")
except Exception as e:
    print(f"⚠️ Model loading failed: {e}")
    print("⚠️ Continuing with limited functionality")

def classify_expert(query):
    """Classify query to determine which expert should handle it."""
    query_lower = query.lower()
    scores = {}
    for expert, keywords in EXPERT_CATEGORIES.items():
        score = sum(1 for keyword in keywords if keyword in query_lower)
        scores[expert] = score
    # Pick the expert with the highest score (the first one listed wins a tie);
    # fall back to General when no keyword matches at all
    max_score = max(scores.values()) if scores else 0
    if max_score > 0:
        experts = [expert for expert, score in scores.items() if score == max_score]
        return experts[0]
    return "General"
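
# Example: classify_expert("Debug this Python function") returns "Code"
# (keyword hits: "debug", "python", "function"). Matching is substring-based,
# so short keywords such as "hi" can also fire inside longer words like "this".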

def generate_response(prompt, max_tokens=512, temperature=0.7, expert=None):
    """Generate response from the model."""
    if model is None or tokenizer is None:
        return "⚠️ Model not loaded. Please check the logs for errors."
    try:
        # Determine expert if not provided
        if expert is None:
            expert = classify_expert(prompt)
        # Create expert-specific prompt
        system_prompt = f"You are an AI assistant specialized in {expert}. "
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"
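        # NOTE: a plain "User:/Assistant:" prompt is used here; Phi-3.5 instruct models also
        # ship a chat template, so tokenizer.apply_chat_template() could be used instead.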
        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt")
        if ON_GPU:
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        # Decode response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Remove the input prompt from the response
        response = response[len(full_prompt):].strip()
        return response
    except Exception as e:
        return f"⚠️ Generation failed: {str(e)}"

def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="Phi-3.5-MoE Expert Assistant") as demo:
        gr.Markdown("# 🤖 Phi-3.5-MoE Expert Assistant")
        gr.Markdown(f"**Environment:** {'GPU' if ON_GPU else 'CPU'} | **Model:** {MODEL_ID}")
        if model is None:
            gr.Markdown("⚠️ **Model failed to load. Limited functionality available.**")
        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything...",
                    lines=4
                )
                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50, maximum=1024, value=512, step=50,
                        label="Max Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                        label="Temperature"
                    )
                expert = gr.Dropdown(
                    choices=list(EXPERT_CATEGORIES.keys()),
                    value=None,
                    label="Expert (Optional)",
                    allow_custom_value=False
                )
                generate_btn = gr.Button("Generate Response", variant="primary")
            with gr.Column(scale=2):
                response = gr.Textbox(
                    label="Response",
                    lines=10,
                    interactive=False
                )
        # Example prompts
        gr.Examples(
            examples=[
                ["Explain quantum computing in simple terms", None],
                ["Write a Python function to calculate fibonacci numbers", "Code"],
                ["What are the benefits of renewable energy?", "General"],
                ["How does machine learning work?", "Reasoning"],
                ["Translate 'Hello, how are you?' to Spanish", "Multilingual"],
                ["Solve the equation 3x^2 + 5x - 2 = 0", "Math"]
            ],
            inputs=[prompt, expert]
        )
        # Event handlers
        generate_btn.click(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
        prompt.submit(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )