"""LLM handlers: OpenRouter chat completions and a local Hugging Face Phi-3 Mini model."""

import logging
import os
from typing import Dict, List, Optional

import requests
import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM

from config.settings import Settings

logger = logging.getLogger(__name__)

load_dotenv()


class OpenRouterLLMHandler:
    """Chat handler backed by the OpenRouter chat completions API."""

    def __init__(self, api_key: str = "", model: str = "mistralai/mistral-7b-instruct"):
        # Prefer the key from the environment (.env); fall back to the explicit argument.
        self.api_key = os.getenv("OPENROUTER_API_KEY") or api_key
        self.model = model or "mistralai/mistral-7b-instruct"
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.conversation_history: List[Dict[str, str]] = []
        print(f"🔌 Initialized OpenRouter handler with model: {self.model}")

    def generate_response(self, prompt: str, context: Optional[str] = None,
                          tools_output: Optional[str] = None) -> str:
        """Generate a response via the OpenRouter API."""
        try:
            full_prompt = self._build_simple_prompt(prompt, context, tools_output)
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }
            payload = {
                "model": self.model,
                "messages": [
                    {"role": "system", "content": "You are a helpful AI assistant."},
                    {"role": "user", "content": full_prompt}
                ],
                "temperature": 0.7,
                "max_tokens": 200
            }
            response = requests.post(self.base_url, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"].strip()
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"Error generating response: {str(e)}"

    def _build_simple_prompt(self, user_input: str, context: Optional[str] = None,
                             tools_output: Optional[str] = None) -> str:
        """Assemble a compact prompt from the query plus optional context and tool output."""
        prompt_parts = []
        if context and len(context) < 300:
            prompt_parts.append(f"Context: {context}")
        if tools_output and len(tools_output) < 200:
            prompt_parts.append(f"Additional info: {tools_output}")
        prompt_parts.append(f"User query: {user_input}")
        return "\n\n".join(prompt_parts)

    def add_to_history(self, user_input: str, assistant_response: str):
        """Add an exchange to the conversation history."""
        self.conversation_history.append({
            'user': user_input,
            'assistant': assistant_response
        })
        # Keep only recent history
        if len(self.conversation_history) > Settings.MAX_CONVERSATION_HISTORY:
            self.conversation_history = self.conversation_history[-Settings.MAX_CONVERSATION_HISTORY:]

    def clear_history(self):
        """Clear the conversation history."""
        self.conversation_history = []

    def get_available_models(self) -> List[str]:
        """List available Ollama models. Requires an Ollama client at self.client,
        which this handler does not create; on failure it returns the default model."""
        try:
            models = self.client.list()
            return [model['name'] for model in models['models']]
        except Exception as e:
            logger.error(f"Error getting models: {e}")
            return [Settings.DEFAULT_MODEL]

    def switch_model(self, model_name: str) -> bool:
        """Switch to a different model."""
        try:
            self.model = model_name
            logger.info(f"Switched to model: {model_name}")
            return True
        except Exception as e:
            logger.error(f"Error switching to model {model_name}: {e}")
            return False

    def generate_embedding(self, text: str) -> List[float]:
        """Generate embeddings for text using Ollama. Requires an Ollama client at
        self.client, which this handler does not create; returns [] on failure."""
        try:
            response = self.client.embeddings(
                model=Settings.EMBEDDING_MODEL,
                prompt=text
            )
            return response['embedding']
        except Exception as e:
logger.error(f"Error generating embedding: {e}") return [] class HuggingFaceLLMHandler: def __init__(self): from transformers import AutoTokenizer, AutoModelForCausalLM import torch import psutil self.model_name = "microsoft/Phi-3-mini-4k-instruct" print("Loading model... this may take a moment on first run") # Choose device and dtype intelligently device = torch.device("cuda" if torch.cuda.is_available() else "cpu") torch_dtype = torch.float16 if device.type == "cuda" else torch.float32 print(f"Using device: {device}, dtype: {torch_dtype}") print(f"Available RAM: {psutil.virtual_memory().available / 1e6:.2f} MB") # Load tokenizer self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, trust_remote_code=True ) # Load model safely try: self.model = AutoModelForCausalLM.from_pretrained( self.model_name, torch_dtype=torch_dtype, device_map="auto" if device.type == "cuda" else None, low_cpu_mem_usage=True, # Helps reduce RAM footprint during init trust_remote_code=True ) # Explicitly move to CPU if needed if device.type == "cpu": self.model = self.model.to(device) print("Model loaded successfully!") except RuntimeError as e: print(f"❌ Error loading model: {e}") print("Tip: Try switching to a smaller model or free up RAM.") def generate_response(self, prompt: str, context: Optional[str] = None, tools_output: Optional[str] = None) -> str: """ Generate response using Phi-3 - should be under 10 seconds """ try: # Build simple prompt full_prompt = self._build_simple_prompt(prompt, context, tools_output) # Tokenize and move to same device as model inputs = self.tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=1024) inputs = {k: v.to(self.model.device) for k, v in inputs.items()} # Generate with torch.no_grad(): outputs = self.model.generate( inputs["input_ids"], max_new_tokens=200, # Limit response length temperature=0.7, do_sample=True, pad_token_id=self.tokenizer.eos_token_id, attention_mask=inputs["attention_mask"] ) # Decode response response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True) return response.strip() except Exception as e: logger.error(f"Error generating response: {e}") return f"Error generating response: {str(e)}" def _build_simple_prompt(self, user_input: str, context: Optional[str] = None, tools_output: Optional[str] = None) -> str: """Simple prompt builder""" prompt_parts = ["You are a helpful AI assistant."] if context and len(context) < 300: prompt_parts.append(f"Context: {context}") if tools_output and len(tools_output) < 200: prompt_parts.append(f"Additional info: {tools_output}") prompt_parts.append(f"User: {user_input}") prompt_parts.append("Assistant:") return "\n\n".join(prompt_parts) def add_to_history(self, user_input: str, assistant_response: str): """ Add exchange to conversation history """ self.conversation_history.append({ 'user': user_input, 'assistant': assistant_response }) # Keep only recent history if len(self.conversation_history) > Settings.MAX_CONVERSATION_HISTORY: self.conversation_history = self.conversation_history[-Settings.MAX_CONVERSATION_HISTORY:] def clear_history(self): """ Clear conversation history """ self.conversation_history = [] def get_available_models(self) -> List[str]: """ Get list of available Ollama models """ try: models = self.client.list() return [model['name'] for model in models['models']] except Exception as e: logger.error(f"Error getting models: {e}") return [Settings.DEFAULT_MODEL] def switch_model(self, model_name: str) -> bool: """ Switch to a 
different model """ try: # Test if model is available self.client.generate(model=model_name, prompt="test", options={'num_predict': 1}) self.model_name = model_name logger.info(f"Switched to model: {model_name}") return True except Exception as e: logger.error(f"Error switching to model {model_name}: {e}") return False def generate_embedding(self, text: str) -> List[float]: """ Generate embeddings for text using Ollama """ try: response = self.client.embeddings( model=Settings.EMBEDDING_MODEL, prompt=text ) return response['embedding'] except Exception as e: logger.error(f"Error generating embedding: {e}") return []