import os
import logging
import asyncio

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

logger = logging.getLogger("nexari.chat")

BASE_DIR = "./models/chat"
model = None

# === OPTIMIZED: Llama 3.2 3B (Q4_K_M) ===
# Q4_K_M quantization significantly reduces memory bandwidth pressure on the CPU.
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"


def load_model(local_dir: str = None):
    global model
    if not local_dir:
        local_dir = BASE_DIR
    try:
        os.makedirs(local_dir, exist_ok=True)
        path = os.path.join(local_dir, FILENAME)
        # Download the GGUF file only if it is not already cached locally.
        if not os.path.exists(path):
            logger.info("⬇️ Downloading Chat Model (Fast Q4)...")
            hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=local_dir)
        model = Llama(
            model_path=path,
            n_ctx=4096,
            n_threads=2,
            n_batch=512,  # Helps process the "Search Results" text block faster
            verbose=False,
        )
        logger.info("✅ Chat Model Ready (Turbo Mode)")
        return model
    except Exception as e:
        logger.error(f"Chat Load Error: {e}")
        model = None
        return None


async def load_model_async():
    # Run the blocking download/load off the event loop.
    return await asyncio.to_thread(load_model)
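
# --- Usage sketch (not part of the original module) ---
# A minimal example of how this loader might be consumed, assuming the
# model is used through llama-cpp-python's create_chat_completion API.
# The demo() coroutine and the example prompt are illustrative
# assumptions, not part of the original file.
async def demo():
    chat = await load_model_async()
    if chat is None:
        raise RuntimeError("Chat model failed to load")
    result = chat.create_chat_completion(
        messages=[{"role": "user", "content": "Summarize llama.cpp in one sentence."}],
        max_tokens=128,
    )
    print(result["choices"][0]["message"]["content"])


if __name__ == "__main__":
    asyncio.run(demo())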