Spaces:
Running
Running
| import os | |
| import logging | |
| import asyncio | |
| from huggingface_hub import hf_hub_download | |
| from llama_cpp import Llama | |
# Module logger; records are emitted under the "nexari.coder" name.
logger = logging.getLogger("nexari.coder")
# Default directory for the model file; load_model() falls back to it when
# no local_dir is supplied.
BASE_DIR = "./models/coder"
# Module-level handle to the loaded model. load_model() assigns the Llama
# instance here on success and resets it to None on failure.
model = None
# === OPTIMIZED FOR 2 vCPU: Qwen 2.5 Coder 3B (Q4_K_M) ===
# TECHNIQUE: use the lighter Q4 quantization (fast) in place of Q6 (heavy).
# Author's note: the quality drop is negligible, while speed improves by ~40%.
# Hugging Face repository and file name passed to hf_hub_download(); FILENAME
# is also the on-disk name load_model() looks for inside the model directory.
REPO_ID = "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF"
FILENAME = "qwen2.5-coder-3b-instruct-q4_k_m.gguf"
def load_model(local_dir: str = None):
    """Download the GGUF model file if it is missing, then load it.

    The loaded instance is also stored in the module-level ``model`` global.

    Args:
        local_dir: Directory that holds (or will receive) the model file.
            Any falsy value (``None`` or ``""``) falls back to ``BASE_DIR``.

    Returns:
        The ``Llama`` instance on success, or ``None`` if any step failed.
        On failure the module-level ``model`` is reset to ``None`` as well.
        Errors are logged rather than raised.
    """
    global model
    local_dir = local_dir or BASE_DIR
    try:
        os.makedirs(local_dir, exist_ok=True)
        path = os.path.join(local_dir, FILENAME)
        # Download (~1.7 GB instead of 2.8 GB) only when the file is not
        # already present on disk.
        if not os.path.exists(path):
            logger.info("⬇️ Downloading Qwen 3B Coder (Fast Q4)...")
            hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=local_dir)
        model = Llama(
            model_path=path,
            n_ctx=8192,
            n_threads=2,
            n_batch=512,  # Batch size increased to ingest prompts faster
            verbose=False,
        )
        logger.info("✅ Coder Model Ready (Qwen 3B - Turbo Mode)")
        return model
    except Exception as e:
        # Deliberately best-effort: callers receive None instead of an
        # exception. logger.exception keeps the same message but also records
        # the traceback, so download/load failures can be diagnosed.
        logger.exception("Coder Load Error: %s", e)
        model = None
        return None
async def load_model_async():
    """Run ``load_model`` with its default arguments in a worker thread.

    Uses ``asyncio.to_thread`` so the blocking load does not run on the
    event loop thread.

    Returns:
        Whatever ``load_model()`` returns.
    """
    loaded = await asyncio.to_thread(load_model)
    return loaded