import os
import logging
import asyncio

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

logger = logging.getLogger("nexari.chat")

BASE_DIR = "./models/chat"
model = None

# === OPTIMIZED: Llama 3.2 3B (Q4_K_M) ===
# Q4_K_M quantization significantly reduces memory bandwidth pressure on the CPU.
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"


def load_model(local_dir: str = None):
    global model
    if not local_dir:
        local_dir = BASE_DIR
    try:
        os.makedirs(local_dir, exist_ok=True)
        path = os.path.join(local_dir, FILENAME)
        # Download the GGUF file only if it is not already cached locally.
        if not os.path.exists(path):
            logger.info("⬇️ Downloading Chat Model (Fast Q4)...")
            hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=local_dir)
        model = Llama(
            model_path=path,
            n_ctx=4096,
            n_threads=2,
            n_batch=512,  # Helps process the "Search Results" text block faster
            verbose=False,
        )
        logger.info("✅ Chat Model Ready (Turbo Mode)")
        return model
    except Exception as e:
        logger.error(f"Chat Load Error: {e}")
        model = None
        return None


async def load_model_async():
    # Run the blocking download/load off the event loop.
    return await asyncio.to_thread(load_model)
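
# --- Usage sketch (not part of the original module) ---
# A minimal example of how this loader might be consumed, assuming the
# model is used through llama-cpp-python's create_chat_completion API.
# The demo() coroutine and the example prompt are illustrative
# assumptions, not part of the original file.
async def demo():
    chat = await load_model_async()
    if chat is None:
        raise RuntimeError("Chat model failed to load")
    result = chat.create_chat_completion(
        messages=[{"role": "user", "content": "Summarize llama.cpp in one sentence."}],
        max_tokens=128,
    )
    print(result["choices"][0]["message"]["content"])


if __name__ == "__main__":
    asyncio.run(demo())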