# AnveshAI-Edge / llm_engine.py
"""
LLM Engine β€” local Qwen2.5-0.5B-Instruct fallback via llama-cpp-python.
This is the bottom layer of the AnveshAI hierarchy:
Math β†’ math_engine (instant, rule-based)
Knowledge β†’ knowledge_engine (keyword retrieval from knowledge.txt)
└─ no match β†’ LLMEngine.generate (Qwen2.5-0.5B)
Conversation β†’ conversation_engine (pattern matching from conversation.txt)
└─ no match β†’ LLMEngine.generate (Qwen2.5-0.5B)
Model: Qwen/Qwen2.5-0.5B-Instruct (Q4_K_M GGUF, ~350 MB)
─ Best-in-class quality at 0.5B parameters
─ Runs entirely on CPU via llama.cpp
─ Downloaded once into ~/.cache/huggingface/ on first use
─ Loaded LAZILY: the model only loads when first needed,
keeping startup instant.
"""
MODEL_REPO = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
MODEL_FILE = "qwen2.5-0.5b-instruct-q4_k_m.gguf"
SYSTEM_PROMPT = (
"You are AnveshAI Edge, a helpful offline AI assistant. "
"Answer questions thoroughly and completely. Show full working steps "
"for math or technical questions. Do not repeat the question back. "
"If you are unsure about something, say so clearly."
)
MATH_SYSTEM_PROMPT = (
"You are a mathematics tutor. "
"You will be given a VERIFIED ANSWER computed by a symbolic engine. "
"That answer is 100% correct β€” do NOT change it, do NOT recompute it. "
"Your ONLY job is to explain, step by step, HOW a student would work through "
"the problem and arrive at that exact answer. "
"Every step must lead logically toward the verified answer. "
"State the verified answer word-for-word at the end of your explanation."
)
MAX_TOKENS = 1024 # enough for detailed explanations and step-by-step answers
TEMPERATURE = 0.7
MATH_TEMPERATURE = 0.1 # near-deterministic for math explanations
TOP_P = 0.9
N_CTX = 16384 # match model's trained context (supports up to 32768)
class LLMEngine:
"""
Lazy-loading wrapper around Qwen2.5-0.5B-Instruct (GGUF via llama.cpp).
Usage:
engine = LLMEngine()
response = engine.generate("What is photosynthesis?")
The GGUF model is downloaded from HuggingFace on the first call to
generate() and cached locally. Every subsequent call reuses the
in-memory model β€” no re-loading.
"""
def __init__(self) -> None:
self._llm = None
self._loaded: bool = False
self._failed: bool = False
self._fail_reason: str = ""
def is_available(self) -> bool:
"""True once the model has loaded without error."""
return self._loaded and not self._failed
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _load(self) -> None:
"""Download (first run only) and load the GGUF model into memory."""
if self._loaded or self._failed:
return
try:
print(
f"\n [LLM] Loading {MODEL_FILE} … "
"(first run downloads ~350 MB, then cached locally)",
flush=True,
)
from llama_cpp import Llama
self._llm = Llama.from_pretrained(
repo_id=MODEL_REPO,
filename=MODEL_FILE,
n_ctx=N_CTX,
n_threads=4, # use up to 4 CPU threads
verbose=False,
)
self._loaded = True
print(" [LLM] Qwen2.5-0.5B-Instruct ready\n", flush=True)
except Exception as exc:
self._failed = True
self._fail_reason = str(exc)
print(f" [LLM] Failed to load: {exc}\n", flush=True)
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
def generate(
self,
user_input: str,
context: str = "",
system_prompt: str = "",
temperature: float = TEMPERATURE,
) -> str:
"""
Generate a response using the local LLM.
Args:
user_input : The user's message or question.
context : Optional retrieved text to inject as background.
system_prompt : Override the default system prompt (e.g. for math).
temperature : Sampling temperature; use low values for math.
Returns:
The model's reply as a plain string.
"""
self._load()
if self._failed:
return (
"The local LLM is currently unavailable "
f"({self._fail_reason}). "
"Ensure 'llama-cpp-python' is installed and the model "
"could be downloaded."
)
try:
system_content = system_prompt if system_prompt else SYSTEM_PROMPT
if context:
system_content += f"\n\nRelevant background:\n{context}"
messages = [
{"role": "system", "content": system_content},
{"role": "user", "content": user_input},
]
output = self._llm.create_chat_completion(
messages=messages,
max_tokens=MAX_TOKENS,
temperature=temperature,
top_p=TOP_P,
)
response: str = output["choices"][0]["message"]["content"]
return response.strip()
except Exception as exc:
return f"LLM generation error: {exc}"