# AnveshAI-Edge / llm_engine.py
"""
LLM Engine β€” local Qwen2.5-0.5B-Instruct fallback via llama-cpp-python.
This is the bottom layer of the AnveshAI hierarchy:
Math β†’ math_engine (instant, rule-based)
Knowledge β†’ knowledge_engine (keyword retrieval from knowledge.txt)
└─ no match β†’ LLMEngine.generate (Qwen2.5-0.5B)
Conversation β†’ conversation_engine (pattern matching from conversation.txt)
└─ no match β†’ LLMEngine.generate (Qwen2.5-0.5B)
Model: Qwen/Qwen2.5-0.5B-Instruct (Q4_K_M GGUF, ~350 MB)
─ Best-in-class quality at 0.5B parameters
─ Runs entirely on CPU via llama.cpp
─ Downloaded once into ~/.cache/huggingface/ on first use
─ Loaded LAZILY: the model only loads when first needed,
keeping startup instant.
"""
MODEL_REPO = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
MODEL_FILE = "qwen2.5-0.5b-instruct-q4_k_m.gguf"
SYSTEM_PROMPT = (
"You are AnveshAI Edge, a helpful offline AI assistant. "
"Answer questions thoroughly and completely. Show full working steps "
"for math or technical questions. Do not repeat the question back. "
"If you are unsure about something, say so clearly."
)
MATH_SYSTEM_PROMPT = (
"You are a mathematics tutor. "
"You will be given a VERIFIED ANSWER computed by a symbolic engine. "
"That answer is 100% correct β€” do NOT change it, do NOT recompute it. "
"Your ONLY job is to explain, step by step, HOW a student would work through "
"the problem and arrive at that exact answer. "
"Every step must lead logically toward the verified answer. "
"State the verified answer word-for-word at the end of your explanation."
)
MAX_TOKENS = 1024 # enough for detailed explanations and step-by-step answers
TEMPERATURE = 0.7
MATH_TEMPERATURE = 0.1 # near-deterministic for math explanations
TOP_P = 0.9
N_CTX = 16384 # match model's trained context (supports up to 32768)
class LLMEngine:
"""
Lazy-loading wrapper around Qwen2.5-0.5B-Instruct (GGUF via llama.cpp).
Usage:
engine = LLMEngine()
response = engine.generate("What is photosynthesis?")
The GGUF model is downloaded from HuggingFace on the first call to
generate() and cached locally. Every subsequent call reuses the
in-memory model β€” no re-loading.
"""
def __init__(self) -> None:
self._llm = None
self._loaded: bool = False
self._failed: bool = False
self._fail_reason: str = ""
def is_available(self) -> bool:
"""True once the model has loaded without error."""
return self._loaded and not self._failed
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _load(self) -> None:
"""Download (first run only) and load the GGUF model into memory."""
if self._loaded or self._failed:
return
try:
print(
f"\n [LLM] Loading {MODEL_FILE} … "
"(first run downloads ~350 MB, then cached locally)",
flush=True,
)
from llama_cpp import Llama
self._llm = Llama.from_pretrained(
repo_id=MODEL_REPO,
filename=MODEL_FILE,
n_ctx=N_CTX,
n_threads=4, # use up to 4 CPU threads
verbose=False,
)
self._loaded = True
print(" [LLM] Qwen2.5-0.5B-Instruct ready\n", flush=True)
except Exception as exc:
self._failed = True
self._fail_reason = str(exc)
print(f" [LLM] Failed to load: {exc}\n", flush=True)
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
def generate(
self,
user_input: str,
context: str = "",
system_prompt: str = "",
temperature: float = TEMPERATURE,
) -> str:
"""
Generate a response using the local LLM.
Args:
user_input : The user's message or question.
context : Optional retrieved text to inject as background.
system_prompt : Override the default system prompt (e.g. for math).
temperature : Sampling temperature; use low values for math.
Returns:
The model's reply as a plain string.
"""
self._load()
if self._failed:
return (
"The local LLM is currently unavailable "
f"({self._fail_reason}). "
"Ensure 'llama-cpp-python' is installed and the model "
"could be downloaded."
)
try:
system_content = system_prompt if system_prompt else SYSTEM_PROMPT
if context:
system_content += f"\n\nRelevant background:\n{context}"
messages = [
{"role": "system", "content": system_content},
{"role": "user", "content": user_input},
]
output = self._llm.create_chat_completion(
messages=messages,
max_tokens=MAX_TOKENS,
temperature=temperature,
top_p=TOP_P,
)
response: str = output["choices"][0]["message"]["content"]
return response.strip()
except Exception as exc:
return f"LLM generation error: {exc}"