# AnveshAI-Edge / knowledge_engine.py
# (repository upload metadata: developeranveshraman, revision 5d8fd4f)
"""
Knowledge Engine — retrieves relevant information from a local knowledge base.
How it works:
1. Loads 'knowledge.txt' at startup (one paragraph per blank-line block).
2. For a given query, scores each paragraph using keyword overlap.
3. Returns the highest-scoring paragraph + a boolean indicating confidence.
If confidence is low, the caller (main.py) will escalate to the LLM.
This is intentionally lightweight and fully offline. In the future it can be
swapped for a vector-based retrieval system (FAISS + sentence-transformers)
without changing the rest of the architecture.
"""
import os
import re
from typing import List, Tuple

# Path to the knowledge base, resolved relative to this module's location
# so it works regardless of the current working directory.
KNOWLEDGE_FILE = os.path.join(os.path.dirname(__file__), "knowledge.txt")

# A paragraph must score at least this much to be considered a real match.
# Queries below this score are escalated to the LLM fallback.
MIN_RELEVANCE_SCORE = 2

# Words carrying little topical meaning; _tokenize() filters these out
# before scoring.  Fix: the original literal listed "me" and "about"
# twice — duplicates removed (a set dedupes anyway, so behavior is
# unchanged; this is purely hygiene).
STOP_WORDS = {
    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "shall",
    "should", "may", "might", "must", "can", "could", "to", "of", "in",
    "on", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "from",
    "up", "down", "out", "off", "over", "under", "again", "and", "but",
    "or", "nor", "so", "yet", "both", "either", "neither", "not", "no",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    "i", "me", "my", "myself", "we", "our", "you", "your", "he", "she",
    "it", "they", "them", "their", "tell", "explain", "describe", "give",
    "some", "information",
}
def _load_paragraphs(filepath: str) -> List[str]:
if not os.path.exists(filepath):
return []
with open(filepath, "r", encoding="utf-8") as f:
content = f.read()
raw = re.split(r"\n\s*\n", content.strip())
return [p.strip() for p in raw if p.strip()]
def _tokenize(text: str) -> List[str]:
    """Lowercase *text* and return its content-bearing keywords.

    Keeps only purely alphabetic words longer than two characters that
    are not listed in STOP_WORDS; everything else (digits, punctuation,
    short words) is dropped.
    """
    keywords: List[str] = []
    for word in re.findall(r"\b[a-z]+\b", text.lower()):
        if len(word) > 2 and word not in STOP_WORDS:
            keywords.append(word)
    return keywords
def _score_paragraph(query_tokens: List[str], paragraph: str) -> int:
para_lower = paragraph.lower()
score = 0
for token in query_tokens:
if re.search(r"\b" + re.escape(token) + r"\b", para_lower):
score += 2
elif token in para_lower:
score += 1
return score
def _strip_knowledge_prefixes(text: str) -> str:
prefixes = [
"what is", "what are", "who is", "who are", "explain", "define",
"tell me about", "describe", "how does", "why is", "when was",
"where is", "history of", "meaning of", "knowledge:", "knowledge :",
"learn about", "facts about", "information about",
]
lowered = text.lower().strip()
for prefix in prefixes:
if lowered.startswith(prefix):
return text[len(prefix):].strip()
return text
class KnowledgeEngine:
    """Keyword-scored retrieval over a local knowledge.txt file."""

    def __init__(self, knowledge_file: str = KNOWLEDGE_FILE):
        # Paragraphs are read once at construction time; an absent or
        # empty file leaves the engine in an "unloaded" state.
        self.paragraphs: List[str] = _load_paragraphs(knowledge_file)
        self._loaded = bool(self.paragraphs)

    def is_loaded(self) -> bool:
        """True when at least one paragraph was loaded from disk."""
        return self._loaded

    def query(self, user_input: str) -> Tuple[str, bool]:
        """
        Find the most relevant paragraph for the given query.
        Returns:
            (response, found)
            found = True  → a high-confidence match was found in the KB
            found = False → no confident match; caller should try the LLM
        """
        if not self._loaded:
            return (
                "Knowledge base unavailable. Ensure 'knowledge.txt' exists.",
                False,
            )
        tokens = _tokenize(_strip_knowledge_prefixes(user_input))
        if not tokens:
            return ("Could you rephrase? I couldn't parse the query.", False)
        # Manual arg-max keeping the FIRST best paragraph on ties, which
        # matches max() over (score, para) pairs keyed on score.
        best_para = ""
        best_score = -1
        for para in self.paragraphs:
            score = _score_paragraph(tokens, para)
            if score > best_score:
                best_score = score
                best_para = para
        if best_score >= MIN_RELEVANCE_SCORE:
            return (best_para, True)
        # Signal to caller: escalate to LLM
        return ("", False)