# memory.py
import re, time, hashlib, os
from collections import defaultdict, deque
from typing import List, Dict
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from google import genai  # google-genai SDK; the client below is configured with the FlashAPI key from the environment
import logging
_LLM_SMALL = "gemini-2.5-flash-lite-preview-06-17"
# Load embedding model
EMBED = SentenceTransformer("/app/model_cache", device="cpu").half()
logger = logging.getLogger("rag-agent")
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
api_key = os.getenv("FlashAPI")
client = genai.Client(api_key=api_key)
class MemoryManager:
def __init__(self, max_users=1000, history_per_user=20, max_chunks=60):
        # STM: recent conversation summaries (topic + summary), up to history_per_user entries
self.stm_summaries = defaultdict(lambda: deque(maxlen=history_per_user)) # deque of {topic,text,vec,timestamp,used}
# Legacy raw cache (kept for compatibility if needed)
self.text_cache = defaultdict(lambda: deque(maxlen=history_per_user))
# LTM: semantic chunk store (approx 3 chunks x 20 rounds)
self.chunk_index = defaultdict(self._new_index) # user_id -> faiss index
        self.chunk_meta = defaultdict(list)  # user_id -> list[{text,tag,vec,timestamp,used}]
self.user_queue = deque(maxlen=max_users) # LRU of users
self.max_chunks = max_chunks # hard cap per user
self.chunk_cache = {} # hash(query+resp) -> [chunks]
# ---------- Public API ----------
def add_exchange(self, user_id: str, query: str, response: str, lang: str = "EN"):
self._touch_user(user_id)
# Keep raw record (optional)
self.text_cache[user_id].append(((query or "").strip(), (response or "").strip()))
if not response: return []
# Avoid re-chunking identical response
cache_key = hashlib.md5((query + response).encode()).hexdigest()
if cache_key in self.chunk_cache:
chunks = self.chunk_cache[cache_key]
else:
chunks = self.chunk_response(response, lang, question=query)
self.chunk_cache[cache_key] = chunks
# Update STM with merging/deduplication
for chunk in chunks:
self._upsert_stm(user_id, chunk, lang)
# Update LTM with merging/deduplication
self._upsert_ltm(user_id, chunks, lang)
return chunks
def get_relevant_chunks(self, user_id: str, query: str, top_k: int = 3, min_sim: float = 0.30) -> List[str]:
"""Return texts of chunks whose cosine similarity ≥ min_sim."""
if self.chunk_index[user_id].ntotal == 0:
return []
        # Encode the query and search this user's FAISS index
qvec = self._embed(query)
sims, idxs = self.chunk_index[user_id].search(np.array([qvec]), k=top_k)
results = []
        # Score each hit with a recency decay and usage boost so fresher, frequently-used chunks rank first
        for sim, idx in zip(sims[0], idxs[0]):
            # FAISS pads missing results with -1, so guard the index range explicitly
            if 0 <= idx < len(self.chunk_meta[user_id]) and sim >= min_sim:
chunk = self.chunk_meta[user_id][idx]
chunk["used"] += 1 # increment usage
                # Hyperbolic recency decay: the weight halves after ~5 minutes
                age_sec = time.time() - chunk["timestamp"]
                decay = 1.0 / (1.0 + age_sec / 300)
score = sim * decay * (1 + 0.1 * chunk["used"])
# Append chunk with score
results.append((score, chunk))
        # Sort results by score, best first
results.sort(key=lambda x: x[0], reverse=True)
# logger.info(f"[Memory] RAG Retrieved Topic: {results}") # Inspect vector data
return [f"### Topic: {c['tag']}\n{c['text']}" for _, c in results]
def get_recent_chat_history(self, user_id: str, num_turns: int = 5) -> List[Dict]:
"""
Get the most recent short-term memory summaries.
Returns: a list of entries containing only the summarized bot context.
"""
if user_id not in self.stm_summaries:
return []
recent = list(self.stm_summaries[user_id])[-num_turns:]
formatted = []
for entry in recent:
formatted.append({
"user": "",
"bot": f"Topic: {entry['topic']}\n{entry['text']}",
"timestamp": entry.get("timestamp", time.time())
})
return formatted
def get_context(self, user_id: str, num_turns: int = 5) -> str:
# Prefer STM summaries
history = self.get_recent_chat_history(user_id, num_turns=num_turns)
return "\n".join(h["bot"] for h in history)
def get_contextual_chunks(self, user_id: str, current_query: str, lang: str = "EN") -> str:
"""
        Use Gemini Flash Lite to summarize the relevant context from both recent chat history and RAG chunks.
        This preserves conversational continuity while giving the main LLM a concise context block.
"""
# Get both types of context
recent_history = self.get_recent_chat_history(user_id, num_turns=5)
rag_chunks = self.get_relevant_chunks(user_id, current_query, top_k=3)
logger.info(f"[Contextual] Retrieved {len(recent_history)} recent history items")
logger.info(f"[Contextual] Retrieved {len(rag_chunks)} RAG chunks")
# Return empty string if no context is found
if not recent_history and not rag_chunks:
logger.info(f"[Contextual] No context found, returning empty string")
return ""
# Prepare context for Gemini to summarize
context_parts = []
# Add recent chat history
if recent_history:
history_text = "\n".join([
f"User: {item['user']}\nBot: {item['bot']}"
for item in recent_history
])
context_parts.append(f"Recent conversation history:\n{history_text}")
# Add RAG chunks
if rag_chunks:
rag_text = "\n".join(rag_chunks)
context_parts.append(f"Semantically relevant historical medical information:\n{rag_text}")
# Build summarization prompt
summarization_prompt = f"""
You are a medical assistant creating a concise summary of conversation context for continuity.
Current user query: "{current_query}"
Available context information:
{chr(10).join(context_parts)}
Task: Create a brief, coherent summary that captures the key points from the conversation history and relevant medical information that are important for understanding the current query.
Guidelines:
1. Focus on medical symptoms, diagnoses, treatments, or recommendations mentioned
2. Include any patient concerns or questions that are still relevant
3. Highlight any follow-up needs or pending clarifications
4. Keep the summary concise but comprehensive enough for context
5. Maintain conversational flow and continuity
Output: Provide a single, well-structured summary paragraph that can be used as context for the main LLM to provide a coherent response.
If no relevant context exists, return "No relevant context found."
Language context: {lang}
"""
logger.debug(f"[Contextual] Full prompt: {summarization_prompt}")
        # Ask the small Gemini model for the summary; fall back to a heuristic summary if the call fails
try:
# Use Gemini Flash Lite for summarization
client = genai.Client(api_key=os.getenv("FlashAPI"))
result = client.models.generate_content(
model=_LLM_SMALL,
contents=summarization_prompt
)
summary = result.text.strip()
if "No relevant context found" in summary:
logger.info(f"[Contextual] Gemini indicated no relevant context found")
return ""
logger.info(f"[Contextual] Gemini created summary: {summary[:100]}...")
return summary
except Exception as e:
logger.warning(f"[Contextual] Gemini summarization failed: {e}")
logger.info(f"[Contextual] Using fallback summarization method")
# Fallback: create a simple summary
fallback_summary = []
# Fallback: add recent history
if recent_history:
                # STM entries carry only the summarized bot text (their 'user' field is empty), so summarize from that
                recent_summary = f"Recent conversation: {recent_history[-1]['bot'][:100]}..."
fallback_summary.append(recent_summary)
logger.info(f"[Contextual] Fallback: Added recent history summary")
# Fallback: add RAG chunks
if rag_chunks:
rag_summary = f"Relevant medical information: {len(rag_chunks)} chunks found covering various medical topics."
fallback_summary.append(rag_summary)
logger.info(f"[Contextual] Fallback: Added RAG chunks summary")
final_fallback = " ".join(fallback_summary) if fallback_summary else ""
return final_fallback
def reset(self, user_id: str):
self._drop_user(user_id)
# ---------- Internal helpers ----------
def _touch_user(self, user_id: str):
if user_id not in self.text_cache and len(self.user_queue) >= self.user_queue.maxlen:
self._drop_user(self.user_queue.popleft())
if user_id in self.user_queue:
self.user_queue.remove(user_id)
self.user_queue.append(user_id)
    def _drop_user(self, user_id: str):
        self.text_cache.pop(user_id, None)
        self.stm_summaries.pop(user_id, None)
        self.chunk_index.pop(user_id, None)
        self.chunk_meta.pop(user_id, None)
if user_id in self.user_queue:
self.user_queue.remove(user_id)
def _rebuild_index(self, user_id: str, keep_last: int):
"""Trim chunk list + rebuild FAISS index for user."""
self.chunk_meta[user_id] = self.chunk_meta[user_id][-keep_last:]
index = self._new_index()
        # Re-use each chunk's stored embedding instead of re-encoding its text.
for chunk in self.chunk_meta[user_id]:
index.add(np.array([chunk["vec"]]))
self.chunk_index[user_id] = index
@staticmethod
def _new_index():
# Use cosine similarity (vectors must be L2-normalised)
return faiss.IndexFlatIP(384)
@staticmethod
def _embed(text: str):
vec = EMBED.encode(text, convert_to_numpy=True)
# L2 normalise for cosine on IndexFlatIP
return vec / (np.linalg.norm(vec) + 1e-9)
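    # Note: IndexFlatIP returns raw inner products, so the L2 normalisation in _embed is what turns
    # those scores into cosine similarities. The fixed dimension of 384 assumes the model cached at
    # /app/model_cache is a 384-dimensional encoder (e.g. a MiniLM-class sentence-transformer);
    # a different embedding size would require changing _new_index accordingly.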
def chunk_response(self, response: str, lang: str, question: str = "") -> List[Dict]:
"""
Calls Gemini to:
- Translate (if needed)
- Chunk by context/topic (exclude disclaimer section)
- Summarise
Returns: [{"tag": ..., "text": ...}, ...]
"""
if not response: return []
# Gemini instruction
instructions = []
# if lang.upper() != "EN":
# instructions.append("- Translate the response to English.")
instructions.append("- Break the translated (or original) text into semantically distinct parts, grouped by medical topic, symptom, assessment, plan, or instruction (exclude disclaimer section).")
instructions.append("- For each part, generate a clear, concise summary. The summary may vary in length depending on the complexity of the topic — do not omit key clinical instructions and exact medication names/doses if present.")
instructions.append("- At the start of each part, write `Topic: <concise but specific sentence (10-20 words) capturing patient context, condition, and action>`.")
instructions.append("- Separate each part using three dashes `---` on a new line.")
# if lang.upper() != "EN":
# instructions.append(f"Below is the user-provided medical response written in `{lang}`")
# Gemini prompt
prompt = f"""
You are a medical assistant helping organize and condense a clinical response.
If helpful, use the user's latest question for context to craft specific topics.
User's latest question (context): {question}
------------------------
{response}
------------------------
Please perform the following tasks:
{chr(10).join(instructions)}
Output only the structured summaries, separated by dashes.
"""
retries = 0
while retries < 5:
try:
client = genai.Client(api_key=os.getenv("FlashAPI"))
result = client.models.generate_content(
model=_LLM_SMALL,
contents=prompt
# ,generation_config={"temperature": 0.4} # Skip temp configs for gem-flash
)
output = result.text.strip()
logger.info(f"[Memory] 📦 Gemini summarized chunk output: {output}")
return [
{"tag": self._quick_extract_topic(chunk), "text": chunk.strip()}
for chunk in output.split('---') if chunk.strip()
]
except Exception as e:
logger.warning(f"[Memory] ❌ Gemini chunking failed: {e}")
retries += 1
time.sleep(0.5)
return [{"tag": "general", "text": response.strip()}] # fallback
@staticmethod
def _quick_extract_topic(chunk: str) -> str:
"""Heuristically extract the topic from a chunk (title line or first 3 words)."""
# Expecting 'Topic: <something>'
match = re.search(r'^Topic:\s*(.+)', chunk, re.IGNORECASE | re.MULTILINE)
if match:
return match.group(1).strip()
lines = chunk.strip().splitlines()
for line in lines:
if len(line.split()) <= 8 and line.strip().endswith(":"):
return line.strip().rstrip(":")
return " ".join(chunk.split()[:3]).rstrip(":.,")
# ---------- New merging/dedup logic ----------
def _upsert_stm(self, user_id: str, chunk: Dict, lang: str):
"""Insert or merge a summarized chunk into STM with semantic dedup/merge.
Identical: replace the older with new. Partially similar: merge extra details from older into newer.
"""
topic = self._enrich_topic(chunk.get("tag", ""), chunk.get("text", ""))
text = chunk.get("text", "").strip()
vec = self._embed(text)
now = time.time()
entry = {"topic": topic, "text": text, "vec": vec, "timestamp": now, "used": 0}
stm = self.stm_summaries[user_id]
if not stm:
stm.append(entry)
return
# find best match
best_idx = -1
best_sim = -1.0
for i, e in enumerate(stm):
sim = float(np.dot(vec, e["vec"]))
if sim > best_sim:
best_sim = sim
best_idx = i
        if best_sim >= 0.92:  # nearly identical
            # Replace the older entry: rotate it to the front of the deque, pop it,
            # rotate back to restore order, then append the fresh summary at the end
            stm.rotate(-best_idx)
            stm.popleft()
            stm.rotate(best_idx)
            stm.append(entry)
elif best_sim >= 0.75: # partially similar → merge
base = stm[best_idx]
merged_text = self._merge_texts(new_text=text, old_text=base["text"]) # add bits from old not in new
merged_topic = base["topic"] if len(base["topic"]) > len(topic) else topic
merged_vec = self._embed(merged_text)
merged_entry = {"topic": merged_topic, "text": merged_text, "vec": merged_vec, "timestamp": now, "used": base.get("used", 0)}
stm.rotate(-best_idx)
stm.popleft()
stm.rotate(best_idx)
stm.append(merged_entry)
else:
stm.append(entry)
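    # Illustrative behaviour of the upsert thresholds (cosine similarity on unit vectors):
    #   sim >= 0.92 against an existing summary -> the old entry is replaced outright
    #   0.75 <= sim < 0.92                      -> texts are merged (unique old sentences kept)
    #   sim < 0.75                              -> stored as a brand-new summary
    # The same thresholds drive the LTM store in _upsert_ltm below.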
def _upsert_ltm(self, user_id: str, chunks: List[Dict], lang: str):
"""Insert or merge chunks into LTM with semantic dedup/merge, then rebuild index.
Keeps only the most recent self.max_chunks entries.
"""
current_list = self.chunk_meta[user_id]
for chunk in chunks:
text = chunk.get("text", "").strip()
if not text:
continue
vec = self._embed(text)
topic = self._enrich_topic(chunk.get("tag", ""), text)
now = time.time()
new_entry = {"tag": topic, "text": text, "vec": vec, "timestamp": now, "used": 0}
if not current_list:
current_list.append(new_entry)
continue
# find best similar entry
best_idx = -1
best_sim = -1.0
for i, e in enumerate(current_list):
sim = float(np.dot(vec, e["vec"]))
if sim > best_sim:
best_sim = sim
best_idx = i
if best_sim >= 0.92:
# replace older with new
current_list[best_idx] = new_entry
elif best_sim >= 0.75:
# merge details
base = current_list[best_idx]
merged_text = self._merge_texts(new_text=text, old_text=base["text"]) # add unique sentences from old
merged_topic = base["tag"] if len(base["tag"]) > len(topic) else topic
merged_vec = self._embed(merged_text)
current_list[best_idx] = {"tag": merged_topic, "text": merged_text, "vec": merged_vec, "timestamp": now, "used": base.get("used", 0)}
else:
current_list.append(new_entry)
# Trim and rebuild index
if len(current_list) > self.max_chunks:
current_list[:] = current_list[-self.max_chunks:]
self._rebuild_index(user_id, keep_last=self.max_chunks)
@staticmethod
def _split_sentences(text: str) -> List[str]:
# naive sentence splitter by ., !, ?
parts = re.split(r"(?<=[\.!?])\s+", text.strip())
return [p.strip() for p in parts if p.strip()]
def _merge_texts(self, new_text: str, old_text: str) -> str:
"""Append sentences from old_text that are not already contained in new_text (by fuzzy match)."""
new_sents = self._split_sentences(new_text)
old_sents = self._split_sentences(old_text)
new_set = set(s.lower() for s in new_sents)
merged = list(new_sents)
for s in old_sents:
s_norm = s.lower()
# consider present if significant overlap with any existing sentence
if s_norm in new_set:
continue
# simple containment check
if any(self._overlap_ratio(s_norm, t.lower()) > 0.8 for t in merged):
continue
merged.append(s)
return " ".join(merged)
@staticmethod
def _overlap_ratio(a: str, b: str) -> float:
"""Compute token overlap ratio between two sentences."""
ta = set(re.findall(r"\w+", a))
tb = set(re.findall(r"\w+", b))
if not ta or not tb:
return 0.0
inter = len(ta & tb)
union = len(ta | tb)
return inter / union
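    # Example: "take ibuprofen 400 mg with food" vs "take ibuprofen with food and water"
    # share 4 tokens out of 8 unique ones, giving a ratio of 0.5 (below the 0.8 merge cut-off in _merge_texts).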
@staticmethod
def _enrich_topic(topic: str, text: str) -> str:
"""Make topic more descriptive if it's too short by using the first sentence of the text.
Does not call LLM to keep latency low.
"""
topic = (topic or "").strip()
if len(topic.split()) < 5 or len(topic) < 20:
sents = re.split(r"(?<=[\.!?])\s+", text.strip())
if sents:
first = sents[0]
# cap to ~16 words
words = first.split()
if len(words) > 16:
first = " ".join(words[:16])
                # Strip whitespace and any trailing colon from the candidate topic
                return first.strip().rstrip(':')
return topic
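

# --- Minimal usage sketch (not part of the original module flow) ---
# Assumes the FlashAPI env var is set (the module-level client is created at import time) and that
# the sentence-transformers model is present at /app/model_cache. To stay offline, the sketch
# bypasses chunk_response() (which would call Gemini) and feeds pre-summarised chunks straight
# into long-term memory before exercising semantic retrieval.
if __name__ == "__main__":
    mm = MemoryManager()
    demo_chunks = [
        {"tag": "Tension headaches from prolonged screen use; advised breaks, hydration, OTC analgesics",
         "text": "Topic: Tension headaches linked to screen time.\nAdvised regular screen breaks, hydration, and OTC analgesics if needed."},
        {"tag": "Seasonal allergic rhinitis; suggested non-drowsy antihistamines and saline rinses",
         "text": "Topic: Seasonal allergy management.\nSuggested non-drowsy antihistamines and saline rinses; follow up if symptoms persist."},
    ]
    mm._upsert_ltm("demo-user", demo_chunks, lang="EN")
    for ctx in mm.get_relevant_chunks("demo-user", "What should I do about my headaches?"):
        print(ctx)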