Spaces:

MedSwin
/

MedicalDiagnosisSystem

Sleeping

dylanglenister committed 28 days ago

Commit

4ca8eaf

1 Parent(s): e5c9fd8

REFACTOR: RAG ready embedding.

Reworked the embedding file to match the embedding model used for the knowledge base so that RAG can be implemented correctly.

Files changed (4) hide show

scripts/download_model.py +1 -1
src/config/settings.py +2 -1
src/core/state.py +2 -1
src/utils/embeddings.py +115 -77

scripts/download_model.py CHANGED Viewed

@@ -6,7 +6,7 @@ import os
 from huggingface_hub import snapshot_download
 # Set up paths
-MODEL_REPO = "sentence-transformers/all-MiniLM-L6-v2"
 MODEL_CACHE_DIR = "/app/model_cache"
 HF_CACHE_DIR = os.getenv("HF_HOME", "/home/user/.cache/huggingface")

 from huggingface_hub import snapshot_download
 # Set up paths
+MODEL_REPO = "abhinand/MedEmbed-large-v0.1"
 MODEL_CACHE_DIR = "/app/model_cache"
 HF_CACHE_DIR = os.getenv("HF_HOME", "/home/user/.cache/huggingface")

src/config/settings.py CHANGED Viewed

@@ -8,7 +8,8 @@ class Settings:
 	DEFAULT_TOP_K: int = 5
 	SEMANTIC_CONTEXT_SIZE: int = 17
 	SIMILARITY_THRESHOLD: float = 0.15
 	# Safety Guard settings
 	SAFETY_GUARD_ENABLED: bool = os.getenv("SAFETY_GUARD_ENABLED", "true").lower() == "true"
 	SAFETY_GUARD_TIMEOUT: int = int(os.getenv("SAFETY_GUARD_TIMEOUT", "30"))

 	DEFAULT_TOP_K: int = 5
 	SEMANTIC_CONTEXT_SIZE: int = 17
 	SIMILARITY_THRESHOLD: float = 0.15
+	EMBEDDING_MODEL_NAME: str = "MedEmbed-large-v0.1"
 	# Safety Guard settings
 	SAFETY_GUARD_ENABLED: bool = os.getenv("SAFETY_GUARD_ENABLED", "true").lower() == "true"
 	SAFETY_GUARD_TIMEOUT: int = int(os.getenv("SAFETY_GUARD_TIMEOUT", "30"))

src/core/state.py CHANGED Viewed

@@ -1,5 +1,6 @@
 # src/core/state.py
 from src.core.memory_manager import MemoryManager
 from src.utils.embeddings import EmbeddingClient
 from src.utils.rotator import APIKeyRotator
@@ -34,7 +35,7 @@ class AppState:
 	def initialize(self):
 		"""Initializes all core application components in the correct order."""
 		# Initialize components with no dependencies first
-		self.embedding_client = EmbeddingClient(model_name="all-MiniLM-L6-v2", dimension=384)
 		self.gemini_rotator = APIKeyRotator("GEMINI_API_", max_slots=5)
 		self.nvidia_rotator = APIKeyRotator("NVIDIA_API_", max_slots=5)

 # src/core/state.py
+from src.config.settings import settings
 from src.core.memory_manager import MemoryManager
 from src.utils.embeddings import EmbeddingClient
 from src.utils.rotator import APIKeyRotator
 	def initialize(self):
 		"""Initializes all core application components in the correct order."""
 		# Initialize components with no dependencies first
+		self.embedding_client = EmbeddingClient(model_name=settings.EMBEDDING_MODEL_NAME)
 		self.gemini_rotator = APIKeyRotator("GEMINI_API_", max_slots=5)
 		self.nvidia_rotator = APIKeyRotator("NVIDIA_API_", max_slots=5)

src/utils/embeddings.py CHANGED Viewed

@@ -1,125 +1,163 @@
 # src/utils/embeddings.py
 import numpy as np
 from numpy.typing import NDArray
 from src.config.settings import settings
 from src.utils.logger import logger
 class EmbeddingClient:
-	"""A simple embedding client with a fallback mechanism."""
-	def __init__(self, model_name: str = "default", dimension: int = 384):
 		self.model_name = model_name
-		self.dimension = dimension
-		self.model = None
-		self._fallback_mode = True
-		self._init_embedding_model()
-	def _init_embedding_model(self):
-		"""Initializes the sentence-transformer embedding model."""
 		try:
-			from sentence_transformers import SentenceTransformer  # type: ignore
-			self.model = SentenceTransformer(self.model_name)
-			self._fallback_mode = False
-			logger().info(f"Successfully loaded embedding model: {self.model_name}")
-		except ImportError:
-			logger().warning("sentence-transformers not found, using fallback embedding mode.")
 		except Exception as e:
-			logger().error(f"Error loading embedding model '{self.model_name}': {e}")
-	def embed(self, texts: str | list[str]) -> list[list[float]]:
-		"""Generates embeddings for the given texts."""
 		if isinstance(texts, str):
 			texts = [texts]
-		return self._fallback_embed(texts) if self._fallback_mode else self._proper_embed(texts)
-	def _proper_embed(self, texts: list[str]) -> list[list[float]]:
-		"""Generates embeddings using the sentence-transformer model."""
-		try:
-			embeddings = self.model.encode(texts, convert_to_numpy=True) # type: ignore
-			return embeddings.tolist()
-		except Exception as e:
-			logger().error(f"Error during embedding generation: {e}")
-			return self._fallback_embed(texts)
-	def _fallback_embed(self, texts: list[str]) -> list[list[float]]:
-		"""Generates deterministic, hash-based embeddings as a fallback."""
-		embeddings = []
-		for text in texts:
-			# Create a deterministic hash-based embedding
-			text_hash = hash(text) % (2**32)
-			np.random.seed(text_hash)
-			vector = np.random.normal(0, 1, self.dimension)
-			norm = np.linalg.norm(vector)
-			if norm > 0:
-				vector /= norm
-			embeddings.append(vector.tolist())
-		return embeddings
 	def is_available(self) -> bool:
-		"""Checks if the proper embedding model is available."""
-		return not self._fallback_mode
 	def semantic_search(
 		self,
 		query: str,
 		candidates: list[str],
 		top_k: int = settings.SEMANTIC_CONTEXT_SIZE,
-		threshold: float = settings.SIMILARITY_THRESHOLD
 	) -> list[str]:
 		"""Finds semantically similar texts using embedding-based search."""
-		if not candidates:
 			return []
 		query_vector = np.array(self.embed(query)[0], dtype="float32")
-		candidate_vectors = self.embed([s.strip() for s in candidates])
 		similarities = [
-			(self._cosine_similarity(query_vector, np.array(vec, dtype="float32")), text)
-			for vec, text in zip(candidate_vectors, candidates)
 		]
 		similarities.sort(key=lambda x: x[0], reverse=True)
 		return [text for score, text in similarities[:top_k] if score > threshold]
-	def similarity(self, text1: str, text2: str) -> float:
-		"""Calculate cosine similarity between two texts."""
-		emb1 = self.embed([text1])[0]
-		emb2 = self.embed([text2])[0]
-		# Convert to numpy arrays
-		emb1_np = np.array(emb1)
-		emb2_np = np.array(emb2)
-		return self._cosine_similarity(emb1_np, emb2_np)
-	def batch_similarity(self, query: str, candidates: list[str]) -> list[float]:
-		"""Calculate similarity between a query and multiple candidate texts."""
-		query_emb = self.embed([query])[0]
-		candidate_embs = self.embed(candidates)
-		similarities = []
-		query_emb_np = np.array(query_emb)
-		for candidate_emb in candidate_embs:
-			candidate_emb_np = np.array(candidate_emb)
-			similarities.append(self._cosine_similarity(query_emb_np, candidate_emb_np))
-		return similarities
 	def get_model_info(self) -> dict:
-		"""Get information about the current embedding model"""
 		return {
 			"model_name": self.model_name,
 			"dimension": self.dimension,
-			"fallback_mode": self._fallback_mode,
-			"available": self.is_available()
 		}
 	@staticmethod
-	def _cosine_similarity(vec_a: NDArray[np.float32], vec_b: NDArray[np.float32]) -> float:
 		"""Calculates the cosine similarity between two vectors."""
 		norm_a = np.linalg.norm(vec_a)
 		norm_b = np.linalg.norm(vec_b)

 # src/utils/embeddings.py
 import numpy as np
+import torch
+import torch.nn.functional as F
 from numpy.typing import NDArray
+from transformers import (AutoModel, AutoTokenizer, PreTrainedModel,
+                          PreTrainedTokenizer)
 from src.config.settings import settings
 from src.utils.logger import logger
 class EmbeddingClient:
+	"""
+	An embedding client that generates vector embeddings for text using a
+	transformer model, mirroring the logic used for knowledge base creation.
+	"""
+	def __init__(self, model_name: str):
 		self.model_name = model_name
+		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+		self.tokenizer: PreTrainedTokenizer | None = None
+		self.model: PreTrainedModel | None = None
+		self.dimension: int | None = None
+		self._available = self._init_embedding_model()
+	def _init_embedding_model(self) -> bool:
+		"""Initializes the transformer model and tokenizer."""
 		try:
+			logger().info(f"Loading embedding model '{self.model_name}' on {self.device}")
+			self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+			self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
+			self.model.eval()
+			# Dynamically determine the embedding dimension
+			self.dimension = self._get_embedding_dimension()
+			logger().info(f"Successfully loaded model. Embedding dimension: {self.dimension}")
+			return True
 		except Exception as e:
+			logger().error(f"Failed to load embedding model '{self.model_name}': {e}")
+			return False
+	def _get_embedding_dimension(self) -> int:
+		"""Runs a test input to determine the model's output dimension."""
+		if not self.tokenizer or not self.model:
+			raise RuntimeError("Model and tokenizer must be initialized.")
+		test_input = self.tokenizer(
+			"test", return_tensors="pt", truncation=True, padding=True
+		).to(self.device)
+		with torch.no_grad():
+			test_output = self.model(**test_input)
+			test_embedding = self._mean_pooling(
+				test_output.last_hidden_state, test_input["attention_mask"]
+			)
+		return test_embedding.shape[1]
+	def _mean_pooling(
+		self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor
+	) -> torch.Tensor:
+		"""Performs mean pooling on token embeddings using an attention mask."""
+		input_mask_expanded = (
+			attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+		)
+		masked_embeddings = token_embeddings * input_mask_expanded
+		summed_embeddings = torch.sum(masked_embeddings, 1)
+		summed_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+		return summed_embeddings / summed_mask
+	def embed(self, texts: str | list[str], batch_size: int = 64) -> list[list[float]]:
+		"""
+		Generates normalized, mean-pooled embeddings for the given texts.
+		Returns an empty list if the model is not available or an error occurs.
+		"""
+		if not self.is_available() or not self.tokenizer or not self.model:
+			logger().error("Embedding model is not available, cannot generate embeddings.")
+			return [[] for _ in range(len(texts) if isinstance(texts, list) else 1)]
 		if isinstance(texts, str):
 			texts = [texts]
+		all_embeddings = []
+		for i in range(0, len(texts), batch_size):
+			batch_texts = texts[i : i + batch_size]
+			try:
+				inputs = self.tokenizer(
+					batch_texts,
+					truncation=True,
+					padding=True,
+					max_length=512,
+					return_tensors="pt",
+				).to(self.device)
+				with torch.no_grad():
+					outputs = self.model(**outputs)
+				attention_mask = inputs["attention_mask"]
+				chunk_embeddings = self._mean_pooling(
+					outputs.last_hidden_state, attention_mask
+				)
+				# L2 Normalization - CRITICAL STEP FOR COMPATIBILITY
+				normalized_embeddings = F.normalize(chunk_embeddings, p=2, dim=1)
+				all_embeddings.extend(normalized_embeddings.cpu().numpy().tolist())
+			except Exception as e:
+				logger().error(f"Error during embedding generation for a batch: {e}")
+				# Add empty embeddings for the failed batch
+				all_embeddings.extend([[] for _ in batch_texts])
+		return all_embeddings
 	def is_available(self) -> bool:
+		"""Checks if the embedding model was loaded successfully."""
+		return self._available
 	def semantic_search(
 		self,
 		query: str,
 		candidates: list[str],
 		top_k: int = settings.SEMANTIC_CONTEXT_SIZE,
+		threshold: float = settings.SIMILARITY_THRESHOLD,
 	) -> list[str]:
 		"""Finds semantically similar texts using embedding-based search."""
+		if not self.is_available() or not candidates:
 			return []
 		query_vector = np.array(self.embed(query)[0], dtype="float32")
+		if query_vector.size == 0:
+			return []
+		candidate_vectors = self.embed(candidates)
 		similarities = [
+			(
+				self._cosine_similarity(query_vector, np.array(vec, dtype="float32")),
+				text,
+			)
+			for vec, text in zip(candidate_vectors, candidates) if vec
 		]
 		similarities.sort(key=lambda x: x[0], reverse=True)
 		return [text for score, text in similarities[:top_k] if score > threshold]
 	def get_model_info(self) -> dict:
+		"""Get information about the current embedding model."""
 		return {
 			"model_name": self.model_name,
 			"dimension": self.dimension,
+			"device": str(self.device),
+			"available": self.is_available(),
 		}
 	@staticmethod
+	def _cosine_similarity(
+		vec_a: NDArray[np.float32], vec_b: NDArray[np.float32]
+	) -> float:
 		"""Calculates the cosine similarity between two vectors."""
 		norm_a = np.linalg.norm(vec_a)
 		norm_b = np.linalg.norm(vec_b)