| import os |
| from sentence_transformers import SentenceTransformer |
| import numpy as np |
| from typing import List, Dict |
| import logging |
|
|
# Module-level logger namespaced to this module (standard logging practice).
logger = logging.getLogger(__name__)
|
|
class EmbeddingService:
    """Generate vector embeddings locally with a SentenceTransformer model.

    Loads ``all-MiniLM-L6-v2`` on CPU at construction time and exposes async
    helpers for embedding single texts, batches of code chunks, and search
    queries.

    NOTE(review): the methods are declared ``async`` for interface
    compatibility with the caller, but ``SentenceTransformer.encode`` is
    synchronous and CPU-bound, so these calls block the event loop while
    encoding — confirm whether off-loading to a thread executor is wanted.
    """

    def __init__(self) -> None:
        """Load the embedding model, caching weights under a fixed directory.

        Raises:
            RuntimeError: If the model cannot be downloaded or loaded. The
                original exception is chained as ``__cause__``.
        """
        try:
            cache_dir = "/app/models_cache"
            os.makedirs(cache_dir, exist_ok=True)

            # Point every HuggingFace cache variable at the same directory so
            # repeated container runs reuse downloaded weights.
            # NOTE(review): some HF libraries read these at import time, so
            # setting them here may be too late; the explicit ``cache_folder``
            # argument below is what reliably takes effect.
            os.environ['HUGGINGFACE_HUB_CACHE'] = cache_dir
            os.environ['TRANSFORMERS_CACHE'] = cache_dir
            os.environ['HF_HOME'] = cache_dir

            print(f"[CACHE] Using cache directory: {cache_dir}", flush=True)
            logger.info("Using cache directory: %s", cache_dir)

            self.model = SentenceTransformer(
                'all-MiniLM-L6-v2',
                cache_folder=cache_dir,
                device='cpu'
            )

            print("[MODEL] SentenceTransformer loaded successfully!", flush=True)
            logger.info("Local embedding service initialized (all-MiniLM-L6-v2)")

        except Exception as e:
            print(f"[ERROR] Failed to load model: {e}", flush=True)
            # logger.exception records the full traceback, not just str(e).
            logger.exception("Failed to load SentenceTransformer model")
            # Chain the cause so callers see the underlying failure; a
            # RuntimeError subclass keeps `except Exception` callers working.
            raise RuntimeError(f"Failed to initialize local embedding model: {e}") from e

    async def generate_embedding(self, text: str, title: str = "") -> List[float]:
        """Embed a single text, optionally prefixed with a file title.

        Args:
            text: The code or text to embed.
            title: Optional file name; when non-empty it is folded into the
                model input as ``File: <title>`` context.

        Returns:
            The embedding as a plain list of floats.

        Raises:
            Exception: Re-raises any failure from the underlying model.
        """
        try:
            # Keep this prompt format in sync with generate_embeddings_batch;
            # it shapes what the model actually embeds.
            content = f"File: {title}\n\nCode:\n{text}" if title else text
            embedding = self.model.encode(content)
            return embedding.tolist()
        except Exception:
            logger.exception("Error generating local embedding")
            raise

    async def generate_embeddings_batch(self, chunks: List[Dict]) -> List[Dict]:
        """Embed a batch of code chunks in one model pass.

        Args:
            chunks: Dicts that must contain ``file_path``, ``start_line``,
                ``end_line``, ``chunk_type`` and ``content``.

        Returns:
            A new list of dicts: each input chunk plus an ``embedding``
            (list of floats) and ``content_length`` (len of ``content``).

        Raises:
            Exception: Re-raises any failure from the underlying model.
        """
        print(f"[EMBED] Generating LOCAL embeddings for {len(chunks)} chunks...", flush=True)
        logger.info("Generating LOCAL embeddings for %d chunks...", len(chunks))

        # Build one prompt per chunk; location metadata gives the model
        # context beyond the raw code text.
        texts = []
        for chunk in chunks:
            content = f"""File: {chunk['file_path']}
Lines: {chunk['start_line']}-{chunk['end_line']}
Type: {chunk['chunk_type']}

Code:
{chunk['content']}"""
            texts.append(content)

        try:
            print(f"[EMBED] Processing {len(texts)} texts with SentenceTransformer...", flush=True)
            # Single batched encode is far cheaper than per-chunk calls.
            embeddings = self.model.encode(texts, show_progress_bar=True, batch_size=32)

            embedded_chunks = []
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                embedded_chunk = {
                    **chunk,
                    'embedding': embedding.tolist(),
                    'content_length': len(chunk['content'])
                }
                embedded_chunks.append(embedded_chunk)

                # Coarse progress marker for container logs.
                if (i + 1) % 10 == 0:
                    print(f"[EMBED] Processed {i + 1}/{len(chunks)} embeddings", flush=True)

        except Exception as e:
            print(f"[EMBED] Failed to generate batch embeddings: {e}", flush=True)
            logger.exception("Failed to generate batch embeddings")
            raise

        print(f"[EMBED] Generated {len(embedded_chunks)} LOCAL embeddings successfully!", flush=True)
        logger.info("Generated %d LOCAL embeddings successfully", len(embedded_chunks))
        return embedded_chunks

    async def generate_query_embedding(self, query: str) -> List[float]:
        """Embed a search query string.

        Args:
            query: The raw query text (embedded as-is, no prompt wrapping).

        Returns:
            The embedding as a plain list of floats.

        Raises:
            Exception: Re-raises any failure from the underlying model.
        """
        try:
            embedding = self.model.encode(query)
            return embedding.tolist()
        except Exception:
            logger.exception("Error generating query embedding")
            raise