| | """ |
| | models/anomaly-detection/src/utils/vectorizer.py |
| | Text vectorization using language-specific BERT models (downloaded locally) |
| | """ |
| | import os |
| | import logging |
| | from typing import Dict, List, Optional, Tuple |
| | from pathlib import Path |
| | import numpy as np |
| |
|
| | logger = logging.getLogger("vectorizer") |
| |
|
| | |
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    logger.warning("Transformers not available. Install with: pip install transformers torch")

try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False


class MultilingualVectorizer:
    """
    Vectorizer using language-specific BERT models.
    Downloads and caches models locally from HuggingFace.

    Models:
        - English: distilbert-base-uncased (fast, accurate)
        - Sinhala: keshan/SinhalaBERTo (specialized)
        - Tamil: l3cube-pune/tamil-bert (specialized)
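
    Example (illustrative; assumes transformers and torch are installed):
        >>> vectorizer = MultilingualVectorizer()
        >>> vectorizer.vectorize("Suspicious login attempt", language="english").shape
        (768,)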
| | """ |
| |
|
| | MODEL_MAP = { |
| | "english": "distilbert-base-uncased", |
| | "sinhala": "keshan/SinhalaBERTo", |
| | "tamil": "l3cube-pune/tamil-bert" |
| | } |
| |
|
    def __init__(self, models_cache_dir: Optional[str] = None, device: Optional[str] = None):
        """
        Initialize the multilingual vectorizer.

        Args:
            models_cache_dir: Directory to cache downloaded models
            device: 'cuda' or 'cpu' (auto-detected if None)
        """
        self.models_cache_dir = models_cache_dir or str(
            Path(__file__).parent.parent.parent / "models_cache"
        )
        Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)

        # Point the HuggingFace cache at the local models directory
        os.environ["TRANSFORMERS_CACHE"] = self.models_cache_dir
        os.environ["HF_HOME"] = self.models_cache_dir

        # Auto-detect the device unless one was given explicitly
        if device is None:
            if TRANSFORMERS_AVAILABLE and torch.cuda.is_available():
                self.device = "cuda"
            else:
                self.device = "cpu"
        else:
            self.device = device

        logger.info(f"[Vectorizer] Using device: {self.device}")

        # Lazily loaded (tokenizer, model) pairs keyed by language
        self.models: Dict[str, Tuple] = {}
        self.fallback_model = None

    def _load_model(self, language: str) -> Tuple:
        """
        Load language-specific model from cache or download.

        Returns:
            Tuple of (tokenizer, model)
        """
        if language in self.models:
            return self.models[language]

        model_name = self.MODEL_MAP.get(language, self.MODEL_MAP["english"])

        if not TRANSFORMERS_AVAILABLE:
            raise RuntimeError("Transformers library not available")

        logger.info(f"[Vectorizer] Loading model: {model_name}")

        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                cache_dir=self.models_cache_dir
            )
            model = AutoModel.from_pretrained(
                model_name,
                cache_dir=self.models_cache_dir
            ).to(self.device)
            model.eval()

            self.models[language] = (tokenizer, model)
            logger.info(f"[Vectorizer] ✓ Loaded {model_name} ({language})")
            return tokenizer, model

        except Exception as e:
            logger.error(f"[Vectorizer] Failed to load {model_name}: {e}")
            # Fall back to the English model for non-English languages
            if language != "english":
                logger.info("[Vectorizer] Falling back to English model")
                return self._load_model("english")
            raise

    def _get_embedding(self, text: str, tokenizer, model) -> np.ndarray:
        """
        Get embedding vector using mean pooling.

        Args:
            text: Input text
            tokenizer: HuggingFace tokenizer
            model: HuggingFace model

        Returns:
            768-dim numpy array
        """
        if not TRANSFORMERS_AVAILABLE:
            raise RuntimeError("Transformers not available")

        # Tokenize, truncating to the model's 512-token limit
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        ).to(self.device)

        # Forward pass without gradient tracking
        with torch.no_grad():
            outputs = model(**inputs)

        attention_mask = inputs["attention_mask"]
        hidden_states = outputs.last_hidden_state

        # Mean pooling over token embeddings, ignoring padded positions
        mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        mean_embedding = sum_embeddings / sum_mask

        return mean_embedding.cpu().numpy().flatten()

    def vectorize(self, text: str, language: str = "english") -> np.ndarray:
        """
        Convert text to vector embedding.

        Args:
            text: Input text
            language: 'english', 'sinhala', 'tamil', or 'unknown'

        Returns:
            768-dim numpy array
        """
        if not text or not text.strip():
            return np.zeros(768)

        # Treat unknown language as English
        if language == "unknown":
            language = "english"

        try:
            tokenizer, model = self._load_model(language)
            return self._get_embedding(text, tokenizer, model)
        except Exception as e:
            logger.error(f"[Vectorizer] Error vectorizing: {e}")
            # Return a zero vector so callers always get a fixed-size embedding
            return np.zeros(768)

    def vectorize_batch(
        self,
        texts: List[str],
        languages: Optional[List[str]] = None
    ) -> np.ndarray:
        """
        Batch vectorization for multiple texts.

        Args:
            texts: List of text strings
            languages: Optional list of language codes (same length as texts)

        Returns:
            numpy array of shape (n_texts, 768)
        """
        if languages is None:
            languages = ["english"] * len(texts)

        embeddings = []
        for text, lang in zip(texts, languages):
            emb = self.vectorize(text, lang)
            embeddings.append(emb)

        return np.array(embeddings)

    def download_all_models(self):
        """Pre-download all language models"""
        for language in self.MODEL_MAP.keys():
            try:
                logger.info(f"[Vectorizer] Pre-downloading {language} model...")
                self._load_model(language)
            except Exception as e:
                logger.warning(f"[Vectorizer] Failed to download {language}: {e}")


# Module-level singleton instance
_vectorizer: Optional[MultilingualVectorizer] = None


def get_vectorizer(models_cache_dir: Optional[str] = None) -> MultilingualVectorizer:
    """Get or create singleton vectorizer instance"""
    global _vectorizer
    if _vectorizer is None:
        _vectorizer = MultilingualVectorizer(models_cache_dir)
    return _vectorizer


def vectorize_text(text: str, language: str = "english") -> np.ndarray:
    """
    Convenience function for text vectorization.

    Args:
        text: Input text
        language: Language code

    Returns:
        768-dim numpy array
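
    Example (illustrative; the log line is a hypothetical input):
        >>> vectorize_text("Failed login from 10.0.0.5", "english").shape
        (768,)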
| | """ |
| | return get_vectorizer().vectorize(text, language) |
| |
|
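

# Minimal usage sketch (illustrative): exercises the public helpers defined above.
# Assumes transformers/torch are installed and the models can be downloaded or are
# already present in the local models_cache directory; the sample log lines are
# hypothetical.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    vectorizer = get_vectorizer()
    # vectorizer.download_all_models()  # optional: pre-fetch every language model

    sample_logs = [
        "Failed password for root from 10.0.0.5 port 22",
        "User admin logged in successfully",
    ]
    embeddings = vectorizer.vectorize_batch(sample_logs, languages=["english", "english"])
    print(f"Embeddings shape: {embeddings.shape}")  # expected: (2, 768)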