from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from typing import List


class Embedder:
    def __init__(self, model_name: str = "BAAI/bge-m3"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()  # inference only; disable dropout

    def embed(self, texts: List[str]) -> np.ndarray:
        # Tokenize the batch, padding/truncating to a common length
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Take the embedding of the CLS token (first position)
        embeddings = outputs.last_hidden_state[:, 0]
        # L2-normalize so cosine similarity reduces to a plain dot product
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return embeddings.cpu().numpy()
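

# Minimal usage sketch for the Embedder class above. Assumptions (not from the
# original snippet): the model weights are downloaded on first run, and the two
# example sentences are illustrative only.
if __name__ == "__main__":
    embedder = Embedder()
    vectors = embedder.embed(["hello world", "hello there"])
    print(vectors.shape)  # (2, hidden_size), e.g. (2, 1024) for BAAI/bge-m3
    # Since the rows are L2-normalized, their dot product is the cosine similarity.
    print(float(np.dot(vectors[0], vectors[1])))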