from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from typing import List


class Embedder:
    def __init__(self, model_name: str = "BAAI/bge-m3"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()  # disable dropout for deterministic inference

    def embed(self, texts: List[str]) -> np.ndarray:
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Take the embedding of the [CLS] token as the sentence representation
        embeddings = outputs.last_hidden_state[:, 0]
        # L2-normalize so that dot product equals cosine similarity
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return embeddings.cpu().numpy()
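A minimal usage sketch follows, assuming the BAAI/bge-m3 weights can be downloaded from the Hugging Face Hub; the example documents, query, and similarity ranking are illustrative only:

# Usage sketch (hypothetical documents and query, for illustration only).
import numpy as np

embedder = Embedder()

documents = [
    "BGE-M3 supports dense, sparse, and multi-vector retrieval.",
    "The capital of France is Paris.",
]
query = "Which model handles multiple retrieval modes?"

doc_vecs = embedder.embed(documents)    # shape: (2, hidden_size)
query_vec = embedder.embed([query])[0]  # shape: (hidden_size,)

# Vectors are L2-normalized, so the dot product is the cosine similarity.
scores = doc_vecs @ query_vec
best = int(np.argmax(scores))
print(f"Best match: {documents[best]} (score={scores[best]:.3f})")

Because embed() normalizes every vector, ranking by dot product and ranking by cosine similarity give the same order, which keeps downstream retrieval code simple.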