from transformers import AutoTokenizer
import torch
from typing import List

from model import PersonEmbeddings

class CustomEmbeddingPipeline:
    def __init__(self, model_id="answerdotai/ModernBERT-base"):
        # Load the tokenizer that matches the base model
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Build the PersonEmbeddings model on top of the same base checkpoint
        self.model = PersonEmbeddings(model_id)
        # Load the fine-tuned weights; map to CPU so loading also works without a GPU
        ckpt_path = "pytorch_model.bin"
        state_dict = torch.load(ckpt_path, map_location="cpu")
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def __call__(self, text: str) -> List[float]:
        # Tokenize the input text as a batch of one
        inputs = self.tokenizer(
            [text], padding=True, truncation=True, return_tensors="pt"
        )
        # Forward pass without gradients
        with torch.no_grad():
            emb = self.model(inputs["input_ids"], inputs["attention_mask"])
        # emb has shape (1, 1536); drop the batch dimension and return a Python list
        return emb[0].tolist()

def pipeline(*args, **kwargs):
    # pipeline(...)-style factory; any arguments are ignored.
    return CustomEmbeddingPipeline()
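
# Minimal usage sketch: assumes model.py and pytorch_model.bin sit next to this
# script; the input string below is purely illustrative.
if __name__ == "__main__":
    pipe = pipeline()
    vector = pipe("Jane Doe, senior software engineer based in Berlin")
    print(len(vector))  # 1536, matching the embedding size noted above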