search_engine / models /vectorizer.py
Vitomir Jovanović
Search Engine
01f5415
raw
history blame
998 Bytes
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import Sequence
import faiss
class Vectorizer:
    """Embed text prompts and store the embeddings in a FAISS inner-product index.

    Attributes:
        model: The embedding model; it must expose ``encode(prompts)``.
        index_size: Legacy constant (50000) kept so existing readers of the
            attribute keep working. It is NOT the vector dimensionality and
            is no longer passed to FAISS.
        index: A ``faiss.IndexFlatIP`` sized to the embedding dimension, or
            ``None`` until that dimension is known (see ``__init__``).
        cached_index_idx_to_retrieval_db_idx: List initialised empty; this
            class does not modify it.
    """

    def __init__(self, model) -> None:
        """Initialize the vectorizer with a pre-trained embedding model.

        Args:
            model: The pre-trained embedding model to use for transforming
                prompts.
        """
        self.model = model
        self.index_size = 50000

        # IndexFlatIP takes the vector dimensionality, not a capacity. Ask
        # the model for it when it can report one (SentenceTransformer
        # does); otherwise defer index creation to the first batch.
        dimension_getter = getattr(
            model, "get_sentence_embedding_dimension", None
        )
        dimension = dimension_getter() if callable(dimension_getter) else None
        self.index = faiss.IndexFlatIP(dimension) if dimension else None

        self.cached_index_idx_to_retrieval_db_idx = []

    def transform_and_add_to_index(self, prompts: Sequence[str]) -> np.ndarray:
        """Embed the prompts, add the vectors to the index and return them.

        Args:
            prompts: The sequence of raw corpus prompts. A single string is
                treated as a one-element sequence.

        Returns:
            A float32 array of shape ``(len(prompts), embedding_dimension)``
            holding the vectors that were added to the index. For empty
            input nothing is added and an array with zero rows is returned.

        Raises:
            ValueError: If the embeddings' dimension differs from the
                dimension of the already-built index.
        """
        if isinstance(prompts, str):
            # encode() returns a 1-D vector for a bare string; keep it 2-D.
            prompts = [prompts]

        if len(prompts) == 0:
            known_dimension = self.index.d if self.index is not None else 0
            return np.empty((0, known_dimension), dtype=np.float32)

        # FAISS expects C-contiguous float32 input.
        embeddings = np.ascontiguousarray(
            self.model.encode(prompts), dtype=np.float32
        )
        embedding_dimension = embeddings.shape[1]
        print('Embedding dimension:', embedding_dimension)

        if self.index is None:
            # Dimension was unknown at construction time; build the index now.
            self.index = faiss.IndexFlatIP(embedding_dimension)
        elif self.index.d != embedding_dimension:
            raise ValueError(
                f"Embedding dimension {embedding_dimension} does not match "
                f"index dimension {self.index.d}"
            )

        self.index.add(embeddings)
        return embeddings