from typing import Dict, List, Optional, Union

import numpy as np
import requests
from mteb import DRESModel
from tqdm import tqdm


class SionicEmbeddingModel(DRESModel):
    """Retrieval model wrapper that fetches embeddings from an HTTP endpoint.

    Texts are POSTed in batches to ``url`` as ``{'inputs': [...]}`` and the
    ``'embedding'`` field of the JSON response is turned into a float32 array.
    """

    def __init__(
        self,
        url: str,
        instruction: Optional[str] = None,
        batch_size: int = 128,
        dimension: int = 2048,
        **kwargs,
    ) -> None:
        """Store the endpoint configuration.

        Args:
            url: Address the embedding requests are POSTed to.
            instruction: Optional prefix prepended to every query in
                ``encode_queries``; ``None`` means no prefix.
            batch_size: Number of texts sent per request.
            dimension: Embedding width each response is reshaped to.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        # NOTE(review): DRESModel.__init__ is intentionally left uncalled here,
        # as in the original code — confirm the base class needs no setup.
        self.url = url
        self.instruction = instruction
        self.batch_size = batch_size
        self.dimension = dimension

    def get_embeddings(self, queries: List[str]) -> np.ndarray:
        """Embed one batch of texts with a single POST request.

        Args:
            queries: Texts to embed in this request.

        Returns:
            float32 array of shape ``(len(queries), self.dimension)``.

        Raises:
            requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        """
        response = requests.post(self.url, json={'inputs': queries})
        # Fail on HTTP errors explicitly instead of surfacing them later as an
        # opaque KeyError / JSON decode error on the error payload.
        response.raise_for_status()
        embeddings = np.asarray(response.json()['embedding'], dtype=np.float32)
        return embeddings.reshape(len(queries), self.dimension)

    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
        """Embed queries, prefixing each with the configured instruction.

        Args:
            queries: Query strings.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            float32 array of shape ``(len(queries), self.dimension)``.
        """
        # Without this guard a missing instruction would be formatted into the
        # text as the literal string "None".
        prefix = self.instruction or ''
        return self.encode([f'{prefix}{query}' for query in queries])

    def encode_corpus(self, corpus: List[Union[Dict[str, str], str]], **kwargs) -> np.ndarray:
        """Embed corpus documents.

        Args:
            corpus: Either plain strings, or dicts with a ``'text'`` entry and
                an optional ``'title'`` entry. The type of the first element
                decides how the whole list is interpreted.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            float32 array of shape ``(len(corpus), self.dimension)``.
        """
        if not corpus:
            # Nothing to inspect at index 0; let encode() build the empty result.
            return self.encode([])

        if isinstance(corpus[0], dict):
            # "<title> <text>", stripped so an absent title leaves no leading space.
            sentences: List[str] = [
                f"{doc.get('title', '')} {doc['text']}".strip() for doc in corpus
            ]
        else:
            sentences = corpus
        return self.encode(sentences)

    def encode(self, sentences: List[str], **kwargs) -> np.ndarray:
        """Embed texts in batches of ``self.batch_size``.

        Args:
            sentences: Texts to embed.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            float32 array of shape ``(len(sentences), self.dimension)``; an
            empty ``(0, self.dimension)`` array when ``sentences`` is empty.
        """
        if not sentences:
            # np.concatenate raises ValueError on an empty sequence of arrays.
            return np.empty((0, self.dimension), dtype=np.float32)

        batches = [
            self.get_embeddings(sentences[start:start + self.batch_size])
            for start in tqdm(range(0, len(sentences), self.batch_size), desc='encode')
        ]
        return np.concatenate(batches, axis=0)