metadata
license: bigscience-bloom-rail-1.0
datasets:
- squad
language:
- fr
- en
pipeline_tag: sentence-similarity
import numpy as np
from transformers import pipeline
from scipy.spatial.distance import cdist
# Feature-extraction pipeline built from the retriever checkpoint.
retriever = pipeline('feature-extraction', 'cmarkea/bloomz-560m-retriever')


def infer(x):
    """Embed every text in ``x`` and return one vector per text.

    For each pipeline output, only element ``[0][-1]`` is kept
    (presumably [batch][token][hidden], i.e. the last token's hidden
    state -- confirm against the pipeline's output layout).
    """
    return [ii[0][-1] for ii in retriever(x)]


# Placeholder: replace `[...]` with the actual context passages (strings).
list_of_contexts = [...]
# Stack the per-text vectors as rows -> one row per context. np.vstack is used
# instead of np.concatenate(axis=0): when each vector is 1-D, concatenate would
# join them into a single flat vector, and cdist only accepts 2-D inputs.
emb_contexts = np.vstack(infer(list_of_contexts))
# Placeholder: replace `[...]` with the actual queries (strings).
list_of_queries = [...]
emb_queries = np.vstack(infer(list_of_queries))
# dist[i, j] is the Euclidean distance between query i and context j.
dist = cdist(emb_queries, emb_contexts, 'euclidean')


def top_k(x):
    """Return, for each query, its ``x`` nearest contexts, closest first."""
    # argsort is ascending, so the first x column indices are the smallest
    # distances for that query row.
    nearest = dist.argsort(axis=-1)[:, :x]
    return [[list_of_contexts[qq] for qq in ii] for ii in nearest]


# Top 5 nearest contexts for each query.
top_contexts = top_k(5)