---
license: bigscience-bloom-rail-1.0
datasets:
- squad
language:
- fr
- en
pipeline_tag: sentence-similarity
---

```python
import numpy as np
from transformers import pipeline
from scipy.spatial.distance import cdist

# Feature-extraction pipeline wrapping the retriever model.
retriever = pipeline('feature-extraction', 'cmarkea/bloomz-3b-retriever')


def infer(x):
    """Return one embedding vector per text in ``x``.

    For each pipeline output, the first (batch) entry is selected and the
    hidden state of its last token is kept as the embedding.
    """
    return [ii[0][-1] for ii in retriever(x)]


# Placeholder: replace ``[...]`` with your own list of context strings.
list_of_contexts = [...]
# Stack (rather than concatenate) the per-text vectors: concatenating 1-D
# vectors along axis 0 would flatten them into a single 1-D array, whereas
# ``cdist`` needs a 2-D array with one row per text.
emb_contexts = np.stack(infer(list_of_contexts), axis=0)

# Placeholder: replace ``[...]`` with your own list of query strings.
list_of_queries = [...]
emb_queries = np.stack(infer(list_of_queries), axis=0)

# Pairwise Euclidean (L2) distances: one row per query, one column per context.
dist = cdist(emb_queries, emb_contexts, 'euclidean')


def top_k(x):
    """Return, for each query, its ``x`` nearest contexts, closest first."""
    nearest = dist.argsort(axis=-1)[:, :x]
    return [[list_of_contexts[qq] for qq in ii] for ii in nearest]


# top 5 nearest contexts for each query
top_contexts = top_k(5)
```