# Spaces: Sleeping / Sleeping — Hugging Face Spaces status header captured
# along with the source during extraction; not part of the program.
from tqdm import tqdm | |
import joblib | |
import numpy as np | |
from sentence_transformers import SentenceTransformer, util | |
# Load the sentence-embedding model once at import time; all functions in
# this module share this single instance.
model = SentenceTransformer('all-MiniLM-L6-v2')
def get_documents_from_scores(scores):
    """Extract the document indices from scored results.

    Args:
        scores: iterable of (document_index, similarity_score) pairs.

    Returns:
        List of document indices, preserving the order of *scores*.
    """
    # Comprehension replaces the original append loop (same output).
    return [score[0] for score in scores]
def cosine_similarity(v1, v2):
    """Compute the cosine similarity between two numeric vectors.

    Args:
        v1: array-like vector.
        v2: array-like vector of the same length.

    Returns:
        float in [-1.0, 1.0]; 0.0 when either vector has zero norm
        (cosine similarity is undefined there, so we fall back to 0).
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    # Norms are non-negative, so the product is zero iff either norm is zero;
    # computing it once avoids the duplicated norm calls of the original.
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0
    # float() keeps the return type consistent with the zero-norm branch.
    return float(np.dot(v1, v2) / norm_product)
def get_open_source_embeddings(documents):
    """Encode every document with the module-level sentence model.

    Args:
        documents: iterable of document strings to embed.

    Returns:
        List of embedding vectors, one per input document; progress is
        displayed with tqdm while encoding.
    """
    return [model.encode(doc) for doc in tqdm(documents)]
def open_source_rankings(query, document_embeddings, k):
    """Rank documents by cosine similarity to *query* and keep the top *k*.

    Args:
        query: query string, embedded with the module-level model.
        document_embeddings: sequence of precomputed document embeddings.
        k: number of top-scoring documents to retain.

    Returns:
        Tuple (rankings, scores): rankings is the list of top-k document
        indices, best match first; scores is the matching list of
        (index, similarity) pairs.
    """
    query_embedding = model.encode(query)
    # Score every document against the query in one pass.
    scored = [
        (position, cosine_similarity(query_embedding, embedding))
        for position, embedding in enumerate(document_embeddings)
    ]
    # Highest similarity first, then truncate to the requested cutoff.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_scores = scored[:k]
    return get_documents_from_scores(top_scores), top_scores
def open_source_pipeline(query, documents_embeddings_path="Retrieval/savedModels/open_source_embeddings.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100):
    """End-to-end retrieval: load cached embeddings, rank them against the
    query, and map the top-k positional indices back to document ids.

    Args:
        query: free-text search query.
        documents_embeddings_path: joblib pickle of document embeddings.
        ids_path: joblib pickle mapping positional index -> document id.
        k: maximum number of results to return.

    Returns:
        List of up to *k* document ids, best match first.
    """
    document_embeddings = joblib.load(documents_embeddings_path)
    ids = joblib.load(ids_path)
    # Scores are discarded here; only the ranked indices are surfaced.
    rankings, _scores = open_source_rankings(query, document_embeddings, k)
    # Translate positional indices into the caller-facing document ids
    # (comprehension replaces the original append loop; tqdm kept for parity).
    return [ids[ranking] for ranking in tqdm(rankings)]