|
import torch |
|
import json |
|
import numpy as np |
|
|
|
|
|
def create_dense_embeddings(query, model, instruction=None):
    """Encode *query* into a dense embedding.

    Args:
        query: Text to embed.
        model: When ``instruction`` is None, an encoder exposing
            ``encode(list[str])`` (e.g. a SentenceTransformer). Otherwise,
            a client exposing ``predict(instruction, query, api_name=...)``
            that returns a path to a JSON file whose ``"data"`` key holds
            the embedding.
        instruction: Optional instruction forwarded to ``model.predict``.

    Returns:
        Nested list of floats (a single embedding wrapped in a list).
    """
    if instruction is None:  # identity check, not ``== None``
        dense_emb = model.encode([query]).tolist()
    else:
        # The prediction endpoint writes its result to a JSON file and
        # returns the file path — presumably a Gradio-style client; the
        # embedding lives under the "data" key. TODO confirm with caller.
        json_output_embedding = model.predict(
            instruction,
            query,
            api_name="/predict",
        )
        # Context manager guarantees the handle is closed (the original
        # opened the file and never closed it).
        with open(json_output_embedding, "r") as json_file:
            json_dict = json.load(json_file)
        dense_array = np.array(json_dict["data"], dtype=np.float64)
        dense_emb = dense_array.tolist()
    return dense_emb
|
|
|
|
|
def create_sparse_embeddings(query, model, tokenizer):
    """Build a SPLADE-style sparse representation of *query*.

    Runs the model, applies ``log(1 + relu(logit))`` to the first
    sequence's logits, max-pools over the sequence dimension, and keeps
    only strictly positive vocabulary weights, ordered largest-first.

    Args:
        query: Text to encode.
        model: Masked-LM-style model returning an object with ``.logits``.
        tokenizer: HF-style tokenizer supporting ``return_tensors="pt"``.

    Returns:
        dict with ``"indices"`` (vocab ids) and ``"values"`` (weights),
        both as plain Python lists, sorted by descending weight.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    encoded = tokenizer(query, return_tensors="pt").to(device)

    with torch.no_grad():
        logits = model(**encoded).logits

    # Saturating activation, then max-pool across the sequence positions.
    activations = torch.log1p(torch.relu(logits[0]))
    pooled = activations.max(dim=0)

    # Drop everything that pooled to zero.
    keep = torch.nonzero(pooled.values > 0, as_tuple=True)[0]
    weights = pooled.values[keep]

    # Largest weights first; permute the token ids the same way.
    sorted_weights, perm = torch.sort(weights, descending=True)
    sorted_tokens = keep[perm]

    return {
        "indices": sorted_tokens.cpu().numpy().tolist(),
        "values": sorted_weights.cpu().numpy().tolist(),
    }
|
|
|
|
|
def hybrid_score_norm(dense, sparse, alpha: float):
    """Scale dense and sparse vectors for a convex hybrid combination.

    The intended combined score is ``alpha * dense + (1 - alpha) * sparse``;
    this function returns the two pre-scaled components.

    Args:
        dense: Nested list ``[[floats]]`` — a single dense embedding.
        sparse: Mapping with ``"indices"`` and ``"values"`` lists.
        alpha: Weight of the dense component, in ``[0, 1]``.

    Returns:
        Tuple ``(scaled_dense, scaled_sparse)``.

    Raises:
        ValueError: If *alpha* falls outside ``[0, 1]``.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("Alpha must be between 0 and 1")

    sparse_scale = 1 - alpha
    scaled_sparse = {
        "indices": sparse["indices"],
        "values": [sparse_scale * weight for weight in sparse["values"]],
    }
    scaled_dense = [[alpha * component for component in dense[0]]]
    return scaled_dense, scaled_sparse
|
|