Spaces:
Running
Running
import pprint | |
import torch | |
import pickle | |
import numpy as np | |
import gradio as gr | |
import pandas as pd | |
from rank_bm25 import * | |
from sentence_transformers import SentenceTransformer, util | |
# --- Search assets: everything below is loaded once, at import time ---------

# Semantic-search encoder; model files are cached under ./assets.
semantic_model = SentenceTransformer(
    "Abdul-Ib/multilingual-e5-small-2024",
    cache_folder="./assets",
)

# Stored document embeddings that query embeddings are compared against.
# NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which can
# execute code -- only load embedding files you produced yourself.
doc_embeddings = np.load("./assets/e5_embed.npy", allow_pickle=True)

# Catalogue rows as a list of dicts (despite the name, not a DataFrame), kept
# to the three columns shown in results. print_results() indexes this list
# with positions taken from the score arrays, so row order is assumed to match
# the order used to build the embeddings and the BM25 index -- TODO confirm.
df = (
    pd.read_csv("./assets/final_combined_raw.csv")
    [['category', 'brand', 'product_name']]
    .to_dict(orient='records')
)

# Full-text search model, restored from a pickle.
# NOTE(review): pickle.load runs arbitrary code from the file; the asset must
# come from a trusted source.
with open('./assets/bm25_L.pkl', 'rb') as bm25result_file:
    keyword_search = pickle.load(bm25result_file)
def full_text_search(normalized_query):
    """Score the indexed documents against the query with the full-text model.

    Args:
        normalized_query: Query text. It is lower-cased and split on
            whitespace before being scored.

    Returns:
        Whatever ``keyword_search.get_scores`` returns for the token list
        (the callers treat it as one lexical score per document).
    """
    # split() without a separator splits on any run of whitespace. The
    # previous split(" ") kept "\n"/"\t" inside tokens (the search box is
    # multi-line) and produced empty-string tokens for repeated, leading or
    # trailing spaces, none of which can match an indexed term.
    tokenized_query = normalized_query.lower().split()
    return keyword_search.get_scores(tokenized_query)
def semantic_search(normalized_query):
    """Compare the query embedding with the stored document embeddings.

    Args:
        normalized_query: Query text; it is lower-cased before encoding.

    Returns:
        The first row of the cosine-similarity result between the encoded
        query and ``doc_embeddings``.
    """
    lowered_query = normalized_query.lower()
    query_vector = semantic_model.encode(lowered_query)
    similarities = util.cos_sim(query_vector, doc_embeddings)
    # Only one query is encoded, so only row 0 of the result is needed.
    return similarities[0]
def hybrid_search(ft_scores, rr_scores, k=10, ft_weight=0.7, rr_weight=0.3):
    """Blend lexical and semantic scores and pick the best ``k`` documents.

    Args:
        ft_scores: Array-like of full-text scores, one per document.
        rr_scores: 1-D torch tensor of semantic scores, aligned with
            ``ft_scores``.
        k: Maximum number of hits to return (default 10). Clamped to the
            number of documents so small collections do not raise.
        ft_weight: Weight of the normalized full-text score (default 0.7).
        rr_weight: Weight of the semantic score (default 0.3).

    Returns:
        The ``torch.topk`` result (``values``, ``indices``) over the blended
        scores, best hit first.
    """
    # arctan squashes the unbounded lexical score; after the 2/pi scaling and
    # the -0.5 shift, any raw score below 1.0 is negative and is clamped to 0.
    squashed = 2 / np.pi * np.arctan(ft_scores) - 0.5
    lexical = np.maximum(squashed, 0)

    # detach()/cpu() are no-ops for a plain CPU tensor but keep .numpy() from
    # failing if the similarity tensor lives on another device or tracks grads.
    semantic = rr_scores.detach().cpu().numpy()

    blended = torch.tensor(ft_weight * lexical + rr_weight * semantic)

    # torch.topk raises when k exceeds the size of the dimension it selects from.
    top_k = min(k, blended.shape[-1])
    return torch.topk(blended, k=top_k)
def print_results(hits):
    """Format the catalogue records selected by ``hits`` as readable text.

    Args:
        hits: Pair ``(scores, indices)`` such as the ``torch.topk`` result
            returned by ``hybrid_search``. Only the indices are used; each
            one selects a record in ``df``.

    Returns:
        One ``pprint``-formatted record per hit, each followed by a newline,
        in the order given by ``hits``. An empty string when there are no hits.
    """
    # The scores were unpacked but never displayed, so only the indices are
    # iterated. int() turns each 0-d index tensor into a plain list index, and
    # join() replaces repeated string concatenation.
    return "".join(
        pprint.pformat(df[int(idx)], indent=4) + "\n" for idx in hits[1]
    )
def predict(query):
    """Run the hybrid search for a user query and return formatted results.

    Args:
        query: Raw text from the search box.

    Returns:
        The formatted top matches, or an empty string when the query is
        ``None``, empty or whitespace-only.
    """
    # A blank query has nothing to match, yet the scorers would still rank
    # every document and the top-k step would return arbitrary records.
    if query is None or not query.strip():
        return ""

    # Surrounding whitespace carries no search intent; drop it once here.
    normalized_query = query.strip()

    bm25_scores = full_text_search(normalized_query)
    sem_scores = semantic_search(normalized_query)
    hits = hybrid_search(bm25_scores, sem_scores)
    return print_results(hits)
# Single-textbox UI: the query is passed to predict() and the formatted
# matches are shown as plain text.
_query_box = gr.Textbox(lines=3, placeholder="Enter Search Query...")

app = gr.Interface(
    fn=predict,
    inputs=_query_box,
    outputs="text",
    title="Hybrid Search (Lexical Search + Semantic Search)",
)

# Start the web UI; this runs as soon as the module is executed.
app.launch()