"""Gradio demo: hybrid product search.

Blends BM25 full-text relevance with multilingual sentence-embedding
cosine similarity (70% lexical / 30% semantic) and shows the top-10
catalogue entries for a free-text query.
"""

import pickle
import pprint

import gradio as gr
import numpy as np
import pandas as pd
import torch
# Named import instead of `import *`; the class only needs to be resolvable
# so the pickled BM25 index below can be reconstructed.
from rank_bm25 import BM25Okapi  # noqa: F401
from sentence_transformers import SentenceTransformer, util

# Product catalogue: one dict per product, row-aligned with `doc_embeddings`.
df = pd.read_csv("./assets/final_combined_raw.csv")[
    ["category", "brand", "product_name"]
].to_dict(orient="records")

# Precomputed document embeddings (one row per catalogue entry -- presumably
# produced by the same encoder loaded below; verify when regenerating assets).
doc_embeddings = np.load("./assets/multi_embed.npy", allow_pickle=True)

# Semantic-search encoder (multilingual MiniLM fine-tune).
semantic_model = SentenceTransformer(
    "Abdul-Ib/paraphrase-multilingual-MiniLM-L12-v2-2024",
    cache_folder="./assets",
)

# Full-text (BM25) index. SECURITY NOTE: pickle.load is acceptable only
# because this file ships with the app -- never unpickle untrusted data.
with open("./assets/bm25_L.pkl", "rb") as bm25result_file:
    keyword_search = pickle.load(bm25result_file)


def full_text_search(normalized_query: str) -> np.ndarray:
    """Return BM25 relevance scores for the query against every document."""
    tokenized_query = normalized_query.lower().split(" ")
    return keyword_search.get_scores(tokenized_query)


def semantic_search(normalized_query: str) -> torch.Tensor:
    """Return cosine similarities between the query and all doc embeddings."""
    query_embedding = semantic_model.encode(normalized_query.lower())
    return util.cos_sim(query_embedding, doc_embeddings)[0]


def hybrid_search(ft_scores: np.ndarray, rr_scores: torch.Tensor):
    """Blend lexical and semantic scores; return torch.topk top-10 hits.

    BM25 scores are unbounded, so they are squashed with
    ``2/pi * arctan(x) - 0.5`` and negatives clamped to 0 (effective range
    [0, 0.5)) before the 0.7/0.3 weighted sum with the cosine similarities.
    """
    ft_scores = 2 / np.pi * np.arctan(ft_scores) - 0.5
    ft_scores[ft_scores < 0] = 0
    hybrid_scores = 0.7 * ft_scores + 0.3 * rr_scores.numpy()
    return torch.topk(torch.tensor(hybrid_scores), k=10)


def print_results(hits) -> str:
    """Pretty-print the catalogue records for the top-k (score, index) hits."""
    results = ""
    for score, idx in zip(hits[0], hits[1]):
        # .item() extracts a plain Python int from the 0-d index tensor.
        results += pprint.pformat(df[idx.item()], indent=4) + "\n"
    return results


def predict(query: str) -> str:
    """Gradio callback: run both searches, fuse scores, format the output."""
    # Placeholder: no query normalization is applied yet beyond lower-casing
    # inside the search helpers.
    normalized_query = query
    bm25_scores = full_text_search(normalized_query)
    sem_scores = semantic_search(normalized_query)
    hits = hybrid_search(bm25_scores, sem_scores)
    return print_results(hits)


app = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=3, placeholder="Enter Search Query..."),
    outputs="text",
    title="Hybrid Search (Lexical Search + Semantic Search)",
)

# Launch only when executed as a script, so importing this module
# (e.g. for testing) does not start the web server.
if __name__ == "__main__":
    app.launch()