E-commerce-App / app.py
Abdul-Ib's picture
Update app.py
6020ca3 verified
raw
history blame
1.93 kB
import pprint
import torch
import pickle
import numpy as np
import gradio as gr
import pandas as pd
from rank_bm25 import *
from sentence_transformers import SentenceTransformer, util
# ---------------------------------------------------------------------------
# Search assets. Everything below is loaded once, when the module is imported.
# ---------------------------------------------------------------------------

# Product catalogue: keep only the three columns that are shown to the user
# and turn every row into a plain dict, so a hit can be fetched by position.
# NOTE(review): presumably row i here corresponds to row i of doc_embeddings
# and to document i of the BM25 index -- confirm against the asset pipeline.
df = (
    pd.read_csv("./assets/final_combined_raw.csv")[['category', 'brand', 'product_name']]
    .to_dict(orient='records')
)

# Document embeddings that semantic_search compares the query embedding with.
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code; only load this file from a trusted source.
doc_embeddings = np.load("./assets/e5_embed.npy", allow_pickle=True)

# Semantic search model (weights are cached under ./assets).
semantic_model = SentenceTransformer(
    "Abdul-Ib/multilingual-e5-small-2024",
    cache_folder="./assets",
)

# Full-text search model, restored from a pickle.
# NOTE(review): pickle.load can execute arbitrary code; the .pkl file must be
# a trusted, repository-controlled asset.
with open('./assets/bm25_L.pkl', 'rb') as bm25result_file:
    keyword_search = pickle.load(bm25result_file)
def full_text_search(normalized_query):
    """Score the query against the full-text (BM25) index.

    Args:
        normalized_query: Query text. It is lower-cased and tokenized on
            whitespace here.

    Returns:
        Whatever ``keyword_search.get_scores`` returns for the token list
        (presumably one lexical score per indexed document -- confirm
        against the object stored in the pickle).
    """
    # split() with no argument splits on any run of whitespace. The previous
    # split(" ") kept newlines/tabs inside tokens (the textbox is multi-line)
    # and produced empty tokens for repeated spaces; neither can match a term.
    tokenized_query = normalized_query.lower().split()
    return keyword_search.get_scores(tokenized_query)
def semantic_search(normalized_query):
    """Compare the query with every document embedding by cosine similarity.

    Args:
        normalized_query: Query text. It is lower-cased before encoding.

    Returns:
        The first row of ``util.cos_sim(query_embedding, doc_embeddings)``,
        i.e. the similarities for the single encoded query.
    """
    lowered = normalized_query.lower()
    encoded_query = semantic_model.encode(lowered)
    similarity_matrix = util.cos_sim(encoded_query, doc_embeddings)
    # Only one query was encoded, so only the first row is of interest.
    return similarity_matrix[0]
def hybrid_search(ft_scores, rr_scores, k=10):
    """Blend lexical and semantic scores and return the best-scoring hits.

    The lexical scores are squashed with ``2/pi * arctan(x) - 0.5`` and
    negative results are clamped to zero, so only sufficiently large lexical
    scores contribute. The blend is ``0.7 * lexical + 0.3 * semantic``.

    Args:
        ft_scores: Array-like of lexical scores, one per document. The
            caller's array is not modified.
        rr_scores: ``torch.Tensor`` of semantic scores, aligned with
            ``ft_scores``.
        k: Maximum number of hits to return. Defaults to 10 and is capped at
            the number of scored documents.

    Returns:
        The ``torch.topk`` result ``(values, indices)`` over the blended
        scores, best hit first.
    """
    lexical = np.maximum(2 / np.pi * np.arctan(ft_scores) - 0.5, 0)
    # detach()/cpu() are no-ops for a plain CPU tensor but keep the NumPy
    # conversion from failing for GPU or autograd-tracked tensors.
    semantic = rr_scores.detach().cpu().numpy()
    hybrid_scores = 0.7 * lexical + 0.3 * semantic
    # torch.topk raises if k exceeds the number of elements, so cap it for
    # small corpora instead of crashing.
    k = min(k, hybrid_scores.shape[-1])
    return torch.topk(torch.tensor(hybrid_scores), k=k)
def print_results(hits):
    """Render the catalogue entries for the given hits as text.

    Args:
        hits: Indexable pair ``(scores, indices)`` such as a ``torch.topk``
            result. Only the indices are used; each one selects an entry
            of the module-level ``df`` list.

    Returns:
        One pretty-printed record per hit, each followed by a newline, in
        the order given by ``hits``. Empty string when there are no hits.
    """
    # int() turns the 0-d index tensor into a plain Python int for list
    # indexing; join() builds the text in one pass instead of repeated +=.
    return "".join(
        pprint.pformat(df[int(idx)], indent=4) + "\n"
        for idx in hits[1]
    )
def predict(query):
    """Run the hybrid search for a user query and format the hits as text.

    Args:
        query: Raw text from the UI. ``None`` is treated as an empty query.

    Returns:
        The formatted top hits, or an empty string when the query is blank.
    """
    normalized_query = (query or "").strip()
    # A blank query has nothing to match; return no results instead of
    # ranking the catalogue against an empty string.
    if not normalized_query:
        return ""

    bm25_scores = full_text_search(normalized_query)
    sem_scores = semantic_search(normalized_query)
    hits = hybrid_search(bm25_scores, sem_scores)
    return print_results(hits)
# Gradio front end: a multi-line text box feeds predict(), and the returned
# string is shown as plain text.
app = gr.Interface(
    title="Hybrid Search (Lexical Search + Semantic Search)",
    fn=predict,
    inputs=gr.Textbox(placeholder="Enter Search Query...", lines=3),
    outputs="text",
)

# Start the web server as soon as the module is executed.
app.launch()