E-commerce-App / app.py
Abdul-Ib's picture
Update app.py
ab1e42c verified
raw
history blame
No virus
1.94 kB
import pprint
import torch
import pickle
import numpy as np
import gradio as gr
import pandas as pd
from rank_bm25 import *
from sentence_transformers import SentenceTransformer, util
# --- Search assets ---------------------------------------------------------
# Everything below is loaded once at import time and shared by all requests.

# Product catalogue. Despite its name, `df` ends up as a list of per-row
# dicts (one per CSV row) holding only the columns listed here.
_RESULT_COLUMNS = ['category', 'brand', 'product_name']
df = pd.read_csv("./assets/final_combined_raw.csv")[_RESULT_COLUMNS].to_dict(orient='records')

# Precomputed document embeddings used by the semantic search.
# NOTE(review): presumably row i corresponds to catalogue row i -- confirm
# against the script that produced the .npy file.
# NOTE: allow_pickle=True lets the file run arbitrary code on load; this is
# only acceptable on the assumption that ./assets is trusted.
doc_embeddings = np.load("./assets/multi_embed.npy", allow_pickle=True)

# Semantic Search model (sentence encoder applied to incoming queries).
semantic_model = SentenceTransformer(
    "Abdul-Ib/paraphrase-multilingual-MiniLM-L12-v2-2024",
    cache_folder="./assets",
)

# Full-text search model: a pickled index object exposing `get_scores`.
# NOTE: unpickling executes code from the file -- same trust assumption as
# for the embeddings above.
with open('./assets/bm25_L.pkl', 'rb') as bm25_file:
    keyword_search = pickle.load(bm25_file)
def full_text_search(normalized_query):
    """Score the query against the keyword (full-text) index.

    The query is lower-cased and split on runs of whitespace, then handed to
    ``keyword_search.get_scores``.

    Args:
        normalized_query: The query text as a string.

    Returns:
        Whatever ``keyword_search.get_scores`` returns for the token list
        (presumably one relevance score per indexed document -- confirm
        against the pickled index type).
    """
    # str.split() with no argument splits on any whitespace run and drops
    # empty strings. The previous split(" ") produced "" tokens for repeated,
    # leading or trailing spaces and never split on newlines or tabs, which
    # the multi-line query box can contain.
    tokenized_query = normalized_query.lower().split()
    return keyword_search.get_scores(tokenized_query)
def semantic_search(normalized_query):
    """Score the query against the precomputed document embeddings.

    The lower-cased query is encoded with ``semantic_model`` and compared to
    ``doc_embeddings`` by cosine similarity.

    Args:
        normalized_query: The query text as a string.

    Returns:
        The first row of the ``util.cos_sim`` result, i.e. the similarities
        of the single query embedding to the document embeddings.
    """
    lowered_query = normalized_query.lower()
    query_vector = semantic_model.encode(lowered_query)
    similarity_matrix = util.cos_sim(query_vector, doc_embeddings)
    # Only one query was encoded, so only the first row is relevant.
    return similarity_matrix[0]
def hybrid_search(ft_scores, rr_scores, k=10, ft_weight=0.7, rr_weight=0.3):
    """Blend lexical and semantic scores and return the top-k hits.

    Args:
        ft_scores: Array-like of full-text scores, one per document.
        rr_scores: Semantic scores aligned with ``ft_scores``; a torch tensor
            (any device) or anything ``np.asarray`` accepts.
        k: Maximum number of hits to return. Clamped to the number of scored
            documents so small collections no longer make ``torch.topk``
            raise.
        ft_weight: Weight of the squashed full-text score in the blend.
        rr_weight: Weight of the semantic score in the blend.

    Returns:
        The ``torch.topk`` result (values, indices) over the blended scores,
        sorted by descending score.
    """
    # Squash the full-text scores with 2/pi * arctan(x) (below 1 for any
    # finite x), shift by -0.5 and floor at 0, so a raw score of 1 or less
    # contributes nothing. np.maximum builds a new array; the caller's
    # ft_scores is left untouched.
    lexical = np.maximum(2 / np.pi * np.arctan(ft_scores) - 0.5, 0)

    # Accept tensors on any device as well as plain arrays.
    if isinstance(rr_scores, torch.Tensor):
        semantic = rr_scores.detach().cpu().numpy()
    else:
        semantic = np.asarray(rr_scores)

    hybrid_scores = ft_weight * lexical + rr_weight * semantic

    # torch.topk raises when k exceeds the size of the dimension it ranks.
    top_k = min(k, hybrid_scores.shape[-1])
    return torch.topk(torch.tensor(hybrid_scores), k=top_k)
def print_results(hits):
    """Render the catalogue rows of the given hits as text.

    Args:
        hits: A (scores, indices) pair as returned by ``hybrid_search``;
            only the indices are used to look rows up in ``df``.

    Returns:
        One string containing the pretty-printed row of every hit, in hit
        order, each followed by a newline.
    """
    rendered_rows = [
        pprint.pformat(df[doc_idx.numpy()], indent=4) + "\n"
        # The score is paired with the index but not shown in the output.
        for _score, doc_idx in zip(hits[0], hits[1])
    ]
    return "".join(rendered_rows)
def predict(query):
    """Run the full hybrid search for one query and format the results.

    Args:
        query: The query text entered by the user.

    Returns:
        The formatted hits as produced by ``print_results``.
    """
    # No normalisation step exists yet: the raw query is passed straight to
    # both scorers.
    lexical_scores = full_text_search(query)
    semantic_scores = semantic_search(query)
    top_hits = hybrid_search(lexical_scores, semantic_scores)
    return print_results(top_hits)
# Gradio UI: a three-line query box wired to `predict`, with plain-text output.
query_box = gr.Textbox(lines=3, placeholder="Enter Search Query...")
app = gr.Interface(
    fn=predict,
    inputs=query_box,
    outputs="text",
    title="Hybrid Search (Lexical Search + Semantic Search)",
)

# Start the web server as soon as the module is executed.
app.launch()