# Advanced Multi-Search Product Engine — Streamlit app
# --- Third-party dependencies ---
import streamlit as st
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from rapidfuzz import fuzz
import faiss
import nltk

# --- One-time NLTK corpus setup (WordNet is needed for query expansion) ---
nltk.download('wordnet', quiet=True)
from nltk.corpus import wordnet
# ==============================
# PAGE CONFIG
# ==============================
st.set_page_config(page_title="Multi Search Engine", layout="wide")
st.title("π Advanced Multi-Search Product Engine")

# ==============================
# LOAD MODEL
# ==============================
# Cache the embedding model in session state so it is loaded only once
# per browser session instead of on every Streamlit rerun.
if "model" not in st.session_state:
    with st.spinner("Loading AI model..."):
        st.session_state.model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
model = st.session_state.model
# ==============================
# SEARCH INFO (UPDATED)
# ==============================
# Mode name -> (one-line explanation, example query) shown in the UI.
search_info = {
    "Keyword":          ("Exact match", "iphone"),
    "Regex":            ("Pattern match", "^Samsung"),
    "Boolean":          ("AND / OR logic", "nike AND shoes"),
    "Fuzzy":            ("Spelling mistakes", "iphon"),
    "N-Gram":           ("Partial word", "iph"),
    "Prefix":           ("Word starts with", "Sam"),
    "Suffix":           ("Word ends with", "phone"),
    "TF-IDF":           ("Keyword ranking", "wireless headphones"),
    "BM25":             ("Advanced ranking", "gaming laptop"),
    "Semantic":         ("Meaning search", "sports footwear"),
    "FAISS":            ("Fast semantic", "music device"),
    "Hybrid":           ("TF-IDF + Semantic", "running shoes"),
    "Query Expansion":  ("Auto synonyms", "speaker"),
    "Weighted Hybrid":  ("TF-IDF + Semantic + BM25", "best laptop"),
    "Ensemble":         ("Combine all scores", "smartphone"),
}
# ==============================
# FILE LOAD (KEEP YOUR LOGIC)
# ==============================
uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
else:
    st.info("Using sample dataset")
    # Built-in demo catalogue: (name, category, brand, description) per row.
    _sample_rows = [
        ("iPhone 14 Pro", "Mobile", "Apple", "Latest smartphone"),
        ("Samsung Galaxy S23", "Mobile", "Samsung", "Android flagship phone"),
        ("Nike Running Shoes", "Footwear", "Nike", "Comfort sports shoes"),
        ("Dell Gaming Laptop", "Laptop", "Dell", "High performance laptop"),
        ("Bluetooth Speaker", "Electronics", "JBL", "Portable music device"),
    ]
    df = pd.DataFrame(
        _sample_rows,
        columns=["product_name", "category", "brand", "description"],
    )
# ==============================
# DATA PREVIEW CONTROL
# ==============================
st.subheader("π Data Preview")
rows_to_show = st.selectbox("Select rows to view", [10, 20, 50, 100])
st.dataframe(df.head(rows_to_show))

# ==============================
# COMBINE TEXT
# ==============================
# Fold the four text columns into one space-joined document per product;
# every search mode operates on this combined field.
_text_cols = ["product_name", "category", "brand", "description"]
df["combined"] = df[_text_cols].astype(str).agg(" ".join, axis=1)
products = df["combined"].tolist()
# ==============================
# PREPROCESS
# ==============================
def preprocess_data(products):
    """Build every retrieval structure used by the search modes.

    Parameters
    ----------
    products : list[str]
        One combined text document per product row.

    Returns
    -------
    tuple
        ``(tfidf, tfidf_matrix, embeddings, index, bm25)`` — fitted
        TF-IDF vectorizer and document matrix, L2-normalized sentence
        embeddings, a FAISS inner-product index over them, and a BM25
        model over whitespace tokens.
    """
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(products)

    embeddings = model.encode(products, batch_size=64, show_progress_bar=False)
    # faiss.normalize_L2 mutates its argument and requires a C-contiguous
    # float32 array; cast defensively rather than assuming the encoder's
    # output dtype/layout.
    embeddings = np.ascontiguousarray(np.asarray(embeddings, dtype="float32"))
    faiss.normalize_L2(embeddings)

    # Inner product over L2-normalized vectors == cosine similarity.
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)  # already an ndarray; no extra np.array copy needed

    bm25 = BM25Okapi([p.split() for p in products])
    return tfidf, tfidf_matrix, embeddings, index, bm25

tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
# ==============================
# SYNONYMS
# ==============================
def get_synonyms(word):
    """Return WordNet synonyms of *word* as a set of plain strings.

    WordNet stores multi-word lemmas with underscores (e.g.
    ``loud_speaker``), which can never match whitespace-tokenized
    product text — so underscores are converted to spaces, and
    everything is lowercased for case-insensitive matching.
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " ").lower())
    return synonyms
# ==============================
# SEARCH FUNCTIONS
# ==============================
def keyword_search(q):
    """Case-insensitive substring match; every hit scores 1."""
    needle = q.lower()
    hits = []
    for idx, text in enumerate(products):
        if needle in text.lower():
            hits.append((idx, 1))
    return hits
def regex_search(q):
    """Case-insensitive regex search over the combined product text.

    The pattern comes straight from user input, so a malformed pattern
    must not crash the app: compile once up front and fall back to an
    empty result list on ``re.error``.
    """
    try:
        pattern = re.compile(q, re.IGNORECASE)
    except re.error:
        return []
    return [(i, 1) for i, p in enumerate(products) if pattern.search(p)]
def boolean_search(q):
    """AND / OR keyword logic over the combined product text.

    Operators are recognized only as standalone words (``\\bAND\\b``),
    so a query like "brand new" is no longer mis-split on the letters
    "AND" inside "brand".  A query with no operator falls back to a
    single-term match instead of returning nothing.
    """
    and_parts = [t for t in re.split(r"\bAND\b", q) if t.strip()]
    or_parts = [t for t in re.split(r"\bOR\b", q) if t.strip()]
    if len(and_parts) > 1:
        terms, combine = and_parts, all
    elif len(or_parts) > 1:
        terms, combine = or_parts, any
    else:
        terms, combine = [q], all
    needles = [t.strip().lower() for t in terms]
    return [(i, 1) for i, p in enumerate(products)
            if combine(t in p.lower() for t in needles)]
def fuzzy_search(q):
    """Typo-tolerant search, ranked best-match first.

    Comparing a short query against the whole combined document drags
    every ``fuzz.ratio`` toward zero (length mismatch dominates), so
    each product is scored by its best-matching individual word
    instead, case-folded on both sides.
    """
    needle = q.lower()
    scored = []
    for i, p in enumerate(products):
        best = max((fuzz.ratio(needle, w) for w in p.lower().split()), default=0)
        scored.append((i, best))
    return sorted(scored, key=lambda x: x[1], reverse=True)
def ngram_search(q, n=3):
    """Character n-gram overlap search (default: trigrams).

    Previously an exact duplicate of keyword_search.  Now each product
    is scored by the fraction of the query's character n-grams it
    contains, which tolerates typos and ranks partial matches; a query
    that appears verbatim in a product still scores 1.0.
    """
    def _grams(s):
        s = s.lower()
        if not s:
            return set()
        if len(s) < n:
            return {s}
        return {s[i:i + n] for i in range(len(s) - n + 1)}

    q_grams = _grams(q)
    if not q_grams:
        return []
    hits = []
    for i, p in enumerate(products):
        score = len(q_grams & _grams(p)) / len(q_grams)
        if score > 0:
            hits.append((i, score))
    return hits
# Word-level prefix match.
def prefix_search(q):
    """Match products containing a word that starts with the query (case-insensitive)."""
    needle = q.lower()
    hits = []
    for idx, text in enumerate(products):
        if any(w.startswith(needle) for w in text.lower().split()):
            hits.append((idx, 1))
    return hits
# Word-level suffix match.
def suffix_search(q):
    """Match products containing a word that ends with the query (case-insensitive)."""
    needle = q.lower()
    hits = []
    for idx, text in enumerate(products):
        if any(w.endswith(needle) for w in text.lower().split()):
            hits.append((idx, 1))
    return hits
def tfidf_search(q):
    """TF-IDF relevance of every product to the query, as (index, score) pairs."""
    query_vec = tfidf.transform([q])
    sims = (tfidf_matrix @ query_vec.T).toarray().ravel()
    return list(enumerate(sims))
def bm25_search(q):
    """BM25 relevance of every product to the whitespace-tokenized query."""
    tokens = q.split()
    return list(enumerate(bm25.get_scores(tokens)))
def semantic_search(q):
    """Embedding cosine similarity between the query and every product.

    Both sides are L2-normalized, so the dot product is the cosine.
    """
    query_emb = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(query_emb)
    sims = np.dot(embeddings, query_emb.T).ravel()
    return list(enumerate(sims))
def faiss_search(q, k=20):
    """Top-*k* products by cosine similarity via the FAISS index.

    Fixes two defects: the old hard-coded ``k=10`` silently truncated
    results even though the UI slider allows up to 20, and FAISS pads
    its output with index ``-1`` when fewer than *k* vectors exist —
    those sentinel rows are filtered out.
    """
    q_emb = model.encode([q], show_progress_bar=False)
    q_emb = np.asarray(q_emb, dtype="float32")
    faiss.normalize_L2(q_emb)
    k = min(k, index.ntotal)
    D, I = index.search(q_emb, k)
    return [(int(i), float(d)) for i, d in zip(I[0], D[0]) if i != -1]
def hybrid_search(q):
    """Sum of TF-IDF and semantic scores for every product."""
    lexical = dict(tfidf_search(q))
    meaning = dict(semantic_search(q))
    combined = []
    for idx in range(len(products)):
        combined.append((idx, lexical.get(idx, 0) + meaning.get(idx, 0)))
    return combined
# Query expansion: enrich the query with WordNet synonyms before ranking.
def query_expansion_search(q):
    """Expand the query with WordNet synonyms, then run TF-IDF search.

    Expansion terms are de-duplicated (the old version appended every
    synonym of every word, repeating terms and skewing TF-IDF weights)
    and WordNet underscores are converted to spaces so multi-word
    lemmas can actually match whitespace-tokenized text.
    """
    terms = list(q.split())
    seen = set(terms)
    for word in q.split():
        for syn in get_synonyms(word):
            cleaned = syn.replace("_", " ")
            if cleaned not in seen:
                seen.add(cleaned)
                terms.append(cleaned)
    return tfidf_search(" ".join(terms))
# Weighted hybrid: fixed-weight blend of three rankers.
def weighted_hybrid(q):
    """Blend of 0.4*TF-IDF + 0.4*semantic + 0.2*BM25 per product."""
    lexical = dict(tfidf_search(q))
    meaning = dict(semantic_search(q))
    ranked = dict(bm25_search(q))
    combined = []
    for idx in range(len(products)):
        score = (0.4 * lexical.get(idx, 0)
                 + 0.4 * meaning.get(idx, 0)
                 + 0.2 * ranked.get(idx, 0))
        combined.append((idx, score))
    return combined
# Ensemble: sum of max-normalized scores from all three rankers.
def ensemble_search(q):
    """Sum of max-normalized TF-IDF, semantic and BM25 scores per product.

    Each score vector is divided by its own maximum (with a tiny
    epsilon so an all-zero vector never divides by zero) before the
    three are added.
    """
    def _normalized(pairs):
        scores = np.array([s for _, s in pairs])
        return scores / np.max(scores + 1e-6)

    total = (_normalized(tfidf_search(q))
             + _normalized(semantic_search(q))
             + _normalized(bm25_search(q)))
    return list(enumerate(total))
# ==============================
# UI
# ==============================
search_type = st.selectbox("π Select Search Type", list(search_info.keys()))
explanation, example = search_info[search_type]
st.markdown(f"""
### π {search_type}
- **Explanation:** {explanation}
- **Example:** `{example}`
""")

# Keep the query in session state so "Try Example" survives the rerun
# triggered by the "Search" button.  The old code assigned a local
# variable after st.button(), which Streamlit discards on the very
# next script run — so Search never saw the loaded example.
def _load_example():
    st.session_state.query_input = example

query = st.text_input("Enter your search query", key="query_input")
if st.button("Try Example", on_click=_load_example):
    query = st.session_state.query_input
    st.success(f"Loaded: {query}")
top_k = st.slider("Top Results", 5, 20, 10)
# ==============================
# SEARCH EXECUTION
# ==============================
if st.button("Search"):
    if not query:
        st.warning("Enter query")
    else:
        # Dispatch table: search mode name -> scoring function.
        func_map = {
            "Keyword": keyword_search,
            "Regex": regex_search,
            "Boolean": boolean_search,
            "Fuzzy": fuzzy_search,
            "N-Gram": ngram_search,
            "Prefix": prefix_search,
            "Suffix": suffix_search,
            "TF-IDF": tfidf_search,
            "BM25": bm25_search,
            "Semantic": semantic_search,
            "FAISS": faiss_search,
            "Hybrid": hybrid_search,
            "Query Expansion": query_expansion_search,
            "Weighted Hybrid": weighted_hybrid,
            "Ensemble": ensemble_search,
        }
        raw_results = func_map[search_type](query)

        # Ranking modes (TF-IDF, BM25, semantic, ...) return a score for
        # EVERY row; drop zero/negative scores so the table is not padded
        # with irrelevant products, then keep the best top_k.
        ranked = [(i, s) for i, s in raw_results if s > 0]
        ranked = sorted(ranked, key=lambda x: x[1], reverse=True)[:top_k]

        if not ranked:
            st.warning("No matching products found")
        else:
            indices = [i for i, _ in ranked]
            result_df = df.iloc[indices].copy()
            result_df["Score"] = [round(score, 4) for _, score in ranked]
            st.subheader("π Results")
            st.dataframe(result_df)