# Advanced Multi-Search Product Engine — Streamlit app
# --- Third-party dependencies ---
import streamlit as st
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from rapidfuzz import fuzz
import faiss
import nltk

# --- One-time NLTK corpus setup (WordNet is needed for query expansion) ---
nltk.download('wordnet', quiet=True)
from nltk.corpus import wordnet
# ==============================
# PAGE CONFIG
# ==============================
st.set_page_config(page_title="Multi Search Engine", layout="wide")
st.title("π Advanced Multi-Search Product Engine")

# ==============================
# LOAD MODEL
# ==============================
# Cache the embedding model in session state so it is loaded only once
# per browser session instead of on every Streamlit rerun.
if "model" not in st.session_state:
    with st.spinner("Loading AI model..."):
        st.session_state.model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
model = st.session_state.model
# ==============================
# SEARCH INFO (UPDATED)
# ==============================
# Mode name -> (one-line explanation, example query) shown in the UI.
search_info = {
    "Keyword":          ("Exact match", "iphone"),
    "Regex":            ("Pattern match", "^Samsung"),
    "Boolean":          ("AND / OR logic", "nike AND shoes"),
    "Fuzzy":            ("Spelling mistakes", "iphon"),
    "N-Gram":           ("Partial word", "iph"),
    "Prefix":           ("Word starts with", "Sam"),
    "Suffix":           ("Word ends with", "phone"),
    "TF-IDF":           ("Keyword ranking", "wireless headphones"),
    "BM25":             ("Advanced ranking", "gaming laptop"),
    "Semantic":         ("Meaning search", "sports footwear"),
    "FAISS":            ("Fast semantic", "music device"),
    "Hybrid":           ("TF-IDF + Semantic", "running shoes"),
    "Query Expansion":  ("Auto synonyms", "speaker"),
    "Weighted Hybrid":  ("TF-IDF + Semantic + BM25", "best laptop"),
    "Ensemble":         ("Combine all scores", "smartphone"),
}
# ==============================
# FILE LOAD (KEEP YOUR LOGIC)
# ==============================
uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
else:
    st.info("Using sample dataset")
    # Built-in demo catalogue: (name, category, brand, description) per row.
    _sample_rows = [
        ("iPhone 14 Pro", "Mobile", "Apple", "Latest smartphone"),
        ("Samsung Galaxy S23", "Mobile", "Samsung", "Android flagship phone"),
        ("Nike Running Shoes", "Footwear", "Nike", "Comfort sports shoes"),
        ("Dell Gaming Laptop", "Laptop", "Dell", "High performance laptop"),
        ("Bluetooth Speaker", "Electronics", "JBL", "Portable music device"),
    ]
    df = pd.DataFrame(
        _sample_rows,
        columns=["product_name", "category", "brand", "description"],
    )
# ==============================
# DATA PREVIEW CONTROL
# ==============================
st.subheader("π Data Preview")
rows_to_show = st.selectbox("Select rows to view", [10, 20, 50, 100])
st.dataframe(df.head(rows_to_show))

# ==============================
# COMBINE TEXT
# ==============================
# Fold the four text columns into one space-joined document per product;
# every search mode operates on this combined field.
_text_cols = ["product_name", "category", "brand", "description"]
df["combined"] = df[_text_cols].astype(str).agg(" ".join, axis=1)
products = df["combined"].tolist()
# ==============================
# PREPROCESS
# ==============================
def preprocess_data(products):
    """Build every retrieval structure used by the search modes.

    Parameters
    ----------
    products : list[str]
        One combined text document per product row.

    Returns
    -------
    tuple
        ``(tfidf, tfidf_matrix, embeddings, index, bm25)`` — fitted
        TF-IDF vectorizer and document matrix, L2-normalized sentence
        embeddings, a FAISS inner-product index over them, and a BM25
        model over whitespace tokens.
    """
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(products)

    embeddings = model.encode(products, batch_size=64, show_progress_bar=False)
    # faiss.normalize_L2 mutates its argument and requires a C-contiguous
    # float32 array; cast defensively rather than assuming the encoder's
    # output dtype/layout.
    embeddings = np.ascontiguousarray(np.asarray(embeddings, dtype="float32"))
    faiss.normalize_L2(embeddings)

    # Inner product over L2-normalized vectors == cosine similarity.
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)  # already an ndarray; no extra np.array copy needed

    bm25 = BM25Okapi([p.split() for p in products])
    return tfidf, tfidf_matrix, embeddings, index, bm25

tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
# ==============================
# SYNONYMS
# ==============================
def get_synonyms(word):
    """Return WordNet synonyms of *word* as a set of plain strings.

    WordNet stores multi-word lemmas with underscores (e.g.
    ``loud_speaker``), which can never match whitespace-tokenized
    product text — so underscores are converted to spaces, and
    everything is lowercased for case-insensitive matching.
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " ").lower())
    return synonyms
# ==============================
# SEARCH FUNCTIONS
# ==============================
def keyword_search(q):
    """Case-insensitive substring match; every hit scores 1."""
    needle = q.lower()
    hits = []
    for idx, text in enumerate(products):
        if needle in text.lower():
            hits.append((idx, 1))
    return hits
def regex_search(q):
    """Case-insensitive regex search over the combined product text.

    The pattern comes straight from user input, so a malformed pattern
    must not crash the app: compile once up front and fall back to an
    empty result list on ``re.error``.
    """
    try:
        pattern = re.compile(q, re.IGNORECASE)
    except re.error:
        return []
    return [(i, 1) for i, p in enumerate(products) if pattern.search(p)]
def boolean_search(q):
    """AND / OR keyword logic over the combined product text.

    Operators are recognized only as standalone words (``\\bAND\\b``),
    so a query like "brand new" is no longer mis-split on the letters
    "AND" inside "brand".  A query with no operator falls back to a
    single-term match instead of returning nothing.
    """
    and_parts = [t for t in re.split(r"\bAND\b", q) if t.strip()]
    or_parts = [t for t in re.split(r"\bOR\b", q) if t.strip()]
    if len(and_parts) > 1:
        terms, combine = and_parts, all
    elif len(or_parts) > 1:
        terms, combine = or_parts, any
    else:
        terms, combine = [q], all
    needles = [t.strip().lower() for t in terms]
    return [(i, 1) for i, p in enumerate(products)
            if combine(t in p.lower() for t in needles)]
def fuzzy_search(q):
    """Typo-tolerant search, ranked best-match first.

    Comparing a short query against the whole combined document drags
    every ``fuzz.ratio`` toward zero (length mismatch dominates), so
    each product is scored by its best-matching individual word
    instead, case-folded on both sides.
    """
    needle = q.lower()
    scored = []
    for i, p in enumerate(products):
        best = max((fuzz.ratio(needle, w) for w in p.lower().split()), default=0)
        scored.append((i, best))
    return sorted(scored, key=lambda x: x[1], reverse=True)
def ngram_search(q, n=3):
    """Character n-gram overlap search (default: trigrams).

    Previously an exact duplicate of keyword_search.  Now each product
    is scored by the fraction of the query's character n-grams it
    contains, which tolerates typos and ranks partial matches; a query
    that appears verbatim in a product still scores 1.0.
    """
    def _grams(s):
        s = s.lower()
        if not s:
            return set()
        if len(s) < n:
            return {s}
        return {s[i:i + n] for i in range(len(s) - n + 1)}

    q_grams = _grams(q)
    if not q_grams:
        return []
    hits = []
    for i, p in enumerate(products):
        score = len(q_grams & _grams(p)) / len(q_grams)
        if score > 0:
            hits.append((i, score))
    return hits
# Word-level prefix match.
def prefix_search(q):
    """Match products containing a word that starts with the query (case-insensitive)."""
    needle = q.lower()
    hits = []
    for idx, text in enumerate(products):
        if any(w.startswith(needle) for w in text.lower().split()):
            hits.append((idx, 1))
    return hits
# Word-level suffix match.
def suffix_search(q):
    """Match products containing a word that ends with the query (case-insensitive)."""
    needle = q.lower()
    hits = []
    for idx, text in enumerate(products):
        if any(w.endswith(needle) for w in text.lower().split()):
            hits.append((idx, 1))
    return hits
def tfidf_search(q):
    """TF-IDF relevance of every product to the query, as (index, score) pairs."""
    query_vec = tfidf.transform([q])
    sims = (tfidf_matrix @ query_vec.T).toarray().ravel()
    return list(enumerate(sims))
def bm25_search(q):
    """BM25 relevance of every product to the whitespace-tokenized query."""
    tokens = q.split()
    return list(enumerate(bm25.get_scores(tokens)))
def semantic_search(q):
    """Embedding cosine similarity between the query and every product.

    Both sides are L2-normalized, so the dot product is the cosine.
    """
    query_emb = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(query_emb)
    sims = np.dot(embeddings, query_emb.T).ravel()
    return list(enumerate(sims))
def faiss_search(q, k=20):
    """Top-*k* products by cosine similarity via the FAISS index.

    Fixes two defects: the old hard-coded ``k=10`` silently truncated
    results even though the UI slider allows up to 20, and FAISS pads
    its output with index ``-1`` when fewer than *k* vectors exist —
    those sentinel rows are filtered out.
    """
    q_emb = model.encode([q], show_progress_bar=False)
    q_emb = np.asarray(q_emb, dtype="float32")
    faiss.normalize_L2(q_emb)
    k = min(k, index.ntotal)
    D, I = index.search(q_emb, k)
    return [(int(i), float(d)) for i, d in zip(I[0], D[0]) if i != -1]
def hybrid_search(q):
    """Sum of TF-IDF and semantic scores for every product."""
    lexical = dict(tfidf_search(q))
    meaning = dict(semantic_search(q))
    combined = []
    for idx in range(len(products)):
        combined.append((idx, lexical.get(idx, 0) + meaning.get(idx, 0)))
    return combined
# Query expansion: enrich the query with WordNet synonyms before ranking.
def query_expansion_search(q):
    """Expand the query with WordNet synonyms, then run TF-IDF search.

    Expansion terms are de-duplicated (the old version appended every
    synonym of every word, repeating terms and skewing TF-IDF weights)
    and WordNet underscores are converted to spaces so multi-word
    lemmas can actually match whitespace-tokenized text.
    """
    terms = list(q.split())
    seen = set(terms)
    for word in q.split():
        for syn in get_synonyms(word):
            cleaned = syn.replace("_", " ")
            if cleaned not in seen:
                seen.add(cleaned)
                terms.append(cleaned)
    return tfidf_search(" ".join(terms))
# Weighted hybrid: fixed-weight blend of three rankers.
def weighted_hybrid(q):
    """Blend of 0.4*TF-IDF + 0.4*semantic + 0.2*BM25 per product."""
    lexical = dict(tfidf_search(q))
    meaning = dict(semantic_search(q))
    ranked = dict(bm25_search(q))
    combined = []
    for idx in range(len(products)):
        score = (0.4 * lexical.get(idx, 0)
                 + 0.4 * meaning.get(idx, 0)
                 + 0.2 * ranked.get(idx, 0))
        combined.append((idx, score))
    return combined
# Ensemble: sum of max-normalized scores from all three rankers.
def ensemble_search(q):
    """Sum of max-normalized TF-IDF, semantic and BM25 scores per product.

    Each score vector is divided by its own maximum (with a tiny
    epsilon so an all-zero vector never divides by zero) before the
    three are added.
    """
    def _normalized(pairs):
        scores = np.array([s for _, s in pairs])
        return scores / np.max(scores + 1e-6)

    total = (_normalized(tfidf_search(q))
             + _normalized(semantic_search(q))
             + _normalized(bm25_search(q)))
    return list(enumerate(total))
# ==============================
# UI
# ==============================
search_type = st.selectbox("π Select Search Type", list(search_info.keys()))
explanation, example = search_info[search_type]
st.markdown(f"""
### π {search_type}
- **Explanation:** {explanation}
- **Example:** `{example}`
""")

# Keep the query in session state so "Try Example" survives the rerun
# triggered by the "Search" button.  The old code assigned a local
# variable after st.button(), which Streamlit discards on the very
# next script run — so Search never saw the loaded example.
def _load_example():
    st.session_state.query_input = example

query = st.text_input("Enter your search query", key="query_input")
if st.button("Try Example", on_click=_load_example):
    query = st.session_state.query_input
    st.success(f"Loaded: {query}")
top_k = st.slider("Top Results", 5, 20, 10)
# ==============================
# SEARCH EXECUTION
# ==============================
if st.button("Search"):
    if not query:
        st.warning("Enter query")
    else:
        # Dispatch table: search mode name -> scoring function.
        func_map = {
            "Keyword": keyword_search,
            "Regex": regex_search,
            "Boolean": boolean_search,
            "Fuzzy": fuzzy_search,
            "N-Gram": ngram_search,
            "Prefix": prefix_search,
            "Suffix": suffix_search,
            "TF-IDF": tfidf_search,
            "BM25": bm25_search,
            "Semantic": semantic_search,
            "FAISS": faiss_search,
            "Hybrid": hybrid_search,
            "Query Expansion": query_expansion_search,
            "Weighted Hybrid": weighted_hybrid,
            "Ensemble": ensemble_search,
        }
        raw_results = func_map[search_type](query)

        # Ranking modes (TF-IDF, BM25, semantic, ...) return a score for
        # EVERY row; drop zero/negative scores so the table is not padded
        # with irrelevant products, then keep the best top_k.
        ranked = [(i, s) for i, s in raw_results if s > 0]
        ranked = sorted(ranked, key=lambda x: x[1], reverse=True)[:top_k]

        if not ranked:
            st.warning("No matching products found")
        else:
            indices = [i for i, _ in ranked]
            result_df = df.iloc[indices].copy()
            result_df["Score"] = [round(score, 4) for _, score in ranked]
            st.subheader("π Results")
            st.dataframe(result_df)