Spaces:

NavyDevilDoc
/

Semantic_Search

Sleeping

App Files Files Community

NavyDevilDoc committed on Dec 15, 2025

Commit

ef513a5

verified ·

1 Parent(s): fd2d4ca

Create app.py

Browse files

Files changed (1) hide show

app.py +189 -0

app.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+from sentence_transformers import SentenceTransformer
+import faiss
+from rank_bm25 import BM25Okapi
+import pypdf
+import docx
+from io import BytesIO
+# --- CONFIGURATION ---
# Set the browser tab title and use the wide page layout.
st.set_page_config(
    page_title="Hybrid Semantic Search",
    layout="wide",
)
+# --- HELPER FUNCTIONS: FILE PARSING ---
def parse_file(uploaded_file):
    """Extract plain text from an uploaded PDF, DOCX, TXT or CSV file.

    The parser is picked from the file name's extension, compared
    case-insensitively so that ``REPORT.PDF`` is handled like ``report.pdf``.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute (and
            ``read()`` for text files), e.g. an item from ``st.file_uploader``.

    Returns:
        The extracted text. An empty string is returned for unsupported
        extensions. If parsing fails, the error is shown with ``st.error``
        and whatever text was extracted before the failure is returned.
    """
    name = uploaded_file.name.lower()
    parts = []
    try:
        if name.endswith(".pdf"):
            reader = pypdf.PdfReader(uploaded_file)
            for page in reader.pages:
                # Guard against pages that yield no text (e.g. scanned images).
                parts.append((page.extract_text() or "") + "\n")
        elif name.endswith(".docx"):
            doc = docx.Document(uploaded_file)
            parts.append("\n".join(para.text for para in doc.paragraphs))
        elif name.endswith(".txt"):
            # "utf-8-sig" decodes plain UTF-8 identically but also strips a
            # leading byte-order mark so it does not end up in the first word.
            parts.append(uploaded_file.read().decode("utf-8-sig"))
        elif name.endswith(".csv"):
            df = pd.read_csv(uploaded_file)
            # Generic CSV: flatten the whole table to text.
            parts.append(df.to_string())
    except Exception as e:
        # UI boundary: report the problem instead of crashing the app, and
        # keep any text gathered before the failure.
        st.error(f"Error reading file: {e}")
    return "".join(parts)
def chunk_text(text, chunk_size=300, overlap=50, min_chars=50):
    """Split text into overlapping word windows.

    Consecutive windows start ``chunk_size - overlap`` words apart. Windowing
    stops as soon as a window reaches the last word, so no trailing chunk is
    emitted that is entirely contained in the previous one.

    Args:
        text: The text to split; words are separated on whitespace.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by neighbouring chunks
            (``0 <= overlap < chunk_size``).
        min_chars: Chunks whose joined length is not greater than this many
            characters are dropped as too small to be useful.

    Returns:
        A list of chunk strings, each with words joined by single spaces.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``; such values would give a zero or
            negative window step.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        if len(chunk) > min_chars:
            chunks.append(chunk)
        # This window already covers the final word; any later window would
        # only repeat words that are in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
+# --- CORE LOGIC: HYBRID SEARCH ENGINE ---
class HybridSearchEngine:
    """Hybrid retriever that blends dense (FAISS) and sparse (BM25) scores.

    Call ``fit`` with the text chunks to index, then ``search`` with a query.
    """

    def __init__(self, model_name):
        """Load the sentence-embedding model named ``model_name``."""
        self.model = SentenceTransformer(model_name)
        self.documents = []
        self.faiss_index = None
        self.bm25 = None

    def fit(self, documents):
        """Index ``documents`` for both vector and keyword search.

        Args:
            documents: Sequence of text chunks (strings).

        Raises:
            ValueError: If ``documents`` is empty.
        """
        documents = list(documents)
        if not documents:
            raise ValueError("documents must contain at least one text chunk")

        # 1. Dense index: L2-normalised embeddings in an inner-product index,
        #    so the inner product equals cosine similarity.
        embeddings = np.ascontiguousarray(
            self.model.encode(documents), dtype=np.float32
        )
        faiss.normalize_L2(embeddings)
        faiss_index = faiss.IndexFlatIP(embeddings.shape[1])
        faiss_index.add(embeddings)

        # 2. Sparse index: BM25 over lower-cased, whitespace-split tokens.
        bm25 = BM25Okapi([doc.lower().split() for doc in documents])

        # Publish the new state only after both indexes were built.
        self.documents = documents
        self.faiss_index = faiss_index
        self.bm25 = bm25

    @staticmethod
    def _normalize_keyword_scores(scores):
        """Min-max scale BM25 scores to [0, 1] when any score is positive.

        If no score is positive the values are returned unscaled. If all
        scores are equal (and positive) every document is an equally good
        keyword match, so all get 1.0 instead of dividing by a zero range.
        """
        scores = np.asarray(scores, dtype=np.float64)
        if scores.size == 0:
            return scores
        high = scores.max()
        if high <= 0:
            return scores
        low = scores.min()
        if high == low:
            return np.ones_like(scores)
        return (scores - low) / (high - low)

    def search(self, query, top_k=5, alpha=0.5):
        """Return the ``top_k`` chunks ranked by the weighted hybrid score.

        Args:
            query: Query string.
            top_k: Maximum number of results to return.
            alpha: Weighting factor. 1.0 = pure vector search,
                0.0 = pure keyword search, 0.5 = equal hybrid.

        Returns:
            A list of dicts with keys ``"chunk"``, ``"score"``,
            ``"vector_score"`` and ``"keyword_score"``, sorted by ``"score"``
            in descending order (ties keep index order).

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if self.faiss_index is None or self.bm25 is None:
            raise RuntimeError("fit() must be called before search()")

        n_docs = len(self.documents)

        # --- Vector search: score every document so the two score lists
        # can be combined per document below. ---
        query_vector = np.ascontiguousarray(
            self.model.encode([query]), dtype=np.float32
        )
        faiss.normalize_L2(query_vector)
        v_scores, v_indices = self.faiss_index.search(query_vector, n_docs)
        vector_scores = np.zeros(n_docs, dtype=np.float64)
        found = v_indices[0] != -1  # FAISS pads missing results with -1
        vector_scores[v_indices[0][found]] = v_scores[0][found]

        # --- Keyword search (BM25), scaled to be comparable with cosine ---
        keyword_scores = self._normalize_keyword_scores(
            self.bm25.get_scores(query.lower().split())
        )

        # --- Hybrid combination ---
        hybrid_scores = alpha * vector_scores + (1 - alpha) * keyword_scores
        # Stable sort on the negated scores: descending order, ties by index.
        ranking = np.argsort(-hybrid_scores, kind="stable")[:top_k]
        return [
            {
                "chunk": self.documents[idx],
                "score": float(hybrid_scores[idx]),
                "vector_score": float(vector_scores[idx]),
                "keyword_score": float(keyword_scores[idx]),
            }
            for idx in ranking
        ]
+# --- STREAMLIT UI ---
# Page heading.
st.title("⚡ Hybrid Search: Vector + Keywords")
st.caption("Robust semantic search powered by FAISS (Dense) and BM25 (Sparse).")

# Sidebar: search settings first, then the knowledge-base upload controls.
with st.sidebar:
    st.header("⚙️ Configuration")

    # Sentence-embedding model used for the dense index.
    model_choice = st.selectbox(
        label="Embedding Model",
        options=[
            "all-MiniLM-L6-v2",
            "all-mpnet-base-v2",
            "multi-qa-mpnet-base-dot-v1",
        ],
        index=0,
        help="MiniLM is fast; MPNet is more accurate but slower.",
    )

    # Number of results shown per query.
    top_k = st.number_input(
        label="Results to Retrieve",
        min_value=1,
        max_value=50,
        value=5,
        step=1,
    )

    # Balance between keyword (0.0) and vector (1.0) scoring.
    alpha = st.slider(
        label="Hybrid Balance (Alpha)",
        min_value=0.0,
        max_value=1.0,
        value=0.5,
        help="0.0 = Keywords Only, 1.0 = Vectors Only",
    )

    st.divider()

    # Documents to index, and the button that triggers indexing.
    uploaded_files = st.file_uploader(
        label="Upload Knowledge Base",
        type=["txt", "pdf", "docx", "csv"],
        accept_multiple_files=True,
    )
    process_btn = st.button(label="Build Database")
+# --- APP STATE MANAGEMENT ---
# Make sure the session always has a slot for the engine.
if "search_engine" not in st.session_state:
    st.session_state["search_engine"] = None

# Rebuild the index when the button is pressed and files are present.
if process_btn and uploaded_files:
    with st.spinner(f"Parsing files and initializing {model_choice}..."):
        # Parse every upload and pool the resulting chunks.
        all_chunks = []
        for uploaded in uploaded_files:
            all_chunks += chunk_text(parse_file(uploaded))

        if not all_chunks:
            st.warning("No text found in uploaded files.")
        else:
            engine = HybridSearchEngine(model_choice)
            engine.fit(all_chunks)
            st.session_state["search_engine"] = engine
            st.success(f"Indexed {len(all_chunks)} chunks from {len(uploaded_files)} files!")
+# --- SEARCH INTERFACE ---
# Show the query box once an engine has been built; otherwise prompt for an upload.
if st.session_state.search_engine is not None:
    query = st.text_input("Enter your query:", placeholder="e.g., 'What are the safety protocols for the engine room?'")
    if query:
        results = st.session_state.search_engine.search(query, top_k=top_k, alpha=alpha)
        # Report the number of matches actually shown: the index may hold
        # fewer chunks than the requested top_k.
        st.subheader(f"Top {len(results)} Matches")
        for rank, res in enumerate(results, start=1):
            # Only the best match starts expanded.
            with st.expander(f"Rank {rank} (Score: {res['score']:.4f})", expanded=(rank == 1)):
                st.markdown(f"**{res['chunk']}**")
                # Score breakdown: combined, dense-only and sparse-only.
                c1, c2, c3 = st.columns(3)
                c1.metric("Hybrid Score", f"{res['score']:.4f}")
                c2.metric("Vector Match", f"{res['vector_score']:.4f}")
                c3.metric("Keyword Match", f"{res['keyword_score']:.4f}")
else:
    st.info("👈 Please upload documents in the sidebar to begin.")