Spaces:

fireworks-ai
/

search-alchemy

Running

App Files Files Community

RobertoBarrosoLuque committed 29 days ago

Commit

34d08ee

1 Parent(s): e59c3d6

Lexical search is working

Browse files

Files changed (2) hide show

src/app.py +71 -35
src/data_prep/data_prep.py +1 -1

src/app.py CHANGED Viewed

@@ -4,6 +4,7 @@ from typing import List, Dict, Tuple
 from pathlib import Path
 import os
 from config import GRADIO_THEME, CUSTOM_CSS, EXAMPLE_QUERIES
 _FILE_PATH = Path(__file__).parents[1]
@@ -44,16 +45,23 @@ SAMPLE_PRODUCTS = [
 def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
-    """Format search results as HTML."""
     html_parts = [f"### {stage_name} Results\n\n"]
     for idx, result in enumerate(results, 1):
         html_parts.append(
             f"""
 <div class="result-card">
-<strong>{idx}. {result['title']}</strong><br/>
-<span style="color: #64748B; font-size: 0.9em;">{result['description']}</span><br/>
-<span style="color: #94A3B8; font-size: 0.85em;">Category: {result['category']}</span><br/>
 <span style="color: #6720FF; font-weight: 600;">Score: {result['score']:.3f}</span>
 </div>
 """
@@ -77,18 +85,21 @@ def search_stage_1(query: str) -> Tuple[str, Dict]:
     """Stage 1: Baseline BM25 keyword search."""
     start_time = time.time()
-    # Placeholder: Simple keyword matching
-    results = []
-    for product in SAMPLE_PRODUCTS[:3]:
-        results.append({**product, "score": 0.65 + (len(results) * 0.05)})
     latency = int((time.time() - start_time) * 1000)
     metrics = {
-        "semantic_match": 0.58,
-        "diversity": 0.60,
-        "latency_ms": max(50, latency),
     }
     return format_results(results, "Stage 1: BM25 Baseline", metrics), metrics
@@ -97,10 +108,17 @@ def search_stage_2(query: str) -> Tuple[str, Dict]:
     """Stage 2: BM25 + Vector Embeddings."""
     start_time = time.time()
-    # Placeholder: Simulated embedding search
-    results = []
-    for product in SAMPLE_PRODUCTS[:4]:
-        results.append({**product, "score": 0.72 + (len(results) * 0.04)})
     latency = int((time.time() - start_time) * 1000)
@@ -117,10 +135,17 @@ def search_stage_3(query: str) -> Tuple[str, Dict]:
     """Stage 3: BM25 + Embeddings + Query Expansion."""
     start_time = time.time()
-    # Placeholder: Simulated query expansion
-    results = []
-    for product in SAMPLE_PRODUCTS[:5]:
-        results.append({**product, "score": 0.78 + (len(results) * 0.03)})
     latency = int((time.time() - start_time) * 1000)
@@ -137,10 +162,17 @@ def search_stage_4(query: str) -> Tuple[str, Dict]:
     """Stage 4: BM25 + Embeddings + Query Expansion + LLM Reranking."""
     start_time = time.time()
-    # Placeholder: Simulated reranking
-    results = []
-    for product in SAMPLE_PRODUCTS[:5]:
-        results.append({**product, "score": 0.85 + (len(results) * 0.025)})
     latency = int((time.time() - start_time) * 1000)
@@ -224,21 +256,25 @@ def set_example(example: str) -> str:
 # Code snippets for each stage
 CODE_STAGE_1 = """
 ```python
-from rank_bm25 import BM25Okapi
-# Tokenize documents
-tokenized_docs = [doc.split() for doc in documents]
-# Create BM25 index
-bm25 = BM25Okapi(tokenized_docs)
-# Search
-query_tokens = query.split()
-scores = bm25.get_scores(query_tokens)
-# Get top results
-top_indices = scores.argsort()[-5:][::-1]
-results = [documents[i] for i in top_indices]
 ```
 """

 from pathlib import Path
 import os
 from config import GRADIO_THEME, CUSTOM_CSS, EXAMPLE_QUERIES
+from src.search.bm25_lexical_search import search_bm25
 _FILE_PATH = Path(__file__).parents[1]
 def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
+    """Format search results as HTML.
+    Args:
+        results: List of dicts with keys: product_name, description, main_category, secondary_category, score
+        stage_name: Name of the search stage
+        metrics: Dict with keys: semantic_match, diversity, latency_ms
+    """
     html_parts = [f"### {stage_name} Results\n\n"]
     for idx, result in enumerate(results, 1):
+        category = f"{result.get('main_category', 'N/A')} > {result.get('secondary_category', 'N/A')}"
         html_parts.append(
             f"""
 <div class="result-card">
+<strong>{idx}. {result['product_name']}</strong><br/>
+<span style="color: #64748B; font-size: 0.9em;">{result['description'][:150]}...</span><br/>
+<span style="color: #94A3B8; font-size: 0.85em;">Category: {category}</span><br/>
 <span style="color: #6720FF; font-weight: 600;">Score: {result['score']:.3f}</span>
 </div>
 """
     """Stage 1: Baseline BM25 keyword search."""
     start_time = time.time()
+    results = search_bm25(query, top_k=5)
     latency = int((time.time() - start_time) * 1000)
+    unique_categories = len(set(r["main_category"] for r in results)) if results else 0
+    diversity = min(1.0, unique_categories / 5.0)
+    avg_score = sum(r["score"] for r in results) / len(results) if results else 0
+    semantic_match = min(1.0, avg_score / 10.0)
     metrics = {
+        "semantic_match": semantic_match,
+        "diversity": diversity,
+        "latency_ms": latency,
     }
+    print(f"Searched BM25 for {query} in {latency}ms")
     return format_results(results, "Stage 1: BM25 Baseline", metrics), metrics
     """Stage 2: BM25 + Vector Embeddings."""
     start_time = time.time()
+    # Placeholder: Simulated embedding search with correct format
+    results = [
+        {
+            "product_name": product["title"],
+            "description": product["description"],
+            "main_category": product["category"],
+            "secondary_category": "Placeholder",
+            "score": 0.72 + (idx * 0.04),
+        }
+        for idx, product in enumerate(SAMPLE_PRODUCTS[:4])
+    ]
     latency = int((time.time() - start_time) * 1000)
     """Stage 3: BM25 + Embeddings + Query Expansion."""
     start_time = time.time()
+    # Placeholder: Simulated query expansion with correct format
+    results = [
+        {
+            "product_name": product["title"],
+            "description": product["description"],
+            "main_category": product["category"],
+            "secondary_category": "Placeholder",
+            "score": 0.78 + (idx * 0.03),
+        }
+        for idx, product in enumerate(SAMPLE_PRODUCTS[:5])
+    ]
     latency = int((time.time() - start_time) * 1000)
     """Stage 4: BM25 + Embeddings + Query Expansion + LLM Reranking."""
     start_time = time.time()
+    # Placeholder: Simulated reranking with correct format
+    results = [
+        {
+            "product_name": product["title"],
+            "description": product["description"],
+            "main_category": product["category"],
+            "secondary_category": "Placeholder",
+            "score": 0.85 + (idx * 0.025),
+        }
+        for idx, product in enumerate(SAMPLE_PRODUCTS[:5])
+    ]
     latency = int((time.time() - start_time) * 1000)
 # Code snippets for each stage
 CODE_STAGE_1 = """
 ```python
+import bm25s
+import pandas as pd
+# Step 1: Create BM25 index (one-time setup)
+df = pd.read_parquet("data/amazon_products.parquet")
+corpus = df["FullText"].tolist()
+corpus_tokens = bm25s.tokenize(corpus, stopwords="en")
+retriever = bm25s.BM25()
+retriever.index(corpus_tokens)
+retriever.save("data/bm25_index")
+# Step 2: Load index and search
+bm25_index = bm25s.BM25.load("data/bm25_index", load_corpus=False)
+query_tokens = bm25s.tokenize(query, stopwords="en")
+results, scores = bm25_index.retrieve(query_tokens, k=5)
+# Extract top results
+top_products = [df.iloc[idx] for idx in results[0]]
 ```
 """

src/data_prep/data_prep.py CHANGED Viewed

@@ -4,7 +4,7 @@ from pathlib import Path
 import numpy as np
 import faiss
 import bm25s
-from src.modules.inference import create_client
 from src.config import EMBEDDING_MODEL
 _FILE_PATH = Path(__file__).parents[2]

 import numpy as np
 import faiss
 import bm25s
+from src.fireworks.inference import create_client
 from src.config import EMBEDDING_MODEL
 _FILE_PATH = Path(__file__).parents[2]