Spaces:

fireworks-ai
/

search-alchemy

Running

App Files Community

RobertoBarrosoLuque committed 18 days ago

Commit

32b5f27

1 Parent(s): 099c385

Cleanup frontend

Browse files

Files changed (3) hide show

requirements.txt +1 -1
src/app.py +161 -48
src/config.py +27 -7

requirements.txt CHANGED Viewed

@@ -1,7 +1,7 @@
 gradio==5.42.0
 openai
 python-dotenv
-datasets>=2.19.0
 numpy
 pandas
 scikit-learn

 gradio==5.42.0
 openai
 python-dotenv
+datasets==4.2.0
 numpy
 pandas
 scikit-learn

src/app.py CHANGED Viewed

@@ -3,8 +3,13 @@ import time
 from typing import List, Dict, Tuple
 from pathlib import Path
 import os
-from config import GRADIO_THEME, CUSTOM_CSS, EXAMPLE_QUERIES
 from src.search.bm25_lexical_search import search_bm25
 _FILE_PATH = Path(__file__).parents[1]
@@ -52,7 +57,7 @@ def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
         stage_name: Name of the search stage
         metrics: Dict with keys: semantic_match, diversity, latency_ms
     """
-    html_parts = [f"### {stage_name} Results\n\n"]
     for idx, result in enumerate(results, 1):
         category = f"{result.get('main_category', 'N/A')} > {result.get('secondary_category', 'N/A')}"
@@ -67,14 +72,14 @@ def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
 """
         )
-    html_parts.append("\n### Metrics\n\n")
     html_parts.append(
         f"""
-<div class="metric-box">
-" <strong>Semantic Match:</strong> {metrics['semantic_match']:.3f}<br/>
-" <strong>Diversity:</strong> {metrics['diversity']:.3f}<br/>
-" <strong>Latency:</strong> {metrics['latency_ms']}ms
-</div>
 """
     )
@@ -210,47 +215,129 @@ def generate_comparison_table(all_metrics: List[Dict]) -> str:
         "Stage 4: + Reranking",
     ]
     html = """
-### Comparison Across All Stages
 <table class="comparison-table">
 <tr>
-    <th>Stage</th>
-    <th>Semantic Match</th>
-    <th>Diversity</th>
-    <th>Latency (ms)</th>
 </tr>
 """
-    for idx, (name, metrics) in enumerate(zip(stage_names, all_metrics)):
         html += f"""
 <tr>
-    <td><strong>{name}</strong></td>
-    <td>{metrics['semantic_match']:.3f}</td>
-    <td>{metrics['diversity']:.3f}</td>
-    <td>{metrics['latency_ms']}ms</td>
 </tr>
 """
-    html += "</table>"
-    html += """
-### Key Insights
-<div class="metric-box">
-" <strong>Semantic Match improves by 52%</strong> from Stage 1 to Stage 4<br/>
-" <strong>Diversity increases by 33%</strong> showing more varied results<br/>
-" <strong>Latency stays under 200ms</strong> maintaining fast performance<br/>
-" Each stage adds incremental value to search quality
-</div>
 """
-    return html
-def set_example(example: str) -> str:
-    """Set an example query."""
-    return example
 # Code snippets for each stage
@@ -388,6 +475,7 @@ with gr.Blocks(
             query_input = gr.Textbox(
                 label="Search Query",
                 placeholder="Enter your search query...",
                 scale=3,
                 elem_classes="search-box",
             )
@@ -401,51 +489,76 @@ with gr.Blocks(
                 container=True,
                 elem_classes="compact-input",
             )
     with gr.Row():
-        search_btn = gr.Button("Search", variant="primary", scale=1)
-    # Example queries
-    with gr.Row():
-        gr.Markdown("**Quick Examples:**")
     with gr.Row():
-        example_buttons = []
-        for example in EXAMPLE_QUERIES:
-            btn = gr.Button(example, size="sm", variant="secondary")
-            example_buttons.append(btn)
-            btn.click(fn=set_example, inputs=[gr.State(example)], outputs=[query_input])
-    # Tabs for each stage
     with gr.Tabs() as tabs:
-        # Stage 1 Tab
         with gr.Tab("Stage 1: BM25 Baseline"):
             stage1_output = gr.Markdown(label="Results")
             with gr.Accordion("Show Code", open=False):
                 gr.Markdown(CODE_STAGE_1)
-        # Stage 2 Tab
         with gr.Tab("Stage 2: + Vector Embeddings"):
             stage2_output = gr.Markdown(label="Results")
             with gr.Accordion("Show Code", open=False):
                 gr.Markdown(CODE_STAGE_2)
-        # Stage 3 Tab
         with gr.Tab("Stage 3: + Query Expansion"):
             stage3_output = gr.Markdown(label="Results")
             with gr.Accordion("Show Code", open=False):
                 gr.Markdown(CODE_STAGE_3)
-        # Stage 4 Tab
         with gr.Tab("Stage 4: + LLM Reranking"):
             stage4_output = gr.Markdown(label="Results")
             with gr.Accordion("Show Code", open=False):
                 gr.Markdown(CODE_STAGE_4)
-        # Comparison Tab
         with gr.Tab("Compare All Stages"):
             comparison_output = gr.Markdown(label="Comparison")
-    # Search button click handler
     search_btn.click(
         fn=search_all_stages,
         inputs=[query_input],

 from typing import List, Dict, Tuple
 from pathlib import Path
 import os
+from config import (
+    GRADIO_THEME,
+    CUSTOM_CSS,
+    EXAMPLE_QUERIES_BY_CATEGORY,
+)
 from src.search.bm25_lexical_search import search_bm25
+from src.data_prep.data_prep import load_clean_amazon_product_data
 _FILE_PATH = Path(__file__).parents[1]
         stage_name: Name of the search stage
         metrics: Dict with keys: semantic_match, diversity, latency_ms
     """
+    html_parts = [f"## 🔍 {stage_name}\n\n"]
     for idx, result in enumerate(results, 1):
         category = f"{result.get('main_category', 'N/A')} > {result.get('secondary_category', 'N/A')}"
 """
         )
+    html_parts.append("\n---\n\n### Performance Metrics\n\n")
     html_parts.append(
         f"""
+| Metric | Score |
+|--------|-------|
+| **Semantic Match** | {metrics['semantic_match']:.3f} |
+| **Diversity** | {metrics['diversity']:.3f} |
+| **Latency** | {metrics['latency_ms']}ms |
 """
     )
         "Stage 4: + Reranking",
     ]
+    # Build markdown table
+    html = "## Stage-by-Stage Comparison\n\n"
+    html += "| Stage | Semantic Match | Diversity | Latency (ms) |\n"
+    html += "|-------|----------------|-----------|---------------|\n"
+    for name, metrics in zip(stage_names, all_metrics):
+        html += f"| **{name}** | {metrics['semantic_match']:.3f} | {metrics['diversity']:.3f} | {metrics['latency_ms']} |\n"
+    # Calculate improvements
+    semantic_improvement = (
+        (
+            (all_metrics[3]["semantic_match"] - all_metrics[0]["semantic_match"])
+            / all_metrics[0]["semantic_match"]
+            * 100
+        )
+        if all_metrics[0]["semantic_match"] > 0
+        else 0
+    )
+    diversity_improvement = (
+        (
+            (all_metrics[3]["diversity"] - all_metrics[0]["diversity"])
+            / all_metrics[0]["diversity"]
+            * 100
+        )
+        if all_metrics[0]["diversity"] > 0
+        else 0
+    )
+    html += "\n---\n\n"
+    html += "## Key Insights\n\n"
+    html += f"- **Semantic Match** improves by **{semantic_improvement:.0f}%** from Stage 1 to Stage 4\n"
+    html += f"- **Diversity** increases by **{diversity_improvement:.0f}%** showing more varied results\n"
+    html += f"- **Latency** stays under **{max(m['latency_ms'] for m in all_metrics)}ms** maintaining fast performance\n"
+    html += "- Each stage adds incremental value to search quality\n"
+    return html
+def set_example(example: str) -> str:
+    """Set an example query."""
+    return example
+def load_example_query(category: str, ambiguity: str) -> str:
+    """Load example query based on category and ambiguity level."""
+    ambiguity_key = ambiguity.lower().replace(" ", "_")
+    return EXAMPLE_QUERIES_BY_CATEGORY[category][ambiguity_key]
+def generate_category_distribution_table() -> str:
+    """Generate HTML table showing MainCategory distribution."""
+    df = load_clean_amazon_product_data()
+    category_counts = df["MainCategory"].value_counts()
+    total = len(df)
     html = """
+### Dataset Category Distribution
 <table class="comparison-table">
 <tr>
+    <th>Category</th>
+    <th>Count</th>
+    <th>Percentage</th>
 </tr>
 """
+    for category, count in category_counts.items():
+        percentage = (count / total) * 100
         html += f"""
 <tr>
+    <td><strong>{category}</strong></td>
+    <td>{count:,}</td>
+    <td>{percentage:.1f}%</td>
 </tr>
 """
+    html += f"""
+<tr style="background: #F3F0FF; font-weight: 600;">
+    <td><strong>Total</strong></td>
+    <td>{total:,}</td>
+    <td>100.0%</td>
+</tr>
+</table>
+"""
+    return html
+def generate_sample_data_table() -> str:
+    """Generate HTML table showing sample rows from the dataset."""
+    df = load_clean_amazon_product_data()
+    sample_df = df.sample(n=5, random_state=42)
+    html = """
+### Sample Products from Dataset
+<table class="comparison-table">
+<tr>
+    <th>Product Name</th>
+    <th>Main Category</th>
+    <th>Secondary Category</th>
+    <th>Description</th>
+</tr>
 """
+    for _, row in sample_df.iterrows():
+        description = (
+            row["Description"][:80] + "..."
+            if len(row["Description"]) > 80
+            else row["Description"]
+        )
+        html += f"""
+<tr>
+    <td><strong>{row["Product Name"]}</strong></td>
+    <td>{row["MainCategory"]}</td>
+    <td>{row["SecondaryCategory"]}</td>
+    <td style="color: #64748B; font-size: 0.9em;">{description}</td>
+</tr>
+"""
+    html += "</table>"
+    return html
 # Code snippets for each stage
             query_input = gr.Textbox(
                 label="Search Query",
                 placeholder="Enter your search query...",
+                value=EXAMPLE_QUERIES_BY_CATEGORY["Toys & Games"]["clear"],
                 scale=3,
                 elem_classes="search-box",
             )
                 container=True,
                 elem_classes="compact-input",
             )
+    # Clean example query selector
     with gr.Row():
+        gr.Markdown(
+            "**Try Example Queries:** Select a category and specificity level to auto-load an example"
+        )
     with gr.Row():
+        with gr.Column(scale=1):
+            category_dropdown = gr.Dropdown(
+                choices=list(EXAMPLE_QUERIES_BY_CATEGORY.keys()),
+                value=list(EXAMPLE_QUERIES_BY_CATEGORY.keys())[0],
+                label="Category",
+                container=True,
+            )
+        with gr.Column(scale=1):
+            ambiguity_dropdown = gr.Dropdown(
+                choices=["Clear", "Somewhat Ambiguous", "Ambiguous"],
+                value="Clear",
+                label="Query Specificity",
+                container=True,
+            )
+        with gr.Column(scale=1):
+            search_btn = gr.Button("Search", variant="primary", scale=1, size="lg")
     with gr.Tabs() as tabs:
         with gr.Tab("Stage 1: BM25 Baseline"):
             stage1_output = gr.Markdown(label="Results")
             with gr.Accordion("Show Code", open=False):
                 gr.Markdown(CODE_STAGE_1)
         with gr.Tab("Stage 2: + Vector Embeddings"):
             stage2_output = gr.Markdown(label="Results")
             with gr.Accordion("Show Code", open=False):
                 gr.Markdown(CODE_STAGE_2)
         with gr.Tab("Stage 3: + Query Expansion"):
             stage3_output = gr.Markdown(label="Results")
             with gr.Accordion("Show Code", open=False):
                 gr.Markdown(CODE_STAGE_3)
         with gr.Tab("Stage 4: + LLM Reranking"):
             stage4_output = gr.Markdown(label="Results")
             with gr.Accordion("Show Code", open=False):
                 gr.Markdown(CODE_STAGE_4)
         with gr.Tab("Compare All Stages"):
             comparison_output = gr.Markdown(label="Comparison")
+    with gr.Accordion("Dataset Information", open=False):
+        gr.Markdown("Explore the dataset used for this search demo")
+        with gr.Row():
+            category_dist_table = gr.Markdown(
+                value=generate_category_distribution_table()
+            )
+        with gr.Row():
+            sample_data_table = gr.Markdown(value=generate_sample_data_table())
+    # Event handlers - auto-load query when dropdown changes
+    category_dropdown.change(
+        fn=load_example_query,
+        inputs=[category_dropdown, ambiguity_dropdown],
+        outputs=[query_input],
+    )
+    ambiguity_dropdown.change(
+        fn=load_example_query,
+        inputs=[category_dropdown, ambiguity_dropdown],
+        outputs=[query_input],
+    )
     search_btn.click(
         fn=search_all_stages,
         inputs=[query_input],

src/config.py CHANGED Viewed

@@ -184,10 +184,30 @@ summary:hover {
 """
-EXAMPLE_QUERIES = [
-    "gift for 5 year old who likes science",
-    "cheap wireless headphones good battery",
-    "running shoes",
-    "waterproof bluetooth speaker",
-    "ergonomic office chair under 200",
-]

 """
+EXAMPLE_QUERIES_BY_CATEGORY = {
+    "Toys & Games": {
+        "clear": "magnetic construction building blocks educational toy",
+        "somewhat_ambiguous": "learning toy for preschool kids",
+        "ambiguous": "fun gift for child",
+    },
+    "Home & Kitchen": {
+        "clear": "kids octopus comforter bedding set full size",
+        "somewhat_ambiguous": "colorful bedding set for children",
+        "ambiguous": "bedroom decoration items",
+    },
+    "Clothing, Shoes & Jewelry": {
+        "clear": "star wars stormtrooper halloween costume kids",
+        "somewhat_ambiguous": "character costume for children",
+        "ambiguous": "dress up outfit",
+    },
+    "Sports & Outdoors": {
+        "clear": "55 inch trampoline with safety net enclosure",
+        "somewhat_ambiguous": "small trampoline for children",
+        "ambiguous": "backyard play equipment",
+    },
+    "Baby Products": {
+        "clear": "nursery wall decor quotes motivational stickers",
+        "somewhat_ambiguous": "wall decorations for baby room",
+        "ambiguous": "cute nursery items",
+    },
+}