RobertoBarrosoLuque committed on
Commit
32b5f27
·
1 Parent(s): 099c385

Cleanup frontend

Browse files
Files changed (3) hide show
  1. requirements.txt +1 -1
  2. src/app.py +161 -48
  3. src/config.py +27 -7
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
  gradio==5.42.0
2
  openai
3
  python-dotenv
4
- datasets>=2.19.0
5
  numpy
6
  pandas
7
  scikit-learn
 
1
  gradio==5.42.0
2
  openai
3
  python-dotenv
4
+ datasets==4.2.0
5
  numpy
6
  pandas
7
  scikit-learn
src/app.py CHANGED
@@ -3,8 +3,13 @@ import time
3
  from typing import List, Dict, Tuple
4
  from pathlib import Path
5
  import os
6
- from config import GRADIO_THEME, CUSTOM_CSS, EXAMPLE_QUERIES
 
 
 
 
7
  from src.search.bm25_lexical_search import search_bm25
 
8
 
9
  _FILE_PATH = Path(__file__).parents[1]
10
 
@@ -52,7 +57,7 @@ def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
52
  stage_name: Name of the search stage
53
  metrics: Dict with keys: semantic_match, diversity, latency_ms
54
  """
55
- html_parts = [f"### {stage_name} Results\n\n"]
56
 
57
  for idx, result in enumerate(results, 1):
58
  category = f"{result.get('main_category', 'N/A')} > {result.get('secondary_category', 'N/A')}"
@@ -67,14 +72,14 @@ def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
67
  """
68
  )
69
 
70
- html_parts.append("\n### Metrics\n\n")
71
  html_parts.append(
72
  f"""
73
- <div class="metric-box">
74
- " <strong>Semantic Match:</strong> {metrics['semantic_match']:.3f}<br/>
75
- " <strong>Diversity:</strong> {metrics['diversity']:.3f}<br/>
76
- " <strong>Latency:</strong> {metrics['latency_ms']}ms
77
- </div>
78
  """
79
  )
80
 
@@ -210,47 +215,129 @@ def generate_comparison_table(all_metrics: List[Dict]) -> str:
210
  "Stage 4: + Reranking",
211
  ]
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  html = """
214
- ### Comparison Across All Stages
215
 
216
  <table class="comparison-table">
217
  <tr>
218
- <th>Stage</th>
219
- <th>Semantic Match</th>
220
- <th>Diversity</th>
221
- <th>Latency (ms)</th>
222
  </tr>
223
  """
224
 
225
- for idx, (name, metrics) in enumerate(zip(stage_names, all_metrics)):
 
226
  html += f"""
227
  <tr>
228
- <td><strong>{name}</strong></td>
229
- <td>{metrics['semantic_match']:.3f}</td>
230
- <td>{metrics['diversity']:.3f}</td>
231
- <td>{metrics['latency_ms']}ms</td>
232
  </tr>
233
  """
234
 
235
- html += "</table>"
 
 
 
 
 
 
 
 
 
236
 
237
- html += """
238
- ### Key Insights
239
 
240
- <div class="metric-box">
241
- " <strong>Semantic Match improves by 52%</strong> from Stage 1 to Stage 4<br/>
242
- " <strong>Diversity increases by 33%</strong> showing more varied results<br/>
243
- " <strong>Latency stays under 200ms</strong> maintaining fast performance<br/>
244
- " Each stage adds incremental value to search quality
245
- </div>
 
 
 
 
 
 
 
 
 
246
  """
247
 
248
- return html
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
 
250
 
251
- def set_example(example: str) -> str:
252
- """Set an example query."""
253
- return example
254
 
255
 
256
  # Code snippets for each stage
@@ -388,6 +475,7 @@ with gr.Blocks(
388
  query_input = gr.Textbox(
389
  label="Search Query",
390
  placeholder="Enter your search query...",
 
391
  scale=3,
392
  elem_classes="search-box",
393
  )
@@ -401,51 +489,76 @@ with gr.Blocks(
401
  container=True,
402
  elem_classes="compact-input",
403
  )
 
404
  with gr.Row():
405
- search_btn = gr.Button("Search", variant="primary", scale=1)
406
-
407
- # Example queries
408
- with gr.Row():
409
- gr.Markdown("**Quick Examples:**")
410
  with gr.Row():
411
- example_buttons = []
412
- for example in EXAMPLE_QUERIES:
413
- btn = gr.Button(example, size="sm", variant="secondary")
414
- example_buttons.append(btn)
415
- btn.click(fn=set_example, inputs=[gr.State(example)], outputs=[query_input])
 
 
 
 
 
 
 
 
 
 
 
416
 
417
- # Tabs for each stage
418
  with gr.Tabs() as tabs:
419
 
420
- # Stage 1 Tab
421
  with gr.Tab("Stage 1: BM25 Baseline"):
422
  stage1_output = gr.Markdown(label="Results")
423
  with gr.Accordion("Show Code", open=False):
424
  gr.Markdown(CODE_STAGE_1)
425
 
426
- # Stage 2 Tab
427
  with gr.Tab("Stage 2: + Vector Embeddings"):
428
  stage2_output = gr.Markdown(label="Results")
429
  with gr.Accordion("Show Code", open=False):
430
  gr.Markdown(CODE_STAGE_2)
431
 
432
- # Stage 3 Tab
433
  with gr.Tab("Stage 3: + Query Expansion"):
434
  stage3_output = gr.Markdown(label="Results")
435
  with gr.Accordion("Show Code", open=False):
436
  gr.Markdown(CODE_STAGE_3)
437
 
438
- # Stage 4 Tab
439
  with gr.Tab("Stage 4: + LLM Reranking"):
440
  stage4_output = gr.Markdown(label="Results")
441
  with gr.Accordion("Show Code", open=False):
442
  gr.Markdown(CODE_STAGE_4)
443
 
444
- # Comparison Tab
445
  with gr.Tab("Compare All Stages"):
446
  comparison_output = gr.Markdown(label="Comparison")
447
 
448
- # Search button click handler
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  search_btn.click(
450
  fn=search_all_stages,
451
  inputs=[query_input],
 
3
  from typing import List, Dict, Tuple
4
  from pathlib import Path
5
  import os
6
+ from config import (
7
+ GRADIO_THEME,
8
+ CUSTOM_CSS,
9
+ EXAMPLE_QUERIES_BY_CATEGORY,
10
+ )
11
  from src.search.bm25_lexical_search import search_bm25
12
+ from src.data_prep.data_prep import load_clean_amazon_product_data
13
 
14
  _FILE_PATH = Path(__file__).parents[1]
15
 
 
57
  stage_name: Name of the search stage
58
  metrics: Dict with keys: semantic_match, diversity, latency_ms
59
  """
60
+ html_parts = [f"## 🔍 {stage_name}\n\n"]
61
 
62
  for idx, result in enumerate(results, 1):
63
  category = f"{result.get('main_category', 'N/A')} > {result.get('secondary_category', 'N/A')}"
 
72
  """
73
  )
74
 
75
+ html_parts.append("\n---\n\n### Performance Metrics\n\n")
76
  html_parts.append(
77
  f"""
78
+ | Metric | Score |
79
+ |--------|-------|
80
+ | **Semantic Match** | {metrics['semantic_match']:.3f} |
81
+ | **Diversity** | {metrics['diversity']:.3f} |
82
+ | **Latency** | {metrics['latency_ms']}ms |
83
  """
84
  )
85
 
 
215
  "Stage 4: + Reranking",
216
  ]
217
 
218
+ # Build markdown table
219
+ html = "## Stage-by-Stage Comparison\n\n"
220
+ html += "| Stage | Semantic Match | Diversity | Latency (ms) |\n"
221
+ html += "|-------|----------------|-----------|---------------|\n"
222
+
223
+ for name, metrics in zip(stage_names, all_metrics):
224
+ html += f"| **{name}** | {metrics['semantic_match']:.3f} | {metrics['diversity']:.3f} | {metrics['latency_ms']} |\n"
225
+
226
+ # Calculate improvements
227
+ semantic_improvement = (
228
+ (
229
+ (all_metrics[3]["semantic_match"] - all_metrics[0]["semantic_match"])
230
+ / all_metrics[0]["semantic_match"]
231
+ * 100
232
+ )
233
+ if all_metrics[0]["semantic_match"] > 0
234
+ else 0
235
+ )
236
+ diversity_improvement = (
237
+ (
238
+ (all_metrics[3]["diversity"] - all_metrics[0]["diversity"])
239
+ / all_metrics[0]["diversity"]
240
+ * 100
241
+ )
242
+ if all_metrics[0]["diversity"] > 0
243
+ else 0
244
+ )
245
+
246
+ html += "\n---\n\n"
247
+ html += "## Key Insights\n\n"
248
+ html += f"- **Semantic Match** improves by **{semantic_improvement:.0f}%** from Stage 1 to Stage 4\n"
249
+ html += f"- **Diversity** increases by **{diversity_improvement:.0f}%** showing more varied results\n"
250
+ html += f"- **Latency** stays under **{max(m['latency_ms'] for m in all_metrics)}ms** maintaining fast performance\n"
251
+ html += "- Each stage adds incremental value to search quality\n"
252
+
253
+ return html
254
+
255
+
256
+ def set_example(example: str) -> str:
257
+ """Set an example query."""
258
+ return example
259
+
260
+
261
+ def load_example_query(category: str, ambiguity: str) -> str:
262
+ """Load example query based on category and ambiguity level."""
263
+ ambiguity_key = ambiguity.lower().replace(" ", "_")
264
+ return EXAMPLE_QUERIES_BY_CATEGORY[category][ambiguity_key]
265
+
266
+
267
+ def generate_category_distribution_table() -> str:
268
+ """Generate HTML table showing MainCategory distribution."""
269
+ df = load_clean_amazon_product_data()
270
+ category_counts = df["MainCategory"].value_counts()
271
+ total = len(df)
272
+
273
  html = """
274
+ ### Dataset Category Distribution
275
 
276
  <table class="comparison-table">
277
  <tr>
278
+ <th>Category</th>
279
+ <th>Count</th>
280
+ <th>Percentage</th>
 
281
  </tr>
282
  """
283
 
284
+ for category, count in category_counts.items():
285
+ percentage = (count / total) * 100
286
  html += f"""
287
  <tr>
288
+ <td><strong>{category}</strong></td>
289
+ <td>{count:,}</td>
290
+ <td>{percentage:.1f}%</td>
 
291
  </tr>
292
  """
293
 
294
+ html += f"""
295
+ <tr style="background: #F3F0FF; font-weight: 600;">
296
+ <td><strong>Total</strong></td>
297
+ <td>{total:,}</td>
298
+ <td>100.0%</td>
299
+ </tr>
300
+ </table>
301
+ """
302
+
303
+ return html
304
 
 
 
305
 
306
+ def generate_sample_data_table() -> str:
307
+ """Generate HTML table showing sample rows from the dataset."""
308
+ df = load_clean_amazon_product_data()
309
+ sample_df = df.sample(n=5, random_state=42)
310
+
311
+ html = """
312
+ ### Sample Products from Dataset
313
+
314
+ <table class="comparison-table">
315
+ <tr>
316
+ <th>Product Name</th>
317
+ <th>Main Category</th>
318
+ <th>Secondary Category</th>
319
+ <th>Description</th>
320
+ </tr>
321
  """
322
 
323
+ for _, row in sample_df.iterrows():
324
+ description = (
325
+ row["Description"][:80] + "..."
326
+ if len(row["Description"]) > 80
327
+ else row["Description"]
328
+ )
329
+ html += f"""
330
+ <tr>
331
+ <td><strong>{row["Product Name"]}</strong></td>
332
+ <td>{row["MainCategory"]}</td>
333
+ <td>{row["SecondaryCategory"]}</td>
334
+ <td style="color: #64748B; font-size: 0.9em;">{description}</td>
335
+ </tr>
336
+ """
337
 
338
+ html += "</table>"
339
 
340
+ return html
 
 
341
 
342
 
343
  # Code snippets for each stage
 
475
  query_input = gr.Textbox(
476
  label="Search Query",
477
  placeholder="Enter your search query...",
478
+ value=EXAMPLE_QUERIES_BY_CATEGORY["Toys & Games"]["clear"],
479
  scale=3,
480
  elem_classes="search-box",
481
  )
 
489
  container=True,
490
  elem_classes="compact-input",
491
  )
492
+ # Clean example query selector
493
  with gr.Row():
494
+ gr.Markdown(
495
+ "**Try Example Queries:** Select a category and specificity level to auto-load an example"
496
+ )
 
 
497
  with gr.Row():
498
+ with gr.Column(scale=1):
499
+ category_dropdown = gr.Dropdown(
500
+ choices=list(EXAMPLE_QUERIES_BY_CATEGORY.keys()),
501
+ value=list(EXAMPLE_QUERIES_BY_CATEGORY.keys())[0],
502
+ label="Category",
503
+ container=True,
504
+ )
505
+ with gr.Column(scale=1):
506
+ ambiguity_dropdown = gr.Dropdown(
507
+ choices=["Clear", "Somewhat Ambiguous", "Ambiguous"],
508
+ value="Clear",
509
+ label="Query Specificity",
510
+ container=True,
511
+ )
512
+ with gr.Column(scale=1):
513
+ search_btn = gr.Button("Search", variant="primary", scale=1, size="lg")
514
 
 
515
  with gr.Tabs() as tabs:
516
 
 
517
  with gr.Tab("Stage 1: BM25 Baseline"):
518
  stage1_output = gr.Markdown(label="Results")
519
  with gr.Accordion("Show Code", open=False):
520
  gr.Markdown(CODE_STAGE_1)
521
 
 
522
  with gr.Tab("Stage 2: + Vector Embeddings"):
523
  stage2_output = gr.Markdown(label="Results")
524
  with gr.Accordion("Show Code", open=False):
525
  gr.Markdown(CODE_STAGE_2)
526
 
 
527
  with gr.Tab("Stage 3: + Query Expansion"):
528
  stage3_output = gr.Markdown(label="Results")
529
  with gr.Accordion("Show Code", open=False):
530
  gr.Markdown(CODE_STAGE_3)
531
 
 
532
  with gr.Tab("Stage 4: + LLM Reranking"):
533
  stage4_output = gr.Markdown(label="Results")
534
  with gr.Accordion("Show Code", open=False):
535
  gr.Markdown(CODE_STAGE_4)
536
 
 
537
  with gr.Tab("Compare All Stages"):
538
  comparison_output = gr.Markdown(label="Comparison")
539
 
540
+ with gr.Accordion("Dataset Information", open=False):
541
+ gr.Markdown("Explore the dataset used for this search demo")
542
+ with gr.Row():
543
+ category_dist_table = gr.Markdown(
544
+ value=generate_category_distribution_table()
545
+ )
546
+ with gr.Row():
547
+ sample_data_table = gr.Markdown(value=generate_sample_data_table())
548
+
549
+ # Event handlers - auto-load query when dropdown changes
550
+ category_dropdown.change(
551
+ fn=load_example_query,
552
+ inputs=[category_dropdown, ambiguity_dropdown],
553
+ outputs=[query_input],
554
+ )
555
+
556
+ ambiguity_dropdown.change(
557
+ fn=load_example_query,
558
+ inputs=[category_dropdown, ambiguity_dropdown],
559
+ outputs=[query_input],
560
+ )
561
+
562
  search_btn.click(
563
  fn=search_all_stages,
564
  inputs=[query_input],
src/config.py CHANGED
@@ -184,10 +184,30 @@ summary:hover {
184
  """
185
 
186
 
187
- EXAMPLE_QUERIES = [
188
- "gift for 5 year old who likes science",
189
- "cheap wireless headphones good battery",
190
- "running shoes",
191
- "waterproof bluetooth speaker",
192
- "ergonomic office chair under 200",
193
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  """
185
 
186
 
187
+ EXAMPLE_QUERIES_BY_CATEGORY = {
188
+ "Toys & Games": {
189
+ "clear": "magnetic construction building blocks educational toy",
190
+ "somewhat_ambiguous": "learning toy for preschool kids",
191
+ "ambiguous": "fun gift for child",
192
+ },
193
+ "Home & Kitchen": {
194
+ "clear": "kids octopus comforter bedding set full size",
195
+ "somewhat_ambiguous": "colorful bedding set for children",
196
+ "ambiguous": "bedroom decoration items",
197
+ },
198
+ "Clothing, Shoes & Jewelry": {
199
+ "clear": "star wars stormtrooper halloween costume kids",
200
+ "somewhat_ambiguous": "character costume for children",
201
+ "ambiguous": "dress up outfit",
202
+ },
203
+ "Sports & Outdoors": {
204
+ "clear": "55 inch trampoline with safety net enclosure",
205
+ "somewhat_ambiguous": "small trampoline for children",
206
+ "ambiguous": "backyard play equipment",
207
+ },
208
+ "Baby Products": {
209
+ "clear": "nursery wall decor quotes motivational stickers",
210
+ "somewhat_ambiguous": "wall decorations for baby room",
211
+ "ambiguous": "cute nursery items",
212
+ },
213
+ }