awacke1 committed on
Commit
fcc5344
·
verified ·
1 Parent(s): a2236b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -15
app.py CHANGED
@@ -124,14 +124,14 @@ class FastDatasetSearcher:
124
  return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
125
 
126
  def quick_search(self, query, df):
127
- """Enhanced search with improved relevance filtering"""
128
  if df.empty or not query.strip():
129
  return df
130
 
131
  try:
132
- # Define relevance thresholds
133
- MIN_KEYWORD_MATCHES = 0.1
134
- MIN_SEMANTIC_SCORE = 0.3
135
 
136
  # Get searchable columns
137
  searchable_cols = []
@@ -150,34 +150,55 @@ class FastDatasetSearcher:
150
  for _, row in df.iterrows():
151
  text_parts = []
152
  row_matched = False
 
153
 
154
- # Check for direct matches
155
- for col in searchable_cols:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  val = row[col]
157
  if val is not None:
158
  val_str = str(val).lower()
159
- if any(term in val_str for term in query_terms):
 
 
160
  row_matched = True
161
  text_parts.append(str(val))
162
 
163
  text = ' '.join(text_parts)
164
 
165
  if text.strip():
166
- # Calculate term-based keyword score
167
- text_terms = set(text.lower().split())
168
- matching_terms = query_terms.intersection(text_terms)
169
  keyword_score = len(matching_terms) / len(query_terms)
170
 
171
  # Calculate semantic score
172
  text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
173
  semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])
174
 
175
- # Weighted combination
176
- combined_score = 0.7 * keyword_score + 0.3 * semantic_score
177
 
178
- # Boost exact matches
179
- if row_matched:
180
- combined_score *= 1.5
 
181
  else:
182
  combined_score = 0.0
183
  row_matched = False
@@ -460,6 +481,7 @@ def perform_arxiv_lookup(query, vocal_summary=True, titles_summary=True, full_au
460
  st.audio(audio_file_full)
461
 
462
  def render_result(result):
 
463
  score = result.get('relevance_score', 0)
464
  result_filtered = {k: v for k, v in result.items()
465
  if k not in ['relevance_score', 'video_embed', 'description_embed', 'audio_embed']}
@@ -469,12 +491,36 @@ def render_result(result):
469
 
470
  cols = st.columns([2, 1])
471
  with cols[0]:
 
472
  for key, value in result_filtered.items():
473
  if isinstance(value, (str, int, float)):
474
  st.write(f"**{key}:** {value}")
 
 
475
 
476
  with cols[1]:
477
  st.metric("Relevance Score", f"{score:.2%}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
 
479
  def main():
480
  st.title("🎥 Advanced Video & Dataset Search with Voice")
 
124
  return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
125
 
126
  def quick_search(self, query, df):
127
+ """Enhanced search with strict token matching and semantic relevance"""
128
  if df.empty or not query.strip():
129
  return df
130
 
131
  try:
132
+ # Define stricter thresholds
133
+ MIN_SEMANTIC_SCORE = 0.5 # Higher semantic threshold
134
+ EXACT_MATCH_BOOST = 2.0 # Boost for exact matches
135
 
136
  # Get searchable columns
137
  searchable_cols = []
 
150
  for _, row in df.iterrows():
151
  text_parts = []
152
  row_matched = False
153
+ exact_match = False
154
 
155
+ # Prioritize description and matched_text fields
156
+ priority_fields = ['description', 'matched_text']
157
+ other_fields = [col for col in searchable_cols if col not in priority_fields]
158
+
159
+ # First check priority fields for exact matches
160
+ for col in priority_fields:
161
+ if col in row:
162
+ val = row[col]
163
+ if val is not None:
164
+ val_str = str(val).lower()
165
+ # Check for exact token matches
166
+ if query_lower in val_str.split():
167
+ exact_match = True
168
+ if any(term in val_str.split() for term in query_terms):
169
+ row_matched = True
170
+ text_parts.append(str(val))
171
+
172
+ # Then check other fields
173
+ for col in other_fields:
174
  val = row[col]
175
  if val is not None:
176
  val_str = str(val).lower()
177
+ if query_lower in val_str.split():
178
+ exact_match = True
179
+ if any(term in val_str.split() for term in query_terms):
180
  row_matched = True
181
  text_parts.append(str(val))
182
 
183
  text = ' '.join(text_parts)
184
 
185
  if text.strip():
186
+ # Calculate exact token matches
187
+ text_tokens = set(text.lower().split())
188
+ matching_terms = query_terms.intersection(text_tokens)
189
  keyword_score = len(matching_terms) / len(query_terms)
190
 
191
  # Calculate semantic score
192
  text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
193
  semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])
194
 
195
+ # Weighted scoring with priority for exact matches
196
+ combined_score = 0.8 * keyword_score + 0.2 * semantic_score
197
 
198
+ if exact_match:
199
+ combined_score *= EXACT_MATCH_BOOST
200
+ elif row_matched:
201
+ combined_score *= 1.2
202
  else:
203
  combined_score = 0.0
204
  row_matched = False
 
481
  st.audio(audio_file_full)
482
 
483
  def render_result(result):
484
+ """Render a search result with voice selection and TTS options"""
485
  score = result.get('relevance_score', 0)
486
  result_filtered = {k: v for k, v in result.items()
487
  if k not in ['relevance_score', 'video_embed', 'description_embed', 'audio_embed']}
 
491
 
492
  cols = st.columns([2, 1])
493
  with cols[0]:
494
+ text_content = [] # Collect text for TTS
495
  for key, value in result_filtered.items():
496
  if isinstance(value, (str, int, float)):
497
  st.write(f"**{key}:** {value}")
498
+ if isinstance(value, str) and len(value.strip()) > 0:
499
+ text_content.append(f"{key}: {value}")
500
 
501
  with cols[1]:
502
  st.metric("Relevance Score", f"{score:.2%}")
503
+
504
+ # Voice selection for TTS
505
+ voices = {
506
+ "Aria (US Female)": "en-US-AriaNeural",
507
+ "Guy (US Male)": "en-US-GuyNeural",
508
+ "Sonia (UK Female)": "en-GB-SoniaNeural",
509
+ "Tony (UK Male)": "en-GB-TonyNeural",
510
+ "Jenny (US Female)": "en-US-JennyNeural"
511
+ }
512
+
513
+ selected_voice = st.selectbox(
514
+ "Select Voice",
515
+ list(voices.keys()),
516
+ key=f"voice_{result.get('video_id', '')}"
517
+ )
518
+
519
+ if st.button("🔊 Read Description", key=f"read_{result.get('video_id', '')}"):
520
+ text_to_read = ". ".join(text_content)
521
+ audio_file = asyncio.run(generate_speech(text_to_read, voices[selected_voice]))
522
+ if audio_file:
523
+ st.audio(audio_file)
524
 
525
  def main():
526
  st.title("🎥 Advanced Video & Dataset Search with Voice")